APPBROWSER-298, APPBROWSER-306: Reimplement the common code of compute shader

The new common code of compute shader is in file helpers_cs.h Rewrite the direct_convolution1x1.cs and softmax_layer.cs to use the new common code. It will also remove the dependence of the token pasting operator (##). We'll remove the "##" support after we rewrite all of the compute shader code. Change-Id: Icd8553ef6b61ad484a8507590ac8ed499bd47061 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/95455 Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Reviewed-by: Frank Lei <frank.lei@arm.com> (cherry picked from commit 0a4f83570d261f839d9866b68979efe8d7a95883) Reviewed-on: http://mpd-gerrit.cambridge.arm.com/95601 Reviewed-by: Jim He <jim.he@arm.com>
author: Joel Liang <joel.liang@arm.com> 2017-11-10 09:59:19 +0800
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:35:24 +0000
commit: f1f3ebd517089e934cf3f06e64d90619a395ad87 (patch)
tree: 8dac05909b5f522a1c78e0ac4423cb6f65254391 /src/core/GLES_COMPUTE/cs_shaders
parent: 283c1790da45ab562ecfb2aa7741297191886d85 (diff)
download: ComputeLibrary-f1f3ebd517089e934cf3f06e64d90619a395ad87.tar.gz
10 files changed, 826 insertions, 704 deletions
diff --git a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
index fc9da114f7..38ba183d2a 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
@@ -129,22 +129,22 @@ BUFFER_DECLARATION(dst, 2, float, writeonly);
  * @note Activation function should be given as a preprocessor argument using "#define act_name". e.g. "#define TANH"
  * @note A, B variables required by some activation functions are set using A_VAL= and B_VAL= respectively.
  *
- * @param[in]  src_ptr                              Pointer to the source image. Supported data types: F32
- * @param[in]  src_stride_x                         Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                         Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source image
- * @param[out] dst_ptr                              Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                         Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                         ride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                         Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination image
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      ride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
  */
 void main(void)
 {
@@ -193,22 +193,22 @@ BUFFER_DECLARATION(dst, 2, uint, writeonly);
  * @note Activation function should be given as a preprocessor argument using "#define act_name". e.g. "#define TANH"
  * @note A, B variables required by some activation functions are set using A_VAL= and B_VAL= respectively.
  *
- * @param[in]  src_ptr                              Pointer to the source image. Supported data types: F16
- * @param[in]  src_stride_x                         Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                         Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source image
- * @param[out] dst_ptr                              Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                         Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                         ride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                         Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination image
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      ride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
  */
 void main(void)
 {
diff --git a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
index 54880926cc..c3df5d5c4d 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
@@ -60,38 +60,38 @@ BUFFER_DECLARATION(gamma, 6, float, readonly);
  *
  * @note Epsilon parameter in the batch normalization equation should be given as a preprocessor argument using "#define EPSILON". e.g. "#define EPSILON 0.1"
  *
- * @param[in]  src_ptr                              Pointer to the first source tensor. Supported data types: F32
- * @param[in]  src_stride_x                         Stride of the first source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                         Stride of the first source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                         Stride of the first source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the first source tensor
- * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
- * @param[in]  mean_ptr                             Pointer to the mean source tensor. Supported data types: same as @p src_ptr
- * @param[in]  mean_stride_x                        Stride of the mean source tensor in X dimension (in bytes)
- * @param[in]  mean_step_x                          mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  mean_offset_first_element_in_bytes   The offset of the first element in the mean source tensor
- * @param[in]  var_ptr                              Pointer to the var tensor. Supported data types: same as @p src_ptr
- * @param[in]  var_stride_x                         Stride of the var tensor in X dimension (in bytes)
- * @param[in]  var_step_x                           var_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  var_offset_first_element_in_bytes    The offset of the first element in the var source tensor
- * @param[in]  beta_ptr                             Pointer to the beta source tensor. Supported data types: same as @p src_ptr
- * @param[in]  beta_stride_x                        Stride of the beta source tensor in X dimension (in bytes)
- * @param[in]  beta_step_x                          beta_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  beta_offset_first_element_in_bytes   The offset of the first element in the beta source tensor
- * @param[in]  gamma_ptr                            Pointer to the gamma source tensor. Supported data types: same as @p src_ptr
- * @param[in]  gamma_stride_x                       Stride of the gamma source tensor in X dimension (in bytes)
- * @param[in]  gamma_step_x                         gamma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  gamma_offset_first_element_in_bytes  The offset of the first element in the gamma source tensor
+ * @param[in]  src_ptr                             Pointer to the first source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                        Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                          src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                        Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                          src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                        Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                          src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes   The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                             Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                        Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                          dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                        Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                          dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                          dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes   The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                            Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mean_stride_x                       Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                         mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes  The offset of the first element in the mean source tensor
+ * @param[in]  var_ptr                             Pointer to the var tensor. Supported data types: same as @p src_ptr
+ * @param[in]  var_stride_x                        Stride of the var tensor in X dimension (in bytes)
+ * @param[in]  var_step_x                          var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  var_offset_first_element_in_bytes   The offset of the first element in the var source tensor
+ * @param[in]  beta_ptr                            Pointer to the beta source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  beta_stride_x                       Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  beta_step_x                         beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  beta_offset_first_element_in_bytes  The offset of the first element in the beta source tensor
+ * @param[in]  gamma_ptr                           Pointer to the gamma source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  gamma_stride_x                      Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  gamma_step_x                        gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
  */
 void main(void)
 {
@@ -138,38 +138,38 @@ BUFFER_DECLARATION(gamma, 6, uint, );
  *
  * @note Epsilon parameter in the batch normalization equation should be given as a preprocessor argument using "#define EPSILON". e.g. "#define EPSILON 0.1"
  *
- * @param[in]  src_ptr                              Pointer to the first source tensor. Supported data types: F16
- * @param[in]  src_stride_x                         Stride of the first source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                         Stride of the first source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                         Stride of the first source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the first source tensor
- * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
- * @param[in]  mean_ptr                             Pointer to the mean source tensor. Supported data types: same as @p src_ptr
- * @param[in]  mean_stride_x                        Stride of the mean source tensor in X dimension (in bytes)
- * @param[in]  mean_step_x                          mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  mean_offset_first_element_in_bytes   The offset of the first element in the mean source tensor
- * @param[in]  var_ptr                              Pointer to the var tensor. Supported data types: same as @p src_ptr
- * @param[in]  var_stride_x                         Stride of the var tensor in X dimension (in bytes)
- * @param[in]  var_step_x                           var_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  var_offset_first_element_in_bytes    The offset of the first element in the var source tensor
- * @param[in]  beta_ptr                             Pointer to the beta source tensor. Supported data types: same as @p src_ptr
- * @param[in]  beta_stride_x                        Stride of the beta source tensor in X dimension (in bytes)
- * @param[in]  beta_step_x                          beta_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  beta_offset_first_element_in_bytes   The offset of the first element in the beta source tensor
- * @param[in]  gamma_ptr                            Pointer to the gamma source tensor. Supported data types: same as @p src_ptr
- * @param[in]  gamma_stride_x                       Stride of the gamma source tensor in X dimension (in bytes)
- * @param[in]  gamma_step_x                         gamma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  gamma_offset_first_element_in_bytes  The offset of the first element in the gamma source tensor
+ * @param[in]  src_ptr                             Pointer to the first source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                        Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                          src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                        Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                          src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                        Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                          src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes   The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                             Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                        Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                          dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                        Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                          dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                          dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes   The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                            Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mean_stride_x                       Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                         mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes  The offset of the first element in the mean source tensor
+ * @param[in]  var_ptr                             Pointer to the var tensor. Supported data types: same as @p src_ptr
+ * @param[in]  var_stride_x                        Stride of the var tensor in X dimension (in bytes)
+ * @param[in]  var_step_x                          var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  var_offset_first_element_in_bytes   The offset of the first element in the var source tensor
+ * @param[in]  beta_ptr                            Pointer to the beta source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  beta_stride_x                       Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  beta_step_x                         beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  beta_offset_first_element_in_bytes  The offset of the first element in the beta source tensor
+ * @param[in]  gamma_ptr                           Pointer to the gamma source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  gamma_stride_x                      Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  gamma_step_x                        gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
  */
 void main(void)
 {
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
index 3a31cb80a7..071c1858bc 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
@@ -24,107 +24,88 @@
 
 layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
 
-#include "helpers.h"
+#include "helpers_cs.h"
 
-layout(std140) uniform shader_params
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note The convolution stride x must be passed at compile time using "#define STRIDE_X n" e.g. "#define STRIDE_X 1"
+ * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr          Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_attrs        The attributes of the source tensor
+ * @param[out] dst_ptr          Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_attrs        The attributes of the destination tensor
+ * @param[in]  weights_ptr      Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_attrs    The attributes of the weights tensor
+ * @param[in]  biases_ptr       Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_attrs     The attributes of the weights tensor
+ * @param[in]  weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth    The third dimensions of the weights tensors
+ */
+SHADER_PARAMS_DECLARATION
 {
-    TENSOR3D_PARAM_DECLARATION(src);
-    TENSOR3D_PARAM_DECLARATION(dst);
-    TENSOR3D_PARAM_DECLARATION(weights);
+    Tensor3DAttributes src_attrs;
+    Tensor3DAttributes dst_attrs;
+    Tensor3DAttributes weights_attrs;
 #ifdef BIAS
-    VECTOR_PARAM_DECLARATION(biases);
+    VectorAttributes biases_attrs;
 #endif /* BIAS */
     uint weights_stride_w;
     uint weights_depth;
 };
 
 #if defined(DATA_TYPE_FP32)
-precision highp float;
-
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, writeonly);
-BUFFER_DECLARATION(weights, 3, float, readonly);
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
 #ifdef BIAS
-BUFFER_DECLARATION(biases, 4, float, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
 #endif /* BIAS */
 
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
- * @note The convolution stride x must be passed at compile time using "#define STRIDE_X" e.g. "#define STRIDE_X 1"
- * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
- * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
- * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
- * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
- * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
- * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
- * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
- * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
- * @param[in]  weights_depth                         The third dimensions of the weights tensors
- */
 void main()
 {
-    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
-    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
 #ifdef BIAS
-    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
 #endif /* BIAS */
 
-    float pixels  = CONVERT(0, float);
+    float pixels  = 0.f;
     uint  z_index = gl_GlobalInvocationID.z;
-    weights.current_offset += z_index * weights_stride_w >> 2;
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
+
     float temp;
     float temp_weight;
-
     for(int d = 0; d < int(weights_depth); ++d)
     {
-        temp        = LOAD4(src, CURRENT_OFFSET(src));
-        temp_weight = LOAD4(weights, CURRENT_OFFSET(weights));
+        temp        = LOAD_CURRENT_ITEM(src_ptr, src_iter);
+        temp_weight = LOAD_CURRENT_ITEM(weights_ptr, weights_iter);
         pixels += temp * temp_weight;
 
-        src.current_offset += (src_stride_z >> 2);
-        weights.current_offset += (weights_stride_z >> 2);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
     }
 
 #ifdef BIAS
-    pixels += LOAD4(biases, vector_offset(biases, int(z_index)));
+    pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
 #endif /* BIAS */
 
-    STORE4(dst, CURRENT_OFFSET(dst), pixels);
+    STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
 }
 #elif defined(DATA_TYPE_FP16)
-precision mediump float;
 
-BUFFER_DECLARATION(src, 1, uvec4, readonly);
-BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
+TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
 #ifdef BIAS
-BUFFER_DECLARATION(biases, 4, uint, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
 #endif /* BIAS */
 
 #if STRIDE_X == 2
@@ -135,15 +116,10 @@ BUFFER_DECLARATION(biases, 4, uint, readonly);
 #error STRIDE_X larger than 2 is not supported
 #endif /* STRIDE_X == 2 */
 
-vec4[2] convolve_stride1(Image src, float w)
+vec4[2] convolve_stride1(ImageIterator src_iter, float w)
 {
-    uvec4 packed_s;
-    vec4  s[2];
-
-    GC_LOAD1_2D_OFFSET(packed_s, src, 0, 0);
-
-    s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y));
-    s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w));
+    vec4 s[2];
+    s = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
 
     s[0] *= w;
     s[1] *= w;
@@ -151,22 +127,14 @@ vec4[2] convolve_stride1(Image src, float w)
     return s;
 }
 
-vec4[2] convolve_stride2(Image src, float w)
+vec4[2] convolve_stride2(ImageIterator src_iter, float w)
 {
-    uvec4 packed_s;
-    vec4  s[2];
-    vec4  r[2];
-
-    GC_LOAD1_2D_OFFSET(packed_s, src, 0, 0);
-    s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y));
-    s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w));
+    vec4 s[2];
+    vec4 r[2];
 
+    s    = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
     r[0] = vec4(s[0].xz, s[1].xz);
-
-    GC_LOAD1_2D_OFFSET(packed_s, src, 8, 0);
-    s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y));
-    s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w));
-
+    s    = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 8, 0));
     r[1] = vec4(s[0].xz, s[1].xz);
 
     r[0] *= w;
@@ -175,51 +143,14 @@ vec4[2] convolve_stride2(Image src, float w)
     return r;
 }
 
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note The convolution stride x must be passed at compile time using "#define STRIDE_X" e.g. "#define STRIDE_X 1"
- * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
- * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
- * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
- * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
- * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
- * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
- * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
- * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
- * @param[in]  weights_depth                         The third dimensions of the weights tensors
- */
 void main()
 {
-    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
-    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
 #ifdef BIAS
-    Vector   biases  = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+    VectorIterator   biases_iter  = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
 #endif /* BIAS */
 
     vec4 pixels[2];
@@ -227,48 +158,41 @@ void main()
     pixels[1] = vec4(0.f);
 
     uint z_index = gl_GlobalInvocationID.z;
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
 
-    weights.current_offset += z_index * weights_stride_w;
-
-    uint  packed_w;
     float w;
-
     for(int d = 0; d < int(weights_depth); ++d)
     {
-        GC_LOAD1_3D_OFFSET(packed_w, weights, 0, 0, 0);
-        w = unpackHalf2x16(packed_w).x;
+        w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;
 
-        vec4 r[2] = CONVOLVE(src, w);
+        vec4 r[2] = CONVOLVE(src_iter, w);
         pixels[0] += r[0];
         pixels[1] += r[1];
 
-        src.current_offset += src_stride_z;
-        weights.current_offset += weights_stride_z;
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
     }
 
 #ifdef BIAS
-    uint  packed_b;
+    vec2  vec2_b;
     float b;
 
-    GC_LOAD1_1D_OFFSET(packed_b, biases, z_index);
+    vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
 
     if(z_index % uint(2) == uint(0))
     {
-        b = unpackHalf2x16(packed_b).x;
+        b = vec2_b.x;
     }
     else
     {
-        b = unpackHalf2x16(packed_b).y;
+        b = vec2_b.y;
     }
 
-    pixels[0] += vec4(b);
-    pixels[1] += vec4(b);
+    pixels[0] += b;
+    pixels[1] += b;
 #endif /* BIAS */
 
-    uvec4 packed_d;
-    packed_d = uvec4(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw),
-                     packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw));
-    GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
+    STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
 }
 #else  /* DATA_TYPE_FP32 */
 #error Data type not supported
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
index 67b92cb8cf..d450ac17e1 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
@@ -82,7 +82,7 @@ BUFFER_DECLARATION(biases, 4, float, readonly);
  * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
@@ -230,7 +230,7 @@ vec4[2] convolve1x3_stride2(uint offset, vec3 w)
  * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
@@ -376,7 +376,7 @@ vec4 convolve1x3_stride2(uint offset, vec3 w)
  * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
@@ -481,7 +481,7 @@ vec4 convolve1x3_stride1(vec4 left, vec4 middle, vec4 right, vec3 w)
  * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
@@ -647,7 +647,7 @@ vec4[3] load_and_unpack(uint offset)
  * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
@@ -903,7 +903,7 @@ vec4[3] load_and_unpack_stride2(uint offset)
  * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
@@ -1063,7 +1063,7 @@ vec4[2] load_and_unpack(uint offset)
  * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
@@ -1248,7 +1248,7 @@ vec4[2] load_and_unpack(uint offset)
  * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
@@ -1444,7 +1444,7 @@ vec4[2] load_and_unpack(uint offset)
  * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
index 4fdbf0d19e..f3b843de73 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
@@ -77,7 +77,7 @@ BUFFER_DECLARATION(biases, 4, float, readonly);
  * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
@@ -249,7 +249,7 @@ vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
  * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
index 3313b88718..ffa0ebb2af 100755
--- a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
@@ -618,6 +618,6 @@ void main(void)
     GC_STORE1_2D_OFFSET(packed_s[0], accum, 0, 0);
 }
 #endif /* GEMM_ACCUMULATE_BIASES */
-#else  /* DATA_TYPE_F32 */
+#else  /* DATA_TYPE_FP32 */
 #error Data type not supported
-#endif /* DATA_TYPE_F32 */
+#endif /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/helpers.h b/src/core/GLES_COMPUTE/cs_shaders/helpers.h
index 86dedf5a9c..ba27eec716 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/helpers.h
+++ b/src/core/GLES_COMPUTE/cs_shaders/helpers.h
@@ -56,7 +56,9 @@
     uint name##_stride_y;                      \
     uint name##_step_y;                        \
     uint name##_offset_first_element_in_bytes; \
-    uint name##_buffer_data_type_size
+    uint name##_buffer_data_type_size;         \
+    uint name##_padding1;                      \
+    uint name##_padding2
 
 #define TENSOR3D_PARAM_DECLARATION(name)       \
     uint name##_stride_x;                      \
diff --git a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
new file mode 100644
index 0000000000..ad67681067
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
@@ -0,0 +1,396 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_HELPER_CS_H
+#define ARM_COMPUTE_HELPER_CS_H
+
+#define SHADER_PARAMS_DECLARATION \
+    layout(std140, binding = 0) uniform shader_params
+
+#define TENSOR_DECLARATION(location, buffer_type, type, ptr_name, shift_name, element_shift, access) \
+    layout(std430, binding = location) access buffer buffer_type                                     \
+    {                                                                                                \
+        type ptr_name[];                                                                             \
+    };                                                                                               \
+    const uint shift_name = uint(element_shift)
+
+struct VectorAttributes
+{
+    uint stride_x;                      /**< Stride of the vector in X dimension (in bytes) */
+    uint step_x;                        /**< stride_x * number of elements along X processed per workitem (in bytes) */
+    uint offset_first_element_in_bytes; /**< The offset of the first element in the vector (in bytes) */
+    uint padding;                       /**< The padding to rounding up the structure to a multiple of a vec4 */
+};
+
+struct ImageAttributes
+{
+    uint stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    uint step_x;                        /**< stride_x * number of elements along X processed per workitem (in bytes) */
+    uint stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    uint step_y;                        /**< stride_y * number of elements along Y processed per workitem (in bytes) */
+    uint offset_first_element_in_bytes; /**< The offset of the first element in the image (in bytes) */
+    uint padding1;                      /**< The padding to rounding up the structure to a multiple of a vec4 */
+    uint padding2;                      /**< The padding to rounding up the structure to a multiple of a vec4 */
+    uint padding3;                      /**< The padding to rounding up the structure to a multiple of a vec4 */
+};
+
+struct Tensor3DAttributes
+{
+    uint stride_x;                      /**< Stride of the tensor in X dimension (in bytes) */
+    uint step_x;                        /**< stride_x * number of elements along X processed per workitem (in bytes) */
+    uint stride_y;                      /**< Stride of the tensor in Y dimension (in bytes) */
+    uint step_y;                        /**< stride_y * number of elements along Y processed per workitem (in bytes) */
+    uint stride_z;                      /**< Stride of the tensor in Z dimension (in bytes) */
+    uint step_z;                        /**< stride_z * number of elements along Z processed per workitem (in bytes) */
+    uint offset_first_element_in_bytes; /**< The offset of the first element in the tensor (in bytes) */
+    uint padding;                       /**< The padding to rounding up the structure to a multiple of a vec4 */
+};
+
+struct VectorIterator
+{
+    int current_offset_in_bytes; /**< Current offset of vector (in bytes) */
+    int stride_x;                /**< Stride of the vector in X dimension (in bytes) */
+    int element_shift;           /**< The number of bits to shift by for one element */
+};
+
+struct ImageIterator
+{
+    int current_offset_in_bytes; /**< Current offset of image (in bytes) */
+    int stride_x;                /**< Stride of the image in X dimension (in bytes) */
+    int stride_y;                /**< Stride of the image in Y dimension (in bytes) */
+    int element_shift;           /**< The number of bits to shift by for one element */
+};
+
+struct Tensor3DIterator
+{
+    int current_offset_in_bytes; /**< Current offset of tensor (in bytes) */
+    int stride_x;                /**< Stride of the tensor in X dimension (in bytes) */
+    int stride_y;                /**< Stride of the tensor in Y dimension (in bytes) */
+    int stride_z;                /**< Stride of the tensor in Z dimension (in bytes) */
+    int element_shift;           /**< The number of bits to shift by for one element */
+};
+
+#define CONVERT_TO_VECTOR_ITERATOR(attrs, element_shift)                          \
+    update_vector_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+                              attrs.stride_x, attrs.step_x)
+
+#define CONVERT_TO_VECTOR_ITERATOR_NO_STEP(attrs, element_shift)                  \
+    update_vector_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+                              attrs.stride_x, uint(0))
+
+#define CONVERT_TO_IMAGE_ITERATOR(attrs, element_shift)                          \
+    update_image_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+                             attrs.stride_x, attrs.step_x, attrs.stride_y, attrs.step_y)
+
+#define CONVERT_TO_IMAGE_ITERATOR_NO_STEP(attrs, element_shift)                  \
+    update_image_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+                             attrs.stride_x, uint(0), attrs.stride_y, uint(0))
+
+#define CONVERT_TO_TENSOR3D_ITERATOR(attrs, element_shift)                          \
+    update_tensor3D_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+                                attrs.stride_x, attrs.step_x, attrs.stride_y, attrs.step_y, attrs.stride_z, attrs.step_z)
+
+#define CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(attrs, element_shift)                  \
+    update_tensor3D_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+                                attrs.stride_x, uint(0), attrs.stride_y, uint(0), attrs.stride_z, uint(0))
+
+#define CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(attrs, element_shift)                               \
+    update_image_from_tensor3D_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+                                           attrs.stride_x, attrs.step_x, attrs.stride_y, attrs.step_y, attrs.stride_z, attrs.step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(attrs, element_shift)                       \
+    update_image_from_tensor3D_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+                                           attrs.stride_x, uint(0), attrs.stride_y, uint(0), attrs.stride_z, attrs.step_z)
+
+/** Wrap vector information into a VectorIterator structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] element_shift                 The number of bits to shift by for one element
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
+ *
+ * @return A VectorIterator object
+ */
+VectorIterator update_vector_iter_offset(uint element_shift, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    VectorIterator vector_iter;
+    vector_iter.element_shift           = int(element_shift);
+    vector_iter.stride_x                = int(stride_x);
+    vector_iter.current_offset_in_bytes = int(offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x);
+
+    return vector_iter;
+}
+
+/** Wrap image information into an ImageIterator structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] element_shift                 The number of bits to shift by for one element
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
+ *
+ * @return An ImageIterator object
+ */
+ImageIterator update_image_iter_offset(uint element_shift, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    ImageIterator image_iter;
+    image_iter.element_shift           = int(element_shift);
+    image_iter.stride_x                = int(stride_x);
+    image_iter.stride_y                = int(stride_y);
+    image_iter.current_offset_in_bytes = int(offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y);
+
+    return image_iter;
+}
+
+/** Wrap 3D tensor information into a Tensor3DIterator structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] element_shift                 The number of bits to shift by for one element
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source tersor
+ * @param[in] stride_x                      Stride of the tersor in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] stride_y                      Stride of the tersor in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] stride_z                      Stride of the tersor in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
+ *
+ * @return A 3D Tensor3DIterator object
+ */
+Tensor3DIterator update_tensor3D_iter_offset(uint element_shift, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3DIterator tensor_iter;
+    tensor_iter.element_shift           = int(element_shift);
+    tensor_iter.stride_x                = int(stride_x);
+    tensor_iter.stride_y                = int(stride_y);
+    tensor_iter.stride_z                = int(stride_z);
+    tensor_iter.current_offset_in_bytes = int(offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z);
+
+    return tensor_iter;
+}
+
+/** Wrap 3D tensor information into an ImageIterator structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] element_shift                 The number of bits to shift by for one element
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
+ *
+ * @return An ImageIterator object
+ */
+ImageIterator update_image_from_tensor3D_iter_offset(uint element_shift, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    ImageIterator image_iter;
+    image_iter.element_shift           = int(element_shift);
+    image_iter.stride_x                = int(stride_x);
+    image_iter.stride_y                = int(stride_y);
+    image_iter.current_offset_in_bytes = int(offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z);
+
+    return image_iter;
+}
+
+#define VECTOR_OFFSET(tensor_iter, x) \
+    uint(vector_offset_in_bytes(tensor_iter, int(x)) >> tensor_iter.element_shift)
+
+#define IMAGE_OFFSET(tensor_iter, x, y) \
+    uint(image_offset_in_bytes(tensor_iter, int(x), int(y)) >> tensor_iter.element_shift)
+
+#define TENSOR3D_OFFSET(tensor_iter, x, y, z) \
+    uint(tensor3D_offset_in_bytes(tensor_iter, int(x), int(y), int(z)) >> tensor_iter.element_shift)
+
+#define CURRENT_ITEM_OFFSET(tensor_iter) \
+    uint(tensor_iter.current_offset_in_bytes >> tensor_iter.element_shift)
+
+#define CURRENT_ITEM_OFFSET_IN_BYTES(tensor_iter) \
+    uint(tensor_iter.current_offset_in_bytes)
+
+#define TENSOR_ITERATOR_ADVANCE_IN_BYTES(tensor_iter, n) \
+    tensor_iter.current_offset_in_bytes += int(n)
+
+/** Get the offset of a VectorIterator
+ *
+ * @param[in] vector_iter The VectorIterator object pointed to the starting position of the buffer
+ * @param[in] x           Relative X position
+ *
+ * @return The relative offset of the VectorIterator object (in bytes)
+ */
+uint vector_offset_in_bytes(VectorIterator vector_iter, int x)
+{
+    return uint(vector_iter.current_offset_in_bytes + x * vector_iter.stride_x);
+}
+
+/** Get the offset of an ImageIterator
+ *
+ * @param[in] vector_iter The ImageIterator object pointed to the starting position of the buffer
+ * @param[in] x           Relative X position
+ * @param[in] y           Relative Y position
+ *
+ * @return The relative offset of the ImageIterator object (in bytes)
+ */
+uint image_offset_in_bytes(ImageIterator image_iter, int x, int y)
+{
+    return uint(image_iter.current_offset_in_bytes + x * image_iter.stride_x + y * image_iter.stride_y);
+}
+
+/** Get the offset of a Tensor3DIterator
+ *
+ * @param[in] vector_iter The Tensor3DIterator object pointed to the starting position of the buffer
+ * @param[in] x           Relative X position
+ * @param[in] y           Relative Y position
+ * @param[in] z           Relative Z position
+ *
+ * @return The relative offset of the Tensor3DIterator object (in bytes)
+ */
+uint tensor3D_offset_in_bytes(Tensor3DIterator tensor_iter, int x, int y, int z)
+{
+    return uint(tensor_iter.current_offset_in_bytes + x * tensor_iter.stride_x + y * tensor_iter.stride_y + z * tensor_iter.stride_z);
+}
+
+#define LOAD(tensor_ptr, offset) tensor_ptr[offset]
+#define STORE(tensor_ptr, offset, data) tensor_ptr[offset] = data
+#define LOAD_CURRENT_ITEM(tensor_ptr, tensor_iter) tensor_ptr[CURRENT_ITEM_OFFSET(tensor_iter)]
+#define STORE_CURRENT_ITEM(tensor_ptr, tensor_iter, data) tensor_ptr[CURRENT_ITEM_OFFSET(tensor_iter)] = data
+
+#define VLOAD2(return_type, tensor_ptr, offset) \
+    return_type(LOAD(tensor_ptr, offset),       \
+                LOAD(tensor_ptr, (offset) + uint(1)))
+
+#define VSTORE2(tensor_ptr, offset, data) \
+    STORE(tensor_ptr, offset, data[0]);   \
+    STORE(tensor_ptr, (offset) + uint(1), data[1])
+
+#define VLOAD2_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD2(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE2_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE2(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+#define VLOAD3(return_type, tensor_ptr, offset)       \
+    return_type(LOAD(tensor_ptr, offset),             \
+                LOAD(tensor_ptr, (offset) + uint(1)), \
+                LOAD(tensor_ptr, (offset) + uint(2)))
+
+#define VSTORE3(tensor_ptr, offset, data)           \
+    STORE(tensor_ptr, offset, data[0]);             \
+    STORE(tensor_ptr, (offset) + uint(1), data[1]); \
+    STORE(tensor_ptr, (offset) + uint(2), data[2])
+
+#define VLOAD3_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD3(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE3_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE3(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+#define VLOAD4(return_type, tensor_ptr, offset)       \
+    return_type(LOAD(tensor_ptr, offset),             \
+                LOAD(tensor_ptr, (offset) + uint(1)), \
+                LOAD(tensor_ptr, (offset) + uint(2)), \
+                LOAD(tensor_ptr, (offset) + uint(3)))
+
+#define VSTORE4(tensor_ptr, offset, data)           \
+    STORE(tensor_ptr, offset, data[0]);             \
+    STORE(tensor_ptr, (offset) + uint(1), data[1]); \
+    STORE(tensor_ptr, (offset) + uint(2), data[2]); \
+    STORE(tensor_ptr, (offset) + uint(3), data[3])
+
+#define VLOAD4_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD4(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE4_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE4(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+/** Converting the vec4 object to 4 half-precision (16-bits) floating point values and packing into a uvec2 object
+ *
+ * @param[in] data The vec4 object to be packed
+ *
+ * @return The packed uvec2 object
+ */
+highp uvec2 pack4_half(mediump vec4 data)
+{
+    return uvec2(packHalf2x16(data.xy), packHalf2x16(data.zw));
+}
+
+/** Unpacking the uvec2 object to 4 half-precision (16-bits) floating point values and converting to a vec4 object
+ *
+ * @param[in] packed_data The uvec2 object to be unpacked
+ *
+ * @return The unpacked vec4 object
+ */
+mediump vec4 unpack4_half(highp uvec2 packed_data)
+{
+    return vec4(unpackHalf2x16(packed_data.x), unpackHalf2x16(packed_data.y));
+}
+
+/** Converting the vec4[2] object to 8 half-precision (16-bits) floating point values and packing into a uvec4 object
+ *
+ * @param[in] data The vec4[2] object to be packed
+ *
+ * @return The packed uvec4 object
+ */
+highp uvec4 pack8_half(mediump vec4 data[2])
+{
+    return uvec4(packHalf2x16(data[0].xy), packHalf2x16(data[0].zw),
+                 packHalf2x16(data[1].xy), packHalf2x16(data[1].zw));
+}
+
+/** Unpacking the uvec4 object to 8 half-precision (16-bits) floating point values and converting to a vec4[2] object
+ *
+ * @param[in] packed_data The uvec4 object to be unpacked
+ *
+ * @return The unpacked vec4[2] object
+ */
+mediump vec4[2] unpack8_half(highp uvec4 packed_data)
+{
+    return vec4[2](vec4(unpackHalf2x16(packed_data.x), unpackHalf2x16(packed_data.y)),
+                   vec4(unpackHalf2x16(packed_data.z), unpackHalf2x16(packed_data.w)));
+}
+
+// For half-precision (16-bits) floating point packed into a "uint" element
+#define LOAD_UNPACK2_HALF(tensor_ptr, offset) unpackHalf2x16(LOAD(tensor_ptr, offset))
+#define STORE_PACK2_HALF(tensor_ptr, offset, data) STORE(tensor_ptr, offset, packHalf2x16(data))
+#define LOAD_UNPACK2_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) LOAD_UNPACK2_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define STORE_PACK2_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) STORE_PACK2_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+#define VLOAD2_UNPACK4_HALF(tensor_ptr, offset) unpack4_half(VLOAD2(uvec2, tensor_ptr, offset))
+#define VSTORE2_PACK4_HALF(tensor_ptr, offset, data) VSTORE2(tensor_ptr, offset, pack4_half(data))
+#define VLOAD2_UNPACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD2_UNPACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE2_PACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) VSTORE2_PACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+#define VLOAD4_UNPACK8_HALF(tensor_ptr, offset) unpack8_half(VLOAD4(uvec4, tensor_ptr, offset))
+#define VSTORE4_PACK8_HALF(tensor_ptr, offset, data) VSTORE4(tensor_ptr, offset, pack8_half(data))
+#define VLOAD4_UNPACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD4_UNPACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE4_PACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) VSTORE4_PACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+// For half-precision (16-bits) floating point packed into a "uvec2" element
+#define LOAD_UNPACK4_HALF(tensor_ptr, offset) unpack4_half(LOAD(tensor_ptr, offset))
+#define STORE_PACK4_HALF(tensor_ptr, offset, data) STORE(tensor_ptr, offset, pack4_half(data))
+#define LOAD_UNPACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) LOAD_UNPACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define STORE_PACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) STORE_PACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+#define VLOAD2_UNPACK8_HALF(tensor_ptr, offset) unpack8_half(VLOAD2(uvec4, tensor_ptr, offset))
+#define VSTORE2_PACK8_HALF(tensor_ptr, offset, data) VSTORE2(tensor_ptr, offset, pack8_half(data))
+#define VLOAD2_UNPACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD2_UNPACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE2_PACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) VSTORE2_PACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+// For half-precision (16-bits) floating point packed into a "uvec4" element
+#define LOAD_UNPACK8_HALF(tensor_ptr, offset) unpack8_half(LOAD(tensor_ptr, offset))
+#define STORE_PACK8_HALF(tensor_ptr, offset, data) STORE(tensor_ptr, offset, pack8_half(data))
+#define LOAD_UNPACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) LOAD_UNPACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define STORE_PACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) STORE_PACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+#endif // ARM_COMPUTE_HELPER_CS_H
diff --git a/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
index 5699340c14..166953ffc0 100755
--- a/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
@@ -45,30 +45,30 @@ BUFFER_DECLARATION(dst, 3, float, writeonly);
  * @note KAPPA parameter in the normalization equation should be given as a preprocessor argument using "#define KAPPA x"
  * @note Number of elements on the right or left side to normalize across should be given as a preprocessor argument using "#define RADIUS x"
  *
- * @param[in]  src1_ptr                                    Pointer to the first source tensor. Supported data types: F32
- * @param[in]  src1_stride_x                               Stride of the first source tensor in X dimension (in bytes)
- * @param[in]  src1_step_x                                 src1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src1_stride_y                               Stride of the first source tensor in Y dimension (in bytes)
- * @param[in]  src1_step_y                                 src1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src1_stride_z                               Stride of the first source tensor in Z dimension (in bytes)
- * @param[in]  src1_step_z                                 src1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src1_offset_first_element_in_bytes          The offset of the first element in the first source tensor
- * @param[in]  src2_ptr                                    Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
- * @param[in]  src2_stride_x                               Stride of the second source tensor in X dimension (in bytes)
- * @param[in]  src2_step_x                                 src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src2_stride_y                               Stride of the second source tensor in Y dimension (in bytes)
- * @param[in]  src2_step_y                                 src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src2_stride_z                               Stride of the second source tensor in Z dimension (in bytes)
- * @param[in]  src2_step_z                                 src2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src2_offset_first_element_in_bytes          The offset of the second element in the second source tensor
- * @param[out] dst_ptr                                     Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
- * @param[in]  dst_stride_x                                Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                                  dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                                Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                                  dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                                Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                                  dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes           The offset of the first element in the destination tensor
+ * @param[in]  src1_ptr                           Pointer to the first source tensor. Supported data types: F32
+ * @param[in]  src1_stride_x                      Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src1_step_z                        src1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in]  src2_ptr                           Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
+ * @param[in]  src2_stride_x                      Stride of the second source tensor in X dimension (in bytes)
+ * @param[in]  src2_step_x                        src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      Stride of the second source tensor in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_stride_z                      Stride of the second source tensor in Z dimension (in bytes)
+ * @param[in]  src2_step_z                        src2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes The offset of the second element in the second source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
  */
 void main(void)
 {
@@ -104,30 +104,30 @@ void main(void)
  * @note KAPPA parameter in the normalization equation should be given as a preprocessor argument using "#define KAPPA x"
  * @note Number of elements on the right or left side to normalize across should be given as a preprocessor argument using "#define RADIUS x"
  *
- * @param[in]  src1_ptr                                    Pointer to the first source tensor. Supported data types: F32
- * @param[in]  src1_stride_x                               Stride of the first source tensor in X dimension (in bytes)
- * @param[in]  src1_step_x                                 src1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src1_stride_y                               Stride of the first source tensor in Y dimension (in bytes)
- * @param[in]  src1_step_y                                 src1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src1_stride_z                               Stride of the first source tensor in Z dimension (in bytes)
- * @param[in]  src1_step_z                                 src1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src1_offset_first_element_in_bytes          The offset of the first element in the first source tensor
- * @param[in]  src2_ptr                                    Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
- * @param[in]  src2_stride_x                               Stride of the second source tensor in X dimension (in bytes)
- * @param[in]  src2_step_x                                 src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src2_stride_y                               Stride of the second source tensor in Y dimension (in bytes)
- * @param[in]  src2_step_y                                 src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src2_stride_z                               Stride of the second source tensor in Z dimension (in bytes)
- * @param[in]  src2_step_z                                 src2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src2_offset_first_element_in_bytes          The offset of the second element in the second source tensor
- * @param[out] dst_ptr                                     Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
- * @param[in]  dst_stride_x                                Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                                  dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                                Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                                  dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                                Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                                  dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes           The offset of the first element in the destination tensor
+ * @param[in]  src1_ptr                           Pointer to the first source tensor. Supported data types: F32
+ * @param[in]  src1_stride_x                      Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src1_step_z                        src1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in]  src2_ptr                           Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
+ * @param[in]  src2_stride_x                      Stride of the second source tensor in X dimension (in bytes)
+ * @param[in]  src2_step_x                        src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      Stride of the second source tensor in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_stride_z                      Stride of the second source tensor in Z dimension (in bytes)
+ * @param[in]  src2_step_z                        src2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes The offset of the second element in the second source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
  */
 void main(void)
 {
diff --git a/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
index 0bbabeaafc..1a2c3f7b20 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
@@ -24,99 +24,60 @@
 
 layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
 
-#include "helpers.h"
+#include "helpers_cs.h"
 
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
+
+// Common definitions
 #define MAX_OP(x, y) max((x), (y))
 #define ADD_OP(x, y) ((x) + (y))
 #define SUB_OP(x, y) ((x) - (y))
 #define DIV_OP(x, y) ((x) / (y))
 #define EXP_OP(x) exp((x))
 
-#if defined(DATA_TYPE_FP32)
-const float MINVAL   = -1.0 / 0.0;
-vec4        type_min = CONVERT(MINVAL, vec4);
-
-#define LOAD16(name, offset)            \
-    vec4(LOAD4(name, offset),           \
-         LOAD4(name, offset + uint(1)), \
-         LOAD4(name, offset + uint(2)), \
-         LOAD4(name, offset + uint(3)))
-
-#define STORE16(name, offset, value)         \
-    STORE4(name, offset, value.x);           \
-    STORE4(name, offset + uint(1), value.y); \
-    STORE4(name, offset + uint(2), value.z); \
-    STORE4(name, offset + uint(3), value.w)
+const float float_min = -1.0 / 0.0;
+const vec4  vec4_min  = vec4(float_min);
 
 #ifdef SOFTMAX_LAYER_MAX
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, writeonly);
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(max, 2, float, readonly);
-BUFFER_DECLARATION(dst, 3, float, writeonly);
-BUFFER_DECLARATION(sum, 4, float, writeonly);
-#elif defined(SOFTMAX_LAYER_NORM)
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(sum, 2, float, readonly);
-BUFFER_DECLARATION(dst, 3, float, writeonly);
-#endif // SOFTMAX_LAYER_MAX
 
-layout(std140) uniform shader_params
-{
-#ifdef SOFTMAX_LAYER_MAX
-    TENSOR3D_PARAM_DECLARATION(src);
-    TENSOR3D_PARAM_DECLARATION(dst);
-    uint width;
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
-    TENSOR3D_PARAM_DECLARATION(src);
-    TENSOR3D_PARAM_DECLARATION(max);
-    TENSOR3D_PARAM_DECLARATION(dst);
-    TENSOR3D_PARAM_DECLARATION(sum);
-    uint width;
-#elif defined(SOFTMAX_LAYER_NORM)
-    TENSOR3D_PARAM_DECLARATION(src);
-    TENSOR3D_PARAM_DECLARATION(sum);
-    TENSOR3D_PARAM_DECLARATION(dst);
-#endif // SOFTMAX_LAYER_MAX
-};
-
-#ifdef SOFTMAX_LAYER_MAX
 /** Identifies the maximum value across the 1st dimension.
  *
- * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP32"
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note In case the input is not multiple of 4 NON_MULTIPLE_OF_4 must be passed.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: F32
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  width                             Input image width
+ * @param[in]  src_ptr   Pointer to the source tensor slice. Supported data types: F16/F32
+ * @param[in]  src_attrs The attributes of the source tensor
+ * @param[out] dst_ptr   Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_attrs The attributes of the destination tensor
+ * @param[in]  width     Input image width
  */
+SHADER_PARAMS_DECLARATION
+{
+    Tensor3DAttributes src_attrs;
+    Tensor3DAttributes dst_attrs;
+    uint               width;
+};
+
+#if defined(DATA_TYPE_FP32)
+
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+
 void main(void)
 {
-    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
 
     // Initialize local maximum
-    vec4 max_val = CONVERT(type_min, vec4);
+    vec4 max_val = vec4_min;
 
     // Calculate max of row
     uint width2 = width >> 2;
     for(int i = 0; i < int(width2); i++)
     {
-        vec4 data = LOAD16(src, offset(src, i << 2, 0));
+        vec4 data = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, i << 2, 0));
         max_val   = MAX_OP(data, max_val);
     }
 
@@ -124,7 +85,7 @@ void main(void)
     // Handle non multiple of 4
     for(int i = int(width2 << 2); i < int(width); i++)
     {
-        float data = LOAD4(src, offset(src, i, 0));
+        float data = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
         max_val.x  = MAX_OP(data, max_val.x);
     }
 #endif /* NON_MULTIPLE_OF_4 */
@@ -134,408 +95,247 @@ void main(void)
     max_val.x  = MAX_OP(max_val.x, max_val.y);
 
     // Store result
-    STORE4(dst, CURRENT_OFFSET(dst), max_val.x);
+    STORE_CURRENT_ITEM(dst_ptr, dst_iter, max_val.x);
 }
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM) // SOFTMAX_LAYER_MAX
-/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
- * then gets the exponent of each element as sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP32"
- *
- * @note In case the input is not multiple of 4 NON_MULTIPLE_OF_4 must be passed.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: F32
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in]  max_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  max_stride_x                      Stride of the max values tensor in X dimension (in bytes)
- * @param[in]  max_step_x                        max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  max_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
- * @param[in]  max_step_y                        max_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  max_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
- * @param[in]  max_step_z                        max_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  max_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
- * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in]  sum_step_y                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[in]  width                             Input image width
- */
+#elif defined(DATA_TYPE_FP16)
+
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
+
 void main(void)
 {
-    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
-    Image max = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
-    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+    ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
 
-    // Load max value of 1D logits vector (row)
-    vec4 max_val = CONVERT(LOAD4(max, CURRENT_OFFSET(max)), vec4);
-
-    // Set sum vector
-    vec4 sum1D = CONVERT(0, vec4);
+    // Initialize local maximum
+    vec4 max_val = vec4_min;
 
-    // Shift values, exp and sum
+    // Calculate max of row
     uint width2 = width >> 2;
     for(int i = 0; i < int(width2); i++)
     {
-        vec4 data = LOAD16(src, offset(src, i << 2, 0));
-        data      = SUB_OP(data, max_val);
-        data      = EXP_OP(data);
-        STORE16(dst, offset(dst, i << 2, 0), data);
-        sum1D = ADD_OP(sum1D, data);
+        vec4 data = VLOAD2_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, i << 2, 0));
+        max_val   = MAX_OP(data, max_val);
     }
 
 #ifdef NON_MULTIPLE_OF_4
     // Handle non multiple of 4
-    for(int i = int(width2 << 2); i < int(width); i++)
+    for(int i = int(width2 << 2); i < int(width); i = i + 2)
     {
-        float data;
-        data = LOAD4(src, offset(src, i, 0));
-        data = SUB_OP(data, max_val.x);
-        data = EXP_OP(data);
-        STORE4(dst, offset(dst, i, 0), data);
-        sum1D.x = ADD_OP(sum1D.x, data);
+        vec2 data = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
+        max_val.x = MAX_OP(data.x, max_val.x);
+        if((i + 1) < int(width))
+        {
+            max_val.x = MAX_OP(data.y, max_val.x);
+        }
     }
-#endif                            /* NON_MULTIPLE_OF_4 */
+#endif /* NON_MULTIPLE_OF_4 */
 
-    // Perform min/max reduction
-    sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw);
-    sum1D.x  = ADD_OP(sum1D.x, sum1D.y);
+    // Perform max reduction
+    max_val.xy = MAX_OP(max_val.xy, max_val.zw);
+    max_val.x  = MAX_OP(max_val.x, max_val.y);
 
-    // Calculate and store result
-    STORE4(sum, CURRENT_OFFSET(sum), sum1D.x);
+    STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, max_val.xy);
 }
-#elif defined(SOFTMAX_LAYER_NORM) // SOFTMAX_LAYER_MAX
-/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
+#else  // DATA_TYPE_FP32
+#error Data type not supported
+#endif // DATA_TYPE_FP32
+#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
+
+/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
+ * then gets the exponent of each element as sums all elements across each row.
  *
- * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP32"
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note In case the input is not multiple of 4 NON_MULTIPLE_OF_4 must be passed.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: F32
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in]  sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
- * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_ptr   Pointer to the source tensor slice. Supported data types: F16/F32
+ * @param[in]  src_attrs The attributes of the source tensor
+ * @param[in]  max_ptr   Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  max_attrs The attributes of the max values tensor
+ * @param[out] dst_ptr   Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_attrs The attributes of the destination tensor
+ * @param[out] sum_ptr   Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_attrs The attributes of the sum values tensor
+ * @param[in]  width     Input image width
  */
-void main(void)
-{
-    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
-    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
-
-    // Load max value of 1D logits vector (row)
-    vec4 sum_val = CONVERT(LOAD4(sum, offset(sum, 0, int(gl_GlobalInvocationID.y))), vec4);
-    vec4 data    = LOAD16(src, CURRENT_OFFSET(src));
-    STORE16(dst, CURRENT_OFFSET(dst), DIV_OP(data, sum_val));
-}
-#endif                            // SOFTMAX_LAYER_MAX
-
-#elif defined(DATA_TYPE_FP16)
-precision mediump float;
-
-const float MINVAL1   = -1.0 / 0.0;
-vec4        type_min1 = CONVERT(MINVAL1, vec4);
-
-#define GC_LOAD4_IMAGE(r, name, x, y)  \
-    load_and_unpack(r.xy, name, x, y); \
-    load_and_unpack(r.zw, name, (x + 2), y)
-
-#define GC_STORE4_IMAGE(r, name, x, y)                         \
-    GC_STORE1_2D_OFFSET(uint(packHalf2x16(r.xy)), name, x, y); \
-    GC_STORE1_2D_OFFSET(uint(packHalf2x16(r.zw)), name, (x + 2), y)
-
-#ifdef SOFTMAX_LAYER_MAX
-BUFFER_DECLARATION(src, 1, uint, readonly);
-BUFFER_DECLARATION(dst, 2, uint, writeonly);
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
-BUFFER_DECLARATION(src, 1, uint, readonly);
-BUFFER_DECLARATION(max, 2, uint, readonly);
-BUFFER_DECLARATION(dst, 3, uint, writeonly);
-BUFFER_DECLARATION(sum, 4, uint, writeonly);
-#elif defined(SOFTMAX_LAYER_NORM)
-BUFFER_DECLARATION(src, 1, uint, readonly);
-BUFFER_DECLARATION(sum, 2, uint, readonly);
-BUFFER_DECLARATION(dst, 3, uint, writeonly);
-#endif // SOFTMAX_LAYER_MAX
-
-layout(std140) uniform shader_params
+SHADER_PARAMS_DECLARATION
 {
-#ifdef SOFTMAX_LAYER_MAX
-    TENSOR3D_PARAM_DECLARATION(src);
-    TENSOR3D_PARAM_DECLARATION(dst);
-    uint width;
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
-    TENSOR3D_PARAM_DECLARATION(src);
-    TENSOR3D_PARAM_DECLARATION(max);
-    TENSOR3D_PARAM_DECLARATION(dst);
-    TENSOR3D_PARAM_DECLARATION(sum);
-    uint width;
-#elif defined(SOFTMAX_LAYER_NORM)
-    TENSOR3D_PARAM_DECLARATION(src);
-    TENSOR3D_PARAM_DECLARATION(sum);
-    TENSOR3D_PARAM_DECLARATION(dst);
-#endif // SOFTMAX_LAYER_MAX
+    Tensor3DAttributes src_attrs;
+    Tensor3DAttributes max_attrs;
+    Tensor3DAttributes dst_attrs;
+    Tensor3DAttributes sum_attrs;
+    uint               width;
 };
+#if defined(DATA_TYPE_FP32)
 
-#define load_and_unpack(rs, names, xs, ys)           \
-    do                                               \
-    {                                                \
-        uint packed_s;                               \
-        GC_LOAD1_2D_OFFSET(packed_s, names, xs, ys); \
-        rs = vec2(unpackHalf2x16(packed_s));         \
-    } while(false)
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, maxBuffer, float, max_ptr, max_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(4, sumBuffer, float, sum_ptr, sum_shift, 2, writeonly);
 
-#ifdef SOFTMAX_LAYER_MAX
-/** Identifies the maximum value across the 1st dimension.
- *
- * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP16"
- *
- * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  width                             Input image width
- */
 void main(void)
 {
-    Image src = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
-    Image dst = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+    ImageIterator max_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(max_attrs, max_shift);
+    ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(sum_attrs, sum_shift);
 
-    // Initialize local maximum
-    vec4 max_val1 = CONVERT(type_min1, vec4);
+    // Load max value of 1D logits vector (row)
+    vec4 max_val = vec4(LOAD_CURRENT_ITEM(max_ptr, max_iter));
 
-    // Calculate max of row
+    // Set sum vector
+    vec4 sum1D = vec4(0);
+
+    // Shift values, exp and sum
     uint width2 = width >> 2;
     for(int i = 0; i < int(width2); i++)
     {
-        vec4 data1;
-        GC_LOAD4_IMAGE(data1, src, (i << 2), 0);
-        max_val1 = MAX_OP(data1, max_val1);
+        vec4 data = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, i << 2, 0));
+        data      = SUB_OP(data, max_val);
+        data      = EXP_OP(data);
+        VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, i << 2, 0), data);
+        sum1D = ADD_OP(sum1D, data);
     }
 
 #ifdef NON_MULTIPLE_OF_4
     // Handle non multiple of 4
-    for(int i = int(width2 << 2); i < int(width); i = i + 2)
+    for(int i = int(width2 << 2); i < int(width); i++)
     {
-        vec2 data;
-        load_and_unpack(data, src, i, 0);
-        max_val1.x = MAX_OP(data.x, max_val1.x);
-        if((i + 1) < int(width))
-        {
-            max_val1.x = MAX_OP(data.y, max_val1.x);
-        }
+        float data = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
+        data       = SUB_OP(data, max_val.x);
+        data       = EXP_OP(data);
+        STORE(dst_ptr, IMAGE_OFFSET(dst_iter, i, 0), data);
+        sum1D.x = ADD_OP(sum1D.x, data);
     }
-#endif                                     /* NON_MULTIPLE_OF_4 */
+#endif /* NON_MULTIPLE_OF_4 */
 
-    // Perform max reduction
-    max_val1.xy = MAX_OP(max_val1.xy, max_val1.zw);
-    max_val1.x  = MAX_OP(max_val1.x, max_val1.y);
-    vec2 res1   = vec2(max_val1.x, 0.f);
-    uint res;
-    res = uint(packHalf2x16(res1));
+    // Perform min/max reduction
+    sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw);
+    sum1D.x  = ADD_OP(sum1D.x, sum1D.y);
 
-    // Store result
-    GC_STORE1_2D_OFFSET(res, dst, 0, 0);
+    // Calculate and store result
+    STORE_CURRENT_ITEM(sum_ptr, sum_iter, sum1D.x);
 }
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM) // SOFTMAX_LAYER_MAX
-/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
- * then gets the exponent of each element as sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP16"
- *
- * @note In case the input is not multiple of 4 NON_MULTIPLE_OF_4 must be passed.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in]  max_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  max_stride_x                      Stride of the max values tensor in X dimension (in bytes)
- * @param[in]  max_step_x                        max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  max_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
- * @param[in]  max_step_y                        max_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  max_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
- * @param[in]  max_step_z                        max_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  max_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
- * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in]  sum_step_y                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[in]  width                             Input image width
- */
+#elif defined(DATA_TYPE_FP16)
+
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, maxBuffer, uint, max_ptr, max_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(4, sumBuffer, uint, sum_ptr, sum_shift, 2, writeonly);
+
 void main(void)
 {
-    Image src = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
-    Image dst = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
-    Image max = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
-    Image sum = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+    ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+    ImageIterator max_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(max_attrs, max_shift);
+    ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(sum_attrs, sum_shift);
 
     // Load max value of 1D logits vector (row)
-    vec2 datamaxinit;
-    load_and_unpack(datamaxinit, max, 0, 0);
-    vec4 max_val = CONVERT(datamaxinit.x, vec4);
+    vec2 datamaxinit = LOAD_UNPACK2_CURRENT_ITEM_HALF(max_ptr, max_iter);
+    vec4 max_val     = vec4(datamaxinit.x);
 
     // Set sum vector
-    vec4 sum1D1 = CONVERT(0.f, vec4);
+    vec4 sum1D = vec4(0.f);
 
     // Shift values, exp and sum
     uint width2 = width >> 2;
     for(int i = 0; i < int(width2); i++)
     {
-        vec4 data;
-        GC_LOAD4_IMAGE(data, src, (i << 2), 0);
-        data = SUB_OP(data, max_val);
-        data = EXP_OP(data);
-        GC_STORE4_IMAGE(data, dst, (i << 2), 0);
-        sum1D1 = ADD_OP(sum1D1, data);
+        vec4 data = VLOAD2_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, i << 2, 0));
+        data      = SUB_OP(data, max_val);
+        data      = EXP_OP(data);
+        VSTORE2_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, i << 2, 0), data);
+        sum1D = ADD_OP(sum1D, data);
     }
 
 #ifdef NON_MULTIPLE_OF_4
     // Handle non multiple of 4
     for(int i = int(width2 << 2); i < int(width); i = i + 2)
     {
-        vec2  datamiddle;
-        float data1;
-        load_and_unpack(datamiddle, src, i, 0);
-        data1 = SUB_OP(datamiddle.x, max_val.x);
-        data1 = EXP_OP(data1);
-        vec2 datares1;
+        float data;
+        vec2  datamiddle = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
+        data             = SUB_OP(datamiddle.x, max_val.x);
+        data             = EXP_OP(data);
+        vec2 datares;
         if((i + 1) < int(width))
         {
             float data2;
-            data2    = SUB_OP(datamiddle.y, max_val.x);
-            data2    = EXP_OP(data2);
-            datares1 = vec2(data1, data2);
-            data1    = ADD_OP(data2, data1);
+            data2   = SUB_OP(datamiddle.y, max_val.x);
+            data2   = EXP_OP(data2);
+            datares = vec2(data, data2);
+            data    = ADD_OP(data2, data);
         }
         else
         {
-            datares1 = vec2(data1, 0.f);
+            datares = vec2(data, 0.f);
         }
-        uint datares;
-        datares = uint(packHalf2x16(datares1));
-        GC_STORE1_2D_OFFSET(datares, dst, i, 0);
-        sum1D1.x = ADD_OP(sum1D1.x, data1);
+
+        STORE_PACK2_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, i, 0), datares);
+
+        sum1D.x = ADD_OP(sum1D.x, data);
     }
-#endif                            /* NON_MULTIPLE_OF_4 */
+#endif /* NON_MULTIPLE_OF_4 */
 
     // Perform min/max reduction
-    sum1D1.xy = ADD_OP(sum1D1.xy, sum1D1.zw);
-    sum1D1.x  = ADD_OP(sum1D1.x, sum1D1.y);
-    vec2 res1 = vec2(sum1D1.x, 0.f);
-    uint res;
-    res = uint(packHalf2x16(res1));
+    sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw);
+    sum1D.x  = ADD_OP(sum1D.x, sum1D.y);
+
     // Calculate and store result
-    GC_STORE1_2D_OFFSET(res, sum, 0, 0);
+    STORE_PACK2_CURRENT_ITEM_HALF(sum_ptr, sum_iter, sum1D.xy);
 }
-#elif defined(SOFTMAX_LAYER_NORM) // SOFTMAX_LAYER_MAX
+#else  // DATA_TYPE_FP32
+#error Data type not supported
+#endif // DATA_TYPE_FP32
+#elif defined(SOFTMAX_LAYER_NORM)
+
 /** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
  *
- * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP16"
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
  *
- * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in]  sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
- * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_ptr   Pointer to the source tensor slice. Supported data types: F16/F32
+ * @param[in]  src_attrs The attributes of the source tensor
+ * @param[in]  sum_ptr   Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_attrs The attributes of the sum values tensor
+ * @param[out] dst_ptr   Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_attrs The attributes of the destination tensor
  */
+SHADER_PARAMS_DECLARATION
+{
+    Tensor3DAttributes src_attrs;
+    Tensor3DAttributes sum_attrs;
+    Tensor3DAttributes dst_attrs;
+};
+#if defined(DATA_TYPE_FP32)
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, sumBuffer, float, sum_ptr, sum_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
 void main(void)
 {
-    Image src = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
-    Image dst = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
-    Image sum = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
+    ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+    ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(sum_attrs, sum_shift);
 
     // Load max value of 1D logits vector (row)
-    vec2 sum1;
-    load_and_unpack(sum1, sum, 0, int(gl_GlobalInvocationID.y));
-    vec4 sum_val1 = CONVERT(sum1.x, vec4);
-
-    vec4 data1;
-    GC_LOAD4_IMAGE(data1, src, 0, 0);
-    vec4 res = DIV_OP(data1, sum_val1);
-    GC_STORE4_IMAGE(res, dst, 0, 0);
+    vec4 sum_val = vec4(LOAD(sum_ptr, IMAGE_OFFSET(sum_iter, 0, gl_GlobalInvocationID.y)));
+    vec4 data    = VLOAD4_CURRENT_ITEM(vec4, src_ptr, src_iter);
+    VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, DIV_OP(data, sum_val));
 }
-#endif                            // SOFTMAX_LAYER_MAX
-#endif                            // DATA_TYPE_FP32
-\ No newline at end of file
+#elif defined(DATA_TYPE_FP16)
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, sumBuffer, uint, sum_ptr, sum_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
+void main(void)
+{
+    ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+    ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(sum_attrs, sum_shift);
+
+    // Load max value of 1D logits vector (row)
+    vec4 sum_val = vec4(LOAD_UNPACK2_HALF(sum_ptr, IMAGE_OFFSET(sum_iter, 0, gl_GlobalInvocationID.y)).x);
+    vec4 data    = VLOAD2_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
+    VSTORE2_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, DIV_OP(data, sum_val));
+}
+#else // DATA_TYPE_FP32
+#error Data type not supported
+#endif // DATA_TYPE_FP32
+#endif // SOFTMAX_LAYER_MAX
author	Joel Liang <joel.liang@arm.com>	2017-11-10 09:59:19 +0800
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:35:24 +0000
commit	f1f3ebd517089e934cf3f06e64d90619a395ad87 (patch)
tree	8dac05909b5f522a1c78e0ac4423cb6f65254391 /src/core/GLES_COMPUTE/cs_shaders
parent	283c1790da45ab562ecfb2aa7741297191886d85 (diff)
download	ComputeLibrary-f1f3ebd517089e934cf3f06e64d90619a395ad87.tar.gz