aboutsummaryrefslogtreecommitdiff
path: root/src/core/GLES_COMPUTE/cs_shaders
diff options
context:
space:
mode:
author Anthony Barbier <anthony.barbier@arm.com> 2017-10-26 15:23:08 +0100
committer Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:35:24 +0000
commit 7068f9900d136312318ff430aef588b14e0c87ad (patch)
tree b57ca81231860f1d8755e6f18e5be7c959fb60c6 /src/core/GLES_COMPUTE/cs_shaders
parent d60737592736715dcfd0520535c48190d4ac77d2 (diff)
download ComputeLibrary-7068f9900d136312318ff430aef588b14e0c87ad.tar.gz
COMPMID-631: Merge branches/gles_compute branch
Last commit: commit b25c5f68042b0c81bf611d59a1bb8535e1c42497 Author: Xinghang Zhou <xinghang.zhou@arm.com> Date: Wed Oct 25 18:48:10 2017 +0800 Synced validation's tolerances of GCSoftmax from cl side Change-Id: Ibe72054205c1c8721845d679a31af7ed0a7c5cf6 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/93283 Reviewed-by: Anthony Barbier <anthony.barbier@arm.com> Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Diffstat (limited to 'src/core/GLES_COMPUTE/cs_shaders')
-rw-r--r-- src/core/GLES_COMPUTE/cs_shaders/absdiff.cs 71
-rw-r--r-- src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs 262
-rw-r--r-- src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs 222
-rw-r--r-- src/core/GLES_COMPUTE/cs_shaders/concatenate.cs 106
-rw-r--r-- src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs 302
-rw-r--r-- src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs 275
-rw-r--r-- src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs 1583
-rw-r--r-- src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs 313
-rw-r--r-- src/core/GLES_COMPUTE/cs_shaders/dropout.cs 204
-rw-r--r-- src/core/GLES_COMPUTE/cs_shaders/fill_border.cs 553
-rwxr-xr-x src/core/GLES_COMPUTE/cs_shaders/gemm.cs 623
-rw-r--r-- src/core/GLES_COMPUTE/cs_shaders/helpers.h 582
-rwxr-xr-x src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs 157
-rw-r--r-- src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs 75
-rw-r--r-- src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs 1444
-rw-r--r-- src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs 541
-rwxr-xr-x src/core/GLES_COMPUTE/cs_shaders/transpose.cs 187
17 files changed, 7500 insertions, 0 deletions
diff --git a/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs b/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs
new file mode 100644
index 0000000000..f6113e13eb
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+layout(std140) uniform shader_params
+{
+ IMAGE_PARAM_DECLARATION(src1);
+ IMAGE_PARAM_DECLARATION(src2);
+ IMAGE_PARAM_DECLARATION(dst);
+};
+
+BUFFER_DECLARATION(src1, 1, uint, readonly);
+BUFFER_DECLARATION(src2, 2, uint, readonly);
+BUFFER_DECLARATION(dst, 3, uint, writeonly);
+
+/** Compute the element-wise absolute difference of two input images.
+ *
+ * @param[in]  src1_ptr                           Pointer to the first source image. Supported data types: U8
+ * @param[in]  src1_stride_x                      Stride of the first source image in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src1_stride_y                      Stride of the first source image in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in]  src2_ptr                           Pointer to the second source image. Supported data types: same as @p src1_ptr
+ * @param[in]  src2_stride_x                      Stride of the second source image in X dimension (in bytes)
+ * @param[in]  src2_step_x                        src2_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src2_stride_y                      Stride of the second source image in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        src2_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[out] dst_ptr                            Pointer to the destination image. Supported data types: same as @p src1_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // The local views keep the buffer names because the helper macros below are handed those same names.
+    Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);
+    Image src2 = CONVERT_TO_IMAGE_STRUCT(src2);
+    Image dst  = CONVERT_TO_IMAGE_STRUCT(dst);
+    uvec4 lhs  = UNPACK(LOAD4(src1, CURRENT_OFFSET(src1)), uint, uvec4);
+    uvec4 rhs  = UNPACK(LOAD4(src2, CURRENT_OFFSET(src2)), uint, uvec4);
+    // The unsigned difference is converted to signed before abs(), so a wrapped result yields its magnitude.
+    uvec4 magnitude = uvec4(abs(ivec4(lhs - rhs)));
+    STORE4(dst, CURRENT_OFFSET(dst), PACK(magnitude, uvec4, uint));
+}
diff --git a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
new file mode 100644
index 0000000000..fc9da114f7
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+#ifdef DATA_TYPE_FP32
+precision highp float;
+#elif defined(DATA_TYPE_FP16)
+#if defined(LOGISTIC) || defined(TANH) || defined(SRELU) || defined(SQRT)
+precision highp float;
+#else /*LOGISTIC_TANH_SRELU_SQRT*/
+precision mediump float;
+#endif /*LOGISTIC_TANH_SRELU_SQRT*/
+#endif /*DATA_TYPE_FP32*/
+
+#define ABS_OP(a) abs((a))
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define MLA_OP(a, b, c) ((b) * (c) + (a))
+#define DIV_OP(a, b) ((a) / (b))
+#define EXP_OP(a) exp((a))
+#define LOG_OP(a) log((a))
+#define SQRT_OP(a) sqrt((a))
+#define CONST_ONE (1.f)
+
+// Logistic (sigmoid) Activation: 1 / (1 + e^-x)
+float logistic_op(float x)
+{
+    return CONST_ONE / (CONST_ONE + exp(-x));
+}
+// Hyperbolic Tangent Activation: A_VAL * tanh(B_VAL * x), pinned to +/-A_VAL once |B_VAL * x| > 10
+float tanh_op(float x)
+{
+    float scaled = float(B_VAL) * x;
+    // Beyond +/-10 the result is returned directly and tanh() is not evaluated.
+    if(scaled > 10.f)
+    {
+        return float(A_VAL);
+    }
+    if(scaled < -10.f)
+    {
+        return -float(A_VAL);
+    }
+
+    // The small bias added to the tanh() argument is carried over unchanged.
+    return float(A_VAL) * tanh(scaled + 0.000001f);
+}
+// Rectified Linear Unit (RELU) Activation: returns max(0, x)
+float relu_op(float x)
+{
+ return max(0.f, x);
+}
+// Bounded RELU Activation: limits x from below at 0 and from above at A_VAL
+float brelu_op(float x)
+{
+ return min(float(A_VAL), max(float(0.0), x));
+}
+// Lower Upper Bounded RELU Activation: limits x from below at B_VAL and from above at A_VAL
+float lu_brelu_op(float x)
+{
+ return min(max(x, float(B_VAL)), float(A_VAL));
+}
+// Leaky RELU Activation: positive inputs pass through, all other inputs are scaled by A_VAL
+float lrelu_op(float x)
+{
+    return (x > 0.f) ? x : (float(A_VAL) * x);
+}
+// Soft RELU Activation: log(1 + e^x), written as max(x, 0) + log(1 + e^-|x|) so exp() cannot overflow
+float srelu_op(float x)
+{
+    return max(x, 0.f) + log(CONST_ONE + exp(-abs(x)));
+}
+// Absolute Activation: |x|
+float abs_op(float x)
+{
+    return abs(x);
+}
+// Square Activation: x * x
+float square_op(float x)
+{
+    return x * x;
+}
+// Square-root Activation: sqrt(x)
+float sqrt_op(float x)
+{
+    return sqrt(x);
+}
+// Linear Activation: A_VAL * x + B_VAL
+float linear_op(float x)
+{
+    return float(A_VAL) * x + float(B_VAL);
+}
+
+layout(std140) uniform shader_params
+{
+ TENSOR3D_PARAM_DECLARATION(src);
+ TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+#ifdef DATA_TYPE_FP32
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+
+/** Apply the activation function selected at build time to a tensor of floating point values.
+ *
+ * @note The activation function must be selected with a preprocessor define named after it, e.g. "#define TANH"
+ * @note The A and B constants used by some activation functions are supplied through the A_VAL and B_VAL defines.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Each invocation reads a single float from src at its current offset.
+    float value = src_ptr[src.current_offset];
+
+    // Exactly one branch below survives preprocessing: the first activation
+    // define found in this order is applied and its result is stored to dst.
+    // Building without any of the defines is rejected with #error.
+#if defined(LOGISTIC)
+    dst_ptr[dst.current_offset] = logistic_op(value);
+#elif defined(TANH)
+    dst_ptr[dst.current_offset] = tanh_op(value);
+#elif defined(RELU)
+    dst_ptr[dst.current_offset] = relu_op(value);
+#elif defined(BRELU)
+    dst_ptr[dst.current_offset] = brelu_op(value);
+#elif defined(LU_BRELU)
+    dst_ptr[dst.current_offset] = lu_brelu_op(value);
+#elif defined(LRELU)
+    dst_ptr[dst.current_offset] = lrelu_op(value);
+#elif defined(SRELU)
+    dst_ptr[dst.current_offset] = srelu_op(value);
+#elif defined(ABS)
+    dst_ptr[dst.current_offset] = abs_op(value);
+#elif defined(SQUARE)
+    dst_ptr[dst.current_offset] = square_op(value);
+#elif defined(SQRT)
+    dst_ptr[dst.current_offset] = sqrt_op(value);
+#elif defined(LINEAR)
+    dst_ptr[dst.current_offset] = linear_op(value);
+#else /* no activation selected */
+#error Activation function not provided
+#endif /* activation selection */
+}
+
+#elif defined(DATA_TYPE_FP16)
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(dst, 2, uint, writeonly);
+
+/** Apply the activation function selected at build time to a tensor of half precision values.
+ *
+ * @note The activation function must be selected with a preprocessor define named after it, e.g. "#define TANH"
+ * @note The A and B constants used by some activation functions are supplied through the A_VAL and B_VAL defines.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+    // One 32-bit word carries two packed halves; '>> 2' maps the offset (presumably in bytes) to a word index.
+    vec2 halves = unpackHalf2x16(src_ptr[src.current_offset >> 2]);
+    vec2 result;
+
+    // The activation selected at build time is applied to each half independently.
+#if defined(LOGISTIC)
+    result.x = logistic_op(halves.x);
+    result.y = logistic_op(halves.y);
+#elif defined(TANH)
+    result.x = tanh_op(halves.x);
+    result.y = tanh_op(halves.y);
+#elif defined(RELU)
+    result.x = relu_op(halves.x);
+    result.y = relu_op(halves.y);
+#elif defined(BRELU)
+    result.x = brelu_op(halves.x);
+    result.y = brelu_op(halves.y);
+#elif defined(LU_BRELU)
+    result.x = lu_brelu_op(halves.x);
+    result.y = lu_brelu_op(halves.y);
+#elif defined(LRELU)
+    result.x = lrelu_op(halves.x);
+    result.y = lrelu_op(halves.y);
+#elif defined(SRELU)
+    result.x = srelu_op(halves.x);
+    result.y = srelu_op(halves.y);
+#elif defined(ABS)
+    result.x = abs_op(halves.x);
+    result.y = abs_op(halves.y);
+#elif defined(SQUARE)
+    result.x = square_op(halves.x);
+    result.y = square_op(halves.y);
+#elif defined(SQRT)
+    result.x = sqrt_op(halves.x);
+    result.y = sqrt_op(halves.y);
+#elif defined(LINEAR)
+    result.x = linear_op(halves.x);
+    result.y = linear_op(halves.y);
+#else /* no activation selected */
+#error Activation function not provided
+#endif /* activation selection */
+
+    dst_ptr[dst.current_offset >> 2] = packHalf2x16(result);
+}
+#endif /*DATA_TYPE_FP32*/
diff --git a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
new file mode 100644
index 0000000000..54880926cc
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+#ifdef DATA_TYPE_FP32
+precision highp float;
+#elif defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif /*DATA_TYPE_FP32*/
+
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define INVSQRT_OP(a) inversesqrt((a))
+#define SQCVT_SAT(a) (a)
+
+layout(std140) uniform shader_params
+{
+ TENSOR3D_PARAM_DECLARATION(src);
+ TENSOR3D_PARAM_DECLARATION(dst);
+ VECTOR_PARAM_DECLARATION(mean);
+ VECTOR_PARAM_DECLARATION(var);
+ VECTOR_PARAM_DECLARATION(beta);
+ VECTOR_PARAM_DECLARATION(gamma);
+};
+
+#ifdef DATA_TYPE_FP32
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+BUFFER_DECLARATION(mean, 3, float, readonly);
+BUFFER_DECLARATION(var, 4, float, readonly);
+BUFFER_DECLARATION(beta, 5, float, readonly);
+BUFFER_DECLARATION(gamma, 6, float, readonly);
+
+#if !defined(ESPILON) && defined(EPSILON)
+#define ESPILON EPSILON /* let the correctly spelled name feed the legacy macro read below */
+#endif
+/** Apply batch normalization: dst = gamma * (src - mean) / sqrt(var + epsilon) + beta.
+ *
+ * @note Epsilon must be given as a preprocessor define. The kernel reads the macro spelled "ESPILON" (sic), e.g. "#define ESPILON 0.1"; "EPSILON" is accepted as an alias.
+ *
+ * @param[in]  src_ptr                             Pointer to the first source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                        Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                          src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src_stride_y                        Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                          src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src_stride_z                        Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                          src_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  src_offset_first_element_in_bytes   The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                             Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                        Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                          dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  dst_stride_y                        Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                          dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  dst_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                          dst_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes   The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                            Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mean_stride_x                       Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                         mean_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes  The offset of the first element in the mean source tensor
+ * @param[in]  var_ptr                             Pointer to the var tensor. Supported data types: same as @p src_ptr
+ * @param[in]  var_stride_x                        Stride of the var tensor in X dimension (in bytes)
+ * @param[in]  var_step_x                          var_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  var_offset_first_element_in_bytes   The offset of the first element in the var source tensor
+ * @param[in]  beta_ptr                            Pointer to the beta source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  beta_stride_x                       Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  beta_step_x                         beta_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  beta_offset_first_element_in_bytes  The offset of the first element in the beta source tensor
+ * @param[in]  gamma_ptr                           Pointer to the gamma source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  gamma_stride_x                      Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  gamma_step_x                        gamma_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ */
+void main(void)
+{
+    Tensor3D src   = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst   = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   mean  = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector   var   = CONVERT_TO_VECTOR_STRUCT(var);
+    Vector   beta  = CONVERT_TO_VECTOR_STRUCT(beta);
+    Vector   gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+
+    // The Z coordinate selects which per-slice statistics and coefficients apply.
+    uint current_slice = gl_GlobalInvocationID.z;
+
+    // current_offset is used as an element index in this F32 path (see src_ptr/dst_ptr
+    // below), whereas stride_x is in bytes. '+' binds tighter than '>>' in GLSL, so the
+    // byte displacement has to be shifted on its own before it is added to the offset.
+    // NOTE(review): assumes Vector offsets follow the same element-index convention as
+    // Tensor3D offsets here - confirm against CONVERT_TO_VECTOR_STRUCT in helpers.h.
+    float variance    = var_ptr[var.current_offset + ((current_slice * var.stride_x) >> 2)];
+    float mean_value  = mean_ptr[mean.current_offset + ((current_slice * mean.stride_x) >> 2)];
+    float gamma_param = gamma_ptr[gamma.current_offset + ((current_slice * gamma.stride_x) >> 2)];
+    float beta_param  = beta_ptr[beta.current_offset + ((current_slice * beta.stride_x) >> 2)];
+
+    // x_bar = (x - mean) / sqrt(var + epsilon)
+    float inv_stddev = INVSQRT_OP(ADD_OP(variance, SQCVT_SAT(float(ESPILON))));
+    float x_bar      = MUL_OP(SUB_OP(src_ptr[src.current_offset], mean_value), inv_stddev);
+
+    // dst = gamma * x_bar + beta
+    dst_ptr[dst.current_offset] = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
+}
+
+#elif defined(DATA_TYPE_FP16)
+BUFFER_DECLARATION(src, 1, uint, readonly); // inputs are only ever read: same qualifier as the F32 declarations
+BUFFER_DECLARATION(dst, 2, uint, writeonly);
+BUFFER_DECLARATION(mean, 3, uint, readonly);
+BUFFER_DECLARATION(var, 4, uint, readonly);
+BUFFER_DECLARATION(beta, 5, uint, readonly);
+BUFFER_DECLARATION(gamma, 6, uint, readonly);
+
+#if !defined(ESPILON) && defined(EPSILON)
+#define ESPILON EPSILON /* let the correctly spelled name feed the legacy macro read below */
+#endif
+/** Apply batch normalization: dst = gamma * (src - mean) / sqrt(var + epsilon) + beta.
+ *
+ * @note Epsilon must be given as a preprocessor define. The kernel reads the macro spelled "ESPILON" (sic), e.g. "#define ESPILON 0.1"; "EPSILON" is accepted as an alias.
+ *
+ * @param[in]  src_ptr                             Pointer to the first source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                        Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                          src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src_stride_y                        Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                          src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src_stride_z                        Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                          src_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  src_offset_first_element_in_bytes   The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                             Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                        Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                          dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  dst_stride_y                        Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                          dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  dst_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                          dst_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes   The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                            Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mean_stride_x                       Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                         mean_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes  The offset of the first element in the mean source tensor
+ * @param[in]  var_ptr                             Pointer to the var tensor. Supported data types: same as @p src_ptr
+ * @param[in]  var_stride_x                        Stride of the var tensor in X dimension (in bytes)
+ * @param[in]  var_step_x                          var_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  var_offset_first_element_in_bytes   The offset of the first element in the var source tensor
+ * @param[in]  beta_ptr                            Pointer to the beta source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  beta_stride_x                       Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  beta_step_x                         beta_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  beta_offset_first_element_in_bytes  The offset of the first element in the beta source tensor
+ * @param[in]  gamma_ptr                           Pointer to the gamma source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  gamma_stride_x                      Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  gamma_step_x                        gamma_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ */
+void main(void)
+{
+    // The FP16 flavours of the helpers are used for every tensor and vector view.
+    Tensor3D src   = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst   = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+    Vector   mean  = CONVERT_TO_VECTOR_STRUCT_FP16(mean);
+    Vector   var   = CONVERT_TO_VECTOR_STRUCT_FP16(var);
+    Vector   beta  = CONVERT_TO_VECTOR_STRUCT_FP16(beta);
+    Vector   gamma = CONVERT_TO_VECTOR_STRUCT_FP16(gamma);
+
+    // The Z coordinate selects which per-slice statistics and coefficients apply.
+    uint current_slice = gl_GlobalInvocationID.z;
+
+    // Each parameter vector is read as 32-bit words holding two packed halves: the
+    // slice's displacement (current_slice * stride_x) is added to the vector offset
+    // and the sum is converted to a word index with '>> 2'.
+    // Every vector is addressed with its own stride; in particular gamma uses
+    // gamma.stride_x and does not rely on beta happening to share the same stride.
+    vec2 var_pair   = unpackHalf2x16(var_ptr[(var.current_offset + current_slice * var.stride_x) >> 2]);
+    vec2 mean_pair  = unpackHalf2x16(mean_ptr[(mean.current_offset + current_slice * mean.stride_x) >> 2]);
+    vec2 gamma_pair = unpackHalf2x16(gamma_ptr[(gamma.current_offset + current_slice * gamma.stride_x) >> 2]);
+    vec2 beta_pair  = unpackHalf2x16(beta_ptr[(beta.current_offset + current_slice * beta.stride_x) >> 2]);
+
+    // Even slices take the first half (.x) of the fetched word, odd slices the second (.y).
+    // NOTE(review): this parity rule presumes 2-byte entries starting on a word
+    // boundary - confirm against how the host lays out these vectors.
+    bool  first_half  = (current_slice % uint(2)) == uint(0);
+    float variance    = first_half ? var_pair.x : var_pair.y;
+    float mean_value  = first_half ? mean_pair.x : mean_pair.y;
+    float gamma_param = first_half ? gamma_pair.x : gamma_pair.y;
+    float beta_param  = first_half ? beta_pair.x : beta_pair.y;
+
+    // The source word carries two halves; both are normalised with the same
+    // scalar parameters of current_slice.
+    vec2 input_value = unpackHalf2x16(src_ptr[src.current_offset >> 2]);
+
+    // x_bar = (x - mean) / sqrt(var + epsilon); epsilon comes from the ESPILON
+    // build-time define.
+    float inv_stddev = INVSQRT_OP(ADD_OP(variance, SQCVT_SAT(float(ESPILON))));
+    vec2  x_bar      = MUL_OP(SUB_OP(input_value, mean_value), inv_stddev);
+
+    // dst = gamma * x_bar + beta for both halves, repacked into a single
+    // 32-bit word before it is stored.
+    dst_ptr[dst.current_offset >> 2] = packHalf2x16(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
+}
+#endif /*DATA_TYPE_FP32*/
diff --git a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
new file mode 100644
index 0000000000..65000f2de2
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+#ifdef DATA_TYPE_FP32
+precision highp float; // F32 variant: default float precision is highp
+
+layout(std140) uniform shader_params // per-dispatch tensor parameters, std140 layout
+{
+ TENSOR3D_PARAM_DECLARATION(src); // macro from helpers.h; presumably declares the src_* stride/step/offset uniforms documented on main() - confirm
+ TENSOR3D_PARAM_DECLARATION(dst); // same for the dst_* uniforms
+};
+
+BUFFER_DECLARATION(src, 1, float, readonly); // source buffer of float elements, read-only (the "1" is presumably the binding point - confirm in helpers.h)
+BUFFER_DECLARATION(dst, 2, float, writeonly); // destination buffer of float elements, write-only
+
+/** This kernel concatenates the input tensor into the output tensor along the third dimension
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); // source accessor for this invocation (macro from helpers.h)
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); // destination accessor for this invocation
+
+ dst_ptr[dst.current_offset + uint(OFFSETS_Z >> 2)] = src_ptr[tensor3D_offset(src, -OFFSETS_X, -OFFSETS_Y, 0)]; // copy one float: read src displaced by (-OFFSETS_X, -OFFSETS_Y, 0), write dst at its current offset plus OFFSETS_Z >> 2 (OFFSETS_Z presumably in bytes - confirm against the host-side kernel)
+}
+
+#elif defined(DATA_TYPE_FP16)
+precision mediump float; // F16 variant: default float precision is mediump
+
+layout(std140) uniform shader_params // per-dispatch tensor parameters, std140 layout
+{
+ TENSOR3D_PARAM_DECLARATION(src); // macro from helpers.h; presumably declares the src_* stride/step/offset uniforms documented on main() - confirm
+ TENSOR3D_PARAM_DECLARATION(dst); // same for the dst_* uniforms
+};
+
+BUFFER_DECLARATION(src, 1, uvec2, readonly); // source buffer exposed as uvec2 (8-byte) elements, read-only
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly); // destination buffer exposed as uvec2 (8-byte) elements, write-only
+
+/** This kernel concatenates the input tensor into the output tensor along the third dimension
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+ Tensor3D src = GC_CONVERT_TO_TENSOR3D_STRUCT(src); // source accessor for this invocation (macro from helpers.h)
+ Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst); // destination accessor for this invocation
+
+ uvec2 packed_s; // one uvec2 element of the source buffer, copied through without unpacking
+ GC_LOAD1_3D_OFFSET(packed_s, src, -OFFSETS_X, -OFFSETS_Y, 0); // read src displaced by (-OFFSETS_X, -OFFSETS_Y, 0)
+ dst_ptr[(dst.current_offset + uint(OFFSETS_Z)) >> 3] = packed_s; // >> 3 because dst_ptr holds 8-byte uvec2 elements; NOTE(review): implies current_offset and OFFSETS_Z are byte offsets here - confirm in helpers.h
+}
+#endif /*DATA_TYPE_FP32*/ \ No newline at end of file
diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
new file mode 100644
index 0000000000..1a0c9f1d30
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+#ifdef DATA_TYPE_FP16
+BUFFER_DECLARATION(src, 1, uint, readonly); // F16 build: buffers are exposed as 32-bit uint words
+BUFFER_DECLARATION(dst, 2, uint, restrict);
+#else // DATA_TYPE_FP16
+BUFFER_DECLARATION(src, 1, float, readonly); // otherwise: buffers are exposed as float elements
+BUFFER_DECLARATION(dst, 2, float, restrict);
+#endif // DATA_TYPE_FP16
+
+layout(std140) uniform shader_params // NOTE(review): each section below redeclares src/dst, so presumably exactly one of IM2COL_GENERIC / IM2COL_REDUCED / COL2IM is defined per build - confirm
+{
+#ifdef IM2COL_GENERIC
+ TENSOR3D_PARAM_DECLARATION(src);
+ IMAGE_PARAM_DECLARATION(dst);
+ uint filter_depth; // divisor/modulus applied to gl_GlobalInvocationID.z to obtain batch and feature map
+ uint src_stride_w; // stride of the source tensor in W dimension (in bytes)
+ uint dst_stride_w; // stride of the destination tensor in W dimension (in bytes)
+#endif // IM2COL_GENERIC
+
+#ifdef IM2COL_REDUCED
+ TENSOR3D_PARAM_DECLARATION(src);
+ VECTOR_PARAM_DECLARATION(dst);
+ uint width; // width of the input tensor
+ uint height; // height of the input tensor
+#endif // IM2COL_REDUCED
+
+#ifdef COL2IM
+ IMAGE_PARAM_DECLARATION(src);
+ TENSOR3D_PARAM_DECLARATION(dst);
+ uint width; // used by the COL2IM main() to split gl_GlobalInvocationID.y into a destination row and column
+#endif // COL2IM
+};
+
+#ifdef DATA_TYPE_FP16
+
+precision mediump float;
+
+#ifdef IM2COL_REDUCED
+/** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note In case biases will be added in late stage, "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] width The width of the input tensor
+ * @param[in] height The height of the input tensor
+ */
+void main(void)
+{
+ uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
+ uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize; // invocations per dimension over the whole dispatch (not one workgroup)
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+ Tensor3D src_nostep = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(src);
+ Vector dst = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(dst);
+ uint image_size = width * height;
+ uint element_count = src_step_x / src_stride_x; // elements handled per invocation along X
+ uint tmp_out_offset = dst.current_offset + ((pos.x * element_count + pos.y * width + pos.z * image_size) * dst.stride_x);
+ // Offsets are shifted right by 2 at every access below because the src/dst buffers are declared as 32-bit uint words
+ uint tmp;
+
+ // odd width
+ if(width % uint(2) != uint(0))
+ {
+ // even row
+ if((pos.y + pos.z * height) % uint(2) == uint(0))
+ {
+ LOAD1(tmp, src, src.current_offset >> uint(2));
+ STORE1(dst, tmp_out_offset >> uint(2), tmp);
+ }
+ else
+ {
+ // special op: the stored word is assembled from 16-bit halves of two source words
+ uint tmpleft = uint(0);
+ uint tmpright = uint(0);
+ LOAD1(tmpright, src, src.current_offset >> uint(2)); // right half
+ if(pos.x == uint(0))
+ {
+ LOAD1(tmpleft, src, tensor3D_offset_fp16(src_nostep, int(width), int(pos.y) - 1, int(pos.z)) >> uint(2)); // left half
+ tmpright = (tmpleft & uint(0xffff)) + (tmpright << uint(16)); // low half from tmpleft's low 16 bits, high half from tmpright's low 16 bits
+ }
+ else
+ {
+ LOAD1(tmpleft, src, tensor3D_offset_fp16(src_nostep, (int(pos.x) - 1) * int(element_count), int(pos.y), int(pos.z)) >> uint(2)); // left half
+ tmpright = ((tmpleft >> uint(16)) + (tmpright << uint(16))); // low half from tmpleft's high 16 bits, high half from tmpright's low 16 bits
+ }
+ STORE1(dst, tmp_out_offset >> uint(2), tmpright);
+ }
+ }
+ else
+ {
+ LOAD1(tmp, src, src.current_offset >> uint(2));
+ STORE1(dst, tmp_out_offset >> uint(2), tmp);
+ }
+
+#ifdef HAS_BIAS
+ // Only the last invocation of the whole dispatch appends the bias element (uint literals: GLSL ES has no implicit int -> uint conversion)
+ if(pos.x == (size.x - uint(1)) && pos.y == (size.y - uint(1)) && pos.z == (size.z - uint(1)))
+ {
+ tmp_out_offset += dst.stride_x;
+
+ // FIXME: need odd/even detection for tmp_out_offset?
+ mediump vec2 bias_vec = vec2(1.0f, 1.0f);
+ uint bias_u = packHalf2x16(bias_vec);
+ STORE1(dst, tmp_out_offset >> uint(2), bias_u);
+ }
+#endif // HAS_BIAS
+}
+#endif // IM2COL_REDUCED
+
+#elif defined(DATA_TYPE_FP32)
+
+#ifdef IM2COL_GENERIC
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] filter_depth The depth of the used filter
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+void main(void)
+{
+ uint xc = gl_GlobalInvocationID.x; // x coordinate in the convolved tensor
+ uint yc = gl_GlobalInvocationID.y; // y coordinate in the convolved tensor
+ uint ch = gl_GlobalInvocationID.z % filter_depth; // input feature map
+ uint batch = gl_GlobalInvocationID.z / filter_depth; // the batch
+
+ // Calculate input indices (signed: with padding the window may start left of / above the image)
+ int xi = int(xc) * int(STRIDE_X) - int(PAD_X);
+ int yi = int(yc) * int(STRIDE_Y) - int(PAD_Y);
+ uint input_offset = (src_offset_first_element_in_bytes + (ch * src_stride_z) + (batch * src_stride_w)) >> uint(2);
+
+ // Calculate output indices (xo counts float elements, so it is added after the bytes-to-elements shift)
+ uint xo = ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT);
+ uint yo = xc + yc * uint(CONVOLVED_WIDTH); // Index of the convolution
+ uint output_offset = ((dst_offset_first_element_in_bytes + (yo * dst_stride_y) + (batch * dst_stride_w)) >> uint(2)) + xo;
+
+ // Linearize convolution elements: one destination element per kernel tap, written consecutively
+ for(int y = yi, y_e = yi + int(KERNEL_HEIGHT); y < y_e; ++y)
+ {
+ for(int x = xi, x_e = xi + int(KERNEL_WIDTH); x < x_e; ++x, ++output_offset)
+ {
+#if PAD_X == 0 && PAD_Y == 0
+ uint src_index = input_offset + ((uint(x) * src_stride_x + uint(y) * src_stride_y) >> uint(2));
+ STORE4(dst, output_offset, LOAD4(src, src_index));
+#else // PAD_X == 0 && PAD_Y == 0
+ if(x < 0 || x >= int(SRC_WIDTH) || y < 0 || y >= int(SRC_HEIGHT))
+ {
+ STORE4(dst, output_offset, 0.0f); // taps that fall outside the source image are written as zero
+ }
+ else
+ {
+ uint src_index = input_offset + ((uint(x) * src_stride_x + uint(y) * src_stride_y) >> uint(2));
+ STORE4(dst, output_offset, LOAD4(src, src_index));
+ }
+#endif // PAD_X == 0 && PAD_Y == 0
+ }
+ }
+
+#ifdef HAS_BIAS
+ if(ch == (uint(KERNEL_DEPTH) - uint(1))) // last feature map: append the constant 1 used for the bias
+ {
+ STORE4(dst, output_offset, 1.0f);
+ }
+#endif // HAS_BIAS
+}
+#endif // IM2COL_GENERIC
+
+#ifdef IM2COL_REDUCED
+/** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note In case biases will be added in late stage, "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] width The width of the input tensor
+ * @param[in] height The height of the input tensor
+ */
+void main(void)
+{
+ uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
+ uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize; // invocations per dimension over the whole dispatch (not one workgroup)
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Vector dst = CONVERT_TO_VECTOR_STRUCT_NO_STEP(dst);
+ uint image_size = width * height;
+ uint tmp_out_offset = dst.current_offset + (((pos.x + pos.y * width + pos.z * image_size) * dst.stride_x) >> 2); // linear element index scaled by the byte stride, then >> 2 for the float buffer
+
+ STORE4(dst, tmp_out_offset, LOAD4(src, src.current_offset));
+
+#ifdef HAS_BIAS
+ // Only the last invocation of the whole dispatch appends the bias element (uint literals: GLSL ES has no implicit int -> uint conversion)
+ if(pos.x == (size.x - uint(1)) && pos.y == (size.y - uint(1)) && pos.z == (size.z - uint(1)))
+ {
+ tmp_out_offset += (dst.stride_x >> uint(2));
+ STORE4(dst, tmp_out_offset, 1.f);
+ }
+#endif // HAS_BIAS
+}
+#endif // IM2COL_REDUCED
+
+#ifdef COL2IM
+/** This kernel performs a reshaping of the output of the convolution layer.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] width Divisor used to split the source y coordinate into a destination row (y / width) and column (y % width)
+ */
+void main(void)
+{
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ uvec2 gid = uvec2(gl_GlobalInvocationID.xy);
+
+ uint byte_shift = (gid.y % width) * dst.stride_x + (gid.y / width) * dst.stride_y + gid.x * dst.stride_z; // gid.x picks the Z plane, gid.y is split into column and row
+ uint dst_index = (byte_shift >> 2) + dst.current_offset; // >> 2: the dst buffer holds float elements
+
+ STORE4(dst, dst_index, LOAD4(src, src.current_offset));
+}
+#endif // COL2IM
+
+#else // DATA_TYPE_FP16
+#error Data type not supported
+#endif // DATA_TYPE_FP16
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
new file mode 100644
index 0000000000..3a31cb80a7
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+layout(std140) uniform shader_params // per-dispatch parameters shared by the F32 and F16 variants below
+{
+ TENSOR3D_PARAM_DECLARATION(src);
+ TENSOR3D_PARAM_DECLARATION(dst);
+ TENSOR3D_PARAM_DECLARATION(weights);
+#ifdef BIAS
+ VECTOR_PARAM_DECLARATION(biases); // only present when the shader is built with BIAS defined
+#endif /* BIAS */
+ uint weights_stride_w; // stride of the weights tensor in the 4th dimension
+ uint weights_depth; // third dimension of the weights tensor; loop bound of the accumulation in main()
+};
+
+#if defined(DATA_TYPE_FP32)
+precision highp float; // F32 variant: default float precision is highp
+
+BUFFER_DECLARATION(src, 1, float, readonly); // F32 variant: every buffer is exposed as float elements
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, float, readonly);
+#endif /* BIAS */
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note The convolution stride x must be passed at compile time using "#define STRIDE_X" e.g. "#define STRIDE_X 1"
+ * @note In case biases will be added to the convolution "#define BIAS" has to be passed so that the biases vector is declared and added to the result.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in] weights_depth The third dimensions of the weights tensors
+ */
+void main()
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src); // 2D accessor: the Z dimension is walked manually in the loop below
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+
+ float pixels = CONVERT(0, float); // accumulator for the single output element of this invocation
+ uint z_index = gl_GlobalInvocationID.z; // selects the weights block (via weights_stride_w) and the bias entry
+ weights.current_offset += z_index * weights_stride_w >> 2; // parses as (z_index * weights_stride_w) >> 2, i.e. the stride is divided by 4 for the float buffer
+ float temp;
+ float temp_weight;
+
+ for(int d = 0; d < int(weights_depth); ++d) // accumulate one product per slice of the weights depth
+ {
+ temp = LOAD4(src, CURRENT_OFFSET(src));
+ temp_weight = LOAD4(weights, CURRENT_OFFSET(weights));
+ pixels += temp * temp_weight;
+
+ src.current_offset += (src_stride_z >> 2); // step both accessors one slice along Z
+ weights.current_offset += (weights_stride_z >> 2);
+ }
+
+#ifdef BIAS
+ pixels += LOAD4(biases, vector_offset(biases, int(z_index))); // add the bias entry selected by z_index
+#endif /* BIAS */
+
+ STORE4(dst, CURRENT_OFFSET(dst), pixels);
+}
+#elif defined(DATA_TYPE_FP16)
+precision mediump float; // F16 variant: default float precision is mediump
+
+BUFFER_DECLARATION(src, 1, uvec4, readonly); // src/dst are accessed one uvec4 (eight packed F16 values) at a time
+BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly); // weights/biases are accessed one 32-bit word (two packed F16 values) at a time
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 2
+#define CONVOLVE(s, w) convolve_stride2(s, w)
+#elif STRIDE_X == 1 /* STRIDE_X == 2 */
+#define CONVOLVE(s, w) convolve_stride1(s, w)
+#else /* STRIDE_X is neither 1 nor 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X selection */
+
+// Stride-1 helper: reads one uvec4 at the current position of src, unpacks its
+// eight F16 values and returns them multiplied by the weight w.
+vec4[2] convolve_stride1(Image src, float w)
+{
+ // Fetch the packed data
+ uvec4 raw;
+ GC_LOAD1_2D_OFFSET(raw, src, 0, 0);
+
+ // Unpack into two vec4 and scale every lane by w
+ vec4 scaled[2];
+ scaled[0] = vec4(unpackHalf2x16(raw.x), unpackHalf2x16(raw.y)) * w;
+ scaled[1] = vec4(unpackHalf2x16(raw.z), unpackHalf2x16(raw.w)) * w;
+
+ return scaled;
+}
+
+// Stride-2 helper: reads two uvec4 (x offsets 0 and 8), keeps every second
+// unpacked F16 value of each and returns the kept values multiplied by w.
+vec4[2] convolve_stride2(Image src, float w)
+{
+ uvec4 raw;
+ vec4 lo;
+ vec4 hi;
+ vec4 picked[2];
+
+ // First load: keep lanes 0, 2, 4, 6 of the eight unpacked values
+ GC_LOAD1_2D_OFFSET(raw, src, 0, 0);
+ lo = vec4(unpackHalf2x16(raw.x), unpackHalf2x16(raw.y));
+ hi = vec4(unpackHalf2x16(raw.z), unpackHalf2x16(raw.w));
+ picked[0] = vec4(lo.xz, hi.xz) * w;
+
+ // Second load at x offset 8: same lane selection
+ GC_LOAD1_2D_OFFSET(raw, src, 8, 0);
+ lo = vec4(unpackHalf2x16(raw.x), unpackHalf2x16(raw.y));
+ hi = vec4(unpackHalf2x16(raw.z), unpackHalf2x16(raw.w));
+ picked[1] = vec4(lo.xz, hi.xz) * w;
+
+ return picked;
+}
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note The convolution stride x must be passed at compile time using "#define STRIDE_X" e.g. "#define STRIDE_X 1"
+ * @note In case biases will be added to the convolution "#define BIAS" has to be passed so that the biases vector is declared and added to the result.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in] weights_depth The third dimensions of the weights tensors
+ */
+void main()
+{
+ Image src = GC_CONVERT_TO_IMAGE_STRUCT(src); // 2D accessor: the Z dimension is walked manually in the loop below
+ Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+ Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+ Vector biases = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+
+ vec4 pixels[2]; // eight output accumulators, packed into one uvec4 at the end
+ pixels[0] = vec4(0.f);
+ pixels[1] = vec4(0.f);
+
+ uint z_index = gl_GlobalInvocationID.z; // selects the weights block (via weights_stride_w) and the bias entry
+
+ weights.current_offset += z_index * weights_stride_w; // no shift here, unlike the F32 variant; presumably the GC_* macros take byte offsets - confirm in helpers.h
+
+ uint packed_w;
+ float w;
+
+ for(int d = 0; d < int(weights_depth); ++d) // accumulate one weighted slice per step along the weights depth
+ {
+ GC_LOAD1_3D_OFFSET(packed_w, weights, 0, 0, 0);
+ w = unpackHalf2x16(packed_w).x; // NOTE(review): always takes the low half, whereas the bias below picks .x/.y by parity - confirm the weights layout makes this correct
+
+ vec4 r[2] = CONVOLVE(src, w); // convolve_stride1 or convolve_stride2, chosen by STRIDE_X
+ pixels[0] += r[0];
+ pixels[1] += r[1];
+
+ src.current_offset += src_stride_z; // step both accessors one slice along Z
+ weights.current_offset += weights_stride_z;
+ }
+
+#ifdef BIAS
+ uint packed_b;
+ float b;
+
+ GC_LOAD1_1D_OFFSET(packed_b, biases, z_index);
+
+ if(z_index % uint(2) == uint(0)) // two F16 biases share one 32-bit word: even z_index reads the low half, odd the high half
+ {
+ b = unpackHalf2x16(packed_b).x;
+ }
+ else
+ {
+ b = unpackHalf2x16(packed_b).y;
+ }
+
+ pixels[0] += vec4(b); // the same bias is added to all eight outputs
+ pixels[1] += vec4(b);
+#endif /* BIAS */
+
+ uvec4 packed_d; // the eight results re-packed as F16 pairs
+ packed_d = uvec4(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw),
+ packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw));
+ GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
+}
+#else /* DATA_TYPE_FP32 */
+#error Data type not supported
+#endif /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
new file mode 100644
index 0000000000..67b92cb8cf
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
@@ -0,0 +1,1583 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+layout(std140) uniform shader_params
+{
+ TENSOR3D_PARAM_DECLARATION(src);
+ TENSOR3D_PARAM_DECLARATION(dst);
+ TENSOR3D_PARAM_DECLARATION(weights);
+#ifdef BIAS
+ VECTOR_PARAM_DECLARATION(biases);
+#endif /* BIAS */
+ uint weights_stride_w;
+ uint weights_depth;
+};
+// LOAD12: r.x, r.y, r.z = LOAD4(name, offset + i) for i = 0, 1, 2. Note that `offset` is expanded three times.
+#define LOAD12(r, name, offset) \
+ r.x = LOAD4(name, offset); \
+ r.y = LOAD4(name, offset + uint(1)); \
+ r.z = LOAD4(name, offset + uint(2))
+// LOAD3X3: r[3 * y + x] = LOAD4(name, tensor3D_offset(name, x, y, 0)) for x, y in 0..2 (nine loads, row by row).
+#define LOAD3X3(r, name) \
+ r[0] = LOAD4(name, tensor3D_offset(name, 0, 0, 0)); \
+ r[1] = LOAD4(name, tensor3D_offset(name, 1, 0, 0)); \
+ r[2] = LOAD4(name, tensor3D_offset(name, 2, 0, 0)); \
+ r[3] = LOAD4(name, tensor3D_offset(name, 0, 1, 0)); \
+ r[4] = LOAD4(name, tensor3D_offset(name, 1, 1, 0)); \
+ r[5] = LOAD4(name, tensor3D_offset(name, 2, 1, 0)); \
+ r[6] = LOAD4(name, tensor3D_offset(name, 0, 2, 0)); \
+ r[7] = LOAD4(name, tensor3D_offset(name, 1, 2, 0)); \
+ r[8] = LOAD4(name, tensor3D_offset(name, 2, 2, 0))
+
+#if defined(PROCESS_1_ELEMENT)
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, float, readonly);
+#endif /* BIAS */
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in] weights_depth The third dimensions of the weights tensors
+ */
+void main()
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+ // Accumulator for the single output value produced by this invocation.
+ float pixels = CONVERT(0, float);
+
+ uint z_index = gl_GlobalInvocationID.z;
+ // "*" binds tighter than ">>": the product is shifted (presumably a byte stride converted to float elements - confirm).
+ weights.current_offset += z_index * weights_stride_w >> 2;
+
+ for(int d = 0; d < int(weights_depth); ++d)
+ {
+ vec3 temp;
+ vec3 w;
+ // Row 0 of the 3x3 window: three source values multiplied by three weights.
+ LOAD12(temp, src, offset(src, 0, 0));
+ LOAD12(w, weights, tensor3D_offset(weights, 0, 0, 0));
+
+ pixels += temp.x * w[0] + temp.y * w[1] + temp.z * w[2];
+ // Row 1 of the window.
+ LOAD12(temp, src, offset(src, 0, 1));
+ LOAD12(w, weights, tensor3D_offset(weights, 0, 1, 0));
+
+ pixels += temp.x * w[0] + temp.y * w[1] + temp.z * w[2];
+ // Row 2 of the window.
+ LOAD12(temp, src, offset(src, 0, 2));
+ LOAD12(w, weights, tensor3D_offset(weights, 0, 2, 0));
+
+ pixels += temp.x * w[0] + temp.y * w[1] + temp.z * w[2];
+ // Advance source and weights to the next slice along Z.
+ src.current_offset += src_stride_z >> 2;
+ weights.current_offset += weights_stride_z >> 2;
+ }
+
+#ifdef BIAS
+ pixels += LOAD4(biases, vector_offset(biases, int(z_index)));
+#endif /* BIAS */
+ // Write the accumulated value at the destination's current position.
+ STORE4(dst, CURRENT_OFFSET(dst), pixels);
+}
+#elif defined(PROCESS_8_ELEMENT)
+BUFFER_DECLARATION(src, 1, vec4, readonly);
+BUFFER_DECLARATION(dst, 2, vec4, writeonly);
+BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, float, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 2
+#define CONVOLVE1x3(offset, w) convolve1x3_stride2(offset, w)
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+#define CONVOLVE1x3(offset, w) convolve1x3_stride1(offset, w)
+#else /* STRIDE_X not equals 1 or 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 2 */
+
+vec4[2] convolve1x3_stride1(uint offset, vec3 w)
+{
+    // Eight outputs of one 1x3 row for stride_x == 1, built from the three vec4s loaded at `offset`.
+    vec4 in_px[3];
+    vec4 out_px[2];
+    vec4 shift1;
+    vec4 shift2;
+
+    LOAD3(in_px, src, offset);
+    // Outputs 0..3: in_px[0] plus the same data shifted by one and by two elements.
+    shift1    = vec4(in_px[0].yzw, in_px[1].x);
+    shift2    = vec4(in_px[0].zw, in_px[1].xy);
+    out_px[0] = in_px[0] * w[0] + shift1 * w[1] + shift2 * w[2];
+
+    // Outputs 4..7: identical pattern starting at in_px[1].
+    shift1    = vec4(in_px[1].yzw, in_px[2].x);
+    shift2    = vec4(in_px[1].zw, in_px[2].xy);
+    out_px[1] = in_px[1] * w[0] + shift1 * w[1] + shift2 * w[2];
+
+    return out_px;
+}
+
+vec4[2] convolve1x3_stride2(uint offset, vec3 w)
+{
+    // Eight outputs of one 1x3 row for stride_x == 2: each tap gathers every other source element.
+    vec4 win[3];
+    vec4 out_px[2];
+    vec4 tap0;
+    vec4 tap1;
+    vec4 tap2;
+
+    LOAD3(win, src, offset);
+
+    // Outputs 0..3: even elements, odd elements, then even elements advanced by one output.
+    tap0      = vec4(win[0].xz, win[1].xz);
+    tap1      = vec4(win[0].yw, win[1].yw);
+    tap2      = vec4(win[0].z, win[1].xz, win[2].x);
+    out_px[0] = tap0 * w[0] + tap1 * w[1] + tap2 * w[2];
+
+    // Outputs 4..7: win[2] from the first load is reused; LOAD2 is expected to refill only win[0] and win[1].
+    LOAD2(win, src, offset + ((uint(3) * src_stride_x) >> 2));
+    tap0      = vec4(win[2].xz, win[0].xz);
+    tap1      = vec4(win[2].yw, win[0].yw);
+    tap2      = vec4(win[2].z, win[0].xz, win[1].x);
+    out_px[1] = tap0 * w[0] + tap1 * w[1] + tap2 * w[2];
+
+    return out_px;
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 8 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1 and 2
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in] weights_depth The third dimensions of the weights tensors
+ */
+void main()
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+ // Two vec4 accumulators hold the eight output values computed by this invocation.
+ vec4 pixels[2];
+ pixels[0] = vec4(0);
+ pixels[1] = vec4(0);
+
+ uint z_index = gl_GlobalInvocationID.z;
+ // "*" binds tighter than ">>": the product is shifted (presumably a byte stride converted to float elements - confirm).
+ weights.current_offset += z_index * weights_stride_w >> 2;
+
+ for(int d = 0; d < int(weights_depth); ++d)
+ {
+ // load 3 weights once
+ vec3 w;
+ vec4 r[2];
+
+ // first line
+ LOAD3(w, weights, tensor3D_offset(weights, 0, 0, 0));
+
+ r = CONVOLVE1x3(src.current_offset >> uint(2), w);
+ pixels[0] += r[0];
+ pixels[1] += r[1];
+
+ // second line: current_offset advanced by src_stride_y >> 2
+ LOAD3(w, weights, tensor3D_offset(weights, 0, 1, 0));
+
+ r = CONVOLVE1x3((src.current_offset + (src_stride_y >> 2)) >> uint(2), w);
+ pixels[0] += r[0];
+ pixels[1] += r[1];
+
+ // third line: src_stride_y >> 1 equals two steps of src_stride_y >> 2 when src_stride_y is a multiple of 4
+ LOAD3(w, weights, tensor3D_offset(weights, 0, 2, 0));
+
+ r = CONVOLVE1x3((src.current_offset + (src_stride_y >> 1)) >> uint(2), w);
+ pixels[0] += r[0];
+ pixels[1] += r[1];
+ // Advance source and weights to the next slice along Z.
+ src.current_offset += src_stride_z >> 2;
+ weights.current_offset += weights_stride_z >> 2;
+ }
+
+#ifdef BIAS
+ float b;
+ LOAD1(b, biases, vector_offset(biases, int(z_index)));
+ pixels[0] += vec4(b);
+ pixels[1] += vec4(b);
+#endif /* BIAS */
+ // Store both vec4 results, starting at the destination's current position (">> 2" indexes the vec4 buffer).
+ STORE2(dst, dst.current_offset >> uint(2), pixels);
+}
+#elif defined(PROCESS_4_ELEMENT)
+BUFFER_DECLARATION(src, 1, vec4, readonly);
+BUFFER_DECLARATION(dst, 2, vec4, writeonly);
+BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, float, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 2
+#define CONVOLVE1x3(offset, w) convolve1x3_stride2(offset, w)
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+#define CONVOLVE1x3(offset, w) convolve1x3_stride1(offset, w)
+#else /* STRIDE_X not equals 1 or 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 2 */
+
+vec4 convolve1x3_stride1(uint offset, vec3 w)
+{
+    // Four outputs of one 1x3 row for stride_x == 1, built from the two vec4s loaded at `offset`.
+    vec4 in_px[2];
+    LOAD2(in_px, src, offset);
+
+    // The loaded data shifted by one and by two elements relative to in_px[0].
+    vec4 shift1 = vec4(in_px[0].yzw, in_px[1].x);
+    vec4 shift2 = vec4(in_px[0].zw, in_px[1].xy);
+
+    // Weighted sum of the three taps.
+    vec4 acc = in_px[0] * w[0] + shift1 * w[1] + shift2 * w[2];
+
+    return acc;
+}
+
+vec4 convolve1x3_stride2(uint offset, vec3 w)
+{
+    // Four outputs of one 1x3 row for stride_x == 2, built from the three vec4s loaded at `offset`.
+    vec4 in_px[3];
+    LOAD3(in_px, src, offset);
+
+    // Tap 0: elements 0, 2, 4, 6 of in_px viewed as a flat array.
+    vec4 tap0 = vec4(in_px[0].xz, in_px[1].xz);
+    // Tap 1: elements 1, 3, 5, 7.
+    vec4 tap1 = vec4(in_px[0].yw, in_px[1].yw);
+    // Tap 2: elements 2, 4, 6, 8.
+    vec4 tap2 = vec4(in_px[0].z, in_px[1].xz, in_px[2].x);
+
+    // Weighted sum of the three taps.
+    vec4 acc = tap0 * w[0] + tap1 * w[1] + tap2 * w[2];
+
+    return acc;
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 4 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1 and 2
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in] weights_depth The third dimensions of the weights tensors
+ */
+void main()
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+ // One vec4 accumulator: four output values are computed by this invocation.
+ vec4 pixels;
+ pixels = vec4(0);
+
+ uint z_index = gl_GlobalInvocationID.z;
+ // "*" binds tighter than ">>": the product is shifted (presumably a byte stride converted to float elements - confirm).
+ weights.current_offset += z_index * weights_stride_w >> 2;
+
+ for(int d = 0; d < int(weights_depth); ++d)
+ {
+ // load 3 weights once
+ vec3 w;
+
+ // first line
+ LOAD3(w, weights, tensor3D_offset(weights, 0, 0, 0));
+
+ pixels += CONVOLVE1x3(src.current_offset >> uint(2), w);
+
+ // second line: current_offset advanced by src_stride_y >> 2
+ LOAD3(w, weights, tensor3D_offset(weights, 0, 1, 0));
+
+ pixels += CONVOLVE1x3((src.current_offset + (src_stride_y >> 2)) >> uint(2), w);
+
+ // third line: src_stride_y >> 1 equals two steps of src_stride_y >> 2 when src_stride_y is a multiple of 4
+ LOAD3(w, weights, tensor3D_offset(weights, 0, 2, 0));
+
+ pixels += CONVOLVE1x3((src.current_offset + (src_stride_y >> 1)) >> uint(2), w);
+ // Advance source and weights to the next slice along Z.
+ src.current_offset += src_stride_z >> 2;
+ weights.current_offset += weights_stride_z >> 2;
+ }
+
+#ifdef BIAS
+ float b;
+ LOAD1(b, biases, vector_offset(biases, int(z_index)));
+ pixels += vec4(b);
+#endif /* BIAS */
+ // Store the vec4 result at the destination's current position (">> 2" indexes the vec4 buffer).
+ STORE1(dst, dst.current_offset >> uint(2), pixels);
+}
+#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS)
+BUFFER_DECLARATION(src, 1, vec4, readonly);
+BUFFER_DECLARATION(dst, 2, vec4, writeonly);
+BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, float, readonly);
+#endif /* BIAS */
+
+#define CONVOLVE1x3(left, middle, right, w) convolve1x3_stride1(left, middle, right, w)
+
+vec4 convolve1x3_stride1(vec4 left, vec4 middle, vec4 right, vec3 w)
+{
+    // Weighted sum of three pre-shifted source vectors, one weight per tap:
+    // left, middle and right are paired with w[0], w[1] and w[2] respectively.
+    vec4 acc = left * w[0] + middle * w[1] + right * w[2];
+
+    return acc;
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 4x3 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1 only (no stride-2 path is defined for it)
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in] weights_depth The third dimensions of the weights tensors
+ */
+void main()
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+ // pixels[0..2] accumulate three output rows of four values each.
+ vec4 pixels[3];
+ pixels[0] = vec4(0);
+ pixels[1] = vec4(0);
+ pixels[2] = vec4(0);
+
+ uint z_index = gl_GlobalInvocationID.z;
+ // "*" binds tighter than ">>": the product is shifted (presumably a byte stride converted to float elements - confirm).
+ weights.current_offset += z_index * weights_stride_w >> 2;
+
+ for(int d = 0; d < int(weights_depth); ++d)
+ {
+ // load the three weight rows once per slice
+ vec3 w[3];
+
+ LOAD3(w[0], weights, tensor3D_offset(weights, 0, 0, 0));
+ LOAD3(w[1], weights, tensor3D_offset(weights, 0, 1, 0));
+ LOAD3(w[2], weights, tensor3D_offset(weights, 0, 2, 0));
+ // Five source rows feed three output rows: source row k adds to pixels[j] using weight row w[k - j].
+ vec4 s[2];
+ vec4 middle;
+ vec4 right;
+ // first line
+ LOAD2(s, src, src.current_offset >> uint(2));
+ middle = vec4(s[0].yzw, s[1].x);
+ right = vec4(s[0].zw, s[1].xy);
+ pixels[0] += CONVOLVE1x3(s[0], middle, right, w[0]);
+
+ // second line
+ LOAD2(s, src, (src.current_offset + (src_stride_y >> 2)) >> uint(2));
+ middle = vec4(s[0].yzw, s[1].x);
+ right = vec4(s[0].zw, s[1].xy);
+ pixels[0] += CONVOLVE1x3(s[0], middle, right, w[1]);
+ pixels[1] += CONVOLVE1x3(s[0], middle, right, w[0]);
+
+ // third line
+ LOAD2(s, src, (src.current_offset + (src_stride_y >> 1)) >> uint(2));
+ middle = vec4(s[0].yzw, s[1].x);
+ right = vec4(s[0].zw, s[1].xy);
+ pixels[0] += CONVOLVE1x3(s[0], middle, right, w[2]);
+ pixels[1] += CONVOLVE1x3(s[0], middle, right, w[1]);
+ pixels[2] += CONVOLVE1x3(s[0], middle, right, w[0]);
+
+ // fourth line
+ LOAD2(s, src, (src.current_offset + (uint(3) * (src_stride_y >> 2))) >> uint(2));
+ middle = vec4(s[0].yzw, s[1].x);
+ right = vec4(s[0].zw, s[1].xy);
+ pixels[1] += CONVOLVE1x3(s[0], middle, right, w[2]);
+ pixels[2] += CONVOLVE1x3(s[0], middle, right, w[1]);
+
+ // fifth line: src_stride_y equals four steps of src_stride_y >> 2 when src_stride_y is a multiple of 4
+ LOAD2(s, src, (src.current_offset + (src_stride_y)) >> uint(2));
+ middle = vec4(s[0].yzw, s[1].x);
+ right = vec4(s[0].zw, s[1].xy);
+ pixels[2] += CONVOLVE1x3(s[0], middle, right, w[2]);
+ // Advance source and weights to the next slice along Z.
+ src.current_offset += src_stride_z >> 2;
+ weights.current_offset += weights_stride_z >> 2;
+ }
+
+#ifdef BIAS
+ float b;
+ LOAD1(b, biases, vector_offset(biases, int(z_index)));
+
+ pixels[0] += vec4(b);
+ pixels[1] += vec4(b);
+ pixels[2] += vec4(b);
+#endif /* BIAS */
+ // Store the three rows at the current position, then dst_stride_y >> 2 and dst_stride_y >> 1 further on.
+ STORE1(dst, dst.current_offset >> uint(2), pixels[0]);
+ STORE1(dst, (dst.current_offset + (dst_stride_y >> 2)) >> uint(2), pixels[1]);
+ STORE1(dst, (dst.current_offset + (dst_stride_y >> 1)) >> uint(2), pixels[2]);
+}
+#elif defined(PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16)
+precision mediump float;
+
+BUFFER_DECLARATION(src, 1, uvec4, readonly);
+BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+
+#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
+
+vec4[2] convolve1x3_stride1(vec4 tmp[3], vec3 w)
+{
+    // Eight outputs of one 1x3 row for stride_x == 1, from twelve already-unpacked source values.
+    vec4 out_px[2];
+    vec4 shift1;
+    vec4 shift2;
+    // Outputs 0..3: tmp[0] plus the same data shifted by one and by two elements.
+    shift1    = vec4(tmp[0].yzw, tmp[1].x);
+    shift2    = vec4(tmp[0].zw, tmp[1].xy);
+    out_px[0] = tmp[0] * w[0] + shift1 * w[1] + shift2 * w[2];
+
+    // Outputs 4..7: identical pattern starting at tmp[1].
+    shift1    = vec4(tmp[1].yzw, tmp[2].x);
+    shift2    = vec4(tmp[1].zw, tmp[2].xy);
+    out_px[1] = tmp[1] * w[0] + shift1 * w[1] + shift2 * w[2];
+
+    return out_px;
+}
+
+vec4[3] load_and_unpack(uint offset)
+{
+    // Loads two uvec4 words from src at `offset` and `offset + 1` and unpacks twelve half values to floats.
+    uvec4 packed_s[2];
+    vec4  s[3];
+    LOAD1(packed_s[0], src, offset);
+    LOAD1(packed_s[1], src, offset + uint(1));
+
+    // Each 32-bit component yields two floats; packed_s[1].z and packed_s[1].w are loaded but not used.
+    s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    s[1] = vec4(unpackHalf2x16(packed_s[0].z), unpackHalf2x16(packed_s[0].w));
+    s[2] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+
+    return s;
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 8x3 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1 only (no stride-2 path is defined for it)
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in] weights_depth The third dimensions of the weights tensors
+ */
+void main()
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT_FP16(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+#ifdef BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+#endif /* BIAS */
+    // packed_d and vd are scratch variables used to pack results to half precision before each store.
+    uvec2 packed_d[2];
+    uvec4 vd;
+    // pixels[row][half]: three output rows, each held as two vec4 (eight values per row).
+    vec4 pixels[3][2];
+    int i, j;
+    for(i = 0; i < 3; i++)
+    {
+        for(j = 0; j < 2; j++)
+        {
+            pixels[i][j] = vec4(0);
+        }
+    }
+
+    uint z_index = gl_GlobalInvocationID.z;
+    // Select the weights for this invocation's Z index.
+    weights.current_offset += z_index * weights_stride_w;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load the three weight rows once per slice: two 32-bit words per row, of which three halves are used
+        uvec2 packed_w[3];
+        // NOTE(review): ">> 2" yields a 32-bit word index, so each weights row presumably starts on a 4-byte boundary - confirm.
+        LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
+        LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
+        LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+
+        vec3 w[3];
+        w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
+        w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
+        w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
+
+        // s: twelve unpacked source values of one row; r: the eight outputs CONVOLVE1x3 derives from them.
+        vec4 s[3];
+        vec4 r[2];
+        uint offset;
+        // first line (">> 4" presumably converts a byte offset into a uvec4 index - confirm)
+        offset = src.current_offset >> uint(4);
+        s = load_and_unpack(offset);
+
+        r = CONVOLVE1x3(s, w[0]);
+        pixels[0][0] += r[0];
+        pixels[0][1] += r[1];
+
+        // second line
+        offset = (src.current_offset + src_stride_y) >> uint(4);
+        s = load_and_unpack(offset);
+
+        r = CONVOLVE1x3(s, w[1]);
+        pixels[0][0] += r[0];
+        pixels[0][1] += r[1];
+        r = CONVOLVE1x3(s, w[0]);
+        pixels[1][0] += r[0];
+        pixels[1][1] += r[1];
+
+        // third line
+        offset = (src.current_offset + (src_stride_y << 1)) >> uint(4);
+        s = load_and_unpack(offset);
+
+        r = CONVOLVE1x3(s, w[2]);
+        pixels[0][0] += r[0];
+        pixels[0][1] += r[1];
+        r = CONVOLVE1x3(s, w[1]);
+        pixels[1][0] += r[0];
+        pixels[1][1] += r[1];
+        r = CONVOLVE1x3(s, w[0]);
+        pixels[2][0] += r[0];
+        pixels[2][1] += r[1];
+
+        // fourth line
+        offset = (src.current_offset + uint(3) * (src_stride_y)) >> uint(4);
+        s = load_and_unpack(offset);
+
+        r = CONVOLVE1x3(s, w[2]);
+        pixels[1][0] += r[0];
+        pixels[1][1] += r[1];
+        r = CONVOLVE1x3(s, w[1]);
+        pixels[2][0] += r[0];
+        pixels[2][1] += r[1];
+
+        // fifth line
+        offset = (src.current_offset + (src_stride_y << 2)) >> uint(4);
+        s = load_and_unpack(offset);
+
+        r = CONVOLVE1x3(s, w[2]);
+        pixels[2][0] += r[0];
+        pixels[2][1] += r[1];
+        // Advance source and weights to the next slice along Z.
+        src.current_offset += src_stride_z;
+        weights.current_offset += weights_stride_z;
+    }
+
+#ifdef BIAS
+    uint packed_b;
+    float b;
+    LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+    // The loaded word carries two half values: even z_index takes the low half (.x), odd z_index the high half (.y).
+    if(z_index % uint(2) == uint(0))
+    {
+        b = unpackHalf2x16(packed_b).x;
+    }
+    else
+    {
+        b = unpackHalf2x16(packed_b).y;
+    }
+
+    for(i = 0; i < 3; i++)
+    {
+        for(j = 0; j < 2; j++)
+        {
+            pixels[i][j] += vec4(b);
+        }
+    }
+#endif /* BIAS */
+    // Pack each output row (eight values) into one uvec4 and store it; rows are dst_stride_y apart.
+    packed_d[0] = uvec2(packHalf2x16(pixels[0][0].xy), packHalf2x16(pixels[0][0].zw));
+    packed_d[1] = uvec2(packHalf2x16(pixels[0][1].xy), packHalf2x16(pixels[0][1].zw));
+    vd = uvec4(packed_d[0], packed_d[1]);
+    STORE1(dst, dst.current_offset >> uint(4), vd);
+
+    packed_d[0] = uvec2(packHalf2x16(pixels[1][0].xy), packHalf2x16(pixels[1][0].zw));
+    packed_d[1] = uvec2(packHalf2x16(pixels[1][1].xy), packHalf2x16(pixels[1][1].zw));
+    vd = uvec4(packed_d[0], packed_d[1]);
+    STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(4), vd);
+
+    packed_d[0] = uvec2(packHalf2x16(pixels[2][0].xy), packHalf2x16(pixels[2][0].zw));
+    packed_d[1] = uvec2(packHalf2x16(pixels[2][1].xy), packHalf2x16(pixels[2][1].zw));
+    vd = uvec4(packed_d[0], packed_d[1]);
+    STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(4), vd);
+}
+#elif defined(PROCESS_X_4ELEMENTS_FP16)
+precision mediump float; /* default float precision for this FP16 variant */
+// Buffers: src/dst are addressed as uvec2 elements (four packed FP16 values each); weights/biases as uint elements (two packed FP16 values each).
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+// Bind CONVOLVE1x3 / LOAD_AND_UNPACK to the helpers matching the horizontal stride selected at compile time.
+#if STRIDE_X == 2
+#define CONVOLVE1x3(s, w) convolve1x3_stride2(s, w)
+#define LOAD_AND_UNPACK(offset) load_and_unpack_stride2(offset)
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
+#define LOAD_AND_UNPACK(offset) load_and_unpack_stride1(offset)
+#else /* STRIDE_X is neither 1 nor 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 2 */
+
+/** Apply one kernel row (3 weights) to 4 adjacent output positions, horizontal stride 1.
+ *
+ * @param[in] tmp Two consecutive groups of four source values; only the first six values are read
+ * @param[in] w   The three weights of the kernel row
+ *
+ * @return For i = 0..3: src[i] * w[0] + src[i + 1] * w[1] + src[i + 2] * w[2]
+ */
+vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
+{
+    // Three views of the source row, each shifted by one more element
+    vec4 left   = tmp[0];
+    vec4 middle = vec4(tmp[0].yzw, tmp[1].x);
+    vec4 right  = vec4(tmp[0].zw, tmp[1].xy);
+
+    return left * w.x + middle * w.y + right * w.z;
+}
+
+/** Apply one kernel row (3 weights) to 4 output positions, horizontal stride 2.
+ *
+ * @param[in] tmp Three consecutive groups of four source values; the first nine values are read
+ * @param[in] w   The three weights of the kernel row
+ *
+ * @return For i = 0..3: src[2i] * w[0] + src[2i + 1] * w[1] + src[2i + 2] * w[2]
+ */
+vec4 convolve1x3_stride2(vec4 tmp[3], vec3 w)
+{
+    // Gather every second source value for each of the three kernel taps
+    vec4 tap0 = vec4(tmp[0].xz, tmp[1].xz);
+    vec4 tap1 = vec4(tmp[0].yw, tmp[1].yw);
+    vec4 tap2 = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
+
+    return tap0 * w.x + tap1 * w.y + tap2 * w.z;
+}
+
+/** Read two consecutive uvec2 elements of the src buffer and expand them to eight floats.
+ *
+ * @param[in] offset Index of the first uvec2 element to read from src
+ *
+ * @return The eight unpacked half-float values as two vec4
+ */
+vec4[2] load_and_unpack_stride1(uint offset)
+{
+    uvec2 raw_lo;
+    uvec2 raw_hi;
+
+    LOAD1(raw_lo, src, offset);
+    LOAD1(raw_hi, src, offset + uint(1));
+
+    // Each 32-bit word holds two halves; two words form one vec4
+    return vec4[2](vec4(unpackHalf2x16(raw_lo.x), unpackHalf2x16(raw_lo.y)),
+                   vec4(unpackHalf2x16(raw_hi.x), unpackHalf2x16(raw_hi.y)));
+}
+
+/** Read three consecutive uvec2 elements of the src buffer and expand them to twelve floats.
+ *
+ * @param[in] offset Index of the first uvec2 element to read from src
+ *
+ * @return The twelve unpacked half-float values as three vec4
+ */
+vec4[3] load_and_unpack_stride2(uint offset)
+{
+    vec4 unpacked[3];
+
+    for(int k = 0; k < 3; ++k)
+    {
+        uvec2 raw;
+        LOAD1(raw, src, offset + uint(k));
+
+        // Each 32-bit word holds two halves; two words form one vec4
+        unpacked[k] = vec4(unpackHalf2x16(raw.x), unpackHalf2x16(raw.y));
+    }
+
+    return unpacked;
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader that processes 4 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1 and 2
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in] weights_depth The third dimensions of the weights tensors
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT_FP16(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+#ifdef BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+#endif /* BIAS */
+
+    // Running sums for the four output values written by this invocation
+    vec4 pixels = vec4(0);
+
+    // The Z invocation index selects the kernel (4th weights dimension) and the matching bias
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += z_index * weights_stride_w;
+
+    // One 3x3 kernel slice per iteration; src and weights advance along Z together
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load 3 weights once: each uvec2 packs four halves, of which the first three are used
+        uvec2 packed_w[3];
+
+        LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
+        LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
+        LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+
+        vec3 w[3];
+        w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
+        w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
+        w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
+
+        // The stride-2 path needs one extra group of four source values per row
+#if STRIDE_X == 2
+        vec4 s[3];
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+        vec4 s[2];
+#else /* STRIDE_X not equals 1 or 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 2 */
+
+        // first line
+        // NOTE(review): ">> uint(3)" presumably converts a byte offset into a uvec2 (8-byte) element index - confirm in helpers.h
+        s = LOAD_AND_UNPACK(src.current_offset >> uint(3));
+        pixels += CONVOLVE1x3(s, w[0]);
+
+        // second line
+        s = LOAD_AND_UNPACK((src.current_offset + src_stride_y) >> uint(3));
+        pixels += CONVOLVE1x3(s, w[1]);
+
+        // third line
+        s = LOAD_AND_UNPACK((src.current_offset + (src_stride_y << 1)) >> uint(3));
+        pixels += CONVOLVE1x3(s, w[2]);
+
+        src.current_offset += src_stride_z;
+        weights.current_offset += weights_stride_z;
+    }
+
+#ifdef BIAS
+    // Two FP16 biases share one 32-bit word: even z_index uses the first half, odd z_index the second
+    uint packed_b;
+    LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+
+    vec2  b_pair = unpackHalf2x16(packed_b);
+    float b      = (z_index % uint(2) == uint(0)) ? b_pair.x : b_pair.y;
+
+    pixels += vec4(b);
+#endif /* BIAS */
+
+    // Repack the four results as halves and write them as a single uvec2 element
+    uvec2 packed_d = uvec2(packHalf2x16(pixels.xy), packHalf2x16(pixels.zw));
+    STORE1(dst, dst.current_offset >> uint(3), packed_d);
+}
+#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16)
+precision mediump float; /* default float precision for this FP16 variant */
+// Buffers: src/dst are addressed as uvec2 elements (four packed FP16 values each); weights/biases as uint elements (two packed FP16 values each).
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+// Only the stride-1 row kernel is wired in for this variant.
+#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
+
+/** Apply one kernel row (3 weights) to 4 adjacent output positions, horizontal stride 1.
+ *
+ * @param[in] tmp Two consecutive groups of four source values; only the first six values are read
+ * @param[in] w   The three weights of the kernel row
+ *
+ * @return For i = 0..3: src[i] * w[0] + src[i + 1] * w[1] + src[i + 2] * w[2]
+ */
+vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
+{
+    // Three views of the source row, each shifted by one more element
+    vec4 left   = tmp[0];
+    vec4 middle = vec4(tmp[0].yzw, tmp[1].x);
+    vec4 right  = vec4(tmp[0].zw, tmp[1].xy);
+
+    return left * w.x + middle * w.y + right * w.z;
+}
+
+/** Read two consecutive uvec2 elements of the src buffer and expand them to eight floats.
+ *
+ * @param[in] offset Index of the first uvec2 element to read from src
+ *
+ * @return The eight unpacked half-float values as two vec4
+ */
+vec4[2] load_and_unpack(uint offset)
+{
+    uvec2 raw_lo;
+    uvec2 raw_hi;
+
+    LOAD1(raw_lo, src, offset);
+    LOAD1(raw_hi, src, offset + uint(1));
+
+    // Each 32-bit word holds two halves; two words form one vec4
+    return vec4[2](vec4(unpackHalf2x16(raw_lo.x), unpackHalf2x16(raw_lo.y)),
+                   vec4(unpackHalf2x16(raw_hi.x), unpackHalf2x16(raw_hi.y)));
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader that processes 4x3 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1 only
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in] weights_depth The third dimensions of the weights tensors
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT_FP16(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+#ifdef BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+#endif /* BIAS */
+
+    // Running sums: pixels[k] holds the four output values of output row k
+    vec4 pixels[3];
+
+    for(int i = 0; i < 3; i++)
+    {
+        pixels[i] = vec4(0);
+    }
+
+    // The Z invocation index selects the kernel (4th weights dimension) and the matching bias
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += z_index * weights_stride_w;
+
+    // One 3x3 kernel slice per iteration; src and weights advance along Z together
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load 3 weights once: each uvec2 packs four halves, of which the first three are used
+        uvec2 packed_w[3];
+
+        LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
+        LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
+        LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+
+        vec3 w[3];
+        w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
+        w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
+        w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
+
+        // Each source row is loaded once and feeds every output row whose 3x3 window covers it:
+        // source row n contributes to output row k through kernel row (n - k).
+        vec4 s[2];
+
+        // first line
+        s = load_and_unpack(src.current_offset >> uint(3));
+
+        pixels[0] += CONVOLVE1x3(s, w[0]);
+
+        // second line
+        s = load_and_unpack((src.current_offset + src_stride_y) >> uint(3));
+
+        pixels[0] += CONVOLVE1x3(s, w[1]);
+        pixels[1] += CONVOLVE1x3(s, w[0]);
+
+        // third line
+        s = load_and_unpack((src.current_offset + (src_stride_y << 1)) >> uint(3));
+
+        pixels[0] += CONVOLVE1x3(s, w[2]);
+        pixels[1] += CONVOLVE1x3(s, w[1]);
+        pixels[2] += CONVOLVE1x3(s, w[0]);
+
+        // fourth line
+        s = load_and_unpack((src.current_offset + uint(3) * (src_stride_y)) >> uint(3));
+
+        pixels[1] += CONVOLVE1x3(s, w[2]);
+        pixels[2] += CONVOLVE1x3(s, w[1]);
+
+        // fifth line
+        s = load_and_unpack((src.current_offset + (src_stride_y << 2)) >> uint(3));
+
+        pixels[2] += CONVOLVE1x3(s, w[2]);
+
+        src.current_offset += src_stride_z;
+        weights.current_offset += weights_stride_z;
+    }
+
+#ifdef BIAS
+    // Two FP16 biases share one 32-bit word: even z_index uses the first half, odd z_index the second
+    uint packed_b;
+    LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+
+    vec2  b_pair = unpackHalf2x16(packed_b);
+    float b      = (z_index % uint(2) == uint(0)) ? b_pair.x : b_pair.y;
+
+    for(int i = 0; i < 3; i++)
+    {
+        pixels[i] += vec4(b);
+    }
+#endif /* BIAS */
+
+    // Repack each output row as four halves and store the rows dst_stride_y apart
+    uvec2 packed_d;
+
+    packed_d = uvec2(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw));
+    STORE1(dst, dst.current_offset >> uint(3), packed_d);
+
+    packed_d = uvec2(packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw));
+    STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(3), packed_d);
+
+    packed_d = uvec2(packHalf2x16(pixels[2].xy), packHalf2x16(pixels[2].zw));
+    STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(3), packed_d);
+}
+#elif defined(PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16)
+precision mediump float; /* default float precision for this FP16 variant */
+// Buffers: src/dst are addressed as uvec2 elements (four packed FP16 values each); weights/biases as uint elements (two packed FP16 values each).
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+// Only the stride-1 row kernel is wired in for this variant.
+#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
+
+/** Apply one kernel row (3 weights) to 4 adjacent output positions, horizontal stride 1.
+ *
+ * @param[in] tmp Two consecutive groups of four source values; only the first six values are read
+ * @param[in] w   The three weights of the kernel row
+ *
+ * @return For i = 0..3: src[i] * w[0] + src[i + 1] * w[1] + src[i + 2] * w[2]
+ */
+vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
+{
+    // Three views of the source row, each shifted by one more element
+    vec4 left   = tmp[0];
+    vec4 middle = vec4(tmp[0].yzw, tmp[1].x);
+    vec4 right  = vec4(tmp[0].zw, tmp[1].xy);
+
+    return left * w.x + middle * w.y + right * w.z;
+}
+
+/** Read two consecutive uvec2 elements of the src buffer and expand them to eight floats.
+ *
+ * @param[in] offset Index of the first uvec2 element to read from src
+ *
+ * @return The eight unpacked half-float values as two vec4
+ */
+vec4[2] load_and_unpack(uint offset)
+{
+    uvec2 raw_lo;
+    uvec2 raw_hi;
+
+    LOAD1(raw_lo, src, offset);
+    LOAD1(raw_hi, src, offset + uint(1));
+
+    // Each 32-bit word holds two halves; two words form one vec4
+    return vec4[2](vec4(unpackHalf2x16(raw_lo.x), unpackHalf2x16(raw_lo.y)),
+                   vec4(unpackHalf2x16(raw_hi.x), unpackHalf2x16(raw_hi.y)));
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader that processes 4x4 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1 only
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in] weights_depth The third dimensions of the weights tensors
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT_FP16(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+#ifdef BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+#endif /* BIAS */
+
+    // Running sums: pixels[k] holds the four output values of output row k
+    vec4 pixels[4];
+
+    for(int i = 0; i < 4; i++)
+    {
+        pixels[i] = vec4(0);
+    }
+
+    // The Z invocation index selects the kernel (4th weights dimension) and the matching bias
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += z_index * weights_stride_w;
+
+    // One 3x3 kernel slice per iteration; src and weights advance along Z together
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load 3 weights once: each uvec2 packs four halves, of which the first three are used
+        uvec2 packed_w[3];
+
+        LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
+        LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
+        LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+
+        vec3 w[3];
+        w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
+        w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
+        w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
+
+        // Each source row is loaded once and feeds every output row whose 3x3 window covers it:
+        // source row n contributes to output row k through kernel row (n - k).
+        vec4 s[2];
+
+        // first line
+        s = load_and_unpack(src.current_offset >> uint(3));
+
+        pixels[0] += CONVOLVE1x3(s, w[0]);
+
+        // second line
+        s = load_and_unpack((src.current_offset + src_stride_y) >> uint(3));
+
+        pixels[0] += CONVOLVE1x3(s, w[1]);
+        pixels[1] += CONVOLVE1x3(s, w[0]);
+
+        // third line
+        s = load_and_unpack((src.current_offset + (src_stride_y << 1)) >> uint(3));
+
+        pixels[0] += CONVOLVE1x3(s, w[2]);
+        pixels[1] += CONVOLVE1x3(s, w[1]);
+        pixels[2] += CONVOLVE1x3(s, w[0]);
+
+        // fourth line
+        s = load_and_unpack((src.current_offset + uint(3) * (src_stride_y)) >> uint(3));
+
+        pixels[1] += CONVOLVE1x3(s, w[2]);
+        pixels[2] += CONVOLVE1x3(s, w[1]);
+        pixels[3] += CONVOLVE1x3(s, w[0]);
+
+        // fifth line
+        s = load_and_unpack((src.current_offset + (src_stride_y << 2)) >> uint(3));
+
+        pixels[2] += CONVOLVE1x3(s, w[2]);
+        pixels[3] += CONVOLVE1x3(s, w[1]);
+
+        // sixth line
+        s = load_and_unpack((src.current_offset + uint(5) * (src_stride_y)) >> uint(3));
+
+        pixels[3] += CONVOLVE1x3(s, w[2]);
+
+        src.current_offset += src_stride_z;
+        weights.current_offset += weights_stride_z;
+    }
+
+#ifdef BIAS
+    // Two FP16 biases share one 32-bit word: even z_index uses the first half, odd z_index the second
+    uint packed_b;
+    LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+
+    vec2  b_pair = unpackHalf2x16(packed_b);
+    float b      = (z_index % uint(2) == uint(0)) ? b_pair.x : b_pair.y;
+
+    for(int i = 0; i < 4; i++)
+    {
+        pixels[i] += vec4(b);
+    }
+#endif /* BIAS */
+
+    // Repack each output row as four halves and store the rows dst_stride_y apart
+    uvec2 packed_d;
+
+    packed_d = uvec2(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw));
+    STORE1(dst, dst.current_offset >> uint(3), packed_d);
+
+    packed_d = uvec2(packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw));
+    STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(3), packed_d);
+
+    packed_d = uvec2(packHalf2x16(pixels[2].xy), packHalf2x16(pixels[2].zw));
+    STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(3), packed_d);
+
+    packed_d = uvec2(packHalf2x16(pixels[3].xy), packHalf2x16(pixels[3].zw));
+    STORE1(dst, (dst.current_offset + uint(3) * (dst_stride_y)) >> uint(3), packed_d);
+}
+#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16)
+precision mediump float; /* default float precision for this FP16 variant */
+// Buffers: src/dst are addressed as uvec2 elements (four packed FP16 values each); weights/biases as uint elements (two packed FP16 values each).
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+// Only the stride-1 row kernel is wired in for this variant.
+#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
+
+/** Apply one kernel row (3 weights) to 4 adjacent output positions, horizontal stride 1.
+ *
+ * @param[in] tmp Two consecutive groups of four source values; only the first six values are read
+ * @param[in] w   The three weights of the kernel row
+ *
+ * @return For i = 0..3: src[i] * w[0] + src[i + 1] * w[1] + src[i + 2] * w[2]
+ */
+vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
+{
+    // Three views of the source row, each shifted by one more element
+    vec4 left   = tmp[0];
+    vec4 middle = vec4(tmp[0].yzw, tmp[1].x);
+    vec4 right  = vec4(tmp[0].zw, tmp[1].xy);
+
+    return left * w.x + middle * w.y + right * w.z;
+}
+
+/** Read two consecutive uvec2 elements of the src buffer and expand them to eight floats.
+ *
+ * @param[in] offset Index of the first uvec2 element to read from src
+ *
+ * @return The eight unpacked half-float values as two vec4
+ */
+vec4[2] load_and_unpack(uint offset)
+{
+    uvec2 raw_lo;
+    uvec2 raw_hi;
+
+    LOAD1(raw_lo, src, offset);
+    LOAD1(raw_hi, src, offset + uint(1));
+
+    // Each 32-bit word holds two halves; two words form one vec4
+    return vec4[2](vec4(unpackHalf2x16(raw_lo.x), unpackHalf2x16(raw_lo.y)),
+                   vec4(unpackHalf2x16(raw_hi.x), unpackHalf2x16(raw_hi.y)));
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader that processes 4x3x2 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1 only
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in] weights_depth The third dimensions of the weights tensors
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT_FP16(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+#ifdef BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+#endif /* BIAS */
+
+    uvec2 packed_d;
+
+    // Running sums: pixels[k] holds the four output values of output row k
+    vec4 pixels[3];
+
+    // Each invocation produces two consecutive output feature maps: 2 * z and 2 * z + 1
+    // NOTE(review): nothing here guards the second map when the output depth is odd - confirm the host side pads or restricts this variant
+    uint z_base_index = gl_GlobalInvocationID.z << 1;
+
+    // store original src current offset, so that both passes read the same input window
+    uint s_offset = src.current_offset;
+
+    // Offset of the first of the two kernels handled by this invocation
+    uint w_offset = weights.current_offset + z_base_index * weights_stride_w;
+
+    for(int z = 0; z < 2; ++z)
+    {
+        uint z_index = z_base_index + uint(z);
+
+        src.current_offset = s_offset;
+        // Rebase explicitly on the 4th-dimension stride instead of relying on the depth loop
+        // having advanced exactly one kernel (which only holds when
+        // weights_stride_w == weights_depth * weights_stride_z).
+        weights.current_offset = w_offset + uint(z) * weights_stride_w;
+
+        for(int i = 0; i < 3; i++)
+        {
+            pixels[i] = vec4(0);
+        }
+
+        // One 3x3 kernel slice per iteration; src and weights advance along Z together
+        for(int d = 0; d < int(weights_depth); ++d)
+        {
+            // load 3 weights once: each uvec2 packs four halves, of which the first three are used
+            uvec2 packed_w[3];
+
+            LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
+            LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
+            LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+
+            vec3 w[3];
+            w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
+            w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
+            w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
+
+            // Each source row is loaded once and feeds every output row whose 3x3 window covers it:
+            // source row n contributes to output row k through kernel row (n - k).
+            vec4 s[2];
+
+            // first line
+            s = load_and_unpack(src.current_offset >> uint(3));
+
+            pixels[0] += CONVOLVE1x3(s, w[0]);
+
+            // second line
+            s = load_and_unpack((src.current_offset + src_stride_y) >> uint(3));
+
+            pixels[0] += CONVOLVE1x3(s, w[1]);
+            pixels[1] += CONVOLVE1x3(s, w[0]);
+
+            // third line
+            s = load_and_unpack((src.current_offset + (src_stride_y << 1)) >> uint(3));
+
+            pixels[0] += CONVOLVE1x3(s, w[2]);
+            pixels[1] += CONVOLVE1x3(s, w[1]);
+            pixels[2] += CONVOLVE1x3(s, w[0]);
+
+            // fourth line
+            s = load_and_unpack((src.current_offset + uint(3) * (src_stride_y)) >> uint(3));
+
+            pixels[1] += CONVOLVE1x3(s, w[2]);
+            pixels[2] += CONVOLVE1x3(s, w[1]);
+
+            // fifth line
+            s = load_and_unpack((src.current_offset + (src_stride_y << 2)) >> uint(3));
+
+            pixels[2] += CONVOLVE1x3(s, w[2]);
+
+            src.current_offset += src_stride_z;
+            weights.current_offset += weights_stride_z;
+        }
+
+#ifdef BIAS
+        // Two FP16 biases share one 32-bit word: even z_index uses the first half, odd z_index the second
+        uint packed_b;
+        LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+
+        vec2  b_pair = unpackHalf2x16(packed_b);
+        float b      = (z_index % uint(2) == uint(0)) ? b_pair.x : b_pair.y;
+
+        for(int i = 0; i < 3; i++)
+        {
+            pixels[i] += vec4(b);
+        }
+#endif /* BIAS */
+
+        // Repack each output row as four halves and store the rows dst_stride_y apart
+        packed_d = uvec2(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw));
+        STORE1(dst, dst.current_offset >> uint(3), packed_d);
+
+        packed_d = uvec2(packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw));
+        STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(3), packed_d);
+
+        packed_d = uvec2(packHalf2x16(pixels[2].xy), packHalf2x16(pixels[2].zw));
+        STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(3), packed_d);
+
+        // Move on to the next output feature map for the second pass
+        dst.current_offset += dst_stride_z;
+    }
+}
+#endif /* PROCESS_1_ELEMENT */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
new file mode 100644
index 0000000000..4fdbf0d19e
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+// Uniform block carrying the tensor descriptors and the extra arguments of the 5x5 direct convolution kernels.
+layout(std140) uniform shader_params
+{
+ TENSOR3D_PARAM_DECLARATION(src);
+ TENSOR3D_PARAM_DECLARATION(dst);
+ TENSOR3D_PARAM_DECLARATION(weights);
+#ifdef BIAS
+ // Descriptor of the bias vector; only present when the shader is built with BIAS defined.
+ VECTOR_PARAM_DECLARATION(biases);
+#endif /* BIAS */
+ // Stride of the weights tensor in the 4th dimension; main() multiplies it by the Z invocation index
+ // to select the kernel of the current output plane.
+ uint weights_stride_w;
+ // Third dimension of the weights tensor; bounds the accumulation loop in main().
+ uint weights_depth;
+};
+
+#ifdef DATA_TYPE_FP32
+
+precision highp float;
+
+// Storage buffers used by the FP32 kernel, one float per element.
+// NOTE(review): the BUFFER_DECLARATION arguments look like (name, binding point, element type, access qualifier)
+// - confirm against helpers.h.
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef BIAS
+// The bias buffer is only declared when the shader is built with BIAS defined.
+BUFFER_DECLARATION(biases, 4, float, readonly);
+#endif /* BIAS */
+
+/** Load five consecutive elements into r[0] .. r[4] with LOAD4, starting at @p offset.
+ *
+ * @note @p offset is wrapped in parentheses so that a caller may pass a compound expression
+ *       (for instance "byte_offset >> 2") without it re-associating with the "+ uint(n)" added below.
+ * @note This is a multi-statement macro and @p offset is expanded five times: use it as a
+ *       standalone statement inside braces and pass a side-effect free expression.
+ */
+#define LOAD20(r, name, offset)             \
+    r[0] = LOAD4(name, (offset));           \
+    r[1] = LOAD4(name, (offset) + uint(1)); \
+    r[2] = LOAD4(name, (offset) + uint(2)); \
+    r[3] = LOAD4(name, (offset) + uint(3)); \
+    r[4] = LOAD4(name, (offset) + uint(4))
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in] weights_depth The third dimensions of the weights tensors
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+
+    // Each invocation accumulates a single output element of plane z_index
+    uint  z_index = gl_GlobalInvocationID.z;
+    float pixels  = CONVERT(0, float);
+    float src_row[5];
+    float w_row[5];
+
+    // Jump to the kernel belonging to this output plane (stride is shifted by 2 to index float elements)
+    weights.current_offset += (z_index * weights_stride_w) >> 2;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // Multiply-accumulate the five rows of the 5x5 window of the current input plane
+        for(int row = 0; row < 5; ++row)
+        {
+            LOAD20(src_row, src, offset(src, 0, row));
+            LOAD20(w_row, weights, tensor3D_offset(weights, 0, row, 0));
+            pixels += src_row[0] * w_row[0] + src_row[1] * w_row[1] + src_row[2] * w_row[2] + src_row[3] * w_row[3] + src_row[4] * w_row[4];
+        }
+
+        // Advance both tensors to the next plane
+        src.current_offset += (src_stride_z >> 2);
+        weights.current_offset += (weights_stride_z >> 2);
+    }
+
+#ifdef BIAS
+    // One bias value per output plane
+    pixels += LOAD4(biases, vector_offset(biases, int(z_index)));
+#endif /* BIAS */
+
+    STORE4(dst, CURRENT_OFFSET(dst), pixels);
+}
+
+#elif defined(DATA_TYPE_FP16)
+
+precision mediump float;
+
+// Storage buffers used by the FP16 kernel. Half floats are accessed packed in 32-bit words:
+// src/dst are declared as uvec2 and weights/biases as uint, and the code below converts them
+// with unpackHalf2x16 / packHalf2x16.
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+// The bias buffer is only declared when the shader is built with BIAS defined.
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+
+// Bind LOAD_SRC / CONVOLVE1x5 to the implementation matching the horizontal stride selected at compile time.
+#if STRIDE_X == 1
+#define LOAD_SRC(src, row) load_src_stride1(src, row)
+#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define LOAD_SRC(src, row) load_src_stride2(src, row)
+#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight)
+#else /* STRIDE_X == 1 */
+// Only STRIDE_X equal to 1 or 2 is handled by this shader.
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 1 */
+
+/** Load one source row for the stride 1 kernel and expand it to floats.
+ *
+ * Two uvec2 words are fetched with GC_LOAD2_2D_OFFSET at (0, row); each word holds four packed
+ * half floats, so eight values are returned.
+ *
+ * @param[in] src Source image
+ * @param[in] row Row of the convolution window to load
+ *
+ * @return The eight unpacked source values as two vec4
+ */
+vec4[2] load_src_stride1(Image src, int row)
+{
+    uvec2 raw[2];
+
+    GC_LOAD2_2D_OFFSET(raw, src, 0, row);
+
+    return vec4[2](vec4(unpackHalf2x16(raw[0].x), unpackHalf2x16(raw[0].y)),
+                   vec4(unpackHalf2x16(raw[1].x), unpackHalf2x16(raw[1].y)));
+}
+
+/** Load one source row for the stride 2 kernel and expand it to floats.
+ *
+ * Three uvec2 words are fetched with GC_LOAD3_2D_OFFSET at (0, row); each word holds four packed
+ * half floats, so twelve values are returned.
+ *
+ * @param[in] src Source image
+ * @param[in] row Row of the convolution window to load
+ *
+ * @return The twelve unpacked source values as three vec4
+ */
+vec4[3] load_src_stride2(Image src, int row)
+{
+    uvec2 raw[3];
+    vec4  texels[3];
+
+    GC_LOAD3_2D_OFFSET(raw, src, 0, row);
+
+    for(int i = 0; i < 3; ++i)
+    {
+        texels[i] = vec4(unpackHalf2x16(raw[i].x), unpackHalf2x16(raw[i].y));
+    }
+
+    return texels;
+}
+
+/** Load one row of the 5x5 kernel and expand it to floats.
+ *
+ * Three 32-bit words are fetched with GC_LOAD3_3D_OFFSET at (0, row, 0) and unpacked into three
+ * vec2, i.e. six half values; the convolve1x5_* helpers only consume the first five
+ * (w[0].x, w[0].y, w[1].x, w[1].y, w[2].x).
+ *
+ * @param[in] weights Weights tensor
+ * @param[in] row     Row of the kernel to load
+ *
+ * @return The unpacked weights of the requested row
+ */
+vec2[3] load_weight(Tensor3D weights, int row)
+{
+    uvec3 raw;
+
+    GC_LOAD3_3D_OFFSET(raw, weights, 0, row, 0);
+
+    return vec2[3](unpackHalf2x16(raw[0]),
+                   unpackHalf2x16(raw[1]),
+                   unpackHalf2x16(raw[2]));
+}
+
+/** Convolve one row of the 5x5 window with stride 1, producing 4 output elements per thread.
+ *
+ * Output element i is the dot product of the source values i .. i + 4 with the five weights.
+ *
+ * @param[in] tmp Eight source values of the row (see load_src_stride1)
+ * @param[in] w   Weights of the row; w[0].x, w[0].y, w[1].x, w[1].y and w[2].x are used
+ *
+ * @return The four partial sums of this row
+ */
+vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
+{
+    vec4 lo = tmp[0];
+    vec4 hi = tmp[1];
+
+    // Window of four source values seen by each of the five taps
+    vec4 acc = lo * w[0].x;
+    acc += vec4(lo.yzw, hi.x) * w[0].y;
+    acc += vec4(lo.zw, hi.xy) * w[1].x;
+    acc += vec4(lo.w, hi.xyz) * w[1].y;
+    acc += hi * w[2].x;
+
+    return acc;
+}
+
+/** Convolve one row of the 5x5 window with stride 2, producing 4 output elements per thread.
+ *
+ * Output element i is the dot product of the source values 2 * i .. 2 * i + 4 with the five weights.
+ *
+ * @param[in] tmp Twelve source values of the row (see load_src_stride2)
+ * @param[in] w   Weights of the row; w[0].x, w[0].y, w[1].x, w[1].y and w[2].x are used
+ *
+ * @return The four partial sums of this row
+ */
+vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
+{
+    vec4 a = tmp[0];
+    vec4 b = tmp[1];
+    vec4 c = tmp[2];
+
+    // Tap k reads the source values k, k + 2, k + 4 and k + 6
+    vec4 acc = vec4(a.xz, b.xz) * w[0].x;
+    acc += vec4(a.yw, b.yw) * w[0].y;
+    acc += vec4(a.z, b.xz, c.x) * w[1].x;
+    acc += vec4(a.w, b.yw, c.y) * w[1].y;
+    acc += vec4(b.x, b.z, c.xz) * w[2].x;
+
+    return acc;
+}
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in] weights_depth The third dimensions of the weights tensors
+ */
+void main()
+{
+    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector biases = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+
+    // Each invocation accumulates four neighbouring output elements of plane z_index
+    uint z_index = gl_GlobalInvocationID.z;
+    vec4 acc     = vec4(0);
+
+    // Jump to the kernel belonging to this output plane
+    weights.current_offset += z_index * weights_stride_w;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        for(int row = 0; row < 5; row++)
+        {
+            vec2 taps[3]               = load_weight(weights, row);
+            vec4 src_row[STRIDE_X + 1] = LOAD_SRC(src, row);
+            acc += CONVOLVE1x5(src_row, taps);
+        }
+
+        // Advance both tensors to the next plane
+        src.current_offset += src_stride_z;
+        weights.current_offset += weights_stride_z;
+    }
+
+#ifdef BIAS
+    uint packed_b;
+
+    // Two half biases share one 32-bit word: even planes use the low half, odd planes the high half
+    GC_LOAD1_1D_OFFSET(packed_b, biases, z_index);
+    vec2  bias_pair = unpackHalf2x16(packed_b);
+    float b         = ((z_index % uint(2)) == uint(0)) ? bias_pair.x : bias_pair.y;
+    acc += vec4(b);
+#endif /* BIAS */
+
+    // Pack the four results back to half precision and store them as one uvec2
+    uvec2 packed_d = uvec2(packHalf2x16(acc.xy), packHalf2x16(acc.zw));
+    GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
+}
+
+#else /* DATA_TYPE_FP16 */
+#error Data type not supported
+#endif /* DATA_TYPE_FP16 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/dropout.cs b/src/core/GLES_COMPUTE/cs_shaders/dropout.cs
new file mode 100644
index 0000000000..54e08b1306
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/dropout.cs
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+// Uniform block carrying the descriptors of the three tensors used by the dropout kernels.
+layout(std140) uniform shader_params
+{
+ TENSOR3D_PARAM_DECLARATION(src);
+ // The mask is written when the shader is built with FORWARD defined and read otherwise (see main()).
+ TENSOR3D_PARAM_DECLARATION(mask);
+ TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+/** Scramble the bits of a 32-bit unsigned integer with a sequence of shift/add and shift/xor steps.
+ *
+ * @param[in] x Value to hash
+ *
+ * @return The hashed value (0 maps to 0)
+ */
+uint hash(uint x)
+{
+    uint h = x;
+
+    h += h << 10u;
+    h ^= h >> 6u;
+    h += h << 3u;
+    h ^= h >> 11u;
+    h += h << 15u;
+
+    return h;
+}
+
+/** Hash three unsigned integers into one by hashing y and z, xor-ing them with x and hashing the result.
+ *
+ * @param[in] v Values to hash
+ *
+ * @return The combined hash
+ */
+uint hash(uvec3 v)
+{
+    uint folded = v.x ^ hash(v.y) ^ hash(v.z);
+
+    return hash(folded);
+}
+
+/** Build a float in [0, 1) from the low 23 bits of an unsigned integer.
+ *
+ * The bits are used as the mantissa of a float whose sign and exponent are those of 1.0,
+ * which yields a value in [1, 2); subtracting 1.0 moves it to [0, 1).
+ *
+ * @param[in] m Source of the mantissa bits
+ *
+ * @return A float in the range [0, 1)
+ */
+float float_construct(uint m)
+{
+    const uint mantissa_mask = 0x007FFFFFu; // 23 mantissa bits of an IEEE-754 single
+    const uint one_bits      = 0x3F800000u; // bit pattern of 1.0
+
+    uint bits = (m & mantissa_mask) | one_bits;
+
+    return uintBitsToFloat(bits) - 1.0;
+}
+
+/** Pseudo random number in [0, 1) derived from a 3D coordinate and a seed.
+ *
+ * The seed is added to every component, the resulting bit patterns are hashed and the hash is
+ * turned into a float with float_construct().
+ *
+ * @param[in] v    Coordinate used as input of the hash
+ * @param[in] seed Seed added to each component of @p v
+ *
+ * @return A deterministic value in the range [0, 1)
+ */
+float rand(vec3 v, float seed)
+{
+    uvec3 bits = floatBitsToUint(v + seed);
+
+    return float_construct(hash(bits));
+}
+
+#ifdef DATA_TYPE_FP32
+
+precision highp float;
+
+// Storage buffers used by the FP32 kernel, one float per element.
+// The mask buffer has no access qualifier because main() either writes it (FORWARD) or reads it.
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(mask, 2, float, );
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+
+/** Dropout is used to reduce over-fitting in neural networks.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in,out] mask_ptr Pointer to the mask tensor (written when FORWARD is defined, read otherwise). Supported data types: same as @p src_ptr
+ * @param[in] mask_stride_x Stride of the mask tensor in X dimension (in bytes)
+ * @param[in] mask_step_x mask_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mask_stride_y Stride of the mask tensor in Y dimension (in bytes)
+ * @param[in] mask_step_y mask_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] mask_stride_z Stride of the mask tensor in Z dimension (in bytes)
+ * @param[in] mask_step_z mask_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] mask_offset_first_element_in_bytes The offset of the first element in the mask tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Tensor3D src  = GC_CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D mask = GC_CONVERT_TO_TENSOR3D_STRUCT(mask);
+    Tensor3D dst  = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // 1.0 keeps the element, 0.0 drops it
+    float keep = 0.f;
+
+#ifdef FORWARD
+    // Draw a new mask value from the invocation coordinates and store it for later reuse
+    float sample_val = rand(vec3(gl_GlobalInvocationID.xyz), SEED);
+    keep             = (sample_val > RATIO) ? 1.f : 0.f;
+    GC_STORE1_3D_OFFSET(keep, mask, 0, 0, 0);
+#else  /* FORWARD */
+    // Reuse the mask that is already stored in the mask tensor
+    GC_LOAD1_3D_OFFSET(keep, mask, 0, 0, 0);
+#endif /* FORWARD */
+
+    float in_val = 0.f;
+    GC_LOAD1_3D_OFFSET(in_val, src, 0, 0, 0);
+
+    float out_val = keep * in_val * float(SCALE);
+    GC_STORE1_3D_OFFSET(out_val, dst, 0, 0, 0);
+}
+
+#elif defined(DATA_TYPE_FP16)
+
+precision mediump float;
+
+// Storage buffers used by the FP16 kernel; every uint holds two packed half floats.
+// The mask buffer has no access qualifier because main() either writes it (FORWARD) or reads it.
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(mask, 2, uint, );
+BUFFER_DECLARATION(dst, 3, uint, writeonly);
+
+/** Dropout is used to reduce over-fitting in neural networks.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in,out] mask_ptr Pointer to the mask tensor (written when FORWARD is defined, read otherwise). Supported data types: same as @p src_ptr
+ * @param[in] mask_stride_x Stride of the mask tensor in X dimension (in bytes)
+ * @param[in] mask_step_x mask_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mask_stride_y Stride of the mask tensor in Y dimension (in bytes)
+ * @param[in] mask_step_y mask_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] mask_stride_z Stride of the mask tensor in Z dimension (in bytes)
+ * @param[in] mask_step_z mask_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] mask_offset_first_element_in_bytes The offset of the first element in the mask tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Tensor3D src  = GC_CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D mask = GC_CONVERT_TO_TENSOR3D_STRUCT(mask);
+    Tensor3D dst  = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Each invocation handles two half floats packed in one 32-bit word
+    uint mask_bits = uint(0);
+    vec2 keep      = vec2(0, 0);
+
+#ifdef FORWARD
+    // The second element uses x + 0.5 so that it gets a different random number than the first one
+    float sample0 = rand(vec3(gl_GlobalInvocationID.xyz), SEED);
+    float sample1 = rand(vec3(float(gl_GlobalInvocationID.x) + 0.5f, gl_GlobalInvocationID.yz), SEED);
+
+    keep.x    = (sample0 > RATIO) ? 1.f : 0.f;
+    keep.y    = (sample1 > RATIO) ? 1.f : 0.f;
+    mask_bits = packHalf2x16(keep);
+    GC_STORE1_3D_OFFSET(mask_bits, mask, 0, 0, 0);
+#else  /* FORWARD */
+    // Reuse the mask that is already stored in the mask tensor
+    GC_LOAD1_3D_OFFSET(mask_bits, mask, 0, 0, 0);
+    keep = unpackHalf2x16(mask_bits);
+#endif /* FORWARD */
+
+    uint in_bits = uint(0);
+    GC_LOAD1_3D_OFFSET(in_bits, src, 0, 0, 0);
+
+    vec2 in_pair  = unpackHalf2x16(in_bits);
+    vec2 out_pair = keep * in_pair * float(SCALE);
+    uint out_bits = packHalf2x16(out_pair);
+
+    GC_STORE1_3D_OFFSET(out_bits, dst, 0, 0, 0);
+}
+
+#else /* DATA_TYPE_FP32 */
+// Neither DATA_TYPE_FP32 nor DATA_TYPE_FP16 was defined: stop with an explicit message
+// instead of silently producing a shader without main().
+#error Data type not supported
+#endif /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs b/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs
new file mode 100644
index 0000000000..01a39866c7
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs
@@ -0,0 +1,553 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+#if defined(DATA_TYPE_FP32)
+#ifdef FILL_IMAGE_BORDERS_REPLICATE
+// The image is read and written in place through a single float buffer.
+BUFFER_DECLARATION(buf, 1, float, restrict);
+// Kernel arguments of the FP32 replicate border fill.
+layout(std140) uniform shader_params
+{
+ TENSOR3D_PARAM_DECLARATION(buf);
+ uint width; // Width of the valid region of the image
+ uint height; // Height of the valid region of the image
+ int start_pos_x; // X coordinate of the start of the valid region
+ int start_pos_y; // Y coordinate of the start of the valid region
+};
+
+/** Fill N pixels of the padding edge of a single channel image by replicating the closest valid pixel.
+ *
+ * @attention The border size for top, bottom, left, right needs to be passed at the compile time.
+ * e.g. BORDER_SIZE_TOP=0 BORDER_SIZE_BOTTOM=2 BORDER_SIZE_LEFT=0 BORDER_SIZE_RIGHT=2
+ *
+ * @param[in,out] buf_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_stride_z Stride between images if batching images (in bytes)
+ * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] width Width of the valid region of the image
+ * @param[in] height Height of the valid region of the image
+ * @param[in] start_pos_x X coordinate indicating the start point of the valid region
+ * @param[in] start_pos_y Y coordinate indicating the start point of the valid region
+ */
+void main()
+{
+    Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
+
+    // Move the base offset to the first element of the valid region (byte strides are shifted by 2 to index floats)
+    int region_shift   = (start_pos_y * int(buf_stride_y) + start_pos_x * int(buf_stride_x)) >> 2;
+    buf.current_offset = uint(int(buf.current_offset) + region_shift);
+
+    int valid_w = int(width);
+    int valid_h = int(height);
+    int gid     = int(gl_GlobalInvocationID.x);
+    // Invocations below the padded width fill one column of the top/bottom borders,
+    // the remaining ones fill the left/right borders of row "row"
+    int col = gid - BORDER_SIZE_LEFT;
+    int row = gid - (BORDER_SIZE_LEFT + valid_w + BORDER_SIZE_RIGHT);
+
+    if(row < 0)
+    {
+        // Columns outside of the valid region (corners) replicate the closest valid column
+        int src_col = (col < 0) ? 0 : ((col > (valid_w - 1)) ? (valid_w - 1) : col);
+
+        // Top border: replicate the first valid row
+        float top_val = LOAD4(buf, offset(buf, src_col, 0));
+        for(int y = -BORDER_SIZE_TOP; y < 0; ++y)
+        {
+            STORE4(buf, offset(buf, col, y), top_val);
+        }
+
+        // Bottom border: replicate the last valid row
+        float bottom_val = LOAD4(buf, offset(buf, src_col, valid_h - 1));
+        for(int y = 0; y < BORDER_SIZE_BOTTOM; ++y)
+        {
+            STORE4(buf, offset(buf, col, valid_h + y), bottom_val);
+        }
+    }
+    else
+    {
+        // Left border: replicate the first valid element of the row
+        float left_val = LOAD4(buf, offset(buf, 0, row));
+        for(int x = -BORDER_SIZE_LEFT; x < 0; ++x)
+        {
+            STORE4(buf, offset(buf, x, row), left_val);
+        }
+
+        // Right border: replicate the last valid element of the row
+        float right_val = LOAD4(buf, offset(buf, valid_w - 1, row));
+        for(int x = 0; x < BORDER_SIZE_RIGHT; ++x)
+        {
+            STORE4(buf, offset(buf, valid_w + x, row), right_val);
+        }
+    }
+}
+#endif /* FILL_IMAGE_BORDERS_REPLICATE */
+
+#ifdef FILL_IMAGE_BORDERS_CONSTANT
+// The constant fill only writes to the image, hence the writeonly buffer.
+BUFFER_DECLARATION(buf, 1, float, writeonly);
+// Kernel arguments of the FP32 constant border fill.
+layout(std140) uniform shader_params
+{
+ TENSOR3D_PARAM_DECLARATION(buf);
+ uint width; // Width of the valid region of the image
+ uint height; // Height of the valid region of the image
+ int start_pos_x; // X coordinate of the start of the valid region
+ int start_pos_y; // Y coordinate of the start of the valid region
+ float constant_value; // Value written to every border element
+};
+
+/** Fill N pixels of the padding edge of a single channel image with a constant value.
+ *
+ * @attention The border size for top, bottom, left, right needs to be passed at the compile time.
+ * e.g. BORDER_SIZE_TOP=0 BORDER_SIZE_BOTTOM=2 BORDER_SIZE_LEFT=0 BORDER_SIZE_RIGHT=2
+ *
+ * @param[out] buf_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] width Width of the valid region of the image
+ * @param[in] height Height of the valid region of the image
+ * @param[in] start_pos_x X coordinate indicating the start point of the valid region
+ * @param[in] start_pos_y Y coordinate indicating the start point of the valid region
+ * @param[in] constant_value Constant value to use to fill the edges
+ */
+void main()
+{
+    Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
+
+    // Move the base offset to the first element of the valid region (byte strides are shifted by 2 to index floats)
+    int region_shift   = (start_pos_y * int(buf_stride_y) + start_pos_x * int(buf_stride_x)) >> 2;
+    buf.current_offset = uint(int(buf.current_offset) + region_shift);
+
+    int valid_w = int(width);
+    int valid_h = int(height);
+    int gid     = int(gl_GlobalInvocationID.x);
+    // Invocations below the padded width fill one column of the top/bottom borders,
+    // the remaining ones fill the left/right borders of row "row"
+    int col = gid - BORDER_SIZE_LEFT;
+    int row = gid - (BORDER_SIZE_LEFT + valid_w + BORDER_SIZE_RIGHT);
+
+    if(row < 0)
+    {
+        // Top border
+        for(int y = -BORDER_SIZE_TOP; y < 0; ++y)
+        {
+            STORE1(buf, offset(buf, col, y), constant_value);
+        }
+
+        // Bottom border
+        for(int y = 0; y < BORDER_SIZE_BOTTOM; ++y)
+        {
+            STORE1(buf, offset(buf, col, valid_h + y), constant_value);
+        }
+    }
+    else
+    {
+        // Left border
+        for(int x = -BORDER_SIZE_LEFT; x < 0; ++x)
+        {
+            STORE1(buf, offset(buf, x, row), constant_value);
+        }
+
+        // Right border
+        for(int x = 0; x < BORDER_SIZE_RIGHT; ++x)
+        {
+            STORE1(buf, offset(buf, valid_w + x, row), constant_value);
+        }
+    }
+}
+#endif /* FILL_IMAGE_BORDERS_CONSTANT */
+
+#elif defined(DATA_TYPE_FP16)
+precision mediump float;
+
+#ifdef FILL_IMAGE_BORDERS_REPLICATE
+// The image is read and written in place; every uint holds two packed half floats.
+BUFFER_DECLARATION(buf, 1, uint, restrict);
+// Kernel arguments of the FP16 replicate border fill.
+layout(std140) uniform shader_params
+{
+ TENSOR3D_PARAM_DECLARATION(buf);
+ uint width; // Width of the valid region of the image
+ uint height; // Height of the valid region of the image
+ int start_pos_x; // X coordinate of the start of the valid region
+ int start_pos_y; // Y coordinate of the start of the valid region
+};
+
+/** Overwrite one half float of a packed 32-bit word of buf, leaving the other half untouched.
+ *
+ * When @p pos is even the low half of the word receives the high half of @p replicate_value,
+ * otherwise the high half of the word receives the low half of @p replicate_value.
+ *
+ * @param[in] offset          Index of the 32-bit word to update in buf
+ * @param[in] pos             Position whose parity selects the half to overwrite
+ * @param[in] replicate_value Packed pair of half floats providing the replicated value
+ */
+void set_replicate(uint offset, int pos, uint replicate_value)
+{
+    uint word;
+    LOAD1(word, buf, offset);
+
+    vec2 current = unpackHalf2x16(word);
+    vec2 source  = unpackHalf2x16(replicate_value);
+
+    if(pos % 2 == 0)
+    {
+        current.x = source.y;
+    }
+    else
+    {
+        current.y = source.x;
+    }
+
+    word = packHalf2x16(current);
+    STORE1(buf, offset, word);
+}
+
+/** Fill N pixels of the padding edge of a single channel image by replicating the closest valid pixel.
+ *
+ * @attention The border size for top, bottom, left, right needs to be passed at the compile time.
+ * e.g. BORDER_SIZE_TOP=0 BORDER_SIZE_BOTTOM=2 BORDER_SIZE_LEFT=0 BORDER_SIZE_RIGHT=2
+ *
+ * @param[in,out] buf_ptr Pointer to the source image. Supported data types: F16
+ * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_stride_z Stride between images if batching images (in bytes)
+ * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] width Width of the valid region of the image
+ * @param[in] height Height of the valid region of the image
+ * @param[in] start_pos_x X coordinate indicating the start point of the valid region
+ * @param[in] start_pos_y Y coordinate indicating the start point of the valid region
+ */
+void main()
+{
+ Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP_FP16(buf);
+
+ // Update pointer to point to the starting point of the valid region
+ buf.current_offset = uint(buf.current_offset + uint(start_pos_y) * buf_stride_y + uint(start_pos_x) * buf_stride_x);
+
+ int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT;
+ int gid0 = int(gl_GlobalInvocationID.x);
+ int gidH = gid0 - total_width;
+ int gidW = gid0 - BORDER_SIZE_LEFT;
+
+ if(gidH >= 0)
+ {
+ // Handle left border
+ uint left_val;
+ LOAD1(left_val, buf, offset_fp16(buf, 0, gidH) >> uint(2));
+ for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+ {
+ uint offset = offset_fp16(buf, i, gidH) >> 2;
+ int pos = i + BORDER_SIZE_LEFT;
+ if(i == -1)
+ {
+ if(pos % 2 == 0)
+ {
+ set_replicate(offset, pos, left_val);
+ }
+ }
+ else
+ {
+ if(pos % 2 == 0)
+ {
+ vec2 a = unpackHalf2x16(left_val);
+ uint b = packHalf2x16(a.xx);
+ STORE1(buf, offset, b);
+ }
+ }
+ }
+ // Handle right border
+ uint right_val;
+ LOAD1(right_val, buf, offset_fp16(buf, int(width) - 1, gidH) >> uint(2));
+ for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+ {
+ uint offset = offset_fp16(buf, int(width) + i, gidH) >> 2;
+ int pos = i + BORDER_SIZE_LEFT + int(width);
+
+ if(i == 0)
+ {
+ if(pos % 2 == 0)
+ {
+ vec2 a = unpackHalf2x16(right_val);
+ uint b = packHalf2x16(a.yy);
+ STORE1(buf, offset, b);
+ }
+ else
+ {
+ set_replicate(offset, pos, right_val);
+ }
+ }
+ else
+ {
+ if(pos % 2 == 0)
+ {
+ vec2 a = unpackHalf2x16(right_val);
+ uint b = packHalf2x16(a.yy);
+ STORE1(buf, offset, b);
+ }
+ }
+ }
+ }
+ else
+ {
+ // Get value for corners
+ int val_idx = gidW;
+ if(gidW < 0 || (gidW > (int(width) - 1)))
+ {
+ val_idx = gidW < 0 ? 0 : (int(width) - 1);
+ }
+
+ // Handle top border
+ uint top_val;
+ LOAD1(top_val, buf, offset_fp16(buf, val_idx, 0) >> uint(2));
+ for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+ {
+ uint offset = offset_fp16(buf, gidW, i) >> 2;
+
+ if(gid0 % 2 == 0)
+ {
+ if(gidW == (int(width) - 1))
+ {
+ vec2 a = unpackHalf2x16(top_val);
+ uint b = packHalf2x16(a.xx);
+ STORE1(buf, offset, b);
+ }
+ else
+ {
+ if(gidW < 0)
+ {
+ vec2 a = unpackHalf2x16(top_val);
+ uint b;
+ if(BORDER_SIZE_LEFT % 2 == 0)
+ {
+ b = packHalf2x16(a.xx);
+ }
+ else
+ {
+ b = packHalf2x16(a.yy);
+ }
+ STORE1(buf, offset, b);
+ }
+ else if(gidW >= int(width))
+ {
+ vec2 a = unpackHalf2x16(top_val);
+ uint b;
+ if((BORDER_SIZE_LEFT + int(width)) % 2 == 0)
+ {
+ b = packHalf2x16(a.yy);
+ }
+ STORE1(buf, offset, b);
+ }
+ else
+ {
+ STORE1(buf, offset, top_val);
+ }
+ }
+ }
+ }
+ // Handle bottom border
+ uint bottom_val;
+ LOAD1(bottom_val, buf, offset_fp16(buf, val_idx, int(height) - 1) >> uint(2));
+ for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+ {
+ uint offset = offset_fp16(buf, gidW, int(height) + i) >> 2;
+
+ if(gid0 % 2 == 0)
+ {
+ if(gidW == (int(width) - 1))
+ {
+ vec2 a = unpackHalf2x16(bottom_val);
+ uint b = packHalf2x16(a.xx);
+ STORE1(buf, offset, b);
+ }
+ else
+ {
+ if(gidW < 0)
+ {
+ vec2 a = unpackHalf2x16(bottom_val);
+ uint b;
+ if(BORDER_SIZE_LEFT % 2 == 0)
+ {
+ b = packHalf2x16(a.xx);
+ }
+ else
+ {
+ b = packHalf2x16(a.yy);
+ }
+ STORE1(buf, offset, b);
+ }
+ else if(gidW >= int(width))
+ {
+ vec2 a = unpackHalf2x16(bottom_val);
+ uint b;
+ if((BORDER_SIZE_LEFT + int(width)) % 2 == 0)
+ {
+ b = packHalf2x16(a.yy);
+ }
+ STORE1(buf, offset, b);
+ }
+ else
+ {
+ STORE1(buf, offset, bottom_val);
+ }
+ }
+ }
+ }
+ }
+}
+#endif /* FILL_IMAGE_BORDERS_REPLICATE */
+
+#ifdef FILL_IMAGE_BORDERS_CONSTANT
+BUFFER_DECLARATION(buf, 1, uint, restrict);
+
+layout(std140) uniform shader_params
+{
+ TENSOR3D_PARAM_DECLARATION(buf);
+ uint width;
+ uint height;
+ int start_pos_x;
+ int start_pos_y;
+ float constant_value;
+};
+
+/** Patch one FP16 half of a packed pair stored in buf with constant_value.
+ *
+ * The word addressed by offset is read, component x (pos even) or component y (pos odd)
+ * of the unpacked pair is replaced by constant_value, and the word is written back.
+ *
+ * @param[in] offset Offset handed to LOAD1/STORE1 to address the packed word in buf
+ * @param[in] pos    Position whose parity selects which half is replaced
+ */
+void set_constant(uint offset, int pos)
+{
+    uint word;
+    LOAD1(word, buf, offset);
+
+    vec2 stored = unpackHalf2x16(word);
+
+    if(pos % 2 != 0)
+    {
+        stored.y = constant_value;
+    }
+    else
+    {
+        stored.x = constant_value;
+    }
+
+    word = packHalf2x16(stored);
+
+    STORE1(buf, offset, word);
+}
+
+/** Fill N pixels of the padding edge of a single channel image with a constant value.
+ *
+ * The buffer is accessed as 32-bit words holding two FP16 values each. Words that are written
+ * whole receive constant_value in both halves; a single half is patched through set_constant().
+ * Invocations with gl_GlobalInvocationID.x >= total_width process one row of the left and right
+ * borders (row = id - total_width); all other invocations process one column position of the top
+ * and bottom borders.
+ *
+ * @attention The border size for top, bottom, left, right needs to be passed at the compile time.
+ * e.g. BORDER_SIZE_TOP=0 BORDER_SIZE_BOTTOM=2 BORDER_SIZE_LEFT=0 BORDER_SIZE_RIGHT=2
+ *
+ * @param[out] buf_ptr Pointer to the source image. Supported data types: F16
+ * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] width Width of the valid region of the image
+ * @param[in] height Height of the valid region of the image
+ * @param[in] start_pos_x X coordinate indicating the start point of the valid region
+ * @param[in] start_pos_y Y coordinate indicating the start point of the valid region
+ * @param[in] constant_value Constant value to use to fill the edges
+ */
+void main()
+{
+ Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP_FP16(buf);
+
+ int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT;
+ int gid0 = int(gl_GlobalInvocationID.x);
+ int gidH = gid0 - total_width; // >= 0 selects the left/right border path below
+ int gidW = gid0 - BORDER_SIZE_LEFT; // column relative to the first valid pixel
+
+ // Update pointer to point to the starting point of the valid region
+ buf.current_offset = uint(int(buf.current_offset) + ((start_pos_y * int(buf_stride_y) + start_pos_x * int(buf_stride_x))));
+
+ vec2 b = vec2(constant_value, constant_value);
+
+ uint packed_b = packHalf2x16(b); // both FP16 halves hold constant_value
+
+ if(gidH >= 0)
+ {
+ // Handle left border
+ for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+ {
+ uint offset = offset_fp16(buf, i, gidH) >> 2; // >> 2 turns the value into an index of 32-bit words
+ int pos = i + BORDER_SIZE_LEFT;
+
+ if(i == -1) // column adjacent to the valid region
+ {
+ if(pos % 2 == 0)
+ {
+ set_constant(offset, pos); // patch one half only, keep the other half of the word
+ }
+ }
+ else
+ {
+ if(pos % 2 == 0)
+ {
+ STORE1(buf, offset, packed_b); // whole word (two FP16 values) is overwritten
+ }
+ }
+ }
+ // Handle right border
+ for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+ {
+ uint offset = offset_fp16(buf, int(width) + i, gidH) >> 2;
+ int pos = i + BORDER_SIZE_LEFT + int(width);
+
+ if(i == 0) // first column after the valid region
+ {
+ if(pos % 2 == 0)
+ {
+ STORE1(buf, offset, packed_b);
+ }
+ else
+ {
+ set_constant(offset, pos); // odd pos: patch one half only
+ }
+ }
+ else
+ {
+ if(pos % 2 == 0)
+ {
+ STORE1(buf, offset, packed_b);
+ }
+ }
+ }
+ }
+ else
+ {
+ // Handle top border
+ for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+ {
+ uint offset = offset_fp16(buf, gidW, i) >> 2;
+
+ if(gid0 % 2 == 0) // only even invocations store; each store covers two FP16 values
+ {
+ STORE1(buf, offset, packed_b);
+ }
+ }
+ // Handle bottom border
+ for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+ {
+ uint offset = offset_fp16(buf, gidW, int(height) + i) >> 2;
+
+ if(gid0 % 2 == 0) // same pairing rule as for the top border
+ {
+ STORE1(buf, offset, packed_b);
+ }
+ }
+ }
+}
+#endif /* FILL_IMAGE_BORDERS_CONSTANT */
+#endif /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
new file mode 100755
index 0000000000..3313b88718
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
@@ -0,0 +1,623 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+#if defined(DATA_TYPE_FP32)
+#define LOAD8(r, name, offset) /* r.x, r.y <- LOAD4 at offset and offset + 1; expands to two statements */ \
+ r.x = LOAD4(name, offset); \
+ r.y = LOAD4(name, offset + uint(1))
+
+#define LOAD16(r, name, offset) /* r.x .. r.w <- LOAD4 at offset .. offset + 3; expands to four statements, offset is evaluated four times */ \
+ r.x = LOAD4(name, offset); \
+ r.y = LOAD4(name, offset + uint(1)); \
+ r.z = LOAD4(name, offset + uint(2)); \
+ r.w = LOAD4(name, offset + uint(3))
+
+#define STORE16(name, offset, r) /* STORE4 of r.x .. r.w at offset .. offset + 3; expands to four statements, so do not use as the body of an unbraced if/for */ \
+ STORE4(name, offset, r.x); \
+ STORE4(name, offset + uint(1), r.y); \
+ STORE4(name, offset + uint(2), r.z); \
+ STORE4(name, offset + uint(3), r.w)
+
+#ifdef GEMM_TRANSPOSE1xW
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+
+layout(std140) uniform shader_params
+{
+ IMAGE_PARAM_DECLARATION(src);
+ IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel computes the "vector" 1x4 transposition of the input matrix.
+ *
+ * Each invocation reads four consecutive floats at the current source position and writes them
+ * to the destination with the X and Y invocation indices swapped: the destination byte address is
+ * gl_GlobalInvocationID.y * 16 + gl_GlobalInvocationID.x * dst stride_y + dst first-element offset.
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main(void)
+{
+    /* Source (matrix B) and destination descriptors */
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    /* Destination address for matrix B transposed: X and Y are swapped.
+     * The sum is in bytes; the final shift converts it to an index of 32-bit floats. */
+    uint y_bytes       = gl_GlobalInvocationID.y * uint(16);
+    uint x_bytes       = gl_GlobalInvocationID.x * dst.stride_y;
+    uint dst_float_idx = (y_bytes + x_bytes + dst.offset_first_element_in_bytes) >> 2;
+
+    /* Copy four consecutive floats */
+    vec4 row;
+    LOAD16(row, src, offset(src, 0, 0));
+    STORE16(dst, dst_float_idx, row);
+}
+#endif /* GEMM_TRANSPOSE1xW */
+
+#ifdef GEMM_INTERLEAVE4x4
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+
+layout(std140) uniform shader_params
+{
+ IMAGE_PARAM_DECLARATION(src);
+ IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel reshapes the input matrix interleaving the values.
+ *
+ * Each invocation copies a 4x4 group of floats: the value read at offset(src, i, j) is written
+ * to the destination at its current offset plus (i * 4 + j), for i and j in [0, 3].
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main(void)
+{
+    /* Source and destination descriptors */
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    /* Walk the 16 values in destination order: k = i * 4 + j */
+    for(int k = 0; k < 16; ++k)
+    {
+        int i = k / 4;
+        int j = k % 4;
+
+        float value     = LOAD4(src, offset(src, i, j));
+        uint  dst_index = CURRENT_OFFSET(dst) + uint(k);
+        STORE4(dst, dst_index, value);
+    }
+}
+#endif /* GEMM_INTERLEAVE4x4 */
+
+#ifdef GEMM_ACCUMULATE_BIASES
+BUFFER_DECLARATION(accum, 1, float, restrict);
+BUFFER_DECLARATION(biases, 2, float, readonly);
+
+layout(std140) uniform shader_params
+{
+ IMAGE_PARAM_DECLARATION(accum);
+ VECTOR_PARAM_DECLARATION(biases);
+};
+
+/** This kernel accumulates each row with the biases vector.
+ *
+ * Each invocation adds 16 consecutive bias values to 16 consecutive values of the accumulate
+ * tensor, in place, starting at the respective current offsets.
+ *
+ * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: F32
+ * @param[in]      accum_stride_x                       Stride of the accumulate tensor in X dimension (in bytes)
+ * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      accum_stride_y                       Stride of the accumulate tensor in Y dimension (in bytes)
+ * @param[in]      accum_step_y                         accum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
+ * @param[in]      biases_ptr                           Pointer to the biases vector. Same as @p accum_ptr
+ * @param[in]      biases_stride_x                      Stride of the biases vector in X dimension (in bytes)
+ * @param[in]      biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the biases vector
+ */
+void main(void)
+{
+    Image  accum  = CONVERT_TO_IMAGE_STRUCT(accum);
+    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+    for(int k = 0; k < 16; ++k)
+    {
+        uint accum_index  = CURRENT_OFFSET(accum) + uint(k);
+        uint biases_index = CURRENT_OFFSET(biases) + uint(k);
+
+        float current = LOAD4(accum, accum_index);
+        float bias    = LOAD4(biases, biases_index);
+        float sum     = bias + current;
+
+        // Store result in the accumulate buffer
+        STORE4(accum, accum_index, sum);
+    }
+}
+#endif /* GEMM_ACCUMULATE_BIASES */
+
+#ifdef GEMM_MM_INTERLEAVED_TRANSPOSED /* unvalidate */
+BUFFER_DECLARATION(src0, 1, float, readonly);
+BUFFER_DECLARATION(src1, 2, float, readonly);
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+
+layout(std140) uniform shader_params
+{
+ IMAGE_PARAM_DECLARATION(src0);
+ IMAGE_PARAM_DECLARATION(src1);
+ IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
+ *
+ * Each invocation accumulates a 4x4 block: every step loads four floats of A and four floats of B
+ * and adds the four products (a.x, a.y, a.z, a.w) * b to the four row accumulators.
+ *
+ * @attention The number of floats to walk along matrix B and the alpha's value need to be passed at compile time using COLS_B and ALPHA
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main()
+{
+ Image src0 = CONVERT_TO_IMAGE_STRUCT(src0);
+ Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Compute address for matrix A and B */
+ src0.current_offset = (src0.offset_first_element_in_bytes + (uint(gl_GlobalInvocationID.y) * uint(src0.stride_y))) >> uint(2); // bytes -> float index
+ src1.current_offset = (src1.offset_first_element_in_bytes + (uint(gl_GlobalInvocationID.x) * uint(src1.stride_y))) >> uint(2); // bytes -> float index
+
+ /* Compute end row address for matrix B */
+ int end_row_mtx_b = int(src1.current_offset) + int(COLS_B); // float index one past the last value read from src1
+
+ /* Reset accumulators */
+ vec4 c00 = vec4(0.0f);
+ vec4 c10 = vec4(0.0f);
+ vec4 c20 = vec4(0.0f);
+ vec4 c30 = vec4(0.0f);
+
+ // FIXME: loop unrolling really needed for GLES?
+ for(; int(src1.current_offset) <= (end_row_mtx_b - 8); src0.current_offset += uint(8), src1.current_offset += uint(8)) // unrolled by two: 8 floats of each matrix per iteration
+ {
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ vec4 a0;
+ vec4 b0;
+ LOAD16(a0, src0, src0.current_offset);
+ LOAD16(b0, src1, src1.current_offset);
+
+ c00 += vec4(a0.x) * b0;
+ c10 += vec4(a0.y) * b0;
+ c20 += vec4(a0.z) * b0;
+ c30 += vec4(a0.w) * b0;
+
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ LOAD16(a0, src0, src0.current_offset + uint(4));
+ LOAD16(b0, src1, src1.current_offset + uint(4));
+
+ c00 += vec4(a0.x) * b0;
+ c10 += vec4(a0.y) * b0;
+ c20 += vec4(a0.z) * b0;
+ c30 += vec4(a0.w) * b0;
+ }
+
+ for(; int(src1.current_offset) < end_row_mtx_b; src0.current_offset += uint(4), src1.current_offset += uint(4)) // leftover steps: 4 floats of each matrix per iteration
+ {
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ vec4 a0;
+ vec4 b0;
+ LOAD16(a0, src0, src0.current_offset);
+ LOAD16(b0, src1, src1.current_offset);
+
+ c00 += vec4(a0.x) * b0;
+ c10 += vec4(a0.y) * b0;
+ c20 += vec4(a0.z) * b0;
+ c30 += vec4(a0.w) * b0;
+ }
+
+ /* Multiply by the weight of matrix product */
+ c00 = c00 * vec4(ALPHA);
+ c10 = c10 * vec4(ALPHA);
+ c20 = c20 * vec4(ALPHA);
+ c30 = c30 * vec4(ALPHA);
+
+ /* Store 4x4 block */
+ STORE16(dst, offset(dst, 0, 0), c00); // rows 0..3 of the output block
+ STORE16(dst, offset(dst, 0, 1), c10);
+ STORE16(dst, offset(dst, 0, 2), c20);
+ STORE16(dst, offset(dst, 0, 3), c30);
+}
+#endif /* GEMM_MM_INTERLEAVED_TRANSPOSED */
+
+#ifdef GEMM_MM_FLOATING_POINT
+BUFFER_DECLARATION(src0, 1, float, readonly);
+BUFFER_DECLARATION(src1, 2, float, readonly);
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+
+layout(std140) uniform shader_params
+{
+ IMAGE_PARAM_DECLARATION(src0);
+ IMAGE_PARAM_DECLARATION(src1);
+ IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ *
+ * NOTE(review): the code below walks src0 along a row and src1 row by row using src1_stride_y,
+ * which looks like addressing of plain (non-reshaped) matrices; an earlier version of this comment
+ * required reshaping with gemm_interleave4x4_32bit / gemm_transpose1x4 - confirm which is intended.
+ *
+ * Each invocation produces four output columns for up to four output rows
+ * (NUM_ELEMS_PROCESSED_PER_THREAD_Y selects how many accumulators are compiled in).
+ *
+ * @attention COLS_A, NUM_ELEMS_PROCESSED_PER_THREAD_X, NUM_ELEMS_PROCESSED_PER_THREAD_Y and ALPHA need to be passed at compile time
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main()
+{
+ Image src0 = CONVERT_TO_IMAGE_STRUCT(src0);
+ Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X); // first column handled by this invocation
+ /* Compute the address for the vector A and matrix B */
+ src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y)) >> uint(2); // bytes -> float index
+ src1.current_offset = (src1_offset_first_element_in_bytes + uint(idx * 4)) >> uint(2); // idx * 4 bytes per float, then bytes -> float index
+
+ /* Compute end row address for matrix A */
+ int end_row_vec_a = int(src0.current_offset) + ((COLS_A * 4) >> 2); // start index plus COLS_A floats
+
+ /* Reset accumulators */
+ vec4 acc0 = vec4(0.0f);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ vec4 acc1 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ vec4 acc2 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vec4 acc3 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ for(; int(src0.current_offset) <= (end_row_vec_a - 2); src0.current_offset += uint(2), src1.current_offset += uint((2 * int(src1_stride_y)) >> 2)) // two values of A and two rows of B per iteration
+ {
+ vec2 a0;
+ LOAD8(a0, src0, src0.current_offset);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ vec2 a1;
+ LOAD8(a1, src0, src0.current_offset + (src0_stride_y >> uint(2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ vec2 a2;
+ LOAD8(a2, src0, src0.current_offset + ((uint(2) * src0_stride_y) >> uint(2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vec2 a3;
+ LOAD8(a3, src0, src0.current_offset + ((uint(3) * src0_stride_y) >> uint(2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ vec4 b0;
+ vec4 b1;
+ LOAD16(b0, src1, src1.current_offset);
+ LOAD16(b1, src1, src1.current_offset + (src1_stride_y >> uint(2))); // next row of src1
+
+ acc0 += b0 * vec4(a0.x);
+ acc0 += b1 * vec4(a0.y);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * vec4(a1.x);
+ acc1 += b1 * vec4(a1.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * vec4(a2.x);
+ acc2 += b1 * vec4(a2.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * vec4(a3.x);
+ acc3 += b1 * vec4(a3.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ for(; int(src0.current_offset) < end_row_vec_a; src0.current_offset += uint(1), src1.current_offset += uint(int(src1_stride_y) >> 2)) // leftover: one value of A and one row of B per iteration
+ {
+ // Load values from matrix A
+ float a0;
+ a0 = LOAD4(src0, src0.current_offset);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float a1;
+ a1 = LOAD4(src0, src0.current_offset + ((uint(1) * src0_stride_y) >> uint(2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float a2;
+ a2 = LOAD4(src0, src0.current_offset + ((uint(2) * src0_stride_y) >> uint(2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float a3;
+ a3 = LOAD4(src0, src0.current_offset + ((uint(3) * src0_stride_y) >> uint(2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ vec4 b0;
+ LOAD16(b0, src1, src1.current_offset);
+
+ acc0 += b0 * vec4(a0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * vec4(a1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * vec4(a2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * vec4(a3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ /* Multiply by the weight of vector-matrix product */
+ acc0 = acc0 * vec4(ALPHA);
+ STORE16(dst, offset(dst, 0, 0), acc0); // one STORE16 per output row handled by this invocation
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = acc1 * vec4(ALPHA);
+ STORE16(dst, offset(dst, 0, 1), acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = acc2 * vec4(ALPHA);
+ STORE16(dst, offset(dst, 0, 2), acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = acc3 * vec4(ALPHA);
+ STORE16(dst, offset(dst, 0, 3), acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+#endif /* GEMM_MM_FLOATING_POINT */
+
+#ifdef GEMM_MATRIXADDITION
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, restrict);
+
+layout(std140) uniform shader_params
+{
+ IMAGE_PARAM_DECLARATION(src);
+ IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel performs the in-place matrix addition between 2 matrices, where the source matrix is weighted by a scalar value beta.
+ *
+ * Each invocation reads four floats from dst and four floats from src at their current offsets
+ * and writes dst + BETA * src back to dst.
+ *
+ * @attention The beta's value need to be passed at compile time using BETA
+ *
+ * @param[in]      src_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]      src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]      src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]      src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in, out] dst_ptr                           Pointer to the destination matrix, read and then overwritten. Supported data types: same as @p src_ptr
+ * @param[in]      dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]      dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]      dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main(void)
+{
+    /* Source and destination descriptors */
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    /* Values already present in dst (alpha * A x B) */
+    vec4 product;
+    LOAD16(product, dst, dst.current_offset);
+
+    /* Values of the matrix to be added */
+    vec4 addend;
+    LOAD16(addend, src, src.current_offset);
+
+    /* alpha * A x B + beta * C */
+    vec4 sum = product + vec4(BETA * addend);
+
+    /* Write the result back over the A x B values */
+    STORE16(dst, dst.current_offset, sum);
+}
+#endif /* GEMM_MATRIXADDITION */
+#elif defined(DATA_TYPE_FP16)
+precision mediump float;
+#ifdef GEMM_MM_FLOATING_POINT
+BUFFER_DECLARATION(src0, 1, uint, readonly);
+BUFFER_DECLARATION(src1, 2, uvec2, readonly);
+BUFFER_DECLARATION(dst, 3, uvec2, writeonly);
+
+layout(std140) uniform shader_params
+{
+ IMAGE_PARAM_DECLARATION(src0);
+ IMAGE_PARAM_DECLARATION(src1);
+ IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) on FP16 data
+ *
+ * NOTE(review): the code below walks src0 along a row and src1 row by row using src1_stride_y,
+ * which looks like addressing of plain (non-reshaped) matrices; an earlier version of this comment
+ * required reshaping with gemm_interleave4x4_32bit / gemm_transpose1x4 - confirm which is intended.
+ *
+ * Values are stored as packed FP16 pairs, unpacked to 32-bit floats for the accumulation and
+ * packed again before the store. Each invocation writes four output values.
+ *
+ * @attention COLS_A, NUM_ELEMS_PROCESSED_PER_THREAD_X, NUM_ELEMS_PROCESSED_PER_THREAD_Y and ALPHA need to be passed at compile time
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main()
+{
+ Image src0 = GC_CONVERT_TO_IMAGE_STRUCT(src0);
+ Image src1 = GC_CONVERT_TO_IMAGE_STRUCT(src1);
+ Image dst = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+
+ int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
+ /* Compute the address for the vector A and matrix B */
+ src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y)); // kept in bytes here (no >> 2, unlike the FP32 kernel)
+ src1.current_offset = src1_offset_first_element_in_bytes + uint(idx) * src1_stride_x; // kept in bytes
+
+ /* Compute end row address for matrix A */
+ uint end_row_vec_a = src0.current_offset + uint(COLS_A << 1); // COLS_A << 1 == COLS_A * 2 bytes per FP16 value
+
+ /* Reset accumulators */
+ vec4 acc0 = vec4(0.0f);
+
+ for(; src0.current_offset < (end_row_vec_a - uint(2)); src0.current_offset += uint(2 * 2), src1.current_offset += uint(2) * src1_stride_y) // two FP16 values of A (4 bytes) and two rows of B per iteration
+ {
+ uint packed_a0;
+ vec2 a0;
+
+ GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 0);
+ a0 = vec2(unpackHalf2x16(packed_a0));
+
+ uvec2 packed_b0;
+ uvec2 packed_b1;
+ vec4 b0;
+ vec4 b1;
+
+ GC_LOAD1_2D_OFFSET(packed_b0, src1, 0, 0);
+ GC_LOAD1_2D_OFFSET(packed_b1, src1, 0, 1);
+
+ b0 = vec4(unpackHalf2x16(packed_b0.x), unpackHalf2x16(packed_b0.y)); // two packed pairs -> four floats
+ b1 = vec4(unpackHalf2x16(packed_b1.x), unpackHalf2x16(packed_b1.y));
+
+ acc0 += b0 * vec4(a0.x);
+ acc0 += b1 * vec4(a0.y);
+ }
+
+ for(; src0.current_offset < end_row_vec_a; src0.current_offset += uint(2 * 2), src1.current_offset += src1_stride_y) // leftover: src0 still advances by 4 bytes but only a0.x is used below
+ {
+ uint packed_a0;
+ vec2 a0;
+
+ GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 0);
+ a0 = vec2(unpackHalf2x16(packed_a0));
+
+ uvec2 packed_b0;
+ vec4 b0;
+
+ GC_LOAD1_2D_OFFSET(packed_b0, src1, 0, 0);
+
+ b0 = vec4(unpackHalf2x16(packed_b0.x), unpackHalf2x16(packed_b0.y));
+
+ acc0 += b0 * (a0.x);
+ }
+
+ /* Multiply by the weight of vector-matrix product */
+ acc0 = acc0 * vec4(ALPHA);
+
+ uvec2 packed_d;
+ packed_d = uvec2(packHalf2x16(acc0.xy), packHalf2x16(acc0.zw)); // four floats -> two packed FP16 pairs
+ GC_STORE1_2D_OFFSET(packed_d, dst, 0, 0);
+}
+#endif /* GEMM_MM_FLOATING_POINT */
+
+#ifdef GEMM_ACCUMULATE_BIASES
+BUFFER_DECLARATION(accum, 1, uvec2, restrict);
+BUFFER_DECLARATION(biases, 2, uvec2, readonly);
+
+layout(std140) uniform shader_params
+{
+ IMAGE_PARAM_DECLARATION(accum);
+ VECTOR_PARAM_DECLARATION(biases);
+};
+
+/** This kernel accumulates each row with the biases vector.
+ *
+ * Each invocation loads two packed FP16 pairs (four values) from the accumulate tensor and from
+ * the biases vector, adds them as 32-bit floats and stores the re-packed sum back to the
+ * accumulate tensor.
+ *
+ * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: F16
+ * @param[in]      accum_stride_x                       Stride of the accumulate tensor in X dimension (in bytes)
+ * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      accum_stride_y                       Stride of the accumulate tensor in Y dimension (in bytes)
+ * @param[in]      accum_step_y                         accum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
+ * @param[in]      biases_ptr                           Pointer to the biases vector. Same as @p accum_ptr
+ * @param[in]      biases_stride_x                      Stride of the biases vector in X dimension (in bytes)
+ * @param[in]      biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the biases vector
+ */
+void main(void)
+{
+    Image  accum  = GC_CONVERT_TO_IMAGE_STRUCT(accum);
+    Vector biases = GC_CONVERT_TO_VECTOR_STRUCT(biases);
+
+    /* Fetch the packed operands */
+    uvec2 packed_accum;
+    uvec2 packed_bias;
+    GC_LOAD1_2D_OFFSET(packed_accum, accum, 0, 0);
+    GC_LOAD1_1D_OFFSET(packed_bias, biases, 0);
+
+    /* Unpack both operands to four floats each */
+    vec4 accum_values = vec4(unpackHalf2x16(packed_accum.x), unpackHalf2x16(packed_accum.y));
+    vec4 bias_values  = vec4(unpackHalf2x16(packed_bias.x), unpackHalf2x16(packed_bias.y));
+
+    /* Add, re-pack and write back in place */
+    vec4 sum     = accum_values + bias_values;
+    packed_accum = uvec2(packHalf2x16(sum.xy), packHalf2x16(sum.zw));
+    GC_STORE1_2D_OFFSET(packed_accum, accum, 0, 0);
+}
+#endif /* GEMM_ACCUMULATE_BIASES */
+#else /* DATA_TYPE_F32 */
+#error Data type not supported
+#endif /* DATA_TYPE_F32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/helpers.h b/src/core/GLES_COMPUTE/cs_shaders/helpers.h
new file mode 100644
index 0000000000..86dedf5a9c
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/helpers.h
@@ -0,0 +1,582 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+// Clamp x to [min_val, max_val] with the built-in min/max.
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+// Paste size onto type (e.g. vec and 4 give vec4); the extra level presumably lets the arguments expand before pasting.
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+// Constructor-style conversion of x to type.
+#define CONVERT(x, type) type(x)
+// Call pack_<stype>_<dtype>(value); no such function is defined in this header, so the including shader must provide it.
+#define PACK(value, stype, dtype) \
+ pack_##stype##_##dtype(value)
+// Call unpack_<stype>_<dtype>(value); likewise provided by the including shader.
+#define UNPACK(value, stype, dtype) \
+ unpack_##stype##_##dtype(value)
+// Declare an std430 storage buffer block <name>Buffer at the given binding, holding the unsized array <name>_ptr of the given type.
+#define BUFFER_DECLARATION(name, location, type, access) \
+ layout(std430, binding = location) access buffer name##Buffer \
+ { \
+ type name##_ptr[]; \
+ }
+// uint members describing a vector. The last member has no semicolon: write VECTOR_PARAM_DECLARATION(x); at the use site.
+#define VECTOR_PARAM_DECLARATION(name) \
+ uint name##_stride_x; \
+ uint name##_step_x; \
+ uint name##_offset_first_element_in_bytes; \
+ uint name##_buffer_data_type_size
+// Same for a 2D image. <name>_buffer_data_type_size is used as a right-shift amount by the GC_*_OFFSET macros.
+#define IMAGE_PARAM_DECLARATION(name) \
+ uint name##_stride_x; \
+ uint name##_step_x; \
+ uint name##_stride_y; \
+ uint name##_step_y; \
+ uint name##_offset_first_element_in_bytes; \
+ uint name##_buffer_data_type_size
+// Same for a 3D tensor.
+#define TENSOR3D_PARAM_DECLARATION(name) \
+ uint name##_stride_x; \
+ uint name##_step_x; \
+ uint name##_stride_y; \
+ uint name##_step_y; \
+ uint name##_stride_z; \
+ uint name##_step_z; \
+ uint name##_offset_first_element_in_bytes; \
+ uint name##_buffer_data_type_size
+
+/** Structure to hold Vector information */
+struct Vector
+{
+ uint current_offset; /**< Current offset of vector: a 4-byte element index when built by update_vector_workitem_offset, a byte offset when built by the _fp16 or gc_ helpers */
+ uint offset_first_element_in_bytes; /**< The offset of the first element in the source vector */
+ uint stride_x; /**< Stride of the vector in X dimension (in bytes) */
+};
+
+/** Structure to hold Image information */
+struct Image
+{
+ uint current_offset; /**< Current offset of image: a 4-byte element index when built by the plain update_image_* helpers, a byte offset when built by the _fp16 or gc_ helpers */
+ uint offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ uint stride_x; /**< Stride of the image in X dimension (in bytes) */
+ uint stride_y; /**< Stride of the image in Y dimension (in bytes) */
+};
+
+/** Structure to hold 3D tensor information */
+struct Tensor3D
+{
+ uint current_offset; /**< Current offset of tensor: a 4-byte element index when built by update_tensor3D_workitem_offset, a byte offset when built by the _fp16 or gc_ helpers */
+ uint offset_first_element_in_bytes; /**< The offset of the first element in the source tensor */
+ uint stride_x; /**< Stride of the tensor in X dimension (in bytes) */
+ uint stride_y; /**< Stride of the tensor in Y dimension (in bytes) */
+ uint stride_z; /**< Stride of the tensor in Z dimension (in bytes) */
+};
+
+/////////////////////////////////////////////////////////////
+// TODO: old to be removed
+// Legacy API: the plain variants yield current_offset as a 4-byte element index, the _FP16 variants as a byte offset.
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+ update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_FP16(name) \
+ update_vector_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+// _NO_STEP variants pass a zero step, so the offset does not depend on the invocation ID.
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+ update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0))
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(name) \
+ update_vector_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0))
+// Image views: each macro forwards the <name>_* stride/step/offset values to the matching update_image_* helper.
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+ update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_FP16(name) \
+ update_image_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0))
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP_FP16(name) \
+ update_image_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0))
+// 3D tensor parameters wrapped as a 2D Image. The _NO_STEP forms zero only the X and Y steps; name##_step_z is still applied.
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP_FP16(name) \
+ update_image_from_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+ update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_FP16(name) \
+ update_image_from_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+// Full 3D tensor views; the _NO_STEP forms pass uint(0) for all three steps.
+#define CONVERT_TO_TENSOR3D_STRUCT(name) \
+ update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+ name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_FP16(name) \
+ update_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+ name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+ update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, uint(0))
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(name) \
+ update_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, uint(0))
+
+// FIXME: Redesign the macros if different data types are supported.
+#define LOAD4(name, offset) \
+ name##_ptr[offset]
+// Despite their names, LOAD4/STORE4 access a single <name>_ptr element, like LOAD1/STORE1 (LOAD4 is an expression, LOAD1 an assignment).
+#define STORE4(name, offset, value) \
+ name##_ptr[offset] = value
+
+// Load 1 element, whose size is determined by the SSBO type.
+#define LOAD1(r, name, offset) \
+ r = name##_ptr[offset]
+// Argument order is (name, offset, value); the GC_STORE* macros take the value first.
+#define STORE1(name, offset, value) \
+ name##_ptr[offset] = value
+// LOAD2, STORE2 and LOAD3 expand to several statements: do not use them as the unbraced body of an if/for.
+#define LOAD2(r, name, offset) \
+ LOAD1(r[0], name, offset); \
+ LOAD1(r[1], name, (offset) + uint(1))
+
+#define STORE2(name, offset, value) \
+ name##_ptr[offset] = value[0]; \
+ name##_ptr[(offset) + uint(1)] = value[1]
+
+#define LOAD3(r, name, offset) \
+ LOAD1(r[0], name, offset); \
+ LOAD1(r[1], name, (offset) + uint(1)); \
+ LOAD1(r[2], name, (offset) + uint(2))
+// Offset stored in a Vector/Image/Tensor3D structure variable.
+#define CURRENT_OFFSET(name) \
+ name.current_offset
+
+/** Build the Vector structure for this workitem, with the offset expressed as a 4-byte element index.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return A Vector whose current_offset is this workitem's byte position shifted right by 2
+ */
+Vector update_vector_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    uint byte_pos = offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x;
+    Vector result;
+    result.current_offset                = byte_pos >> 2;
+    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    result.stride_x                      = stride_x;
+    return result;
+}
+
+/** Build the Vector structure for this workitem, keeping the offset in bytes.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return A Vector whose current_offset is this workitem's byte position
+ */
+Vector update_vector_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    uint byte_pos = offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x;
+    Vector result;
+    result.current_offset                = byte_pos;
+    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    result.stride_x                      = stride_x;
+    return result;
+}
+
+/** Build the Image structure for this workitem, with the offset expressed as a 4-byte element index.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An Image whose current_offset is this workitem's byte position shifted right by 2
+ */
+Image update_image_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    uint byte_pos = offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y;
+    Image result;
+    result.current_offset                = byte_pos >> 2;
+    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    result.stride_x                      = stride_x;
+    result.stride_y                      = stride_y;
+    return result;
+}
+
+/** Build the Image structure for this workitem, keeping the offset in bytes.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An Image whose current_offset is this workitem's byte position
+ */
+Image update_image_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    uint byte_pos = offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y;
+    Image result;
+    result.current_offset                = byte_pos;
+    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    result.stride_x                      = stride_x;
+    result.stride_y                      = stride_y;
+    return result;
+}
+
+/** Build a 2D Image view of a 3D tensor for this workitem, with the offset expressed as a 4-byte element index.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] stride_x Stride of the tensor in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y Stride of the tensor in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z Stride of the tensor in Z dimension (in bytes); accepted but not stored, as Image has no Z stride
+ * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 2D Image whose current_offset is this workitem's byte position (X, Y and Z steps applied) shifted right by 2
+ */
+Image update_image_from_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    uint byte_pos = offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z;
+    Image result;
+    result.current_offset                = byte_pos >> 2;
+    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    result.stride_x                      = stride_x;
+    result.stride_y                      = stride_y;
+    return result;
+}
+
+/** Build a 2D Image view of a 3D tensor for this workitem, keeping the offset in bytes.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] stride_x Stride of the tensor in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y Stride of the tensor in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z Stride of the tensor in Z dimension (in bytes); accepted but not stored, as Image has no Z stride
+ * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 2D Image whose current_offset is this workitem's byte position (X, Y and Z steps applied)
+ */
+Image update_image_from_tensor3D_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    uint byte_pos = offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z;
+    Image result;
+    result.current_offset                = byte_pos;
+    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    result.stride_x                      = stride_x;
+    result.stride_y                      = stride_y;
+    return result;
+}
+
+/** Build the Tensor3D structure for this workitem, with the offset expressed as a 4-byte element index.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] stride_x Stride of the tensor in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y Stride of the tensor in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z Stride of the tensor in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A Tensor3D whose current_offset is this workitem's byte position shifted right by 2
+ */
+Tensor3D update_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    uint byte_pos = offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z;
+    Tensor3D result;
+    result.current_offset                = byte_pos >> 2;
+    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    result.stride_x                      = stride_x;
+    result.stride_y                      = stride_y;
+    result.stride_z                      = stride_z;
+    return result;
+}
+
+/** Build the Tensor3D structure for this workitem, keeping the offset in bytes.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] stride_x Stride of the tensor in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y Stride of the tensor in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z Stride of the tensor in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A Tensor3D whose current_offset is this workitem's byte position
+ */
+Tensor3D update_tensor3D_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    uint byte_pos = offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z;
+    Tensor3D result;
+    result.current_offset                = byte_pos;
+    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    result.stride_x                      = stride_x;
+    result.stride_y                      = stride_y;
+    result.stride_z                      = stride_z;
+    return result;
+}
+
+/** Return the 4-byte element index of the element x positions away from the current one.
+ *
+ * @param[in] vec Vector structure; current_offset is read as a 4-byte element index and converted back to bytes first
+ * @param[in] x Relative X position, multiplied by stride_x (may be negative)
+ */
+uint vector_offset(Vector vec, int x)
+{
+    return uint(int(vec.current_offset << 2) + x * int(vec.stride_x)) >> 2;
+}
+
+/** Return the offset of the element x positions away from the current one, in the same unit as vec.current_offset plus stride bytes.
+ *
+ * @param[in] vec Vector structure; current_offset is used as is (a byte offset when built by the _fp16 helper)
+ * @param[in] x Relative X position, multiplied by stride_x (may be negative)
+ */
+uint vector_offset_fp16(Vector vec, int x)
+{
+    return uint(int(vec.current_offset) + x * int(vec.stride_x));
+}
+
+/** Return the 4-byte element index of the element (x, y) positions away from the current one.
+ *
+ * @param[in] img Image structure; current_offset is read as a 4-byte element index and converted back to bytes first
+ * @param[in] x Relative X position, multiplied by stride_x (may be negative)
+ * @param[in] y Relative Y position, multiplied by stride_y (may be negative)
+ */
+uint offset(Image img, int x, int y)
+{
+    return uint(int(img.current_offset << 2) + x * int(img.stride_x) + y * int(img.stride_y)) >> 2;
+}
+
+/** Return the offset of the element (x, y) positions away from the current one, without any unit conversion.
+ *
+ * @param[in] img Image structure; current_offset is used as is (a byte offset when built by the _fp16 helpers)
+ * @param[in] x Relative X position, multiplied by stride_x (may be negative)
+ * @param[in] y Relative Y position, multiplied by stride_y (may be negative)
+ */
+uint offset_fp16(Image img, int x, int y)
+{
+    return uint(int(img.current_offset) + x * int(img.stride_x) + y * int(img.stride_y));
+}
+
+/** Return the 4-byte element index of the element (x, y, z) positions away from the current one.
+ *
+ * @param[in] tensor Tensor3D structure; current_offset is read as a 4-byte element index and converted back to bytes first
+ * @param[in] x Relative X position, multiplied by stride_x (may be negative)
+ * @param[in] y Relative Y position, multiplied by stride_y (may be negative)
+ * @param[in] z Relative Z position, multiplied by stride_z (may be negative)
+ */
+uint tensor3D_offset(Tensor3D tensor, int x, int y, int z)
+{
+    return uint(int(tensor.current_offset << 2) + x * int(tensor.stride_x) + y * int(tensor.stride_y) + z * int(tensor.stride_z)) >> 2;
+}
+
+/** Return the offset of the element (x, y, z) positions away from the current one, without any unit conversion.
+ *
+ * @param[in] tensor Tensor3D structure; current_offset is used as is (a byte offset when built by the _fp16 helper)
+ * @param[in] x Relative X position, multiplied by stride_x (may be negative)
+ * @param[in] y Relative Y position, multiplied by stride_y (may be negative)
+ * @param[in] z Relative Z position, multiplied by stride_z (may be negative)
+ */
+uint tensor3D_offset_fp16(Tensor3D tensor, int x, int y, int z)
+{
+    return uint(int(tensor.current_offset) + x * int(tensor.stride_x) + y * int(tensor.stride_y) + z * int(tensor.stride_z));
+}
+
+/////////////////////////////////////////////////////////////
+// new one: GC_* API. current_offset is always kept in bytes; the GC_*_OFFSET load/store macros turn it into an index of <name>_ptr.
+// _NO_STEP variants pass uint(0) for the steps, except GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP which still applies name##_step_z.
+#define GC_CONVERT_TO_VECTOR_STRUCT(name) \
+ gc_update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+ gc_update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0))
+
+#define GC_CONVERT_TO_IMAGE_STRUCT(name) \
+ gc_update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define GC_CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+ gc_update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0))
+
+#define GC_CONVERT_TO_TENSOR3D_STRUCT(name) \
+ gc_update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+ name##_stride_z, name##_step_z)
+
+#define GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+ gc_update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, uint(0))
+
+#define GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+ gc_update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+ gc_update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, name##_step_z)
+/** Build the Vector structure for this workitem; current_offset is this workitem's byte position. */
+Vector gc_update_vector_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    uint byte_pos = offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x;
+    Vector result;
+    result.current_offset                = byte_pos;
+    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    result.stride_x                      = stride_x;
+    return result;
+}
+/** Build the Image structure for this workitem; current_offset is this workitem's byte position. */
+Image gc_update_image_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    uint byte_pos = offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y;
+    Image result;
+    result.current_offset                = byte_pos;
+    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    result.stride_x                      = stride_x;
+    result.stride_y                      = stride_y;
+    return result;
+}
+/** Build the Tensor3D structure for this workitem; current_offset is this workitem's byte position. */
+Tensor3D gc_update_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    uint byte_pos = offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z;
+    Tensor3D result;
+    result.current_offset                = byte_pos;
+    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    result.stride_x                      = stride_x;
+    result.stride_y                      = stride_y;
+    result.stride_z                      = stride_z;
+    return result;
+}
+/** Build a 2D Image view of a 3D tensor for this workitem; current_offset is in bytes and stride_z is accepted but not stored. */
+Image gc_update_image_from_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    uint byte_pos = offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z;
+    Image result;
+    result.current_offset                = byte_pos;
+    result.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    result.stride_x                      = stride_x;
+    result.stride_y                      = stride_y;
+    return result;
+}
+// Offset stored in a Vector/Image/Tensor3D structure variable (a byte offset when built by the gc_update_* helpers).
+#define GC_CURRENT_OFFSET(name) \
+ name.current_offset
+/** Return vec.current_offset moved by x * stride_x bytes (x may be negative); no unit conversion is applied. */
+uint gc_vector_offset(Vector vec, int x)
+{
+    return uint(int(vec.current_offset) + x * int(vec.stride_x));
+}
+/** Return img.current_offset moved by x * stride_x + y * stride_y bytes (x, y may be negative); no unit conversion is applied. */
+uint gc_image_offset(Image img, int x, int y)
+{
+    return uint(int(img.current_offset) + x * int(img.stride_x) + y * int(img.stride_y));
+}
+/** Return tensor.current_offset moved by x * stride_x + y * stride_y + z * stride_z bytes; no unit conversion is applied. */
+uint gc_tensor3D_offset(Tensor3D tensor, int x, int y, int z)
+{
+    return uint(int(tensor.current_offset) + x * int(tensor.stride_x) + y * int(tensor.stride_y) + z * int(tensor.stride_z));
+}
+
+// Each load/store moves whole <name>_ptr elements, so the amount of data per access depends on the buffer's declared type.
+#define GC_LOAD1(r, name, offset) \
+ r = name##_ptr[offset]
+// The 2- and 3-element forms expand to several statements: do not use them as the unbraced body of an if/for.
+#define GC_LOAD2(r, name, offset) \
+ GC_LOAD1(r[0], name, offset); \
+ GC_LOAD1(r[1], name, (offset) + uint(1))
+
+#define GC_LOAD3(r, name, offset) \
+ GC_LOAD1(r[0], name, offset); \
+ GC_LOAD1(r[1], name, (offset) + uint(1)); \
+ GC_LOAD1(r[2], name, (offset) + uint(2))
+// Stores take the value first: (value, name, offset).
+#define GC_STORE1(value, name, offset) \
+ name##_ptr[offset] = value
+
+#define GC_STORE2(value, name, offset) \
+ GC_STORE1(value[0], name, offset); \
+ GC_STORE1(value[1], name, (offset) + uint(1))
+
+#define GC_STORE3(value, name, offset) \
+ GC_STORE1(value[0], name, offset); \
+ GC_STORE1(value[1], name, (offset) + uint(1)); \
+ GC_STORE1(value[2], name, (offset) + uint(2))
+
+// has to manually expand them since not supported by compiler. name is both the structure variable and the prefix of <name>_ptr; the offset from gc_*_offset() is shifted right by <name>_buffer_data_type_size to index the buffer.
+#define GC_LOAD1_1D_OFFSET(r, name, x) \
+ GC_LOAD1(r, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD1_2D_OFFSET(r, name, x, y) \
+ GC_LOAD1(r, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD1_3D_OFFSET(r, name, x, y, z) \
+ GC_LOAD1(r, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
+
+#define GC_STORE1_1D_OFFSET(value, name, x) \
+ GC_STORE1(value, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
+
+#define GC_STORE1_2D_OFFSET(value, name, x, y) \
+ GC_STORE1(value, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
+
+#define GC_STORE1_3D_OFFSET(value, name, x, y, z) \
+ GC_STORE1(value, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD2_1D_OFFSET(r, name, x) \
+ GC_LOAD2(r, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD2_2D_OFFSET(r, name, x, y) \
+ GC_LOAD2(r, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD2_3D_OFFSET(r, name, x, y, z) \
+ GC_LOAD2(r, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
+
+#define GC_STORE2_1D_OFFSET(value, name, x) \
+ GC_STORE2(value, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
+
+#define GC_STORE2_2D_OFFSET(value, name, x, y) \
+ GC_STORE2(value, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
+
+#define GC_STORE2_3D_OFFSET(value, name, x, y, z) \
+ GC_STORE2(value, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD3_1D_OFFSET(r, name, x) \
+ GC_LOAD3(r, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD3_2D_OFFSET(r, name, x, y) \
+ GC_LOAD3(r, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD3_3D_OFFSET(r, name, x, y, z) \
+ GC_LOAD3(r, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
+
+/////////////////////////////////////////////////////////////
+
+#endif // ARM_COMPUTE_HELPER_H
diff --git a/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
new file mode 100755
index 0000000000..5699340c14
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+// Strides, steps and first-element offsets of the three tensors, passed in a std140 uniform block.
+layout(std140) uniform shader_params
+{
+ TENSOR3D_PARAM_DECLARATION(src1);
+ TENSOR3D_PARAM_DECLARATION(src2);
+ TENSOR3D_PARAM_DECLARATION(dst);
+};
+// Float storage buffers at bindings 1-3: src1 and src2 are read-only, dst is write-only.
+BUFFER_DECLARATION(src1, 1, float, readonly);
+BUFFER_DECLARATION(src2, 2, float, readonly);
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+
+#ifdef CROSS_MAP
+/** Apply cross map normalization: dst = src1 / (KAPPA + COEFF * sum)^BETA, where sum adds src2 over a window of slices along Z.
+ *
+ * @note Alpha parameter / norm_size should be given as a preprocessor argument using "#define COEFF x"
+ * @note BETA parameter in the normalization equation should be given as a preprocessor argument using "#define BETA x"
+ * @note KAPPA parameter in the normalization equation should be given as a preprocessor argument using "#define KAPPA x"
+ * @note Number of elements on the right or left side to normalize across should be given as a preprocessor argument using "#define RADIUS x"
+ *
+ * @param[in] src1_ptr Pointer to the first source tensor. Supported data types: F32
+ * @param[in] src1_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src1_step_z src1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] src2_ptr Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
+ * @param[in] src2_stride_x Stride of the second source tensor in X dimension (in bytes)
+ * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y Stride of the second source tensor in Y dimension (in bytes)
+ * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_stride_z Stride of the second source tensor in Z dimension (in bytes)
+ * @param[in] src2_step_z src2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the second source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Tensor3D src1 = CONVERT_TO_TENSOR3D_STRUCT(src1);
+    Tensor3D src2 = CONVERT_TO_TENSOR3D_STRUCT(src2);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Number of invocations along Z and this invocation's slice among them.
+    int slice_count = int(gl_NumWorkGroups.z * gl_WorkGroupSize.z);
+    int slice       = int(gl_GlobalInvocationID.z);
+
+    // Window [first, last] of slices, clipped to the dispatched range.
+    int first = max(slice - int(RADIUS), 0);
+    int last  = min(slice + int(RADIUS), slice_count - 1);
+
+    float window_sum = 0.0;
+    for(int dz = first - slice; dz <= last - slice; dz++)
+    {
+        window_sum += src2_ptr[tensor3D_offset(src2, 0, 0, dz)];
+    }
+
+    // Denominator: (KAPPA + COEFF * window_sum) ^ BETA
+    float scale = pow(float(KAPPA) + float(COEFF) * window_sum, float(BETA));
+
+    dst_ptr[dst.current_offset] = src1_ptr[src1.current_offset] / scale;
+}
+
+#elif defined(IN_MAP_1D)
+/** Apply in map normalization: dst = src1 / (KAPPA + COEFF * sum)^BETA, where sum adds src2 over a window of positions along X.
+ *
+ * @note Alpha parameter / norm_size should be given as a preprocessor argument using "#define COEFF x"
+ * @note BETA parameter in the normalization equation should be given as a preprocessor argument using "#define BETA x"
+ * @note KAPPA parameter in the normalization equation should be given as a preprocessor argument using "#define KAPPA x"
+ * @note Number of elements on the right or left side to normalize across should be given as a preprocessor argument using "#define RADIUS x"
+ *
+ * @param[in] src1_ptr Pointer to the first source tensor. Supported data types: F32
+ * @param[in] src1_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src1_step_z src1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] src2_ptr Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
+ * @param[in] src2_stride_x Stride of the second source tensor in X dimension (in bytes)
+ * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y Stride of the second source tensor in Y dimension (in bytes)
+ * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_stride_z Stride of the second source tensor in Z dimension (in bytes)
+ * @param[in] src2_step_z src2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the second source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Tensor3D src1 = CONVERT_TO_TENSOR3D_STRUCT(src1);
+    Tensor3D src2 = CONVERT_TO_TENSOR3D_STRUCT(src2);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Number of invocations along X and this invocation's position among them.
+    int width = int(gl_NumWorkGroups.x * gl_WorkGroupSize.x);
+    int pos   = int(gl_GlobalInvocationID.x);
+
+    // Window [first, last] of positions, clipped to the dispatched range.
+    int first = max(pos - int(RADIUS), 0);
+    int last  = min(pos + int(RADIUS), width - 1);
+
+    float window_sum = 0.0;
+    for(int dx = first - pos; dx <= last - pos; dx++)
+    {
+        window_sum += src2_ptr[tensor3D_offset(src2, dx, 0, 0)];
+    }
+
+    // Denominator: (KAPPA + COEFF * window_sum) ^ BETA
+    float scale = pow(float(KAPPA) + float(COEFF) * window_sum, float(BETA));
+
+    dst_ptr[dst.current_offset] = src1_ptr[src1.current_offset] / scale;
+}
+#endif /*CROSS_MAP*/
diff --git a/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs b/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs
new file mode 100644
index 0000000000..031687af0c
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+// Uniform block holding the 3D tensor parameters of the two inputs and of the output.
+layout(std140) uniform shader_params
+{
+ TENSOR3D_PARAM_DECLARATION(src1);
+ TENSOR3D_PARAM_DECLARATION(src2);
+ TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+// Buffers of float elements: src1 and src2 are read-only, dst is write-only.
+// NOTE(review): the second argument (1, 2, 3) is presumably the buffer binding index - confirm against BUFFER_DECLARATION.
+BUFFER_DECLARATION(src1, 1, float, readonly);
+BUFFER_DECLARATION(src2, 2, float, readonly);
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+
+/** Performs a pixelwise multiplication with float scale of either integer or float inputs.
+ *
+ * @param[in] src1_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] src1_step_z src1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src2_ptr Pointer to the source image. Supported data types: Same as @p src1_ptr
+ * @param[in] src2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] src2_step_z src2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: Same as @p src1_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @note The float scaling factor is not a runtime parameter: it is read from the SCALE preprocessor definition, which must be provided at compile time.
+ */
+void main()
+{
+    // Build the per-invocation views of the two inputs and the output
+    Tensor3D src1 = CONVERT_TO_TENSOR3D_STRUCT(src1);
+    Tensor3D src2 = CONVERT_TO_TENSOR3D_STRUCT(src2);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    float lhs = src1_ptr[src1.current_offset];
+    float rhs = src2_ptr[src2.current_offset];
+
+    // dst = src1 * src2 * SCALE, where SCALE is a compile-time definition
+    dst_ptr[dst.current_offset] = lhs * rhs * float(SCALE);
+}
diff --git a/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
new file mode 100644
index 0000000000..1e0fee4688
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
@@ -0,0 +1,1444 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+#if defined(DATA_TYPE_FP32)
+
+// Forward declarations of the generic pooling helpers defined further down.
+// Arguments, in order: pool_size, src, upper_bound_w, upper_bound_h, pad_x, pad_y, stride_x, stride_y.
+float calculate_max(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
+float calculate_avg(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
+
+// Buffers of float elements: src is read-only, dst is write-only.
+// NOTE(review): the second argument (1, 2) is presumably the buffer binding index - confirm against BUFFER_DECLARATION.
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+
+// Uniform block holding the 3D tensor parameters of the source and of the destination.
+layout(std140) uniform shader_params
+{
+ TENSOR3D_PARAM_DECLARATION(src);
+ TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+// Multi-element load/store helpers built on the single-element LOAD4 / STORE4 accessors.
+// Each macro expands to several statements (the last one left unterminated), so it must be used as a
+// statement of its own inside a braced scope and never as an expression. The offset argument is pasted
+// once per element.
+// NOTE(review): the 8 / 16 suffixes presumably count bytes (2 / 4 floats) - confirm against LOAD4 in helpers.h.
+// LOAD8: r.x and r.y receive the elements at offset and offset + 1.
+#define LOAD8(r, name, offset) \
+ r.x = LOAD4(name, offset); \
+ r.y = LOAD4(name, offset + uint(1))
+
+// LOAD16: r.x .. r.w receive the elements at offset .. offset + 3.
+#define LOAD16(r, name, offset) \
+ r.x = LOAD4(name, offset); \
+ r.y = LOAD4(name, offset + uint(1)); \
+ r.z = LOAD4(name, offset + uint(2)); \
+ r.w = LOAD4(name, offset + uint(3))
+
+// STORE16: writes r.x .. r.w to the elements at offset .. offset + 3.
+#define STORE16(name, offset, r) \
+ STORE4(name, offset, r.x); \
+ STORE4(name, offset + uint(1), r.y); \
+ STORE4(name, offset + uint(2), r.z); \
+ STORE4(name, offset + uint(3), r.w)
+
+// Pooling reduction step, in vec4 (POOL_OP), scalar (POOL_OP_float) and vec2 (POOL_OP_vec2) flavours.
+// Average and L2 pooling: res = a + b.
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(res, a, b) ((res) = (a) + (b))
+#define POOL_OP_float(res, a, b) (res = a + b)
+#define POOL_OP_vec2(res, a, b) ((res) = (a) + (b))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+// Max pooling: res = a, then every component is replaced by the matching component of b when the
+// component of a is NaN or smaller than that of b. These variants expand to several statements and
+// paste a and b more than once, so they must be used as stand-alone statements in a braced scope.
+#define POOL_OP(res, a, b) \
+ (res) = (a); \
+ if(isnan(a.x) || (a.x < b.x)) \
+ { \
+ res.x = b.x; \
+ } \
+ if(isnan(a.y) || (a.y < b.y)) \
+ { \
+ res.y = b.y; \
+ } \
+ if(isnan(a.z) || (a.z < b.z)) \
+ { \
+ res.z = b.z; \
+ } \
+ if(isnan(a.w) || (a.w < b.w)) \
+ { \
+ res.w = b.w; \
+ }
+#define POOL_OP_float(res, a, b) \
+ (res) = (a); \
+ if(isnan(a) || (a < b)) \
+ { \
+ res = b; \
+ }
+#define POOL_OP_vec2(res, a, b) \
+ (res) = (a); \
+ if(isnan(a.x) || (a.x < b.x)) \
+ { \
+ res.x = b.x; \
+ } \
+ if(isnan(a.y) || (a.y < b.y)) \
+ { \
+ res.y = b.y; \
+ }
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+// POW2_OP squares its argument for L2 pooling and is the identity otherwise; vec_size is accepted but not used.
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+// DIV_OP multiplies x by the reciprocal of y. Both arguments are parenthesised so that expression
+// arguments (e.g. a + b) keep their grouping after expansion.
+#define DIV_OP(x, y) ((x) * (1.f / (y)))
+// SQRT_OP is a thin wrapper around the built-in sqrt.
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(POOL_SIZE)
+// Initial accumulator value for the pooling reduction: 0 for the sum-based modes (average / L2),
+// otherwise the most negative finite FP32 value so that any input replaces it in the first max comparison.
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define INITIAL_VALUE 0.0f
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define INITIAL_VALUE -3.402823466385289e+38
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+#endif /* defined(POOL_SIZE) */
+
+// 3x3 pooling with horizontal stride 1 that produces 4 horizontally adjacent outputs in res.
+// For each of the rows y = 0, 1, 2 of input it loads 6 consecutive elements (one vec4 plus one vec2),
+// applies POW2_OP, reduces the three rows with POOL_OP and finally reduces the columns so that
+// component k of res covers columns k, k + 1 and k + 2.
+// The output argument is not referenced. The macro declares locals (data00 .. data21,
+// values000 .. values21), so it can be expanded at most once per scope.
+#define POOLING3x3_STRIDE1(res, input, output) \
+ vec4 data00; \
+ vec2 data01; \
+ vec4 data10; \
+ vec2 data11; \
+ vec4 data20; \
+ vec2 data21; \
+ LOAD16(data00, input, tensor3D_offset(input, 0, 0, 0)); \
+ LOAD8(data01, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \
+ LOAD16(data10, input, tensor3D_offset(input, 0, 1, 0)); \
+ LOAD8(data11, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \
+ LOAD16(data20, input, tensor3D_offset(input, 0, 2, 0)); \
+ LOAD8(data21, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \
+ data00 = POW2_OP(data00, 4); \
+ data01 = POW2_OP(data01, 2); \
+ data10 = POW2_OP(data10, 4); \
+ data11 = POW2_OP(data11, 2); \
+ data20 = POW2_OP(data20, 4); \
+ data21 = POW2_OP(data21, 2); \
+ \
+ vec4 values000; \
+ vec4 values001; \
+ vec4 values010; \
+ vec4 values100; \
+ vec4 values101; \
+ vec4 values11; \
+ vec4 values200; \
+ vec4 values201; \
+ vec4 values21; \
+ values000.xyzw = data00.xyzy; \
+ values001.xyzw = data00.zwzw; \
+ values010.x = data01.x; \
+ values010.y = data00.w; \
+ values010.zw = data01.xy; \
+ values100.xyzw = data10.xyzy; \
+ values101.xyzw = data10.zwzw; \
+ values11.x = data11.x; \
+ values11.y = data10.w; \
+ values11.zw = data11.xy; \
+ values200.xyzw = data20.xyzy; \
+ values201.xyzw = data20.zwzw; \
+ values21.x = data21.x; \
+ values21.y = data20.w; \
+ values21.zw = data21.xy; \
+ POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
+ POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
+ POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+ POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
+
+// 3x3 pooling with horizontal stride 2 that produces 4 horizontally adjacent outputs in res.
+// For each of the rows y = 0, 1, 2 of input it loads 9 consecutive elements (two vec4 plus one float),
+// applies POW2_OP, reduces the three rows with POOL_OP and finally reduces the columns so that
+// component k of res covers columns 2k, 2k + 1 and 2k + 2.
+// The output argument is not referenced. The macro declares locals (data000 .. data21,
+// values000 .. values21), so it can be expanded at most once per scope.
+#define POOLING3x3_STRIDE2(res, input, output) \
+ vec4 data000; \
+ vec4 data001; \
+ float data010; \
+ vec4 data100; \
+ vec4 data101; \
+ float data11; \
+ vec4 data200; \
+ vec4 data201; \
+ float data21; \
+ LOAD16(data000, input, tensor3D_offset(input, 0, 0, 0)); \
+ LOAD16(data001, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \
+ data010 = LOAD4(input, tensor3D_offset(input, 0, 0, 0) + uint(8)); \
+ LOAD16(data100, input, tensor3D_offset(input, 0, 1, 0)); \
+ LOAD16(data101, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \
+ data11 = LOAD4(input, tensor3D_offset(input, 0, 1, 0) + uint(8)); \
+ LOAD16(data200, input, tensor3D_offset(input, 0, 2, 0)); \
+ LOAD16(data201, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \
+ data21 = LOAD4(input, tensor3D_offset(input, 0, 2, 0) + uint(8)); \
+ data000 = POW2_OP(data000, 4); \
+ data001 = POW2_OP(data001, 4); \
+ data010 = POW2_OP(data010, 1); \
+ data100 = POW2_OP(data100, 4); \
+ data101 = POW2_OP(data101, 4); \
+ data11 = POW2_OP(data11, 1); \
+ data200 = POW2_OP(data200, 4); \
+ data201 = POW2_OP(data201, 4); \
+ data21 = POW2_OP(data21, 1); \
+ \
+ vec4 values000; \
+ vec4 values001; \
+ vec4 values010; \
+ vec4 values100; \
+ vec4 values101; \
+ vec4 values11; \
+ vec4 values200; \
+ vec4 values201; \
+ vec4 values21; \
+ values000.xyzw = data000.xyzz; \
+ values001.xyzw = vec4(data000.w, data001.xxy); \
+ values010.xyzw = vec4(data001.zzw, data010); \
+ values100.xyzw = data100.xyzz; \
+ values101.xyzw = vec4(data100.w, data101.xxy); \
+ values11.xyzw = vec4(data101.zzw, data11); \
+ values200.xyzw = data200.xyzz; \
+ values201.xyzw = vec4(data200.w, data201.xxy); \
+ values21.xyzw = vec4(data201.zzw, data21); \
+ POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
+ POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
+ POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+ POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
+
+// 3x3 pooling with horizontal stride 3 that produces 4 horizontally adjacent outputs in res.
+// For each of the rows y = 0, 1, 2 of input it loads 12 consecutive elements (three vec4), applies
+// POW2_OP, reduces the three rows with POOL_OP and finally reduces the columns so that component k
+// of res covers columns 3k, 3k + 1 and 3k + 2.
+// The output argument is not referenced. The macro declares locals (data000 .. data21), so it can be
+// expanded at most once per scope.
+#define POOLING3x3_STRIDE3(res, input, output) \
+    vec4 data000; \
+    vec4 data001; \
+    vec4 data010; \
+    vec4 data100; \
+    vec4 data101; \
+    vec4 data11; \
+    vec4 data200; \
+    vec4 data201; \
+    vec4 data21; \
+    LOAD16(data000, input, tensor3D_offset(input, 0, 0, 0)); \
+    LOAD16(data001, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \
+    LOAD16(data010, input, tensor3D_offset(input, 0, 0, 0) + uint(8)); \
+    LOAD16(data100, input, tensor3D_offset(input, 0, 1, 0)); \
+    LOAD16(data101, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \
+    LOAD16(data11, input, tensor3D_offset(input, 0, 1, 0) + uint(8)); \
+    LOAD16(data200, input, tensor3D_offset(input, 0, 2, 0)); \
+    LOAD16(data201, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \
+    LOAD16(data21, input, tensor3D_offset(input, 0, 2, 0) + uint(8)); \
+    data000 = POW2_OP(data000, 4); \
+    data001 = POW2_OP(data001, 4); \
+    data010 = POW2_OP(data010, 4); \
+    data100 = POW2_OP(data100, 4); \
+    data101 = POW2_OP(data101, 4); \
+    data11 = POW2_OP(data11, 4); \
+    data200 = POW2_OP(data200, 4); \
+    data201 = POW2_OP(data201, 4); \
+    data21 = POW2_OP(data21, 4); \
+    \
+    POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \
+    POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \
+    POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \
+    POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \
+    POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \
+    POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \
+    POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \
+    POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw))
+
+// Reduces the pooling window of the current invocation with POOL_OP_float and returns the result.
+// The window starts at (invocation * stride - pad) and spans at most pool_size elements per axis,
+// clipped to upper_bound_w / upper_bound_h; src offsets are relative to the current element.
+float calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int x0 = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+    int y0 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+    int x1 = int(min(x0 + pool_size, upper_bound_w));
+    int y1 = int(min(y0 + pool_size, upper_bound_h));
+
+    // Seed the reduction with the element at relative offset (0, 0, 0)
+    float best = LOAD4(src, tensor3D_offset(src, 0, 0, 0));
+
+    for(int dx = 0; (x0 + dx) < x1; ++dx)
+    {
+        for(int dy = 0; (y0 + dy) < y1; ++dy)
+        {
+            float candidate = LOAD4(src, tensor3D_offset(src, dx, dy, 0));
+            POOL_OP_float(best, best, candidate);
+        }
+    }
+
+    return best;
+}
+
+// Sums the pooling window of the current invocation and divides by the clipped window area.
+// NaN inputs contribute 0; with POOL_L2 defined every input goes through POW2_OP before it is added.
+// The window starts at (invocation * stride - pad) and spans at most pool_size elements per axis,
+// clipped to upper_bound_w / upper_bound_h; src offsets are relative to the current element.
+float calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int x0 = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+    int y0 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+    int x1 = int(min(x0 + pool_size, upper_bound_w));
+    int y1 = int(min(y0 + pool_size, upper_bound_h));
+
+    float sum = 0.0f;
+    for(int dx = 0; (x0 + dx) < x1; dx++)
+    {
+        for(int dy = 0; (y0 + dy) < y1; ++dy)
+        {
+            float raw    = LOAD4(src, tensor3D_offset(src, dx, dy, 0));
+            float sample = isnan(raw) ? 0.0f : raw;
+#if defined(POOL_L2)
+            // L2 pooling accumulates squared values
+            sample = POW2_OP(sample, 1);
+#endif /* defined(POOL_L2) */
+            sum += sample;
+        }
+    }
+
+    return sum / float((y1 - y0) * (x1 - x0));
+}
+
+#ifdef POOLING_LAYER_2
+/** Performs a pooling function of pool size equal to 2.
+ *
+ * @note Supported data types are F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Reduce the 2x2 window: averaged sum for POOL_AVG / POOL_L2, maximum otherwise
+#if defined(POOL_AVG) || defined(POOL_L2)
+    float pooled = calculate_avg(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else  /* defined(POOL_AVG) || defined(POOL_L2) */
+    float pooled = calculate_max(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // L2 pooling: square root of the averaged squares
+    pooled = SQRT_OP(pooled);
+#endif /* defined(POOL_L2) */
+
+    STORE4(dst, CURRENT_OFFSET(dst), pooled);
+}
+
+#elif defined(POOLING_LAYER_3)
+/** Performs a pooling function of pool size equal to 3.
+ *
+ * @note Supported data types are F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Reduce the 3x3 window: averaged sum for POOL_AVG / POOL_L2, maximum otherwise
+#if defined(POOL_AVG) || defined(POOL_L2)
+    float pooled = calculate_avg(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else  /* defined(POOL_AVG) || defined(POOL_L2) */
+    float pooled = calculate_max(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // L2 pooling: square root of the averaged squares
+    pooled = SQRT_OP(pooled);
+#endif /* defined(POOL_L2) */
+
+    STORE4(dst, CURRENT_OFFSET(dst), pooled);
+}
+
+#elif defined(POOLING_LAYER_3_OPTIMIZED)
+/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less than or equal to 3
+ *
+ * @note Supported data types are F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Each invocation pools 3x3 windows for 4 horizontally adjacent outputs
+    vec4 res;
+#if STRIDE_X == 1
+    POOLING3x3_STRIDE1(res, src, dst);
+#elif STRIDE_X == 2
+    POOLING3x3_STRIDE2(res, src, dst);
+#elif STRIDE_X == 3
+    POOLING3x3_STRIDE3(res, src, dst);
+#endif /* STRIDE_X */
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Average / L2: scale every lane by the reciprocal of its own window area, clipped to MAX_WIDTH / MAX_HEIGHT
+    ivec4 win_x0 = ((ivec4(int(gl_GlobalInvocationID.x) * 4) + ivec4(0, 1, 2, 3)) * ivec4(STRIDE_X)) - ivec4(PAD_X);
+    ivec4 win_x1 = min(win_x0 + ivec4(3), ivec4(MAX_WIDTH));
+    int   win_y0 = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
+    int   win_y1 = min(win_y0 + 3, MAX_HEIGHT);
+    vec4  area   = vec4(ivec4(win_y1 - win_y0) * (win_x1 - win_x0));
+    res *= (vec4(1.f) / area);
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // L2 pooling: square root of the averaged squares
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    STORE16(dst, CURRENT_OFFSET(dst), res);
+}
+
+#elif defined(POOLING_LAYER_7)
+/** Performs a pooling function of pool size equal to 7.
+ *
+ * @note Supported data types are F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Reduce the 7x7 window: averaged sum for POOL_AVG / POOL_L2, maximum otherwise
+#if defined(POOL_AVG) || defined(POOL_L2)
+    float pooled = calculate_avg(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else  /* defined(POOL_AVG) || defined(POOL_L2) */
+    float pooled = calculate_max(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // L2 pooling: square root of the averaged squares
+    pooled = SQRT_OP(pooled);
+#endif /* defined(POOL_L2) */
+
+    STORE4(dst, CURRENT_OFFSET(dst), pooled);
+}
+
+#elif defined(POOLING_LAYER_N)
+/** Performs a pooling function of pool size equal to N
+ *
+ * @note Supported data types are F32;
+ * @note Pool size must be passed using POOL_SIZE e.g. POOL_SIZE=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Two vec4 accumulators serve the 8-wide main loop, one scalar accumulator serves the leftover columns
+    vec4 vdata0;
+    vdata0 = vec4(INITIAL_VALUE);
+    vec4 vdata1;
+    vdata1 = vec4(INITIAL_VALUE);
+    float sdata;
+    sdata = float(INITIAL_VALUE);
+
+    for(int y = 0; y < int(POOL_SIZE); y++)
+    {
+        int x = 0;
+        // Process the row 8 elements at a time
+        for(; x <= (int(POOL_SIZE) - 8); x += 8)
+        {
+            vec4 data2;
+            vec4 data3;
+            LOAD16(data2, src, tensor3D_offset(src, x, y, 0));
+            LOAD16(data3, src, tensor3D_offset(src, x, y, 0) + uint(4));
+
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data2 *= data2;
+            data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+            POOL_OP(vdata0, vdata0, data2);
+            POOL_OP(vdata1, vdata1, data3);
+        }
+
+        // Leftover
+        for(; x < int(POOL_SIZE); ++x)
+        {
+            float data4 = LOAD4(src, tensor3D_offset(src, x, y, 0));
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data4 *= data4;
+#endif /* defined(POOL_L2) */
+            POOL_OP_float(sdata, sdata, data4);
+        }
+    }
+
+    // Reduce result: 8 lanes -> 4 -> 2 -> 1, then fold in the leftover accumulator
+    vec4 reduce4;
+    POOL_OP(reduce4, vdata0.xyzw, vdata1.xyzw);
+    vec2 reduce2;
+    POOL_OP_vec2(reduce2, reduce4.xy, reduce4.zw);
+    float res;
+    POOL_OP_float(res, reduce2.x, reduce2.y);
+    POOL_OP_float(res, res, sdata);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    {
+        // Divide by pool region in case of average pooling.
+        // The window ends POOL_SIZE elements after its own start (clipped to MAX_WIDTH / MAX_HEIGHT),
+        // so the end must be derived from start_x / start_y and not from the stride.
+        int   start_x = int(gl_GlobalInvocationID.x) * STRIDE_X - PAD_X;
+        int   start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
+        int   end_x   = int(min(start_x + POOL_SIZE, MAX_WIDTH));
+        int   end_y   = int(min(start_y + POOL_SIZE, MAX_HEIGHT));
+        float res1    = float((end_y - start_y) * (end_x - start_x));
+        res           = DIV_OP(res, res1);
+    }
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    STORE4(dst, CURRENT_OFFSET(dst), res);
+}
+#endif /* POOLING_LAYER_2 */
+
+#elif defined(DATA_TYPE_FP16)
+
+// Default float precision for the FP16 variant of the shader.
+precision mediump float;
+
+// Forward declarations of the FP16 helpers; each returns a vec2.
+// NOTE(review): the calculate_* arguments presumably follow the FP32 order (pool_size, src, upper_bound_w,
+// upper_bound_h, pad_x, pad_y, stride_x, stride_y) - confirm against the definitions.
+vec2 load_and_unpack(Tensor3D, uint);
+vec2 calculate_max(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
+vec2 calculate_avg(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
+
+// Buffers of uint elements: src is read-only, dst is write-only. STORE4_fp16 writes two half floats
+// per uint through packHalf2x16.
+// NOTE(review): the second argument (1, 2) is presumably the buffer binding index - confirm against BUFFER_DECLARATION.
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(dst, 2, uint, writeonly);
+
+// Uniform block holding the 3D tensor parameters of the source and of the destination.
+layout(std140) uniform shader_params
+{
+ TENSOR3D_PARAM_DECLARATION(src);
+ TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+// FP16 load/store helpers working on packed uint words.
+// LOAD2_fp16: r.xy receives the vec2 returned by load_and_unpack for the word at offset.
+#define LOAD2_fp16(r, name, offset) \
+ r.xy = load_and_unpack(name, offset)
+
+// LOAD4_fp16: r.xy and r.zw receive the vec2 values for the words at offset and offset + 1.
+// Expands to two statements, so it must be used as a stand-alone statement in a braced scope.
+#define LOAD4_fp16(r, name, offset) \
+ r.xy = load_and_unpack(name, offset); \
+ r.zw = load_and_unpack(name, offset + uint(1))
+
+// STORE4_fp16: packs r.xy and r.zw with packHalf2x16 and stores the two words with STORE1 at
+// (offset << 1) and (offset << 1) + 1. It declares the locals datastore1 and datastore2, so it can
+// be expanded at most once per scope.
+#define STORE4_fp16(name, offset, r) \
+ uint datastore1; \
+ uint datastore2; \
+ datastore1 = uint(packHalf2x16(r.xy)); \
+ datastore2 = uint(packHalf2x16(r.zw)); \
+ STORE1(name, offset << uint(1), datastore1); \
+ STORE1(name, (offset << uint(1)) + uint(1), datastore2)
+
+/* Element-wise pooling reduction `res = op(a, b)` for vec4 (POOL_OP), float (POOL_OP_float)
+ * and vec2 (POOL_OP_vec2) operands.
+ *
+ * - POOL_AVG / POOL_L2: op is addition.
+ * - otherwise (max pooling): res is first set to a, then every component is replaced by the
+ *   matching component of b when a's component is NaN or smaller than b's.
+ *
+ * The max variants expand to several statements and read `a` and `b` more than once, so
+ * invoke them as a standalone statement inside a braced block and pass side-effect-free
+ * arguments.
+ */
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(res, a, b) ((res) = (a) + (b))
+#define POOL_OP_float(res, a, b) (res = a + b)
+#define POOL_OP_vec2(res, a, b) ((res) = (a) + (b))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(res, a, b) \
+ (res) = (a); \
+ if(isnan(a.x) || (a.x < b.x)) \
+ { \
+ res.x = b.x; \
+ } \
+ if(isnan(a.y) || (a.y < b.y)) \
+ { \
+ res.y = b.y; \
+ } \
+ if(isnan(a.z) || (a.z < b.z)) \
+ { \
+ res.z = b.z; \
+ } \
+ if(isnan(a.w) || (a.w < b.w)) \
+ { \
+ res.w = b.w; \
+ }
+#define POOL_OP_float(res, a, b) \
+ (res) = (a); \
+ if(isnan(a) || (a < b)) \
+ { \
+ res = b; \
+ }
+#define POOL_OP_vec2(res, a, b) \
+ (res) = (a); \
+ if(isnan(a.x) || (a.x < b.x)) \
+ { \
+ res.x = b.x; \
+ } \
+ if(isnan(a.y) || (a.y < b.y)) \
+ { \
+ res.y = b.y; \
+ }
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+/* POW2_OP squares its argument for L2 pooling and is the identity otherwise.
+ * The vec_size argument is not used by either definition.
+ */
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+/* DIV_OP multiplies x by the reciprocal of y; SQRT_OP wraps sqrt().
+ * DIV_OP does not parenthesize its arguments: pass simple operands only.
+ */
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(POOL_SIZE)
+// Set the initial value of the accumulators according to the pooling operation:
+// 0 for the sum-based modes (average, L2); -65504.0 (the most negative finite
+// half-precision value) for max pooling.
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define INITIAL_VALUE 0.0f
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define INITIAL_VALUE -65504.0f
+#endif //POOL_AVG
+#endif //POOL_SIZE
+
+/* 3x3 pooling with stride_x == 1: computes four horizontally adjacent outputs into `res`.
+ *
+ * Reads six consecutive halves (x0..x5) from each of the three rows y = 0, 1, 2 relative to
+ * the current position of `input`, applies POW2_OP, reduces the three rows element-wise with
+ * POOL_OP and finally combines columns so that res[i] = op(x[i], x[i+1], x[i+2]), i = 0..3.
+ *
+ * `output` is not referenced. The macro declares locals (data00.., values000..) in the
+ * caller's scope, so it can be expanded only once per scope.
+ */
+#define POOLING3x3_STRIDE1_fp16(res, input, output) \
+ vec4 data00; \
+ vec2 data01; \
+ vec4 data10; \
+ vec2 data11; \
+ vec4 data20; \
+ vec2 data21; \
+ LOAD4_fp16(data00, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \
+ LOAD2_fp16(data01, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \
+ LOAD4_fp16(data10, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \
+ LOAD2_fp16(data11, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \
+ LOAD4_fp16(data20, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \
+ LOAD2_fp16(data21, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \
+ data00 = POW2_OP(data00, 4); \
+ data01 = POW2_OP(data01, 2); \
+ data10 = POW2_OP(data10, 4); \
+ data11 = POW2_OP(data11, 2); \
+ data20 = POW2_OP(data20, 4); \
+ data21 = POW2_OP(data21, 2); \
+ \
+ vec4 values000; \
+ vec4 values001; \
+ vec4 values010; \
+ vec4 values100; \
+ vec4 values101; \
+ vec4 values11; \
+ vec4 values200; \
+ vec4 values201; \
+ vec4 values21; \
+ values000.xyzw = data00.xyzy; \
+ values001.xyzw = data00.zwzw; \
+ values010.x = data01.x; \
+ values010.y = data00.w; \
+ values010.zw = data01.xy; \
+ values100.xyzw = data10.xyzy; \
+ values101.xyzw = data10.zwzw; \
+ values11.x = data11.x; \
+ values11.y = data10.w; \
+ values11.zw = data11.xy; \
+ values200.xyzw = data20.xyzy; \
+ values201.xyzw = data20.zwzw; \
+ values21.x = data21.x; \
+ values21.y = data20.w; \
+ values21.zw = data21.xy; \
+ POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
+ POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
+ POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+ POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
+
+/* 3x3 pooling with stride_x == 2: computes four horizontally adjacent outputs into `res`.
+ *
+ * Reads nine consecutive halves (x0..x8) from each of the three rows y = 0, 1, 2 relative to
+ * the current position of `input` (two vec4 loads plus the first half of a fifth word),
+ * applies POW2_OP, reduces the three rows element-wise with POOL_OP and finally combines
+ * columns so that res[i] = op(x[2i], x[2i+1], x[2i+2]), i = 0..3.
+ *
+ * `output` is not referenced. The macro declares locals (data000.., datamiddle0..,
+ * values000..) in the caller's scope, so it can be expanded only once per scope.
+ */
+#define POOLING3x3_STRIDE2_fp16(res, input, output) \
+ vec4 data000; \
+ vec4 data001; \
+ float data010; \
+ vec4 data100; \
+ vec4 data101; \
+ float data11; \
+ vec4 data200; \
+ vec4 data201; \
+ float data21; \
+ vec2 datamiddle0; \
+ vec2 datamiddle1; \
+ vec2 datamiddle2; \
+ LOAD4_fp16(data000, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \
+ LOAD4_fp16(data001, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \
+ datamiddle0 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(4)); \
+ data010 = datamiddle0.x; \
+ LOAD4_fp16(data100, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \
+ LOAD4_fp16(data101, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \
+ datamiddle1 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(4)); \
+ data11 = datamiddle1.x; \
+ LOAD4_fp16(data200, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \
+ LOAD4_fp16(data201, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \
+ datamiddle2 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(4)); \
+ data21 = datamiddle2.x; \
+ data000 = POW2_OP(data000, 4); \
+ data001 = POW2_OP(data001, 4); \
+ data010 = POW2_OP(data010, 1); \
+ data100 = POW2_OP(data100, 4); \
+ data101 = POW2_OP(data101, 4); \
+ data11 = POW2_OP(data11, 1); \
+ data200 = POW2_OP(data200, 4); \
+ data201 = POW2_OP(data201, 4); \
+ data21 = POW2_OP(data21, 1); \
+ \
+ vec4 values000; \
+ vec4 values001; \
+ vec4 values010; \
+ vec4 values100; \
+ vec4 values101; \
+ vec4 values11; \
+ vec4 values200; \
+ vec4 values201; \
+ vec4 values21; \
+ values000.xyzw = data000.xyzz; \
+ values001.xyzw = vec4(data000.w, data001.xxy); \
+ values010.xyzw = vec4(data001.zzw, data010); \
+ values100.xyzw = data100.xyzz; \
+ values101.xyzw = vec4(data100.w, data101.xxy); \
+ values11.xyzw = vec4(data101.zzw, data11); \
+ values200.xyzw = data200.xyzz; \
+ values201.xyzw = vec4(data200.w, data201.xxy); \
+ values21.xyzw = vec4(data201.zzw, data21); \
+ POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
+ POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
+ POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+ POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
+
+/* 3x3 pooling with stride_x == 3: computes four horizontally adjacent outputs into `res`.
+ *
+ * Reads twelve consecutive halves (x0..x11) from each of the three rows y = 0, 1, 2 relative
+ * to the current position of `input`, applies POW2_OP, reduces the three rows element-wise
+ * with POOL_OP and finally combines columns so that res[i] = op(x[3i], x[3i+1], x[3i+2]),
+ * i = 0..3.
+ *
+ * `output` is not referenced. The macro declares locals (data000..) in the caller's scope,
+ * so it can be expanded only once per scope.
+ */
+#define POOLING3x3_STRIDE3_fp16(res, input, output) \
+ vec4 data000; \
+ vec4 data001; \
+ vec4 data010; \
+ vec4 data100; \
+ vec4 data101; \
+ vec4 data11; \
+ vec4 data200; \
+ vec4 data201; \
+ vec4 data21; \
+ LOAD4_fp16(data000, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \
+ LOAD4_fp16(data001, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \
+ LOAD4_fp16(data010, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(4)); \
+ LOAD4_fp16(data100, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \
+ LOAD4_fp16(data101, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \
+ LOAD4_fp16(data11, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(4)); \
+ LOAD4_fp16(data200, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \
+ LOAD4_fp16(data201, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \
+ LOAD4_fp16(data21, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(4)); \
+ data000 = POW2_OP(data000, 4); \
+ data001 = POW2_OP(data001, 4); \
+ data010 = POW2_OP(data010, 4); \
+ data100 = POW2_OP(data100, 4); \
+ data101 = POW2_OP(data101, 4); \
+ data11 = POW2_OP(data11, 4); \
+ data200 = POW2_OP(data200, 4); \
+ data201 = POW2_OP(data201, 4); \
+ data21 = POW2_OP(data21, 4); \
+ \
+ POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \
+ POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \
+ POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \
+ POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \
+ POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \
+ POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \
+ POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \
+ POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw))
+
+/** Loads one 32-bit word from a tensor buffer and expands it into two floats.
+ *
+ * @param[in] src    Tensor whose buffer is read
+ * @param[in] offset Index of the word to load
+ *
+ * @return The two half-precision values held in the word, as returned by unpackHalf2x16
+ */
+vec2 load_and_unpack(Tensor3D src, uint offset)
+{
+    uint word;
+    LOAD1(word, src, offset);
+
+    return vec2(unpackHalf2x16(word));
+}
+
+/** Pools two horizontally adjacent output elements with POOL_OP_float (max pooling when
+ *  neither POOL_AVG nor POOL_L2 is defined).
+ *
+ * The first window starts at the current position of @p src; the second one covers the same
+ * rows, shifted by @p stride_x elements along x. Elements are read two at a time (one 32-bit
+ * word holds two halves), which is why even and odd strides are handled separately.
+ *
+ * @param[in] pool_size     Width and height of the pooling window
+ * @param[in] src           Source tensor
+ * @param[in] upper_bound_w Upper limit used to clamp the window end along x
+ * @param[in] upper_bound_h Upper limit used to clamp the window end along y
+ * @param[in] pad_x         Padding subtracted from the window start along x
+ * @param[in] pad_y         Padding subtracted from the window start along y
+ * @param[in] stride_x      Window step along x
+ * @param[in] stride_y      Window step along y
+ *
+ * @return Result of the first window in .x and of the second window in .y. The second result
+ *         stays 0 when the first window already reaches @p upper_bound_w.
+ */
+vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    // First window, clamped to the upper bounds
+    int start_x1 = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+    int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+    int end_x1   = int(min(start_x1 + pool_size, upper_bound_w));
+    int end_y1   = int(min(start_y1 + pool_size, upper_bound_h));
+
+    // Second window: same rows, shifted by stride_x along x
+    int start_x2 = start_x1 + stride_x;
+    int end_x2   = int(min(start_x2 + pool_size, upper_bound_w));
+
+    bool has_second_window = end_x1 < upper_bound_w;
+    bool stride_is_even    = (stride_x % 2) == 0;
+
+    //Initialize maximum
+    vec2 data_max = vec2(0);
+
+    //Seed maximum1 with the first element of the first window
+    vec2 data_init1 = load_and_unpack(src, tensor3D_offset_fp16(src, 0, 0, 0) >> uint(2));
+    data_max.x      = data_init1.x;
+
+    //Seed maximum2 with the first element of the second window
+    if(has_second_window)
+    {
+        if(stride_is_even)
+        {
+            vec2 data_init2 = load_and_unpack(src, tensor3D_offset_fp16(src, stride_x, 0, 0) >> uint(2));
+            data_max.y      = data_init2.x;
+        }
+        else
+        {
+            // Odd stride: the element is the second half of the word loaded for stride_x - 1
+            vec2 data_init2 = load_and_unpack(src, tensor3D_offset_fp16(src, stride_x - 1, 0, 0) >> uint(2));
+            data_max.y      = data_init2.y;
+        }
+    }
+
+    for(int i = 0; (start_y1 + i) < end_y1; i++)
+    {
+        for(int j = 0; (start_x1 + j) < end_x1; j = j + 2)
+        {
+            //Calculate maximum1
+            vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2));
+            if((start_x1 + j + 1) < end_x1)
+            {
+                // Both halves of the word are inside the window
+                float data_mr1;
+                POOL_OP_float(data_mr1, data1.x, data1.y);
+                POOL_OP_float(data_max.x, data_max.x, data_mr1);
+            }
+            else
+            {
+                // Only the first half is inside the window
+                POOL_OP_float(data_max.x, data_max.x, data1.x);
+            }
+
+            //Calculate maximum2
+            if((start_x2 + j) < end_x2 && has_second_window)
+            {
+                bool pair_inside = (start_x2 + j + 1) < end_x2;
+
+                if(stride_is_even)
+                {
+                    vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x), i, 0) >> uint(2)));
+
+                    if(pair_inside)
+                    {
+                        float data_mr2;
+                        POOL_OP_float(data_mr2, data2.x, data2.y);
+                        POOL_OP_float(data_max.y, data_max.y, data_mr2);
+                    }
+                    else
+                    {
+                        POOL_OP_float(data_max.y, data_max.y, data2.x);
+                    }
+                }
+                else
+                {
+                    // Odd stride: the pair straddles two words (second half of one, first half of the next)
+                    vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x - 1), i, 0) >> uint(2)));
+
+                    if(pair_inside)
+                    {
+                        vec2  data3 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2)));
+                        float data_mr2;
+                        POOL_OP_float(data_mr2, data3.x, data2.y);
+                        POOL_OP_float(data_max.y, data_max.y, data_mr2);
+                    }
+                    else
+                    {
+                        POOL_OP_float(data_max.y, data_max.y, data2.y);
+                    }
+                }
+            }
+        }
+    }
+    return data_max;
+}
+
+/** Computes the average (or, with POOL_L2, the average of the squares) of two horizontally
+ *  adjacent pooling windows.
+ *
+ * The first window starts at the current position of @p src; the second one covers the same
+ * rows, shifted by @p stride_x elements along x. Elements are read two at a time (one 32-bit
+ * word holds two halves), which is why even and odd strides are handled separately.
+ *
+ * @param[in] pool_size     Width and height of the pooling window
+ * @param[in] src           Source tensor
+ * @param[in] upper_bound_w Upper limit used to clamp the window end along x
+ * @param[in] upper_bound_h Upper limit used to clamp the window end along y
+ * @param[in] pad_x         Padding subtracted from the window start along x
+ * @param[in] pad_y         Padding subtracted from the window start along y
+ * @param[in] stride_x      Window step along x
+ * @param[in] stride_y      Window step along y
+ *
+ * @return Sum of each window divided by its clamped area: first window in .x, second in .y
+ */
+vec2 calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ // First window, clamped to the upper bounds.
+ // NOTE(review): every invocation yields two outputs, yet the x index is not scaled by 2
+ // (main of POOLING_LAYER_3_OPTIMIZED scales its index by 4) - confirm against the host-side window setup
+ int start_x1 = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+ int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+ int end_x1 = int(min(start_x1 + pool_size, upper_bound_w));
+ int end_y1 = int(min(start_y1 + pool_size, upper_bound_h));
+
+ // Second window: same rows, shifted by stride_x along x
+ int start_x2 = start_x1 + stride_x;
+ int start_y2 = start_y1;
+ int end_x2 = int(min(start_x2 + pool_size, upper_bound_w));
+ int end_y2 = int(min(start_y2 + pool_size, upper_bound_h));
+
+ //Initialize sum
+ float data_total1 = float(0);
+ float data_total2 = float(0);
+ for(int i = 0; (start_y1 + i) < end_y1; i++)
+ for(int j = 0; (start_x1 + j) < end_x1; j = j + 2)
+ {
+ vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data1 = POW2_OP(data1, 2);
+#endif /* defined(POOL_L2) */
+ //Calculate sum1: add both halves, or only the first one when the second lies outside the window
+ if((start_x1 + j + 1) < end_x1)
+ {
+ data_total1 = data_total1 + data1.x + data1.y;
+ }
+ else
+ {
+ data_total1 = data_total1 + data1.x;
+ }
+
+ //Calculate sum2
+ if((start_x2 + j) < end_x2 && end_x1 < upper_bound_w)
+ {
+ if((stride_x % 2) == 0)
+ {
+ // NOTE(review): calculate_max reads the even-stride pair at (j + stride_x), without the "+ 1".
+ // Both presumably resolve to the same word once the byte offset is shifted right by 2 - confirm.
+ vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2)));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data2 = POW2_OP(data2, 2);
+#endif /* defined(POOL_L2) */
+ if((start_x2 + j + 1) < end_x2)
+ {
+ data_total2 = data_total2 + data2.x + data2.y;
+ }
+ else
+ {
+ data_total2 = data_total2 + data2.x;
+ }
+ }
+ else
+ {
+ // Odd stride: the pair straddles two words (second half of data2, first half of data3)
+ vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x - 1), i, 0) >> uint(2)));
+ vec2 data3 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2)));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data2 = POW2_OP(data2, 2);
+ data3 = POW2_OP(data3, 2);
+#endif /* defined(POOL_L2) */
+ if((start_x2 + j + 1) < end_x2)
+ {
+ data_total2 = data_total2 + data3.x + data2.y;
+ }
+ else
+ {
+ data_total2 = data_total2 + data2.y;
+ }
+ }
+ }
+ }
+ //Calculate average: divide each sum by the clamped area of its window
+ vec2 data_avg;
+ data_avg.x = data_total1 / float((end_y1 - start_y1) * (end_x1 - start_x1));
+ data_avg.y = data_total2 / float((end_y2 - start_y2) * (end_x2 - start_x2));
+
+ return data_avg;
+}
+
+#ifdef POOLING_LAYER_2
+/** Performs a pooling function of pool size equal to 2.
+ *
+ * @note Supported data types are F16;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Build the tensor accessors for this invocation
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+    // Pool two neighbouring output elements with a 2x2 window
+#if defined(POOL_AVG) || defined(POOL_L2)
+    vec2 pooled = calculate_avg(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else  /* max pooling */
+    vec2 pooled = calculate_max(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // L2 pooling: square root of the averaged squares
+    pooled = SQRT_OP(pooled);
+#endif /* defined(POOL_L2) */
+
+    // Pack both results into one 32-bit word and write it out
+    uint packed_out = uint(packHalf2x16(pooled));
+    STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), packed_out);
+}
+
+#elif defined(POOLING_LAYER_3)
+/** Performs a pooling function of pool size equal to 3.
+ *
+ * @note Supported data types are F16;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Build the tensor accessors for this invocation
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+    // Pool two neighbouring output elements with a 3x3 window
+#if defined(POOL_AVG) || defined(POOL_L2)
+    vec2 pooled = calculate_avg(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else  /* max pooling */
+    vec2 pooled = calculate_max(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // L2 pooling: square root of the averaged squares
+    pooled = SQRT_OP(pooled);
+#endif /* defined(POOL_L2) */
+
+    // Pack both results into one 32-bit word and write it out
+    uint packed_out = uint(packHalf2x16(pooled));
+    STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), packed_out);
+}
+
+#elif defined(POOLING_LAYER_3_OPTIMIZED)
+/** Performs an optimized pooling function of pool size equal to 3 when stride_x is less than or equal to 3
+ *
+ * @note Supported data types are F16;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Build the tensor accessors for this invocation
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+    // 3x3 pooling of four neighbouring output elements, specialised on the x stride
+    vec4 res;
+#if STRIDE_X == 1
+    POOLING3x3_STRIDE1_fp16(res, src, dst);
+#elif STRIDE_X == 2
+    POOLING3x3_STRIDE2_fp16(res, src, dst);
+#elif STRIDE_X == 3
+    POOLING3x3_STRIDE3_fp16(res, src, dst);
+#endif /*STRIDE_X == 1*/
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Sum-based modes: scale every output by the reciprocal of its clamped window area
+    int   win_start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
+    int   win_end_y   = min((win_start_y + 3), MAX_HEIGHT);
+    ivec4 out_index   = ivec4(int(gl_GlobalInvocationID.x) * 4) + ivec4(0, 1, 2, 3);
+    ivec4 win_start_x = (out_index * (ivec4(STRIDE_X))) - (ivec4(PAD_X));
+    ivec4 win_end_x   = min((win_start_x + (ivec4(3))), (ivec4(MAX_WIDTH)));
+    vec4  win_area    = vec4((ivec4(win_end_y - win_start_y)) * (win_end_x - win_start_x));
+    res *= (vec4((1.f)) / win_area);
+#endif /*POOL_AVG*/
+
+#if defined(POOL_L2)
+    // L2 pooling: square root of the averaged squares
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    // Pack the four results into two 32-bit words and write them out
+    STORE4_fp16(dst, CURRENT_OFFSET(dst) >> uint(3), res);
+}
+
+#elif defined(POOLING_LAYER_7)
+/** Performs a pooling function of pool size equal to 7.
+ *
+ * @note Supported data types are F16;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Build the tensor accessors for this invocation
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+    // Pool two neighbouring output elements with a 7x7 window
+#if defined(POOL_AVG) || defined(POOL_L2)
+    vec2 pooled = calculate_avg(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else  /* max pooling */
+    vec2 pooled = calculate_max(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // L2 pooling: square root of the averaged squares
+    pooled = SQRT_OP(pooled);
+#endif /* defined(POOL_L2) */
+
+    // Pack both results into one 32-bit word and write it out
+    uint packed_out = uint(packHalf2x16(pooled));
+    STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), packed_out);
+}
+
+#elif defined(POOLING_LAYER_N)
+/** Performs a pooling function of pool size equal to N
+ *
+ * @note Supported data types are F16;
+ * @note Pool size must be passed using POOL_SIZE e.g. POOL_SIZE=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+    // Every invocation produces two outputs. vdata00/vdata10/sdata.x accumulate the first
+    // window, vdata01/vdata11/sdata.y the second one, which covers the same rows shifted by
+    // STRIDE_X elements along x.
+    vec4 vdata00 = vec4(INITIAL_VALUE);
+    vec4 vdata01 = vec4(INITIAL_VALUE);
+    vec4 vdata10 = vec4(INITIAL_VALUE);
+    vec4 vdata11 = vec4(INITIAL_VALUE);
+    vec2 sdata   = vec2(INITIAL_VALUE);
+
+    // First window: columns [0, POOL_SIZE)
+    for(int y = 0; y < int(POOL_SIZE); y++)
+    {
+        // Eight elements per iteration
+        int x = 0;
+        for(; x <= (int(POOL_SIZE) - 8); x += 8)
+        {
+            vec4 data2;
+            vec4 data3;
+            LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)));
+            LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)) + uint(2));
+
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data2 *= data2;
+            data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+            POOL_OP(vdata00, vdata00, data2);
+            POOL_OP(vdata10, vdata10, data3);
+        }
+
+        // Leftover: two elements per iteration, only the first one when the pair crosses the window end
+        for(; x < int(POOL_SIZE); x = x + 2)
+        {
+            vec2 data4middle;
+            data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)));
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data4middle *= data4middle;
+#endif /* defined(POOL_L2) */
+            if((x + 1) >= int(POOL_SIZE))
+            {
+                POOL_OP_float(sdata.x, sdata.x, data4middle.x);
+            }
+            else
+            {
+                float data4;
+                POOL_OP_float(data4, data4middle.x, data4middle.y);
+                POOL_OP_float(sdata.x, sdata.x, data4);
+            }
+        }
+    }
+
+    // Second window: columns [STRIDE_X, POOL_SIZE + STRIDE_X). It lies on the same rows as the
+    // first window (the divisor below uses start_y2 == start_y1), so y runs over [0, POOL_SIZE)
+    // and must not be shifted by STRIDE_X.
+    // NOTE(review): words are addressed in pairs of halves, so an odd STRIDE_X presumably
+    // misaligns the loads below - confirm this kernel is only dispatched with an even STRIDE_X.
+    for(int y = 0; y < int(POOL_SIZE); y++)
+    {
+        // Eight elements per iteration
+        int x1 = STRIDE_X;
+        for(; x1 <= (int(POOL_SIZE + STRIDE_X) - 8); x1 += 8)
+        {
+            vec4 data2;
+            vec4 data3;
+            LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)));
+            LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)) + uint(2));
+
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data2 *= data2;
+            data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+            POOL_OP(vdata01, vdata01, data2);
+            POOL_OP(vdata11, vdata11, data3);
+        }
+
+        // Leftover: two elements per iteration, only the first one when the pair crosses the window end
+        for(; x1 < int(POOL_SIZE + STRIDE_X); x1 = x1 + 2)
+        {
+            vec2 data4middle;
+            data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)));
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data4middle *= data4middle;
+#endif /* defined(POOL_L2) */
+            if((x1 + 1) >= int(POOL_SIZE + STRIDE_X))
+            {
+                POOL_OP_float(sdata.y, sdata.y, data4middle.x);
+            }
+            else
+            {
+                float data4;
+                POOL_OP_float(data4, data4middle.x, data4middle.y);
+                POOL_OP_float(sdata.y, sdata.y, data4);
+            }
+        }
+    }
+
+    //Reduce result: fold the vector accumulators of each window down to one scalar
+    vec4 reduce40;
+    POOL_OP(reduce40, vdata00.xyzw, vdata10.xyzw);
+    vec2 reduce20;
+    POOL_OP_vec2(reduce20, reduce40.xy, reduce40.zw);
+    vec4 reduce41;
+    POOL_OP(reduce41, vdata01.xyzw, vdata11.xyzw);
+    vec2 reduce21;
+    POOL_OP_vec2(reduce21, reduce41.xy, reduce41.zw);
+    vec2 data;
+    POOL_OP_float(data.x, reduce20.x, reduce20.y);
+    POOL_OP_float(data.x, data.x, sdata.x);
+    POOL_OP_float(data.y, reduce21.x, reduce21.y);
+    POOL_OP_float(data.y, data.y, sdata.y);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    {
+        // Divide by pool region in case of average pooling
+        int start_x1 = int(gl_GlobalInvocationID.x) * STRIDE_X - PAD_X;
+        int start_y1 = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
+        int end_x1   = int(min(start_x1 + POOL_SIZE, MAX_WIDTH));
+        int end_y1   = int(min(start_y1 + POOL_SIZE, MAX_HEIGHT));
+        int start_x2 = start_x1 + STRIDE_X;
+        int start_y2 = start_y1;
+        int end_x2   = int(min(start_x2 + POOL_SIZE, MAX_WIDTH));
+        int end_y2   = int(min(start_y2 + POOL_SIZE, MAX_HEIGHT));
+        vec2 res1;
+        res1.x = float((end_y1 - start_y1) * (end_x1 - start_x1));
+        res1.y = float((end_y2 - start_y2) * (end_x2 - start_x2));
+        data.x = DIV_OP(data.x, res1.x);
+        data.y = DIV_OP(data.y, res1.y);
+    }
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    data = SQRT_OP(data);
+#endif /* defined(POOL_L2) */
+
+    // Pack both results into one 32-bit word
+    uint res;
+    res = uint(packHalf2x16(data));
+
+    // Store result
+    STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
+}
+#endif /*POOLING_LAYER_2*/
+#endif /*DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
new file mode 100644
index 0000000000..0bbabeaafc
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
@@ -0,0 +1,541 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+#define MAX_OP(x, y) max((x), (y)) // maximum via the built-in max(); component-wise on vectors
+#define ADD_OP(x, y) ((x) + (y)) // addition; component-wise on vectors
+#define SUB_OP(x, y) ((x) - (y)) // subtraction; component-wise on vectors
+#define DIV_OP(x, y) ((x) / (y)) // division; component-wise on vectors
+#define EXP_OP(x) exp((x)) // natural exponential via the built-in exp()
+
+#if defined(DATA_TYPE_FP32)
+const float MINVAL = -1.0 / 0.0; // intended as -infinity; NOTE(review): relies on division by zero yielding -Inf, confirm on target compilers
+vec4 type_min = CONVERT(MINVAL, vec4); // presumably MINVAL broadcast to all four lanes; used to seed the max reduction
+
+#define LOAD16(name, offset) \
+ vec4(LOAD4(name, offset), \
+ LOAD4(name, offset + uint(1)), \
+ LOAD4(name, offset + uint(2)), \
+ LOAD4(name, offset + uint(3))) // expression macro: four LOAD4 reads at offset .. offset + 3 gathered into a vec4
+
+#define STORE16(name, offset, value) \
+ STORE4(name, offset, value.x); \
+ STORE4(name, offset + uint(1), value.y); \
+ STORE4(name, offset + uint(2), value.z); \
+ STORE4(name, offset + uint(3), value.w) // NOTE: expands to four statements and evaluates its arguments four times; use only inside a braced block
+
+#ifdef SOFTMAX_LAYER_MAX
+BUFFER_DECLARATION(src, 1, float, readonly); // presumably (name, binding index, element type, access qualifier); confirm in helpers.h
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(max, 2, float, readonly); // per-row maxima, read by the shift/exp/sum kernel
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+BUFFER_DECLARATION(sum, 4, float, writeonly); // per-row sums, written by the shift/exp/sum kernel
+#elif defined(SOFTMAX_LAYER_NORM)
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(sum, 2, float, readonly); // per-row sums, read by the normalisation kernel
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+#endif // SOFTMAX_LAYER_MAX
+
+layout(std140) uniform shader_params // members depend on which SOFTMAX_LAYER_* kernel is compiled
+{
+#ifdef SOFTMAX_LAYER_MAX
+ TENSOR3D_PARAM_DECLARATION(src);
+ TENSOR3D_PARAM_DECLARATION(dst);
+ uint width; // row length in elements; loop bound in main()
+#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
+ TENSOR3D_PARAM_DECLARATION(src);
+ TENSOR3D_PARAM_DECLARATION(max);
+ TENSOR3D_PARAM_DECLARATION(dst);
+ TENSOR3D_PARAM_DECLARATION(sum);
+ uint width; // row length in elements; loop bound in main()
+#elif defined(SOFTMAX_LAYER_NORM)
+ TENSOR3D_PARAM_DECLARATION(src);
+ TENSOR3D_PARAM_DECLARATION(sum);
+ TENSOR3D_PARAM_DECLARATION(dst);
+#endif // SOFTMAX_LAYER_MAX
+};
+
+#ifdef SOFTMAX_LAYER_MAX
+/** Identifies the maximum value across the 1st dimension.
+ *
+ * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP32"
+ *
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] width Input image width
+ */
+void main(void) // FP32 SOFTMAX_LAYER_MAX: reduces one row of src to its maximum and stores it in dst
+{
+ Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+
+ // Seed the four-lane running maximum from type_min (built from MINVAL = -1.0 / 0.0)
+ vec4 max_val = CONVERT(type_min, vec4);
+
+ // Lane-wise max over the row, four elements per iteration
+ uint width2 = width >> 2; // number of complete groups of four
+ for(int i = 0; i < int(width2); i++)
+ {
+ vec4 data = LOAD16(src, offset(src, i << 2, 0));
+ max_val = MAX_OP(data, max_val);
+ }
+
+#ifdef NON_MULTIPLE_OF_4
+ // Fold the remaining (width % 4) elements into lane x, one at a time
+ for(int i = int(width2 << 2); i < int(width); i++)
+ {
+ float data = LOAD4(src, offset(src, i, 0));
+ max_val.x = MAX_OP(data, max_val.x);
+ }
+#endif /* NON_MULTIPLE_OF_4 */
+
+ // Horizontal max reduction: collapse the four lanes into max_val.x
+ max_val.xy = MAX_OP(max_val.xy, max_val.zw);
+ max_val.x = MAX_OP(max_val.x, max_val.y);
+
+ // Store the row maximum at dst's current offset
+ STORE4(dst, CURRENT_OFFSET(dst), max_val.x);
+}
+#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM) // SOFTMAX_LAYER_MAX
+/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
+ * then computes the exponential of each element and sums all elements across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP32"
+ *
+ * @note In case the input width is not a multiple of 4, NON_MULTIPLE_OF_4 must be defined as a preprocessor argument.
+ *
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes)
+ * @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] max_stride_z Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in] max_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[in] width Input image width
+ */
+void main(void) // FP32 SOFTMAX_LAYER_SHIFT_EXP_SUM: writes exp(src - max) to dst and the row sum of those values to sum
+{
+ Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ Image max = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max); // NOTE(review): this local appears to hide the built-in max() in this scope; MAX_OP is not used in this kernel
+ Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+ // Load the max value of the 1D logits vector (row) into a vec4
+ vec4 max_val = CONVERT(LOAD4(max, CURRENT_OFFSET(max)), vec4);
+
+ // Zero the four-lane running sum
+ vec4 sum1D = CONVERT(0, vec4);
+
+ // Per group of four: subtract the max, exponentiate, store to dst, accumulate lane-wise
+ uint width2 = width >> 2; // number of complete groups of four
+ for(int i = 0; i < int(width2); i++)
+ {
+ vec4 data = LOAD16(src, offset(src, i << 2, 0));
+ data = SUB_OP(data, max_val);
+ data = EXP_OP(data);
+ STORE16(dst, offset(dst, i << 2, 0), data);
+ sum1D = ADD_OP(sum1D, data);
+ }
+
+#ifdef NON_MULTIPLE_OF_4
+ // Same shift/exp/store for the remaining (width % 4) elements, accumulated into lane x
+ for(int i = int(width2 << 2); i < int(width); i++)
+ {
+ float data;
+ data = LOAD4(src, offset(src, i, 0));
+ data = SUB_OP(data, max_val.x);
+ data = EXP_OP(data);
+ STORE4(dst, offset(dst, i, 0), data);
+ sum1D.x = ADD_OP(sum1D.x, data);
+ }
+#endif /* NON_MULTIPLE_OF_4 */
+
+ // Horizontal sum reduction: collapse the four lanes into sum1D.x
+ sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw);
+ sum1D.x = ADD_OP(sum1D.x, sum1D.y);
+
+ // Store the row's sum of exponentials at sum's current offset
+ STORE4(sum, CURRENT_OFFSET(sum), sum1D.x);
+}
+#elif defined(SOFTMAX_LAYER_NORM) // SOFTMAX_LAYER_MAX
+/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
+ *
+ * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP32"
+ *
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+void main(void) // FP32 SOFTMAX_LAYER_NORM: divides four consecutive src values by the row's sum and stores them in dst
+{
+ Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum); // NO_STEP variant: the row is selected explicitly below
+
+ // Load the sum for this row (selected by the invocation's y coordinate) into a vec4
+ vec4 sum_val = CONVERT(LOAD4(sum, offset(sum, 0, int(gl_GlobalInvocationID.y))), vec4);
+ vec4 data = LOAD16(src, CURRENT_OFFSET(src)); // four consecutive source values
+ STORE16(dst, CURRENT_OFFSET(dst), DIV_OP(data, sum_val)); // normalised values; the DIV_OP argument is expanded once per stored component
+}
+#endif // SOFTMAX_LAYER_MAX
+
+#elif defined(DATA_TYPE_FP16)
+precision mediump float; // default float precision for the FP16 kernels
+
+const float MINVAL1 = -1.0 / 0.0; // intended as -infinity; NOTE(review): relies on division by zero yielding -Inf, confirm on target compilers
+vec4 type_min1 = CONVERT(MINVAL1, vec4); // presumably MINVAL1 broadcast to all four lanes; used to seed the max reduction
+
+#define GC_LOAD4_IMAGE(r, name, x, y) \
+ load_and_unpack(r.xy, name, x, y); \
+ load_and_unpack(r.zw, name, (x + 2), y) // two packed loads, at x and x + 2, fill r.xy and r.zw; expands to two statements
+
+#define GC_STORE4_IMAGE(r, name, x, y) \
+ GC_STORE1_2D_OFFSET(uint(packHalf2x16(r.xy)), name, x, y); \
+ GC_STORE1_2D_OFFSET(uint(packHalf2x16(r.zw)), name, (x + 2), y) // packs r.xy and r.zw into one uint each and stores them at x and x + 2; expands to two statements
+
+#ifdef SOFTMAX_LAYER_MAX
+BUFFER_DECLARATION(src, 1, uint, readonly); // FP16 data is accessed as uint: each word holds two packed halves (see load_and_unpack)
+BUFFER_DECLARATION(dst, 2, uint, writeonly);
+#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(max, 2, uint, readonly); // per-row maxima, read by the shift/exp/sum kernel
+BUFFER_DECLARATION(dst, 3, uint, writeonly);
+BUFFER_DECLARATION(sum, 4, uint, writeonly); // per-row sums, written by the shift/exp/sum kernel
+#elif defined(SOFTMAX_LAYER_NORM)
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(sum, 2, uint, readonly); // per-row sums, read by the normalisation kernel
+BUFFER_DECLARATION(dst, 3, uint, writeonly);
+#endif // SOFTMAX_LAYER_MAX
+
+layout(std140) uniform shader_params // members depend on which SOFTMAX_LAYER_* kernel is compiled
+{
+#ifdef SOFTMAX_LAYER_MAX
+ TENSOR3D_PARAM_DECLARATION(src);
+ TENSOR3D_PARAM_DECLARATION(dst);
+ uint width; // row length in elements; loop bound in main()
+#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
+ TENSOR3D_PARAM_DECLARATION(src);
+ TENSOR3D_PARAM_DECLARATION(max);
+ TENSOR3D_PARAM_DECLARATION(dst);
+ TENSOR3D_PARAM_DECLARATION(sum);
+ uint width; // row length in elements; loop bound in main()
+#elif defined(SOFTMAX_LAYER_NORM)
+ TENSOR3D_PARAM_DECLARATION(src);
+ TENSOR3D_PARAM_DECLARATION(sum);
+ TENSOR3D_PARAM_DECLARATION(dst);
+#endif // SOFTMAX_LAYER_MAX
+};
+
+#define load_and_unpack(rs, names, xs, ys) \
+ do \
+ { \
+ uint packed_s; \
+ GC_LOAD1_2D_OFFSET(packed_s, names, xs, ys); \
+ rs = vec2(unpackHalf2x16(packed_s)); \
+ } while(false) // loads one uint at (xs, ys) and unpacks its two halves into the vec2 rs; do/while(false) keeps the macro a single statement
+
+#ifdef SOFTMAX_LAYER_MAX
+/** Identifies the maximum value across the 1st dimension.
+ *
+ * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP16"
+ *
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] width Input image width
+ */
+void main(void) // FP16 SOFTMAX_LAYER_MAX: reduces one row of packed half values to its maximum and stores it in dst
+{
+ Image src = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+ Image dst = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+
+ // Seed the four-lane running maximum from type_min1 (built from MINVAL1 = -1.0 / 0.0)
+ vec4 max_val1 = CONVERT(type_min1, vec4);
+
+ // Lane-wise max over the row, four values (two packed words) per iteration
+ uint width2 = width >> 2; // number of complete groups of four
+ for(int i = 0; i < int(width2); i++)
+ {
+ vec4 data1;
+ GC_LOAD4_IMAGE(data1, src, (i << 2), 0);
+ max_val1 = MAX_OP(data1, max_val1);
+ }
+
+#ifdef NON_MULTIPLE_OF_4
+ // Remaining (width % 4) elements, visited two at a time because each load yields two halves
+ for(int i = int(width2 << 2); i < int(width); i = i + 2)
+ {
+ vec2 data;
+ load_and_unpack(data, src, i, 0);
+ max_val1.x = MAX_OP(data.x, max_val1.x);
+ if((i + 1) < int(width)) // the second half counts only if it lies inside the row
+ {
+ max_val1.x = MAX_OP(data.y, max_val1.x);
+ }
+ }
+#endif /* NON_MULTIPLE_OF_4 */
+
+ // Horizontal max reduction into max_val1.x, then pack (max, 0) into one uint
+ max_val1.xy = MAX_OP(max_val1.xy, max_val1.zw);
+ max_val1.x = MAX_OP(max_val1.x, max_val1.y);
+ vec2 res1 = vec2(max_val1.x, 0.f);
+ uint res;
+ res = uint(packHalf2x16(res1));
+
+ // Store the packed result at (0, 0) of the dst image struct
+ GC_STORE1_2D_OFFSET(res, dst, 0, 0);
+}
+#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM) // SOFTMAX_LAYER_MAX
+/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
+ * then computes the exponential of each element and sums all elements across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP16"
+ *
+ * @note In case the input width is not a multiple of 4, NON_MULTIPLE_OF_4 must be defined as a preprocessor argument.
+ *
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes)
+ * @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] max_stride_z Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in] max_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[in] width Input image width
+ */
+void main(void) // FP16 SOFTMAX_LAYER_SHIFT_EXP_SUM: writes exp(src - max) to dst and the row sum of those values to sum
+{
+ Image src = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+ Image dst = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ Image max = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max); // NOTE(review): this local appears to hide the built-in max() in this scope; MAX_OP is not used in this kernel
+ Image sum = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+ // Load the max value of the 1D logits vector (row); only the first unpacked half is used
+ vec2 datamaxinit;
+ load_and_unpack(datamaxinit, max, 0, 0);
+ vec4 max_val = CONVERT(datamaxinit.x, vec4);
+
+ // Zero the four-lane running sum
+ vec4 sum1D1 = CONVERT(0.f, vec4);
+
+ // Per group of four: subtract the max, exponentiate, store to dst, accumulate lane-wise
+ uint width2 = width >> 2; // number of complete groups of four
+ for(int i = 0; i < int(width2); i++)
+ {
+ vec4 data;
+ GC_LOAD4_IMAGE(data, src, (i << 2), 0);
+ data = SUB_OP(data, max_val);
+ data = EXP_OP(data);
+ GC_STORE4_IMAGE(data, dst, (i << 2), 0);
+ sum1D1 = ADD_OP(sum1D1, data);
+ }
+
+#ifdef NON_MULTIPLE_OF_4
+ // Remaining (width % 4) elements, visited two at a time because each load yields two halves
+ for(int i = int(width2 << 2); i < int(width); i = i + 2)
+ {
+ vec2 datamiddle;
+ float data1;
+ load_and_unpack(datamiddle, src, i, 0);
+ data1 = SUB_OP(datamiddle.x, max_val.x);
+ data1 = EXP_OP(data1);
+ vec2 datares1;
+ if((i + 1) < int(width)) // the second half counts only if it lies inside the row
+ {
+ float data2;
+ data2 = SUB_OP(datamiddle.y, max_val.x);
+ data2 = EXP_OP(data2);
+ datares1 = vec2(data1, data2);
+ data1 = ADD_OP(data2, data1); // data1 now holds this pair's contribution to the sum
+ }
+ else
+ {
+ datares1 = vec2(data1, 0.f); // odd tail: the second half of the stored word is written as 0
+ }
+ uint datares;
+ datares = uint(packHalf2x16(datares1));
+ GC_STORE1_2D_OFFSET(datares, dst, i, 0);
+ sum1D1.x = ADD_OP(sum1D1.x, data1);
+ }
+#endif /* NON_MULTIPLE_OF_4 */
+
+ // Horizontal sum reduction into sum1D1.x, then pack (sum, 0) into one uint
+ sum1D1.xy = ADD_OP(sum1D1.xy, sum1D1.zw);
+ sum1D1.x = ADD_OP(sum1D1.x, sum1D1.y);
+ vec2 res1 = vec2(sum1D1.x, 0.f);
+ uint res;
+ res = uint(packHalf2x16(res1));
+ // Store the packed row sum at (0, 0) of the sum image struct
+ GC_STORE1_2D_OFFSET(res, sum, 0, 0);
+}
+#elif defined(SOFTMAX_LAYER_NORM) // SOFTMAX_LAYER_MAX
+/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
+ *
+ * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP16"
+ *
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+void main(void) // FP16 SOFTMAX_LAYER_NORM: divides four src values by the row's sum and stores them in dst
+{
+ Image src = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+ Image dst = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ Image sum = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum); // NO_STEP variant: the row is selected explicitly below
+
+ // Load the sum for this row (selected by the invocation's y coordinate); only the first unpacked half is used
+ vec2 sum1;
+ load_and_unpack(sum1, sum, 0, int(gl_GlobalInvocationID.y));
+ vec4 sum_val1 = CONVERT(sum1.x, vec4);
+
+ vec4 data1;
+ GC_LOAD4_IMAGE(data1, src, 0, 0); // four values from two packed words
+ vec4 res = DIV_OP(data1, sum_val1); // normalise by the row sum
+ GC_STORE4_IMAGE(res, dst, 0, 0); // repack as two words and store
+}
+#endif // SOFTMAX_LAYER_MAX
+#endif // DATA_TYPE_FP32 \ No newline at end of file
diff --git a/src/core/GLES_COMPUTE/cs_shaders/transpose.cs b/src/core/GLES_COMPUTE/cs_shaders/transpose.cs
new file mode 100755
index 0000000000..6d020fe70d
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/transpose.cs
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+#ifdef DATA_TYPE_FP32
+precision highp float;
+
+// Buffers for the source (readonly) and destination (writeonly) matrices, declared with float elements.
+// NOTE(review): the second argument (1 / 2) is presumably the buffer binding index -- confirm against BUFFER_DECLARATION in helpers.h.
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+
+// Uniform block (std140 layout) carrying the image parameters of both matrices; the individual fields come from
+// IMAGE_PARAM_DECLARATION and are presumably the stride/step/offset values listed in the kernel's @param documentation -- TODO confirm.
+layout(std140) uniform shader_params
+{
+ IMAGE_PARAM_DECLARATION(src);
+ IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** Load four values from buffer @p name into the components of @p r.
+ *
+ * Component x is read with LOAD4 at @p offset, and y, z, w at @p offset + 1, + 2, + 3
+ * (in whatever unit LOAD4 expects). The offset argument is parenthesised before the addition
+ * so that an expression argument such as `a >> 2` keeps its intended precedence.
+ *
+ * @note Expands to several statements: use it only as a stand-alone statement inside a braced scope.
+ */
+#define LOAD16(r, name, offset) \
+    r.x = LOAD4(name, (offset)); \
+    r.y = LOAD4(name, (offset) + uint(1)); \
+    r.z = LOAD4(name, (offset) + uint(2)); \
+    r.w = LOAD4(name, (offset) + uint(3))
+
+/** Store the four components of @p r into buffer @p name.
+ *
+ * Component x is written with STORE4 at @p offset, and y, z, w at @p offset + 1, + 2, + 3
+ * (in whatever unit STORE4 expects). The offset argument is parenthesised before the addition
+ * so that an expression argument keeps its intended precedence.
+ *
+ * @note Expands to several statements: use it only as a stand-alone statement inside a braced scope.
+ */
+#define STORE16(name, offset, r) \
+    STORE4(name, (offset), r.x); \
+    STORE4(name, (offset) + uint(1), r.y); \
+    STORE4(name, (offset) + uint(2), r.z); \
+    STORE4(name, (offset) + uint(3), r.w)
+
+/** Transposes a matrix of F32 values, one 4x4 tile per work item.
+ *
+ * Each invocation reads four rows of four values from the source, transposes that tile and
+ * writes it to the destination at the position obtained by swapping the roles of the global X and Y indices.
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main(void)
+{
+    // Build the image descriptors for both matrices
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Read the four rows of the 4x4 source tile
+    vec4 row0;
+    vec4 row1;
+    vec4 row2;
+    vec4 row3;
+    LOAD16(row0, src, offset(src, 0, 0));
+    LOAD16(row1, src, offset(src, 0, 1));
+    LOAD16(row2, src, offset(src, 0, 2));
+    LOAD16(row3, src, offset(src, 0, 3));
+
+    // Row i of the transposed tile gathers component i of every source row
+    vec4 col0 = vec4(row0.x, row1.x, row2.x, row3.x);
+    vec4 col1 = vec4(row0.y, row1.y, row2.y, row3.y);
+    vec4 col2 = vec4(row0.z, row1.z, row2.z, row3.z);
+    vec4 col3 = vec4(row0.w, row1.w, row2.w, row3.w);
+
+    // Byte offset of the destination tile: 16 bytes per step of global Y, four destination rows per step of global X
+    uint dst_tile_bytes = uint(16) * uint(gl_GlobalInvocationID.y) + uint(4) * uint(gl_GlobalInvocationID.x) * (dst.stride_y) + (dst.offset_first_element_in_bytes);
+
+    // One destination row per transposed vector; the byte offset is shifted right by 2 before being passed to STORE16
+    STORE16(dst, uint((dst_tile_bytes + uint(0) * dst.stride_y) >> 2), col0);
+    STORE16(dst, uint((dst_tile_bytes + uint(1) * dst.stride_y) >> 2), col1);
+    STORE16(dst, uint((dst_tile_bytes + uint(2) * dst.stride_y) >> 2), col2);
+    STORE16(dst, uint((dst_tile_bytes + uint(3) * dst.stride_y) >> 2), col3);
+}
+
+#elif defined(DATA_TYPE_FP16)
+precision mediump float;
+
+// Buffers for the source (readonly) and destination (writeonly) matrices, declared with uvec2 elements;
+// main() unpacks each uvec2 into four half-precision values with unpackHalf2x16.
+// NOTE(review): the second argument (1 / 2) is presumably the buffer binding index -- confirm against BUFFER_DECLARATION in helpers.h.
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+
+// Uniform block (std140 layout) carrying the image parameters of both matrices; the individual fields come from
+// IMAGE_PARAM_DECLARATION and are presumably the stride/step/offset values listed in the kernel's @param documentation -- TODO confirm.
+layout(std140) uniform shader_params
+{
+ IMAGE_PARAM_DECLARATION(src);
+ IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** Transposes a matrix of F16 values, one 4x4 tile per work item.
+ *
+ * Each invocation reads four packed rows (one uvec2, i.e. four half-precision values, per row), transposes the tile
+ * and writes the re-packed rows to the destination at the position obtained by swapping the roles of the global X and Y indices.
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main(void)
+{
+    // Build the image descriptors for both matrices
+    Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Read the four packed rows of the 4x4 source tile
+    uvec2 packed_rows[4];
+    GC_LOAD1_2D_OFFSET(packed_rows[0], src, 0, 0);
+    GC_LOAD1_2D_OFFSET(packed_rows[1], src, 0, 1);
+    GC_LOAD1_2D_OFFSET(packed_rows[2], src, 0, 2);
+    GC_LOAD1_2D_OFFSET(packed_rows[3], src, 0, 3);
+
+    // Expand every packed row into four float components
+    vec4 row0 = vec4(unpackHalf2x16(packed_rows[0].x), unpackHalf2x16(packed_rows[0].y));
+    vec4 row1 = vec4(unpackHalf2x16(packed_rows[1].x), unpackHalf2x16(packed_rows[1].y));
+    vec4 row2 = vec4(unpackHalf2x16(packed_rows[2].x), unpackHalf2x16(packed_rows[2].y));
+    vec4 row3 = vec4(unpackHalf2x16(packed_rows[3].x), unpackHalf2x16(packed_rows[3].y));
+
+    // Row i of the transposed tile gathers component i of every source row
+    vec4 col0 = vec4(row0.x, row1.x, row2.x, row3.x);
+    vec4 col1 = vec4(row0.y, row1.y, row2.y, row3.y);
+    vec4 col2 = vec4(row0.z, row1.z, row2.z, row3.z);
+    vec4 col3 = vec4(row0.w, row1.w, row2.w, row3.w);
+
+    // Byte offset of the destination tile: 8 bytes per step of global Y, dst_step_y bytes per step of global X
+    uint dst_tile_bytes = uint(8) * uint(gl_GlobalInvocationID.y) + uint(gl_GlobalInvocationID.x) * (dst_step_y) + (dst.offset_first_element_in_bytes);
+
+    // Re-pack the transposed rows as pairs of half2 words
+    uvec2 packed_cols[4];
+    packed_cols[0] = uvec2(packHalf2x16(col0.xy), packHalf2x16(col0.zw));
+    packed_cols[1] = uvec2(packHalf2x16(col1.xy), packHalf2x16(col1.zw));
+    packed_cols[2] = uvec2(packHalf2x16(col2.xy), packHalf2x16(col2.zw));
+    packed_cols[3] = uvec2(packHalf2x16(col3.xy), packHalf2x16(col3.zw));
+
+    // One destination row per packed vector; the byte offset is shifted right by 3 before being passed to GC_STORE1
+    GC_STORE1(packed_cols[0], dst, uint((dst_tile_bytes + uint(0) * dst_stride_y) >> 3));
+    GC_STORE1(packed_cols[1], dst, uint((dst_tile_bytes + uint(1) * dst_stride_y) >> 3));
+    GC_STORE1(packed_cols[2], dst, uint((dst_tile_bytes + uint(2) * dst_stride_y) >> 3));
+    GC_STORE1(packed_cols[3], dst, uint((dst_tile_bytes + uint(3) * dst_stride_y) >> 3));
+}
+#endif /* DATA_TYPE_FP32 || DATA_TYPE_FP16 */