aboutsummaryrefslogtreecommitdiff
path: root/src/core/GLES_COMPUTE
diff options
context:
space:
mode:
authorJoel Liang <joel.liang@arm.com>2018-01-05 15:12:53 +0800
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:42:33 +0000
commit09849a0e7128731473f37cf6045147db68b1c495 (patch)
tree865841c62c4910a34c7f0e86bd5bbc7e14a53e30 /src/core/GLES_COMPUTE
parent07d4054812d56afdae081294503f98c6dcaea048 (diff)
downloadComputeLibrary-09849a0e7128731473f37cf6045147db68b1c495.tar.gz
APPBROWSER-372: Rewrite the direct_convolution5x5.cs with the new common code
Change-Id: Ie2f398d62dea97e9201f77d22c9f0796db297b63
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/115280
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Zhenglin Li <zhenglin.li@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/GLES_COMPUTE')
-rw-r--r--src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs982
-rwxr-xr-xsrc/core/GLES_COMPUTE/cs_shaders/helpers_cs.h35
2 files changed, 116 insertions, 901 deletions
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
index a36bd438ff..c919e4ed80 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,488 +24,114 @@
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-#include "helpers.h"
+#include "helpers_cs.h"
-#ifdef DATA_TYPE_FP32
-
-precision highp float;
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
/** This kernel performs a direct convolution to convolve the low three dimensions
*
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note This kernel has multiple optimized direct convolution options for FP16.
+ * The direct convolution option must be passed at compile time using "#define PROCESS_nX_nY_nZ" e.g. "#define PROCESS_8X_1Y_1Z"
+ * @note The convolution stride x must be passed at compile time using "#define STRIDE_X n" e.g. "#define STRIDE_X 1"
+ * This OpenGL ES shader works with stride_x = 1 and 2
* @note If biases are used then "define HAS_BIAS" has to be passed at compile time
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_attrs The attributes of the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_attrs The attributes of the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in] weights_depth The third dimension of the weights tensor
*/
-
-layout(std140) uniform shader_params
+SHADER_PARAMS_DECLARATION
{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
- TENSOR3D_PARAM_DECLARATION(weights);
+ Tensor3DAttributes src_attrs;
+ Tensor3DAttributes dst_attrs;
+ Tensor3DAttributes weights_attrs;
#ifdef BIAS
- VECTOR_PARAM_DECLARATION(biases);
+ VectorAttributes biases_attrs;
#endif /* BIAS */
uint weights_stride_w;
uint weights_depth;
};
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, writeonly);
-BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef DATA_TYPE_FP32
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
-BUFFER_DECLARATION(biases, 4, float, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */
-#define LOAD20(r, name, offset) \
- r[0] = LOAD4(name, offset); \
- r[1] = LOAD4(name, offset + uint(1)); \
- r[2] = LOAD4(name, offset + uint(2)); \
- r[3] = LOAD4(name, offset + uint(3)); \
- r[4] = LOAD4(name, offset + uint(4))
-
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
void main()
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
#ifdef BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */
- float pixels = CONVERT(0, float);
+ float pixels = 0.f;
uint z_index = gl_GlobalInvocationID.z;
- weights.current_offset += z_index * weights_stride_w >> 2;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
+
float temp[5];
float temp_weight[5];
-
for(int d = 0; d < int(weights_depth); ++d)
{
- LOAD20(temp, src, offset(src, 0, 0));
- LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 0, 0));
+ temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 0));
+ temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
- LOAD20(temp, src, offset(src, 0, 1));
- LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 1, 0));
+ temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
+ temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
- LOAD20(temp, src, offset(src, 0, 2));
- LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 2, 0));
+ temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
+ temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
- LOAD20(temp, src, offset(src, 0, 3));
- LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 3, 0));
+ temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
+ temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 3, 0));
pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
- LOAD20(temp, src, offset(src, 0, 4));
- LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 4, 0));
+ temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 4));
+ temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 4, 0));
pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
- src.current_offset += (src_stride_z >> 2);
- weights.current_offset += (weights_stride_z >> 2);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
}
#ifdef BIAS
- pixels += LOAD4(biases, vector_offset(biases, int(z_index)));
+ pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
#endif /* BIAS */
- STORE4(dst, CURRENT_OFFSET(dst), pixels);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
}
-
#elif defined(DATA_TYPE_FP16)
-precision mediump float;
-
-#if defined(PROCESS_4X_1Y_1Z)
-
-/** This kernel performs a direct convolution to convolve the low three dimensions
- *
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
-
-layout(std140) uniform shader_params
-{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
- TENSOR3D_PARAM_DECLARATION(weights);
-#ifdef BIAS
- VECTOR_PARAM_DECLARATION(biases);
-#endif /* BIAS */
- uint weights_stride_w;
- uint weights_depth;
-};
-
-BUFFER_DECLARATION(src, 1, uvec2, readonly);
-BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
-#ifdef BIAS
-BUFFER_DECLARATION(biases, 4, uint, readonly);
-#endif /* BIAS */
-
-#if STRIDE_X == 1
-#define LOAD_SRC(src, row) load_src_stride1(src, row)
-#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define LOAD_SRC(src, row) load_src_stride2(src, row)
-#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight)
-#else /* STRDIDE_X == 1 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 1 */
-
-vec4[2] load_src_stride1(Image src, int row)
-{
- uvec2 packed[2];
- vec4 ret[2];
-
- GC_LOAD2_2D_OFFSET(packed, src, 0, row);
-
- ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
- ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
-
- return ret;
-}
-
-vec4[3] load_src_stride2(Image src, int row)
-{
- uvec2 packed[3];
- vec4 ret[3];
-
- GC_LOAD3_2D_OFFSET(packed, src, 0, row);
-
- ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
- ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
- ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y));
-
- return ret;
-}
-
-vec2[3] load_weight(Tensor3D weights, int row)
-{
- uvec3 packed_w;
- vec2 ret[3];
-
- GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
-
- ret[0] = vec2(unpackHalf2x16(packed_w[0]));
- ret[1] = vec2(unpackHalf2x16(packed_w[1]));
- ret[2] = vec2(unpackHalf2x16(packed_w[2]));
-
- return ret;
-}
-
-vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
-{
- vec4 src0 = tmp[0];
- vec4 src1 = vec4(tmp[0].yzw, tmp[1].x);
- vec4 src2 = vec4(tmp[0].zw, tmp[1].xy);
- vec4 src3 = vec4(tmp[0].w, tmp[1].xyz);
- vec4 src4 = tmp[1];
- vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
- return ret;
-}
-
-vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
-{
- vec4 src0 = vec4(tmp[0].xz, tmp[1].xz);
- vec4 src1 = vec4(tmp[0].yw, tmp[1].yw);
- vec4 src2 = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
- vec4 src3 = vec4(tmp[0].w, tmp[1].yw, tmp[2].y);
- vec4 src4 = vec4(tmp[1].x, tmp[1].z, tmp[2].xz);
- vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
- return ret;
-}
-
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
-void main()
-{
- Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#ifdef BIAS
- Vector biases = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-#endif /* BIAS */
-
- vec4 res = vec4(0);
- vec2 w[3];
- vec4 s[STRIDE_X + 1];
- uvec2 packed_d;
- uint z_index = gl_GlobalInvocationID.z;
-
- weights.current_offset += z_index * weights_stride_w;
-
- for(int d = 0; d < int(weights_depth); ++d)
- {
- for(int row = 0; row < 5; row++)
- {
- w = load_weight(weights, row);
- s = LOAD_SRC(src, row);
- res += CONVOLVE1x5(s, w);
- }
-
- src.current_offset += src_stride_z;
- weights.current_offset += weights_stride_z;
- }
-
-#ifdef BIAS
- uint packed_b;
- float b;
-
- GC_LOAD1_1D_OFFSET(packed_b, biases, z_index);
- b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
- res += vec4(b);
-#endif /* BIAS */
-
- packed_d = uvec2(packHalf2x16(res.xy), packHalf2x16(res.zw));
- GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
-}
-
-#elif defined(PROCESS_4X_3Y_1Z)
-
-/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 3 elements @ Y at once
- *
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
-
-layout(std140) uniform shader_params
-{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
- TENSOR3D_PARAM_DECLARATION(weights);
-#ifdef BIAS
- VECTOR_PARAM_DECLARATION(biases);
-#endif /* BIAS */
- uint weights_stride_w;
- uint weights_depth;
-};
-
-BUFFER_DECLARATION(src, 1, uvec2, readonly);
-BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
-#ifdef BIAS
-BUFFER_DECLARATION(bias, 4, uint, readonly);
-#endif /* BIAS */
-
+// Common definitions for DATA_TYPE_FP16
#if STRIDE_X == 1
-#define LOAD_SRC(src, row) load_src_stride1(src, row)
+#define LOAD_SRC_AT_ROW(row) VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, row))
#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define LOAD_SRC(src, row) load_src_stride2(src, row)
+#define LOAD_SRC_AT_ROW(row) VLOAD3_UNPACK12_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, row))
#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight)
#else /* STRDIDE_X == 1 */
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X == 1 */
-vec4[2] load_src_stride1(Image src, int row)
-{
- uvec2 packed[2];
- vec4 ret[2];
-
- GC_LOAD2_2D_OFFSET(packed, src, 0, row);
-
- ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
- ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
-
- return ret;
-}
-
-vec4[3] load_src_stride2(Image src, int row)
-{
- uvec2 packed[3];
- vec4 ret[3];
-
- GC_LOAD3_2D_OFFSET(packed, src, 0, row);
-
- ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
- ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
- ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y));
-
- return ret;
-}
-
-vec2[3] load_weight(Tensor3D weights, int row)
-{
- uvec3 packed_w;
- vec2 ret[3];
-
- GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
-
- ret[0] = vec2(unpackHalf2x16(packed_w[0]));
- ret[1] = vec2(unpackHalf2x16(packed_w[1]));
- ret[2] = vec2(unpackHalf2x16(packed_w[2]));
-
- return ret;
-}
+#define LOAD_WEIGHT_AT_ROW(row) VLOAD3_UNPACK6_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, row, 0))
vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
{
@@ -531,501 +157,57 @@ vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
return ret;
}
-void main()
-{
- Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#ifdef BIAS
- Vector bias = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-#endif /* BIAS */
-
- vec4 res[3];
- vec2 w[5][3];
- vec4 s[STRIDE_X + 1];
- uvec2 packed_d;
- uint z_index = gl_GlobalInvocationID.z;
- int i;
-
- for(i = 0; i < 3; i++)
- {
- res[i] = vec4(0);
- }
-
- weights.current_offset += z_index * weights_stride_w;
-
- for(int d = 0; d < int(weights_depth); ++d)
- {
- // load weights once
- for(int row = 0; row < 5; row++)
- {
- w[row] = load_weight(weights, row);
- }
-
- // 1st line
- s = LOAD_SRC(src, 0);
- res[0] += CONVOLVE1x5(s, w[0]);
-
- // 2nd line
- s = LOAD_SRC(src, 1);
- res[0] += CONVOLVE1x5(s, w[1]);
- res[1] += CONVOLVE1x5(s, w[0]);
-
- // 3rd line
- s = LOAD_SRC(src, 2);
- res[0] += CONVOLVE1x5(s, w[2]);
- res[1] += CONVOLVE1x5(s, w[1]);
- res[2] += CONVOLVE1x5(s, w[0]);
-
- // 4th line
- s = LOAD_SRC(src, 3);
- res[0] += CONVOLVE1x5(s, w[3]);
- res[1] += CONVOLVE1x5(s, w[2]);
- res[2] += CONVOLVE1x5(s, w[1]);
-
- // 5th line
- s = LOAD_SRC(src, 4);
- res[0] += CONVOLVE1x5(s, w[4]);
- res[1] += CONVOLVE1x5(s, w[3]);
- res[2] += CONVOLVE1x5(s, w[2]);
-
- // 6th line
- s = LOAD_SRC(src, 5);
- res[1] += CONVOLVE1x5(s, w[4]);
- res[2] += CONVOLVE1x5(s, w[3]);
-
- // 7th line
- s = LOAD_SRC(src, 6);
- res[2] += CONVOLVE1x5(s, w[4]);
-
- src.current_offset += src_stride_z;
- weights.current_offset += weights_stride_z;
- }
-
-#ifdef BIAS
- uint packed_b;
- float b;
-
- GC_LOAD1_1D_OFFSET(packed_b, bias, z_index);
- b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
- for(i = 0; i < 3; i++)
- {
- res[i] += vec4(b);
- }
-#endif /* BIAS */
-
- for(i = 0; i < 3; i++)
- {
- packed_d = uvec2(packHalf2x16(res[i].xy), packHalf2x16(res[i].zw));
- GC_STORE1_3D_OFFSET(packed_d, dst, 0, i, 0);
- }
-}
-
-#elif defined(PROCESS_4X_3Y_2Z)
-
-/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 3 elements @ Y and 2 elements @ Z at once
- *
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
-
-layout(std140) uniform shader_params
-{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
- TENSOR3D_PARAM_DECLARATION(weights);
-#ifdef BIAS
- VECTOR_PARAM_DECLARATION(biases);
-#endif /* BIAS */
- uint weights_stride_w;
- uint weights_depth;
-};
-
-BUFFER_DECLARATION(src, 1, uvec2, readonly);
-BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
-#ifdef BIAS
-BUFFER_DECLARATION(bias, 4, uint, readonly);
-#endif /* BIAS */
-
-#if STRIDE_X == 1
-#define LOAD_SRC(src, row) load_src_stride1(src, row)
-#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define LOAD_SRC(src, row) load_src_stride2(src, row)
-#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight)
-#else /* STRDIDE_X == 1 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 1 */
-
-vec4[2] load_src_stride1(Image src, int row)
-{
- uvec2 packed[2];
- vec4 ret[2];
-
- GC_LOAD2_2D_OFFSET(packed, src, 0, row);
-
- ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
- ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
-
- return ret;
-}
-
-vec4[3] load_src_stride2(Image src, int row)
-{
- uvec2 packed[3];
- vec4 ret[3];
-
- GC_LOAD3_2D_OFFSET(packed, src, 0, row);
-
- ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
- ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
- ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y));
-
- return ret;
-}
-
-vec2[3] load_weight(Tensor3D weights, int row)
-{
- uvec3 packed_w;
- vec2 ret[3];
-
- GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
-
- ret[0] = vec2(unpackHalf2x16(packed_w[0]));
- ret[1] = vec2(unpackHalf2x16(packed_w[1]));
- ret[2] = vec2(unpackHalf2x16(packed_w[2]));
-
- return ret;
-}
-
-vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
-{
- vec4 src0 = tmp[0];
- vec4 src1 = vec4(tmp[0].yzw, tmp[1].x);
- vec4 src2 = vec4(tmp[0].zw, tmp[1].xy);
- vec4 src3 = vec4(tmp[0].w, tmp[1].xyz);
- vec4 src4 = tmp[1];
- vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
- return ret;
-}
-
-vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
-{
- vec4 src0 = vec4(tmp[0].xz, tmp[1].xz);
- vec4 src1 = vec4(tmp[0].yw, tmp[1].yw);
- vec4 src2 = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
- vec4 src3 = vec4(tmp[0].w, tmp[1].yw, tmp[2].y);
- vec4 src4 = vec4(tmp[1].x, tmp[1].z, tmp[2].xz);
- vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
- return ret;
-}
-
-void main()
-{
- Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#ifdef BIAS
- Vector bias = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-#endif /* BIAS */
-
- vec4 res[3];
- vec2 w[5][3];
- vec4 s[STRIDE_X + 1];
- uvec2 packed_d;
- uint z_index = (gl_GlobalInvocationID.z);
- uint s_offset = src.current_offset;
- int i, z;
-
- weights.current_offset += z_index * weights_stride_w;
-
- for(z = 0; z < 2; z++)
- {
- z_index += uint(z);
- src.current_offset = s_offset;
-
- for(i = 0; i < 3; i++)
- {
- res[i] = vec4(0);
- }
-
- for(int d = 0; d < int(weights_depth); ++d)
- {
- // load weights once
- for(int row = 0; row < 5; row++)
- {
- w[row] = load_weight(weights, row);
- }
-
- // 1st line
- s = LOAD_SRC(src, 0);
- res[0] += CONVOLVE1x5(s, w[0]);
-
- // 2nd line
- s = LOAD_SRC(src, 1);
- res[0] += CONVOLVE1x5(s, w[1]);
- res[1] += CONVOLVE1x5(s, w[0]);
-
- // 3rd line
- s = LOAD_SRC(src, 2);
- res[0] += CONVOLVE1x5(s, w[2]);
- res[1] += CONVOLVE1x5(s, w[1]);
- res[2] += CONVOLVE1x5(s, w[0]);
-
- // 4th line
- s = LOAD_SRC(src, 3);
- res[0] += CONVOLVE1x5(s, w[3]);
- res[1] += CONVOLVE1x5(s, w[2]);
- res[2] += CONVOLVE1x5(s, w[1]);
-
- // 5th line
- s = LOAD_SRC(src, 4);
- res[0] += CONVOLVE1x5(s, w[4]);
- res[1] += CONVOLVE1x5(s, w[3]);
- res[2] += CONVOLVE1x5(s, w[2]);
-
- // 6th line
- s = LOAD_SRC(src, 5);
- res[1] += CONVOLVE1x5(s, w[4]);
- res[2] += CONVOLVE1x5(s, w[3]);
-
- // 7th line
- s = LOAD_SRC(src, 6);
- res[2] += CONVOLVE1x5(s, w[4]);
-
- src.current_offset += src_stride_z;
- weights.current_offset += weights_stride_z;
- }
-
-#ifdef BIAS
- uint packed_b;
- float b;
-
- GC_LOAD1_1D_OFFSET(packed_b, bias, z_index);
- b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
- for(i = 0; i < 3; i++)
- {
- res[i] += vec4(b);
- }
-#endif /* BIAS */
-
- for(i = 0; i < 3; i++)
- {
- packed_d = uvec2(packHalf2x16(res[i].xy), packHalf2x16(res[i].zw));
- GC_STORE1_3D_OFFSET(packed_d, dst, 0, i, 0);
- }
-
- dst.current_offset += dst_stride_z;
- }
-}
-
-#elif defined(PROCESS_8X_1Y_1Z)
-
-/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 8 elements @ X at once
- *
- * @note This OpenGL ES shader works with stride_x = 1
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
-
-layout(std140) uniform shader_params
-{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
- TENSOR3D_PARAM_DECLARATION(weights);
-#ifdef BIAS
- VECTOR_PARAM_DECLARATION(biases);
-#endif /* BIAS */
- uint weights_stride_w;
- uint weights_depth;
-};
-
-BUFFER_DECLARATION(src, 1, uvec4, readonly);
-BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
+#if defined(PROCESS_4X_1Y_1Z) // NOTE(review): the 6th TENSOR_DECLARATION argument (3 with uvec2, 2 with uint) looks like log2 of the element size in bytes - confirm in helpers_cs.h
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly); // src is addressed in uvec2 elements
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly); // dst is addressed in uvec2 elements
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly); // weights are addressed in uint elements
#ifdef BIAS
-BUFFER_DECLARATION(bias, 4, uint, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly); // biases are addressed in uint elements
#endif /* BIAS */
-#if STRIDE_X == 1
-#define LOAD_SRC(src, row) load_src_stride1(src, row)
-#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#error stride == 2 for PROCESS_8X_1Y not implemented
-#else /* STRDIDE_X == 1 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 1 */
-
-vec4[3] load_src_stride1(Image src, int row)
-{
- uvec4 packed[2];
- vec4 ret[3];
-
- GC_LOAD2_2D_OFFSET(packed, src, 0, row);
-
- ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
- ret[1] = vec4(unpackHalf2x16(packed[0].z), unpackHalf2x16(packed[0].w));
- ret[2] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
-
- return ret;
-}
-
-vec2[3] load_weight(Tensor3D weights, int row)
-{
- uvec3 packed_w;
- vec2 ret[3];
-
- GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
-
- ret[0] = vec2(unpackHalf2x16(packed_w[0]));
- ret[1] = vec2(unpackHalf2x16(packed_w[1]));
- ret[2] = vec2(unpackHalf2x16(packed_w[2]));
-
- return ret;
-}
-
-vec4[2] convolve1x5_stride1(vec4 tmp[3], vec2 w[3])
-{
- vec4 src0 = tmp[0];
- vec4 src1 = vec4(tmp[0].yzw, tmp[1].x);
- vec4 src2 = vec4(tmp[0].zw, tmp[1].xy);
- vec4 src3 = vec4(tmp[0].w, tmp[1].xyz);
- vec4 src4 = tmp[1];
- vec4 ret[2];
-
- ret[0] = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
- src0 = tmp[1];
- src1 = vec4(tmp[1].yzw, tmp[2].x);
- src2 = vec4(tmp[1].zw, tmp[2].xy);
- src3 = vec4(tmp[1].w, tmp[2].xyz);
- src4 = tmp[2];
- ret[1] = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
- return ret;
-}
-
void main()
{
- Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift); // NO_STEP variant: the iterator is advanced explicitly below
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
#ifdef BIAS
- Vector bias = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */
- vec4 res[2];
- vec2 w[3];
- vec4 s[STRIDE_X + 2];
- uvec4 packed_d;
- uint z_index = gl_GlobalInvocationID.z;
+ vec4 res = vec4(0); // accumulator for the single vec4 this invocation stores
+ vec2 w[3]; // weights of the kernel row currently being applied
+ vec4 s[STRIDE_X + 1]; // source row window: vec4[2] for stride 1, vec4[3] for stride 2
- res[0] = vec4(0);
- res[1] = vec4(0);
- weights.current_offset += z_index * weights_stride_w;
+ uint z_index = gl_GlobalInvocationID.z; // used for the weights offset and the bias lookup
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w); // move to the weights belonging to this z_index
for(int d = 0; d < int(weights_depth); ++d)
{
for(int row = 0; row < 5; row++)
{
- w = load_weight(weights, row);
- s = LOAD_SRC(src, row);
- res[0] += CONVOLVE1x5(s, w)[0];
- res[1] += CONVOLVE1x5(s, w)[1];
+ w = LOAD_WEIGHT_AT_ROW(row); // kernel row `row` at the current weights position
+ s = LOAD_SRC_AT_ROW(row); // matching source row at the current src position
+ res += CONVOLVE1x5(s, w); // add this row's 1x5 contribution
}
- src.current_offset += src_stride_z;
- weights.current_offset += weights_stride_z;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z); // step src by its Z stride for the next d
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z); // step weights by their Z stride for the next d
}
#ifdef BIAS
- uint packed_b;
+ vec2 vec2_b; // pair of unpacked halves; only one of them is the bias for z_index
float b;
- GC_LOAD1_1D_OFFSET(packed_b, bias, z_index);
- b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
- res[0] += vec4(b);
- res[1] += vec4(b);
+ vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
+ b = (z_index % uint(2) == uint(0)) ? vec2_b.x : vec2_b.y; // even z_index -> .x, odd z_index -> .y
+ res += vec4(b); // same bias added to all four lanes
#endif /* BIAS */
- packed_d.xy = uvec2(packHalf2x16(res[0].xy), packHalf2x16(res[0].zw));
- packed_d.zw = uvec2(packHalf2x16(res[1].xy), packHalf2x16(res[1].zw));
- GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
+ STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, res); // per the macro name: pack res as 4 halves and store at dst_iter's current item
}
-#else /* defined(PROCESS_4X_1Y_1Z) */
-
-#endif /* defined(PROCESS_4X_1Y_1Z) */
-
-#else /* DATA_TYPE_FP16 */
+#endif /* PROCESS_nX_nY_nZ */
+#else /* DATA_TYPE_FP32 */
#error Data type not supported
-#endif /* DATA_TYPE_FP16 */
+#endif /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
index fffc87d90d..dd9e1a3864 100755
--- a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
+++ b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -326,6 +326,23 @@ uint tensor3D_offset_in_bytes(Tensor3DIterator tensor_iter, int x, int y, int z)
#define VLOAD4_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD4(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
#define VSTORE4_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE4(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+#define VLOAD5(return_type, tensor_ptr, offset) \
+ return_type(LOAD(tensor_ptr, offset), \
+ LOAD(tensor_ptr, (offset) + uint(1)), \
+ LOAD(tensor_ptr, (offset) + uint(2)), \
+ LOAD(tensor_ptr, (offset) + uint(3)), \
+ LOAD(tensor_ptr, (offset) + uint(4))) // VLOAD5: constructs return_type from five LOADs at offset .. offset + 4
+// VSTORE5 writes data[0..4] to offset .. offset + 4. NOTE(review): it expands to five ';'-separated statements, so wrap the call in braces when it is the body of an if/for.
+#define VSTORE5(tensor_ptr, offset, data) \
+ STORE(tensor_ptr, offset, data[0]); \
+ STORE(tensor_ptr, (offset) + uint(1), data[1]); \
+ STORE(tensor_ptr, (offset) + uint(2), data[2]); \
+ STORE(tensor_ptr, (offset) + uint(3), data[3]); \
+ STORE(tensor_ptr, (offset) + uint(4), data[4])
+// The *_CURRENT_ITEM forms below use CURRENT_ITEM_OFFSET(tensor_iter) as the offset.
+#define VLOAD5_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD5(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE5_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE5(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
/** Converting the vec4 object to 4 half-precision (16-bits) floating point values and packing into a uvec2 object
*
* @param[in] data The vec4 object to be packed
@@ -348,6 +365,19 @@ mediump vec4 unpack4_half(highp uvec2 packed_data)
return vec4(unpackHalf2x16(packed_data.x), unpackHalf2x16(packed_data.y));
}
+/** Unpack a uvec3 holding six packed half-precision (16-bit) floating point values into a vec2[3] object
+ *
+ * @param[in] packed_data The uvec3 object to be unpacked; each component is expanded with unpackHalf2x16
+ *
+ * @return The unpacked vec2[3] object, where element i is unpackHalf2x16(packed_data[i])
+ */
+mediump vec2[3] unpack6_half(highp uvec3 packed_data)
+{
+ return vec2[3](unpackHalf2x16(packed_data[0]),
+ unpackHalf2x16(packed_data[1]),
+ unpackHalf2x16(packed_data[2]));
+}
+
/** Converting the vec4[2] object to 8 half-precision (16-bits) floating point values and packing into a uvec4 object
*
* @param[in] data The vec4[2] object to be packed
@@ -396,6 +426,9 @@ mediump vec4[3] unpack12_half(highp uvec2[3] packed_data)
#define VLOAD2_UNPACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD2_UNPACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
#define VSTORE2_PACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) VSTORE2_PACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+#define VLOAD3_UNPACK6_HALF(tensor_ptr, offset) unpack6_half(VLOAD3(uvec3, tensor_ptr, offset)) // VLOAD3 as uvec3, then unpack6_half -> vec2[3]
+#define VLOAD3_UNPACK6_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD3_UNPACK6_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter)) // same, at the iterator's current item
+
#define VLOAD4_UNPACK8_HALF(tensor_ptr, offset) unpack8_half(VLOAD4(uvec4, tensor_ptr, offset))
#define VSTORE4_PACK8_HALF(tensor_ptr, offset, data) VSTORE4(tensor_ptr, offset, pack8_half(data))
#define VLOAD4_UNPACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD4_UNPACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))