From 09849a0e7128731473f37cf6045147db68b1c495 Mon Sep 17 00:00:00 2001 From: Joel Liang Date: Fri, 5 Jan 2018 15:12:53 +0800 Subject: APPBROWSER-372: Rewrite the direct_convolution5x5.cs with the new common code Change-Id: Ie2f398d62dea97e9201f77d22c9f0796db297b63 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/115280 Tested-by: Jenkins Reviewed-by: Zhenglin Li Reviewed-by: Anthony Barbier --- .../cs_shaders/direct_convolution5x5.cs | 982 ++------------------- src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h | 35 +- 2 files changed, 116 insertions(+), 901 deletions(-) (limited to 'src/core/GLES_COMPUTE') diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs index a36bd438ff..c919e4ed80 100644 --- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs +++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017, 2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -24,488 +24,114 @@ layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; -#include "helpers.h" +#include "helpers_cs.h" -#ifdef DATA_TYPE_FP32 - -precision highp float; +#if defined(DATA_TYPE_FP16) +precision mediump float; +#endif // DATA_TYPE_FP16 /** This kernel performs a direct convolution to convolve the low three dimensions * - * @note This OpenGL ES shader works with stride_x = 1 and 2 - * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" + * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32" + * @note This kernel has multiple optimized direct convolution options for FP16. + * The direct convolution option must be passed at compile time using "#define PROCESS_nX_nY_nZ" e.g. "#define PROCESS_8X_1Y_1Z" + * @note The convolution stride x must be passed at compile time using "#define STRIDE_X n" e.g. "#define STRIDE_X 1" + * This OpenGL ES shader works with stride_x = 1 and 2 * @note If biases are used then "define HAS_BIAS" has to be passed at compile time * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - * @param[in] weights_depth The third dimensions of the weights tensors + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] src_attrs The attributes of the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_attrs The attributes of the destination tensor + * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr + * @param[in] weights_attrs The attributes of the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr + * @param[in] biases_attrs The attributes of the weights tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + * @param[in] weights_depth The third dimensions of the weights tensors */ - -layout(std140) uniform shader_params +SHADER_PARAMS_DECLARATION { - TENSOR3D_PARAM_DECLARATION(src); - TENSOR3D_PARAM_DECLARATION(dst); - TENSOR3D_PARAM_DECLARATION(weights); + Tensor3DAttributes src_attrs; + Tensor3DAttributes dst_attrs; + Tensor3DAttributes weights_attrs; #ifdef BIAS - VECTOR_PARAM_DECLARATION(biases); + VectorAttributes biases_attrs; #endif /* BIAS */ uint weights_stride_w; uint weights_depth; }; -BUFFER_DECLARATION(src, 1, float, readonly); -BUFFER_DECLARATION(dst, 2, float, writeonly); -BUFFER_DECLARATION(weights, 3, float, readonly); +#ifdef DATA_TYPE_FP32 +TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly); +TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly); +TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly); #ifdef BIAS -BUFFER_DECLARATION(biases, 4, float, readonly); +TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly); #endif /* BIAS */ -#define LOAD20(r, name, offset) \ - r[0] = LOAD4(name, offset); \ - r[1] = LOAD4(name, offset + uint(1)); \ - r[2] = LOAD4(name, offset + uint(2)); \ - r[3] = LOAD4(name, offset + uint(3)); \ - r[4] = LOAD4(name, offset + uint(4)) - -/** This kernel performs a direct convolution to convolve the low three dimensions. - * - * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" - * @note If biases are used then "define HAS_BIAS" has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - * @param[in] weights_depth The third dimensions of the weights tensors - */ void main() { - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift); + Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift); + Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); #ifdef BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); + VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift); #endif /* BIAS */ - float pixels = CONVERT(0, float); + float pixels = 0.f; uint z_index = gl_GlobalInvocationID.z; - weights.current_offset += z_index * weights_stride_w >> 2; + TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w); + float temp[5]; float temp_weight[5]; - for(int d = 0; d < int(weights_depth); ++d) { - LOAD20(temp, src, offset(src, 0, 0)); - LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 0, 0)); + temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 0)); + temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0)); pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4]; - LOAD20(temp, src, offset(src, 0, 1)); - LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 1, 0)); + temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 1)); + temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0)); pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4]; - LOAD20(temp, src, offset(src, 0, 2)); - LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 2, 0)); + temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 2)); + temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0)); pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4]; - LOAD20(temp, src, offset(src, 0, 3)); - LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 3, 0)); + temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 3)); + temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 3, 0)); pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4]; - LOAD20(temp, src, offset(src, 0, 4)); - LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 4, 0)); + temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 4)); + temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 4, 0)); pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4]; - src.current_offset += (src_stride_z >> 2); - weights.current_offset += (weights_stride_z >> 2); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z); } #ifdef BIAS - pixels += LOAD4(biases, vector_offset(biases, int(z_index))); + pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index)); #endif /* BIAS */ - STORE4(dst, CURRENT_OFFSET(dst), pixels); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels); } - #elif defined(DATA_TYPE_FP16) -precision mediump float; - -#if defined(PROCESS_4X_1Y_1Z) - -/** This kernel performs a direct convolution to convolve the low three dimensions - * - * @note This OpenGL ES shader works with stride_x = 1 and 2 - * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" - * @note If biases are used then "define HAS_BIAS" has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - * @param[in] weights_depth The third dimensions of the weights tensors - */ - -layout(std140) uniform shader_params -{ - TENSOR3D_PARAM_DECLARATION(src); - TENSOR3D_PARAM_DECLARATION(dst); - TENSOR3D_PARAM_DECLARATION(weights); -#ifdef BIAS - VECTOR_PARAM_DECLARATION(biases); -#endif /* BIAS */ - uint weights_stride_w; - uint weights_depth; -}; - -BUFFER_DECLARATION(src, 1, uvec2, readonly); -BUFFER_DECLARATION(dst, 2, uvec2, writeonly); -BUFFER_DECLARATION(weights, 3, uint, readonly); -#ifdef BIAS -BUFFER_DECLARATION(biases, 4, uint, readonly); -#endif /* BIAS */ - -#if STRIDE_X == 1 -#define LOAD_SRC(src, row) load_src_stride1(src, row) -#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight) -#elif STRIDE_X == 2 /* STRIDE_X == 1 */ -#define LOAD_SRC(src, row) load_src_stride2(src, row) -#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight) -#else /* STRDIDE_X == 1 */ -#error STRIDE_X larger than 2 is not supported -#endif /* STRIDE_X == 1 */ - -vec4[2] load_src_stride1(Image src, int row) -{ - uvec2 packed[2]; - vec4 ret[2]; - - GC_LOAD2_2D_OFFSET(packed, src, 0, row); - - ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y)); - ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y)); - - return ret; -} - -vec4[3] load_src_stride2(Image src, int row) -{ - uvec2 packed[3]; - vec4 ret[3]; - - GC_LOAD3_2D_OFFSET(packed, src, 0, row); - - ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y)); - ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y)); - ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y)); - - return ret; -} - -vec2[3] load_weight(Tensor3D weights, int row) -{ - uvec3 packed_w; - vec2 ret[3]; - - GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0); - - ret[0] = vec2(unpackHalf2x16(packed_w[0])); - ret[1] = vec2(unpackHalf2x16(packed_w[1])); - ret[2] = vec2(unpackHalf2x16(packed_w[2])); - - return ret; -} - -vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3]) -{ - vec4 src0 = tmp[0]; - vec4 src1 = vec4(tmp[0].yzw, tmp[1].x); - vec4 src2 = vec4(tmp[0].zw, tmp[1].xy); - vec4 src3 = vec4(tmp[0].w, tmp[1].xyz); - vec4 src4 = tmp[1]; - vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x; - - return ret; -} - -vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3]) -{ - vec4 src0 = vec4(tmp[0].xz, tmp[1].xz); - vec4 src1 = vec4(tmp[0].yw, tmp[1].yw); - vec4 src2 = vec4(tmp[0].z, tmp[1].xz, tmp[2].x); - vec4 src3 = vec4(tmp[0].w, tmp[1].yw, tmp[2].y); - vec4 src4 = vec4(tmp[1].x, tmp[1].z, tmp[2].xz); - vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x; - - return ret; -} - -/** This kernel performs a direct convolution to convolve the low three dimensions. - * - * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" - * @note If biases are used then "define HAS_BIAS" has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - * @param[in] weights_depth The third dimensions of the weights tensors - */ -void main() -{ - Image src = GC_CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst); - -#ifdef BIAS - Vector biases = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); -#endif /* BIAS */ - - vec4 res = vec4(0); - vec2 w[3]; - vec4 s[STRIDE_X + 1]; - uvec2 packed_d; - uint z_index = gl_GlobalInvocationID.z; - - weights.current_offset += z_index * weights_stride_w; - - for(int d = 0; d < int(weights_depth); ++d) - { - for(int row = 0; row < 5; row++) - { - w = load_weight(weights, row); - s = LOAD_SRC(src, row); - res += CONVOLVE1x5(s, w); - } - - src.current_offset += src_stride_z; - weights.current_offset += weights_stride_z; - } - -#ifdef BIAS - uint packed_b; - float b; - - GC_LOAD1_1D_OFFSET(packed_b, biases, z_index); - b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y; - res += vec4(b); -#endif /* BIAS */ - - packed_d = uvec2(packHalf2x16(res.xy), packHalf2x16(res.zw)); - GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0); -} - -#elif defined(PROCESS_4X_3Y_1Z) - -/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 3 elements @ Y at once - * - * @note This OpenGL ES shader works with stride_x = 1 and 2 - * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" - * @note If biases are used then "define HAS_BIAS" has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - * @param[in] weights_depth The third dimensions of the weights tensors - */ - -layout(std140) uniform shader_params -{ - TENSOR3D_PARAM_DECLARATION(src); - TENSOR3D_PARAM_DECLARATION(dst); - TENSOR3D_PARAM_DECLARATION(weights); -#ifdef BIAS - VECTOR_PARAM_DECLARATION(biases); -#endif /* BIAS */ - uint weights_stride_w; - uint weights_depth; -}; - -BUFFER_DECLARATION(src, 1, uvec2, readonly); -BUFFER_DECLARATION(dst, 2, uvec2, writeonly); -BUFFER_DECLARATION(weights, 3, uint, readonly); -#ifdef BIAS -BUFFER_DECLARATION(bias, 4, uint, readonly); -#endif /* BIAS */ - +// Common definitions for DATA_TYPE_FP16 #if STRIDE_X == 1 -#define LOAD_SRC(src, row) load_src_stride1(src, row) +#define LOAD_SRC_AT_ROW(row) VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, row)) #define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight) #elif STRIDE_X == 2 /* STRIDE_X == 1 */ -#define LOAD_SRC(src, row) load_src_stride2(src, row) +#define LOAD_SRC_AT_ROW(row) VLOAD3_UNPACK12_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, row)) #define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight) #else /* STRDIDE_X == 1 */ #error STRIDE_X larger than 2 is not supported #endif /* STRIDE_X == 1 */ -vec4[2] load_src_stride1(Image src, int row) -{ - uvec2 packed[2]; - vec4 ret[2]; - - GC_LOAD2_2D_OFFSET(packed, src, 0, row); - - ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y)); - ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y)); - - return ret; -} - -vec4[3] load_src_stride2(Image src, int row) -{ - uvec2 packed[3]; - vec4 ret[3]; - - GC_LOAD3_2D_OFFSET(packed, src, 0, row); - - ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y)); - ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y)); - ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y)); - - return ret; -} - -vec2[3] load_weight(Tensor3D weights, int row) -{ - uvec3 packed_w; - vec2 ret[3]; - - GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0); - - ret[0] = vec2(unpackHalf2x16(packed_w[0])); - ret[1] = vec2(unpackHalf2x16(packed_w[1])); - ret[2] = vec2(unpackHalf2x16(packed_w[2])); - - return ret; -} +#define LOAD_WEIGHT_AT_ROW(row) VLOAD3_UNPACK6_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, row, 0)) vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3]) { @@ -531,501 +157,57 @@ vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3]) return ret; } -void main() -{ - Image src = GC_CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst); - -#ifdef BIAS - Vector bias = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias); -#endif /* BIAS */ - - vec4 res[3]; - vec2 w[5][3]; - vec4 s[STRIDE_X + 1]; - uvec2 packed_d; - uint z_index = gl_GlobalInvocationID.z; - int i; - - for(i = 0; i < 3; i++) - { - res[i] = vec4(0); - } - - weights.current_offset += z_index * weights_stride_w; - - for(int d = 0; d < int(weights_depth); ++d) - { - // load weights once - for(int row = 0; row < 5; row++) - { - w[row] = load_weight(weights, row); - } - - // 1st line - s = LOAD_SRC(src, 0); - res[0] += CONVOLVE1x5(s, w[0]); - - // 2nd line - s = LOAD_SRC(src, 1); - res[0] += CONVOLVE1x5(s, w[1]); - res[1] += CONVOLVE1x5(s, w[0]); - - // 3rd line - s = LOAD_SRC(src, 2); - res[0] += CONVOLVE1x5(s, w[2]); - res[1] += CONVOLVE1x5(s, w[1]); - res[2] += CONVOLVE1x5(s, w[0]); - - // 4th line - s = LOAD_SRC(src, 3); - res[0] += CONVOLVE1x5(s, w[3]); - res[1] += CONVOLVE1x5(s, w[2]); - res[2] += CONVOLVE1x5(s, w[1]); - - // 5th line - s = LOAD_SRC(src, 4); - res[0] += CONVOLVE1x5(s, w[4]); - res[1] += CONVOLVE1x5(s, w[3]); - res[2] += CONVOLVE1x5(s, w[2]); - - // 6th line - s = LOAD_SRC(src, 5); - res[1] += CONVOLVE1x5(s, w[4]); - res[2] += CONVOLVE1x5(s, w[3]); - - // 7th line - s = LOAD_SRC(src, 6); - res[2] += CONVOLVE1x5(s, w[4]); - - src.current_offset += src_stride_z; - weights.current_offset += weights_stride_z; - } - -#ifdef BIAS - uint packed_b; - float b; - - GC_LOAD1_1D_OFFSET(packed_b, bias, z_index); - b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y; - for(i = 0; i < 3; i++) - { - res[i] += vec4(b); - } -#endif /* BIAS */ - - for(i = 0; i < 3; i++) - { - packed_d = uvec2(packHalf2x16(res[i].xy), packHalf2x16(res[i].zw)); - GC_STORE1_3D_OFFSET(packed_d, dst, 0, i, 0); - } -} - -#elif defined(PROCESS_4X_3Y_2Z) - -/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 3 elements @ Y and 2 elements @ Z at once - * - * @note This OpenGL ES shader works with stride_x = 1 and 2 - * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" - * @note If biases are used then "define HAS_BIAS" has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - * @param[in] weights_depth The third dimensions of the weights tensors - */ - -layout(std140) uniform shader_params -{ - TENSOR3D_PARAM_DECLARATION(src); - TENSOR3D_PARAM_DECLARATION(dst); - TENSOR3D_PARAM_DECLARATION(weights); -#ifdef BIAS - VECTOR_PARAM_DECLARATION(biases); -#endif /* BIAS */ - uint weights_stride_w; - uint weights_depth; -}; - -BUFFER_DECLARATION(src, 1, uvec2, readonly); -BUFFER_DECLARATION(dst, 2, uvec2, writeonly); -BUFFER_DECLARATION(weights, 3, uint, readonly); -#ifdef BIAS -BUFFER_DECLARATION(bias, 4, uint, readonly); -#endif /* BIAS */ - -#if STRIDE_X == 1 -#define LOAD_SRC(src, row) load_src_stride1(src, row) -#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight) -#elif STRIDE_X == 2 /* STRIDE_X == 1 */ -#define LOAD_SRC(src, row) load_src_stride2(src, row) -#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight) -#else /* STRDIDE_X == 1 */ -#error STRIDE_X larger than 2 is not supported -#endif /* STRIDE_X == 1 */ - -vec4[2] load_src_stride1(Image src, int row) -{ - uvec2 packed[2]; - vec4 ret[2]; - - GC_LOAD2_2D_OFFSET(packed, src, 0, row); - - ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y)); - ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y)); - - return ret; -} - -vec4[3] load_src_stride2(Image src, int row) -{ - uvec2 packed[3]; - vec4 ret[3]; - - GC_LOAD3_2D_OFFSET(packed, src, 0, row); - - ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y)); - ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y)); - ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y)); - - return ret; -} - -vec2[3] load_weight(Tensor3D weights, int row) -{ - uvec3 packed_w; - vec2 ret[3]; - - GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0); - - ret[0] = vec2(unpackHalf2x16(packed_w[0])); - ret[1] = vec2(unpackHalf2x16(packed_w[1])); - ret[2] = vec2(unpackHalf2x16(packed_w[2])); - - return ret; -} - -vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3]) -{ - vec4 src0 = tmp[0]; - vec4 src1 = vec4(tmp[0].yzw, tmp[1].x); - vec4 src2 = vec4(tmp[0].zw, tmp[1].xy); - vec4 src3 = vec4(tmp[0].w, tmp[1].xyz); - vec4 src4 = tmp[1]; - vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x; - - return ret; -} - -vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3]) -{ - vec4 src0 = vec4(tmp[0].xz, tmp[1].xz); - vec4 src1 = vec4(tmp[0].yw, tmp[1].yw); - vec4 src2 = vec4(tmp[0].z, tmp[1].xz, tmp[2].x); - vec4 src3 = vec4(tmp[0].w, tmp[1].yw, tmp[2].y); - vec4 src4 = vec4(tmp[1].x, tmp[1].z, tmp[2].xz); - vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x; - - return ret; -} - -void main() -{ - Image src = GC_CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst); - -#ifdef BIAS - Vector bias = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias); -#endif /* BIAS */ - - vec4 res[3]; - vec2 w[5][3]; - vec4 s[STRIDE_X + 1]; - uvec2 packed_d; - uint z_index = (gl_GlobalInvocationID.z); - uint s_offset = src.current_offset; - int i, z; - - weights.current_offset += z_index * weights_stride_w; - - for(z = 0; z < 2; z++) - { - z_index += uint(z); - src.current_offset = s_offset; - - for(i = 0; i < 3; i++) - { - res[i] = vec4(0); - } - - for(int d = 0; d < int(weights_depth); ++d) - { - // load weights once - for(int row = 0; row < 5; row++) - { - w[row] = load_weight(weights, row); - } - - // 1st line - s = LOAD_SRC(src, 0); - res[0] += CONVOLVE1x5(s, w[0]); - - // 2nd line - s = LOAD_SRC(src, 1); - res[0] += CONVOLVE1x5(s, w[1]); - res[1] += CONVOLVE1x5(s, w[0]); - - // 3rd line - s = LOAD_SRC(src, 2); - res[0] += CONVOLVE1x5(s, w[2]); - res[1] += CONVOLVE1x5(s, w[1]); - res[2] += CONVOLVE1x5(s, w[0]); - - // 4th line - s = LOAD_SRC(src, 3); - res[0] += CONVOLVE1x5(s, w[3]); - res[1] += CONVOLVE1x5(s, w[2]); - res[2] += CONVOLVE1x5(s, w[1]); - - // 5th line - s = LOAD_SRC(src, 4); - res[0] += CONVOLVE1x5(s, w[4]); - res[1] += CONVOLVE1x5(s, w[3]); - res[2] += CONVOLVE1x5(s, w[2]); - - // 6th line - s = LOAD_SRC(src, 5); - res[1] += CONVOLVE1x5(s, w[4]); - res[2] += CONVOLVE1x5(s, w[3]); - - // 7th line - s = LOAD_SRC(src, 6); - res[2] += CONVOLVE1x5(s, w[4]); - - src.current_offset += src_stride_z; - weights.current_offset += weights_stride_z; - } - -#ifdef BIAS - uint packed_b; - float b; - - GC_LOAD1_1D_OFFSET(packed_b, bias, z_index); - b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y; - for(i = 0; i < 3; i++) - { - res[i] += vec4(b); - } -#endif /* BIAS */ - - for(i = 0; i < 3; i++) - { - packed_d = uvec2(packHalf2x16(res[i].xy), packHalf2x16(res[i].zw)); - GC_STORE1_3D_OFFSET(packed_d, dst, 0, i, 0); - } - - dst.current_offset += dst_stride_z; - } -} - -#elif defined(PROCESS_8X_1Y_1Z) - -/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 8 elements @ X at once - * - * @note This OpenGL ES shader works with stride_x = 1 - * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" - * @note If biases are used then "define HAS_BIAS" has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - * @param[in] weights_depth The third dimensions of the weights tensors - */ - -layout(std140) uniform shader_params -{ - TENSOR3D_PARAM_DECLARATION(src); - TENSOR3D_PARAM_DECLARATION(dst); - TENSOR3D_PARAM_DECLARATION(weights); -#ifdef BIAS - VECTOR_PARAM_DECLARATION(biases); -#endif /* BIAS */ - uint weights_stride_w; - uint weights_depth; -}; - -BUFFER_DECLARATION(src, 1, uvec4, readonly); -BUFFER_DECLARATION(dst, 2, uvec4, writeonly); -BUFFER_DECLARATION(weights, 3, uint, readonly); +#if defined(PROCESS_4X_1Y_1Z) +TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly); +TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly); +TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly); #ifdef BIAS -BUFFER_DECLARATION(bias, 4, uint, readonly); +TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly); #endif /* BIAS */ -#if STRIDE_X == 1 -#define LOAD_SRC(src, row) load_src_stride1(src, row) -#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight) -#elif STRIDE_X == 2 /* STRIDE_X == 1 */ -#error stride == 2 for PROCESS_8X_1Y not implemented -#else /* STRDIDE_X == 1 */ -#error STRIDE_X larger than 2 is not supported -#endif /* STRIDE_X == 1 */ - -vec4[3] load_src_stride1(Image src, int row) -{ - uvec4 packed[2]; - vec4 ret[3]; - - GC_LOAD2_2D_OFFSET(packed, src, 0, row); - - ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y)); - ret[1] = vec4(unpackHalf2x16(packed[0].z), unpackHalf2x16(packed[0].w)); - ret[2] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y)); - - return ret; -} - -vec2[3] load_weight(Tensor3D weights, int row) -{ - uvec3 packed_w; - vec2 ret[3]; - - GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0); - - ret[0] = vec2(unpackHalf2x16(packed_w[0])); - ret[1] = vec2(unpackHalf2x16(packed_w[1])); - ret[2] = vec2(unpackHalf2x16(packed_w[2])); - - return ret; -} - -vec4[2] convolve1x5_stride1(vec4 tmp[3], vec2 w[3]) -{ - vec4 src0 = tmp[0]; - vec4 src1 = vec4(tmp[0].yzw, tmp[1].x); - vec4 src2 = vec4(tmp[0].zw, tmp[1].xy); - vec4 src3 = vec4(tmp[0].w, tmp[1].xyz); - vec4 src4 = tmp[1]; - vec4 ret[2]; - - ret[0] = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x; - - src0 = tmp[1]; - src1 = vec4(tmp[1].yzw, tmp[2].x); - src2 = vec4(tmp[1].zw, tmp[2].xy); - src3 = vec4(tmp[1].w, tmp[2].xyz); - src4 = tmp[2]; - ret[1] = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x; - - return ret; -} - void main() { - Image src = GC_CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst); + ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift); + Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift); + Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); #ifdef BIAS - Vector bias = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias); + VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift); #endif /* BIAS */ - vec4 res[2]; - vec2 w[3]; - vec4 s[STRIDE_X + 2]; - uvec4 packed_d; - uint z_index = gl_GlobalInvocationID.z; + vec4 res = vec4(0); + vec2 w[3]; + vec4 s[STRIDE_X + 1]; - res[0] = vec4(0); - res[1] = vec4(0); - weights.current_offset += z_index * weights_stride_w; + uint z_index = gl_GlobalInvocationID.z; + TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w); for(int d = 0; d < int(weights_depth); ++d) { for(int row = 0; row < 5; row++) { - w = load_weight(weights, row); - s = LOAD_SRC(src, row); - res[0] += CONVOLVE1x5(s, w)[0]; - res[1] += CONVOLVE1x5(s, w)[1]; + w = LOAD_WEIGHT_AT_ROW(row); + s = LOAD_SRC_AT_ROW(row); + res += CONVOLVE1x5(s, w); } - src.current_offset += src_stride_z; - weights.current_offset += weights_stride_z; + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z); } #ifdef BIAS - uint packed_b; + vec2 vec2_b; float b; - GC_LOAD1_1D_OFFSET(packed_b, bias, z_index); - b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y; - res[0] += vec4(b); - res[1] += vec4(b); + vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index)); + b = (z_index % uint(2) == uint(0)) ? vec2_b.x : vec2_b.y; + res += vec4(b); #endif /* BIAS */ - packed_d.xy = uvec2(packHalf2x16(res[0].xy), packHalf2x16(res[0].zw)); - packed_d.zw = uvec2(packHalf2x16(res[1].xy), packHalf2x16(res[1].zw)); - GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0); + STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, res); } -#else /* defined(PROCESS_4X_1Y_1Z) */ - -#endif /* defined(PROCESS_4X_1Y_1Z) */ - -#else /* DATA_TYPE_FP16 */ +#endif /* PROCESS_nX_nY_nZ */ +#else /* DATA_TYPE_FP32 */ #error Data type not supported -#endif /* DATA_TYPE_FP16 */ +#endif /* DATA_TYPE_FP32 */ diff --git a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h index fffc87d90d..dd9e1a3864 100755 --- a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h +++ b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017, 2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -326,6 +326,23 @@ uint tensor3D_offset_in_bytes(Tensor3DIterator tensor_iter, int x, int y, int z) #define VLOAD4_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD4(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter)) #define VSTORE4_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE4(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data) +#define VLOAD5(return_type, tensor_ptr, offset) \ + return_type(LOAD(tensor_ptr, offset), \ + LOAD(tensor_ptr, (offset) + uint(1)), \ + LOAD(tensor_ptr, (offset) + uint(2)), \ + LOAD(tensor_ptr, (offset) + uint(3)), \ + LOAD(tensor_ptr, (offset) + uint(4))) + +#define VSTORE5(tensor_ptr, offset, data) \ + STORE(tensor_ptr, offset, data[0]); \ + STORE(tensor_ptr, (offset) + uint(1), data[1]); \ + STORE(tensor_ptr, (offset) + uint(2), data[2]); \ + STORE(tensor_ptr, (offset) + uint(3), data[3]); \ + STORE(tensor_ptr, (offset) + uint(4), data[4]) + +#define VLOAD5_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD5(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter)) +#define VSTORE5_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE5(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data) + /** Converting the vec4 object to 4 half-precision (16-bits) floating point values and packing into a uvec2 object * * @param[in] data The vec4 object to be packed @@ -348,6 +365,19 @@ mediump vec4 unpack4_half(highp uvec2 packed_data) return vec4(unpackHalf2x16(packed_data.x), unpackHalf2x16(packed_data.y)); } +/** Unpacking the uvec3 object to 6 half-precision (16-bits) floating point values and converting to a vec2[3] object + * + * @param[in] packed_data The uvec3 object to be unpacked + * + * @return The unpacked vec2[3] object + */ +mediump vec2[3] unpack6_half(highp uvec3 packed_data) +{ + return vec2[3](unpackHalf2x16(packed_data[0]), + unpackHalf2x16(packed_data[1]), + unpackHalf2x16(packed_data[2])); +} + /** Converting the vec4[2] object to 8 half-precision (16-bits) floating point values and packing into a uvec4 object * * @param[in] data The vec4[2] object to be packed @@ -396,6 +426,9 @@ mediump vec4[3] unpack12_half(highp uvec2[3] packed_data) #define VLOAD2_UNPACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD2_UNPACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter)) #define VSTORE2_PACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) VSTORE2_PACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data) +#define VLOAD3_UNPACK6_HALF(tensor_ptr, offset) unpack6_half(VLOAD3(uvec3, tensor_ptr, offset)) +#define VLOAD3_UNPACK6_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD3_UNPACK6_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter)) + #define VLOAD4_UNPACK8_HALF(tensor_ptr, offset) unpack8_half(VLOAD4(uvec4, tensor_ptr, offset)) #define VSTORE4_PACK8_HALF(tensor_ptr, offset, data) VSTORE4(tensor_ptr, offset, pack8_half(data)) #define VLOAD4_UNPACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD4_UNPACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter)) -- cgit v1.2.1