From f1f3ebd517089e934cf3f06e64d90619a395ad87 Mon Sep 17 00:00:00 2001 From: Joel Liang Date: Fri, 10 Nov 2017 09:59:19 +0800 Subject: APPBROWSER-298, APPBROWSER-306: Reimplement the common code of compute shader The new common code of compute shader is in file helpers_cs.h Rewrite the direct_convolution1x1.cs and softmax_layer.cs to use the new common code. It will also remove the dependence of the token pasting operator (##). We'll remove the "##" support after we rewrite all of the compute shader code. Change-Id: Icd8553ef6b61ad484a8507590ac8ed499bd47061 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/95455 Tested-by: Kaizen Reviewed-by: Georgios Pinitas Reviewed-by: Frank Lei (cherry picked from commit 0a4f83570d261f839d9866b68979efe8d7a95883) Reviewed-on: http://mpd-gerrit.cambridge.arm.com/95601 Reviewed-by: Jim He --- .../cs_shaders/direct_convolution1x1.cs | 220 +++++++-------------- 1 file changed, 72 insertions(+), 148 deletions(-) (limited to 'src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs') diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs index 3a31cb80a7..071c1858bc 100644 --- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs +++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs @@ -24,107 +24,88 @@ layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; -#include "helpers.h" +#include "helpers_cs.h" -layout(std140) uniform shader_params +#if defined(DATA_TYPE_FP16) +precision mediump float; +#endif // DATA_TYPE_FP16 + +/** This kernel performs a direct convolution to convolve the low three dimensions. + * + * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32" + * @note The convolution stride x must be passed at compile time using "#define STRIDE_X n" e.g. 
"#define STRIDE_X 1" + * @note In case biases will be added to the convolution "#define BIAS" has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] src_attrs The attributes of the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_attrs The attributes of the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr + * @param[in] weights_attrs The attributes of the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr + * @param[in] biases_attrs The attributes of the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + * @param[in] weights_depth The third dimension of the weights tensor + */ +SHADER_PARAMS_DECLARATION { - TENSOR3D_PARAM_DECLARATION(src); - TENSOR3D_PARAM_DECLARATION(dst); - TENSOR3D_PARAM_DECLARATION(weights); + Tensor3DAttributes src_attrs; + Tensor3DAttributes dst_attrs; + Tensor3DAttributes weights_attrs; #ifdef BIAS - VECTOR_PARAM_DECLARATION(biases); + VectorAttributes biases_attrs; #endif /* BIAS */ uint weights_stride_w; uint weights_depth; }; #if defined(DATA_TYPE_FP32) -precision highp float; - -BUFFER_DECLARATION(src, 1, float, readonly); -BUFFER_DECLARATION(dst, 2, float, writeonly); -BUFFER_DECLARATION(weights, 3, float, readonly); +TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly); +TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly); +TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly); #ifdef BIAS -BUFFER_DECLARATION(biases, 4, float, readonly); +TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly); #endif /* BIAS */ -/** This kernel performs a direct convolution to convolve the low three 
dimensions. - * - * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" - * @note The convolution stride x must be passed at compile time using "#define STRIDE_X" e.g. "#define STRIDE_X 1" - * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row. - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] weights_ptr Pointer to the weights tensor. 
Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - * @param[in] weights_depth The third dimensions of the weights tensors - */ void main() { - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift); + Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift); + Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); #ifdef BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); + VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift); #endif /* BIAS */ - float pixels = CONVERT(0, float); + float pixels = 0.f; uint z_index = gl_GlobalInvocationID.z; - 
weights.current_offset += z_index * weights_stride_w >> 2; + TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w); + float temp; float temp_weight; - for(int d = 0; d < int(weights_depth); ++d) { - temp = LOAD4(src, CURRENT_OFFSET(src)); - temp_weight = LOAD4(weights, CURRENT_OFFSET(weights)); + temp = LOAD_CURRENT_ITEM(src_ptr, src_iter); + temp_weight = LOAD_CURRENT_ITEM(weights_ptr, weights_iter); pixels += temp * temp_weight; - src.current_offset += (src_stride_z >> 2); - weights.current_offset += (weights_stride_z >> 2); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z); } #ifdef BIAS - pixels += LOAD4(biases, vector_offset(biases, int(z_index))); + pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index)); #endif /* BIAS */ - STORE4(dst, CURRENT_OFFSET(dst), pixels); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels); } #elif defined(DATA_TYPE_FP16) -precision mediump float; -BUFFER_DECLARATION(src, 1, uvec4, readonly); -BUFFER_DECLARATION(dst, 2, uvec4, writeonly); -BUFFER_DECLARATION(weights, 3, uint, readonly); +TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly); +TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly); +TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly); #ifdef BIAS -BUFFER_DECLARATION(biases, 4, uint, readonly); +TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly); #endif /* BIAS */ #if STRIDE_X == 2 @@ -135,15 +116,10 @@ BUFFER_DECLARATION(biases, 4, uint, readonly); #error STRIDE_X larger than 2 is not supported #endif /* STRIDE_X == 2 */ -vec4[2] convolve_stride1(Image src, float w) +vec4[2] convolve_stride1(ImageIterator src_iter, float w) { - uvec4 packed_s; - vec4 s[2]; - - GC_LOAD1_2D_OFFSET(packed_s, src, 0, 0); - - s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y)); - s[1] = 
vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w)); + vec4 s[2]; + s = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter); s[0] *= w; s[1] *= w; @@ -151,22 +127,14 @@ vec4[2] convolve_stride1(Image src, float w) return s; } -vec4[2] convolve_stride2(Image src, float w) +vec4[2] convolve_stride2(ImageIterator src_iter, float w) { - uvec4 packed_s; - vec4 s[2]; - vec4 r[2]; - - GC_LOAD1_2D_OFFSET(packed_s, src, 0, 0); - s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y)); - s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w)); + vec4 s[2]; + vec4 r[2]; + s = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter); r[0] = vec4(s[0].xz, s[1].xz); - - GC_LOAD1_2D_OFFSET(packed_s, src, 8, 0); - s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y)); - s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w)); - + s = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 8, 0)); r[1] = vec4(s[0].xz, s[1].xz); r[0] *= w; @@ -175,51 +143,14 @@ vec4[2] convolve_stride2(Image src, float w) return r; } -/** This kernel performs a direct convolution to convolve the low three dimensions. - * - * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" - * @note The convolution stride x must be passed at compile time using "#define STRIDE_X" e.g. "#define STRIDE_X 1" - * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] weights_ptr Pointer to the weights tensor. 
Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - * @param[in] weights_depth The third dimensions of the weights tensors - */ void main() { - Image src = GC_CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst); + ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift); + Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift); + Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); #ifdef BIAS - Vector biases = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); + VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift); #endif /* BIAS */ vec4 pixels[2]; @@ -227,48 +158,41 @@ void main() pixels[1] = vec4(0.f); uint z_index = 
gl_GlobalInvocationID.z; + TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w); - weights.current_offset += z_index * weights_stride_w; - - uint packed_w; float w; - for(int d = 0; d < int(weights_depth); ++d) { - GC_LOAD1_3D_OFFSET(packed_w, weights, 0, 0, 0); - w = unpackHalf2x16(packed_w).x; + w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x; - vec4 r[2] = CONVOLVE(src, w); + vec4 r[2] = CONVOLVE(src_iter, w); pixels[0] += r[0]; pixels[1] += r[1]; - src.current_offset += src_stride_z; - weights.current_offset += weights_stride_z; + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z); } #ifdef BIAS - uint packed_b; + vec2 vec2_b; float b; - GC_LOAD1_1D_OFFSET(packed_b, biases, z_index); + vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index)); if(z_index % uint(2) == uint(0)) { - b = unpackHalf2x16(packed_b).x; + b = vec2_b.x; } else { - b = unpackHalf2x16(packed_b).y; + b = vec2_b.y; } - pixels[0] += vec4(b); - pixels[1] += vec4(b); + pixels[0] += b; + pixels[1] += b; #endif /* BIAS */ - uvec4 packed_d; - packed_d = uvec4(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw), - packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw)); - GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0); + STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels); } #else /* DATA_TYPE_FP32 */ #error Data type not supported -- cgit v1.2.1