From b9d38ee6378f3035f8dbad442223d3d9e2f3dc4f Mon Sep 17 00:00:00 2001 From: Frank Lei Date: Tue, 5 Dec 2017 10:43:33 +0800 Subject: APPBROWSER-312 Fully connected performance optimization Change-Id: Ie93fd630ebbad7b6ca8812cb5044b3f1908b45fd Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111830 Reviewed-by: Stephen Li Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com Reviewed-by: Anthony Barbier --- .../GLES_COMPUTE/cs_shaders/convolution_layer.cs | 67 +++++++++++++++++++--- 1 file changed, 58 insertions(+), 9 deletions(-) (limited to 'src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs') diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs index 1a0c9f1d30..87a109adc0 100644 --- a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs +++ b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs @@ -25,14 +25,6 @@ layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; #include "helpers.h" -#ifdef DATA_TYPE_FP16 -BUFFER_DECLARATION(src, 1, uint, readonly); -BUFFER_DECLARATION(dst, 2, uint, restrict); -#else // DATA_TYPE_FP16 -BUFFER_DECLARATION(src, 1, float, readonly); -BUFFER_DECLARATION(dst, 2, float, restrict); -#endif // DATA_TYPE_FP16 - layout(std140) uniform shader_params { #ifdef IM2COL_GENERIC @@ -58,10 +50,21 @@ layout(std140) uniform shader_params }; #ifdef DATA_TYPE_FP16 +#if defined(IM2COL_REDUCED_8X) +BUFFER_DECLARATION(src, 1, uvec4, readonly); +BUFFER_DECLARATION(dst, 2, uvec4, restrict); +#elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */ +BUFFER_DECLARATION(src, 1, uvec2, readonly); +BUFFER_DECLARATION(dst, 2, uvec2, restrict); +#else /* IM2COL_REDUCED_8X */ +BUFFER_DECLARATION(src, 1, uint, readonly); +BUFFER_DECLARATION(dst, 2, uint, restrict); +#endif /* IM2COL_REDUCED_8X */ precision mediump float; #ifdef IM2COL_REDUCED +#if defined(IM2COL_REDUCED_GENERIC) /** This kernel 
reshapes the tensor's low three dimensions to single row for GEMM operation * * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" @@ -142,9 +145,55 @@ void main(void) } #endif // HAS_BIAS } -#endif // IM2COL_REDUCED +#else /* IM2COL_REDUCED_GENERIC */ +/** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation + * + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" + * @note In case biases will be added in late stage, "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] width The width of the input tensor + * @param[in] height The height of the input tensor + */ +void main(void) +{ + uvec3 pos = uvec3(gl_GlobalInvocationID.xyz); + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); + Vector dst = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(dst); +#if defined(IM2COL_REDUCED_8X) + uint tmp_out_offset = dst.current_offset + ((pos.x * uint(8) + pos.y * width + pos.z * uint(IMAGE_SIZE)) * dst.stride_x); + uvec4 tmp; + LOAD1(tmp, src, src.current_offset >> uint(4)); + STORE1(dst, tmp_out_offset >> uint(4), tmp); +#elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */ + uint tmp_out_offset = dst.current_offset + ((pos.x * uint(4) + pos.y * width + pos.z * uint(IMAGE_SIZE)) * dst.stride_x); + uvec2 tmp; + LOAD1(tmp, src, src.current_offset >> uint(3)); + STORE1(dst, tmp_out_offset >> uint(3), tmp); +#else /* IM2COL_REDUCED_8X */ + uint tmp_out_offset = dst.current_offset + ((pos.x * uint(2) + pos.y * width + pos.z * uint(IMAGE_SIZE)) * dst.stride_x); + uint tmp; + LOAD1(tmp, src, src.current_offset >> uint(2)); + STORE1(dst, tmp_out_offset >> uint(2), tmp); +#endif /* IM2COL_REDUCED_8X */ +} +#endif /* IM2COL_REDUCED_GENERIC */ +#endif // IM2COL_REDUCED #elif defined(DATA_TYPE_FP32) +BUFFER_DECLARATION(src, 1, float, readonly); +BUFFER_DECLARATION(dst, 2, float, restrict); #ifdef IM2COL_GENERIC /** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM. -- cgit v1.2.1