From e855c237a5b61c4ed5a5ab79dd4af27385cf72f5 Mon Sep 17 00:00:00 2001 From: Stephen Li Date: Thu, 4 Jan 2018 14:13:22 +0800 Subject: APPBROWSER-377: GCConvoutionLayer support for FP16 Change-Id: I801b5e393a16a9f92c062826e6fcfd5982ca7bb3 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/116584 Tested-by: Jenkins Reviewed-by: Anthony Barbier --- .../GLES_COMPUTE/cs_shaders/convolution_layer.cs | 518 +++++++++++++++++++-- src/core/GLES_COMPUTE/cs_shaders/gemm.cs | 192 +++++++- src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h | 2 +- 3 files changed, 663 insertions(+), 49 deletions(-) mode change 100755 => 100644 src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs mode change 100755 => 100644 src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h (limited to 'src/core/GLES_COMPUTE/cs_shaders') diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs old mode 100755 new mode 100644 index 4bfac282e2..2648db08b3 --- a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs +++ b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -30,32 +30,163 @@ layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = precision mediump float; #endif // DATA_TYPE_FP16 +#ifdef RESHAPE_TO_COLUMNS + +/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM. + * + * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32" + * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] src_attrs The attributes of the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_attrs The attributes of the destination tensor + * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr + * @param[in] biases_attrs The attributes of the biases tensor + * @param[in] width The width of the input tensor + * @param[in] height The height of the input tensor + * @param[in] depth The depth of the input tensor + * @param[in] total_filters Total number of filters. 
4th dimension of the weights matrix + */ + +SHADER_PARAMS_DECLARATION +{ + Tensor3DAttributes src_attrs; + ImageAttributes dst_attrs; +#ifdef HAS_BIAS + VectorAttributes biases_attrs; +#endif /* HAS_BIAS */ + uint width; + uint height; + uint depth; + uint total_filters; +}; + +#if defined(DATA_TYPE_FP16) + +TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly); +TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly); +#ifdef HAS_BIAS +TENSOR_DECLARATION(3, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly); +#endif /* BIAS */ + +void main() +{ + Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); + ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift); +#ifdef HAS_BIAS + VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift); +#endif /* BIAS */ + + bool is_last_thread = (((int(gl_GlobalInvocationID.x)) == (int(gl_NumWorkGroups.x * gl_WorkGroupSize.x) - 1)) && ((int(gl_GlobalInvocationID.y)) == (int(gl_NumWorkGroups.y * gl_WorkGroupSize.y) - 1)) + && ((int(gl_GlobalInvocationID.z)) == (int(gl_NumWorkGroups.z * gl_WorkGroupSize.z) - 1))); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, ((uint(gl_GlobalInvocationID.x) * uint(dst_attrs.stride_y)) + (uint(gl_GlobalInvocationID.y) * uint(width) * uint(dst_attrs.stride_y)) + (uint( + gl_GlobalInvocationID.z) + * uint(width) * uint(height) * uint(dst_attrs.stride_y)))); + // Linearize convolution elements + if(is_last_thread) + { + for(uint i = 0u; i < uint(total_filters); i = i + 2u) + { + vec2 s0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter); + vec2 s; + if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0) + { + s.x = s0.x; + } + else + { + s.x = s0.y; + } + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z)); + + vec2 s1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter); + if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0) + { + s.y = s1.x; + } + else + { + s.y = s1.y; + } + STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z)); +#ifdef HAS_BIAS + vec2 b = LOAD_UNPACK2_CURRENT_ITEM_HALF(biases_ptr, biases_iter); + STORE_PACK2_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_y), b); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(biases_iter, (2u * biases_attrs.stride_x)); +#endif /* HAS_BIAS */ + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (2u * dst_attrs.stride_x)); + } + } + else + { + for(uint i = 0u; i < uint(total_filters); i = i + 2u) + { + vec2 s0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter); + vec2 s; + if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0) + { + s.x = s0.x; + } + else + { + s.x = s0.y; + } + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z)); + + vec2 s1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter); + if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0) + { + s.y = s1.x; + } + else + { + s.y = s1.y; + } + STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z)); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (2u * dst_attrs.stride_x)); + } + } +} + +#endif /* DATA_TYPE_FP16 */ +#endif // RESHAPE_TO_COLUMNS + #ifdef IM2COL_GENERIC + /** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM. 
* * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" + * @note PAD_LEFT/PAD_RIGHT/PAD_TOP/PAD_BOTTOM must be passed for padding info, e.g. "#define PAD_LEFT xxx" + * @note KERNEL_WIDTH/KERNEL_HEIGHT/KERNEL_DEPTH must be passed for kernel dimension, e.g. "#define KERNEL_WIDTH xxx" + * @note STRIDE_X/STRIDE_Y must be passed for stride info, e.g. "#define STRIDE_X xxx" + * @note CONVOLVED_WIDTH/CONVOLVED_HEIGHT must be passed for convolved dimension, e.g. "#define CONVOLVED_WIDTH xxx" + * @note SRC_WIDTH/SRC_HEIGHT must be passed for input dimension, e.g. "#define SRC_WIDTH xxx" * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row. * * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 * @param[in] src_attrs The attributes of the source tensor * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr * @param[in] dst_attrs The attributes of the destination tensor - * @param[in] filter_depth The depth of the used filter * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). */ + SHADER_PARAMS_DECLARATION { Tensor3DAttributes src_attrs; ImageAttributes dst_attrs; - uint filter_depth; uint src_stride_w; uint dst_stride_w; }; #ifdef DATA_TYPE_FP32 + TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly); TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict); + void main(void) { Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); @@ -63,64 +194,315 @@ void main(void) uint xc = gl_GlobalInvocationID.x; // x coordinate in the convolved tensor uint yc = gl_GlobalInvocationID.y; // y coordinate in the convolved tensor - uint ch = gl_GlobalInvocationID.z % filter_depth; // input feature map - uint batch = gl_GlobalInvocationID.z / filter_depth; // the batch + uint ch = gl_GlobalInvocationID.z % KERNEL_DEPTH; // input feature map + uint batch = gl_GlobalInvocationID.z / KERNEL_DEPTH; // the batch // Calculate input indeces - uint xi = xc * uint(STRIDE_X) - uint(PAD_X); - uint yi = yc * uint(STRIDE_Y) - uint(PAD_Y); - uint input_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (ch * src_attrs.stride_z) + (batch * src_stride_w)); + uint xi = xc * uint(STRIDE_X) - uint(PAD_LEFT); + uint yi = yc * uint(STRIDE_Y) - uint(PAD_TOP); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (ch * src_attrs.stride_z) + (batch * src_stride_w)); // Calculate output indeces - uint xo = ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT); - uint yo = xc + yc * uint(CONVOLVED_WIDTH); // Index of the convolution - uint output_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, (yo * dst_attrs.stride_y) + (batch * dst_stride_w) + xo); + uint xo = ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT); + uint yo = xc + yc * uint(CONVOLVED_WIDTH); // Index of the convolution + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (yo * dst_attrs.stride_y) + (batch * dst_stride_w) + xo); + + uint src_pos = 0u; // Linearize convolution elements for(uint y = yi, y_e = yi + uint(KERNEL_HEIGHT); y < y_e; ++y) { - for(uint x = xi, x_e = xi + uint(KERNEL_WIDTH); x < x_e; ++x) + for(uint x = xi, x_e = xi + uint(KERNEL_WIDTH); x < x_e; ++x, TENSOR_OFFSET_ADVANCE(dst_iter, 1u)) { -#if PAD_X == 0 && PAD_Y == 0 - output_offset = input_offset + ((x * src_attrs.stride_x + y * src_attrs.stride_y) >> uint(2)); - 
STORE(dst_ptr, output_offset, LOAD(src_ptr, input_offset)); - -#else // PAD_X == 0 && PAD_Y == 0 +#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 + src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.stride_x + y * src_attrs.stride_y); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, src_pos)); +#else /* PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 */ if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT) { - STORE(dst_ptr, output_offset, 0.0f); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, 0.0f); } else { - output_offset = input_offset + (x * srcs_attrs.stride_x + y * src_attrs.stride_y) >> uint(2)); - STORE(dst_ptr, output_offset, LOAD(src_ptr, input_offset)); + src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.stride_x + y * src_attrs.stride_y); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, src_pos)); } -#endif // PAD_X == 0 && PAD_Y == 0 +#endif /* PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 */ } } #ifdef HAS_BIAS if(ch == (uint(KERNEL_DEPTH) - 1)) { - STORE(dst_ptr, output_offset, 1.0f); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, 1.0f); } -#endif // HAS_BIAS +#endif /* HAS_BIAS */ } #elif defined(DATA_TYPE_FP16) + TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly); TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly); +#ifdef KERNEL_1x1 + +void main(void) +{ + Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); + ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift); + + uint xc = gl_GlobalInvocationID.x; + uint yc = gl_GlobalInvocationID.y; + uint zc = gl_GlobalInvocationID.z; + uint ch = zc % uint(KERNEL_DEPTH); // input feature map + uint batch = zc / uint(KERNEL_DEPTH); // the batch + + // Calculate input indeces + uint xi = xc; + uint yi = yc; + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, batch * src_stride_w + ch * src_attrs.step_z); + + // Calculate output indeces + uint dst_element_count = dst_attrs.step_x / dst_attrs.stride_x; + uint xo = ch * dst_element_count; + uint yo = xc + yc * uint(CONVOLVED_WIDTH); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, batch * dst_stride_w + yo * dst_attrs.stride_y + xo); + + bool x_start_even = ((xc % 2u) == 0u); + bool z_depth_even = ((uint(KERNEL_DEPTH) % 2u) == 0u); + uint input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.stride_x + yi * src_attrs.stride_y); + uint tmp_left = 0u; + uint tmp_right = 0u; + + if(ch % 2u != 0u) + { + return; + } + + if(z_depth_even || (!z_depth_even && (int(ch) < (KERNEL_DEPTH - 1)))) + { + tmp_left = LOAD(src_ptr, input_pos); + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.stride_x + yi * src_attrs.stride_y + src_attrs.stride_z); + tmp_right = LOAD(src_ptr, input_pos); + if(x_start_even) + { + tmp_right = (tmp_left & 0xffffu) + (tmp_right << 16u); + } + else + { + tmp_right = (tmp_left >> 16u) + (tmp_right & 0xffff0000u); + } + STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x); + +#ifdef HAS_BIAS + if(ch == (uint(KERNEL_DEPTH) - 2u)) + { + mediump vec2 bias_vec = vec2(1.f, 0.f); + uint bias_u = packHalf2x16(bias_vec); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, bias_u); + } +#endif /* HAS_BIAS */ + } + else + { + tmp_left = LOAD(src_ptr, input_pos); + if(x_start_even) + { + tmp_right = (tmp_left & 0xffffu); + } + else + { + tmp_right = (tmp_left >> 16u); + } + +#ifdef HAS_BIAS + mediump vec2 bias_vec = vec2(0.f, 
1.f); + uint bias_u = packHalf2x16(bias_vec); + tmp_right += (bias_u & 0xffff0000u); +#endif /* HAS_BIAS */ + + STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); + } +} + +#else /* KERNEL_1x1 */ + void main(void) { + uint xc = gl_GlobalInvocationID.x; + uint yc = gl_GlobalInvocationID.y; + uint zc = gl_GlobalInvocationID.z; + uint ch = zc % uint(KERNEL_DEPTH); // input feature map + uint batch = zc / uint(KERNEL_DEPTH); // the batch + + Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); + Tensor3DIterator src_iter_b = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); + ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift); + + // Calculate input indeces + uint src_element_count = src_attrs.step_x / src_attrs.stride_x; + uint xi = (xc * uint(STRIDE_X)) / src_element_count; + uint yi = yc * uint(STRIDE_Y); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, batch * src_stride_w + ch * src_attrs.stride_z); + + // Calculate output indeces + uint dst_element_count = dst_attrs.step_x / dst_attrs.stride_x; + uint xo = (ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT)) * dst_element_count; + uint yo = xc + yc * uint(CONVOLVED_WIDTH); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, batch * dst_stride_w + yo * dst_attrs.stride_y + xo); + + bool x_start_even = ((xc * uint(STRIDE_X)) % 2u == 0u); + bool z_start_even = ((ch % 2u) == 0u); + uint input_pos = 0u; + uint tmp = 0u; + uint tmp_left = 0u; + uint tmp_right = 0u; + + // Linearize convolution elements + for(uint y = yi, y_e = yi + uint(KERNEL_HEIGHT); y < y_e; ++y) + { + uint xstart = 0u; + uint xend = 0u; + + // even col, even row + if(x_start_even) + { + if(((y - yi + ch) % 2u) == 0u) + { + for(uint x = xi, x_e = xi + (uint(KERNEL_WIDTH) / 2u); x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x)) + { + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, input_pos)); + } + } + else + { + // 1st pair + if(!z_start_even && (y == yi)) + { + // cross 2d feature map + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter_b, (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (yi + uint(KERNEL_HEIGHT) - 1u) * src_attrs.stride_y + batch * src_stride_w + + (ch - 1u) * src_attrs.stride_z); + } + else + { + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, + (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (y - 1u) * src_attrs.stride_y); + } + tmp_right = LOAD(src_ptr, input_pos); + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y); + tmp_left = LOAD(src_ptr, input_pos); + tmp_right = (tmp_right & 0xffffu) + (tmp_left << 16u); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x); + + // remaining + for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u) + 1u; x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x)) + { + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (x - 1u) * src_attrs.step_x + y * src_attrs.stride_y); + tmp_left = LOAD(src_ptr, input_pos); + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); + tmp_right = LOAD(src_ptr, input_pos); + tmp_right = (tmp_left >> 16u) + (tmp_right << 16u); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); + } + } + } + else + { + if((((y - yi) % 2u) == 0u && !z_start_even) || (((y - yi) % 2u) != 0u && z_start_even)) + { + // 
1st pair + if(y == yi) + { + // cross 2d feature map + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter_b, (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (yi + uint(KERNEL_HEIGHT) - 1u) * src_attrs.stride_y + batch * src_stride_w + + (ch - 1u) * src_attrs.stride_z); + } + else + { + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, + (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (y - 1u) * src_attrs.stride_y); + } + + tmp_right = LOAD(src_ptr, input_pos); + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y); + tmp_left = LOAD(src_ptr, input_pos); + tmp_right = (tmp_right >> 16u) + (tmp_left & 0xffff0000u); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x); + + // remaining + for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u) + 1u; x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x)) + { + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, input_pos)); + } + } + else if((((y - yi) % 2u) == 0u && z_start_even) || (((y - yi) % 2u) != 0u && !z_start_even)) + { + // 1st pair + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y); + tmp_right = LOAD(src_ptr, input_pos); + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (xi + 1u) * src_attrs.step_x + y * src_attrs.stride_y); + tmp_left = LOAD(src_ptr, input_pos); + tmp_right = (tmp_right >> 16u) + (tmp_left << 16u); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x); + + // remaining + for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u); x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x)) + { + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); + tmp_right = LOAD(src_ptr, input_pos); + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (x + 1u) * src_attrs.step_x + y * src_attrs.stride_y); + tmp_left = LOAD(src_ptr, input_pos); + tmp_right = (tmp_right >> 16u) + (tmp_left << 16u); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); + } + } + } + } + + // NOTE: must handle last element manually instead of in loops + // to avoid write conflict across 2d boundary + if(ch == uint(KERNEL_DEPTH) - 1u) + { + uint x = xi + (uint(KERNEL_WIDTH) / 2u); + uint y = yi + uint(KERNEL_HEIGHT) - 1u; + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); + tmp = LOAD(src_ptr, input_pos); + if(!x_start_even) + { + tmp = (tmp >> 16u) + (tmp << 16u); + } + +#ifdef HAS_BIAS + mediump vec2 bias_vec = vec2(1.f, 1.f); + uint bias_u = packHalf2x16(bias_vec); + if(z_start_even) + { + tmp = (tmp & 0xffffu) + (bias_u & 0xffff0000u); + } + else + { + tmp = (bias_u & 0xffffu); + } +#endif /* HAS_BIAS */ + + STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp); + } } -#else /* DATA_TYPE_FP32 */ +#endif /* KERNEL_1x1 */ +#else /* DATA_TYPE_FP32 */ #error Data type not supported #endif /* DATA_TYPE_FP32 */ #endif /* IM2COL_GENERIC */ #ifdef IM2COL_REDUCED + /** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation * * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" @@ -133,6 +515,7 @@ void main(void) * @param[in] width The width of the input tensor * @param[in] height The height of the input tensor */ + 
SHADER_PARAMS_DECLARATION { Tensor3DAttributes src_attrs; @@ -142,6 +525,7 @@ SHADER_PARAMS_DECLARATION }; #ifdef DATA_TYPE_FP32 + TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly); TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict); @@ -181,6 +565,7 @@ TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, restrict); #endif /* IM2COL_REDUCED_8X */ #if defined(IM2COL_REDUCED_GENERIC) + void main(void) { Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); @@ -207,20 +592,20 @@ void main(void) else { // special op - uint tmpleft = uint(0); - uint tmpright = uint(0); - tmpright = LOAD_CURRENT_ITEM(src_ptr, src_iter); //right half + uint tmp_left = uint(0); + uint tmp_right = uint(0); + tmp_right = LOAD_CURRENT_ITEM(src_ptr, src_iter); //right half if(pos.x == uint(0)) { - tmpleft = LOAD(src_ptr, TENSOR3D_OFFSET(src_nostep_iter, int(width), int(pos.y) - 1, int(pos.z))); //left half - tmpright = (tmpleft & uint(0xffff)) + (tmpright << uint(16)); + tmp_left = LOAD(src_ptr, TENSOR3D_OFFSET(src_nostep_iter, int(width), int(pos.y) - 1, int(pos.z))); //left half + tmp_right = (tmp_left & uint(0xffff)) + (tmp_right << uint(16)); } else { - tmpleft = LOAD(src_ptr, TENSOR3D_OFFSET(src_nostep_iter, (int(pos.x) - 1) * int(element_count), int(pos.y), int(pos.z))); - tmpright = ((tmpleft >> uint(16)) + (tmpright << uint(16))); + tmp_left = LOAD(src_ptr, TENSOR3D_OFFSET(src_nostep_iter, (int(pos.x) - 1) * int(element_count), int(pos.y), int(pos.z))); + tmp_right = ((tmp_left >> uint(16)) + (tmp_right << uint(16))); } - STORE(dst_ptr, tmp_out_offset, tmpright); + STORE(dst_ptr, tmp_out_offset, tmp_right); } } else @@ -243,6 +628,7 @@ void main(void) } #else /* IM2COL_REDUCED_GENERIC */ + void main(void) { Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); @@ -263,46 +649,86 @@ void main(void) STORE(dst_ptr, tmp_out_offset, tmp); #endif /* IM2COL_REDUCED_8X */ } -#endif /* IM2COL_REDUCED_GENERIC */ -#else /* DATA_TYPE_FP32 */ + +#endif /* IM2COL_REDUCED_GENERIC */ +#else /* DATA_TYPE_FP32 */ #error Data type not supported #endif /* DATA_TYPE_FP32 */ #endif /* IM2COL_REDUCED */ -#ifdef COL2IM +#ifdef WIDTH_OUTPUT + /** This kernel performs a reshaping of the output of the convolution layer. * * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] src_attrs The attributes of the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_attrs The attributes of the destination tensor - * @param[in] width The width of output convolved dimensions + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] src_attrs The attributes of the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_attrs The attributes of the destination tensor + * @param[in] dst_depth The length of the destination tensor in Z dimension + * @param[in] dst_strideZ The actual stride of the destination tensor in Z dimension */ + SHADER_PARAMS_DECLARATION { - ImageAttributes src_attrs; + Tensor3DAttributes src_attrs; Tensor3DAttributes dst_attrs; - uint width; + uint dst_depth; + uint dst_strideZ; }; #ifdef DATA_TYPE_FP32 + TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly); TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict); + void main(void) { - ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift); + Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); - uvec2 pos = uvec2(gl_GlobalInvocationID.xy); - uint tmp_out_offset = TENSOR3D_OFFSET(dst_iter, pos.y % width, pos.y / width, pos.x); + uvec3 pos = uvec3(gl_GlobalInvocationID.xyz); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, pos.x * src_attrs.step_y + pos.y * WIDTH_OUTPUT * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * (src_attrs.stride_z)); - STORE(dst_ptr, tmp_out_offset, LOAD_CURRENT_ITEM(src_ptr, src_iter)); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, + LOAD_CURRENT_ITEM(src_ptr, src_iter)); } #elif defined(DATA_TYPE_FP16) +TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly); +TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, restrict); + +void main(void) +{ + Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); + Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); + + uvec3 pos = uvec3(gl_GlobalInvocationID.xyz); + + if((pos.z % dst_depth) % 2u == 0u) + { + uint common_offset_in_bytes = pos.x * src_attrs.step_y * 2u + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ; + uint tmp1_in_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes); + uint tmp2_in_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes + src_attrs.step_y); + vec2 tmp1 = LOAD_UNPACK2_HALF(src_ptr, tmp1_in_offset); + vec2 tmp2 = LOAD_UNPACK2_HALF(src_ptr, tmp2_in_offset); + vec2 result = vec2(tmp1.x, tmp2.x); + STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result); + } + else + { + uint common_offset_in_bytes = pos.x * src_attrs.step_y * 2u + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ - 2u; + uint tmp1_in_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes); + uint tmp2_in_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes + src_attrs.step_y); + vec2 tmp1 = LOAD_UNPACK2_HALF(src_ptr, tmp1_in_offset); + vec2 tmp2 = LOAD_UNPACK2_HALF(src_ptr, tmp2_in_offset); + vec2 result = vec2(tmp1.y, tmp2.y); + STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result); + } +} + #else /* DATA_TYPE_FP32 */ #error Data type not supported #endif /* DATA_TYPE_FP32 */ diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs index 4beb3ad5d2..c81bed7066 100644 --- a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs +++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -383,6 +383,81 @@ void main(void) #elif defined(DATA_TYPE_FP16) +#ifdef GEMM_TRANSPOSE1xW +/** This OpenGL ES kernel computes the "vector" 1x8 transposition of input matrix + * + * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16 + * @param[in] src_attrs The attributes of the source matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr + * @param[in] dst_attrs The attributes of the destination matrix + */ +SHADER_PARAMS_DECLARATION +{ + ImageAttributes src_attrs; + ImageAttributes dst_attrs; +}; +TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly); +TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly); + +void main(void) +{ + /* Compute address for Matrix B - source */ + ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift); + ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift); + + /* Compute address for Matrix B transposed - destination. X and Y are swapped */ + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, gl_GlobalInvocationID.y * uint(16) + gl_GlobalInvocationID.x * dst_attrs.stride_y); + + STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD_CURRENT_ITEM(src_ptr, src_iter)); +} +#endif /* GEMM_TRANSPOSE1xW */ + +#ifdef GEMM_INTERLEAVE4x4 +/** This OpenGLES kernel reshapes the input matrix interleaving the values + * + * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16 + * @param[in] src_attrs The attributes of the source matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr + * @param[in] dst_attrs The attributes of the destination matrix + */ +SHADER_PARAMS_DECLARATION +{ + ImageAttributes src_attrs; + ImageAttributes dst_attrs; +}; +TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly); +TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly); + +void main(void) +{ + /* Compute source and destination addresses */ + ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift); + ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift); + + vec4 s0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter); + vec4 s1[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 1)); + vec4 s2[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 2)); + vec4 s3[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 3)); + + vec4 s[2]; + s[0] = vec4(s0[0].x, s1[0].x, s2[0].x, s3[0].x); + s[1] = vec4(s0[0].y, s1[0].y, s2[0].y, s3[0].y); + STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s); + + s[0] = vec4(s0[0].z, s1[0].z, s2[0].z, s3[0].z); + s[1] = vec4(s0[0].w, s1[0].w, s2[0].w, s3[0].w); + STORE_PACK8_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE(dst_iter, 1u), s); + + s[0] = vec4(s0[1].x, s1[1].x, s2[1].x, s3[1].x); + s[1] = vec4(s0[1].y, s1[1].y, s2[1].y, s3[1].y); + STORE_PACK8_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE(dst_iter, 2u), s); + + s[0] = vec4(s0[1].z, s1[1].z, s2[1].z, s3[1].z); + s[1] = vec4(s0[1].w, s1[1].w, s2[1].w, s3[1].w); + STORE_PACK8_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE(dst_iter, 3u), s); +} +#endif /* GEMM_INTERLEAVE4x4 */ + #ifdef GEMM_MM_FLOATING_POINT /** This OpenGL ES kernel computes the matrix multiplication between matrix A(src0) and matrix B(src1) * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x4 before running the matrix multiplication @@ -757,6 +832,119 @@ void main(void) 
} #endif /* ACCUM_PROCESS_8X */ #endif /* GEMM_ACCUMULATE_BIASES */ -#else /* DATA_TYPE_FP16 */ + +#ifdef GEMM_MM_INTERLEAVED_TRANSPOSED +/** This OpenGL ES kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1) + * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication + * + * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16 + * @param[in] src0_attrs The attributes of the source matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr + * @param[in] src1_attrs The attributes of the source matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr + * @param[in] dst_attrs The attributes of the destination matrix + */ +SHADER_PARAMS_DECLARATION +{ + ImageAttributes src0_attrs; + ImageAttributes src1_attrs; + ImageAttributes dst_attrs; +}; +TENSOR_DECLARATION(1, src0Buffer, uvec2, src0_ptr, src0_shift, 3, readonly); +TENSOR_DECLARATION(2, src1Buffer, uvec4, src1_ptr, src1_shift, 4, readonly); +TENSOR_DECLARATION(3, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly); + +void main() +{ + ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift); + ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift); + ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift); + + /* Compute address for matrix A and B */ + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * (src0_attrs.stride_y)); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(gl_GlobalInvocationID.x) * (src1_attrs.stride_y)); + /* Compute end row address for matrix B */ + int end_row_mtx_b = (int(CURRENT_ITEM_OFFSET_IN_BYTES(src1_iter)) >> 1) + int(COLS_B); + + /* Reset accumulators */ + vec4 c00[2]; + vec4 c10[2]; + vec4 c20[2]; + vec4 c30[2]; + c00[0] = vec4(0.0f); + c00[1] = vec4(0.0f); + c10[0] = vec4(0.0f); + c10[1] = vec4(0.0f); + c20[0] = vec4(0.0f); + c20[1] = vec4(0.0f); + c30[0] = vec4(0.0f); + c30[1] = vec4(0.0f); + + // FIXME: loop unrolling really needed for GLES? 
+ for(; (int(CURRENT_ITEM_OFFSET_IN_BYTES(src1_iter)) >> 1) <= (end_row_mtx_b - 16); TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 16), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, 32)) + { + /* Load values from matrix A (interleaved) and matrix B (transposed) */ + vec4 a0 = LOAD_UNPACK4_CURRENT_ITEM_HALF(src0_ptr, src0_iter); + vec4 b0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src1_ptr, src1_iter); + + c00[0] += vec4(a0.x) * b0[0]; + c00[1] += vec4(a0.x) * b0[1]; + c10[0] += vec4(a0.y) * b0[0]; + c10[1] += vec4(a0.y) * b0[1]; + c20[0] += vec4(a0.z) * b0[0]; + c20[1] += vec4(a0.z) * b0[1]; + c30[0] += vec4(a0.w) * b0[0]; + c30[1] += vec4(a0.w) * b0[1]; + + /* Load values from matrix A (interleaved) and matrix B (transposed) */ + a0 = LOAD_UNPACK4_HALF(src0_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src0_iter, 8)); + b0 = LOAD_UNPACK8_HALF(src1_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src1_iter, 16)); + + c00[0] += vec4(a0.x) * b0[0]; + c00[1] += vec4(a0.x) * b0[1]; + c10[0] += vec4(a0.y) * b0[0]; + c10[1] += vec4(a0.y) * b0[1]; + c20[0] += vec4(a0.z) * b0[0]; + c20[1] += vec4(a0.z) * b0[1]; + c30[0] += vec4(a0.w) * b0[0]; + c30[1] += vec4(a0.w) * b0[1]; + } + + for(; (int(CURRENT_ITEM_OFFSET_IN_BYTES(src1_iter)) >> 1) < end_row_mtx_b; TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 8), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, 16)) + { + /* Load values from matrix A (interleaved) and matrix B (transposed) */ + vec4 a0 = LOAD_UNPACK4_CURRENT_ITEM_HALF(src0_ptr, src0_iter); + vec4 b0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src1_ptr, src1_iter); + + c00[0] += vec4(a0.x) * b0[0]; + c00[1] += vec4(a0.x) * b0[1]; + c10[0] += vec4(a0.y) * b0[0]; + c10[1] += vec4(a0.y) * b0[1]; + c20[0] += vec4(a0.z) * b0[0]; + c20[1] += vec4(a0.z) * b0[1]; + c30[0] += vec4(a0.w) * b0[0]; + c30[1] += vec4(a0.w) * b0[1]; + } + + /* Multiply by the weight of matrix product */ + c00[0] = c00[0] * vec4(ALPHA); + c00[1] = c00[1] * vec4(ALPHA); + c10[0] = c10[0] * vec4(ALPHA); + c10[1] = c10[1] * vec4(ALPHA); + c20[0] = c20[0] * vec4(ALPHA); + c20[1] = c20[1] * vec4(ALPHA); + c30[0] = c30[0] * vec4(ALPHA); + c30[1] = c30[1] * vec4(ALPHA); + + /* Store 4x8 block */ + STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 0), c00); + STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), c10); + STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), c20); + STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), c30); +} +#endif /* GEMM_MM_INTERLEAVED_TRANSPOSED */ +#else /* DATA_TYPE_FP16 */ #error Data type not supported #endif /* DATA_TYPE_FP32 */ diff --git a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h old mode 100755 new mode 100644 index dd9e1a3864..014ff4045e --- a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h +++ b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * -- cgit v1.2.1
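
A note on the FP16 path used throughout these kernels: the LOAD_UNPACK2_*/STORE_PACK2_* helpers ultimately rely on the GLSL ES built-ins unpackHalf2x16/packHalf2x16, so each 32-bit word read from an SSBO carries a pair of half-precision values, and the even/odd offset checks in the im2col and col2im kernels select the low or high 16 bits of that pair. The sketch below is a minimal, self-contained illustration of that packing pattern only; the buffer names, binding points and the scale factor are hypothetical and are not part of the library.

    #version 310 es
    layout(local_size_x = 1) in;

    layout(std430, binding = 0) readonly buffer SrcBuffer
    {
        uint src[]; // each uint holds two packed fp16 values
    };
    layout(std430, binding = 1) writeonly buffer DstBuffer
    {
        uint dst[];
    };

    void main(void)
    {
        uint gid = gl_GlobalInvocationID.x;

        // Unpack two fp16 values from one 32-bit word, scale them,
        // then repack the pair into a single word for the output buffer.
        mediump vec2 v = unpackHalf2x16(src[gid]);
        v            *= 2.0;
        dst[gid]      = packHalf2x16(v);
    }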