From 7068f9900d136312318ff430aef588b14e0c87ad Mon Sep 17 00:00:00 2001 From: Anthony Barbier Date: Thu, 26 Oct 2017 15:23:08 +0100 Subject: COMPMID-631: Merge branches/gles_compute branch Last commit: commit b25c5f68042b0c81bf611d59a1bb8535e1c42497 Author: Xinghang Zhou Date: Wed Oct 25 18:48:10 2017 +0800 Synced validation's tolerances of GCSoftmax from cl side Change-Id: Ibe72054205c1c8721845d679a31af7ed0a7c5cf6 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/93283 Reviewed-by: Anthony Barbier Tested-by: Kaizen --- src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs | 1444 +++++++++++++++++++++ 1 file changed, 1444 insertions(+) create mode 100644 src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs (limited to 'src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs') diff --git a/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs new file mode 100644 index 0000000000..1e0fee4688 --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs @@ -0,0 +1,1444 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; +#include "helpers.h" + +#if defined(DATA_TYPE_FP32) + +float calculate_max(const int, Tensor3D, const int, const int, const int, const int, const int, const int); +float calculate_avg(const int, Tensor3D, const int, const int, const int, const int, const int, const int); + +BUFFER_DECLARATION(src, 1, float, readonly); +BUFFER_DECLARATION(dst, 2, float, writeonly); + +layout(std140) uniform shader_params +{ + TENSOR3D_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(dst); +}; + +#define LOAD8(r, name, offset) \ + r.x = LOAD4(name, offset); \ + r.y = LOAD4(name, offset + uint(1)) + +#define LOAD16(r, name, offset) \ + r.x = LOAD4(name, offset); \ + r.y = LOAD4(name, offset + uint(1)); \ + r.z = LOAD4(name, offset + uint(2)); \ + r.w = LOAD4(name, offset + uint(3)) + +#define STORE16(name, offset, r) \ + STORE4(name, offset, r.x); \ + STORE4(name, offset + uint(1), r.y); \ + STORE4(name, offset + uint(2), r.z); \ + STORE4(name, offset + uint(3), r.w) + +#if defined(POOL_AVG) || defined(POOL_L2) +#define POOL_OP(res, a, b) ((res) = (a) + (b)) +#define POOL_OP_float(res, a, b) (res = a + b) +#define POOL_OP_vec2(res, a, b) ((res) = (a) + (b)) +#else /* defined(POOL_AVG) || defined(POOL_L2) */ +#define POOL_OP(res, a, b) \ + (res) = (a); \ + if(isnan(a.x) || (a.x < b.x)) \ + { \ + res.x = b.x; \ + } \ + if(isnan(a.y) || (a.y < b.y)) \ + { \ + res.y = b.y; \ + } \ + if(isnan(a.z) || (a.z < b.z)) \ + { \ + res.z = b.z; \ + } \ + if(isnan(a.w) || (a.w < b.w)) \ + { \ + res.w = b.w; \ + } +#define POOL_OP_float(res, a, b) \ + (res) = (a); \ + if(isnan(a) || (a < b)) \ + { \ + res = b; \ + } +#define POOL_OP_vec2(res, a, b) \ + (res) = (a); \ + if(isnan(a.x) || (a.x < b.x)) \ + { \ + res.x = b.x; \ + } \ + if(isnan(a.y) || (a.y < b.y)) \ + { \ + res.y = b.y; \ + } +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) +#define POW2_OP(x, vec_size) ((x) * (x)) +#else /* defined(POOL_L2) */ +#define POW2_OP(x, vec_size) (x) +#endif /* defined(POOL_L2) */ + +#define DIV_OP(x, y) (x * (1.f / y)) +#define SQRT_OP(x) sqrt((x)) + +#if defined(POOL_SIZE) +// Set the initial value for the pooling operation accordingly with the data type +#if defined(POOL_AVG) || defined(POOL_L2) +#define INITIAL_VALUE 0.0f +#else /* defined(POOL_AVG) || defined(POOL_L2) */ +#define INITIAL_VALUE -3.402823466385289e+38 +#endif // POOL_AVG +#endif //POOL_SIZE + +#define POOLING3x3_STRIDE1(res, input, output) \ + vec4 data00; \ + vec2 data01; \ + vec4 data10; \ + vec2 data11; \ + vec4 data20; \ + vec2 data21; \ + LOAD16(data00, input, tensor3D_offset(input, 0, 0, 0)); \ + LOAD8(data01, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \ + LOAD16(data10, input, tensor3D_offset(input, 0, 1, 0)); \ + LOAD8(data11, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \ + LOAD16(data20, input, tensor3D_offset(input, 0, 2, 0)); \ + LOAD8(data21, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \ + data00 = POW2_OP(data00, 4); \ + data01 = POW2_OP(data01, 2); \ + data10 = POW2_OP(data10, 4); \ + data11 = POW2_OP(data11, 2); \ + data20 = POW2_OP(data20, 4); \ + data21 = POW2_OP(data21, 2); \ + \ + vec4 values000; \ + vec4 values001; \ + vec4 values010; \ + vec4 values100; \ + vec4 values101; \ + vec4 values11; \ + vec4 values200; \ + vec4 values201; \ + vec4 values21; \ + values000.xyzw = data00.xyzy; \ + values001.xyzw = data00.zwzw; \ + values010.x = data01.x; \ + values010.y = data00.w; \ + values010.zw 
= data01.xy; \ + values100.xyzw = data10.xyzy; \ + values101.xyzw = data10.zwzw; \ + values11.x = data11.x; \ + values11.y = data10.w; \ + values11.zw = data11.xy; \ + values200.xyzw = data20.xyzy; \ + values201.xyzw = data20.zwzw; \ + values21.x = data21.x; \ + values21.y = data20.w; \ + values21.zw = data21.xy; \ + POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \ + POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \ + POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \ + POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw)) + +#define POOLING3x3_STRIDE2(res, input, output) \ + vec4 data000; \ + vec4 data001; \ + float data010; \ + vec4 data100; \ + vec4 data101; \ + float data11; \ + vec4 data200; \ + vec4 data201; \ + float data21; \ + LOAD16(data000, input, tensor3D_offset(input, 0, 0, 0)); \ + LOAD16(data001, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \ + data010 = LOAD4(input, tensor3D_offset(input, 0, 0, 0) + uint(8)); \ + LOAD16(data100, input, tensor3D_offset(input, 0, 1, 0)); \ + LOAD16(data101, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \ + data11 = LOAD4(input, tensor3D_offset(input, 0, 1, 0) + uint(8)); \ + LOAD16(data200, input, tensor3D_offset(input, 0, 2, 0)); \ + LOAD16(data201, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \ + data21 = LOAD4(input, tensor3D_offset(input, 0, 2, 0) + uint(8)); \ + data000 = POW2_OP(data000, 4); \ + data001 = POW2_OP(data001, 4); \ + data010 = POW2_OP(data010, 1); \ + data100 = POW2_OP(data100, 4); \ + data101 = POW2_OP(data101, 4); \ + data11 = POW2_OP(data11, 1); \ + data200 = POW2_OP(data200, 4); \ + data201 = POW2_OP(data201, 4); \ + data21 = POW2_OP(data21, 1); \ + \ + vec4 values000; \ + vec4 values001; \ + vec4 values010; \ + vec4 values100; \ + vec4 values101; \ + vec4 values11; \ + vec4 values200; \ + vec4 values201; \ + vec4 values21; \ + values000.xyzw = data000.xyzz; \ + values001.xyzw = vec4(data000.w, data001.xxy); \ + values010.xyzw = vec4(data001.zzw, data010); \ + values100.xyzw = data100.xyzz; \ + values101.xyzw = vec4(data100.w, data101.xxy); \ + values11.xyzw = vec4(data101.zzw, data11); \ + values200.xyzw = data200.xyzz; \ + values201.xyzw = vec4(data200.w, data201.xxy); \ + values21.xyzw = vec4(data201.zzw, data21); \ + POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \ + POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \ + POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \ + POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw)) + +#define POOLING3x3_STRIDE3(res, input, output) \ + vec4 data000; \ + vec4 data001; \ + vec4 data010; \ + vec4 data100; \ + vec4 data101; \ + vec4 data11; \ + vec4 data200; \ + vec4 data201; \ + vec4 data21; \ + LOAD16(data000, input, tensor3D_offset(input, 0, 0, 0)); \ + LOAD16(data001, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \ + LOAD16(data010, input, tensor3D_offset(input, 0, 0, 0) + 
uint(8)); \ + LOAD16(data100, input, tensor3D_offset(input, 0, 1, 0)); \ + LOAD16(data101, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \ + LOAD16(data11, input, tensor3D_offset(input, 0, 1, 0) + uint(8)); \ + LOAD16(data200, input, tensor3D_offset(input, 0, 2, 0)); \ + LOAD16(data201, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \ + LOAD16(data21, input, tensor3D_offset(input, 0, 2, 0) + uint(8)); \ + data000 = POW2_OP(data000, 4); \ + data001 = POW2_OP(data001, 4); \ + data010 = POW2_OP(data010, 4); \ + data100 = POW2_OP(data100, 4); \ + data101 = POW2_OP(data101, 4); \ + data11 = POW2_OP(data11, 4); \ + data200 = POW2_OP(data200, 4); \ + data201 = POW2_OP(data201, 4); \ + data21 = POW2_OP(data21, 4); \ + \ + POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \ + POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \ + POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \ + POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \ + POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \ + POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \ + POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \ + POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y data010.xw)) + +float calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x; + int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y; + int end_x = int(min(start_x + pool_size, upper_bound_w)); + int end_y = int(min(start_y + pool_size, upper_bound_h)); + + float data_max; + data_max = LOAD4(src, tensor3D_offset(src, 0, 0, 0)); + + for(int i = 0; (start_x + i) < end_x; ++i) + { + for(int j = 0; (start_y + j) < end_y; ++j) + { + float data = LOAD4(src, tensor3D_offset(src, i, j, 0)); + POOL_OP_float(data_max, data_max, data); + } + } + + return data_max; +} + +float calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x; + int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y; + int end_x = int(min(start_x + pool_size, upper_bound_w)); + int end_y = int(min(start_y + pool_size, upper_bound_h)); + + float data_total = 0.0f; + for(int i = 0; (start_x + i) < end_x; i++) + { + for(int j = 0; (start_y + j) < end_y; ++j) + { + float data = LOAD4(src, tensor3D_offset(src, i, j, 0)); + if(isnan(data)) + { + data = 0.0f; + } +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data = POW2_OP(data, 1); +#endif /* defined(POOL_L2) */ + data_total = data_total + data; + } + } + + return data_total / float((end_y - start_y) * (end_x - start_x)); +} + +#ifdef POOLING_LAYER_2 +/** Performs a pooling function of pool size equal to 2. + * + * @note Supported data types are F32; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + //Load and calculate data + float res; +#if defined(POOL_AVG) || defined(POOL_L2) + res = calculate_avg(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#else /*POOL_AVG*/ + res = calculate_max(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#endif /*POOL_AVG*/ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ + + // Store result + STORE4(dst, CURRENT_OFFSET(dst), res); +} + +#elif defined(POOLING_LAYER_3) +/** Performs a pooling function of pool size equal to 3. + * + * @note Supported data types are F32; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + //Load and calculate data + float res; +#if defined(POOL_AVG) || defined(POOL_L2) + res = calculate_avg(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#else /*POOL_AVG*/ + res = calculate_max(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#endif /*POOL_AVG*/ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ + + // Store result + STORE4(dst, CURRENT_OFFSET(dst), res); +} + +#elif defined(POOLING_LAYER_3_OPTIMIZED) +/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3 + * + * @note Supported data types are F32; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + vec4 res; + // Perform pooling 3x3 for 4 output elements +#if STRIDE_X == 1 + POOLING3x3_STRIDE1(res, src, dst); +#elif STRIDE_X == 2 + POOLING3x3_STRIDE2(res, src, dst); +#elif STRIDE_X == 3 + POOLING3x3_STRIDE3(res, src, dst); +#endif /*STRIDE_X == 1*/ + + // Divide by pool region in case of average pooling +#if defined(POOL_AVG) || defined(POOL_L2) + ivec4 start_x = ((ivec4(int(gl_GlobalInvocationID.x) * 4) + ivec4(0, 1, 2, 3)) * (ivec4(STRIDE_X))) - (ivec4(PAD_X)); + int start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y; + ivec4 end_x = min((start_x + (ivec4(3))), (ivec4(MAX_WIDTH))); + int end_y = min((start_y + 3), MAX_HEIGHT); + res *= (vec4((1.f)) / vec4((ivec4(end_y - start_y)) * (end_x - start_x))); +#endif /*POOL_AVG*/ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ + + STORE16(dst, CURRENT_OFFSET(dst), res); +} + +#elif defined(POOLING_LAYER_7) +/** Performs a pooling function of pool size equal to 7. + * + * @note Supported data types are F32; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + //Load and calculate data + float res; +#if defined(POOL_AVG) || defined(POOL_L2) + res = calculate_avg(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#else /*POOL_AVG*/ + res = calculate_max(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#endif /*POOL_AVG*/ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ + + // Store result + STORE4(dst, CURRENT_OFFSET(dst), res); +} + +#elif defined(POOLING_LAYER_N) +/** Performs a pooling function of pool size equal to N + * + * @note Supported data types are F32; + * @note Pool size must be passed using POOL_SIZE e.g. POOL_SIZE=13; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + vec4 vdata0; + vdata0 = vec4(INITIAL_VALUE); + vec4 vdata1; + vdata1 = vec4(INITIAL_VALUE); + float sdata; + sdata = float(INITIAL_VALUE); + + for(int y = 0; y < int(POOL_SIZE); y++) + { + int x = 0; + for(; x <= (int(POOL_SIZE) - 8); x += 8) + { + vec4 data2; + vec4 data3; + LOAD16(data2, src, tensor3D_offset(src, x, y, 0)); + LOAD16(data3, src, tensor3D_offset(src, x, y, 0) + uint(4)); + +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data2 *= data2; + data3 *= data3; +#endif /* defined(POOL_L2) */ + + POOL_OP(vdata0, vdata0, data2); + POOL_OP(vdata1, vdata1, data3); + } + + // Leftover + for(; x < int(POOL_SIZE); ++x) + { + float data4 = LOAD4(src, tensor3D_offset(src, x, y, 0)); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data4 *= data4; +#endif /* defined(POOL_L2) */ + POOL_OP_float(sdata, sdata, data4); + } + } + + //Reduce result + vec4 reduce4; + POOL_OP(reduce4, vdata0.xyzw, vdata1.xyzw); + vec2 reduce2; + POOL_OP_vec2(reduce2, reduce4.xy, reduce4.zw); + float res; + POOL_OP_float(res, reduce2.x, reduce2.y); + POOL_OP_float(res, res, sdata); + +#if defined(POOL_AVG) || defined(POOL_L2) + { + // Divide by pool region in case of average pooling + int start_x = int(gl_GlobalInvocationID.x) * STRIDE_X - PAD_X; + int start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y; + int end_x = int(min(STRIDE_X + POOL_SIZE, MAX_WIDTH)); + int end_y = int(min(STRIDE_Y + POOL_SIZE, MAX_HEIGHT)); + float res1 = float((end_y - start_y) * (end_x - start_x)); + res = DIV_OP(res, res1); + } +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ + + // Store result + STORE4(dst, CURRENT_OFFSET(dst), res); +} +#endif /* POOLING_LAYER_2 */ + +#elif defined(DATA_TYPE_FP16) + +precision mediump float; + +vec2 load_and_unpack(Tensor3D, uint); +vec2 calculate_max(const int, Tensor3D, const int, const int, const int, const int, const int, const int); +vec2 calculate_avg(const int, Tensor3D, const int, const int, const int, const int, const int, const int); + +BUFFER_DECLARATION(src, 1, uint, readonly); +BUFFER_DECLARATION(dst, 2, uint, writeonly); + +layout(std140) uniform shader_params +{ + TENSOR3D_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(dst); +}; + +#define LOAD2_fp16(r, name, offset) \ + r.xy = load_and_unpack(name, offset) + +#define LOAD4_fp16(r, name, offset) \ + r.xy = load_and_unpack(name, offset); \ + r.zw = load_and_unpack(name, offset + uint(1)) + +#define STORE4_fp16(name, offset, r) \ + uint datastore1; \ + uint datastore2; \ + datastore1 = uint(packHalf2x16(r.xy)); \ + datastore2 = 
uint(packHalf2x16(r.zw)); \ + STORE1(name, offset << uint(1), datastore1); \ + STORE1(name, (offset << uint(1)) + uint(1), datastore2) + +#if defined(POOL_AVG) || defined(POOL_L2) +#define POOL_OP(res, a, b) ((res) = (a) + (b)) +#define POOL_OP_float(res, a, b) (res = a + b) +#define POOL_OP_vec2(res, a, b) ((res) = (a) + (b)) +#else /* defined(POOL_AVG) || defined(POOL_L2) */ +#define POOL_OP(res, a, b) \ + (res) = (a); \ + if(isnan(a.x) || (a.x < b.x)) \ + { \ + res.x = b.x; \ + } \ + if(isnan(a.y) || (a.y < b.y)) \ + { \ + res.y = b.y; \ + } \ + if(isnan(a.z) || (a.z < b.z)) \ + { \ + res.z = b.z; \ + } \ + if(isnan(a.w) || (a.w < b.w)) \ + { \ + res.w = b.w; \ + } +#define POOL_OP_float(res, a, b) \ + (res) = (a); \ + if(isnan(a) || (a < b)) \ + { \ + res = b; \ + } +#define POOL_OP_vec2(res, a, b) \ + (res) = (a); \ + if(isnan(a.x) || (a.x < b.x)) \ + { \ + res.x = b.x; \ + } \ + if(isnan(a.y) || (a.y < b.y)) \ + { \ + res.y = b.y; \ + } +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) +#define POW2_OP(x, vec_size) ((x) * (x)) +#else /* defined(POOL_L2) */ +#define POW2_OP(x, vec_size) (x) +#endif /* defined(POOL_L2) */ + +#define DIV_OP(x, y) (x * (1.f / y)) +#define SQRT_OP(x) sqrt((x)) + +#if defined(POOL_SIZE) +// Set the initial value for the pooling operation accordingly with the data type +#if defined(POOL_AVG) || defined(POOL_L2) +#define INITIAL_VALUE 0.0f +#else /* defined(POOL_AVG) || defined(POOL_L2) */ +#define INITIAL_VALUE -65504.0f +#endif //POOL_AVG +#endif //POOL_SIZE + +#define POOLING3x3_STRIDE1_fp16(res, input, output) \ + vec4 data00; \ + vec2 data01; \ + vec4 data10; \ + vec2 data11; \ + vec4 data20; \ + vec2 data21; \ + LOAD4_fp16(data00, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \ + LOAD2_fp16(data01, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \ + LOAD4_fp16(data10, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \ + LOAD2_fp16(data11, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \ + LOAD4_fp16(data20, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \ + LOAD2_fp16(data21, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \ + data00 = POW2_OP(data00, 4); \ + data01 = POW2_OP(data01, 2); \ + data10 = POW2_OP(data10, 4); \ + data11 = POW2_OP(data11, 2); \ + data20 = POW2_OP(data20, 4); \ + data21 = POW2_OP(data21, 2); \ + \ + vec4 values000; \ + vec4 values001; \ + vec4 values010; \ + vec4 values100; \ + vec4 values101; \ + vec4 values11; \ + vec4 values200; \ + vec4 values201; \ + vec4 values21; \ + values000.xyzw = data00.xyzy; \ + values001.xyzw = data00.zwzw; \ + values010.x = data01.x; \ + values010.y = data00.w; \ + values010.zw = data01.xy; \ + values100.xyzw = data10.xyzy; \ + values101.xyzw = data10.zwzw; \ + values11.x = data11.x; \ + values11.y = data10.w; \ + values11.zw = data11.xy; \ + values200.xyzw = data20.xyzy; \ + values201.xyzw = data20.zwzw; \ + values21.x = data21.x; \ + values21.y = data20.w; \ + values21.zw = data21.xy; \ + POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \ + POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \ + POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \ + 
POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw)) + +#define POOLING3x3_STRIDE2_fp16(res, input, output) \ + vec4 data000; \ + vec4 data001; \ + float data010; \ + vec4 data100; \ + vec4 data101; \ + float data11; \ + vec4 data200; \ + vec4 data201; \ + float data21; \ + vec2 datamiddle0; \ + vec2 datamiddle1; \ + vec2 datamiddle2; \ + LOAD4_fp16(data000, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \ + LOAD4_fp16(data001, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \ + datamiddle0 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(4)); \ + data010 = datamiddle0.x; \ + LOAD4_fp16(data100, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \ + LOAD4_fp16(data101, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \ + datamiddle1 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(4)); \ + data11 = datamiddle1.x; \ + LOAD4_fp16(data200, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \ + LOAD4_fp16(data201, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \ + datamiddle2 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(4)); \ + data21 = datamiddle2.x; \ + data000 = POW2_OP(data000, 4); \ + data001 = POW2_OP(data001, 4); \ + data010 = POW2_OP(data010, 1); \ + data100 = POW2_OP(data100, 4); \ + data101 = POW2_OP(data101, 4); \ + data11 = POW2_OP(data11, 1); \ + data200 = POW2_OP(data200, 4); \ + data201 = POW2_OP(data201, 4); \ + data21 = POW2_OP(data21, 1); \ + \ + vec4 values000; \ + vec4 values001; \ + vec4 values010; \ + vec4 values100; \ + vec4 values101; \ + vec4 values11; \ + vec4 values200; \ + vec4 values201; \ + vec4 values21; \ + values000.xyzw = data000.xyzz; \ + values001.xyzw = vec4(data000.w, data001.xxy); \ + values010.xyzw = vec4(data001.zzw, data010); \ + values100.xyzw = data100.xyzz; \ + values101.xyzw = vec4(data100.w, data101.xxy); \ + values11.xyzw = vec4(data101.zzw, data11); \ + values200.xyzw = data200.xyzz; \ + values201.xyzw = vec4(data200.w, data201.xxy); \ + values21.xyzw = vec4(data201.zzw, data21); \ + POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \ + POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \ + POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \ + POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw)) + +#define POOLING3x3_STRIDE3_fp16(res, input, output) \ + vec4 data000; \ + vec4 data001; \ + vec4 data010; \ + vec4 data100; \ + vec4 data101; \ + vec4 data11; \ + vec4 data200; \ + vec4 data201; \ + vec4 data21; \ + LOAD4_fp16(data000, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \ + LOAD4_fp16(data001, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \ + LOAD4_fp16(data010, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(4)); \ + LOAD4_fp16(data100, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \ + LOAD4_fp16(data101, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \ + LOAD4_fp16(data11, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(4)); \ + LOAD4_fp16(data200, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> 
uint(2))); \ + LOAD4_fp16(data201, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \ + LOAD4_fp16(data21, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(4)); \ + data000 = POW2_OP(data000, 4); \ + data001 = POW2_OP(data001, 4); \ + data010 = POW2_OP(data010, 4); \ + data100 = POW2_OP(data100, 4); \ + data101 = POW2_OP(data101, 4); \ + data11 = POW2_OP(data11, 4); \ + data200 = POW2_OP(data200, 4); \ + data201 = POW2_OP(data201, 4); \ + data21 = POW2_OP(data21, 4); \ + \ + POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \ + POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \ + POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \ + POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \ + POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \ + POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \ + POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \ + POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y data010.xw)) + +vec2 load_and_unpack(Tensor3D src, uint offset) +{ + uint packed_s; + vec2 s; + LOAD1(packed_s, src, offset); + + s = vec2(unpackHalf2x16(packed_s)); + return s; +} + +vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + int start_x1 = int(gl_GlobalInvocationID.x) * stride_x - pad_x; + int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y; + int end_x1 = int(min(start_x1 + pool_size, upper_bound_w)); + int end_y1 = int(min(start_y1 + pool_size, upper_bound_h)); + + int start_x2 = start_x1 + stride_x; + int start_y2 = start_y1; + int end_x2 = int(min(start_x2 + pool_size, upper_bound_w)); + int end_y2 = int(min(start_y2 + pool_size, upper_bound_h)); + + //Initialize maximum + vec2 data_max = vec2(0); + + //Load and Set initial maximum1 + vec2 data_init1 = load_and_unpack(src, tensor3D_offset_fp16(src, 0, 0, 0) >> uint(2)); + data_max.x = data_init1.x; + + //Load and Set initial maximum2 + if(end_x1 < upper_bound_w) + { + if((stride_x % 2) == 0) + { + vec2 data_init2 = load_and_unpack(src, tensor3D_offset_fp16(src, stride_x, 0, 0) >> uint(2)); + data_max.y = data_init2.x; + } + else + { + vec2 data_init2 = load_and_unpack(src, tensor3D_offset_fp16(src, stride_x - 1, 0, 0) >> uint(2)); + data_max.y = data_init2.y; + } + } + + for(int i = 0; (start_y1 + i) < end_y1; i++) + for(int j = 0; (start_x1 + j) < end_x1; j = j + 2) + { + //Calculate maximum1 + if((start_x1 + j + 1) < end_x1) + { + vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2)); + float data_mr1; + POOL_OP_float(data_mr1, data1.x, data1.y); + POOL_OP_float(data_max.x, data_max.x, data_mr1); + } + else + { + vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2)); + POOL_OP_float(data_max.x, data_max.x, data1.x); + } + + //Calculate maximum2 + if((start_x2 + j) < end_x2 && end_x1 < upper_bound_w) + { + if((stride_x % 2) == 0) + { + vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x), i, 0) >> uint(2))); + + if((start_x2 + j + 1) < end_x2) + { + float data_mr2; + POOL_OP_float(data_mr2, data2.x, data2.y); + POOL_OP_float(data_max.y, data_max.y, data_mr2); + } + else + { + POOL_OP_float(data_max.y, data_max.y, data2.x); + } + } + else + { + vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x - 1), i, 0) >> uint(2))); + vec2 data3 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 
0) >> uint(2))); + if((start_x2 + j + 1) < end_x2) + { + float data_mr2; + POOL_OP_float(data_mr2, data3.x, data2.y); + POOL_OP_float(data_max.y, data_max.y, data_mr2); + } + else + { + POOL_OP_float(data_max.y, data_max.y, data2.y); + } + } + } + } + return data_max; +} + +vec2 calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + int start_x1 = int(gl_GlobalInvocationID.x) * stride_x - pad_x; + int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y; + int end_x1 = int(min(start_x1 + pool_size, upper_bound_w)); + int end_y1 = int(min(start_y1 + pool_size, upper_bound_h)); + + int start_x2 = start_x1 + stride_x; + int start_y2 = start_y1; + int end_x2 = int(min(start_x2 + pool_size, upper_bound_w)); + int end_y2 = int(min(start_y2 + pool_size, upper_bound_h)); + + //Initialize sum + float data_total1 = float(0); + float data_total2 = float(0); + for(int i = 0; (start_y1 + i) < end_y1; i++) + for(int j = 0; (start_x1 + j) < end_x1; j = j + 2) + { + vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2)); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data1 = POW2_OP(data1, 2); +#endif /* defined(POOL_L2) */ + //Calculate sum1 + if((start_x1 + j + 1) < end_x1) + { + data_total1 = data_total1 + data1.x + data1.y; + } + else + { + data_total1 = data_total1 + data1.x; + } + + //Calculate sum2 + if((start_x2 + j) < end_x2 && end_x1 < upper_bound_w) + { + if((stride_x % 2) == 0) + { + vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2))); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data2 = POW2_OP(data2, 2); +#endif /* defined(POOL_L2) */ + if((start_x2 + j + 1) < end_x2) + { + data_total2 = data_total2 + data2.x + data2.y; + } + else + { + data_total2 = data_total2 + data2.x; + } + } + else + { + vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x - 1), i, 0) >> uint(2))); + vec2 data3 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2))); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data2 = POW2_OP(data2, 2); + data3 = POW2_OP(data3, 2); +#endif /* defined(POOL_L2) */ + if((start_x2 + j + 1) < end_x2) + { + data_total2 = data_total2 + data3.x + data2.y; + } + else + { + data_total2 = data_total2 + data2.y; + } + } + } + } + //Calculate average + vec2 data_avg; + data_avg.x = data_total1 / float((end_y1 - start_y1) * (end_x1 - start_x1)); + data_avg.y = data_total2 / float((end_y2 - start_y2) * (end_x2 - start_x2)); + + return data_avg; +} + +#ifdef POOLING_LAYER_2 +/** Performs a pooling function of pool size equal to 2. + * + * @note Supported data types are F16; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + + //Load and calculate data + vec2 data; + uint res; +#if defined(POOL_AVG) || defined(POOL_L2) + data = calculate_avg(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#else /*POOL_AVG*/ + data = calculate_max(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#endif /*POOL_AVG*/ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + data = SQRT_OP(data); +#endif /* defined(POOL_L2) */ + + res = uint(packHalf2x16(data)); + + // Store result + STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res); +} + +#elif defined(POOLING_LAYER_3) +/** Performs a pooling function of pool size equal to 3. + * + * @note Supported data types are F16; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + + //Load and calculate data + vec2 data; + uint res; +#if defined(POOL_AVG) || defined(POOL_L2) + data = calculate_avg(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#else /*POOL_AVG*/ + data = calculate_max(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#endif /*POOL_AVG*/ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + data = SQRT_OP(data); +#endif /* defined(POOL_L2) */ + + res = uint(packHalf2x16(data)); + + // Store result + STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res); +} + +#elif defined(POOLING_LAYER_3_OPTIMIZED) +/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3 + * + * @note Supported data types are F16; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + + vec4 res; + // Perform pooling 3x3 for 4 output elements +#if STRIDE_X == 1 + POOLING3x3_STRIDE1_fp16(res, src, dst); +#elif STRIDE_X == 2 + POOLING3x3_STRIDE2_fp16(res, src, dst); +#elif STRIDE_X == 3 + POOLING3x3_STRIDE3_fp16(res, src, dst); +#endif /*STRIDE_X == 1*/ + + // Divide by pool region in case of average pooling +#if defined(POOL_AVG) || defined(POOL_L2) + ivec4 start_x = ((ivec4(int(gl_GlobalInvocationID.x) * 4) + ivec4(0, 1, 2, 3)) * (ivec4(STRIDE_X))) - (ivec4(PAD_X)); + int start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y; + ivec4 end_x = min((start_x + (ivec4(3))), (ivec4(MAX_WIDTH))); + int end_y = min((start_y + 3), MAX_HEIGHT); + res *= (vec4((1.f)) / vec4((ivec4(end_y - start_y)) * (end_x - start_x))); +#endif /*POOL_AVG*/ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ + + STORE4_fp16(dst, CURRENT_OFFSET(dst) >> uint(3), res); +} + +#elif defined(POOLING_LAYER_7) +/** Performs a pooling function of pool size equal to 7. + * + * @note Supported data types are F16; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + + //Load and calculate data + vec2 data; + uint res; +#if defined(POOL_AVG) || defined(POOL_L2) + data = calculate_avg(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#else /*POOL_AVG*/ + data = calculate_max(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#endif /*POOL_AVG*/ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + data = SQRT_OP(data); +#endif /* defined(POOL_L2) */ + + res = uint(packHalf2x16(data)); + + // Store result + STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res); +} + +#elif defined(POOLING_LAYER_N) +/** Performs a pooling function of pool size equal to N + * + * @note Supported data types are F16; + * @note Pool size must be passed using POOL_SIZE e.g. POOL_SIZE=13; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + + vec4 vdata00; + vdata00 = vec4(INITIAL_VALUE); + vec4 vdata01; + vdata01 = vec4(INITIAL_VALUE); + vec4 vdata10; + vdata10 = vec4(INITIAL_VALUE); + vec4 vdata11; + vdata11 = vec4(INITIAL_VALUE); + vec2 sdata; + sdata = vec2(INITIAL_VALUE); + + for(int y = 0; y < int(POOL_SIZE); y++) + { + int x = 0; + for(; x <= (int(POOL_SIZE) - 8); x += 8) + { + vec4 data2; + vec4 data3; + LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2))); + LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)) + uint(2)); + +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data2 *= data2; + data3 *= data3; +#endif /* defined(POOL_L2) */ + + POOL_OP(vdata00, vdata00, data2); + POOL_OP(vdata10, vdata10, data3); + } + + // Leftover + for(; x < int(POOL_SIZE); x = x + 2) + { + vec2 data4middle; + data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2))); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data4middle *= data4middle; +#endif /* defined(POOL_L2) */ + if((x + 1) >= int(POOL_SIZE)) + { + POOL_OP_float(sdata.x, sdata.x, data4middle.x); + } + else + { + float data4; + POOL_OP_float(data4, data4middle.x, data4middle.y); + POOL_OP_float(sdata.x, sdata.x, data4); + } + } + } + + for(int y = STRIDE_X; y < int(POOL_SIZE + STRIDE_X); y++) + { + int x1 = STRIDE_X; + for(; x1 <= (int(POOL_SIZE + STRIDE_X) - 8); x1 += 8) + { + vec4 data2; + vec4 data3; + LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2))); + LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)) + uint(2)); + +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data2 *= data2; + data3 *= data3; +#endif /* defined(POOL_L2) */ + + POOL_OP(vdata01, vdata01, data2); + POOL_OP(vdata11, vdata11, data3); + } + + // Leftover + for(; x1 < int(POOL_SIZE + STRIDE_X); x1 = x1 + 2) + { + vec2 data4middle; + data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2))); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data4middle *= data4middle; +#endif /* defined(POOL_L2) */ + if((x1 + 1) >= int(POOL_SIZE + STRIDE_X)) + { + POOL_OP_float(sdata.y, sdata.y, data4middle.x); + } + else + { + float data4; + POOL_OP_float(data4, data4middle.x, data4middle.y); + POOL_OP_float(sdata.y, sdata.y, data4); + } + } + } + + //Reduce result + vec4 reduce40; + POOL_OP(reduce40, vdata00.xyzw, vdata10.xyzw); + vec2 reduce20; + POOL_OP_vec2(reduce20, reduce40.xy, reduce40.zw); + vec4 reduce41; + POOL_OP(reduce41, vdata01.xyzw, vdata11.xyzw); + vec2 reduce21; + POOL_OP_vec2(reduce21, reduce41.xy, reduce41.zw); + vec2 data; + POOL_OP_float(data.x, reduce20.x, 
reduce20.y); + POOL_OP_float(data.x, data.x, sdata.x); + POOL_OP_float(data.y, reduce21.x, reduce21.y); + POOL_OP_float(data.y, data.y, sdata.y); + +#if defined(POOL_AVG) || defined(POOL_L2) + { + // Divide by pool region in case of average pooling + int start_x1 = int(gl_GlobalInvocationID.x) * STRIDE_X - PAD_X; + int start_y1 = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y; + int end_x1 = int(min(start_x1 + POOL_SIZE, MAX_WIDTH)); + int end_y1 = int(min(start_y1 + POOL_SIZE, MAX_HEIGHT)); + int start_x2 = start_x1 + STRIDE_X; + int start_y2 = start_y1; + int end_x2 = int(min(start_x2 + POOL_SIZE, MAX_WIDTH)); + int end_y2 = int(min(start_y2 + POOL_SIZE, MAX_HEIGHT)); + vec2 res1; + res1.x = float((end_y1 - start_y1) * (end_x1 - start_x1)); + res1.y = float((end_y2 - start_y2) * (end_x2 - start_x2)); + data.x = DIV_OP(data.x, res1.x); + data.y = DIV_OP(data.y, res1.y); + } +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + data = SQRT_OP(data); +#endif /* defined(POOL_L2) */ + uint res; + res = uint(packHalf2x16(data)); + + // Store result + STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res); +} +#endif /*POOLING_LAYER_2*/ +#endif /*DATA_TYPE_FP32 */ -- cgit v1.2.1
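All of the kernels in this file are selected and parameterised purely at compile time: the host prepends a preamble of #define directives before compiling the shader, and the preprocessor then picks the FP32 or FP16 path, the pooling operation and the window-size specialisation. A minimal sketch of such a preamble for an FP32 3x3 average-pooling case follows; the macro names are the ones the shader checks above, while the numeric values are purely illustrative assumptions, not taken from the library's build code.

    #define DATA_TYPE_FP32            // selects the FP32 kernels (DATA_TYPE_FP16 selects the packed-half path)
    #define POOLING_LAYER_3_OPTIMIZED // 3x3 window, vectorised over 4 output elements (requires STRIDE_X <= 3)
    #define POOL_AVG                  // average pooling; omit for max pooling, define POOL_L2 for L2 pooling
    #define STRIDE_X 2                // window step in x (illustrative value)
    #define STRIDE_Y 2                // window step in y (illustrative value)
    #define PAD_X 0                   // pooling padding in x (illustrative value)
    #define PAD_Y 0                   // pooling padding in y (illustrative value)
    #define MAX_WIDTH 112             // maximum accessible x index, width + pad (illustrative value)
    #define MAX_HEIGHT 112            // maximum accessible y index, height + pad (illustrative value)
    #define LOCAL_SIZE_X 1            // work-group sizes consumed by the layout() declaration
    #define LOCAL_SIZE_Y 1
    #define LOCAL_SIZE_Z 1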
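When neither POOL_AVG nor POOL_L2 is defined, the POOL_OP* macros implement a NaN-aware maximum: the running value is replaced whenever it is NaN or smaller than the new sample, so NaNs coming from padded elements cannot poison the result. The same update written out as a plain GLSL function, for readability only (this helper does not exist in the file):

    // Equivalent of POOL_OP_float(res, a, b) in the max-pooling branch.
    float pool_max_update(float current_max, float new_sample)
    {
        if(isnan(current_max) || (current_max < new_sample))
        {
            return new_sample;
        }
        return current_max;
    }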
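L2 pooling is built on top of the average path rather than being a separate kernel: POW2_OP squares every sample as it is loaded, the squared samples are accumulated and divided by the same clamped window area N used for average pooling (DIV_OP), and SQRT_OP is applied to the final value, i.e.

    res = sqrt( (x_1^2 + x_2^2 + ... + x_N^2) / N )

where N = (end_y - start_y) * (end_x - start_x) is the window area after clamping against MAX_WIDTH and MAX_HEIGHT.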