path: root/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
Diffstat (limited to 'src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs')
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs  1444
1 file changed, 1444 insertions, 0 deletions
diff --git a/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
new file mode 100644
index 0000000000..1e0fee4688
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
@@ -0,0 +1,1444 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+#if defined(DATA_TYPE_FP32)
+
+float calculate_max(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
+float calculate_avg(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
+
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+
+layout(std140) uniform shader_params
+{
+ TENSOR3D_PARAM_DECLARATION(src);
+ TENSOR3D_PARAM_DECLARATION(dst);
+};
+
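+// The LOAD8/LOAD16/STORE16 helpers below emulate 2- and 4-element vector
+// accesses with the per-element LOAD4/STORE4 macros; offsets are expressed in
+// elements, so consecutive components sit at offset, offset + 1, offset + 2, ...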
+#define LOAD8(r, name, offset) \
+ r.x = LOAD4(name, offset); \
+ r.y = LOAD4(name, offset + uint(1))
+
+#define LOAD16(r, name, offset) \
+ r.x = LOAD4(name, offset); \
+ r.y = LOAD4(name, offset + uint(1)); \
+ r.z = LOAD4(name, offset + uint(2)); \
+ r.w = LOAD4(name, offset + uint(3))
+
+#define STORE16(name, offset, r) \
+ STORE4(name, offset, r.x); \
+ STORE4(name, offset + uint(1), r.y); \
+ STORE4(name, offset + uint(2), r.z); \
+ STORE4(name, offset + uint(3), r.w)
+
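+// POOL_OP is the element-wise reduction step: a plain addition for average
+// and L2 pooling, and a NaN-aware maximum otherwise (a NaN in the running
+// value is replaced by the incoming operand instead of being propagated).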
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(res, a, b) ((res) = (a) + (b))
+#define POOL_OP_float(res, a, b) (res = a + b)
+#define POOL_OP_vec2(res, a, b) ((res) = (a) + (b))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(res, a, b) \
+ (res) = (a); \
+ if(isnan(a.x) || (a.x < b.x)) \
+ { \
+ res.x = b.x; \
+ } \
+ if(isnan(a.y) || (a.y < b.y)) \
+ { \
+ res.y = b.y; \
+ } \
+ if(isnan(a.z) || (a.z < b.z)) \
+ { \
+ res.z = b.z; \
+ } \
+ if(isnan(a.w) || (a.w < b.w)) \
+ { \
+ res.w = b.w; \
+ }
+#define POOL_OP_float(res, a, b) \
+ (res) = (a); \
+ if(isnan(a) || (a < b)) \
+ { \
+ res = b; \
+ }
+#define POOL_OP_vec2(res, a, b) \
+ (res) = (a); \
+ if(isnan(a.x) || (a.x < b.x)) \
+ { \
+ res.x = b.x; \
+ } \
+ if(isnan(a.y) || (a.y < b.y)) \
+ { \
+ res.y = b.y; \
+ }
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(POOL_SIZE)
+// Set the initial value for the pooling operation according to the data type
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define INITIAL_VALUE 0.0f
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define INITIAL_VALUE -3.402823466385289e+38
+#endif // POOL_AVG
+#endif //POOL_SIZE
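+// INITIAL_VALUE is the identity element of POOL_OP: 0 for the additive
+// average/L2 reductions and the most negative finite FP32 value (-FLT_MAX)
+// for max pooling.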
+
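+// 3x3 pooling helpers used by the POOLING_LAYER_3_OPTIMIZED kernel: each
+// invocation produces 4 horizontally adjacent outputs. With stride 1 the
+// windows overlap, so only 6 input columns per row are loaded (a vec4 plus a
+// vec2) and the swizzles below rebuild the four overlapping 3-wide windows
+// before reducing across rows and then across columns.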
+#define POOLING3x3_STRIDE1(res, input, output) \
+ vec4 data00; \
+ vec2 data01; \
+ vec4 data10; \
+ vec2 data11; \
+ vec4 data20; \
+ vec2 data21; \
+ LOAD16(data00, input, tensor3D_offset(input, 0, 0, 0)); \
+ LOAD8(data01, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \
+ LOAD16(data10, input, tensor3D_offset(input, 0, 1, 0)); \
+ LOAD8(data11, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \
+ LOAD16(data20, input, tensor3D_offset(input, 0, 2, 0)); \
+ LOAD8(data21, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \
+ data00 = POW2_OP(data00, 4); \
+ data01 = POW2_OP(data01, 2); \
+ data10 = POW2_OP(data10, 4); \
+ data11 = POW2_OP(data11, 2); \
+ data20 = POW2_OP(data20, 4); \
+ data21 = POW2_OP(data21, 2); \
+ \
+ vec4 values000; \
+ vec4 values001; \
+ vec4 values010; \
+ vec4 values100; \
+ vec4 values101; \
+ vec4 values11; \
+ vec4 values200; \
+ vec4 values201; \
+ vec4 values21; \
+ values000.xyzw = data00.xyzy; \
+ values001.xyzw = data00.zwzw; \
+ values010.x = data01.x; \
+ values010.y = data00.w; \
+ values010.zw = data01.xy; \
+ values100.xyzw = data10.xyzy; \
+ values101.xyzw = data10.zwzw; \
+ values11.x = data11.x; \
+ values11.y = data10.w; \
+ values11.zw = data11.xy; \
+ values200.xyzw = data20.xyzy; \
+ values201.xyzw = data20.zwzw; \
+ values21.x = data21.x; \
+ values21.y = data20.w; \
+ values21.zw = data21.xy; \
+ POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
+ POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
+ POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+ POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
+
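+// Stride 2: the 4 output windows span 9 input columns per row, loaded as two
+// vec4s plus a trailing scalar before the same row/column reduction.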
+#define POOLING3x3_STRIDE2(res, input, output) \
+ vec4 data000; \
+ vec4 data001; \
+ float data010; \
+ vec4 data100; \
+ vec4 data101; \
+ float data11; \
+ vec4 data200; \
+ vec4 data201; \
+ float data21; \
+ LOAD16(data000, input, tensor3D_offset(input, 0, 0, 0)); \
+ LOAD16(data001, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \
+ data010 = LOAD4(input, tensor3D_offset(input, 0, 0, 0) + uint(8)); \
+ LOAD16(data100, input, tensor3D_offset(input, 0, 1, 0)); \
+ LOAD16(data101, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \
+ data11 = LOAD4(input, tensor3D_offset(input, 0, 1, 0) + uint(8)); \
+ LOAD16(data200, input, tensor3D_offset(input, 0, 2, 0)); \
+ LOAD16(data201, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \
+ data21 = LOAD4(input, tensor3D_offset(input, 0, 2, 0) + uint(8)); \
+ data000 = POW2_OP(data000, 4); \
+ data001 = POW2_OP(data001, 4); \
+ data010 = POW2_OP(data010, 1); \
+ data100 = POW2_OP(data100, 4); \
+ data101 = POW2_OP(data101, 4); \
+ data11 = POW2_OP(data11, 1); \
+ data200 = POW2_OP(data200, 4); \
+ data201 = POW2_OP(data201, 4); \
+ data21 = POW2_OP(data21, 1); \
+ \
+ vec4 values000; \
+ vec4 values001; \
+ vec4 values010; \
+ vec4 values100; \
+ vec4 values101; \
+ vec4 values11; \
+ vec4 values200; \
+ vec4 values201; \
+ vec4 values21; \
+ values000.xyzw = data000.xyzz; \
+ values001.xyzw = vec4(data000.w, data001.xxy); \
+ values010.xyzw = vec4(data001.zzw, data010); \
+ values100.xyzw = data100.xyzz; \
+ values101.xyzw = vec4(data100.w, data101.xxy); \
+ values11.xyzw = vec4(data101.zzw, data11); \
+ values200.xyzw = data200.xyzz; \
+ values201.xyzw = vec4(data200.w, data201.xxy); \
+ values21.xyzw = vec4(data201.zzw, data21); \
+ POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
+ POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
+ POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+ POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
+
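+// Stride 3: the windows no longer overlap, so the 12 input columns per row
+// (three vec4 loads) are reduced row-wise directly; only the final horizontal
+// reduction needs a swizzle.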
+#define POOLING3x3_STRIDE3(res, input, output) \
+ vec4 data000; \
+ vec4 data001; \
+ vec4 data010; \
+ vec4 data100; \
+ vec4 data101; \
+ vec4 data11; \
+ vec4 data200; \
+ vec4 data201; \
+ vec4 data21; \
+ LOAD16(data000, input, tensor3D_offset(input, 0, 0, 0)); \
+ LOAD16(data001, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \
+ LOAD16(data010, input, tensor3D_offset(input, 0, 0, 0) + uint(8)); \
+ LOAD16(data100, input, tensor3D_offset(input, 0, 1, 0)); \
+ LOAD16(data101, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \
+ LOAD16(data11, input, tensor3D_offset(input, 0, 1, 0) + uint(8)); \
+ LOAD16(data200, input, tensor3D_offset(input, 0, 2, 0)); \
+ LOAD16(data201, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \
+ LOAD16(data21, input, tensor3D_offset(input, 0, 2, 0) + uint(8)); \
+ data000 = POW2_OP(data000, 4); \
+ data001 = POW2_OP(data001, 4); \
+ data010 = POW2_OP(data010, 4); \
+ data100 = POW2_OP(data100, 4); \
+ data101 = POW2_OP(data101, 4); \
+ data11 = POW2_OP(data11, 4); \
+ data200 = POW2_OP(data200, 4); \
+ data201 = POW2_OP(data201, 4); \
+ data21 = POW2_OP(data21, 4); \
+ \
+ POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \
+ POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \
+ POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \
+ POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \
+ POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \
+ POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \
+ POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \
+ POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw))
+
+float calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+ int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+ int end_x = int(min(start_x + pool_size, upper_bound_w));
+ int end_y = int(min(start_y + pool_size, upper_bound_h));
+
+ float data_max;
+ data_max = LOAD4(src, tensor3D_offset(src, 0, 0, 0));
+
+ for(int i = 0; (start_x + i) < end_x; ++i)
+ {
+ for(int j = 0; (start_y + j) < end_y; ++j)
+ {
+ float data = LOAD4(src, tensor3D_offset(src, i, j, 0));
+ POOL_OP_float(data_max, data_max, data);
+ }
+ }
+
+ return data_max;
+}
+
+float calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+ int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+ int end_x = int(min(start_x + pool_size, upper_bound_w));
+ int end_y = int(min(start_y + pool_size, upper_bound_h));
+
+ float data_total = 0.0f;
+ for(int i = 0; (start_x + i) < end_x; i++)
+ {
+ for(int j = 0; (start_y + j) < end_y; ++j)
+ {
+ float data = LOAD4(src, tensor3D_offset(src, i, j, 0));
+ if(isnan(data))
+ {
+ data = 0.0f;
+ }
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data = POW2_OP(data, 1);
+#endif /* defined(POOL_L2) */
+ data_total = data_total + data;
+ }
+ }
+
+ return data_total / float((end_y - start_y) * (end_x - start_x));
+}
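+
+// Illustrative example: with pool_size = 3, stride = 2, pad = 0 and
+// MAX_WIDTH = MAX_HEIGHT = 6, the invocation at (2, 2) gets start = (4, 4),
+// end = (min(7, 6), min(7, 6)) = (6, 6), so it averages a clamped 2x2 region
+// and divides the sum by 4.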
+
+#ifdef POOLING_LAYER_2
+/** Performs a pooling function of pool size equal to 2.
+ *
+ * @note Supported data types are F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided, otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+ // Get pixels pointer
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ //Load and calculate data
+ float res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+ res = calculate_avg(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else /*POOL_AVG*/
+ res = calculate_max(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /*POOL_AVG*/
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ // Store result
+ STORE4(dst, CURRENT_OFFSET(dst), res);
+}
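+
+// Illustrative build configuration (host side, not part of this file): a 2x2
+// average pooling with stride 2 and no padding would be compiled with defines
+// along the lines of DATA_TYPE_FP32, POOLING_LAYER_2, POOL_AVG, STRIDE_X=2,
+// STRIDE_Y=2, PAD_X=0, PAD_Y=0, MAX_WIDTH=<width + pad_x>,
+// MAX_HEIGHT=<height + pad_y>, plus LOCAL_SIZE_X/Y/Z for the work-group
+// declaration at the top of the file.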
+
+#elif defined(POOLING_LAYER_3)
+/** Performs a pooling function of pool size equal to 3.
+ *
+ * @note Supported data types are F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided, otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+ // Get pixels pointer
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ //Load and calculate data
+ float res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+ res = calculate_avg(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else /*POOL_AVG*/
+ res = calculate_max(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /*POOL_AVG*/
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ // Store result
+ STORE4(dst, CURRENT_OFFSET(dst), res);
+}
+
+#elif defined(POOLING_LAYER_3_OPTIMIZED)
+/** Performs an optimized pooling function of pool size equal to 3 when stride_x is less than or equal to 3.
+ *
+ * @note Supported data types are F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided, otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+ // Get pixels pointer
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ vec4 res;
+ // Perform pooling 3x3 for 4 output elements
+#if STRIDE_X == 1
+ POOLING3x3_STRIDE1(res, src, dst);
+#elif STRIDE_X == 2
+ POOLING3x3_STRIDE2(res, src, dst);
+#elif STRIDE_X == 3
+ POOLING3x3_STRIDE3(res, src, dst);
+#endif /*STRIDE_X == 1*/
+
+ // Divide by pool region in case of average pooling
+#if defined(POOL_AVG) || defined(POOL_L2)
+ ivec4 start_x = ((ivec4(int(gl_GlobalInvocationID.x) * 4) + ivec4(0, 1, 2, 3)) * (ivec4(STRIDE_X))) - (ivec4(PAD_X));
+ int start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
+ ivec4 end_x = min((start_x + (ivec4(3))), (ivec4(MAX_WIDTH)));
+ int end_y = min((start_y + 3), MAX_HEIGHT);
+ res *= (vec4((1.f)) / vec4((ivec4(end_y - start_y)) * (end_x - start_x)));
+#endif /*POOL_AVG*/
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ STORE16(dst, CURRENT_OFFSET(dst), res);
+}
+
+#elif defined(POOLING_LAYER_7)
+/** Performs a pooling function of pool size equal to 7.
+ *
+ * @note Supported data types are F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided, otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+ // Get pixels pointer
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ //Load and calculate data
+ float res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+ res = calculate_avg(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else /*POOL_AVG*/
+ res = calculate_max(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /*POOL_AVG*/
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ // Store result
+ STORE4(dst, CURRENT_OFFSET(dst), res);
+}
+
+#elif defined(POOLING_LAYER_N)
+/** Performs a pooling function of pool size equal to N
+ *
+ * @note Supported data types are F32;
+ * @note Pool size must be passed using POOL_SIZE e.g. POOL_SIZE=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided, otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+ // Get pixels pointer
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ vec4 vdata0;
+ vdata0 = vec4(INITIAL_VALUE);
+ vec4 vdata1;
+ vdata1 = vec4(INITIAL_VALUE);
+ float sdata;
+ sdata = float(INITIAL_VALUE);
+
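+ // Accumulate the pooling window 8 elements per iteration into vdata0/vdata1;
+ // columns that do not fill a multiple of 8 go through the scalar leftover
+ // loop into sdata. The three accumulators are merged by the reduction below.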
+ for(int y = 0; y < int(POOL_SIZE); y++)
+ {
+ int x = 0;
+ for(; x <= (int(POOL_SIZE) - 8); x += 8)
+ {
+ vec4 data2;
+ vec4 data3;
+ LOAD16(data2, src, tensor3D_offset(src, x, y, 0));
+ LOAD16(data3, src, tensor3D_offset(src, x, y, 0) + uint(4));
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data2 *= data2;
+ data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+ POOL_OP(vdata0, vdata0, data2);
+ POOL_OP(vdata1, vdata1, data3);
+ }
+
+ // Leftover
+ for(; x < int(POOL_SIZE); ++x)
+ {
+ float data4 = LOAD4(src, tensor3D_offset(src, x, y, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data4 *= data4;
+#endif /* defined(POOL_L2) */
+ POOL_OP_float(sdata, sdata, data4);
+ }
+ }
+
+ //Reduce result
+ vec4 reduce4;
+ POOL_OP(reduce4, vdata0.xyzw, vdata1.xyzw);
+ vec2 reduce2;
+ POOL_OP_vec2(reduce2, reduce4.xy, reduce4.zw);
+ float res;
+ POOL_OP_float(res, reduce2.x, reduce2.y);
+ POOL_OP_float(res, res, sdata);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+ {
+ // Divide by pool region in case of average pooling
+ int start_x = int(gl_GlobalInvocationID.x) * STRIDE_X - PAD_X;
+ int start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
+ int end_x = int(min(start_x + POOL_SIZE, MAX_WIDTH));
+ int end_y = int(min(start_y + POOL_SIZE, MAX_HEIGHT));
+ float res1 = float((end_y - start_y) * (end_x - start_x));
+ res = DIV_OP(res, res1);
+ }
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ // Store result
+ STORE4(dst, CURRENT_OFFSET(dst), res);
+}
+#endif /* POOLING_LAYER_2 */
+
+#elif defined(DATA_TYPE_FP16)
+
+precision mediump float;
+
+vec2 load_and_unpack(Tensor3D, uint);
+vec2 calculate_max(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
+vec2 calculate_avg(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
+
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(dst, 2, uint, writeonly);
+
+layout(std140) uniform shader_params
+{
+ TENSOR3D_PARAM_DECLARATION(src);
+ TENSOR3D_PARAM_DECLARATION(dst);
+};
+
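+// In the FP16 path the buffers are declared as uint and each uint packs two
+// half-precision values: load_and_unpack/packHalf2x16 convert between packed
+// words and vec2, and the byte offsets returned by tensor3D_offset_fp16 are
+// shifted right by 2 to index those 4-byte words.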
+#define LOAD2_fp16(r, name, offset) \
+ r.xy = load_and_unpack(name, offset)
+
+#define LOAD4_fp16(r, name, offset) \
+ r.xy = load_and_unpack(name, offset); \
+ r.zw = load_and_unpack(name, offset + uint(1))
+
+#define STORE4_fp16(name, offset, r) \
+ uint datastore1; \
+ uint datastore2; \
+ datastore1 = uint(packHalf2x16(r.xy)); \
+ datastore2 = uint(packHalf2x16(r.zw)); \
+ STORE1(name, offset << uint(1), datastore1); \
+ STORE1(name, (offset << uint(1)) + uint(1), datastore2)
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(res, a, b) ((res) = (a) + (b))
+#define POOL_OP_float(res, a, b) (res = a + b)
+#define POOL_OP_vec2(res, a, b) ((res) = (a) + (b))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(res, a, b) \
+ (res) = (a); \
+ if(isnan(a.x) || (a.x < b.x)) \
+ { \
+ res.x = b.x; \
+ } \
+ if(isnan(a.y) || (a.y < b.y)) \
+ { \
+ res.y = b.y; \
+ } \
+ if(isnan(a.z) || (a.z < b.z)) \
+ { \
+ res.z = b.z; \
+ } \
+ if(isnan(a.w) || (a.w < b.w)) \
+ { \
+ res.w = b.w; \
+ }
+#define POOL_OP_float(res, a, b) \
+ (res) = (a); \
+ if(isnan(a) || (a < b)) \
+ { \
+ res = b; \
+ }
+#define POOL_OP_vec2(res, a, b) \
+ (res) = (a); \
+ if(isnan(a.x) || (a.x < b.x)) \
+ { \
+ res.x = b.x; \
+ } \
+ if(isnan(a.y) || (a.y < b.y)) \
+ { \
+ res.y = b.y; \
+ }
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(POOL_SIZE)
+// Set the initial value for the pooling operation according to the data type
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define INITIAL_VALUE 0.0f
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define INITIAL_VALUE -65504.0f
+#endif //POOL_AVG
+#endif //POOL_SIZE
+
+#define POOLING3x3_STRIDE1_fp16(res, input, output) \
+ vec4 data00; \
+ vec2 data01; \
+ vec4 data10; \
+ vec2 data11; \
+ vec4 data20; \
+ vec2 data21; \
+ LOAD4_fp16(data00, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \
+ LOAD2_fp16(data01, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \
+ LOAD4_fp16(data10, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \
+ LOAD2_fp16(data11, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \
+ LOAD4_fp16(data20, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \
+ LOAD2_fp16(data21, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \
+ data00 = POW2_OP(data00, 4); \
+ data01 = POW2_OP(data01, 2); \
+ data10 = POW2_OP(data10, 4); \
+ data11 = POW2_OP(data11, 2); \
+ data20 = POW2_OP(data20, 4); \
+ data21 = POW2_OP(data21, 2); \
+ \
+ vec4 values000; \
+ vec4 values001; \
+ vec4 values010; \
+ vec4 values100; \
+ vec4 values101; \
+ vec4 values11; \
+ vec4 values200; \
+ vec4 values201; \
+ vec4 values21; \
+ values000.xyzw = data00.xyzy; \
+ values001.xyzw = data00.zwzw; \
+ values010.x = data01.x; \
+ values010.y = data00.w; \
+ values010.zw = data01.xy; \
+ values100.xyzw = data10.xyzy; \
+ values101.xyzw = data10.zwzw; \
+ values11.x = data11.x; \
+ values11.y = data10.w; \
+ values11.zw = data11.xy; \
+ values200.xyzw = data20.xyzy; \
+ values201.xyzw = data20.zwzw; \
+ values21.x = data21.x; \
+ values21.y = data20.w; \
+ values21.zw = data21.xy; \
+ POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
+ POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
+ POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+ POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
+
+#define POOLING3x3_STRIDE2_fp16(res, input, output) \
+ vec4 data000; \
+ vec4 data001; \
+ float data010; \
+ vec4 data100; \
+ vec4 data101; \
+ float data11; \
+ vec4 data200; \
+ vec4 data201; \
+ float data21; \
+ vec2 datamiddle0; \
+ vec2 datamiddle1; \
+ vec2 datamiddle2; \
+ LOAD4_fp16(data000, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \
+ LOAD4_fp16(data001, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \
+ datamiddle0 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(4)); \
+ data010 = datamiddle0.x; \
+ LOAD4_fp16(data100, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \
+ LOAD4_fp16(data101, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \
+ datamiddle1 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(4)); \
+ data11 = datamiddle1.x; \
+ LOAD4_fp16(data200, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \
+ LOAD4_fp16(data201, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \
+ datamiddle2 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(4)); \
+ data21 = datamiddle2.x; \
+ data000 = POW2_OP(data000, 4); \
+ data001 = POW2_OP(data001, 4); \
+ data010 = POW2_OP(data010, 1); \
+ data100 = POW2_OP(data100, 4); \
+ data101 = POW2_OP(data101, 4); \
+ data11 = POW2_OP(data11, 1); \
+ data200 = POW2_OP(data200, 4); \
+ data201 = POW2_OP(data201, 4); \
+ data21 = POW2_OP(data21, 1); \
+ \
+ vec4 values000; \
+ vec4 values001; \
+ vec4 values010; \
+ vec4 values100; \
+ vec4 values101; \
+ vec4 values11; \
+ vec4 values200; \
+ vec4 values201; \
+ vec4 values21; \
+ values000.xyzw = data000.xyzz; \
+ values001.xyzw = vec4(data000.w, data001.xxy); \
+ values010.xyzw = vec4(data001.zzw, data010); \
+ values100.xyzw = data100.xyzz; \
+ values101.xyzw = vec4(data100.w, data101.xxy); \
+ values11.xyzw = vec4(data101.zzw, data11); \
+ values200.xyzw = data200.xyzz; \
+ values201.xyzw = vec4(data200.w, data201.xxy); \
+ values21.xyzw = vec4(data201.zzw, data21); \
+ POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
+ POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
+ POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+ POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
+
+#define POOLING3x3_STRIDE3_fp16(res, input, output) \
+ vec4 data000; \
+ vec4 data001; \
+ vec4 data010; \
+ vec4 data100; \
+ vec4 data101; \
+ vec4 data11; \
+ vec4 data200; \
+ vec4 data201; \
+ vec4 data21; \
+ LOAD4_fp16(data000, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \
+ LOAD4_fp16(data001, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \
+ LOAD4_fp16(data010, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(4)); \
+ LOAD4_fp16(data100, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \
+ LOAD4_fp16(data101, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \
+ LOAD4_fp16(data11, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(4)); \
+ LOAD4_fp16(data200, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \
+ LOAD4_fp16(data201, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \
+ LOAD4_fp16(data21, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(4)); \
+ data000 = POW2_OP(data000, 4); \
+ data001 = POW2_OP(data001, 4); \
+ data010 = POW2_OP(data010, 4); \
+ data100 = POW2_OP(data100, 4); \
+ data101 = POW2_OP(data101, 4); \
+ data11 = POW2_OP(data11, 4); \
+ data200 = POW2_OP(data200, 4); \
+ data201 = POW2_OP(data201, 4); \
+ data21 = POW2_OP(data21, 4); \
+ \
+ POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \
+ POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \
+ POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \
+ POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \
+ POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \
+ POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \
+ POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \
+ POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw))
+
+vec2 load_and_unpack(Tensor3D src, uint offset)
+{
+ uint packed_s;
+ vec2 s;
+ LOAD1(packed_s, src, offset);
+
+ s = vec2(unpackHalf2x16(packed_s));
+ return s;
+}
+
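+// The FP16 calculate_max/calculate_avg compute two horizontally adjacent
+// output elements per invocation: the .x lane pools the window starting at
+// start_x1 and the .y lane the neighbouring window at start_x1 + stride_x,
+// so the pair can later be re-packed into a single uint.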
+vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x1 = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+ int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+ int end_x1 = int(min(start_x1 + pool_size, upper_bound_w));
+ int end_y1 = int(min(start_y1 + pool_size, upper_bound_h));
+
+ int start_x2 = start_x1 + stride_x;
+ int start_y2 = start_y1;
+ int end_x2 = int(min(start_x2 + pool_size, upper_bound_w));
+ int end_y2 = int(min(start_y2 + pool_size, upper_bound_h));
+
+ //Initialize maximum
+ vec2 data_max = vec2(0);
+
+ //Load and Set initial maximum1
+ vec2 data_init1 = load_and_unpack(src, tensor3D_offset_fp16(src, 0, 0, 0) >> uint(2));
+ data_max.x = data_init1.x;
+
+ //Load and Set initial maximum2
+ if(end_x1 < upper_bound_w)
+ {
+ if((stride_x % 2) == 0)
+ {
+ vec2 data_init2 = load_and_unpack(src, tensor3D_offset_fp16(src, stride_x, 0, 0) >> uint(2));
+ data_max.y = data_init2.x;
+ }
+ else
+ {
+ vec2 data_init2 = load_and_unpack(src, tensor3D_offset_fp16(src, stride_x - 1, 0, 0) >> uint(2));
+ data_max.y = data_init2.y;
+ }
+ }
+
+ for(int i = 0; (start_y1 + i) < end_y1; i++)
+ for(int j = 0; (start_x1 + j) < end_x1; j = j + 2)
+ {
+ //Calculate maximum1
+ if((start_x1 + j + 1) < end_x1)
+ {
+ vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2));
+ float data_mr1;
+ POOL_OP_float(data_mr1, data1.x, data1.y);
+ POOL_OP_float(data_max.x, data_max.x, data_mr1);
+ }
+ else
+ {
+ vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2));
+ POOL_OP_float(data_max.x, data_max.x, data1.x);
+ }
+
+ //Calculate maximum2
+ if((start_x2 + j) < end_x2 && end_x1 < upper_bound_w)
+ {
+ if((stride_x % 2) == 0)
+ {
+ vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x), i, 0) >> uint(2)));
+
+ if((start_x2 + j + 1) < end_x2)
+ {
+ float data_mr2;
+ POOL_OP_float(data_mr2, data2.x, data2.y);
+ POOL_OP_float(data_max.y, data_max.y, data_mr2);
+ }
+ else
+ {
+ POOL_OP_float(data_max.y, data_max.y, data2.x);
+ }
+ }
+ else
+ {
+ vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x - 1), i, 0) >> uint(2)));
+ vec2 data3 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2)));
+ if((start_x2 + j + 1) < end_x2)
+ {
+ float data_mr2;
+ POOL_OP_float(data_mr2, data3.x, data2.y);
+ POOL_OP_float(data_max.y, data_max.y, data_mr2);
+ }
+ else
+ {
+ POOL_OP_float(data_max.y, data_max.y, data2.y);
+ }
+ }
+ }
+ }
+ return data_max;
+}
+
+vec2 calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x1 = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+ int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+ int end_x1 = int(min(start_x1 + pool_size, upper_bound_w));
+ int end_y1 = int(min(start_y1 + pool_size, upper_bound_h));
+
+ int start_x2 = start_x1 + stride_x;
+ int start_y2 = start_y1;
+ int end_x2 = int(min(start_x2 + pool_size, upper_bound_w));
+ int end_y2 = int(min(start_y2 + pool_size, upper_bound_h));
+
+ //Initialize sum
+ float data_total1 = float(0);
+ float data_total2 = float(0);
+ for(int i = 0; (start_y1 + i) < end_y1; i++)
+ for(int j = 0; (start_x1 + j) < end_x1; j = j + 2)
+ {
+ vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data1 = POW2_OP(data1, 2);
+#endif /* defined(POOL_L2) */
+ //Calculate sum1
+ if((start_x1 + j + 1) < end_x1)
+ {
+ data_total1 = data_total1 + data1.x + data1.y;
+ }
+ else
+ {
+ data_total1 = data_total1 + data1.x;
+ }
+
+ //Calculate sum2
+ if((start_x2 + j) < end_x2 && end_x1 < upper_bound_w)
+ {
+ if((stride_x % 2) == 0)
+ {
+ vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2)));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data2 = POW2_OP(data2, 2);
+#endif /* defined(POOL_L2) */
+ if((start_x2 + j + 1) < end_x2)
+ {
+ data_total2 = data_total2 + data2.x + data2.y;
+ }
+ else
+ {
+ data_total2 = data_total2 + data2.x;
+ }
+ }
+ else
+ {
+ vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x - 1), i, 0) >> uint(2)));
+ vec2 data3 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2)));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data2 = POW2_OP(data2, 2);
+ data3 = POW2_OP(data3, 2);
+#endif /* defined(POOL_L2) */
+ if((start_x2 + j + 1) < end_x2)
+ {
+ data_total2 = data_total2 + data3.x + data2.y;
+ }
+ else
+ {
+ data_total2 = data_total2 + data2.y;
+ }
+ }
+ }
+ }
+ //Calculate average
+ vec2 data_avg;
+ data_avg.x = data_total1 / float((end_y1 - start_y1) * (end_x1 - start_x1));
+ data_avg.y = data_total2 / float((end_y2 - start_y2) * (end_x2 - start_x2));
+
+ return data_avg;
+}
+
+#ifdef POOLING_LAYER_2
+/** Performs a pooling function of pool size equal to 2.
+ *
+ * @note Supported data types are F16;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided, otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+ // Get pixels pointer
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+ //Load and calculate data
+ vec2 data;
+ uint res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+ data = calculate_avg(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else /*POOL_AVG*/
+ data = calculate_max(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /*POOL_AVG*/
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ data = SQRT_OP(data);
+#endif /* defined(POOL_L2) */
+
+ res = uint(packHalf2x16(data));
+
+ // Store result
+ STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
+}
+
+#elif defined(POOLING_LAYER_3)
+/** Performs a pooling function of pool size equal to 3.
+ *
+ * @note Supported data types are F16;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided, otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+ // Get pixels pointer
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+ //Load and calculate data
+ vec2 data;
+ uint res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+ data = calculate_avg(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else /*POOL_AVG*/
+ data = calculate_max(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /*POOL_AVG*/
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ data = SQRT_OP(data);
+#endif /* defined(POOL_L2) */
+
+ res = uint(packHalf2x16(data));
+
+ // Store result
+ STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
+}
+
+#elif defined(POOLING_LAYER_3_OPTIMIZED)
+/** Performs an optimized pooling function of pool size equal to 3 when stride_x is less than or equal to 3.
+ *
+ * @note Supported data types are F16;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided, otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+ // Get pixels pointer
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+ vec4 res;
+ // Perform pooling 3x3 for 4 output elements
+#if STRIDE_X == 1
+ POOLING3x3_STRIDE1_fp16(res, src, dst);
+#elif STRIDE_X == 2
+ POOLING3x3_STRIDE2_fp16(res, src, dst);
+#elif STRIDE_X == 3
+ POOLING3x3_STRIDE3_fp16(res, src, dst);
+#endif /*STRIDE_X == 1*/
+
+ // Divide by pool region in case of average pooling
+#if defined(POOL_AVG) || defined(POOL_L2)
+ ivec4 start_x = ((ivec4(int(gl_GlobalInvocationID.x) * 4) + ivec4(0, 1, 2, 3)) * (ivec4(STRIDE_X))) - (ivec4(PAD_X));
+ int start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
+ ivec4 end_x = min((start_x + (ivec4(3))), (ivec4(MAX_WIDTH)));
+ int end_y = min((start_y + 3), MAX_HEIGHT);
+ res *= (vec4((1.f)) / vec4((ivec4(end_y - start_y)) * (end_x - start_x)));
+#endif /*POOL_AVG*/
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ STORE4_fp16(dst, CURRENT_OFFSET(dst) >> uint(3), res);
+}
+
+#elif defined(POOLING_LAYER_7)
+/** Performs a pooling function of pool size equal to 7.
+ *
+ * @note Supported data types are F16;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided, otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+ // Get pixels pointer
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+ //Load and calculate data
+ vec2 data;
+ uint res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+ data = calculate_avg(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else /*POOL_AVG*/
+ data = calculate_max(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /*POOL_AVG*/
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ data = SQRT_OP(data);
+#endif /* defined(POOL_L2) */
+
+ res = uint(packHalf2x16(data));
+
+ // Store result
+ STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
+}
+
+#elif defined(POOLING_LAYER_N)
+/** Performs a pooling function of pool size equal to N
+ *
+ * @note Supported data types are F16;
+ * @note Pool size must be passed using POOL_SIZE e.g. POOL_SIZE=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided, otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+ // Get pixels pointer
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+ vec4 vdata00;
+ vdata00 = vec4(INITIAL_VALUE);
+ vec4 vdata01;
+ vdata01 = vec4(INITIAL_VALUE);
+ vec4 vdata10;
+ vdata10 = vec4(INITIAL_VALUE);
+ vec4 vdata11;
+ vdata11 = vec4(INITIAL_VALUE);
+ vec2 sdata;
+ sdata = vec2(INITIAL_VALUE);
+
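+ // The first loop accumulates the left output's pooling window into
+ // vdata00/vdata10 (8 values at a time) and sdata.x (leftover); the second
+ // loop does the same for the neighbouring window offset by STRIDE_X columns,
+ // filling vdata01/vdata11 and sdata.y, so two FP16 results can be packed
+ // into one uint at the end.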
+ for(int y = 0; y < int(POOL_SIZE); y++)
+ {
+ int x = 0;
+ for(; x <= (int(POOL_SIZE) - 8); x += 8)
+ {
+ vec4 data2;
+ vec4 data3;
+ LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)));
+ LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)) + uint(2));
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data2 *= data2;
+ data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+ POOL_OP(vdata00, vdata00, data2);
+ POOL_OP(vdata10, vdata10, data3);
+ }
+
+ // Leftover
+ for(; x < int(POOL_SIZE); x = x + 2)
+ {
+ vec2 data4middle;
+ data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data4middle *= data4middle;
+#endif /* defined(POOL_L2) */
+ if((x + 1) >= int(POOL_SIZE))
+ {
+ POOL_OP_float(sdata.x, sdata.x, data4middle.x);
+ }
+ else
+ {
+ float data4;
+ POOL_OP_float(data4, data4middle.x, data4middle.y);
+ POOL_OP_float(sdata.x, sdata.x, data4);
+ }
+ }
+ }
+
+ for(int y = 0; y < int(POOL_SIZE); y++)
+ {
+ int x1 = STRIDE_X;
+ for(; x1 <= (int(POOL_SIZE + STRIDE_X) - 8); x1 += 8)
+ {
+ vec4 data2;
+ vec4 data3;
+ LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)));
+ LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)) + uint(2));
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data2 *= data2;
+ data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+ POOL_OP(vdata01, vdata01, data2);
+ POOL_OP(vdata11, vdata11, data3);
+ }
+
+ // Leftover
+ for(; x1 < int(POOL_SIZE + STRIDE_X); x1 = x1 + 2)
+ {
+ vec2 data4middle;
+ data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data4middle *= data4middle;
+#endif /* defined(POOL_L2) */
+ if((x1 + 1) >= int(POOL_SIZE + STRIDE_X))
+ {
+ POOL_OP_float(sdata.y, sdata.y, data4middle.x);
+ }
+ else
+ {
+ float data4;
+ POOL_OP_float(data4, data4middle.x, data4middle.y);
+ POOL_OP_float(sdata.y, sdata.y, data4);
+ }
+ }
+ }
+
+ //Reduce result
+ vec4 reduce40;
+ POOL_OP(reduce40, vdata00.xyzw, vdata10.xyzw);
+ vec2 reduce20;
+ POOL_OP_vec2(reduce20, reduce40.xy, reduce40.zw);
+ vec4 reduce41;
+ POOL_OP(reduce41, vdata01.xyzw, vdata11.xyzw);
+ vec2 reduce21;
+ POOL_OP_vec2(reduce21, reduce41.xy, reduce41.zw);
+ vec2 data;
+ POOL_OP_float(data.x, reduce20.x, reduce20.y);
+ POOL_OP_float(data.x, data.x, sdata.x);
+ POOL_OP_float(data.y, reduce21.x, reduce21.y);
+ POOL_OP_float(data.y, data.y, sdata.y);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+ {
+ // Divide by pool region in case of average pooling
+ int start_x1 = int(gl_GlobalInvocationID.x) * STRIDE_X - PAD_X;
+ int start_y1 = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
+ int end_x1 = int(min(start_x1 + POOL_SIZE, MAX_WIDTH));
+ int end_y1 = int(min(start_y1 + POOL_SIZE, MAX_HEIGHT));
+ int start_x2 = start_x1 + STRIDE_X;
+ int start_y2 = start_y1;
+ int end_x2 = int(min(start_x2 + POOL_SIZE, MAX_WIDTH));
+ int end_y2 = int(min(start_y2 + POOL_SIZE, MAX_HEIGHT));
+ vec2 res1;
+ res1.x = float((end_y1 - start_y1) * (end_x1 - start_x1));
+ res1.y = float((end_y2 - start_y2) * (end_x2 - start_x2));
+ data.x = DIV_OP(data.x, res1.x);
+ data.y = DIV_OP(data.y, res1.y);
+ }
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ data = SQRT_OP(data);
+#endif /* defined(POOL_L2) */
+ uint res;
+ res = uint(packHalf2x16(data));
+
+ // Store result
+ STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
+}
+#endif /*POOLING_LAYER_2*/
+#endif /*DATA_TYPE_FP32 */