aboutsummaryrefslogtreecommitdiff
path: root/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
diff options
context:
space:
mode:
authorJoel Liang <joel.liang@arm.com>2017-12-29 14:38:56 +0800
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:42:33 +0000
commitc5a7e59655b61ad617fa34a4fb00e1a007c8255a (patch)
treed895eb81f88ab33c38c7bbbc4cc30d8ed1f5842b /src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
parent08d5421bb9dac9c5531b1925ff8f30653e7c2e81 (diff)
downloadComputeLibrary-c5a7e59655b61ad617fa34a4fb00e1a007c8255a.tar.gz
APPBROWSER-365: Rewrite the pooling_layer.cs with the new common code
Change-Id: I88a500467a22b78b0be304cf4ab4605ea1d6927e Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/114724 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Diffstat (limited to 'src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs')
-rw-r--r--src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs1264
1 files changed, 395 insertions, 869 deletions
diff --git a/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
index 64767a7ef1..aa639b2eda 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
@@ -23,38 +23,37 @@
*/
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-#include "helpers.h"
-#if defined(DATA_TYPE_FP32)
-
-float calculate_max(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
-float calculate_avg(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
+#include "helpers_cs.h"
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, writeonly);
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
-layout(std140) uniform shader_params
+/** Performs a pooling function
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note The pool size must be passed at compile time using "#define POOLING_LAYER_n". e.g. "#define POOLING_LAYER_2"
+ * n must be one of these: 2, 3, 7, N
+ * Pool size must be passed using POOL_SIZE if POOLING_LAYER_N is defined. e.g. POOL_SIZE=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_attrs The attributes of the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination image
+ */
+SHADER_PARAMS_DECLARATION
{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
+ Tensor3DAttributes src_attrs;
+ Tensor3DAttributes dst_attrs;
};
-#define LOAD8(r, name, offset) \
- r.x = LOAD4(name, offset); \
- r.y = LOAD4(name, offset + uint(1))
-
-#define LOAD16(r, name, offset) \
- r.x = LOAD4(name, offset); \
- r.y = LOAD4(name, offset + uint(1)); \
- r.z = LOAD4(name, offset + uint(2)); \
- r.w = LOAD4(name, offset + uint(3))
-
-#define STORE16(name, offset, r) \
- STORE4(name, offset, r.x); \
- STORE4(name, offset + uint(1), r.y); \
- STORE4(name, offset + uint(2), r.z); \
- STORE4(name, offset + uint(3), r.w)
-
+// Common definitions
#if defined(POOL_AVG) || defined(POOL_L2)
#define POOL_OP(res, a, b) ((res) = (a) + (b))
#define POOL_OP_float(res, a, b) (res = a + b)
@@ -105,6 +104,14 @@ layout(std140) uniform shader_params
#define DIV_OP(x, y) (x * (1.f / y))
#define SQRT_OP(x) sqrt((x))
+#if defined(DATA_TYPE_FP32)
+
+float calculate_max(const int, Tensor3DIterator, const int, const int, const int, const int, const int, const int);
+float calculate_avg(const int, Tensor3DIterator, const int, const int, const int, const int, const int, const int);
+
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+
#if defined(POOL_SIZE)
// Set the initial value for the pooling operation accordingly with the data type
#if defined(POOL_AVG) || defined(POOL_L2)
@@ -114,154 +121,7 @@ layout(std140) uniform shader_params
#endif // POOL_AVG
#endif //POOL_SIZE
-#define POOLING3x3_STRIDE1(res, input, output) \
- vec4 data00; \
- vec2 data01; \
- vec4 data10; \
- vec2 data11; \
- vec4 data20; \
- vec2 data21; \
- LOAD16(data00, input, tensor3D_offset(input, 0, 0, 0)); \
- LOAD8(data01, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \
- LOAD16(data10, input, tensor3D_offset(input, 0, 1, 0)); \
- LOAD8(data11, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \
- LOAD16(data20, input, tensor3D_offset(input, 0, 2, 0)); \
- LOAD8(data21, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \
- data00 = POW2_OP(data00, 4); \
- data01 = POW2_OP(data01, 2); \
- data10 = POW2_OP(data10, 4); \
- data11 = POW2_OP(data11, 2); \
- data20 = POW2_OP(data20, 4); \
- data21 = POW2_OP(data21, 2); \
- \
- vec4 values000; \
- vec4 values001; \
- vec4 values010; \
- vec4 values100; \
- vec4 values101; \
- vec4 values11; \
- vec4 values200; \
- vec4 values201; \
- vec4 values21; \
- values000.xyzw = data00.xyzy; \
- values001.xyzw = data00.zwzw; \
- values010.x = data01.x; \
- values010.y = data00.w; \
- values010.zw = data01.xy; \
- values100.xyzw = data10.xyzy; \
- values101.xyzw = data10.zwzw; \
- values11.x = data11.x; \
- values11.y = data10.w; \
- values11.zw = data11.xy; \
- values200.xyzw = data20.xyzy; \
- values201.xyzw = data20.zwzw; \
- values21.x = data21.x; \
- values21.y = data20.w; \
- values21.zw = data21.xy; \
- POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
- POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
- POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
- POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
- POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
- POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
- POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
- POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
-
-#define POOLING3x3_STRIDE2(res, input, output) \
- vec4 data000; \
- vec4 data001; \
- float data010; \
- vec4 data100; \
- vec4 data101; \
- float data11; \
- vec4 data200; \
- vec4 data201; \
- float data21; \
- LOAD16(data000, input, tensor3D_offset(input, 0, 0, 0)); \
- LOAD16(data001, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \
- data010 = LOAD4(input, tensor3D_offset(input, 0, 0, 0) + uint(8)); \
- LOAD16(data100, input, tensor3D_offset(input, 0, 1, 0)); \
- LOAD16(data101, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \
- data11 = LOAD4(input, tensor3D_offset(input, 0, 1, 0) + uint(8)); \
- LOAD16(data200, input, tensor3D_offset(input, 0, 2, 0)); \
- LOAD16(data201, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \
- data21 = LOAD4(input, tensor3D_offset(input, 0, 2, 0) + uint(8)); \
- data000 = POW2_OP(data000, 4); \
- data001 = POW2_OP(data001, 4); \
- data010 = POW2_OP(data010, 1); \
- data100 = POW2_OP(data100, 4); \
- data101 = POW2_OP(data101, 4); \
- data11 = POW2_OP(data11, 1); \
- data200 = POW2_OP(data200, 4); \
- data201 = POW2_OP(data201, 4); \
- data21 = POW2_OP(data21, 1); \
- \
- vec4 values000; \
- vec4 values001; \
- vec4 values010; \
- vec4 values100; \
- vec4 values101; \
- vec4 values11; \
- vec4 values200; \
- vec4 values201; \
- vec4 values21; \
- values000.xyzw = data000.xyzz; \
- values001.xyzw = vec4(data000.w, data001.xxy); \
- values010.xyzw = vec4(data001.zzw, data010); \
- values100.xyzw = data100.xyzz; \
- values101.xyzw = vec4(data100.w, data101.xxy); \
- values11.xyzw = vec4(data101.zzw, data11); \
- values200.xyzw = data200.xyzz; \
- values201.xyzw = vec4(data200.w, data201.xxy); \
- values21.xyzw = vec4(data201.zzw, data21); \
- POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
- POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
- POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
- POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
- POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
- POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
- POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
- POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
-
-#define POOLING3x3_STRIDE3(res, input, output) \
- vec4 data000; \
- vec4 data001; \
- vec4 data010; \
- vec4 data100; \
- vec4 data101; \
- vec4 data11; \
- vec4 data200; \
- vec4 data201; \
- vec4 data21; \
- LOAD16(data000, input, tensor3D_offset(input, 0, 0, 0)); \
- LOAD16(data001, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \
- LOAD16(data010, input, tensor3D_offset(input, 0, 0, 0) + uint(8)); \
- LOAD16(data100, input, tensor3D_offset(input, 0, 1, 0)); \
- LOAD16(data101, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \
- LOAD16(data11, input, tensor3D_offset(input, 0, 1, 0) + uint(8)); \
- LOAD16(data200, input, tensor3D_offset(input, 0, 2, 0)); \
- LOAD16(data201, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \
- LOAD16(data21, input, tensor3D_offset(input, 0, 2, 0) + uint(8)); \
- data000 = POW2_OP(data000, 4); \
- data001 = POW2_OP(data001, 4); \
- data010 = POW2_OP(data010, 4); \
- data100 = POW2_OP(data100, 4); \
- data101 = POW2_OP(data101, 4); \
- data11 = POW2_OP(data11, 4); \
- data200 = POW2_OP(data200, 4); \
- data201 = POW2_OP(data201, 4); \
- data21 = POW2_OP(data21, 4); \
- \
- POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \
- POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \
- POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \
- POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \
- POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \
- POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \
- POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \
- POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw))
-
-float calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+float calculate_max(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
@@ -269,13 +129,13 @@ float calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w,
int end_y = int(min(start_y + pool_size, upper_bound_h));
float data_max;
- data_max = LOAD4(src, tensor3D_offset(src, 0, 0, 0));
+ data_max = LOAD_CURRENT_ITEM(src_ptr, src_iter);
for(int i = 0; (start_y + i) < end_y; ++i)
{
for(int j = 0; (start_x + j) < end_x; ++j)
{
- float data = LOAD4(src, tensor3D_offset(src, j, i, 0));
+ float data = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, j, i, 0));
POOL_OP_float(data_max, data_max, data);
}
}
@@ -283,7 +143,7 @@ float calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w,
return data_max;
}
-float calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+float calculate_avg(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
@@ -295,7 +155,7 @@ float calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w,
{
for(int j = 0; (start_y + j) < end_y; ++j)
{
- float data = LOAD4(src, tensor3D_offset(src, i, j, 0));
+ float data = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, i, j, 0));
if(isnan(data))
{
data = 0.0f;
@@ -316,45 +176,30 @@ float calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w,
return data_total / float((end_y - start_y) * (end_x - start_x));
}
-#ifdef POOLING_LAYER_2
-/** Performs a pooling function of pool size equal to 2.
- *
- * @note Supported data types are F32;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
+#if defined(POOLING_LAYER_2) || defined(POOLING_LAYER_3) || defined(POOLING_LAYER_7)
+
+#if defined(POOLING_LAYER_2)
+#define POOL_SIZE 2
+#elif defined(POOLING_LAYER_3)
+#define POOL_SIZE 3
+#elif defined(POOLING_LAYER_7)
+#define POOL_SIZE 7
+#else // POOLING_LAYER_n
+#error Please define POOLING_LAYER_N instead.
+#endif // POOLING_LAYER_n
+
void main(void)
{
// Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
//Load and calculate data
float res;
#if defined(POOL_AVG) || defined(POOL_L2)
- res = calculate_avg(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+ res = calculate_avg(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
#else /*POOL_AVG*/
- res = calculate_max(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+ res = calculate_max(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
#endif /*POOL_AVG*/
#if defined(POOL_L2)
@@ -363,100 +208,148 @@ void main(void)
#endif /* defined(POOL_L2) */
// Store result
- STORE4(dst, CURRENT_OFFSET(dst), res);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, res);
}
-#elif defined(POOLING_LAYER_3)
-/** Performs a pooling function of pool size equal to 3.
- *
- * @note Supported data types are F32;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-void main(void)
-{
- // Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+#elif defined(POOLING_LAYER_3_OPTIMIZED)
- //Load and calculate data
- float res;
-#if defined(POOL_AVG) || defined(POOL_L2)
- res = calculate_avg(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#else /*POOL_AVG*/
- res = calculate_max(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#endif /*POOL_AVG*/
+#define POOLING3x3_STRIDE1(res, input_ptr, input_iter) \
+ vec4 data00 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
+ vec2 data01 = VLOAD2(vec2, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
+ vec4 data10 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
+ vec2 data11 = VLOAD2(vec2, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
+ vec4 data20 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
+ vec2 data21 = VLOAD2(vec2, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
+ data00 = POW2_OP(data00, 4); \
+ data01 = POW2_OP(data01, 2); \
+ data10 = POW2_OP(data10, 4); \
+ data11 = POW2_OP(data11, 2); \
+ data20 = POW2_OP(data20, 4); \
+ data21 = POW2_OP(data21, 2); \
+ \
+ vec4 values000; \
+ vec4 values001; \
+ vec4 values010; \
+ vec4 values100; \
+ vec4 values101; \
+ vec4 values11; \
+ vec4 values200; \
+ vec4 values201; \
+ vec4 values21; \
+ values000.xyzw = data00.xyzy; \
+ values001.xyzw = data00.zwzw; \
+ values010.x = data01.x; \
+ values010.y = data00.w; \
+ values010.zw = data01.xy; \
+ values100.xyzw = data10.xyzy; \
+ values101.xyzw = data10.zwzw; \
+ values11.x = data11.x; \
+ values11.y = data10.w; \
+ values11.zw = data11.xy; \
+ values200.xyzw = data20.xyzy; \
+ values201.xyzw = data20.zwzw; \
+ values21.x = data21.x; \
+ values21.y = data20.w; \
+ values21.zw = data21.xy; \
+ POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
+ POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
+ POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+ POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
+#define POOLING3x3_STRIDE2(res, input_ptr, input_iter) \
+ vec4 data000 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
+ vec4 data001 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
+ float data010 = LOAD(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(8)); \
+ vec4 data100 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
+ vec4 data101 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
+ float data11 = LOAD(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(8)); \
+ vec4 data200 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
+ vec4 data201 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
+ float data21 = LOAD(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(8)); \
+ data000 = POW2_OP(data000, 4); \
+ data001 = POW2_OP(data001, 4); \
+ data010 = POW2_OP(data010, 1); \
+ data100 = POW2_OP(data100, 4); \
+ data101 = POW2_OP(data101, 4); \
+ data11 = POW2_OP(data11, 1); \
+ data200 = POW2_OP(data200, 4); \
+ data201 = POW2_OP(data201, 4); \
+ data21 = POW2_OP(data21, 1); \
+ \
+ vec4 values000; \
+ vec4 values001; \
+ vec4 values010; \
+ vec4 values100; \
+ vec4 values101; \
+ vec4 values11; \
+ vec4 values200; \
+ vec4 values201; \
+ vec4 values21; \
+ values000.xyzw = data000.xyzz; \
+ values001.xyzw = vec4(data000.w, data001.xxy); \
+ values010.xyzw = vec4(data001.zzw, data010); \
+ values100.xyzw = data100.xyzz; \
+ values101.xyzw = vec4(data100.w, data101.xxy); \
+ values11.xyzw = vec4(data101.zzw, data11); \
+ values200.xyzw = data200.xyzz; \
+ values201.xyzw = vec4(data200.w, data201.xxy); \
+ values21.xyzw = vec4(data201.zzw, data21); \
+ POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
+ POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
+ POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+ POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
- // Store result
- STORE4(dst, CURRENT_OFFSET(dst), res);
-}
+#define POOLING3x3_STRIDE3(res, input_ptr, input_iter) \
+ vec4 data000 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
+ vec4 data001 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
+ vec4 data010 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(8)); \
+ vec4 data100 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
+ vec4 data101 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
+ vec4 data11 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(8)); \
+ vec4 data200 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
+ vec4 data201 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
+ vec4 data21 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(8)); \
+ data000 = POW2_OP(data000, 4); \
+ data001 = POW2_OP(data001, 4); \
+ data010 = POW2_OP(data010, 4); \
+ data100 = POW2_OP(data100, 4); \
+ data101 = POW2_OP(data101, 4); \
+ data11 = POW2_OP(data11, 4); \
+ data200 = POW2_OP(data200, 4); \
+ data201 = POW2_OP(data201, 4); \
+ data21 = POW2_OP(data21, 4); \
+ \
+ POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \
+ POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \
+ POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \
+ POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \
+ POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \
+ POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \
+ POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \
+ POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw))
-#elif defined(POOLING_LAYER_3_OPTIMIZED)
-/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3
- *
- * @note Supported data types are F32;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
void main(void)
{
// Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
vec4 res;
// Perform pooling 3x3 for 4 output elements
#if STRIDE_X == 1
- POOLING3x3_STRIDE1(res, src, dst);
+ POOLING3x3_STRIDE1(res, src_ptr, src_iter);
#elif STRIDE_X == 2
- POOLING3x3_STRIDE2(res, src, dst);
+ POOLING3x3_STRIDE2(res, src_ptr, src_iter);
#elif STRIDE_X == 3
- POOLING3x3_STRIDE3(res, src, dst);
+ POOLING3x3_STRIDE3(res, src_ptr, src_iter);
#endif /*STRIDE_X == 1*/
// Divide by pool region in case of average pooling
@@ -477,109 +370,28 @@ void main(void)
res = SQRT_OP(res);
#endif /* defined(POOL_L2) */
- STORE16(dst, CURRENT_OFFSET(dst), res);
-}
-
-#elif defined(POOLING_LAYER_7)
-/** Performs a pooling function of pool size equal to 7.
- *
- * @note Supported data types are F32;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-void main(void)
-{
- // Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- //Load and calculate data
- float res;
-#if defined(POOL_AVG) || defined(POOL_L2)
- res = calculate_avg(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#else /*POOL_AVG*/
- res = calculate_max(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#endif /*POOL_AVG*/
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- // Store result
- STORE4(dst, CURRENT_OFFSET(dst), res);
+ VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, res);
}
#elif defined(POOLING_LAYER_N)
-/** Performs a pooling function of pool size equal to N
- *
- * @note Supported data types are F32;
- * @note Pool size must be passed using POOL_SIZE e.g. POOL_SIZE=13;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
+
void main(void)
{
// Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
- vec4 vdata0;
- vdata0 = vec4(INITIAL_VALUE);
- vec4 vdata1;
- vdata1 = vec4(INITIAL_VALUE);
- float sdata;
- sdata = float(INITIAL_VALUE);
+ vec4 vdata0 = vec4(INITIAL_VALUE);
+ vec4 vdata1 = vec4(INITIAL_VALUE);
+ float sdata = float(INITIAL_VALUE);
for(int y = 0; y < int(POOL_SIZE); y++)
{
int x = 0;
for(; x <= (int(POOL_SIZE) - 8); x += 8)
{
- vec4 data2;
- vec4 data3;
- LOAD16(data2, src, tensor3D_offset(src, x, y, 0));
- LOAD16(data3, src, tensor3D_offset(src, x, y, 0) + uint(4));
+ vec4 data2 = VLOAD4(vec4, src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0));
+ vec4 data3 = VLOAD4(vec4, src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0) + uint(4));
#if defined(POOL_L2)
// Raise to power of 2 for L2 Pooling
@@ -594,7 +406,7 @@ void main(void)
// Leftover
for(; x < int(POOL_SIZE); ++x)
{
- float data4 = LOAD4(src, tensor3D_offset(src, x, y, 0));
+ float data4 = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0));
#if defined(POOL_L2)
// Raise to power of 2 for L2 Pooling
data4 *= data4;
@@ -634,91 +446,17 @@ void main(void)
#endif /* defined(POOL_L2) */
// Store result
- STORE4(dst, CURRENT_OFFSET(dst), res);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, res);
}
-#endif /* POOLING_LAYER_2 */
+#endif // POOLING_LAYER_N
#elif defined(DATA_TYPE_FP16)
-precision mediump float;
-
-vec2 load_and_unpack(Tensor3D, uint);
-vec2 calculate_max(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
-vec2 calculate_avg(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
-
-BUFFER_DECLARATION(src, 1, uint, readonly);
-BUFFER_DECLARATION(dst, 2, uint, writeonly);
// Forward declarations of the FP16 pooling helpers defined further down in this section.
// Both return a vec2. Per the definitions, the parameters are, in order:
// pool_size, src_iter, upper_bound_w, upper_bound_h, pad_x, pad_y, stride_x, stride_y.
+vec2 calculate_max(const int, Tensor3DIterator, const int, const int, const int, const int, const int, const int);
+vec2 calculate_avg(const int, Tensor3DIterator, const int, const int, const int, const int, const int, const int);
-layout(std140) uniform shader_params
-{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
-};
-
-#define LOAD2_fp16(r, name, offset) \
- r.xy = load_and_unpack(name, offset)
-
-#define LOAD4_fp16(r, name, offset) \
- r.xy = load_and_unpack(name, offset); \
- r.zw = load_and_unpack(name, offset + uint(1))
-
-#define STORE4_fp16(name, offset, r) \
- uint datastore1; \
- uint datastore2; \
- datastore1 = uint(packHalf2x16(r.xy)); \
- datastore2 = uint(packHalf2x16(r.zw)); \
- STORE1(name, offset << uint(1), datastore1); \
- STORE1(name, (offset << uint(1)) + uint(1), datastore2)
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-#define POOL_OP(res, a, b) ((res) = (a) + (b))
-#define POOL_OP_float(res, a, b) (res = a + b)
-#define POOL_OP_vec2(res, a, b) ((res) = (a) + (b))
-#else /* defined(POOL_AVG) || defined(POOL_L2) */
-#define POOL_OP(res, a, b) \
- (res) = (a); \
- if(isnan(a.x) || (a.x < b.x)) \
- { \
- res.x = b.x; \
- } \
- if(isnan(a.y) || (a.y < b.y)) \
- { \
- res.y = b.y; \
- } \
- if(isnan(a.z) || (a.z < b.z)) \
- { \
- res.z = b.z; \
- } \
- if(isnan(a.w) || (a.w < b.w)) \
- { \
- res.w = b.w; \
- }
-#define POOL_OP_float(res, a, b) \
- (res) = (a); \
- if(isnan(a) || (a < b)) \
- { \
- res = b; \
- }
-#define POOL_OP_vec2(res, a, b) \
- (res) = (a); \
- if(isnan(a.x) || (a.x < b.x)) \
- { \
- res.x = b.x; \
- } \
- if(isnan(a.y) || (a.y < b.y)) \
- { \
- res.y = b.y; \
- }
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
-#define POW2_OP(x, vec_size) ((x) * (x))
-#else /* defined(POOL_L2) */
-#define POW2_OP(x, vec_size) (x)
-#endif /* defined(POOL_L2) */
-
-#define DIV_OP(x, y) (x * (1.f / y))
-#define SQRT_OP(x) sqrt((x))
// FP16 source (readonly) and destination (writeonly) tensors, both declared with uint elements.
// The kernels below read/write them through the *_HALF pack/unpack helpers (e.g. LOAD_UNPACK2_HALF).
// NOTE(review): the leading 1/2 presumably are the buffer binding points and the literal 2 the
// element-size shift paired with src_shift/dst_shift -- confirm against TENSOR_DECLARATION in helpers_cs.h.
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
#if defined(POOL_SIZE)
// Set the initial value for the pooling operation accordingly with the data type
@@ -729,170 +467,7 @@ layout(std140) uniform shader_params
#endif //POOL_AVG
#endif //POOL_SIZE
-#define POOLING3x3_STRIDE1_fp16(res, input, output) \
- vec4 data00; \
- vec2 data01; \
- vec4 data10; \
- vec2 data11; \
- vec4 data20; \
- vec2 data21; \
- LOAD4_fp16(data00, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \
- LOAD2_fp16(data01, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \
- LOAD4_fp16(data10, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \
- LOAD2_fp16(data11, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \
- LOAD4_fp16(data20, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \
- LOAD2_fp16(data21, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \
- data00 = POW2_OP(data00, 4); \
- data01 = POW2_OP(data01, 2); \
- data10 = POW2_OP(data10, 4); \
- data11 = POW2_OP(data11, 2); \
- data20 = POW2_OP(data20, 4); \
- data21 = POW2_OP(data21, 2); \
- \
- vec4 values000; \
- vec4 values001; \
- vec4 values010; \
- vec4 values100; \
- vec4 values101; \
- vec4 values11; \
- vec4 values200; \
- vec4 values201; \
- vec4 values21; \
- values000.xyzw = data00.xyzy; \
- values001.xyzw = data00.zwzw; \
- values010.x = data01.x; \
- values010.y = data00.w; \
- values010.zw = data01.xy; \
- values100.xyzw = data10.xyzy; \
- values101.xyzw = data10.zwzw; \
- values11.x = data11.x; \
- values11.y = data10.w; \
- values11.zw = data11.xy; \
- values200.xyzw = data20.xyzy; \
- values201.xyzw = data20.zwzw; \
- values21.x = data21.x; \
- values21.y = data20.w; \
- values21.zw = data21.xy; \
- POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
- POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
- POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
- POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
- POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
- POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
- POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
- POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
-
-#define POOLING3x3_STRIDE2_fp16(res, input, output) \
- vec4 data000; \
- vec4 data001; \
- float data010; \
- vec4 data100; \
- vec4 data101; \
- float data11; \
- vec4 data200; \
- vec4 data201; \
- float data21; \
- vec2 datamiddle0; \
- vec2 datamiddle1; \
- vec2 datamiddle2; \
- LOAD4_fp16(data000, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \
- LOAD4_fp16(data001, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \
- datamiddle0 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(4)); \
- data010 = datamiddle0.x; \
- LOAD4_fp16(data100, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \
- LOAD4_fp16(data101, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \
- datamiddle1 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(4)); \
- data11 = datamiddle1.x; \
- LOAD4_fp16(data200, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \
- LOAD4_fp16(data201, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \
- datamiddle2 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(4)); \
- data21 = datamiddle2.x; \
- data000 = POW2_OP(data000, 4); \
- data001 = POW2_OP(data001, 4); \
- data010 = POW2_OP(data010, 1); \
- data100 = POW2_OP(data100, 4); \
- data101 = POW2_OP(data101, 4); \
- data11 = POW2_OP(data11, 1); \
- data200 = POW2_OP(data200, 4); \
- data201 = POW2_OP(data201, 4); \
- data21 = POW2_OP(data21, 1); \
- \
- vec4 values000; \
- vec4 values001; \
- vec4 values010; \
- vec4 values100; \
- vec4 values101; \
- vec4 values11; \
- vec4 values200; \
- vec4 values201; \
- vec4 values21; \
- values000.xyzw = data000.xyzz; \
- values001.xyzw = vec4(data000.w, data001.xxy); \
- values010.xyzw = vec4(data001.zzw, data010); \
- values100.xyzw = data100.xyzz; \
- values101.xyzw = vec4(data100.w, data101.xxy); \
- values11.xyzw = vec4(data101.zzw, data11); \
- values200.xyzw = data200.xyzz; \
- values201.xyzw = vec4(data200.w, data201.xxy); \
- values21.xyzw = vec4(data201.zzw, data21); \
- POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
- POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
- POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
- POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
- POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
- POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
- POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
- POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
-
-#define POOLING3x3_STRIDE3_fp16(res, input, output) \
- vec4 data000; \
- vec4 data001; \
- vec4 data010; \
- vec4 data100; \
- vec4 data101; \
- vec4 data11; \
- vec4 data200; \
- vec4 data201; \
- vec4 data21; \
- LOAD4_fp16(data000, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \
- LOAD4_fp16(data001, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \
- LOAD4_fp16(data010, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(4)); \
- LOAD4_fp16(data100, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \
- LOAD4_fp16(data101, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \
- LOAD4_fp16(data11, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(4)); \
- LOAD4_fp16(data200, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \
- LOAD4_fp16(data201, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \
- LOAD4_fp16(data21, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(4)); \
- data000 = POW2_OP(data000, 4); \
- data001 = POW2_OP(data001, 4); \
- data010 = POW2_OP(data010, 4); \
- data100 = POW2_OP(data100, 4); \
- data101 = POW2_OP(data101, 4); \
- data11 = POW2_OP(data11, 4); \
- data200 = POW2_OP(data200, 4); \
- data201 = POW2_OP(data201, 4); \
- data21 = POW2_OP(data21, 4); \
- \
- POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \
- POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \
- POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \
- POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \
- POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \
- POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \
- POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \
- POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw))
-
-vec2 load_and_unpack(Tensor3D src, uint offset)
-{
- uint packed_s;
- vec2 s;
- LOAD1(packed_s, src, offset);
-
- s = vec2(unpackHalf2x16(packed_s));
- return s;
-}
-
-vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+vec2 calculate_max(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
int start_x1 = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
@@ -908,7 +483,7 @@ vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, c
vec2 data_max = vec2(0);
//Load and Set initial maximum1
- vec2 data_init1 = load_and_unpack(src, tensor3D_offset_fp16(src, 0, 0, 0) >> uint(2));
+ vec2 data_init1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
data_max.x = data_init1.x;
//Load and Set initial maximum2
@@ -916,12 +491,12 @@ vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, c
{
if((stride_x % 2) == 0)
{
- vec2 data_init2 = load_and_unpack(src, tensor3D_offset_fp16(src, stride_x, 0, 0) >> uint(2));
+ vec2 data_init2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, stride_x, 0, 0));
data_max.y = data_init2.x;
}
else
{
- vec2 data_init2 = load_and_unpack(src, tensor3D_offset_fp16(src, stride_x - 1, 0, 0) >> uint(2));
+ vec2 data_init2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, stride_x - 1, 0, 0));
data_max.y = data_init2.y;
}
}
@@ -932,14 +507,14 @@ vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, c
//Calculate maximum1
if((start_x1 + j + 1) < end_x1)
{
- vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2));
+ vec2 data1 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, j, i, 0));
float data_mr1;
POOL_OP_float(data_mr1, data1.x, data1.y);
POOL_OP_float(data_max.x, data_max.x, data_mr1);
}
else
{
- vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2));
+ vec2 data1 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, j, i, 0));
POOL_OP_float(data_max.x, data_max.x, data1.x);
}
@@ -948,7 +523,7 @@ vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, c
{
if((stride_x % 2) == 0)
{
- vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x), i, 0) >> uint(2)));
+ vec2 data2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x), i, 0));
if((start_x2 + j + 1) < end_x2)
{
@@ -963,8 +538,8 @@ vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, c
}
else
{
- vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x - 1), i, 0) >> uint(2)));
- vec2 data3 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2)));
+ vec2 data2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x - 1), i, 0));
+ vec2 data3 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x + 1), i, 0));
if((start_x2 + j + 1) < end_x2)
{
float data_mr2;
@@ -981,7 +556,7 @@ vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, c
return data_max;
}
-vec2 calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+vec2 calculate_avg(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
int start_x1 = (2 * int(gl_GlobalInvocationID.x)) * stride_x - pad_x;
int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
@@ -999,7 +574,7 @@ vec2 calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, c
for(int i = 0; (start_y1 + i) < end_y1; i++)
for(int j = 0; (start_x1 + j) < end_x1; j = j + 2)
{
- vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2));
+ vec2 data1 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, j, i, 0));
#if defined(POOL_L2)
// Raise to power of 2 for L2 Pooling
data1 = POW2_OP(data1, 2);
@@ -1019,7 +594,7 @@ vec2 calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, c
{
if((stride_x % 2) == 0)
{
- vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x), i, 0) >> uint(2)));
+ vec2 data2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x), i, 0));
#if defined(POOL_L2)
// Raise to power of 2 for L2 Pooling
data2 = POW2_OP(data2, 2);
@@ -1035,8 +610,8 @@ vec2 calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, c
}
else
{
- vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x - 1), i, 0) >> uint(2)));
- vec2 data3 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2)));
+ vec2 data2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x - 1), i, 0));
+ vec2 data3 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x + 1), i, 0));
#if defined(POOL_L2)
// Raise to power of 2 for L2 Pooling
data2 = POW2_OP(data2, 2);
@@ -1068,46 +643,30 @@ vec2 calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, c
return data_avg;
}
-#ifdef POOLING_LAYER_2
-/** Performs a pooling function of pool size equal to 2.
- *
- * @note Supported data types are F16;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
+#if defined(POOLING_LAYER_2) || defined(POOLING_LAYER_3) || defined(POOLING_LAYER_7)
+
// Derive POOL_SIZE (2, 3 or 7) from whichever POOLING_LAYER_n build define is set, so that the
// generic FP16 main() below can pass it to calculate_avg()/calculate_max().
// The enclosing #if already requires one of these three defines, so the #else/#error branch is
// only a defensive guard and is not expected to be reached.
+#if defined(POOLING_LAYER_2)
+#define POOL_SIZE 2
+#elif defined(POOLING_LAYER_3)
+#define POOL_SIZE 3
+#elif defined(POOLING_LAYER_7)
+#define POOL_SIZE 7
+#else // POOLING_LAYER_n
+#error Please define POOLING_LAYER_N instead.
+#endif // POOLING_LAYER_n
+
void main(void)
{
// Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
//Load and calculate data
vec2 data;
- uint res;
#if defined(POOL_AVG) || defined(POOL_L2)
- data = calculate_avg(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+ data = calculate_avg(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
#else /*POOL_AVG*/
- data = calculate_max(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+ data = calculate_max(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
#endif /*POOL_AVG*/
#if defined(POOL_L2)
@@ -1115,106 +674,164 @@ void main(void)
data = SQRT_OP(data);
#endif /* defined(POOL_L2) */
- res = uint(packHalf2x16(data));
-
// Store result
- STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
+ STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, data);
}
-#elif defined(POOLING_LAYER_3)
-/** Performs a pooling function of pool size equal to 3.
- *
- * @note Supported data types are F16;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-void main(void)
-{
- // Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
-
- //Load and calculate data
- vec2 data;
- uint res;
-#if defined(POOL_AVG) || defined(POOL_L2)
- data = calculate_avg(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#else /*POOL_AVG*/
- data = calculate_max(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#endif /*POOL_AVG*/
+#elif defined(POOLING_LAYER_3_OPTIMIZED)
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- data = SQRT_OP(data);
-#endif /* defined(POOL_L2) */
/** 3x3 pooling of FP16 data producing 4 horizontally adjacent outputs, for STRIDE_X == 1.
 *
 * For each of the three rows (y offsets 0, 1, 2) six consecutive values d0..d5 are loaded:
 * a vec4 via VLOAD2_UNPACK4_HALF plus a vec2 via LOAD_UNPACK2_HALF at offset + uint(2).
 * Every loaded value goes through POW2_OP, rows 1 and 2 are folded into row 0 with POOL_OP,
 * and the final two POOL_OP calls combine the columns so that
 *   res[i] = POOL_OP over (d[i], d[i+1], d[i+2]), i = 0..3.
 *
 * @param res        vec4 that receives the four results (written through res.xyzw)
 * @param input_ptr  buffer handed to the *_HALF load helpers
 * @param input_iter iterator handed to TENSOR3D_OFFSET
 *
 * @note The macro declares locals (data00, values000, ...) in the invoking scope, so it can be
 *       expanded at most once per scope.
 * @note POW2_OP and POOL_OP are defined outside this excerpt.
 * NOTE(review): the "+ uint(2)" step presumably counts packed 32-bit words (two halves each) -- confirm in helpers_cs.h.
 */
+#define POOLING3x3_STRIDE1_fp16(res, input_ptr, input_iter) \
+ vec4 data00 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
+ vec2 data01 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(2)); \
+ vec4 data10 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
+ vec2 data11 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(2)); \
+ vec4 data20 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
+ vec2 data21 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(2)); \
+ data00 = POW2_OP(data00, 4); \
+ data01 = POW2_OP(data01, 2); \
+ data10 = POW2_OP(data10, 4); \
+ data11 = POW2_OP(data11, 2); \
+ data20 = POW2_OP(data20, 4); \
+ data21 = POW2_OP(data21, 2); \
+ \
+ vec4 values000; \
+ vec4 values001; \
+ vec4 values010; \
+ vec4 values100; \
+ vec4 values101; \
+ vec4 values11; \
+ vec4 values200; \
+ vec4 values201; \
+ vec4 values21; \
+ values000.xyzw = data00.xyzy; \
+ values001.xyzw = data00.zwzw; \
+ values010.x = data01.x; \
+ values010.y = data00.w; \
+ values010.zw = data01.xy; \
+ values100.xyzw = data10.xyzy; \
+ values101.xyzw = data10.zwzw; \
+ values11.x = data11.x; \
+ values11.y = data10.w; \
+ values11.zw = data11.xy; \
+ values200.xyzw = data20.xyzy; \
+ values201.xyzw = data20.zwzw; \
+ values21.x = data21.x; \
+ values21.y = data20.w; \
+ values21.zw = data21.xy; \
+ POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
+ POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
+ POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+ POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
- res = uint(packHalf2x16(data));
/** 3x3 pooling of FP16 data producing 4 horizontally adjacent outputs, for STRIDE_X == 2.
 *
 * For each of the three rows (y offsets 0, 1, 2) nine consecutive values d0..d8 are used:
 * two vec4 loads via VLOAD2_UNPACK4_HALF (at offset and offset + uint(2)) plus the .x component
 * of a LOAD_UNPACK2_HALF at offset + uint(4); the .y component of that last load is discarded.
 * Every value goes through POW2_OP, rows 1 and 2 are folded into row 0 with POOL_OP, and the
 * final two POOL_OP calls combine the columns so that
 *   res[i] = POOL_OP over (d[2i], d[2i+1], d[2i+2]), i = 0..3.
 *
 * @param res        vec4 that receives the four results (written through res.xyzw)
 * @param input_ptr  buffer handed to the *_HALF load helpers
 * @param input_iter iterator handed to TENSOR3D_OFFSET
 *
 * @note The macro declares locals (data000, datamiddle0, values000, ...) in the invoking scope,
 *       so it can be expanded at most once per scope.
 * @note POW2_OP and POOL_OP are defined outside this excerpt.
 * NOTE(review): the uint(2)/uint(4) steps presumably count packed 32-bit words (two halves each) -- confirm in helpers_cs.h.
 */
+#define POOLING3x3_STRIDE2_fp16(res, input_ptr, input_iter) \
+ vec4 data000; \
+ vec4 data001; \
+ float data010; \
+ vec4 data100; \
+ vec4 data101; \
+ float data11; \
+ vec4 data200; \
+ vec4 data201; \
+ float data21; \
+ vec2 datamiddle0; \
+ vec2 datamiddle1; \
+ vec2 datamiddle2; \
+ data000 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
+ data001 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(2)); \
+ datamiddle0 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
+ data010 = datamiddle0.x; \
+ data100 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
+ data101 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(2)); \
+ datamiddle1 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
+ data11 = datamiddle1.x; \
+ data200 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
+ data201 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(2)); \
+ datamiddle2 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
+ data21 = datamiddle2.x; \
+ data000 = POW2_OP(data000, 4); \
+ data001 = POW2_OP(data001, 4); \
+ data010 = POW2_OP(data010, 1); \
+ data100 = POW2_OP(data100, 4); \
+ data101 = POW2_OP(data101, 4); \
+ data11 = POW2_OP(data11, 1); \
+ data200 = POW2_OP(data200, 4); \
+ data201 = POW2_OP(data201, 4); \
+ data21 = POW2_OP(data21, 1); \
+ \
+ vec4 values000; \
+ vec4 values001; \
+ vec4 values010; \
+ vec4 values100; \
+ vec4 values101; \
+ vec4 values11; \
+ vec4 values200; \
+ vec4 values201; \
+ vec4 values21; \
+ values000.xyzw = data000.xyzz; \
+ values001.xyzw = vec4(data000.w, data001.xxy); \
+ values010.xyzw = vec4(data001.zzw, data010); \
+ values100.xyzw = data100.xyzz; \
+ values101.xyzw = vec4(data100.w, data101.xxy); \
+ values11.xyzw = vec4(data101.zzw, data11); \
+ values200.xyzw = data200.xyzz; \
+ values201.xyzw = vec4(data200.w, data201.xxy); \
+ values21.xyzw = vec4(data201.zzw, data21); \
+ POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
+ POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
+ POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
+ POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
+ POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+ POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
- // Store result
- STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
-}
/** 3x3 pooling of FP16 data producing 4 horizontally adjacent outputs, for STRIDE_X == 3.
 *
 * For each of the three rows (y offsets 0, 1, 2) twelve consecutive values d0..d11 are loaded
 * as three vec4s via VLOAD2_UNPACK4_HALF (at offset, offset + uint(2) and offset + uint(4)).
 * Every value goes through POW2_OP, rows 1 and 2 are folded into row 0 with POOL_OP, and the
 * final two POOL_OP calls combine the columns so that
 *   res[i] = POOL_OP over (d[3i], d[3i+1], d[3i+2]), i = 0..3.
 * Because the windows do not overlap at this stride, no intermediate reshuffle vectors are needed.
 *
 * @param res        vec4 that receives the four results (written through res.xyzw)
 * @param input_ptr  buffer handed to VLOAD2_UNPACK4_HALF
 * @param input_iter iterator handed to TENSOR3D_OFFSET
 *
 * @note The macro declares locals (data000 ... data21) in the invoking scope, so it can be
 *       expanded at most once per scope.
 * @note POW2_OP and POOL_OP are defined outside this excerpt.
 * NOTE(review): the uint(2)/uint(4) steps presumably count packed 32-bit words (two halves each) -- confirm in helpers_cs.h.
 */
+#define POOLING3x3_STRIDE3_fp16(res, input_ptr, input_iter) \
+ vec4 data000 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
+ vec4 data001 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(2)); \
+ vec4 data010 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
+ vec4 data100 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
+ vec4 data101 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(2)); \
+ vec4 data11 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
+ vec4 data200 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
+ vec4 data201 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(2)); \
+ vec4 data21 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
+ data000 = POW2_OP(data000, 4); \
+ data001 = POW2_OP(data001, 4); \
+ data010 = POW2_OP(data010, 4); \
+ data100 = POW2_OP(data100, 4); \
+ data101 = POW2_OP(data101, 4); \
+ data11 = POW2_OP(data11, 4); \
+ data200 = POW2_OP(data200, 4); \
+ data201 = POW2_OP(data201, 4); \
+ data21 = POW2_OP(data21, 4); \
+ \
+ POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \
+ POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \
+ POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \
+ POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \
+ POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \
+ POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \
+ POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \
+ POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw))
-#elif defined(POOLING_LAYER_3_OPTIMIZED)
-/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3
- *
- * @note Supported data types are F16;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
void main(void)
{
// Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
vec4 res;
// Perform pooling 3x3 for 4 output elements
#if STRIDE_X == 1
- POOLING3x3_STRIDE1_fp16(res, src, dst);
+ POOLING3x3_STRIDE1_fp16(res, src_ptr, src_iter);
#elif STRIDE_X == 2
- POOLING3x3_STRIDE2_fp16(res, src, dst);
+ POOLING3x3_STRIDE2_fp16(res, src_ptr, src_iter);
#elif STRIDE_X == 3
- POOLING3x3_STRIDE3_fp16(res, src, dst);
+ POOLING3x3_STRIDE3_fp16(res, src_ptr, src_iter);
#endif /*STRIDE_X == 1*/
// Divide by pool region in case of average pooling
@@ -1235,116 +852,30 @@ void main(void)
res = SQRT_OP(res);
#endif /* defined(POOL_L2) */
- STORE4_fp16(dst, CURRENT_OFFSET(dst) >> uint(3), res);
-}
-
-#elif defined(POOLING_LAYER_7)
-/** Performs a pooling function of pool size equal to 7.
- *
- * @note Supported data types are F16;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-void main(void)
-{
- // Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
-
- //Load and calculate data
- vec2 data;
- uint res;
-#if defined(POOL_AVG) || defined(POOL_L2)
- data = calculate_avg(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#else /*POOL_AVG*/
- data = calculate_max(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#endif /*POOL_AVG*/
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- data = SQRT_OP(data);
-#endif /* defined(POOL_L2) */
-
- res = uint(packHalf2x16(data));
-
- // Store result
- STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
+ VSTORE2_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, res);
}
#elif defined(POOLING_LAYER_N)
-/** Performs a pooling function of pool size equal to N
- *
- * @note Supported data types are F16;
- * @note Pool size must be passed using POOL_SIZE e.g. POOL_SIZE=13;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
+
void main(void)
{
// Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
-
- vec4 vdata00;
- vdata00 = vec4(INITIAL_VALUE);
- vec4 vdata01;
- vdata01 = vec4(INITIAL_VALUE);
- vec4 vdata10;
- vdata10 = vec4(INITIAL_VALUE);
- vec4 vdata11;
- vdata11 = vec4(INITIAL_VALUE);
- vec2 sdata;
- sdata = vec2(INITIAL_VALUE);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+
+ vec4 vdata00 = vec4(INITIAL_VALUE);
+ vec4 vdata01 = vec4(INITIAL_VALUE);
+ vec4 vdata10 = vec4(INITIAL_VALUE);
+ vec4 vdata11 = vec4(INITIAL_VALUE);
+ vec2 sdata = vec2(INITIAL_VALUE);
for(int y = 0; y < int(POOL_SIZE); y++)
{
int x = 0;
for(; x <= (int(POOL_SIZE) - 8); x += 8)
{
- vec4 data2;
- vec4 data3;
- LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)));
- LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)) + uint(2));
+ vec4 data2 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0));
+ vec4 data3 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0) + uint(2));
#if defined(POOL_L2)
// Raise to power of 2 for L2 Pooling
@@ -1359,8 +890,7 @@ void main(void)
// Leftover
for(; x < int(POOL_SIZE); x = x + 2)
{
- vec2 data4middle;
- data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)));
+ vec2 data4middle = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0));
#if defined(POOL_L2)
// Raise to power of 2 for L2 Pooling
data4middle *= data4middle;
@@ -1385,10 +915,8 @@ void main(void)
int x1 = STRIDE_X;
for(; x1 <= (int(POOL_SIZE + STRIDE_X) - 8); x1 += 8)
{
- vec4 data2;
- vec4 data3;
- LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)));
- LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)) + uint(2));
+ vec4 data2 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0));
+ vec4 data3 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0) + uint(2));
#if defined(POOL_L2)
// Raise to power of 2 for L2 Pooling
@@ -1404,7 +932,7 @@ void main(void)
for(; x1 < int(POOL_SIZE + STRIDE_X); x1 = x1 + 2)
{
vec2 data4middle;
- data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)));
+ data4middle = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0));
#if defined(POOL_L2)
// Raise to power of 2 for L2 Pooling
data4middle *= data4middle;
@@ -1424,7 +952,7 @@ void main(void)
else
{
vec2 dataorigin2;
- dataorigin2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (STRIDE_X - 1), y, 0) >> uint(2)));
+ dataorigin2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (STRIDE_X - 1), y, 0));
#if defined(POOL_L2)
// Raise to power of 2 for L2 Pooling
dataorigin2.y *= dataorigin2.y;
@@ -1434,10 +962,8 @@ void main(void)
int x1 = STRIDE_X + 1;
for(; x1 <= (int(POOL_SIZE + STRIDE_X) - 8); x1 += 8)
{
- vec4 data2;
- vec4 data3;
- LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)));
- LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)) + uint(2));
+ vec4 data2 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0));
+ vec4 data3 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0) + uint(2));
#if defined(POOL_L2)
// Raise to power of 2 for L2 Pooling
@@ -1452,8 +978,7 @@ void main(void)
// Leftover
for(; x1 < int(POOL_SIZE + STRIDE_X); x1 = x1 + 2)
{
- vec2 data4middle;
- data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)));
+ vec2 data4middle = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0));
#if defined(POOL_L2)
// Raise to power of 2 for L2 Pooling
data4middle *= data4middle;
@@ -1516,11 +1041,12 @@ void main(void)
// Take square root of the result in L2 pooling
data = SQRT_OP(data);
#endif /* defined(POOL_L2) */
- uint res;
- res = uint(packHalf2x16(data));
// Store result
- STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
+ STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, data);
}
-#endif /*POOLING_LAYER_2*/
-#endif /*DATA_TYPE_FP32 */
+#endif // POOLING_LAYER_N
+
+#else // DATA_TYPE_FP32
+#error Data type not supported
+#endif // DATA_TYPE_FP32