diff options
author | Joel Liang <joel.liang@arm.com> | 2017-12-29 14:38:56 +0800 |
---|---|---|
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:42:33 +0000 |
commit | c5a7e59655b61ad617fa34a4fb00e1a007c8255a (patch) | |
tree | d895eb81f88ab33c38c7bbbc4cc30d8ed1f5842b /src | |
parent | 08d5421bb9dac9c5531b1925ff8f30653e7c2e81 (diff) | |
download | ComputeLibrary-c5a7e59655b61ad617fa34a4fb00e1a007c8255a.tar.gz |
APPBROWSER-365: Rewrite the pooling_layer.cs with the new common code
Change-Id: I88a500467a22b78b0be304cf4ab4605ea1d6927e
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/114724
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs | 1264 |
1 files changed, 395 insertions, 869 deletions
diff --git a/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs index 64767a7ef1..aa639b2eda 100644 --- a/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs +++ b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs @@ -23,38 +23,37 @@ */ layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; -#include "helpers.h" -#if defined(DATA_TYPE_FP32) - -float calculate_max(const int, Tensor3D, const int, const int, const int, const int, const int, const int); -float calculate_avg(const int, Tensor3D, const int, const int, const int, const int, const int, const int); +#include "helpers_cs.h" -BUFFER_DECLARATION(src, 1, float, readonly); -BUFFER_DECLARATION(dst, 2, float, writeonly); +#if defined(DATA_TYPE_FP16) +precision mediump float; +#endif // DATA_TYPE_FP16 -layout(std140) uniform shader_params +/** Performs a pooling function + * + * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32" + * @note The pool size must be passed at compile time using "#define POOLING_LAYER_n". e.g. "#define POOLING_LAYER_2" + * n must be one of these: 2, 3, 7, N + * Pool size must be passed using POOL_SIZE if POOLING_LAYER_N is defined. e.g. POOL_SIZE=13; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_attrs The attributes of the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr + * @param[in] src_attrs The attributes of the destination image + */ +SHADER_PARAMS_DECLARATION { - TENSOR3D_PARAM_DECLARATION(src); - TENSOR3D_PARAM_DECLARATION(dst); + Tensor3DAttributes src_attrs; + Tensor3DAttributes dst_attrs; }; -#define LOAD8(r, name, offset) \ - r.x = LOAD4(name, offset); \ - r.y = LOAD4(name, offset + uint(1)) - -#define LOAD16(r, name, offset) \ - r.x = LOAD4(name, offset); \ - r.y = LOAD4(name, offset + uint(1)); \ - r.z = LOAD4(name, offset + uint(2)); \ - r.w = LOAD4(name, offset + uint(3)) - -#define STORE16(name, offset, r) \ - STORE4(name, offset, r.x); \ - STORE4(name, offset + uint(1), r.y); \ - STORE4(name, offset + uint(2), r.z); \ - STORE4(name, offset + uint(3), r.w) - +// Common definitions #if defined(POOL_AVG) || defined(POOL_L2) #define POOL_OP(res, a, b) ((res) = (a) + (b)) #define POOL_OP_float(res, a, b) (res = a + b) @@ -105,6 +104,14 @@ layout(std140) uniform shader_params #define DIV_OP(x, y) (x * (1.f / y)) #define SQRT_OP(x) sqrt((x)) +#if defined(DATA_TYPE_FP32) + +float calculate_max(const int, Tensor3DIterator, const int, const int, const int, const int, const int, const int); +float calculate_avg(const int, Tensor3DIterator, const int, const int, const int, const int, const int, const int); + +TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly); +TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly); + #if defined(POOL_SIZE) // Set the initial value for the pooling operation accordingly with the data type #if defined(POOL_AVG) || defined(POOL_L2) @@ -114,154 +121,7 @@ layout(std140) uniform shader_params #endif // POOL_AVG #endif //POOL_SIZE -#define POOLING3x3_STRIDE1(res, input, output) \ - vec4 data00; \ - vec2 data01; \ - vec4 data10; \ - vec2 data11; \ - vec4 data20; \ - vec2 data21; \ - LOAD16(data00, input, tensor3D_offset(input, 0, 0, 0)); \ - LOAD8(data01, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \ - LOAD16(data10, input, tensor3D_offset(input, 0, 1, 0)); \ - LOAD8(data11, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \ - LOAD16(data20, input, tensor3D_offset(input, 0, 2, 0)); \ - LOAD8(data21, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \ - data00 = POW2_OP(data00, 4); \ - data01 = POW2_OP(data01, 2); \ - data10 = POW2_OP(data10, 4); \ - data11 = POW2_OP(data11, 2); \ - data20 = POW2_OP(data20, 4); \ - data21 = POW2_OP(data21, 2); \ - \ - vec4 values000; \ - vec4 values001; \ - vec4 values010; \ - vec4 values100; \ - vec4 values101; \ - vec4 values11; \ - vec4 values200; \ - vec4 values201; \ - vec4 values21; \ - values000.xyzw = data00.xyzy; \ - values001.xyzw = data00.zwzw; \ - values010.x = data01.x; \ - values010.y = data00.w; \ - values010.zw = data01.xy; \ - values100.xyzw = data10.xyzy; \ - values101.xyzw = data10.zwzw; \ - values11.x = data11.x; \ - values11.y = data10.w; \ - values11.zw = data11.xy; \ - values200.xyzw = data20.xyzy; \ - values201.xyzw = data20.zwzw; \ - values21.x = data21.x; \ - values21.y = data20.w; \ - values21.zw = data21.xy; \ - POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \ - POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \ - POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \ - POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \ - POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \ - POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \ - POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \ - POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw)) - -#define POOLING3x3_STRIDE2(res, input, output) \ - vec4 data000; \ - vec4 data001; \ - float data010; \ - vec4 data100; \ - vec4 data101; \ - float data11; \ - vec4 data200; \ - vec4 data201; \ - float data21; \ - LOAD16(data000, input, tensor3D_offset(input, 0, 0, 0)); \ - LOAD16(data001, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \ - data010 = LOAD4(input, tensor3D_offset(input, 0, 0, 0) + uint(8)); \ - LOAD16(data100, input, tensor3D_offset(input, 0, 1, 0)); \ - LOAD16(data101, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \ - data11 = LOAD4(input, tensor3D_offset(input, 0, 1, 0) + uint(8)); \ - LOAD16(data200, input, tensor3D_offset(input, 0, 2, 0)); \ - LOAD16(data201, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \ - data21 = LOAD4(input, tensor3D_offset(input, 0, 2, 0) + uint(8)); \ - data000 = POW2_OP(data000, 4); \ - data001 = POW2_OP(data001, 4); \ - data010 = POW2_OP(data010, 1); \ - data100 = POW2_OP(data100, 4); \ - data101 = POW2_OP(data101, 4); \ - data11 = POW2_OP(data11, 1); \ - data200 = POW2_OP(data200, 4); \ - data201 = POW2_OP(data201, 4); \ - data21 = POW2_OP(data21, 1); \ - \ - vec4 values000; \ - vec4 values001; \ - vec4 values010; \ - vec4 values100; \ - vec4 values101; \ - vec4 values11; \ - vec4 values200; \ - vec4 values201; \ - vec4 values21; \ - values000.xyzw = data000.xyzz; \ - values001.xyzw = vec4(data000.w, data001.xxy); \ - values010.xyzw = vec4(data001.zzw, data010); \ - values100.xyzw = data100.xyzz; \ - values101.xyzw = vec4(data100.w, data101.xxy); \ - values11.xyzw = vec4(data101.zzw, data11); \ - values200.xyzw = data200.xyzz; \ - values201.xyzw = vec4(data200.w, data201.xxy); \ - values21.xyzw = vec4(data201.zzw, data21); \ - POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \ - POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \ - POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \ - POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \ - POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \ - POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \ - POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \ - POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw)) - -#define POOLING3x3_STRIDE3(res, input, output) \ - vec4 data000; \ - vec4 data001; \ - vec4 data010; \ - vec4 data100; \ - vec4 data101; \ - vec4 data11; \ - vec4 data200; \ - vec4 data201; \ - vec4 data21; \ - LOAD16(data000, input, tensor3D_offset(input, 0, 0, 0)); \ - LOAD16(data001, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \ - LOAD16(data010, input, tensor3D_offset(input, 0, 0, 0) + uint(8)); \ - LOAD16(data100, input, tensor3D_offset(input, 0, 1, 0)); \ - LOAD16(data101, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \ - LOAD16(data11, input, tensor3D_offset(input, 0, 1, 0) + uint(8)); \ - LOAD16(data200, input, tensor3D_offset(input, 0, 2, 0)); \ - LOAD16(data201, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \ - LOAD16(data21, input, tensor3D_offset(input, 0, 2, 0) + uint(8)); \ - data000 = POW2_OP(data000, 4); \ - data001 = POW2_OP(data001, 4); \ - data010 = POW2_OP(data010, 4); \ - data100 = POW2_OP(data100, 4); \ - data101 = POW2_OP(data101, 4); \ - data11 = POW2_OP(data11, 4); \ - data200 = POW2_OP(data200, 4); \ - data201 = POW2_OP(data201, 4); \ - data21 = POW2_OP(data21, 4); \ - \ - POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \ - POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \ - POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \ - POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \ - POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \ - POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \ - POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \ - POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw)) - -float calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) +float calculate_max(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) { int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x; int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y; @@ -269,13 +129,13 @@ float calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, int end_y = int(min(start_y + pool_size, upper_bound_h)); float data_max; - data_max = LOAD4(src, tensor3D_offset(src, 0, 0, 0)); + data_max = LOAD_CURRENT_ITEM(src_ptr, src_iter); for(int i = 0; (start_y + i) < end_y; ++i) { for(int j = 0; (start_x + j) < end_x; ++j) { - float data = LOAD4(src, tensor3D_offset(src, j, i, 0)); + float data = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, j, i, 0)); POOL_OP_float(data_max, data_max, data); } } @@ -283,7 +143,7 @@ float calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, return data_max; } -float calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) +float calculate_avg(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) { int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x; int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y; @@ -295,7 +155,7 @@ float calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, { for(int j = 0; (start_y + j) < end_y; ++j) { - float data = LOAD4(src, tensor3D_offset(src, i, j, 0)); + float data = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, i, j, 0)); if(isnan(data)) { data = 0.0f; @@ -316,45 +176,30 @@ float calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, return data_total / float((end_y - start_y) * (end_x - start_x)); } -#ifdef POOLING_LAYER_2 -/** Performs a pooling function of pool size equal to 2. - * - * @note Supported data types are F32; - * @note In case of average pooling the following information must be passed at compile time: - * POOL_AVG must be provided otherwise max pooling will be performed. - * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions - * PAD_X and PAD_Y which are the pooling paddings in x and y dimension - * - * @param[in] src_ptr Pointer to the source image. Supported data types: F32 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ +#if defined(POOLING_LAYER_2) || defined(POOLING_LAYER_3) || defined(POOLING_LAYER_7) + +#if defined(POOLING_LAYER_2) +#define POOL_SIZE 2 +#elif defined(POOLING_LAYER_3) +#define POOL_SIZE 3 +#elif defined(POOLING_LAYER_7) +#define POOL_SIZE 7 +#else // POOLING_LAYER_n +#error Please define POOLING_LAYER_N instead. +#endif // POOLING_LAYER_n + void main(void) { // Get pixels pointer - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); + Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); //Load and calculate data float res; #if defined(POOL_AVG) || defined(POOL_L2) - res = calculate_avg(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); + res = calculate_avg(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); #else /*POOL_AVG*/ - res = calculate_max(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); + res = calculate_max(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); #endif /*POOL_AVG*/ #if defined(POOL_L2) @@ -363,100 +208,148 @@ void main(void) #endif /* defined(POOL_L2) */ // Store result - STORE4(dst, CURRENT_OFFSET(dst), res); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, res); } -#elif defined(POOLING_LAYER_3) -/** Performs a pooling function of pool size equal to 3. - * - * @note Supported data types are F32; - * @note In case of average pooling the following information must be passed at compile time: - * POOL_AVG must be provided otherwise max pooling will be performed. - * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions - * PAD_X and PAD_Y which are the pooling paddings in x and y dimension - * - * @param[in] src_ptr Pointer to the source image. Supported data types: F32 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -void main(void) -{ - // Get pixels pointer - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); +#elif defined(POOLING_LAYER_3_OPTIMIZED) - //Load and calculate data - float res; -#if defined(POOL_AVG) || defined(POOL_L2) - res = calculate_avg(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); -#else /*POOL_AVG*/ - res = calculate_max(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); -#endif /*POOL_AVG*/ +#define POOLING3x3_STRIDE1(res, input_ptr, input_iter) \ + vec4 data00 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \ + vec2 data01 = VLOAD2(vec2, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \ + vec4 data10 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \ + vec2 data11 = VLOAD2(vec2, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \ + vec4 data20 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \ + vec2 data21 = VLOAD2(vec2, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \ + data00 = POW2_OP(data00, 4); \ + data01 = POW2_OP(data01, 2); \ + data10 = POW2_OP(data10, 4); \ + data11 = POW2_OP(data11, 2); \ + data20 = POW2_OP(data20, 4); \ + data21 = POW2_OP(data21, 2); \ + \ + vec4 values000; \ + vec4 values001; \ + vec4 values010; \ + vec4 values100; \ + vec4 values101; \ + vec4 values11; \ + vec4 values200; \ + vec4 values201; \ + vec4 values21; \ + values000.xyzw = data00.xyzy; \ + values001.xyzw = data00.zwzw; \ + values010.x = data01.x; \ + values010.y = data00.w; \ + values010.zw = data01.xy; \ + values100.xyzw = data10.xyzy; \ + values101.xyzw = data10.zwzw; \ + values11.x = data11.x; \ + values11.y = data10.w; \ + values11.zw = data11.xy; \ + values200.xyzw = data20.xyzy; \ + values201.xyzw = data20.zwzw; \ + values21.x = data21.x; \ + values21.y = data20.w; \ + values21.zw = data21.xy; \ + POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \ + POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \ + POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \ + POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw)) -#if defined(POOL_L2) - // Take square root of the result in L2 pooling - res = SQRT_OP(res); -#endif /* defined(POOL_L2) */ +#define POOLING3x3_STRIDE2(res, input_ptr, input_iter) \ + vec4 data000 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \ + vec4 data001 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \ + float data010 = LOAD(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(8)); \ + vec4 data100 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \ + vec4 data101 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \ + float data11 = LOAD(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(8)); \ + vec4 data200 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \ + vec4 data201 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \ + float data21 = LOAD(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(8)); \ + data000 = POW2_OP(data000, 4); \ + data001 = POW2_OP(data001, 4); \ + data010 = POW2_OP(data010, 1); \ + data100 = POW2_OP(data100, 4); \ + data101 = POW2_OP(data101, 4); \ + data11 = POW2_OP(data11, 1); \ + data200 = POW2_OP(data200, 4); \ + data201 = POW2_OP(data201, 4); \ + data21 = POW2_OP(data21, 1); \ + \ + vec4 values000; \ + vec4 values001; \ + vec4 values010; \ + vec4 values100; \ + vec4 values101; \ + vec4 values11; \ + vec4 values200; \ + vec4 values201; \ + vec4 values21; \ + values000.xyzw = data000.xyzz; \ + values001.xyzw = vec4(data000.w, data001.xxy); \ + values010.xyzw = vec4(data001.zzw, data010); \ + values100.xyzw = data100.xyzz; \ + values101.xyzw = vec4(data100.w, data101.xxy); \ + values11.xyzw = vec4(data101.zzw, data11); \ + values200.xyzw = data200.xyzz; \ + values201.xyzw = vec4(data200.w, data201.xxy); \ + values21.xyzw = vec4(data201.zzw, data21); \ + POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \ + POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \ + POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \ + POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw)) - // Store result - STORE4(dst, CURRENT_OFFSET(dst), res); -} +#define POOLING3x3_STRIDE3(res, input_ptr, input_iter) \ + vec4 data000 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \ + vec4 data001 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \ + vec4 data010 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(8)); \ + vec4 data100 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \ + vec4 data101 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \ + vec4 data11 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(8)); \ + vec4 data200 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \ + vec4 data201 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \ + vec4 data21 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(8)); \ + data000 = POW2_OP(data000, 4); \ + data001 = POW2_OP(data001, 4); \ + data010 = POW2_OP(data010, 4); \ + data100 = POW2_OP(data100, 4); \ + data101 = POW2_OP(data101, 4); \ + data11 = POW2_OP(data11, 4); \ + data200 = POW2_OP(data200, 4); \ + data201 = POW2_OP(data201, 4); \ + data21 = POW2_OP(data21, 4); \ + \ + POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \ + POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \ + POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \ + POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \ + POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \ + POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \ + POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \ + POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw)) -#elif defined(POOLING_LAYER_3_OPTIMIZED) -/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3 - * - * @note Supported data types are F32; - * @note In case of average pooling the following information must be passed at compile time: - * POOL_AVG must be provided otherwise max pooling will be performed. - * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions - * PAD_X and PAD_Y which are the pooling paddings in x and y dimension - * - * @param[in] src_ptr Pointer to the source image. Supported data types: F32 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ void main(void) { // Get pixels pointer - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); + Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); vec4 res; // Perform pooling 3x3 for 4 output elements #if STRIDE_X == 1 - POOLING3x3_STRIDE1(res, src, dst); + POOLING3x3_STRIDE1(res, src_ptr, src_iter); #elif STRIDE_X == 2 - POOLING3x3_STRIDE2(res, src, dst); + POOLING3x3_STRIDE2(res, src_ptr, src_iter); #elif STRIDE_X == 3 - POOLING3x3_STRIDE3(res, src, dst); + POOLING3x3_STRIDE3(res, src_ptr, src_iter); #endif /*STRIDE_X == 1*/ // Divide by pool region in case of average pooling @@ -477,109 +370,28 @@ void main(void) res = SQRT_OP(res); #endif /* defined(POOL_L2) */ - STORE16(dst, CURRENT_OFFSET(dst), res); -} - -#elif defined(POOLING_LAYER_7) -/** Performs a pooling function of pool size equal to 7. - * - * @note Supported data types are F32; - * @note In case of average pooling the following information must be passed at compile time: - * POOL_AVG must be provided otherwise max pooling will be performed. - * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions - * PAD_X and PAD_Y which are the pooling paddings in x and y dimension - * - * @param[in] src_ptr Pointer to the source image. Supported data types: F32 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -void main(void) -{ - // Get pixels pointer - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - //Load and calculate data - float res; -#if defined(POOL_AVG) || defined(POOL_L2) - res = calculate_avg(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); -#else /*POOL_AVG*/ - res = calculate_max(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); -#endif /*POOL_AVG*/ - -#if defined(POOL_L2) - // Take square root of the result in L2 pooling - res = SQRT_OP(res); -#endif /* defined(POOL_L2) */ - - // Store result - STORE4(dst, CURRENT_OFFSET(dst), res); + VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, res); } #elif defined(POOLING_LAYER_N) -/** Performs a pooling function of pool size equal to N - * - * @note Supported data types are F32; - * @note Pool size must be passed using POOL_SIZE e.g. POOL_SIZE=13; - * @note In case of average pooling the following information must be passed at compile time: - * POOL_AVG must be provided otherwise max pooling will be performed. - * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions - * PAD_X and PAD_Y which are the pooling paddings in x and y dimension - * - * @param[in] src_ptr Pointer to the source image. Supported data types: F32 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ + void main(void) { // Get pixels pointer - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); + Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); - vec4 vdata0; - vdata0 = vec4(INITIAL_VALUE); - vec4 vdata1; - vdata1 = vec4(INITIAL_VALUE); - float sdata; - sdata = float(INITIAL_VALUE); + vec4 vdata0 = vec4(INITIAL_VALUE); + vec4 vdata1 = vec4(INITIAL_VALUE); + float sdata = float(INITIAL_VALUE); for(int y = 0; y < int(POOL_SIZE); y++) { int x = 0; for(; x <= (int(POOL_SIZE) - 8); x += 8) { - vec4 data2; - vec4 data3; - LOAD16(data2, src, tensor3D_offset(src, x, y, 0)); - LOAD16(data3, src, tensor3D_offset(src, x, y, 0) + uint(4)); + vec4 data2 = VLOAD4(vec4, src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0)); + vec4 data3 = VLOAD4(vec4, src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0) + uint(4)); #if defined(POOL_L2) // Raise to power of 2 for L2 Pooling @@ -594,7 +406,7 @@ void main(void) // Leftover for(; x < int(POOL_SIZE); ++x) { - float data4 = LOAD4(src, tensor3D_offset(src, x, y, 0)); + float data4 = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0)); #if defined(POOL_L2) // Raise to power of 2 for L2 Pooling data4 *= data4; @@ -634,91 +446,17 @@ void main(void) #endif /* defined(POOL_L2) */ // Store result - STORE4(dst, CURRENT_OFFSET(dst), res); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, res); } -#endif /* POOLING_LAYER_2 */ +#endif // POOLING_LAYER_N #elif defined(DATA_TYPE_FP16) -precision mediump float; - -vec2 load_and_unpack(Tensor3D, uint); -vec2 calculate_max(const int, Tensor3D, const int, const int, const int, const int, const int, const int); -vec2 calculate_avg(const int, Tensor3D, const int, const int, const int, const int, const int, const int); - -BUFFER_DECLARATION(src, 1, uint, readonly); -BUFFER_DECLARATION(dst, 2, uint, writeonly); +vec2 calculate_max(const int, Tensor3DIterator, const int, const int, const int, const int, const int, const int); +vec2 calculate_avg(const int, Tensor3DIterator, const int, const int, const int, const int, const int, const int); -layout(std140) uniform shader_params -{ - TENSOR3D_PARAM_DECLARATION(src); - TENSOR3D_PARAM_DECLARATION(dst); -}; - -#define LOAD2_fp16(r, name, offset) \ - r.xy = load_and_unpack(name, offset) - -#define LOAD4_fp16(r, name, offset) \ - r.xy = load_and_unpack(name, offset); \ - r.zw = load_and_unpack(name, offset + uint(1)) - -#define STORE4_fp16(name, offset, r) \ - uint datastore1; \ - uint datastore2; \ - datastore1 = uint(packHalf2x16(r.xy)); \ - datastore2 = uint(packHalf2x16(r.zw)); \ - STORE1(name, offset << uint(1), datastore1); \ - STORE1(name, (offset << uint(1)) + uint(1), datastore2) - -#if defined(POOL_AVG) || defined(POOL_L2) -#define POOL_OP(res, a, b) ((res) = (a) + (b)) -#define POOL_OP_float(res, a, b) (res = a + b) -#define POOL_OP_vec2(res, a, b) ((res) = (a) + (b)) -#else /* defined(POOL_AVG) || defined(POOL_L2) */ -#define POOL_OP(res, a, b) \ - (res) = (a); \ - if(isnan(a.x) || (a.x < b.x)) \ - { \ - res.x = b.x; \ - } \ - if(isnan(a.y) || (a.y < b.y)) \ - { \ - res.y = b.y; \ - } \ - if(isnan(a.z) || (a.z < b.z)) \ - { \ - res.z = b.z; \ - } \ - if(isnan(a.w) || (a.w < b.w)) \ - { \ - res.w = b.w; \ - } -#define POOL_OP_float(res, a, b) \ - (res) = (a); \ - if(isnan(a) || (a < b)) \ - { \ - res = b; \ - } -#define POOL_OP_vec2(res, a, b) \ - (res) = (a); \ - if(isnan(a.x) || (a.x < b.x)) \ - { \ - res.x = b.x; \ - } \ - if(isnan(a.y) || (a.y < b.y)) \ - { \ - res.y = b.y; \ - } -#endif /* defined(POOL_AVG) || defined(POOL_L2) */ - -#if defined(POOL_L2) -#define POW2_OP(x, vec_size) ((x) * (x)) -#else /* defined(POOL_L2) */ -#define POW2_OP(x, vec_size) (x) -#endif /* defined(POOL_L2) */ - -#define DIV_OP(x, y) (x * (1.f / y)) -#define SQRT_OP(x) sqrt((x)) +TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly); +TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly); #if defined(POOL_SIZE) // Set the initial value for the pooling operation accordingly with the data type @@ -729,170 +467,7 @@ layout(std140) uniform shader_params #endif //POOL_AVG #endif //POOL_SIZE -#define POOLING3x3_STRIDE1_fp16(res, input, output) \ - vec4 data00; \ - vec2 data01; \ - vec4 data10; \ - vec2 data11; \ - vec4 data20; \ - vec2 data21; \ - LOAD4_fp16(data00, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \ - LOAD2_fp16(data01, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \ - LOAD4_fp16(data10, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \ - LOAD2_fp16(data11, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \ - LOAD4_fp16(data20, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \ - LOAD2_fp16(data21, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \ - data00 = POW2_OP(data00, 4); \ - data01 = POW2_OP(data01, 2); \ - data10 = POW2_OP(data10, 4); \ - data11 = POW2_OP(data11, 2); \ - data20 = POW2_OP(data20, 4); \ - data21 = POW2_OP(data21, 2); \ - \ - vec4 values000; \ - vec4 values001; \ - vec4 values010; \ - vec4 values100; \ - vec4 values101; \ - vec4 values11; \ - vec4 values200; \ - vec4 values201; \ - vec4 values21; \ - values000.xyzw = data00.xyzy; \ - values001.xyzw = data00.zwzw; \ - values010.x = data01.x; \ - values010.y = data00.w; \ - values010.zw = data01.xy; \ - values100.xyzw = data10.xyzy; \ - values101.xyzw = data10.zwzw; \ - values11.x = data11.x; \ - values11.y = data10.w; \ - values11.zw = data11.xy; \ - values200.xyzw = data20.xyzy; \ - values201.xyzw = data20.zwzw; \ - values21.x = data21.x; \ - values21.y = data20.w; \ - values21.zw = data21.xy; \ - POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \ - POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \ - POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \ - POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \ - POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \ - POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \ - POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \ - POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw)) - -#define POOLING3x3_STRIDE2_fp16(res, input, output) \ - vec4 data000; \ - vec4 data001; \ - float data010; \ - vec4 data100; \ - vec4 data101; \ - float data11; \ - vec4 data200; \ - vec4 data201; \ - float data21; \ - vec2 datamiddle0; \ - vec2 datamiddle1; \ - vec2 datamiddle2; \ - LOAD4_fp16(data000, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \ - LOAD4_fp16(data001, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \ - datamiddle0 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(4)); \ - data010 = datamiddle0.x; \ - LOAD4_fp16(data100, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \ - LOAD4_fp16(data101, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \ - datamiddle1 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(4)); \ - data11 = datamiddle1.x; \ - LOAD4_fp16(data200, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \ - LOAD4_fp16(data201, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \ - datamiddle2 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(4)); \ - data21 = datamiddle2.x; \ - data000 = POW2_OP(data000, 4); \ - data001 = POW2_OP(data001, 4); \ - data010 = POW2_OP(data010, 1); \ - data100 = POW2_OP(data100, 4); \ - data101 = POW2_OP(data101, 4); \ - data11 = POW2_OP(data11, 1); \ - data200 = POW2_OP(data200, 4); \ - data201 = POW2_OP(data201, 4); \ - data21 = POW2_OP(data21, 1); \ - \ - vec4 values000; \ - vec4 values001; \ - vec4 values010; \ - vec4 values100; \ - vec4 values101; \ - vec4 values11; \ - vec4 values200; \ - vec4 values201; \ - vec4 values21; \ - values000.xyzw = data000.xyzz; \ - values001.xyzw = vec4(data000.w, data001.xxy); \ - values010.xyzw = vec4(data001.zzw, data010); \ - values100.xyzw = data100.xyzz; \ - values101.xyzw = vec4(data100.w, data101.xxy); \ - values11.xyzw = vec4(data101.zzw, data11); \ - values200.xyzw = data200.xyzz; \ - values201.xyzw = vec4(data200.w, data201.xxy); \ - values21.xyzw = vec4(data201.zzw, data21); \ - POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \ - POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \ - POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \ - POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \ - POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \ - POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \ - POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \ - POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw)) - -#define POOLING3x3_STRIDE3_fp16(res, input, output) \ - vec4 data000; \ - vec4 data001; \ - vec4 data010; \ - vec4 data100; \ - vec4 data101; \ - vec4 data11; \ - vec4 data200; \ - vec4 data201; \ - vec4 data21; \ - LOAD4_fp16(data000, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \ - LOAD4_fp16(data001, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \ - LOAD4_fp16(data010, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(4)); \ - LOAD4_fp16(data100, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \ - LOAD4_fp16(data101, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \ - LOAD4_fp16(data11, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(4)); \ - LOAD4_fp16(data200, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \ - LOAD4_fp16(data201, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \ - LOAD4_fp16(data21, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(4)); \ - data000 = POW2_OP(data000, 4); \ - data001 = POW2_OP(data001, 4); \ - data010 = POW2_OP(data010, 4); \ - data100 = POW2_OP(data100, 4); \ - data101 = POW2_OP(data101, 4); \ - data11 = POW2_OP(data11, 4); \ - data200 = POW2_OP(data200, 4); \ - data201 = POW2_OP(data201, 4); \ - data21 = POW2_OP(data21, 4); \ - \ - POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \ - POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \ - POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \ - POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \ - POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \ - POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \ - POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \ - POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw)) - -vec2 load_and_unpack(Tensor3D src, uint offset) -{ - uint packed_s; - vec2 s; - LOAD1(packed_s, src, offset); - - s = vec2(unpackHalf2x16(packed_s)); - return s; -} - -vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) +vec2 calculate_max(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) { int start_x1 = int(gl_GlobalInvocationID.x) * stride_x - pad_x; int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y; @@ -908,7 +483,7 @@ vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, c vec2 data_max = vec2(0); //Load and Set initial maximum1 - vec2 data_init1 = load_and_unpack(src, tensor3D_offset_fp16(src, 0, 0, 0) >> uint(2)); + vec2 data_init1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter); data_max.x = data_init1.x; //Load and Set initial maximum2 @@ -916,12 +491,12 @@ vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, c { if((stride_x % 2) == 0) { - vec2 data_init2 = load_and_unpack(src, tensor3D_offset_fp16(src, stride_x, 0, 0) >> uint(2)); + vec2 data_init2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, stride_x, 0, 0)); data_max.y = data_init2.x; } else { - vec2 data_init2 = load_and_unpack(src, tensor3D_offset_fp16(src, stride_x - 1, 0, 0) >> uint(2)); + vec2 data_init2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, stride_x - 1, 0, 0)); data_max.y = data_init2.y; } } @@ -932,14 +507,14 @@ vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, c //Calculate maximum1 if((start_x1 + j + 1) < end_x1) { - vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2)); + vec2 data1 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, j, i, 0)); float data_mr1; POOL_OP_float(data_mr1, data1.x, data1.y); POOL_OP_float(data_max.x, data_max.x, data_mr1); } else { - vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2)); + vec2 data1 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, j, i, 0)); POOL_OP_float(data_max.x, data_max.x, data1.x); } @@ -948,7 +523,7 @@ vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, c { if((stride_x % 2) == 0) { - vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x), i, 0) >> uint(2))); + vec2 data2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x), i, 0)); if((start_x2 + j + 1) < end_x2) { @@ -963,8 +538,8 @@ vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, c } else { - vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x - 1), i, 0) >> uint(2))); - vec2 data3 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2))); + vec2 data2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x - 1), i, 0)); + vec2 data3 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x + 1), i, 0)); if((start_x2 + j + 1) < end_x2) { float data_mr2; @@ -981,7 +556,7 @@ vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, c return data_max; } -vec2 calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) +vec2 calculate_avg(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) { int start_x1 = (2 * int(gl_GlobalInvocationID.x)) * stride_x - pad_x; int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y; @@ -999,7 +574,7 @@ vec2 calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, c for(int i = 0; (start_y1 + i) < end_y1; i++) for(int j = 0; (start_x1 + j) < end_x1; j = j + 2) { - vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2)); + vec2 data1 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, j, i, 0)); #if defined(POOL_L2) // Raise to power of 2 for L2 Pooling data1 = POW2_OP(data1, 2); @@ -1019,7 +594,7 @@ vec2 calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, c { if((stride_x % 2) == 0) { - vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x), i, 0) >> uint(2))); + vec2 data2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x), i, 0)); #if defined(POOL_L2) // Raise to power of 2 for L2 Pooling data2 = POW2_OP(data2, 2); @@ -1035,8 +610,8 @@ vec2 calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, c } else { - vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x - 1), i, 0) >> uint(2))); - vec2 data3 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2))); + vec2 data2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x - 1), i, 0)); + vec2 data3 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x + 1), i, 0)); #if defined(POOL_L2) // Raise to power of 2 for L2 Pooling data2 = POW2_OP(data2, 2); @@ -1068,46 +643,30 @@ vec2 calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, c return data_avg; } -#ifdef POOLING_LAYER_2 -/** Performs a pooling function of pool size equal to 2. - * - * @note Supported data types are F16; - * @note In case of average pooling the following information must be passed at compile time: - * POOL_AVG must be provided otherwise max pooling will be performed. - * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions - * PAD_X and PAD_Y which are the pooling paddings in x and y dimension - * - * @param[in] src_ptr Pointer to the source image. Supported data types: F16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ +#if defined(POOLING_LAYER_2) || defined(POOLING_LAYER_3) || defined(POOLING_LAYER_7) + +#if defined(POOLING_LAYER_2) +#define POOL_SIZE 2 +#elif defined(POOLING_LAYER_3) +#define POOL_SIZE 3 +#elif defined(POOLING_LAYER_7) +#define POOL_SIZE 7 +#else // POOLING_LAYER_n +#error Please define POOLING_LAYER_N instead. +#endif // POOLING_LAYER_n + void main(void) { // Get pixels pointer - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); + Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); //Load and calculate data vec2 data; - uint res; #if defined(POOL_AVG) || defined(POOL_L2) - data = calculate_avg(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); + data = calculate_avg(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); #else /*POOL_AVG*/ - data = calculate_max(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); + data = calculate_max(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); #endif /*POOL_AVG*/ #if defined(POOL_L2) @@ -1115,106 +674,164 @@ void main(void) data = SQRT_OP(data); #endif /* defined(POOL_L2) */ - res = uint(packHalf2x16(data)); - // Store result - STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res); + STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, data); } -#elif defined(POOLING_LAYER_3) -/** Performs a pooling function of pool size equal to 3. - * - * @note Supported data types are F16; - * @note In case of average pooling the following information must be passed at compile time: - * POOL_AVG must be provided otherwise max pooling will be performed. - * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions - * PAD_X and PAD_Y which are the pooling paddings in x and y dimension - * - * @param[in] src_ptr Pointer to the source image. Supported data types: F16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -void main(void) -{ - // Get pixels pointer - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); - - //Load and calculate data - vec2 data; - uint res; -#if defined(POOL_AVG) || defined(POOL_L2) - data = calculate_avg(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); -#else /*POOL_AVG*/ - data = calculate_max(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); -#endif /*POOL_AVG*/ +#elif defined(POOLING_LAYER_3_OPTIMIZED) -#if defined(POOL_L2) - // Take square root of the result in L2 pooling - data = SQRT_OP(data); -#endif /* defined(POOL_L2) */ +#define POOLING3x3_STRIDE1_fp16(res, input_ptr, input_iter) \ + vec4 data00 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \ + vec2 data01 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(2)); \ + vec4 data10 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \ + vec2 data11 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(2)); \ + vec4 data20 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \ + vec2 data21 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(2)); \ + data00 = POW2_OP(data00, 4); \ + data01 = POW2_OP(data01, 2); \ + data10 = POW2_OP(data10, 4); \ + data11 = POW2_OP(data11, 2); \ + data20 = POW2_OP(data20, 4); \ + data21 = POW2_OP(data21, 2); \ + \ + vec4 values000; \ + vec4 values001; \ + vec4 values010; \ + vec4 values100; \ + vec4 values101; \ + vec4 values11; \ + vec4 values200; \ + vec4 values201; \ + vec4 values21; \ + values000.xyzw = data00.xyzy; \ + values001.xyzw = data00.zwzw; \ + values010.x = data01.x; \ + values010.y = data00.w; \ + values010.zw = data01.xy; \ + values100.xyzw = data10.xyzy; \ + values101.xyzw = data10.zwzw; \ + values11.x = data11.x; \ + values11.y = data10.w; \ + values11.zw = data11.xy; \ + values200.xyzw = data20.xyzy; \ + values201.xyzw = data20.zwzw; \ + values21.x = data21.x; \ + values21.y = data20.w; \ + values21.zw = data21.xy; \ + POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \ + POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \ + POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \ + POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw)) - res = uint(packHalf2x16(data)); +#define POOLING3x3_STRIDE2_fp16(res, input_ptr, input_iter) \ + vec4 data000; \ + vec4 data001; \ + float data010; \ + vec4 data100; \ + vec4 data101; \ + float data11; \ + vec4 data200; \ + vec4 data201; \ + float data21; \ + vec2 datamiddle0; \ + vec2 datamiddle1; \ + vec2 datamiddle2; \ + data000 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \ + data001 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(2)); \ + datamiddle0 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \ + data010 = datamiddle0.x; \ + data100 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \ + data101 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(2)); \ + datamiddle1 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \ + data11 = datamiddle1.x; \ + data200 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \ + data201 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(2)); \ + datamiddle2 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \ + data21 = datamiddle2.x; \ + data000 = POW2_OP(data000, 4); \ + data001 = POW2_OP(data001, 4); \ + data010 = POW2_OP(data010, 1); \ + data100 = POW2_OP(data100, 4); \ + data101 = POW2_OP(data101, 4); \ + data11 = POW2_OP(data11, 1); \ + data200 = POW2_OP(data200, 4); \ + data201 = POW2_OP(data201, 4); \ + data21 = POW2_OP(data21, 1); \ + \ + vec4 values000; \ + vec4 values001; \ + vec4 values010; \ + vec4 values100; \ + vec4 values101; \ + vec4 values11; \ + vec4 values200; \ + vec4 values201; \ + vec4 values21; \ + values000.xyzw = data000.xyzz; \ + values001.xyzw = vec4(data000.w, data001.xxy); \ + values010.xyzw = vec4(data001.zzw, data010); \ + values100.xyzw = data100.xyzz; \ + values101.xyzw = vec4(data100.w, data101.xxy); \ + values11.xyzw = vec4(data101.zzw, data11); \ + values200.xyzw = data200.xyzz; \ + values201.xyzw = vec4(data200.w, data201.xxy); \ + values21.xyzw = vec4(data201.zzw, data21); \ + POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \ + POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \ + POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \ + POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw)) - // Store result - STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res); -} +#define POOLING3x3_STRIDE3_fp16(res, input_ptr, input_iter) \ + vec4 data000 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \ + vec4 data001 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(2)); \ + vec4 data010 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \ + vec4 data100 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \ + vec4 data101 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(2)); \ + vec4 data11 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \ + vec4 data200 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \ + vec4 data201 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(2)); \ + vec4 data21 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \ + data000 = POW2_OP(data000, 4); \ + data001 = POW2_OP(data001, 4); \ + data010 = POW2_OP(data010, 4); \ + data100 = POW2_OP(data100, 4); \ + data101 = POW2_OP(data101, 4); \ + data11 = POW2_OP(data11, 4); \ + data200 = POW2_OP(data200, 4); \ + data201 = POW2_OP(data201, 4); \ + data21 = POW2_OP(data21, 4); \ + \ + POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \ + POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \ + POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \ + POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \ + POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \ + POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \ + POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \ + POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw)) -#elif defined(POOLING_LAYER_3_OPTIMIZED) -/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3 - * - * @note Supported data types are F16; - * @note In case of average pooling the following information must be passed at compile time: - * POOL_AVG must be provided otherwise max pooling will be performed. - * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions - * PAD_X and PAD_Y which are the pooling paddings in x and y dimension - * - * @param[in] src_ptr Pointer to the source image. Supported data types: F16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ void main(void) { // Get pixels pointer - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); + Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); vec4 res; // Perform pooling 3x3 for 4 output elements #if STRIDE_X == 1 - POOLING3x3_STRIDE1_fp16(res, src, dst); + POOLING3x3_STRIDE1_fp16(res, src_ptr, src_iter); #elif STRIDE_X == 2 - POOLING3x3_STRIDE2_fp16(res, src, dst); + POOLING3x3_STRIDE2_fp16(res, src_ptr, src_iter); #elif STRIDE_X == 3 - POOLING3x3_STRIDE3_fp16(res, src, dst); + POOLING3x3_STRIDE3_fp16(res, src_ptr, src_iter); #endif /*STRIDE_X == 1*/ // Divide by pool region in case of average pooling @@ -1235,116 +852,30 @@ void main(void) res = SQRT_OP(res); #endif /* defined(POOL_L2) */ - STORE4_fp16(dst, CURRENT_OFFSET(dst) >> uint(3), res); -} - -#elif defined(POOLING_LAYER_7) -/** Performs a pooling function of pool size equal to 7. - * - * @note Supported data types are F16; - * @note In case of average pooling the following information must be passed at compile time: - * POOL_AVG must be provided otherwise max pooling will be performed. - * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions - * PAD_X and PAD_Y which are the pooling paddings in x and y dimension - * - * @param[in] src_ptr Pointer to the source image. Supported data types: F16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -void main(void) -{ - // Get pixels pointer - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); - - //Load and calculate data - vec2 data; - uint res; -#if defined(POOL_AVG) || defined(POOL_L2) - data = calculate_avg(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); -#else /*POOL_AVG*/ - data = calculate_max(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); -#endif /*POOL_AVG*/ - -#if defined(POOL_L2) - // Take square root of the result in L2 pooling - data = SQRT_OP(data); -#endif /* defined(POOL_L2) */ - - res = uint(packHalf2x16(data)); - - // Store result - STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res); + VSTORE2_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, res); } #elif defined(POOLING_LAYER_N) -/** Performs a pooling function of pool size equal to N - * - * @note Supported data types are F16; - * @note Pool size must be passed using POOL_SIZE e.g. POOL_SIZE=13; - * @note In case of average pooling the following information must be passed at compile time: - * POOL_AVG must be provided otherwise max pooling will be performed. - * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions - * PAD_X and PAD_Y which are the pooling paddings in x and y dimension - * - * @param[in] src_ptr Pointer to the source image. Supported data types: F16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ + void main(void) { // Get pixels pointer - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); - - vec4 vdata00; - vdata00 = vec4(INITIAL_VALUE); - vec4 vdata01; - vdata01 = vec4(INITIAL_VALUE); - vec4 vdata10; - vdata10 = vec4(INITIAL_VALUE); - vec4 vdata11; - vdata11 = vec4(INITIAL_VALUE); - vec2 sdata; - sdata = vec2(INITIAL_VALUE); + Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); + Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); + + vec4 vdata00 = vec4(INITIAL_VALUE); + vec4 vdata01 = vec4(INITIAL_VALUE); + vec4 vdata10 = vec4(INITIAL_VALUE); + vec4 vdata11 = vec4(INITIAL_VALUE); + vec2 sdata = vec2(INITIAL_VALUE); for(int y = 0; y < int(POOL_SIZE); y++) { int x = 0; for(; x <= (int(POOL_SIZE) - 8); x += 8) { - vec4 data2; - vec4 data3; - LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2))); - LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)) + uint(2)); + vec4 data2 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0)); + vec4 data3 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0) + uint(2)); #if defined(POOL_L2) // Raise to power of 2 for L2 Pooling @@ -1359,8 +890,7 @@ void main(void) // Leftover for(; x < int(POOL_SIZE); x = x + 2) { - vec2 data4middle; - data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2))); + vec2 data4middle = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0)); #if defined(POOL_L2) // Raise to power of 2 for L2 Pooling data4middle *= data4middle; @@ -1385,10 +915,8 @@ void main(void) int x1 = STRIDE_X; for(; x1 <= (int(POOL_SIZE + STRIDE_X) - 8); x1 += 8) { - vec4 data2; - vec4 data3; - LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2))); - LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)) + uint(2)); + vec4 data2 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0)); + vec4 data3 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0) + uint(2)); #if defined(POOL_L2) // Raise to power of 2 for L2 Pooling @@ -1404,7 +932,7 @@ void main(void) for(; x1 < int(POOL_SIZE + STRIDE_X); x1 = x1 + 2) { vec2 data4middle; - data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2))); + data4middle = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0)); #if defined(POOL_L2) // Raise to power of 2 for L2 Pooling data4middle *= data4middle; @@ -1424,7 +952,7 @@ void main(void) else { vec2 dataorigin2; - dataorigin2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (STRIDE_X - 1), y, 0) >> uint(2))); + dataorigin2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (STRIDE_X - 1), y, 0)); #if defined(POOL_L2) // Raise to power of 2 for L2 Pooling dataorigin2.y *= dataorigin2.y; @@ -1434,10 +962,8 @@ void main(void) int x1 = STRIDE_X + 1; for(; x1 <= (int(POOL_SIZE + STRIDE_X) - 8); x1 += 8) { - vec4 data2; - vec4 data3; - LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2))); - LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)) + uint(2)); + vec4 data2 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0)); + vec4 data3 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0) + uint(2)); #if defined(POOL_L2) // Raise to power of 2 for L2 Pooling @@ -1452,8 +978,7 @@ void main(void) // Leftover for(; x1 < int(POOL_SIZE + STRIDE_X); x1 = x1 + 2) { - vec2 data4middle; - data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2))); + vec2 data4middle = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0)); #if defined(POOL_L2) // Raise to power of 2 for L2 Pooling data4middle *= data4middle; @@ -1516,11 +1041,12 @@ void main(void) // Take square root of the result in L2 pooling data = SQRT_OP(data); #endif /* defined(POOL_L2) */ - uint res; - res = uint(packHalf2x16(data)); // Store result - STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res); + STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, data); } -#endif /*POOLING_LAYER_2*/ -#endif /*DATA_TYPE_FP32 */ +#endif // POOLING_LAYER_N + +#else // DATA_TYPE_FP32 +#error Data type not supported +#endif // DATA_TYPE_FP32 |