aboutsummaryrefslogtreecommitdiff
path: root/src/core
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2017-08-31 14:21:36 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:35:24 +0000
commitcdf51455df8835e9e3bfd3e31ed389146af9a573 (patch)
tree31b0bf9302decbf8b1063f46373e3d26a9ca1409 /src/core
parent29088d517a2a9f249fe5cc851e0c97de3d4cc917 (diff)
downloadComputeLibrary-cdf51455df8835e9e3bfd3e31ed389146af9a573.tar.gz
COMPMID-515: L2 Pooling for FP32/FP16 in CL.
Change-Id: I43641fa672f5905ca62edd1f63fc93e0cf7ea382 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/85963 Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Diffstat (limited to 'src/core')
-rw-r--r--src/core/CL/cl_kernels/fixed_point.h16
-rw-r--r--src/core/CL/cl_kernels/pooling_layer.cl139
-rw-r--r--src/core/CL/kernels/CLPoolingLayerKernel.cpp4
-rw-r--r--src/core/NEON/kernels/NEPoolingLayerKernel.cpp268
-rw-r--r--src/core/Utils.cpp12
5 files changed, 375 insertions, 64 deletions
diff --git a/src/core/CL/cl_kernels/fixed_point.h b/src/core/CL/cl_kernels/fixed_point.h
index 478a414cad..5476a6e070 100644
--- a/src/core/CL/cl_kernels/fixed_point.h
+++ b/src/core/CL/cl_kernels/fixed_point.h
@@ -241,9 +241,17 @@ MULQ_IMPL(qs16x16, qs32x16)
return CONVERT_SAT((res >> (itype)fixed_point_position), type); \
}
+MULQ_SAT_IMPL(qs8x1, qs16x1)
+MULQ_SAT_IMPL(qs8x2, qs16x2)
+MULQ_SAT_IMPL(qs8x3, qs16x3)
+MULQ_SAT_IMPL(qs8x4, qs16x4)
MULQ_SAT_IMPL(qs8x8, qs16x8)
-MULQ_SAT_IMPL(qs16x8, qs32x8)
MULQ_SAT_IMPL(qs8x16, qs16x16)
+MULQ_SAT_IMPL(qs16x1, qs32x1)
+MULQ_SAT_IMPL(qs16x2, qs32x2)
+MULQ_SAT_IMPL(qs16x3, qs32x3)
+MULQ_SAT_IMPL(qs16x4, qs32x4)
+MULQ_SAT_IMPL(qs16x8, qs32x8)
MULQ_SAT_IMPL(qs16x16, qs32x16)
#define MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) mul_sat_##type##x##size((a), (b), (position))
@@ -411,7 +419,7 @@ LOGQ_IMPL(qs16, qs16x16, 16)
{ \
type const_three = (type)(3 << (fixed_point_position)); \
type shift_value = (type)(16 - stype##_SHIFT) - (clz(VopA) + (type)fixed_point_position); \
- type temp = select(VopA >> shift_value, select((type)stype##_MAX, VopA << (-shift_value), clz(VopA) > (-shift_value)), shift_value < (type)0); \
+ type temp = select((type)(VopA >> shift_value), select((type)stype##_MAX, (type)(VopA << (-shift_value)), (type)(clz(VopA) > (-shift_value))), (type)(shift_value < (type)0)); \
type x = temp; \
x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
@@ -422,9 +430,11 @@ LOGQ_IMPL(qs16, qs16x16, 16)
x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
} \
type shift_value2 = select(shift_value >> 1, (-shift_value) >> 1, shift_value < (type)0); \
- return select(x >> shift_value2, select((type)stype##_MAX, x << shift_value2, clz(x) > shift_value2), shift_value < (type)0); /* Saturate result if needed */ \
+ return select((type)(x >> shift_value2), select((type)stype##_MAX, (type)(x << shift_value2), (type)(clz(x) > shift_value2)), (type)(shift_value < (type)0)); /* Saturate result if needed */ \
}
+INVSQRTQ_IMPL(qs8, qs8x1, 1)
+INVSQRTQ_IMPL(qs16, qs16x1, 1)
INVSQRTQ_IMPL(qs8, qs8x16, 16)
INVSQRTQ_IMPL(qs16, qs16x8, 8)
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 0497bf4b91..99d7e6e01b 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -33,18 +33,32 @@
#define POOL_OP(x, y) (max((x), (y)))
#endif /* POOL_AVG */
-#define DIV_OP1(x, y) DIV_SAT_OP_EXPAND((x), y, DATA_TYPE, FIXED_POINT_POSITION)
+#define DIV_OP1(x, y) DIV_SAT_OP_EXPAND((x), (y), DATA_TYPE, FIXED_POINT_POSITION)
#define DIV_OP(x, y) DIV_OP1(x, y << FIXED_POINT_POSITION)
+#define SQRT_OP(x) DIV_OP1((1 << FIXED_POINT_POSITION), (INVSQRT_OP_EXPAND((x), DATA_TYPE, 1, FIXED_POINT_POSITION)))
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) MUL_SAT_OP_EXPAND((x), (x), DATA_TYPE, vec_size, FIXED_POINT_POSITION)
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
#else /* FIXED_POINT_POSITION */
-#if defined(POOL_AVG)
+#if defined(POOL_AVG) || defined(POOL_L2)
#define POOL_OP(x, y) ((x) + (y))
-#else /* POOL_AVG */
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
#define POOL_OP(x, y) (fmax((x), (y)))
-#endif /* POOL_AVG */
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
#endif /* FIXED_POINT_POSITION */
@@ -70,6 +84,12 @@
data20 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
VEC_DATA_TYPE(DATA_TYPE, 2) \
data21 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4); \
+ data00 = POW2_OP(data00, 4); \
+ data01 = POW2_OP(data01, 2); \
+ data10 = POW2_OP(data10, 4); \
+ data11 = POW2_OP(data11, 2); \
+ data20 = POW2_OP(data20, 4); \
+ data21 = POW2_OP(data21, 2); \
\
VEC_DATA_TYPE(DATA_TYPE, 8) \
values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01212323); \
@@ -104,6 +124,12 @@
VEC_DATA_TYPE(DATA_TYPE, 8) \
data20 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
DATA_TYPE data21 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \
+ data00 = POW2_OP(data00, 8); \
+ data01 = POW2_OP(data01, 1); \
+ data10 = POW2_OP(data10, 8); \
+ data11 = POW2_OP(data11, 1); \
+ data20 = POW2_OP(data20, 8); \
+ data21 = POW2_OP(data21, 1); \
\
VEC_DATA_TYPE(DATA_TYPE, 8) \
values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01223445); \
@@ -141,6 +167,12 @@
data20 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
VEC_DATA_TYPE(DATA_TYPE, 4) \
data21 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \
+ data00 = POW2_OP(data00, 8); \
+ data01 = POW2_OP(data01, 4); \
+ data10 = POW2_OP(data10, 8); \
+ data11 = POW2_OP(data11, 4); \
+ data20 = POW2_OP(data20, 8); \
+ data21 = POW2_OP(data21, 4); \
\
data00 = POOL_OP(data00, data10); \
data01 = POOL_OP(data01, data11); \
@@ -165,7 +197,7 @@ DATA_TYPE calculate_avg_scale(const int pool_size, const int upper_bound_w, cons
*
* @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
* @note In case of average pooling the following information must be passed at compile time:
- * -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
* -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
* -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
* -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
@@ -201,14 +233,25 @@ __kernel void pooling_layer_2(
VEC_DATA_TYPE(DATA_TYPE, 2)
data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 = POW2_OP(data0, 2);
+ data1 = POW2_OP(data1, 2);
+#endif /* defined(POOL_L2) */
+
// Perform calculations
data0 = POOL_OP(data0, data1);
DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
- // Divide by pool region in case of average pooling
-#ifdef POOL_AVG
+#if defined(POOL_AVG) || defined(POOL_L2)
+ // Divide by pool region in case of average or l2 pooling
res = DIV_OP(res, calculate_avg_scale(2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* POOL_AVG */
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
// Store result
*(__global DATA_TYPE *)output.ptr = res;
@@ -218,7 +261,7 @@ __kernel void pooling_layer_2(
*
* @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
* @note In case of average pooling the following information must be passed at compile time:
- * -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
* -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
* -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
* -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
@@ -256,15 +299,27 @@ __kernel void pooling_layer_3(
VEC_DATA_TYPE(DATA_TYPE, 3)
data2 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 = POW2_OP(data0, 3);
+ data1 = POW2_OP(data1, 3);
+ data2 = POW2_OP(data2, 3);
+#endif /* defined(POOL_L2) */
+
// Perform calculations
data0 = POOL_OP(data0, data1);
data0 = POOL_OP(data0, data2);
DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
+#if defined(POOL_AVG) || defined(POOL_L2)
// Divide by pool region in case of average pooling
-#ifdef POOL_AVG
res = DIV_OP(res, calculate_avg_scale(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* POOL_AVG */
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
// Store result
*(__global DATA_TYPE *)output.ptr = res;
@@ -290,7 +345,7 @@ calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upp
*
* @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
* @note In case of average pooling the following information must be passed at compile time:
- * -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
* -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
* -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
* -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
@@ -326,10 +381,15 @@ __kernel void pooling_layer_3_optimized(
// Perform pooling 3x3 for 4 output elements
POOLING3x3(res, input, output);
+#if defined(POOL_AVG) || defined(POOL_L2)
// Divide by pool region in case of average pooling
-#ifdef POOL_AVG
res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#endif // POOL_AVG
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
vstore4(res, 0, (__global DATA_TYPE *)output.ptr);
}
@@ -339,7 +399,7 @@ __kernel void pooling_layer_3_optimized(
*
* @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
* @note In case of average pooling the following information must be passed at compile time:
- * -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
* -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
* -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
* -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
@@ -385,6 +445,17 @@ __kernel void pooling_layer_7(
VEC_DATA_TYPE(DATA_TYPE, 8)
data6 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 = POW2_OP(data0, 8);
+ data1 = POW2_OP(data1, 8);
+ data2 = POW2_OP(data2, 8);
+ data3 = POW2_OP(data3, 8);
+ data4 = POW2_OP(data4, 8);
+ data5 = POW2_OP(data5, 8);
+ data6 = POW2_OP(data6, 8);
+#endif /* defined(POOL_L2) */
+
// Pool operation of all rows
data0 = POOL_OP(data0, data1);
data2 = POOL_OP(data2, data3);
@@ -394,11 +465,11 @@ __kernel void pooling_layer_7(
data0 = POOL_OP(data0, data4);
// Set last element
-#ifdef POOL_AVG
+#if defined(POOL_AVG) || defined(POOL_L2)
data0.s7 = 0;
-#else /* POOL_AVG */
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
data0.s7 = data0.s6;
-#endif /* POOL_AVG */
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
// Reduce result
VEC_DATA_TYPE(DATA_TYPE, 4)
@@ -407,10 +478,15 @@ __kernel void pooling_layer_7(
reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
+#if defined(POOL_AVG) || defined(POOL_L2)
// Divide by pool region in case of average pooling
-#ifdef POOL_AVG
res = DIV_OP(res, calculate_avg_scale(7, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* POOL_AVG */
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
// Store result
*(__global DATA_TYPE *)output.ptr = res;
@@ -419,9 +495,9 @@ __kernel void pooling_layer_7(
#if defined(POOL_SIZE)
// Set the initial value for the pooling operation accordingly with the data type
-#if defined(POOL_AVG)
+#if defined(POOL_AVG) || defined(POOL_L2)
#define INITIAL_VALUE 0
-#else // POOL_AVG
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
#ifdef FIXED_POINT_POSITION
#define MIN_VAL_EXPAND(type) type##_MIN
#define MIN_VAL(type) MIN_VAL_EXPAND(type)
@@ -485,6 +561,10 @@ __kernel void pooling_layer_N(
{
VEC_DATA_TYPE(DATA_TYPE, 8)
data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif /* defined(POOL_L2) */
vdata = POOL_OP(vdata, data0);
}
@@ -492,7 +572,11 @@ __kernel void pooling_layer_N(
for(; x < (int)POOL_SIZE; ++x)
{
DATA_TYPE data0 = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
- sdata = POOL_OP(sdata, data0);
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif /* defined(POOL_L2) */
+ sdata = POOL_OP(sdata, data0);
}
}
@@ -504,10 +588,15 @@ __kernel void pooling_layer_N(
DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
res = POOL_OP(res, sdata);
+#if defined(POOL_AVG) || defined(POOL_L2)
// Divide by pool region in case of average pooling
-#ifdef POOL_AVG
res = DIV_OP(res, calculate_avg_scale(POOL_SIZE, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* POOL_AVG */
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
// Store result
*(__global DATA_TYPE *)output.ptr = res;
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 22c7730963..497e87b2b5 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -101,14 +101,14 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
// Set build options
std::set<std::string> build_opts;
build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.emplace(("-DPOOL_" + ((PoolingType::MAX == pool_type) ? std::string("MAX") : std::string("AVG"))));
+ build_opts.emplace(("-DPOOL_" + string_from_pooling_type(pool_type)));
if(is_data_type_fixed_point(input->info()->data_type()))
{
build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
}
build_opts.emplace(("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x)));
- if(pool_type == PoolingType::AVG)
+ if(pool_type != PoolingType::MAX)
{
build_opts.emplace(("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + pool_pad_x)));
build_opts.emplace(("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + pool_pad_y)));
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index fdcbd5a898..b97564e77b 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
@@ -36,6 +37,7 @@
#include <algorithm>
#include <arm_neon.h>
+#include <cmath>
#include <limits>
#include <set>
#include <string>
@@ -111,6 +113,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_fixed_point(input->info()->data_type()));
ARM_COMPUTE_ERROR_ON(supported_pool_sizes.find(pool_size) == supported_pool_sizes.end());
ARM_COMPUTE_ERROR_ON(7 == pool_size && input->info()->data_type() != DataType::F32);
ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
@@ -235,41 +238,146 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
case 2:
if(input->info()->data_type() == DataType::QS8)
{
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
}
else if(input->info()->data_type() == DataType::QS16)
{
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
}
else if(input->info()->data_type() == DataType::F16)
{
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX>;
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG>;
+ break;
+ case PoolingType::L2:
+ _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
}
else if(input->info()->data_type() == DataType::F32)
{
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG>;
+ break;
+ case PoolingType::L2:
+ _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
}
break;
case 3:
if(input->info()->data_type() == DataType::QS8)
{
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
}
else if(input->info()->data_type() == DataType::QS16)
{
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
}
else if(input->info()->data_type() == DataType::F16)
{
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX>;
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG>;
+ break;
+ case PoolingType::L2:
+ _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
}
else if(input->info()->data_type() == DataType::F32)
{
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG>;
+ break;
+ case PoolingType::L2:
+ _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
}
break;
case 7:
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX>;
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG>;
+ break;
+ case PoolingType::L2:
+ _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling size");
@@ -436,11 +544,20 @@ void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window
execute_window_loop(window, [&](const Coordinates & id)
{
- const float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
- const float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
- const float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
- float16x4_t res = {};
- if(pooling_type == PoolingType::AVG)
+ float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
+ float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
+ float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
+ float16x4_t res = {};
+
+ // Get power of 2 in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ top_data = vmul_f16(top_data, top_data);
+ middle_data = vmul_f16(middle_data, middle_data);
+ bottom_data = vmul_f16(bottom_data, bottom_data);
+ }
+
+ if(pooling_type != PoolingType::MAX)
{
// Calculate scale
const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
@@ -456,6 +573,13 @@ void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window
res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
res = vpmax_f16(res, res);
}
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ res = vinv_f16(vinvsqrt_f16(res));
+ }
+
*(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
},
input, output);
@@ -484,11 +608,20 @@ void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window
execute_window_loop(window, [&](const Coordinates & id)
{
- const auto top_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
- const auto bottom_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
+ auto top_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
+ auto bottom_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
float16x8_t res = {};
- if(pooling_type == PoolingType::AVG)
+ // Get power of 2 in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ top_data.val[0] = vmulq_f16(top_data.val[0], top_data.val[0]);
+ top_data.val[1] = vmulq_f16(top_data.val[1], top_data.val[1]);
+ bottom_data.val[0] = vmulq_f16(bottom_data.val[0], bottom_data.val[0]);
+ bottom_data.val[1] = vmulq_f16(bottom_data.val[1], bottom_data.val[1]);
+ }
+
+ if(pooling_type != PoolingType::MAX)
{
const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
const float16x8_t scale_v = vdupq_n_f16(scale);
@@ -498,6 +631,14 @@ void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window
{
res = vmaxq_f16(bottom_data.val[1], vmaxq_f16(bottom_data.val[0], vmaxq_f16(top_data.val[0], top_data.val[1])));
}
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ res = vinvq_f16(vinvsqrtq_f16(res));
+ }
+
+ // Store result
vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
},
input, output);
@@ -529,10 +670,19 @@ void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window
execute_window_loop(window, [&](const Coordinates & id)
{
- const float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
- const float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
- float32x2_t res = {};
- if(pooling_type == PoolingType::AVG)
+ float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
+ float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
+ float32x2_t res = {};
+ float final_res = 0;
+
+ // Get power of 2 in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ top_data = vmul_f32(top_data, top_data);
+ bottom_data = vmul_f32(bottom_data, bottom_data);
+ }
+
+ if(pooling_type != PoolingType::MAX)
{
// Calculate scale
float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
@@ -547,7 +697,16 @@ void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window
const float32x2_t max_data = vmax_f32(top_data, bottom_data);
res = vpmax_f32(max_data, max_data);
}
- *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
+ final_res = vget_lane_f32(res, 0);
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(output.ptr())) = final_res;
},
input, output);
}
@@ -719,11 +878,21 @@ void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window
execute_window_loop(window, [&](const Coordinates & id)
{
- const float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
- const float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
- const float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
- float32x2_t res = {};
- if(pooling_type == PoolingType::AVG)
+ float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
+ float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
+ float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
+ float32x2_t res = {};
+ float final_res = 0;
+
+ // Get power of 2 in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ top_data = vmulq_f32(top_data, top_data);
+ middle_data = vmulq_f32(middle_data, middle_data);
+ bottom_data = vmulq_f32(bottom_data, bottom_data);
+ }
+
+ if(pooling_type != PoolingType::MAX)
{
// Calculate scale
float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
@@ -740,7 +909,16 @@ void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window
res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
res = vpmax_f32(res, res);
}
- *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
+ final_res = vget_lane_f32(res, 0);
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(output.ptr())) = final_res;
},
input, output);
}
@@ -769,19 +947,32 @@ void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window
execute_window_loop(window, [&](const Coordinates & id)
{
- float32x2_t res = {};
- if(pooling_type == PoolingType::AVG)
+ float32x2_t res = {};
+ float final_res = 0.f;
+ if(pooling_type != PoolingType::MAX)
{
// Calculate scale
float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
const float32x2_t scale_v = vdup_n_f32(scale);
// Perform pooling
- float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
- float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
+ float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
+ // Get power of 2 in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ data.val[0] = vmulq_f32(data.val[0], data.val[0]);
+ data.val[1] = vmulq_f32(data.val[1], data.val[1]);
+ }
+ float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
for(int i = 1; i < pool_size; ++i)
{
- data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
+ data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
+ // Get power of 2 in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ data.val[0] = vmulq_f32(data.val[0], data.val[0]);
+ data.val[1] = vmulq_f32(data.val[1], data.val[1]);
+ }
sum_data = vaddq_f32(sum_data, data.val[0]);
sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
}
@@ -800,7 +991,16 @@ void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window
res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
res = vpmax_f32(res, res);
}
- *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
+ final_res = vget_lane_f32(res, 0);
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(output.ptr())) = final_res;
},
input, output);
}
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index 66d3fe8f78..99d39569c7 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -228,6 +228,18 @@ const std::string &arm_compute::string_from_norm_type(NormType type)
return norm_type_map[type];
}
+const std::string &arm_compute::string_from_pooling_type(PoolingType type)
+{
+ static std::map<PoolingType, const std::string> pool_type_map =
+ {
+ { PoolingType::MAX, "MAX" },
+ { PoolingType::AVG, "AVG" },
+ { PoolingType::L2, "L2" },
+ };
+
+ return pool_type_map[type];
+}
+
std::string arm_compute::lower_string(const std::string &val)
{
std::string res = val;