From cdf51455df8835e9e3bfd3e31ed389146af9a573 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Thu, 31 Aug 2017 14:21:36 +0100 Subject: COMPMID-515: L2 Pooling for FP32/FP16 in CL. Change-Id: I43641fa672f5905ca62edd1f63fc93e0cf7ea382 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/85963 Tested-by: Kaizen Reviewed-by: Gian Marco Iodice --- arm_compute/core/NEON/NEMath.h | 41 +++- arm_compute/core/NEON/NEMath.inl | 33 +++ arm_compute/core/Types.h | 3 +- arm_compute/core/Utils.h | 7 + .../runtime/NEON/functions/NEPoolingLayer.h | 2 +- scripts/clang-tidy.h | 20 ++ src/core/CL/cl_kernels/fixed_point.h | 16 +- src/core/CL/cl_kernels/pooling_layer.cl | 139 +++++++++-- src/core/CL/kernels/CLPoolingLayerKernel.cpp | 4 +- src/core/NEON/kernels/NEPoolingLayerKernel.cpp | 268 ++++++++++++++++++--- src/core/Utils.cpp | 12 + tests/TypePrinter.h | 3 + tests/datasets/PoolingTypesDataset.h | 2 +- tests/validation/CL/PoolingLayer.cpp | 2 +- tests/validation/CPP/PoolingLayer.cpp | 54 ++++- tests/validation/NEON/PoolingLayer.cpp | 2 +- 16 files changed, 524 insertions(+), 84 deletions(-) diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h index 523649c65d..ba65926802 100644 --- a/arm_compute/core/NEON/NEMath.h +++ b/arm_compute/core/NEON/NEMath.h @@ -42,17 +42,23 @@ float32x4_t vfloorq_f32(float32x4_t val); * * @return The calculated inverse square root. */ -float32x4_t vinvsqrtq_f32(float32x4_t x); +float32x2_t vinvsqrt_f32(float32x2_t x); -#ifdef ARM_COMPUTE_ENABLE_FP16 /** Calculate inverse square root. * * @param[in] x Input value. * * @return The calculated inverse square root. */ -float16x8_t vinvsqrtq_f16(float16x8_t x); -#endif /* ARM_COMPUTE_ENABLE_FP16 */ +float32x4_t vinvsqrtq_f32(float32x4_t x); + +/** Calculate reciprocal. + * + * @param[in] x Input value. + * + * @return The calculated reciprocal. + */ +float32x2_t vinv_f32(float32x2_t x); /** Calculate reciprocal. * @@ -122,6 +128,31 @@ float32x4_t vpowq_f32(float32x4_t val, float32x4_t n); * @return The calculated Hyperbolic Tangent. */ float16x8_t vtanhq_f16(float16x8_t val); + +/** Calculate reciprocal. + * + * @param[in] x Input value. + * + * @return The calculated reciprocal. + */ +float16x4_t vinv_f16(float16x4_t x); + +/** Calculate reciprocal. + * + * @param[in] x Input value. + * + * @return The calculated reciprocal. + */ +float16x8_t vinvq_f16(float16x8_t x); + +/** Calculate inverse square root. + * + * @param[in] x Input value. + * + * @return The calculated inverse square root. + */ +float16x4_t vinvsqrt_f16(float16x4_t x); + /** Calculate inverse square root. * * @param[in] x Input value. @@ -129,6 +160,7 @@ float16x8_t vtanhq_f16(float16x8_t val); * @return The calculated inverse square root. */ float16x8_t vinvsqrtq_f16(float16x8_t x); + /** Calculate exponential * * @param[in] x Input vector value in F16 format. @@ -136,6 +168,7 @@ float16x8_t vinvsqrtq_f16(float16x8_t x); * @return The calculated exponent. */ float16x8_t vexpq_f16(float16x8_t x); + /** Calculate n power of a number. * * pow(x,n) = e^(n*log(x)) diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl index bdd747c4e9..50d85396d4 100644 --- a/arm_compute/core/NEON/NEMath.inl +++ b/arm_compute/core/NEON/NEMath.inl @@ -64,6 +64,15 @@ inline float32x4_t vfloorq_f32(float32x4_t val) return vbslq_f32(vcgtq_f32(r, val), vsubq_f32(r, CONST_1), r); } +inline float32x2_t vinvsqrt_f32(float32x2_t x) +{ + float32x2_t sqrt_reciprocal = vrsqrte_f32(x); + sqrt_reciprocal = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + sqrt_reciprocal = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + + return sqrt_reciprocal; +} + inline float32x4_t vinvsqrtq_f32(float32x4_t x) { float32x4_t sqrt_reciprocal = vrsqrteq_f32(x); @@ -73,6 +82,14 @@ inline float32x4_t vinvsqrtq_f32(float32x4_t x) return sqrt_reciprocal; } +inline float32x2_t vinv_f32(float32x2_t x) +{ + float32x2_t recip = vrecpe_f32(x); + recip = vmul_f32(vrecps_f32(x, recip), recip); + recip = vmul_f32(vrecps_f32(x, recip), recip); + return recip; +} + inline float32x4_t vinvq_f32(float32x4_t x) { float32x4_t recip = vrecpeq_f32(x); @@ -182,6 +199,14 @@ const std::array log_tab_f16 = } }; +inline float16x4_t vinvsqrt_f16(float16x4_t x) +{ + float16x4_t sqrt_reciprocal = vrsqrte_f16(x); + sqrt_reciprocal = vmul_f16(vrsqrts_f16(vmul_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + sqrt_reciprocal = vmul_f16(vrsqrts_f16(vmul_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + return sqrt_reciprocal; +} + inline float16x8_t vinvsqrtq_f16(float16x8_t x) { float16x8_t sqrt_reciprocal = vrsqrteq_f16(x); @@ -190,6 +215,14 @@ inline float16x8_t vinvsqrtq_f16(float16x8_t x) return sqrt_reciprocal; } +inline float16x4_t vinv_f16(float16x4_t x) +{ + float16x4_t recip = vrecpe_f16(x); + recip = vmul_f16(vrecps_f16(x, recip), recip); + recip = vmul_f16(vrecps_f16(x, recip), recip); + return recip; +} + inline float16x8_t vinvq_f16(float16x8_t x) { float16x8_t recip = vrecpeq_f16(x); diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index 7d9cd4e0cc..8750a9cf1f 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -398,7 +398,8 @@ enum class DimensionRoundingType enum class PoolingType { MAX, /**< Max Pooling */ - AVG /**< Average Pooling */ + AVG, /**< Average Pooling */ + L2 /**< L2 Pooling */ }; /** Padding and stride information class */ diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h index 39ec6587de..ab5d110f91 100644 --- a/arm_compute/core/Utils.h +++ b/arm_compute/core/Utils.h @@ -609,6 +609,13 @@ const std::string &string_from_border_mode(BorderMode border_mode); * @return The string describing the normalization type. */ const std::string &string_from_norm_type(NormType type); +/** Translates a given pooling type to a string. + * + * @param[in] type @ref PoolingType to be translated to string. + * + * @return The string describing the pooling type. + */ +const std::string &string_from_pooling_type(PoolingType type); /** Lower a given string. * * @param[in] val Given string to lower. diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h index 5a9cffa5ae..5c36e80f37 100644 --- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h @@ -42,7 +42,7 @@ class NEPoolingLayer : public INESimpleFunction public: /** Set the input and output tensors. * - * @param[in, out] input Source tensor. (Written to only when padding != 0) Data types supported: QS8/F32. + * @param[in, out] input Source tensor. (Written to only when padding != 0) Data types supported: QS8/QS16/F16/F32. * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ diff --git a/scripts/clang-tidy.h b/scripts/clang-tidy.h index 2c932ca3a4..318b85ed58 100644 --- a/scripts/clang-tidy.h +++ b/scripts/clang-tidy.h @@ -1,6 +1,11 @@ #include //FIXME: Remove this file before the release +inline float16x4_t vrsqrts_f16 (float16x4_t, float16x4_t) +{ + return vdup_n_f16(0); +} + inline float16x8_t vrsqrtsq_f16 (float16x8_t, float16x8_t) { return vdupq_n_f16(0); @@ -121,6 +126,11 @@ inline float16x4_t vbsl_f16 (uint16x4_t,float16x4_t, float16x4_t) return vdup_n_f16(0); } +inline float16x4_t vrsqrte_f16(float16x4_t) +{ + return vdup_n_f16(0); +} + inline float16x8_t vrsqrteq_f16(float16x8_t) { return vdupq_n_f16(0); @@ -131,11 +141,21 @@ inline float16x8_t vfmsq_f16 (float16x8_t, float16x8_t, float16x8_t) return vdupq_n_f16(0); } +inline float16x4_t vrecpe_f16 (float16x4_t) +{ + return vdup_n_f16(0); +} + inline float16x8_t vrecpeq_f16 (float16x8_t) { return vdupq_n_f16(0); } +inline float16x4_t vrecps_f16 (float16x4_t, float16x4_t) +{ + return vdup_n_f16(0); +} + inline float16x8_t vrecpsq_f16 (float16x8_t, float16x8_t) { return vdupq_n_f16(0); diff --git a/src/core/CL/cl_kernels/fixed_point.h b/src/core/CL/cl_kernels/fixed_point.h index 478a414cad..5476a6e070 100644 --- a/src/core/CL/cl_kernels/fixed_point.h +++ b/src/core/CL/cl_kernels/fixed_point.h @@ -241,9 +241,17 @@ MULQ_IMPL(qs16x16, qs32x16) return CONVERT_SAT((res >> (itype)fixed_point_position), type); \ } +MULQ_SAT_IMPL(qs8x1, qs16x1) +MULQ_SAT_IMPL(qs8x2, qs16x2) +MULQ_SAT_IMPL(qs8x3, qs16x3) +MULQ_SAT_IMPL(qs8x4, qs16x4) MULQ_SAT_IMPL(qs8x8, qs16x8) -MULQ_SAT_IMPL(qs16x8, qs32x8) MULQ_SAT_IMPL(qs8x16, qs16x16) +MULQ_SAT_IMPL(qs16x1, qs32x1) +MULQ_SAT_IMPL(qs16x2, qs32x2) +MULQ_SAT_IMPL(qs16x3, qs32x3) +MULQ_SAT_IMPL(qs16x4, qs32x4) +MULQ_SAT_IMPL(qs16x8, qs32x8) MULQ_SAT_IMPL(qs16x16, qs32x16) #define MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) mul_sat_##type##x##size((a), (b), (position)) @@ -411,7 +419,7 @@ LOGQ_IMPL(qs16, qs16x16, 16) { \ type const_three = (type)(3 << (fixed_point_position)); \ type shift_value = (type)(16 - stype##_SHIFT) - (clz(VopA) + (type)fixed_point_position); \ - type temp = select(VopA >> shift_value, select((type)stype##_MAX, VopA << (-shift_value), clz(VopA) > (-shift_value)), shift_value < (type)0); \ + type temp = select((type)(VopA >> shift_value), select((type)stype##_MAX, (type)(VopA << (-shift_value)), (type)(clz(VopA) > (-shift_value))), (type)(shift_value < (type)0)); \ type x = temp; \ x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \ x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \ @@ -422,9 +430,11 @@ LOGQ_IMPL(qs16, qs16x16, 16) x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \ } \ type shift_value2 = select(shift_value >> 1, (-shift_value) >> 1, shift_value < (type)0); \ - return select(x >> shift_value2, select((type)stype##_MAX, x << shift_value2, clz(x) > shift_value2), shift_value < (type)0); /* Saturate result if needed */ \ + return select((type)(x >> shift_value2), select((type)stype##_MAX, (type)(x << shift_value2), (type)(clz(x) > shift_value2)), (type)(shift_value < (type)0)); /* Saturate result if needed */ \ } +INVSQRTQ_IMPL(qs8, qs8x1, 1) +INVSQRTQ_IMPL(qs16, qs16x1, 1) INVSQRTQ_IMPL(qs8, qs8x16, 16) INVSQRTQ_IMPL(qs16, qs16x8, 8) diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl index 0497bf4b91..99d7e6e01b 100644 --- a/src/core/CL/cl_kernels/pooling_layer.cl +++ b/src/core/CL/cl_kernels/pooling_layer.cl @@ -33,18 +33,32 @@ #define POOL_OP(x, y) (max((x), (y))) #endif /* POOL_AVG */ -#define DIV_OP1(x, y) DIV_SAT_OP_EXPAND((x), y, DATA_TYPE, FIXED_POINT_POSITION) +#define DIV_OP1(x, y) DIV_SAT_OP_EXPAND((x), (y), DATA_TYPE, FIXED_POINT_POSITION) #define DIV_OP(x, y) DIV_OP1(x, y << FIXED_POINT_POSITION) +#define SQRT_OP(x) DIV_OP1((1 << FIXED_POINT_POSITION), (INVSQRT_OP_EXPAND((x), DATA_TYPE, 1, FIXED_POINT_POSITION))) + +#if defined(POOL_L2) +#define POW2_OP(x, vec_size) MUL_SAT_OP_EXPAND((x), (x), DATA_TYPE, vec_size, FIXED_POINT_POSITION) +#else /* defined(POOL_L2) */ +#define POW2_OP(x, vec_size) (x) +#endif /* defined(POOL_L2) */ #else /* FIXED_POINT_POSITION */ -#if defined(POOL_AVG) +#if defined(POOL_AVG) || defined(POOL_L2) #define POOL_OP(x, y) ((x) + (y)) -#else /* POOL_AVG */ +#else /* defined(POOL_AVG) || defined(POOL_L2) */ #define POOL_OP(x, y) (fmax((x), (y))) -#endif /* POOL_AVG */ +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) +#define POW2_OP(x, vec_size) ((x) * (x)) +#else /* defined(POOL_L2) */ +#define POW2_OP(x, vec_size) (x) +#endif /* defined(POOL_L2) */ #define DIV_OP(x, y) (x * (1.f / y)) +#define SQRT_OP(x) sqrt((x)) #endif /* FIXED_POINT_POSITION */ @@ -70,6 +84,12 @@ data20 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ VEC_DATA_TYPE(DATA_TYPE, 2) \ data21 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4); \ + data00 = POW2_OP(data00, 4); \ + data01 = POW2_OP(data01, 2); \ + data10 = POW2_OP(data10, 4); \ + data11 = POW2_OP(data11, 2); \ + data20 = POW2_OP(data20, 4); \ + data21 = POW2_OP(data21, 2); \ \ VEC_DATA_TYPE(DATA_TYPE, 8) \ values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01212323); \ @@ -104,6 +124,12 @@ VEC_DATA_TYPE(DATA_TYPE, 8) \ data20 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ DATA_TYPE data21 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \ + data00 = POW2_OP(data00, 8); \ + data01 = POW2_OP(data01, 1); \ + data10 = POW2_OP(data10, 8); \ + data11 = POW2_OP(data11, 1); \ + data20 = POW2_OP(data20, 8); \ + data21 = POW2_OP(data21, 1); \ \ VEC_DATA_TYPE(DATA_TYPE, 8) \ values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01223445); \ @@ -141,6 +167,12 @@ data20 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ VEC_DATA_TYPE(DATA_TYPE, 4) \ data21 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \ + data00 = POW2_OP(data00, 8); \ + data01 = POW2_OP(data01, 4); \ + data10 = POW2_OP(data10, 8); \ + data11 = POW2_OP(data11, 4); \ + data20 = POW2_OP(data20, 8); \ + data21 = POW2_OP(data21, 4); \ \ data00 = POOL_OP(data00, data10); \ data01 = POOL_OP(data01, data11); \ @@ -165,7 +197,7 @@ DATA_TYPE calculate_avg_scale(const int pool_size, const int upper_bound_w, cons * * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32; * @note In case of average pooling the following information must be passed at compile time: - * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed. * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension @@ -201,14 +233,25 @@ __kernel void pooling_layer_2( VEC_DATA_TYPE(DATA_TYPE, 2) data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 = POW2_OP(data0, 2); + data1 = POW2_OP(data1, 2); +#endif /* defined(POOL_L2) */ + // Perform calculations data0 = POOL_OP(data0, data1); DATA_TYPE res = POOL_OP(data0.s0, data0.s1); - // Divide by pool region in case of average pooling -#ifdef POOL_AVG +#if defined(POOL_AVG) || defined(POOL_L2) + // Divide by pool region in case of average or l2 pooling res = DIV_OP(res, calculate_avg_scale(2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); -#endif /* POOL_AVG */ +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ // Store result *(__global DATA_TYPE *)output.ptr = res; @@ -218,7 +261,7 @@ __kernel void pooling_layer_2( * * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32; * @note In case of average pooling the following information must be passed at compile time: - * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed. * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension @@ -256,15 +299,27 @@ __kernel void pooling_layer_3( VEC_DATA_TYPE(DATA_TYPE, 3) data2 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 = POW2_OP(data0, 3); + data1 = POW2_OP(data1, 3); + data2 = POW2_OP(data2, 3); +#endif /* defined(POOL_L2) */ + // Perform calculations data0 = POOL_OP(data0, data1); data0 = POOL_OP(data0, data2); DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2); +#if defined(POOL_AVG) || defined(POOL_L2) // Divide by pool region in case of average pooling -#ifdef POOL_AVG res = DIV_OP(res, calculate_avg_scale(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); -#endif /* POOL_AVG */ +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ // Store result *(__global DATA_TYPE *)output.ptr = res; @@ -290,7 +345,7 @@ calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upp * * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32; * @note In case of average pooling the following information must be passed at compile time: - * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed. * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension @@ -326,10 +381,15 @@ __kernel void pooling_layer_3_optimized( // Perform pooling 3x3 for 4 output elements POOLING3x3(res, input, output); +#if defined(POOL_AVG) || defined(POOL_L2) // Divide by pool region in case of average pooling -#ifdef POOL_AVG res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); -#endif // POOL_AVG +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ vstore4(res, 0, (__global DATA_TYPE *)output.ptr); } @@ -339,7 +399,7 @@ __kernel void pooling_layer_3_optimized( * * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32; * @note In case of average pooling the following information must be passed at compile time: - * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed. * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension @@ -385,6 +445,17 @@ __kernel void pooling_layer_7( VEC_DATA_TYPE(DATA_TYPE, 8) data6 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0)); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 = POW2_OP(data0, 8); + data1 = POW2_OP(data1, 8); + data2 = POW2_OP(data2, 8); + data3 = POW2_OP(data3, 8); + data4 = POW2_OP(data4, 8); + data5 = POW2_OP(data5, 8); + data6 = POW2_OP(data6, 8); +#endif /* defined(POOL_L2) */ + // Pool operation of all rows data0 = POOL_OP(data0, data1); data2 = POOL_OP(data2, data3); @@ -394,11 +465,11 @@ __kernel void pooling_layer_7( data0 = POOL_OP(data0, data4); // Set last element -#ifdef POOL_AVG +#if defined(POOL_AVG) || defined(POOL_L2) data0.s7 = 0; -#else /* POOL_AVG */ +#else /* defined(POOL_AVG) || defined(POOL_L2) */ data0.s7 = data0.s6; -#endif /* POOL_AVG */ +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ // Reduce result VEC_DATA_TYPE(DATA_TYPE, 4) @@ -407,10 +478,15 @@ __kernel void pooling_layer_7( reduce2 = POOL_OP(reduce4.s01, reduce4.s23); DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1); +#if defined(POOL_AVG) || defined(POOL_L2) // Divide by pool region in case of average pooling -#ifdef POOL_AVG res = DIV_OP(res, calculate_avg_scale(7, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); -#endif /* POOL_AVG */ +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ // Store result *(__global DATA_TYPE *)output.ptr = res; @@ -419,9 +495,9 @@ __kernel void pooling_layer_7( #if defined(POOL_SIZE) // Set the initial value for the pooling operation accordingly with the data type -#if defined(POOL_AVG) +#if defined(POOL_AVG) || defined(POOL_L2) #define INITIAL_VALUE 0 -#else // POOL_AVG +#else /* defined(POOL_AVG) || defined(POOL_L2) */ #ifdef FIXED_POINT_POSITION #define MIN_VAL_EXPAND(type) type##_MIN #define MIN_VAL(type) MIN_VAL_EXPAND(type) @@ -485,6 +561,10 @@ __kernel void pooling_layer_N( { VEC_DATA_TYPE(DATA_TYPE, 8) data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 *= data0; +#endif /* defined(POOL_L2) */ vdata = POOL_OP(vdata, data0); } @@ -492,7 +572,11 @@ __kernel void pooling_layer_N( for(; x < (int)POOL_SIZE; ++x) { DATA_TYPE data0 = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); - sdata = POOL_OP(sdata, data0); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 *= data0; +#endif /* defined(POOL_L2) */ + sdata = POOL_OP(sdata, data0); } } @@ -504,10 +588,15 @@ __kernel void pooling_layer_N( DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1); res = POOL_OP(res, sdata); +#if defined(POOL_AVG) || defined(POOL_L2) // Divide by pool region in case of average pooling -#ifdef POOL_AVG res = DIV_OP(res, calculate_avg_scale(POOL_SIZE, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); -#endif /* POOL_AVG */ +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ // Store result *(__global DATA_TYPE *)output.ptr = res; diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp index 22c7730963..497e87b2b5 100644 --- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp @@ -101,14 +101,14 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, // Set build options std::set build_opts; build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - build_opts.emplace(("-DPOOL_" + ((PoolingType::MAX == pool_type) ? std::string("MAX") : std::string("AVG")))); + build_opts.emplace(("-DPOOL_" + string_from_pooling_type(pool_type))); if(is_data_type_fixed_point(input->info()->data_type())) { build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())); } build_opts.emplace(("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x))); - if(pool_type == PoolingType::AVG) + if(pool_type != PoolingType::MAX) { build_opts.emplace(("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + pool_pad_x))); build_opts.emplace(("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + pool_pad_y))); diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp index fdcbd5a898..b97564e77b 100644 --- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/NEON/NEMath.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" @@ -36,6 +37,7 @@ #include #include +#include #include #include #include @@ -111,6 +113,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons ARM_COMPUTE_ERROR_ON_NULLPTR(output); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_fixed_point(input->info()->data_type())); ARM_COMPUTE_ERROR_ON(supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()); ARM_COMPUTE_ERROR_ON(7 == pool_size && input->info()->data_type() != DataType::F32); ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size); @@ -235,41 +238,146 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons case 2: if(input->info()->data_type() == DataType::QS8) { - _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q8 : &NEPoolingLayerKernel::pooling2_q8; + switch(pool_type) + { + case PoolingType::AVG: + _func = &NEPoolingLayerKernel::pooling2_q8; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling2_q8; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } } else if(input->info()->data_type() == DataType::QS16) { - _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q16 : &NEPoolingLayerKernel::pooling2_q16; + switch(pool_type) + { + case PoolingType::AVG: + _func = &NEPoolingLayerKernel::pooling2_q16; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling2_q16; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } } else if(input->info()->data_type() == DataType::F16) { - _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f16 : &NEPoolingLayerKernel::pooling2_f16; + switch(pool_type) + { + case PoolingType::AVG: + _func = &NEPoolingLayerKernel::pooling2_f16; + break; + case PoolingType::L2: + _func = &NEPoolingLayerKernel::pooling2_f16; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling2_f16; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } } else if(input->info()->data_type() == DataType::F32) { - _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f32 : &NEPoolingLayerKernel::pooling2_f32; + switch(pool_type) + { + case PoolingType::AVG: + _func = &NEPoolingLayerKernel::pooling2_f32; + break; + case PoolingType::L2: + _func = &NEPoolingLayerKernel::pooling2_f32; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling2_f32; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } } break; case 3: if(input->info()->data_type() == DataType::QS8) { - _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q8 : &NEPoolingLayerKernel::pooling3_q8; + switch(pool_type) + { + case PoolingType::AVG: + _func = &NEPoolingLayerKernel::pooling3_q8; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling3_q8; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } } else if(input->info()->data_type() == DataType::QS16) { - _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q16 : &NEPoolingLayerKernel::pooling3_q16; + switch(pool_type) + { + case PoolingType::AVG: + _func = &NEPoolingLayerKernel::pooling3_q16; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling3_q16; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } } else if(input->info()->data_type() == DataType::F16) { - _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f16 : &NEPoolingLayerKernel::pooling3_f16; + switch(pool_type) + { + case PoolingType::AVG: + _func = &NEPoolingLayerKernel::pooling3_f16; + break; + case PoolingType::L2: + _func = &NEPoolingLayerKernel::pooling3_f16; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling3_f16; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } } else if(input->info()->data_type() == DataType::F32) { - _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f32 : &NEPoolingLayerKernel::pooling3_f32; + switch(pool_type) + { + case PoolingType::AVG: + _func = &NEPoolingLayerKernel::pooling3_f32; + break; + case PoolingType::L2: + _func = &NEPoolingLayerKernel::pooling3_f32; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling3_f32; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } } break; case 7: - _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling7_f32 : &NEPoolingLayerKernel::pooling7_f32; + switch(pool_type) + { + case PoolingType::AVG: + _func = &NEPoolingLayerKernel::pooling7_f32; + break; + case PoolingType::L2: + _func = &NEPoolingLayerKernel::pooling7_f32; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling7_f32; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } break; default: ARM_COMPUTE_ERROR("Unsupported pooling size"); @@ -436,11 +544,20 @@ void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window execute_window_loop(window, [&](const Coordinates & id) { - const float16x4_t top_data = vld1_f16(reinterpret_cast(input_top_ptr + input.offset())); - const float16x4_t middle_data = vld1_f16(reinterpret_cast(input_middle_ptr + input.offset())); - const float16x4_t bottom_data = vld1_f16(reinterpret_cast(input_bottom_ptr + input.offset())); - float16x4_t res = {}; - if(pooling_type == PoolingType::AVG) + float16x4_t top_data = vld1_f16(reinterpret_cast(input_top_ptr + input.offset())); + float16x4_t middle_data = vld1_f16(reinterpret_cast(input_middle_ptr + input.offset())); + float16x4_t bottom_data = vld1_f16(reinterpret_cast(input_bottom_ptr + input.offset())); + float16x4_t res = {}; + + // Get power of 2 in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + top_data = vmul_f16(top_data, top_data); + middle_data = vmul_f16(middle_data, middle_data); + bottom_data = vmul_f16(bottom_data, bottom_data); + } + + if(pooling_type != PoolingType::MAX) { // Calculate scale const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y); @@ -456,6 +573,13 @@ void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window res = vpmax_f16(vset_lane_f16(-std::numeric_limits::max(), max_data, 3), max_data); res = vpmax_f16(res, res); } + + // Calculate square-root in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + res = vinv_f16(vinvsqrt_f16(res)); + } + *(reinterpret_cast(output.ptr())) = vget_lane_f16(res, 0); }, input, output); @@ -484,11 +608,20 @@ void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window execute_window_loop(window, [&](const Coordinates & id) { - const auto top_data = vld2q_f16(reinterpret_cast(input_top_ptr + input.offset())); - const auto bottom_data = vld2q_f16(reinterpret_cast(input_bottom_ptr + input.offset())); + auto top_data = vld2q_f16(reinterpret_cast(input_top_ptr + input.offset())); + auto bottom_data = vld2q_f16(reinterpret_cast(input_bottom_ptr + input.offset())); float16x8_t res = {}; - if(pooling_type == PoolingType::AVG) + // Get power of 2 in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + top_data.val[0] = vmulq_f16(top_data.val[0], top_data.val[0]); + top_data.val[1] = vmulq_f16(top_data.val[1], top_data.val[1]); + bottom_data.val[0] = vmulq_f16(bottom_data.val[0], bottom_data.val[0]); + bottom_data.val[1] = vmulq_f16(bottom_data.val[1], bottom_data.val[1]); + } + + if(pooling_type != PoolingType::MAX) { const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y); const float16x8_t scale_v = vdupq_n_f16(scale); @@ -498,6 +631,14 @@ void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window { res = vmaxq_f16(bottom_data.val[1], vmaxq_f16(bottom_data.val[0], vmaxq_f16(top_data.val[0], top_data.val[1]))); } + + // Calculate square-root in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + res = vinvq_f16(vinvsqrtq_f16(res)); + } + + // Store result vst1q_f16(reinterpret_cast(output.ptr()), res); }, input, output); @@ -529,10 +670,19 @@ void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window execute_window_loop(window, [&](const Coordinates & id) { - const float32x2_t top_data = vld1_f32(reinterpret_cast(input_top_ptr + input.offset())); - const float32x2_t bottom_data = vld1_f32(reinterpret_cast(input_bottom_ptr + input.offset())); - float32x2_t res = {}; - if(pooling_type == PoolingType::AVG) + float32x2_t top_data = vld1_f32(reinterpret_cast(input_top_ptr + input.offset())); + float32x2_t bottom_data = vld1_f32(reinterpret_cast(input_bottom_ptr + input.offset())); + float32x2_t res = {}; + float final_res = 0; + + // Get power of 2 in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + top_data = vmul_f32(top_data, top_data); + bottom_data = vmul_f32(bottom_data, bottom_data); + } + + if(pooling_type != PoolingType::MAX) { // Calculate scale float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y); @@ -547,7 +697,16 @@ void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window const float32x2_t max_data = vmax_f32(top_data, bottom_data); res = vpmax_f32(max_data, max_data); } - *(reinterpret_cast(output.ptr())) = vget_lane_f32(res, 0); + final_res = vget_lane_f32(res, 0); + + // Calculate square-root in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + final_res = sqrt(final_res); + } + + // Store result + *(reinterpret_cast(output.ptr())) = final_res; }, input, output); } @@ -719,11 +878,21 @@ void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window execute_window_loop(window, [&](const Coordinates & id) { - const float32x4_t top_data = vld1q_f32(reinterpret_cast(input_top_ptr + input.offset())); - const float32x4_t middle_data = vld1q_f32(reinterpret_cast(input_middle_ptr + input.offset())); - const float32x4_t bottom_data = vld1q_f32(reinterpret_cast(input_bottom_ptr + input.offset())); - float32x2_t res = {}; - if(pooling_type == PoolingType::AVG) + float32x4_t top_data = vld1q_f32(reinterpret_cast(input_top_ptr + input.offset())); + float32x4_t middle_data = vld1q_f32(reinterpret_cast(input_middle_ptr + input.offset())); + float32x4_t bottom_data = vld1q_f32(reinterpret_cast(input_bottom_ptr + input.offset())); + float32x2_t res = {}; + float final_res = 0; + + // Get power of 2 in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + top_data = vmulq_f32(top_data, top_data); + middle_data = vmulq_f32(middle_data, middle_data); + bottom_data = vmulq_f32(bottom_data, bottom_data); + } + + if(pooling_type != PoolingType::MAX) { // Calculate scale float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y); @@ -740,7 +909,16 @@ void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits::max(), max_data, 3)), vget_low_f32(max_data)); res = vpmax_f32(res, res); } - *(reinterpret_cast(output.ptr())) = vget_lane_f32(res, 0); + final_res = vget_lane_f32(res, 0); + + // Calculate square-root in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + final_res = sqrt(final_res); + } + + // Store result + *(reinterpret_cast(output.ptr())) = final_res; }, input, output); } @@ -769,19 +947,32 @@ void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window execute_window_loop(window, [&](const Coordinates & id) { - float32x2_t res = {}; - if(pooling_type == PoolingType::AVG) + float32x2_t res = {}; + float final_res = 0.f; + if(pooling_type != PoolingType::MAX) { // Calculate scale float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y); const float32x2_t scale_v = vdup_n_f32(scale); // Perform pooling - float32x4x2_t data = vld2q_f32(reinterpret_cast(input_ptrs[0] + input.offset())); - float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3)); + float32x4x2_t data = vld2q_f32(reinterpret_cast(input_ptrs[0] + input.offset())); + // Get power of 2 in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + data.val[0] = vmulq_f32(data.val[0], data.val[0]); + data.val[1] = vmulq_f32(data.val[1], data.val[1]); + } + float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3)); for(int i = 1; i < pool_size; ++i) { - data = vld2q_f32(reinterpret_cast(input_ptrs[i] + input.offset())); + data = vld2q_f32(reinterpret_cast(input_ptrs[i] + input.offset())); + // Get power of 2 in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + data.val[0] = vmulq_f32(data.val[0], data.val[0]); + data.val[1] = vmulq_f32(data.val[1], data.val[1]); + } sum_data = vaddq_f32(sum_data, data.val[0]); sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3)); } @@ -800,7 +991,16 @@ void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0]))); res = vpmax_f32(res, res); } - *(reinterpret_cast(output.ptr())) = vget_lane_f32(res, 0); + final_res = vget_lane_f32(res, 0); + + // Calculate square-root in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + final_res = sqrt(final_res); + } + + // Store result + *(reinterpret_cast(output.ptr())) = final_res; }, input, output); } diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index 66d3fe8f78..99d39569c7 100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -228,6 +228,18 @@ const std::string &arm_compute::string_from_norm_type(NormType type) return norm_type_map[type]; } +const std::string &arm_compute::string_from_pooling_type(PoolingType type) +{ + static std::map pool_type_map = + { + { PoolingType::MAX, "MAX" }, + { PoolingType::AVG, "AVG" }, + { PoolingType::L2, "L2" }, + }; + + return pool_type_map[type]; +} + std::string arm_compute::lower_string(const std::string &val) { std::string res = val; diff --git a/tests/TypePrinter.h b/tests/TypePrinter.h index 2f9909ca2e..d3d9f8f5ac 100644 --- a/tests/TypePrinter.h +++ b/tests/TypePrinter.h @@ -367,6 +367,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const PoolingType &pool_ty case PoolingType::MAX: os << "MAX"; break; + case PoolingType::L2: + os << "L2"; + break; default: ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); } diff --git a/tests/datasets/PoolingTypesDataset.h b/tests/datasets/PoolingTypesDataset.h index 5ba8aaf6d0..5994d6ecd0 100644 --- a/tests/datasets/PoolingTypesDataset.h +++ b/tests/datasets/PoolingTypesDataset.h @@ -41,7 +41,7 @@ public: PoolingTypes() : ContainerDataset("PoolType", { - PoolingType::MAX, PoolingType::AVG + PoolingType::MAX, PoolingType::AVG, PoolingType::L2 }) { } diff --git a/tests/validation/CL/PoolingLayer.cpp b/tests/validation/CL/PoolingLayer.cpp index e82df07a91..44617f624c 100644 --- a/tests/validation/CL/PoolingLayer.cpp +++ b/tests/validation/CL/PoolingLayer.cpp @@ -48,7 +48,7 @@ const auto PoolingLayerDatasetFP = combine(combine(datasets::PoolingTypes(), fra framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) })); /** Input data set for quantized data types */ -const auto PoolingLayerDatasetQS = combine(combine(datasets::PoolingTypes(), framework::dataset::make("PoolingSize", { 2, 3 })), +const auto PoolingLayerDatasetQS = combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { 2, 3 })), framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) })); constexpr AbsoluteTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for float types */ diff --git a/tests/validation/CPP/PoolingLayer.cpp b/tests/validation/CPP/PoolingLayer.cpp index f7273f073f..85a8343d87 100644 --- a/tests/validation/CPP/PoolingLayer.cpp +++ b/tests/validation/CPP/PoolingLayer.cpp @@ -104,7 +104,7 @@ SimpleTensor pooling_layer(const SimpleTensor &src, PoolingLayerInfo info) } } } - else // Average pooling + else // Average or l2 pooling { for(int r = 0; r < upper_dims; ++r) { @@ -123,14 +123,29 @@ SimpleTensor pooling_layer(const SimpleTensor &src, PoolingLayerInfo info) wend = std::min(wend, w_src); hend = std::min(hend, h_src); - for(int y = hstart; y < hend; ++y) + if(type == PoolingType::AVG) { - for(int x = wstart; x < wend; ++x) + for(int y = hstart; y < hend; ++y) + { + for(int x = wstart; x < wend; ++x) + { + avg_val += src[r * h_src * w_src + y * w_src + x]; + } + } + dst[r * h_dst * w_dst + h * w_dst + w] = avg_val / pool; + } + else + { + for(int y = hstart; y < hend; ++y) { - avg_val += src[r * h_src * w_src + y * w_src + x]; + for(int x = wstart; x < wend; ++x) + { + const T val = src[r * h_src * w_src + y * w_src + x]; + avg_val += val * val; + } } + dst[r * h_dst * w_dst + h * w_dst + w] = std::sqrt(avg_val / pool); } - dst[r * h_dst * w_dst + h * w_dst + w] = avg_val / pool; } } } @@ -192,7 +207,7 @@ SimpleTensor pooling_layer(const SimpleTensor &src, PoolingLayerInfo info) } } } - else // Average pooling + else // Average or l2 pooling { for(int r = 0; r < upper_dims; ++r) { @@ -213,18 +228,35 @@ SimpleTensor pooling_layer(const SimpleTensor &src, PoolingLayerInfo info) using namespace fixed_point_arithmetic; const int fixed_point_position = src.fixed_point_position(); + const fixed_point const_1(1, fixed_point_position); const fixed_point invpool_fp(1.f / static_cast(pool), fixed_point_position); fixed_point avg_val(0, fixed_point_position, true); - for(int y = hstart; y < hend; ++y) + if(type == PoolingType::AVG) { - for(int x = wstart; x < wend; ++x) + for(int y = hstart; y < hend; ++y) + { + for(int x = wstart; x < wend; ++x) + { + const fixed_point in_fp(src[r * h_src * w_src + y * w_src + x], fixed_point_position, true); + avg_val = add(avg_val, in_fp); + } + } + dst[r * h_dst * w_dst + h * w_dst + w] = mul(avg_val, invpool_fp).raw(); + } + else + { + for(int y = hstart; y < hend; ++y) { - const fixed_point in_fp(src[r * h_src * w_src + y * w_src + x], fixed_point_position, true); - avg_val = add(avg_val, in_fp); + for(int x = wstart; x < wend; ++x) + { + const fixed_point in_fp(src[r * h_src * w_src + y * w_src + x], fixed_point_position, true); + avg_val = add(avg_val, mul(in_fp, in_fp)); + } } + auto res = div(const_1, (inv_sqrt(mul(avg_val, invpool_fp)))); + dst[r * h_dst * w_dst + h * w_dst + w] = res.raw(); } - dst[r * h_dst * w_dst + h * w_dst + w] = mul(avg_val, invpool_fp).raw(); } } } diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp index 98ec478267..5ebbc1bc96 100644 --- a/tests/validation/NEON/PoolingLayer.cpp +++ b/tests/validation/NEON/PoolingLayer.cpp @@ -48,7 +48,7 @@ const auto PoolingLayerDatasetFP = combine(combine(datasets::PoolingTypes(), fra framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) })); /** Input data set for quantized data types */ -const auto PoolingLayerDatasetQS = combine(combine(datasets::PoolingTypes(), framework::dataset::make("PoolingSize", { 2, 3 })), +const auto PoolingLayerDatasetQS = combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { 2, 3 })), framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) })); constexpr AbsoluteTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for float types */ -- cgit v1.2.1