From cdf51455df8835e9e3bfd3e31ed389146af9a573 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Thu, 31 Aug 2017 14:21:36 +0100
Subject: COMPMID-515: L2 Pooling for FP32/FP16 in CL.

Change-Id: I43641fa672f5905ca62edd1f63fc93e0cf7ea382
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/85963
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
---
 arm_compute/core/NEON/NEMath.h                     |  41 +++-
 arm_compute/core/NEON/NEMath.inl                   |  33 +++
 arm_compute/core/Types.h                           |   3 +-
 arm_compute/core/Utils.h                           |   7 +
 .../runtime/NEON/functions/NEPoolingLayer.h        |   2 +-
 scripts/clang-tidy.h                               |  20 ++
 src/core/CL/cl_kernels/fixed_point.h               |  16 +-
 src/core/CL/cl_kernels/pooling_layer.cl            | 139 +++++++++--
 src/core/CL/kernels/CLPoolingLayerKernel.cpp       |   4 +-
 src/core/NEON/kernels/NEPoolingLayerKernel.cpp     | 268 ++++++++++++++++++---
 src/core/Utils.cpp                                 |  12 +
 tests/TypePrinter.h                                |   3 +
 tests/datasets/PoolingTypesDataset.h               |   2 +-
 tests/validation/CL/PoolingLayer.cpp               |   2 +-
 tests/validation/CPP/PoolingLayer.cpp              |  54 ++++-
 tests/validation/NEON/PoolingLayer.cpp             |   2 +-
 16 files changed, 524 insertions(+), 84 deletions(-)

diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h
index 523649c65d..ba65926802 100644
--- a/arm_compute/core/NEON/NEMath.h
+++ b/arm_compute/core/NEON/NEMath.h
@@ -42,17 +42,23 @@ float32x4_t vfloorq_f32(float32x4_t val);
  *
  * @return The calculated inverse square root.
  */
-float32x4_t vinvsqrtq_f32(float32x4_t x);
+float32x2_t vinvsqrt_f32(float32x2_t x);
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
 /** Calculate inverse square root.
  *
  * @param[in] x Input value.
  *
  * @return The calculated inverse square root.
  */
-float16x8_t vinvsqrtq_f16(float16x8_t x);
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+float32x4_t vinvsqrtq_f32(float32x4_t x);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+float32x2_t vinv_f32(float32x2_t x);
 
 /** Calculate reciprocal.
  *
@@ -122,6 +128,31 @@ float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
  * @return The calculated Hyperbolic Tangent.
  */
 float16x8_t vtanhq_f16(float16x8_t val);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+float16x4_t vinv_f16(float16x4_t x);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+float16x8_t vinvq_f16(float16x8_t x);
+
+/** Calculate inverse square root.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated inverse square root.
+ */
+float16x4_t vinvsqrt_f16(float16x4_t x);
+
 /** Calculate inverse square root.
  *
  * @param[in] x Input value.
@@ -129,6 +160,7 @@ float16x8_t vtanhq_f16(float16x8_t val);
  * @return The calculated inverse square root.
  */
 float16x8_t vinvsqrtq_f16(float16x8_t x);
+
 /** Calculate exponential
  *
  * @param[in] x Input vector value in F16 format.
@@ -136,6 +168,7 @@ float16x8_t vinvsqrtq_f16(float16x8_t x);
  * @return The calculated exponent.
  */
 float16x8_t vexpq_f16(float16x8_t x);
+
 /** Calculate n power of a number.
  *
  * pow(x,n) = e^(n*log(x))
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
index bdd747c4e9..50d85396d4 100644
--- a/arm_compute/core/NEON/NEMath.inl
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -64,6 +64,15 @@ inline float32x4_t vfloorq_f32(float32x4_t val)
     return vbslq_f32(vcgtq_f32(r, val), vsubq_f32(r, CONST_1), r);
 }
 
+inline float32x2_t vinvsqrt_f32(float32x2_t x)
+{
+    float32x2_t sqrt_reciprocal = vrsqrte_f32(x);
+    sqrt_reciprocal             = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal             = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+
+    return sqrt_reciprocal;
+}
+
 inline float32x4_t vinvsqrtq_f32(float32x4_t x)
 {
     float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
@@ -73,6 +82,14 @@ inline float32x4_t vinvsqrtq_f32(float32x4_t x)
     return sqrt_reciprocal;
 }
 
+inline float32x2_t vinv_f32(float32x2_t x)
+{
+    float32x2_t recip = vrecpe_f32(x);
+    recip             = vmul_f32(vrecps_f32(x, recip), recip);
+    recip             = vmul_f32(vrecps_f32(x, recip), recip);
+    return recip;
+}
+
 inline float32x4_t vinvq_f32(float32x4_t x)
 {
     float32x4_t recip = vrecpeq_f32(x);
@@ -182,6 +199,14 @@ const std::array<float16x8_t, 8> log_tab_f16 =
     }
 };
 
+inline float16x4_t vinvsqrt_f16(float16x4_t x)
+{
+    float16x4_t sqrt_reciprocal = vrsqrte_f16(x);
+    sqrt_reciprocal             = vmul_f16(vrsqrts_f16(vmul_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal             = vmul_f16(vrsqrts_f16(vmul_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    return sqrt_reciprocal;
+}
+
 inline float16x8_t vinvsqrtq_f16(float16x8_t x)
 {
     float16x8_t sqrt_reciprocal = vrsqrteq_f16(x);
@@ -190,6 +215,14 @@ inline float16x8_t vinvsqrtq_f16(float16x8_t x)
     return sqrt_reciprocal;
 }
 
+inline float16x4_t vinv_f16(float16x4_t x)
+{
+    float16x4_t recip = vrecpe_f16(x);
+    recip             = vmul_f16(vrecps_f16(x, recip), recip);
+    recip             = vmul_f16(vrecps_f16(x, recip), recip);
+    return recip;
+}
+
 inline float16x8_t vinvq_f16(float16x8_t x)
 {
     float16x8_t recip = vrecpeq_f16(x);
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 7d9cd4e0cc..8750a9cf1f 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -398,7 +398,8 @@ enum class DimensionRoundingType
 enum class PoolingType
 {
     MAX, /**< Max Pooling */
-    AVG  /**< Average Pooling */
+    AVG, /**< Average Pooling */
+    L2   /**< L2 Pooling */
 };
 
 /** Padding and stride information class */
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 39ec6587de..ab5d110f91 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -609,6 +609,13 @@ const std::string &string_from_border_mode(BorderMode border_mode);
  * @return The string describing the normalization type.
  */
 const std::string &string_from_norm_type(NormType type);
+/** Translates a given pooling type to a string.
+ *
+ * @param[in] type @ref PoolingType to be translated to string.
+ *
+ * @return The string describing the pooling type.
+ */
+const std::string &string_from_pooling_type(PoolingType type);
 /** Lower a given string.
  *
  * @param[in] val Given string to lower.
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index 5a9cffa5ae..5c36e80f37 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -42,7 +42,7 @@ class NEPoolingLayer : public INESimpleFunction
 public:
     /** Set the input and output tensors.
      *
-     * @param[in, out] input     Source tensor. (Written to only when padding != 0) Data types supported: QS8/F32.
+     * @param[in, out] input     Source tensor. (Written to only when padding != 0) Data types supported: QS8/QS16/F16/F32.
      * @param[out]     output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]      pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      */
diff --git a/scripts/clang-tidy.h b/scripts/clang-tidy.h
index 2c932ca3a4..318b85ed58 100644
--- a/scripts/clang-tidy.h
+++ b/scripts/clang-tidy.h
@@ -1,6 +1,11 @@
 #include <arm_neon.h>
 //FIXME: Remove this file before the release
 
+inline float16x4_t vrsqrts_f16 (float16x4_t, float16x4_t)
+{
+  return vdup_n_f16(0);
+}
+
 inline float16x8_t vrsqrtsq_f16 (float16x8_t, float16x8_t)
 {
   return vdupq_n_f16(0);
@@ -121,6 +126,11 @@ inline float16x4_t vbsl_f16 (uint16x4_t,float16x4_t, float16x4_t)
   return vdup_n_f16(0);
 }
 
+inline float16x4_t vrsqrte_f16(float16x4_t)
+{
+   return vdup_n_f16(0);
+}
+
 inline float16x8_t vrsqrteq_f16(float16x8_t)
 {
    return vdupq_n_f16(0);
@@ -131,11 +141,21 @@ inline float16x8_t vfmsq_f16 (float16x8_t, float16x8_t, float16x8_t)
    return vdupq_n_f16(0);
 }
 
+inline float16x4_t vrecpe_f16 (float16x4_t)
+{
+   return vdup_n_f16(0);
+}
+
 inline float16x8_t vrecpeq_f16 (float16x8_t)
 {
    return vdupq_n_f16(0);
 }
 
+inline float16x4_t vrecps_f16 (float16x4_t, float16x4_t)
+{
+   return vdup_n_f16(0);
+}
+
 inline float16x8_t vrecpsq_f16 (float16x8_t, float16x8_t)
 {
    return vdupq_n_f16(0);
diff --git a/src/core/CL/cl_kernels/fixed_point.h b/src/core/CL/cl_kernels/fixed_point.h
index 478a414cad..5476a6e070 100644
--- a/src/core/CL/cl_kernels/fixed_point.h
+++ b/src/core/CL/cl_kernels/fixed_point.h
@@ -241,9 +241,17 @@ MULQ_IMPL(qs16x16, qs32x16)
         return CONVERT_SAT((res >> (itype)fixed_point_position), type);                       \
     }
 
+MULQ_SAT_IMPL(qs8x1, qs16x1)
+MULQ_SAT_IMPL(qs8x2, qs16x2)
+MULQ_SAT_IMPL(qs8x3, qs16x3)
+MULQ_SAT_IMPL(qs8x4, qs16x4)
 MULQ_SAT_IMPL(qs8x8, qs16x8)
-MULQ_SAT_IMPL(qs16x8, qs32x8)
 MULQ_SAT_IMPL(qs8x16, qs16x16)
+MULQ_SAT_IMPL(qs16x1, qs32x1)
+MULQ_SAT_IMPL(qs16x2, qs32x2)
+MULQ_SAT_IMPL(qs16x3, qs32x3)
+MULQ_SAT_IMPL(qs16x4, qs32x4)
+MULQ_SAT_IMPL(qs16x8, qs32x8)
 MULQ_SAT_IMPL(qs16x16, qs32x16)
 
 #define MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) mul_sat_##type##x##size((a), (b), (position))
@@ -411,7 +419,7 @@ LOGQ_IMPL(qs16, qs16x16, 16)
     {                                                                                                                                                                                                                                  \
         type const_three = (type)(3 << (fixed_point_position));                                                                                                                                                                        \
         type shift_value = (type)(16 - stype##_SHIFT) - (clz(VopA) + (type)fixed_point_position);                                                                                                                                      \
-        type temp        = select(VopA >> shift_value, select((type)stype##_MAX, VopA << (-shift_value), clz(VopA) > (-shift_value)), shift_value < (type)0);                                                                          \
+        type temp        = select((type)(VopA >> shift_value), select((type)stype##_MAX, (type)(VopA << (-shift_value)), (type)(clz(VopA) > (-shift_value))), (type)(shift_value < (type)0));                                          \
         type x           = temp;                                                                                                                                                                                                       \
         x                = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
         x                = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
@@ -422,9 +430,11 @@ LOGQ_IMPL(qs16, qs16x16, 16)
             x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1;            \
         }                                                                                                                                                                                                                              \
         type shift_value2 = select(shift_value >> 1, (-shift_value) >> 1, shift_value < (type)0);                                                                                                                                      \
-        return select(x >> shift_value2, select((type)stype##_MAX, x << shift_value2, clz(x) > shift_value2), shift_value < (type)0); /* Saturate result if needed */                                                                  \
+        return select((type)(x >> shift_value2), select((type)stype##_MAX, (type)(x << shift_value2), (type)(clz(x) > shift_value2)), (type)(shift_value < (type)0)); /* Saturate result if needed */                                  \
     }
 
+INVSQRTQ_IMPL(qs8, qs8x1, 1)
+INVSQRTQ_IMPL(qs16, qs16x1, 1)
 INVSQRTQ_IMPL(qs8, qs8x16, 16)
 INVSQRTQ_IMPL(qs16, qs16x8, 8)
 
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 0497bf4b91..99d7e6e01b 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -33,18 +33,32 @@
 #define POOL_OP(x, y) (max((x), (y)))
 #endif /* POOL_AVG */
 
-#define DIV_OP1(x, y) DIV_SAT_OP_EXPAND((x), y, DATA_TYPE, FIXED_POINT_POSITION)
+#define DIV_OP1(x, y) DIV_SAT_OP_EXPAND((x), (y), DATA_TYPE, FIXED_POINT_POSITION)
 #define DIV_OP(x, y) DIV_OP1(x, y << FIXED_POINT_POSITION)
+#define SQRT_OP(x) DIV_OP1((1 << FIXED_POINT_POSITION), (INVSQRT_OP_EXPAND((x), DATA_TYPE, 1, FIXED_POINT_POSITION)))
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) MUL_SAT_OP_EXPAND((x), (x), DATA_TYPE, vec_size, FIXED_POINT_POSITION)
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
 
 #else /* FIXED_POINT_POSITION */
 
-#if defined(POOL_AVG)
+#if defined(POOL_AVG) || defined(POOL_L2)
 #define POOL_OP(x, y) ((x) + (y))
-#else /* POOL_AVG */
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
 #define POOL_OP(x, y) (fmax((x), (y)))
-#endif /* POOL_AVG */
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
 
 #define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
 
 #endif /* FIXED_POINT_POSITION */
 
@@ -70,6 +84,12 @@
         data20 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));                                                          \
         VEC_DATA_TYPE(DATA_TYPE, 2)                                                                                                          \
         data21 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4);                                                      \
+        data00 = POW2_OP(data00, 4);                                                                                                         \
+        data01 = POW2_OP(data01, 2);                                                                                                         \
+        data10 = POW2_OP(data10, 4);                                                                                                         \
+        data11 = POW2_OP(data11, 2);                                                                                                         \
+        data20 = POW2_OP(data20, 4);                                                                                                         \
+        data21 = POW2_OP(data21, 2);                                                                                                         \
         \
         VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                          \
         values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01212323);                                                                          \
@@ -104,6 +124,12 @@
         VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                          \
         data20           = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));                                                \
         DATA_TYPE data21 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8);                                                    \
+        data00           = POW2_OP(data00, 8);                                                                                               \
+        data01           = POW2_OP(data01, 1);                                                                                               \
+        data10           = POW2_OP(data10, 8);                                                                                               \
+        data11           = POW2_OP(data11, 1);                                                                                               \
+        data20           = POW2_OP(data20, 8);                                                                                               \
+        data21           = POW2_OP(data21, 1);                                                                                               \
         \
         VEC_DATA_TYPE(DATA_TYPE, 8)                                                                                                          \
         values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01223445);                                                                          \
@@ -141,6 +167,12 @@
         data20 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));                                                  \
         VEC_DATA_TYPE(DATA_TYPE, 4)                                                                                                  \
         data21 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8);                                              \
+        data00 = POW2_OP(data00, 8);                                                                                                 \
+        data01 = POW2_OP(data01, 4);                                                                                                 \
+        data10 = POW2_OP(data10, 8);                                                                                                 \
+        data11 = POW2_OP(data11, 4);                                                                                                 \
+        data20 = POW2_OP(data20, 8);                                                                                                 \
+        data21 = POW2_OP(data21, 4);                                                                                                 \
         \
         data00 = POOL_OP(data00, data10);                                                                                            \
         data01 = POOL_OP(data01, data11);                                                                                            \
@@ -165,7 +197,7 @@ DATA_TYPE calculate_avg_scale(const int pool_size, const int upper_bound_w, cons
  *
  * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
  * @note In case of average pooling the following information must be passed at compile time:
- *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *       -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
  *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
  *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
  *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
@@ -201,14 +233,25 @@ __kernel void pooling_layer_2(
     VEC_DATA_TYPE(DATA_TYPE, 2)
     data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
 
+#if defined(POOL_L2)
+    // Raise to power of 2 for L2 Pooling
+    data0 = POW2_OP(data0, 2);
+    data1 = POW2_OP(data1, 2);
+#endif /* defined(POOL_L2) */
+
     // Perform calculations
     data0         = POOL_OP(data0, data1);
     DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
 
-    // Divide by pool region in case of average pooling
-#ifdef POOL_AVG
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average or l2 pooling
     res = DIV_OP(res, calculate_avg_scale(2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* POOL_AVG */
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
 
     // Store result
     *(__global DATA_TYPE *)output.ptr = res;
@@ -218,7 +261,7 @@ __kernel void pooling_layer_2(
  *
  * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
  * @note In case of average pooling the following information must be passed at compile time:
- *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *       -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
  *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
  *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
  *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
@@ -256,15 +299,27 @@ __kernel void pooling_layer_3(
     VEC_DATA_TYPE(DATA_TYPE, 3)
     data2 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
 
+#if defined(POOL_L2)
+    // Raise to power of 2 for L2 Pooling
+    data0 = POW2_OP(data0, 3);
+    data1 = POW2_OP(data1, 3);
+    data2 = POW2_OP(data2, 3);
+#endif /* defined(POOL_L2) */
+
     // Perform calculations
     data0         = POOL_OP(data0, data1);
     data0         = POOL_OP(data0, data2);
     DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
 
+#if defined(POOL_AVG) || defined(POOL_L2)
     // Divide by pool region in case of average pooling
-#ifdef POOL_AVG
     res = DIV_OP(res, calculate_avg_scale(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* POOL_AVG */
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
 
     // Store result
     *(__global DATA_TYPE *)output.ptr = res;
@@ -290,7 +345,7 @@ calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upp
  *
  * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
  * @note In case of average pooling the following information must be passed at compile time:
- *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *       -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
  *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
  *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
  *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
@@ -326,10 +381,15 @@ __kernel void pooling_layer_3_optimized(
     // Perform pooling 3x3 for 4 output elements
     POOLING3x3(res, input, output);
 
+#if defined(POOL_AVG) || defined(POOL_L2)
     // Divide by pool region in case of average pooling
-#ifdef POOL_AVG
     res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#endif // POOL_AVG
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
 
     vstore4(res, 0, (__global DATA_TYPE *)output.ptr);
 }
@@ -339,7 +399,7 @@ __kernel void pooling_layer_3_optimized(
  *
  * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
  * @note In case of average pooling the following information must be passed at compile time:
- *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *       -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
  *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
  *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
  *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
@@ -385,6 +445,17 @@ __kernel void pooling_layer_7(
     VEC_DATA_TYPE(DATA_TYPE, 8)
     data6 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));
 
+#if defined(POOL_L2)
+    // Raise to power of 2 for L2 Pooling
+    data0 = POW2_OP(data0, 8);
+    data1 = POW2_OP(data1, 8);
+    data2 = POW2_OP(data2, 8);
+    data3 = POW2_OP(data3, 8);
+    data4 = POW2_OP(data4, 8);
+    data5 = POW2_OP(data5, 8);
+    data6 = POW2_OP(data6, 8);
+#endif /* defined(POOL_L2) */
+
     // Pool operation of all rows
     data0 = POOL_OP(data0, data1);
     data2 = POOL_OP(data2, data3);
@@ -394,11 +465,11 @@ __kernel void pooling_layer_7(
     data0 = POOL_OP(data0, data4);
 
     // Set last element
-#ifdef POOL_AVG
+#if defined(POOL_AVG) || defined(POOL_L2)
     data0.s7 = 0;
-#else  /* POOL_AVG */
+#else  /* defined(POOL_AVG) || defined(POOL_L2) */
     data0.s7 = data0.s6;
-#endif /* POOL_AVG */
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
 
     // Reduce result
     VEC_DATA_TYPE(DATA_TYPE, 4)
@@ -407,10 +478,15 @@ __kernel void pooling_layer_7(
     reduce2       = POOL_OP(reduce4.s01, reduce4.s23);
     DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
 
+#if defined(POOL_AVG) || defined(POOL_L2)
     // Divide by pool region in case of average pooling
-#ifdef POOL_AVG
     res = DIV_OP(res, calculate_avg_scale(7, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* POOL_AVG */
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
 
     // Store result
     *(__global DATA_TYPE *)output.ptr = res;
@@ -419,9 +495,9 @@ __kernel void pooling_layer_7(
 #if defined(POOL_SIZE)
 
 // Set the initial value for the pooling operation accordingly with the data type
-#if defined(POOL_AVG)
+#if defined(POOL_AVG) || defined(POOL_L2)
 #define INITIAL_VALUE 0
-#else // POOL_AVG
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
 #ifdef FIXED_POINT_POSITION
 #define MIN_VAL_EXPAND(type) type##_MIN
 #define MIN_VAL(type) MIN_VAL_EXPAND(type)
@@ -485,6 +561,10 @@ __kernel void pooling_layer_N(
         {
             VEC_DATA_TYPE(DATA_TYPE, 8)
             data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data0 *= data0;
+#endif /* defined(POOL_L2) */
             vdata = POOL_OP(vdata, data0);
         }
 
@@ -492,7 +572,11 @@ __kernel void pooling_layer_N(
         for(; x < (int)POOL_SIZE; ++x)
         {
             DATA_TYPE data0 = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
-            sdata           = POOL_OP(sdata, data0);
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data0 *= data0;
+#endif /* defined(POOL_L2) */
+            sdata = POOL_OP(sdata, data0);
         }
     }
 
@@ -504,10 +588,15 @@ __kernel void pooling_layer_N(
     DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
     res           = POOL_OP(res, sdata);
 
+#if defined(POOL_AVG) || defined(POOL_L2)
     // Divide by pool region in case of average pooling
-#ifdef POOL_AVG
     res = DIV_OP(res, calculate_avg_scale(POOL_SIZE, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* POOL_AVG */
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
 
     // Store result
     *(__global DATA_TYPE *)output.ptr = res;
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 22c7730963..497e87b2b5 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -101,14 +101,14 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
     // Set build options
     std::set<std::string> build_opts;
     build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    build_opts.emplace(("-DPOOL_" + ((PoolingType::MAX == pool_type) ? std::string("MAX") : std::string("AVG"))));
+    build_opts.emplace(("-DPOOL_" + string_from_pooling_type(pool_type)));
     if(is_data_type_fixed_point(input->info()->data_type()))
     {
         build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
     }
 
     build_opts.emplace(("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x)));
-    if(pool_type == PoolingType::AVG)
+    if(pool_type != PoolingType::MAX)
     {
         build_opts.emplace(("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + pool_pad_x)));
         build_opts.emplace(("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + pool_pad_y)));
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index fdcbd5a898..b97564e77b 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
@@ -36,6 +37,7 @@
 
 #include <algorithm>
 #include <arm_neon.h>
+#include <cmath>
 #include <limits>
 #include <set>
 #include <string>
@@ -111,6 +113,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_fixed_point(input->info()->data_type()));
     ARM_COMPUTE_ERROR_ON(supported_pool_sizes.find(pool_size) == supported_pool_sizes.end());
     ARM_COMPUTE_ERROR_ON(7 == pool_size && input->info()->data_type() != DataType::F32);
     ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
@@ -235,41 +238,146 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
         case 2:
             if(input->info()->data_type() == DataType::QS8)
             {
-                _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
+                switch(pool_type)
+                {
+                    case PoolingType::AVG:
+                        _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG>;
+                        break;
+                    case PoolingType::MAX:
+                        _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                }
             }
             else if(input->info()->data_type() == DataType::QS16)
             {
-                _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
+                switch(pool_type)
+                {
+                    case PoolingType::AVG:
+                        _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG>;
+                        break;
+                    case PoolingType::MAX:
+                        _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                }
             }
             else if(input->info()->data_type() == DataType::F16)
             {
-                _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX>;
+                switch(pool_type)
+                {
+                    case PoolingType::AVG:
+                        _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG>;
+                        break;
+                    case PoolingType::L2:
+                        _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2>;
+                        break;
+                    case PoolingType::MAX:
+                        _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX>;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                }
             }
             else if(input->info()->data_type() == DataType::F32)
             {
-                _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
+                switch(pool_type)
+                {
+                    case PoolingType::AVG:
+                        _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG>;
+                        break;
+                    case PoolingType::L2:
+                        _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2>;
+                        break;
+                    case PoolingType::MAX:
+                        _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                }
             }
             break;
         case 3:
             if(input->info()->data_type() == DataType::QS8)
             {
-                _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
+                switch(pool_type)
+                {
+                    case PoolingType::AVG:
+                        _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
+                        break;
+                    case PoolingType::MAX:
+                        _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                }
             }
             else if(input->info()->data_type() == DataType::QS16)
             {
-                _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
+                switch(pool_type)
+                {
+                    case PoolingType::AVG:
+                        _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
+                        break;
+                    case PoolingType::MAX:
+                        _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                }
             }
             else if(input->info()->data_type() == DataType::F16)
             {
-                _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX>;
+                switch(pool_type)
+                {
+                    case PoolingType::AVG:
+                        _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG>;
+                        break;
+                    case PoolingType::L2:
+                        _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2>;
+                        break;
+                    case PoolingType::MAX:
+                        _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX>;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                }
             }
             else if(input->info()->data_type() == DataType::F32)
             {
-                _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
+                switch(pool_type)
+                {
+                    case PoolingType::AVG:
+                        _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG>;
+                        break;
+                    case PoolingType::L2:
+                        _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2>;
+                        break;
+                    case PoolingType::MAX:
+                        _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                }
             }
             break;
         case 7:
-            _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX>;
+            switch(pool_type)
+            {
+                case PoolingType::AVG:
+                    _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG>;
+                    break;
+                case PoolingType::L2:
+                    _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2>;
+                    break;
+                case PoolingType::MAX:
+                    _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX>;
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported pooling type!");
+            }
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported pooling size");
@@ -436,11 +544,20 @@ void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        const float16x4_t top_data    = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
-        const float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
-        const float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
-        float16x4_t       res         = {};
-        if(pooling_type == PoolingType::AVG)
+        float16x4_t top_data    = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
+        float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
+        float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
+        float16x4_t res         = {};
+
+        // Get power of 2 in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            top_data    = vmul_f16(top_data, top_data);
+            middle_data = vmul_f16(middle_data, middle_data);
+            bottom_data = vmul_f16(bottom_data, bottom_data);
+        }
+
+        if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
             const float       scale   = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
@@ -456,6 +573,13 @@ void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window
             res                        = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
             res                        = vpmax_f16(res, res);
         }
+
+        // Calculate square-root in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            res = vinv_f16(vinvsqrt_f16(res));
+        }
+
         *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
     },
     input, output);
@@ -484,11 +608,20 @@ void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        const auto  top_data    = vld2q_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
-        const auto  bottom_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
+        auto        top_data    = vld2q_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
+        auto        bottom_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
         float16x8_t res         = {};
 
-        if(pooling_type == PoolingType::AVG)
+        // Get power of 2 in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            top_data.val[0]    = vmulq_f16(top_data.val[0], top_data.val[0]);
+            top_data.val[1]    = vmulq_f16(top_data.val[1], top_data.val[1]);
+            bottom_data.val[0] = vmulq_f16(bottom_data.val[0], bottom_data.val[0]);
+            bottom_data.val[1] = vmulq_f16(bottom_data.val[1], bottom_data.val[1]);
+        }
+
+        if(pooling_type != PoolingType::MAX)
         {
             const float       scale   = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
             const float16x8_t scale_v = vdupq_n_f16(scale);
@@ -498,6 +631,14 @@ void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window
         {
             res = vmaxq_f16(bottom_data.val[1], vmaxq_f16(bottom_data.val[0], vmaxq_f16(top_data.val[0], top_data.val[1])));
         }
+
+        // Calculate square-root in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            res = vinvq_f16(vinvsqrtq_f16(res));
+        }
+
+        // Store result
         vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
     },
     input, output);
@@ -529,10 +670,19 @@ void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        const float32x2_t top_data    = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
-        const float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
-        float32x2_t       res         = {};
-        if(pooling_type == PoolingType::AVG)
+        float32x2_t top_data    = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
+        float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
+        float32x2_t res         = {};
+        float       final_res   = 0;
+
+        // Get power of 2 in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            top_data    = vmul_f32(top_data, top_data);
+            bottom_data = vmul_f32(bottom_data, bottom_data);
+        }
+
+        if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
             float             scale   = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
@@ -547,7 +697,16 @@ void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window
             const float32x2_t max_data = vmax_f32(top_data, bottom_data);
             res                        = vpmax_f32(max_data, max_data);
         }
-        *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
+        final_res = vget_lane_f32(res, 0);
+
+        // Calculate square-root in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            final_res = sqrt(final_res);
+        }
+
+        // Store result
+        *(reinterpret_cast<float *>(output.ptr())) = final_res;
     },
     input, output);
 }
@@ -719,11 +878,21 @@ void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        const float32x4_t top_data    = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
-        const float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
-        const float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
-        float32x2_t       res         = {};
-        if(pooling_type == PoolingType::AVG)
+        float32x4_t top_data    = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
+        float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
+        float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
+        float32x2_t res         = {};
+        float       final_res   = 0;
+
+        // Get power of 2 in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            top_data    = vmulq_f32(top_data, top_data);
+            middle_data = vmulq_f32(middle_data, middle_data);
+            bottom_data = vmulq_f32(bottom_data, bottom_data);
+        }
+
+        if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
             float             scale   = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
@@ -740,7 +909,16 @@ void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window
             res                        = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
             res                        = vpmax_f32(res, res);
         }
-        *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
+        final_res = vget_lane_f32(res, 0);
+
+        // Calculate square-root in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            final_res = sqrt(final_res);
+        }
+
+        // Store result
+        *(reinterpret_cast<float *>(output.ptr())) = final_res;
     },
     input, output);
 }
@@ -769,19 +947,32 @@ void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        float32x2_t res = {};
-        if(pooling_type == PoolingType::AVG)
+        float32x2_t res       = {};
+        float       final_res = 0.f;
+        if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
             float             scale   = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
             const float32x2_t scale_v = vdup_n_f32(scale);
 
             // Perform pooling
-            float32x4x2_t data     = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
-            float32x4_t   sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
+            float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
+            // Get power of 2 in case of l2 pooling
+            if(pooling_type == PoolingType::L2)
+            {
+                data.val[0] = vmulq_f32(data.val[0], data.val[0]);
+                data.val[1] = vmulq_f32(data.val[1], data.val[1]);
+            }
+            float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
             for(int i = 1; i < pool_size; ++i)
             {
-                data     = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
+                data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
+                // Get power of 2 in case of l2 pooling
+                if(pooling_type == PoolingType::L2)
+                {
+                    data.val[0] = vmulq_f32(data.val[0], data.val[0]);
+                    data.val[1] = vmulq_f32(data.val[1], data.val[1]);
+                }
                 sum_data = vaddq_f32(sum_data, data.val[0]);
                 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
             }
@@ -800,7 +991,16 @@ void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window
             res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
             res = vpmax_f32(res, res);
         }
-        *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
+        final_res = vget_lane_f32(res, 0);
+
+        // Calculate square-root in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            final_res = sqrt(final_res);
+        }
+
+        // Store result
+        *(reinterpret_cast<float *>(output.ptr())) = final_res;
     },
     input, output);
 }
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index 66d3fe8f78..99d39569c7 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -228,6 +228,18 @@ const std::string &arm_compute::string_from_norm_type(NormType type)
     return norm_type_map[type];
 }
 
+const std::string &arm_compute::string_from_pooling_type(PoolingType type)
+{
+    static std::map<PoolingType, const std::string> pool_type_map =
+    {
+        { PoolingType::MAX, "MAX" },
+        { PoolingType::AVG, "AVG" },
+        { PoolingType::L2, "L2" },
+    };
+
+    return pool_type_map[type];
+}
+
 std::string arm_compute::lower_string(const std::string &val)
 {
     std::string res = val;
diff --git a/tests/TypePrinter.h b/tests/TypePrinter.h
index 2f9909ca2e..d3d9f8f5ac 100644
--- a/tests/TypePrinter.h
+++ b/tests/TypePrinter.h
@@ -367,6 +367,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const PoolingType &pool_ty
         case PoolingType::MAX:
             os << "MAX";
             break;
+        case PoolingType::L2:
+            os << "L2";
+            break;
         default:
             ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
     }
diff --git a/tests/datasets/PoolingTypesDataset.h b/tests/datasets/PoolingTypesDataset.h
index 5ba8aaf6d0..5994d6ecd0 100644
--- a/tests/datasets/PoolingTypesDataset.h
+++ b/tests/datasets/PoolingTypesDataset.h
@@ -41,7 +41,7 @@ public:
     PoolingTypes()
         : ContainerDataset("PoolType",
     {
-        PoolingType::MAX, PoolingType::AVG
+        PoolingType::MAX, PoolingType::AVG, PoolingType::L2
     })
     {
     }
diff --git a/tests/validation/CL/PoolingLayer.cpp b/tests/validation/CL/PoolingLayer.cpp
index e82df07a91..44617f624c 100644
--- a/tests/validation/CL/PoolingLayer.cpp
+++ b/tests/validation/CL/PoolingLayer.cpp
@@ -48,7 +48,7 @@ const auto PoolingLayerDatasetFP = combine(combine(datasets::PoolingTypes(), fra
                                            framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) }));
 
 /** Input data set for quantized data types */
-const auto PoolingLayerDatasetQS = combine(combine(datasets::PoolingTypes(), framework::dataset::make("PoolingSize", { 2, 3 })),
+const auto PoolingLayerDatasetQS = combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { 2, 3 })),
                                            framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) }));
 
 constexpr AbsoluteTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for float types */
diff --git a/tests/validation/CPP/PoolingLayer.cpp b/tests/validation/CPP/PoolingLayer.cpp
index f7273f073f..85a8343d87 100644
--- a/tests/validation/CPP/PoolingLayer.cpp
+++ b/tests/validation/CPP/PoolingLayer.cpp
@@ -104,7 +104,7 @@ SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, PoolingLayerInfo info)
             }
         }
     }
-    else // Average pooling
+    else // Average or l2 pooling
     {
         for(int r = 0; r < upper_dims; ++r)
         {
@@ -123,14 +123,29 @@ SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, PoolingLayerInfo info)
                     wend       = std::min(wend, w_src);
                     hend       = std::min(hend, h_src);
 
-                    for(int y = hstart; y < hend; ++y)
+                    if(type == PoolingType::AVG)
                     {
-                        for(int x = wstart; x < wend; ++x)
+                        for(int y = hstart; y < hend; ++y)
+                        {
+                            for(int x = wstart; x < wend; ++x)
+                            {
+                                avg_val += src[r * h_src * w_src + y * w_src + x];
+                            }
+                        }
+                        dst[r * h_dst * w_dst + h * w_dst + w] = avg_val / pool;
+                    }
+                    else
+                    {
+                        for(int y = hstart; y < hend; ++y)
                         {
-                            avg_val += src[r * h_src * w_src + y * w_src + x];
+                            for(int x = wstart; x < wend; ++x)
+                            {
+                                const T val = src[r * h_src * w_src + y * w_src + x];
+                                avg_val += val * val;
+                            }
                         }
+                        dst[r * h_dst * w_dst + h * w_dst + w] = std::sqrt(avg_val / pool);
                     }
-                    dst[r * h_dst * w_dst + h * w_dst + w] = avg_val / pool;
                 }
             }
         }
@@ -192,7 +207,7 @@ SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, PoolingLayerInfo info)
             }
         }
     }
-    else // Average pooling
+    else // Average or l2 pooling
     {
         for(int r = 0; r < upper_dims; ++r)
         {
@@ -213,18 +228,35 @@ SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, PoolingLayerInfo info)
                     using namespace fixed_point_arithmetic;
 
                     const int            fixed_point_position = src.fixed_point_position();
+                    const fixed_point<T> const_1(1, fixed_point_position);
                     const fixed_point<T> invpool_fp(1.f / static_cast<float>(pool), fixed_point_position);
                     fixed_point<T>       avg_val(0, fixed_point_position, true);
 
-                    for(int y = hstart; y < hend; ++y)
+                    if(type == PoolingType::AVG)
                     {
-                        for(int x = wstart; x < wend; ++x)
+                        for(int y = hstart; y < hend; ++y)
+                        {
+                            for(int x = wstart; x < wend; ++x)
+                            {
+                                const fixed_point<T> in_fp(src[r * h_src * w_src + y * w_src + x], fixed_point_position, true);
+                                avg_val = add(avg_val, in_fp);
+                            }
+                        }
+                        dst[r * h_dst * w_dst + h * w_dst + w] = mul(avg_val, invpool_fp).raw();
+                    }
+                    else
+                    {
+                        for(int y = hstart; y < hend; ++y)
                         {
-                            const fixed_point<T> in_fp(src[r * h_src * w_src + y * w_src + x], fixed_point_position, true);
-                            avg_val = add(avg_val, in_fp);
+                            for(int x = wstart; x < wend; ++x)
+                            {
+                                const fixed_point<T> in_fp(src[r * h_src * w_src + y * w_src + x], fixed_point_position, true);
+                                avg_val = add(avg_val, mul(in_fp, in_fp));
+                            }
                         }
+                        auto res                               = div(const_1, (inv_sqrt(mul(avg_val, invpool_fp))));
+                        dst[r * h_dst * w_dst + h * w_dst + w] = res.raw();
                     }
-                    dst[r * h_dst * w_dst + h * w_dst + w] = mul(avg_val, invpool_fp).raw();
                 }
             }
         }
diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp
index 98ec478267..5ebbc1bc96 100644
--- a/tests/validation/NEON/PoolingLayer.cpp
+++ b/tests/validation/NEON/PoolingLayer.cpp
@@ -48,7 +48,7 @@ const auto PoolingLayerDatasetFP = combine(combine(datasets::PoolingTypes(), fra
                                            framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) }));
 
 /** Input data set for quantized data types */
-const auto PoolingLayerDatasetQS = combine(combine(datasets::PoolingTypes(), framework::dataset::make("PoolingSize", { 2, 3 })),
+const auto PoolingLayerDatasetQS = combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { 2, 3 })),
                                            framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) }));
 
 constexpr AbsoluteTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for float types */
-- 
cgit v1.2.1