From cc1f6c94f1fc3b5d5ccbd5aa43e2a08487664f50 Mon Sep 17 00:00:00 2001
From: morgolock <pablo.tello@arm.com>
Date: Tue, 24 Mar 2020 09:26:48 +0000
Subject: MLCE-166: Add support for extracting indices in NEPoolingLayer 2x2
 NCHW

 * Added initial support for pooling indices
 * Only supported for NCHW Poolsize 2

Change-Id: I92ce767e64fcc01aae89411064b4cb2be272a1e9
Signed-off-by: morgolock <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2927
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
---
 arm_compute/core/CL/kernels/CLPoolingLayerKernel.h |   7 +-
 .../GLES_COMPUTE/kernels/GCPoolingLayerKernel.h    |   9 +-
 .../core/NEON/kernels/NEPoolingLayerKernel.h       |  13 +-
 arm_compute/runtime/CL/functions/CLPoolingLayer.h  |   6 +-
 .../GLES_COMPUTE/functions/GCPoolingLayer.h        |   8 +-
 .../runtime/NEON/functions/NEPoolingLayer.h        |   6 +-
 src/core/CL/kernels/CLPoolingLayerKernel.cpp       |  23 ++-
 .../GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp  |  55 ++---
 src/core/NEON/kernels/NEPoolingLayerKernel.cpp     | 230 ++++++++++++++-------
 src/runtime/CL/functions/CLPoolingLayer.cpp        |   9 +-
 .../GLES_COMPUTE/functions/GCPoolingLayer.cpp      |  13 +-
 src/runtime/NEON/functions/NEPoolingLayer.cpp      |   8 +-
 tests/validation/NEON/PoolingLayer.cpp             |  19 ++
 tests/validation/fixtures/PoolingLayerFixture.h    |  42 ++--
 tests/validation/reference/PoolingLayer.cpp        |  42 ++--
 tests/validation/reference/PoolingLayer.h          |   4 +-
 16 files changed, 321 insertions(+), 173 deletions(-)

diff --git a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
index 4b3ee24333..fdd10f3f66 100644
--- a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
@@ -55,17 +55,19 @@ public:
      * @param[in]  input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[out] output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[out] indices   (optional) The indices of the maximal values. Data type supported: U32.
      */
-    void configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info);
+    void configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr);
     /** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayerKernel
      *
      * @param[in] input     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
      * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[in] indices   (optional) The indices of the maximal values. Data type supported: U32.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
@@ -74,6 +76,7 @@ public:
 public:
     const ICLTensor *_input;
     ICLTensor       *_output;
+    ICLTensor       *_indices;
     PoolingLayerInfo _pool_info;
     DataLayout       _data_layout;
     BorderSize       _border_size;
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h
index 85c051c326..7a2fb84f34 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,18 +54,20 @@ public:
      * @param[in]  input     Source tensor. Data types supported: F16/F32.
      * @param[out] output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[out] indices   (optional) The indices of the maximal values. Data type supported: U32.
      */
-    void configure(const IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info);
+    void configure(const IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info, IGCTensor *indices = nullptr);
 
     /** Static function to check if given info will lead to a valid configuration of @ref GCPoolingLayerKernel
      *
      * @param[in] input     Source tensor info. Data types supported: F16/F32.
      * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
      * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[in] indices   (optional) The indices of the maximal values. Data type supported: U32.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
 
     // Inherited methods overridden:
     void run(const Window &window) override;
@@ -74,6 +76,7 @@ public:
 private:
     const IGCTensor *_input;
     IGCTensor       *_output;
+    IGCTensor       *_indices;
     PoolingLayerInfo _pool_info;
     BorderSize       _border_size;
     unsigned int     _num_elems_processed_per_iteration;
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
index 654dfad701..6519ac72fe 100644
--- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
@@ -57,8 +57,9 @@ public:
      * @param[in]  input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[out] output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[out] indices   (optional) The indices of the maximal values. Data type supported: U32.
      */
-    void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info);
+    void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices = nullptr);
     /** Static function to check if given info will lead to a valid configuration of @ref NEPoolingLayerKernel
      *
      * @note F16 are supported for pool sizes 2 and 3 only
@@ -66,10 +67,11 @@ public:
      * @param[in] input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in] output    Destination tensor. Data types supported: Same as @p input.
      * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[in] indices   (optional) The indices of the maximal values. Data type supported: U32.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
@@ -84,6 +86,12 @@ private:
      * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
      */
     void pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform 2x2 pooling and compute the pooling indices. The indices can be used for max unpool.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    void pooling2_f32_nchw_maxpool_indices(const Window &window_input, const Window &window);
     /** Function to perform MxN pooling for 32-bit floating point values.
      *
      * @param[in] window_input    Input region on which to execute the kernel.
@@ -197,6 +205,7 @@ private:
     PoolingFunction  _func;
     const ITensor   *_input;
     ITensor         *_output;
+    ITensor         *_indices;
     PoolingLayerInfo _pool_info;
     DataLayout       _data_layout;
     unsigned int     _num_elems_processed_per_iteration;
diff --git a/arm_compute/runtime/CL/functions/CLPoolingLayer.h b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
index c78b558ac8..05b35dcee8 100644
--- a/arm_compute/runtime/CL/functions/CLPoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
@@ -46,17 +46,19 @@ public:
      * @param[in,out] input     Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[out]    output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]     pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[out]    indices   (optional) The indices of the maximal values. Data type supported: U32.
      */
-    void configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info);
+    void configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr);
     /** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayer
      *
      * @param[in] input     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
      * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[in] indices   (optional) The indices of the maximal values. Data type supported: U32.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLPOOLINGLAYER_H */
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h
index 13b7ad363f..b29f808f99 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,17 +51,19 @@ public:
      * @param[in,out] input     Source tensor. (Written to only when padding != 0) Data types supported: F16/F32.
      * @param[out]    output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]     pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[out]    indices   (optional) The indices of the maximal values. Data type supported: U32.
      */
-    void configure(IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info);
+    void configure(IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info, IGCTensor *indices = nullptr);
     /** Static function to check if given info will lead to a valid configuration of @ref GCPoolingLayer
      *
      * @param[in] input     Source tensor info. Data types supported: F16/F32.
      * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
      * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[in] indices   (optional) The indices of the maximal values. Data type supported: U32.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
 
     void run() override final;
 
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index eb840b52f2..e43741c95b 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -51,8 +51,9 @@ public:
      * @param[in, out] input     Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[out]     output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]      pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[out]     indices   (optional) The indices of the maximal values. Data type supported: U32.
      */
-    void configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info);
+    void configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices = nullptr);
     /** Static function to check if given info will lead to a valid configuration of @ref NEPoolingLayer
      *
      * @note F16 is supported for pool sizes 2 and 3 only
@@ -60,10 +61,11 @@ public:
      * @param[in] input     Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in] output    Destination tensor. Data types supported: Same as @p input.
      * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[in] indices   (optional) The indices of the maximal values. Data type supported: U32.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
 
     // Inherited methods overridden:
     void run() override;
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 767d6d6ca0..dbbca4771b 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -56,10 +56,11 @@ void auto_init(const ITensorInfo *input, ITensorInfo *output, PoolingLayerInfo p
     auto_init_if_empty(*output, input->clone()->set_tensor_shape(out_shape));
 }
 
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices, "Indices not supported in the CL backend.");
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type == PoolingType::L2),
                                     "Unsupported combination of parameters!");
@@ -166,7 +167,7 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso
 } // namespace
 
 CLPoolingLayerKernel::CLPoolingLayerKernel()
-    : _input(nullptr), _output(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _border_size(0), _num_elems_processed_per_iteration(1)
+    : _input(nullptr), _output(nullptr), _indices(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _border_size(0), _num_elems_processed_per_iteration(1)
 {
 }
 
@@ -175,16 +176,16 @@ BorderSize CLPoolingLayerKernel::border_size() const
     return _border_size;
 }
 
-void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Set instance variables
-    _input       = input;
-    _output      = output;
-    _pool_info   = pool_info;
-    _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
-
+    _input                              = input;
+    _output                             = output;
+    _pool_info                          = pool_info;
+    _data_layout                        = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
+    _indices                            = indices;
     int                 pool_stride_x   = 0;
     int                 pool_stride_y   = 0;
     const PoolingType   pool_type       = pool_info.pool_type;
@@ -215,7 +216,7 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
 
     // Check output dimensions
     auto_init(input->info(), output->info(), pool_info);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr));
 
     const DataType data_type = input->info()->data_type();
 
@@ -331,9 +332,9 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
     _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
 }
 
-Status CLPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status CLPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, indices));
     ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info)));
 
     return Status{};
diff --git a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
index 14cedfe3d2..36499eb4fd 100644
--- a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
@@ -55,9 +55,10 @@ void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int poole
     auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
 }
 
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices, "Indices not supported in GLES backend");
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type == PoolingType::L2),
                                     "Unsupported combination of parameters!");
@@ -77,8 +78,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
-        unsigned int pooled_w        = 0;
-        unsigned int pooled_h        = 0;
+        unsigned int pooled_w = 0;
+        unsigned int pooled_h = 0;
         std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
                                                          input->dimension(1),
                                                          pool_size,
@@ -93,14 +94,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
 
 std::tuple<Status, Window, GCPoolingConfig> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info)
 {
-    int                 pool_pad_x         = 0;
-    int                 pool_pad_y         = 0;
-    int                 pool_stride_x      = 0;
-    int                 pool_stride_y      = 0;
-    unsigned int        pooled_w           = 0;
-    unsigned int        pooled_h           = 0;
-    int                 pool_size          = pool_info.pool_size.width;
-    const PadStrideInfo pad_stride_info    = pool_info.pad_stride_info;
+    int                 pool_pad_x      = 0;
+    int                 pool_pad_y      = 0;
+    int                 pool_stride_x   = 0;
+    int                 pool_stride_y   = 0;
+    unsigned int        pooled_w        = 0;
+    unsigned int        pooled_h        = 0;
+    int                 pool_size       = pool_info.pool_size.width;
+    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
     std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad();
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
 
@@ -215,7 +216,7 @@ std::tuple<Status, Window, GCPoolingConfig> validate_and_configure_window(ITenso
 } // namespace
 
 GCPoolingLayerKernel::GCPoolingLayerKernel()
-    : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0), _num_elems_processed_per_iteration(1)
+    : _input(nullptr), _output(nullptr), _indices(nullptr), _pool_info(), _border_size(0), _num_elems_processed_per_iteration(1)
 {
 }
 
@@ -224,18 +225,18 @@ BorderSize GCPoolingLayerKernel::border_size() const
     return _border_size;
 }
 
-void GCPoolingLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info)
+void GCPoolingLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info, IGCTensor *indices)
 {
-    int                 pool_pad_x         = 0;
-    int                 pool_pad_y         = 0;
-    int                 pool_stride_x      = 0;
-    int                 pool_stride_y      = 0;
-    unsigned int        pooled_w           = 0;
-    unsigned int        pooled_h           = 0;
-    const PoolingType   pool_type          = pool_info.pool_type;
-    int                 pool_size          = pool_info.pool_size.width;
-    const PadStrideInfo pad_stride_info    = pool_info.pad_stride_info;
-    const bool          exclude_padding    = pool_info.exclude_padding;
+    int                 pool_pad_x      = 0;
+    int                 pool_pad_y      = 0;
+    int                 pool_stride_x   = 0;
+    int                 pool_stride_y   = 0;
+    unsigned int        pooled_w        = 0;
+    unsigned int        pooled_h        = 0;
+    const PoolingType   pool_type       = pool_info.pool_type;
+    int                 pool_size       = pool_info.pool_size.width;
+    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+    const bool          exclude_padding = pool_info.exclude_padding;
     std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad();
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
 
@@ -253,13 +254,13 @@ void GCPoolingLayerKernel::configure(const IGCTensor *input, IGCTensor *output,
 
     auto_init(input->info(), output->info(), pooled_w, pooled_h);
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr));
 
     // Set instance variables
     _input     = input;
     _output    = output;
     _pool_info = pool_info;
-
+    _indices   = indices;
     // Set build options
     std::set<std::string> build_opts;
     build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
@@ -321,9 +322,9 @@ void GCPoolingLayerKernel::configure(const IGCTensor *input, IGCTensor *output,
     _border_size                       = pooling_config.second;
 }
 
-Status GCPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status GCPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, indices));
     ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info)));
 
     return Status{};
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index d6a3fadd33..fdbba815b4 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -123,7 +123,8 @@ inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates
     v = wrapper::vsetlane(elems[7], v, 7);
 }
 
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &pooled_w, unsigned int pooled_h)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info,
+                          unsigned int &pooled_w, unsigned int pooled_h, const ITensorInfo *indices, Size2D pool_size)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
 
@@ -134,6 +135,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
 
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    if(indices)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
+    }
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
@@ -146,6 +152,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
                                     || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
+
+        if(indices)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, "Pool indices only supported in NCHW");
+            ARM_COMPUTE_RETURN_ERROR_ON((indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
+                                        || (indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
+        }
     }
 
     return Status{};
@@ -159,13 +173,18 @@ Status validate_arguments_pool_info(const unsigned int pool_size_x, const unsign
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &num_elems_processed_per_iteration,
-                                                        BorderSize &border_size,
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *indices, const PoolingLayerInfo &pool_info,
+                                                        unsigned int &num_elems_processed_per_iteration,
+                                                        BorderSize   &border_size,
                                                         unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
 {
     // Output auto inizialitation if not yet initialized
     auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_pool_shape(*input, pool_info)));
-
+    if(indices)
+    {
+        // Indices auto inizialitation if not yet initialized
+        auto_init_if_empty(*indices, (input->clone()->set_tensor_shape(compute_pool_shape(*input, pool_info))).set_data_type(DataType::U32) /* we store the offset to the element */);
+    }
     const auto          data_layout                  = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
     unsigned int        num_elems_read_per_iteration = 0;
     unsigned int        num_elems_horizontal_window  = 0;
@@ -286,25 +305,28 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     {
         // Number of iterations in X dimension
         const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
-
         // Upper limit for the number of right/bottom border elements that are accessed
         const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
         const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
-
-        border_size        = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
-        border_size.right  = std::max(upper_bound_w, pool_pad_right);
-        border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
-
+        border_size             = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
+        border_size.right       = std::max(upper_bound_w, pool_pad_right);
+        border_size.bottom      = std::max(upper_bound_h, pool_pad_bottom);
         TensorShape output_shape{ input->tensor_shape() };
         output_shape.set(0, pooled_w);
         output_shape.set(1, pooled_h);
         TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
-
         win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
-        AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
-
+        AccessWindowStatic     input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
         AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
-        window_changed = update_window_and_padding(win, input_access, output_access);
+        if(indices)
+        {
+            AccessWindowHorizontal indices_access(indices, 0, num_elems_horizontal_window);
+            window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
+        }
+        else
+        {
+            window_changed = update_window_and_padding(win, input_access, output_access);
+        }
         output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
     }
     else
@@ -313,12 +335,18 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
         output_shape.set(1, pooled_w);
         output_shape.set(2, pooled_h);
         TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
-
         win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
         AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-
         AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-        window_changed = update_window_and_padding(win, input_access, output_access);
+        if(indices)
+        {
+            AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration);
+            window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
+        }
+        else
+        {
+            window_changed = update_window_and_padding(win, input_access, output_access);
+        }
         output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
     }
 
@@ -438,7 +466,7 @@ inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo
 } // namespace
 
 NEPoolingLayerKernel::NEPoolingLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
+    : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
 {
 }
 
@@ -447,10 +475,9 @@ BorderSize NEPoolingLayerKernel::border_size() const
     return _border_size;
 }
 
-void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
+void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
     const PadStrideInfo pad_stride_info   = pool_info.pad_stride_info;
     const bool          is_global_pooling = pool_info.is_global_pooling;
     const int           pool_stride_x     = pad_stride_info.stride().first;
@@ -478,11 +505,12 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
                                                      pad_stride_info);
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, (indices) ? indices->info() : nullptr, pool_size));
 
     // Set instance variables
     _input       = input;
     _output      = output;
+    _indices     = indices;
     _pool_info   = pool_info;
     _data_layout = input->info()->data_layout();
     _is_square   = (pool_size.x() == pool_size.y());
@@ -690,7 +718,8 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
     }
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
+    auto win_config = validate_and_configure_window(input->info(), output->info(), (indices) ? indices->info() : nullptr,
+                                                    pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
 }
@@ -1435,7 +1464,6 @@ void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const
                     res              = std::max(res, data);
                 }
             }
-
 #if defined(__aarch64__)
             // Reduction operation available on 64 bit architectures only
             res = std::max(vmaxvq_f32(vres), res);
@@ -1459,66 +1487,117 @@ void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const
     input, output);
 }
 
-void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+void NEPoolingLayerKernel::pooling2_f32_nchw_maxpool_indices(const Window &window_input, const Window &window)
 {
-    Iterator input(_input, window_input);
-    Iterator output(_output, window);
-
-    constexpr int pool_size       = 2;
-    const int     pool_pad_right  = _pool_info.pad_stride_info.pad_right();
-    const int     pool_pad_top    = _pool_info.pad_stride_info.pad_top();
-    const int     pool_pad_left   = _pool_info.pad_stride_info.pad_left();
-    const int     pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
-    int           pool_stride_x   = 0;
-    int           pool_stride_y   = 0;
+    Iterator  input(_input, window_input);
+    Iterator  output(_output, window);
+    Iterator  indices(_indices, window);
+    int       final_index   = 0;
+    const int pool_pad_top  = _pool_info.pad_stride_info.pad_top();
+    const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
+    int       pool_stride_x = 0;
+    int       pool_stride_y = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
-
     const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
     const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    const Strides &input_strides = _input->info()->strides_in_bytes();
+    const auto     in_stridew    = input_strides[1];
+
+    execute_window_loop(window, [&](const Coordinates &)
     {
-        float32x2_t top_data    = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
-        float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
-        float32x2_t res         = {};
-        float       final_res   = 0;
+        const auto        input_offset_top    = input_top_ptr + input.offset();
+        const auto        input_offset_bottom = input_bottom_ptr + input.offset();
+        const auto        in_top_ptr          = reinterpret_cast<const float *>(input_offset_top);
+        const auto        in_bottom_ptr       = reinterpret_cast<const float *>(input_offset_bottom);
+        float32x2_t       top_data            = vld1_f32(in_top_ptr);
+        float32x2_t       bottom_data         = vld1_f32(in_bottom_ptr);
+        float32x2_t       res                 = {};
+        float             final_res           = 0;
+        const float32x2_t max_data            = vmax_f32(top_data, bottom_data);
+        res                                   = vpmax_f32(max_data, max_data);
+        final_res                             = vget_lane_f32(res, 0);
+        // Store result
+        *(reinterpret_cast<float *>(output.ptr())) = final_res;
+        const uint32_t   offset_top                = (uint32_t)(input.offset() / sizeof(float));
+        const uint32_t   offset_bottom             = (uint32_t)offset_top + (in_stridew / sizeof(float));
+        const uint32x2_t voffset_top               = { offset_top, offset_top + 1u };
+        const uint32x2_t voffset_bottom            = { offset_bottom, offset_bottom + 1u };
+        const uint32x2_t tmp_indices               = vbsl_u32(vcgt_f32(top_data, bottom_data), voffset_top, voffset_bottom);
+        final_index                                = vget_lane_u32(vbsl_u32(vcgt_f32(max_data, vrev64_f32(max_data)), tmp_indices, vrev64_u32(tmp_indices)), 0);
+        *(reinterpret_cast<int *>(indices.ptr()))  = final_index;
+    },
+    input, output, indices);
+}
 
-        // Get power of 2 in case of l2 pooling
-        if(pooling_type == PoolingType::L2)
+void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type,
+                                             bool exclude_padding)
+{
+    if(pooling_type == PoolingType::MAX && _indices)
+    {
+        pooling2_f32_nchw_maxpool_indices(window_input, window);
+    }
+    else
+    {
+        Iterator      input(_input, window_input);
+        Iterator      output(_output, window);
+        constexpr int pool_size       = 2;
+        const int     pool_pad_right  = _pool_info.pad_stride_info.pad_right();
+        const int     pool_pad_top    = _pool_info.pad_stride_info.pad_top();
+        const int     pool_pad_left   = _pool_info.pad_stride_info.pad_left();
+        const int     pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
+        int           pool_stride_x   = 0;
+        int           pool_stride_y   = 0;
+        std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
+        const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+        const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+
+        const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+        const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+
+        execute_window_loop(window, [&](const Coordinates & id)
         {
-            top_data    = vmul_f32(top_data, top_data);
-            bottom_data = vmul_f32(bottom_data, bottom_data);
-        }
+            const auto  in_top_ptr    = reinterpret_cast<const float *>(input_top_ptr + input.offset());
+            const auto  in_bottom_ptr = reinterpret_cast<const float *>(input_bottom_ptr + input.offset());
+            float32x2_t top_data      = vld1_f32(in_top_ptr);
+            float32x2_t bottom_data   = vld1_f32(in_bottom_ptr);
+            float32x2_t res           = {};
+            float       final_res     = 0;
+            // Get power of 2 in case of l2 pooling
+            if(pooling_type == PoolingType::L2)
+            {
+                top_data    = vmul_f32(top_data, top_data);
+                bottom_data = vmul_f32(bottom_data, bottom_data);
+            }
 
-        if(pooling_type != PoolingType::MAX)
-        {
-            // Calculate scale
-            float             scale   = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
-            const float32x2_t scale_v = vdup_n_f32(scale);
+            if(pooling_type != PoolingType::MAX)
+            {
+                // Calculate scale
+                float             scale   = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+                const float32x2_t scale_v = vdup_n_f32(scale);
 
-            // Perform pooling
-            const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
-            res                        = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
-        }
-        else
-        {
-            const float32x2_t max_data = vmax_f32(top_data, bottom_data);
-            res                        = vpmax_f32(max_data, max_data);
-        }
-        final_res = vget_lane_f32(res, 0);
+                // Perform pooling
+                const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
+                res                        = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
+            }
+            else
+            {
+                const float32x2_t max_data = vmax_f32(top_data, bottom_data);
+                res                        = vpmax_f32(max_data, max_data);
+            }
+            final_res = vget_lane_f32(res, 0);
 
-        // Calculate square-root in case of l2 pooling
-        if(pooling_type == PoolingType::L2)
-        {
-            final_res = sqrt(final_res);
-        }
+            // Calculate square-root in case of l2 pooling
+            if(pooling_type == PoolingType::L2)
+            {
+                final_res = sqrt(final_res);
+            }
 
-        // Store result
-        *(reinterpret_cast<float *>(output.ptr())) = final_res;
-    },
-    input, output);
+            // Store result
+            *(reinterpret_cast<float *>(output.ptr())) = final_res;
+        },
+        input, output);
+    }
 }
 
 void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
@@ -2001,7 +2080,7 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const
     input, output);
 }
 
-Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
 
@@ -2032,8 +2111,9 @@ Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInf
                                                      pool_size_y,
                                                      pool_info.pad_stride_info);
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h,
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, indices, Size2D(pool_size_x, pool_size_y)));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
+                                                              (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h,
                                                               pool_size_x, pool_size_y)
                                 .first);
 
@@ -2094,4 +2174,4 @@ void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
     // Run function
     (this->*_func)(window_input, window, _pool_info.pool_type, exclude_padding);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index ebdae0b8ad..9c4fa4a2ba 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -30,14 +30,13 @@
 
 namespace arm_compute
 {
-void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-
     // Configure pooling kernel
     auto k = arm_compute::support::cpp14::make_unique<CLPoolingLayerKernel>();
     k->set_target(CLScheduler::get().target());
-    k->configure(input, output, pool_info);
+    k->configure(input, output, pool_info, indices);
     _kernel = std::move(k);
 
     const DataType data_type = input->info()->data_type();
@@ -81,8 +80,8 @@ void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const Poolin
     CLScheduler::get().tune_kernel_static(*_kernel);
 }
 
-Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
-    return CLPoolingLayerKernel::validate(input, output, pool_info);
+    return CLPoolingLayerKernel::validate(input, output, pool_info, indices);
 }
 } // namespace arm_compute
diff --git a/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
index 328c5e9762..accf60e204 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
@@ -29,18 +29,18 @@
 
 #include "support/MemorySupport.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 GCPoolingLayer::GCPoolingLayer()
     : _kernel(nullptr), _border_handler(), _shift_handler()
 {
 }
 
-void GCPoolingLayer::configure(IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info)
+void GCPoolingLayer::configure(IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info, IGCTensor *indices)
 {
     // Configure pooling kernel
     auto k = arm_compute::support::cpp14::make_unique<GCPoolingLayerKernel>();
-    k->configure(input, output, pool_info);
+    k->configure(input, output, pool_info, indices);
     _kernel = std::move(k);
 
     // Configure border depending on operation required
@@ -50,9 +50,9 @@ void GCPoolingLayer::configure(IGCTensor *input, IGCTensor *output, const Poolin
     _shift_handler.configure(input);
 }
 
-Status GCPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status GCPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
-    return GCPoolingLayerKernel::validate(input, output, pool_info);
+    return GCPoolingLayerKernel::validate(input, output, pool_info, indices);
 }
 
 void GCPoolingLayer::run()
@@ -63,3 +63,4 @@ void GCPoolingLayer::run()
     GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(*_kernel);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 0a3219375e..12921cf40e 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -33,7 +33,7 @@ NEPoolingLayer::NEPoolingLayer()
 {
 }
 
-void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
+void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices)
 {
     // Check if we have Global Pooling Layer
     _is_global_pooling_layer = (input->info()->dimension(0) == pool_info.pool_size.width) && (input->info()->dimension(1) == pool_info.pool_size.height);
@@ -42,7 +42,7 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay
     _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
 
     // Configure pooling kernel
-    _pooling_layer_kernel.configure(input, output, pool_info);
+    _pooling_layer_kernel.configure(input, output, pool_info, indices);
 
     switch(_data_layout)
     {
@@ -65,9 +65,9 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay
     }
 }
 
-Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
-    return NEPoolingLayerKernel::validate(input, output, pool_info);
+    return NEPoolingLayerKernel::validate(input, output, pool_info, indices);
 }
 
 void NEPoolingLayer::run()
diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp
index 1012320b0d..a5876dcd0a 100644
--- a/tests/validation/NEON/PoolingLayer.cpp
+++ b/tests/validation/NEON/PoolingLayer.cpp
@@ -111,14 +111,33 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
+template <typename T>
+using NEPoolingLayerIndicesFixture = PoolingLayerIndicesValidationFixture<Tensor, Accessor, NEPoolingLayer, T>;
+
 template <typename T>
 using NEPoolingLayerFixture = PoolingLayerValidationFixture<Tensor, Accessor, NEPoolingLayer, T>;
 
 template <typename T>
 using NESpecialPoolingLayerFixture = SpecialPoolingLayerValidationFixture<Tensor, Accessor, NEPoolingLayer, T>;
 
+const auto PoolingLayerIndicesDatasetFPSmall = combine(combine(combine(framework::dataset::make("PoolType", { PoolingType::MAX }), framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
+                                                               framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0) })),
+                                                       framework::dataset::make("ExcludePadding", { true, false }));
+
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunIndices, NEPoolingLayerIndicesFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
+                                                                                                                   framework::dataset::make("DataType",
+                                                                                                                           DataType::F32))),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW })
+
+                                                                                                                  ))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+    validate(Accessor(_target_indices), _ref_indices);
+}
+
 FIXTURE_DATA_TEST_CASE(RunSpecial, NESpecialPoolingLayerFixture<float>, framework::DatasetMode::ALL, datasets::PoolingLayerDatasetSpecial() * framework::dataset::make("DataType", DataType::F32))
 {
     // Validate output
diff --git a/tests/validation/fixtures/PoolingLayerFixture.h b/tests/validation/fixtures/PoolingLayerFixture.h
index ec186564b7..7f2d7ac225 100644
--- a/tests/validation/fixtures/PoolingLayerFixture.h
+++ b/tests/validation/fixtures/PoolingLayerFixture.h
@@ -34,7 +34,6 @@
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Fixture.h"
 #include "tests/validation/reference/PoolingLayer.h"
-
 #include <random>
 
 namespace arm_compute
@@ -48,7 +47,7 @@ class PoolingLayerValidationGenericFixture : public framework::Fixture
 {
 public:
     template <typename...>
-    void setup(TensorShape shape, PoolingLayerInfo pool_info, DataType data_type, DataLayout data_layout)
+    void setup(TensorShape shape, PoolingLayerInfo pool_info, DataType data_type, DataLayout data_layout, bool indices = false)
     {
         std::mt19937                    gen(library->seed());
         std::uniform_int_distribution<> offset_dis(0, 20);
@@ -59,8 +58,8 @@ public:
         const QuantizationInfo          output_qinfo(scale, scale_out);
 
         _pool_info = pool_info;
-        _target    = compute_target(shape, pool_info, data_type, data_layout, input_qinfo, output_qinfo);
-        _reference = compute_reference(shape, pool_info, data_type, input_qinfo, output_qinfo);
+        _target    = compute_target(shape, pool_info, data_type, data_layout, input_qinfo, output_qinfo, indices);
+        _reference = compute_reference(shape, pool_info, data_type, input_qinfo, output_qinfo, indices);
     }
 
 protected:
@@ -79,7 +78,9 @@ protected:
     }
 
     TensorType compute_target(TensorShape shape, PoolingLayerInfo info,
-                              DataType data_type, DataLayout data_layout, QuantizationInfo input_qinfo, QuantizationInfo output_qinfo)
+                              DataType data_type, DataLayout data_layout,
+                              QuantizationInfo input_qinfo, QuantizationInfo output_qinfo,
+                              bool indices)
     {
         // Change shape in case of NHWC.
         if(data_layout == DataLayout::NHWC)
@@ -91,20 +92,24 @@ protected:
         TensorType        src       = create_tensor<TensorType>(shape, data_type, 1, input_qinfo, data_layout);
         const TensorShape dst_shape = misc::shape_calculator::compute_pool_shape(*(src.info()), info);
         TensorType        dst       = create_tensor<TensorType>(dst_shape, data_type, 1, output_qinfo, data_layout);
+        _target_indices             = create_tensor<TensorType>(dst_shape, DataType::U32, 1);
 
         // Create and configure function
         FunctionType pool_layer;
-        pool_layer.configure(&src, &dst, info);
+        pool_layer.configure(&src, &dst, info, (indices) ? &_target_indices : nullptr);
 
         ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_target_indices.info()->is_resizable(), framework::LogLevel::ERRORS);
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
+        _target_indices.allocator()->allocate();
 
         ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!_target_indices.info()->is_resizable(), framework::LogLevel::ERRORS);
 
         // Fill tensors
         fill(AccessorType(src));
@@ -115,20 +120,33 @@ protected:
         return dst;
     }
 
-    SimpleTensor<T> compute_reference(const TensorShape &shape, PoolingLayerInfo info, DataType data_type, QuantizationInfo input_qinfo, QuantizationInfo output_qinfo)
+    SimpleTensor<T> compute_reference(const TensorShape &shape, PoolingLayerInfo info, DataType data_type,
+                                      QuantizationInfo input_qinfo, QuantizationInfo output_qinfo, bool indices)
     {
         // Create reference
         SimpleTensor<T> src{ shape, data_type, 1, input_qinfo };
-
         // Fill reference
         fill(src);
 
-        return reference::pooling_layer<T>(src, info, output_qinfo);
+        return reference::pooling_layer<T>(src, info, output_qinfo, indices ? &_ref_indices : nullptr);
     }
 
-    TensorType       _target{};
-    SimpleTensor<T>  _reference{};
-    PoolingLayerInfo _pool_info{};
+    TensorType             _target{};
+    SimpleTensor<T>        _reference{};
+    PoolingLayerInfo       _pool_info{};
+    TensorType             _target_indices{};
+    SimpleTensor<uint32_t> _ref_indices{};
+};
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class PoolingLayerIndicesValidationFixture : public PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type, DataLayout data_layout)
+    {
+        PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, PoolingLayerInfo(pool_type, pool_size, data_layout, pad_stride_info, exclude_padding),
+                                                                                               data_type, data_layout, true);
+    }
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
diff --git a/tests/validation/reference/PoolingLayer.cpp b/tests/validation/reference/PoolingLayer.cpp
index ed2eb2c7ec..1a1aebd1b4 100644
--- a/tests/validation/reference/PoolingLayer.cpp
+++ b/tests/validation/reference/PoolingLayer.cpp
@@ -38,13 +38,15 @@ namespace reference
 using namespace arm_compute::misc::shape_calculator;
 
 template <typename T, typename ACC_T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const PoolingLayerInfo &info)
+SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const PoolingLayerInfo &info, SimpleTensor<uint32_t> *indices)
 {
     ARM_COMPUTE_ERROR_ON(info.is_global_pooling && (src.shape().x() != src.shape().y()));
-
     // Create reference
     SimpleTensor<T> dst{ compute_pool_shape(TensorInfo(src.shape(), 1, src.data_type()), info), src.data_type(), 1 };
-
+    if(indices)
+    {
+        *indices = SimpleTensor<uint32_t> { compute_pool_shape(TensorInfo(src.shape(), 1, src.data_type()), info), DataType::U32, 1 };
+    }
     const int   pool_size_x     = info.is_global_pooling ? src.shape().x() : info.pool_size.width;
     const int   pool_size_y     = info.is_global_pooling ? src.shape().y() : info.pool_size.height;
     PoolingType type            = info.pool_type;
@@ -79,6 +81,7 @@ SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const Pooling
                     hstart     = std::max(hstart, 0);
 
                     auto max_val = std::numeric_limits<ACC_T>::lowest();
+                    int  max_index{ 0 };
                     for(int y = hstart; y < hend; ++y)
                     {
                         for(int x = wstart; x < wend; ++x)
@@ -86,12 +89,17 @@ SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const Pooling
                             const auto val = static_cast<ACC_T>(src[r * h_src * w_src + y * w_src + x]);
                             if(val > max_val)
                             {
-                                max_val = val;
+                                max_val   = val;
+                                max_index = coord2index(src.shape(), Coordinates(x, y, r));
                             }
                         }
                     }
 
                     dst[r * h_dst * w_dst + h * w_dst + w] = static_cast<T>(max_val);
+                    if(indices)
+                    {
+                        (*indices)[r * h_dst * w_dst + h * w_dst + w] = max_index;
+                    }
                 }
             }
         }
@@ -151,48 +159,48 @@ SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const Pooling
     return dst;
 }
 
-template SimpleTensor<float> pooling_layer_internal<float>(const SimpleTensor<float> &src, const PoolingLayerInfo &info);
-template SimpleTensor<half> pooling_layer_internal<half>(const SimpleTensor<half> &src, const PoolingLayerInfo &info);
-template SimpleTensor<half> pooling_layer_internal<half, float>(const SimpleTensor<half> &src, const PoolingLayerInfo &info);
+template SimpleTensor<float> pooling_layer_internal<float>(const SimpleTensor<float> &src, const PoolingLayerInfo &info, SimpleTensor<uint32_t> *indices);
+template SimpleTensor<half> pooling_layer_internal<half>(const SimpleTensor<half> &src, const PoolingLayerInfo &info, SimpleTensor<uint32_t> *indices);
+template SimpleTensor<half> pooling_layer_internal<half, float>(const SimpleTensor<half> &src, const PoolingLayerInfo &info, SimpleTensor<uint32_t> *indices);
 
 template <typename T>
-SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo)
+SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> *indices)
 {
     ARM_COMPUTE_UNUSED(output_qinfo);
-    return pooling_layer_internal<T, T>(src, info);
+    return pooling_layer_internal<T, T>(src, info, indices);
 }
 
 template <>
-SimpleTensor<uint8_t> pooling_layer<uint8_t>(const SimpleTensor<uint8_t> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo)
+SimpleTensor<uint8_t> pooling_layer<uint8_t>(const SimpleTensor<uint8_t> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> *indices)
 {
     SimpleTensor<float>   src_tmp = convert_from_asymmetric(src);
-    SimpleTensor<float>   dst_tmp = pooling_layer_internal<float>(src_tmp, info);
+    SimpleTensor<float>   dst_tmp = pooling_layer_internal<float>(src_tmp, info, indices);
     SimpleTensor<uint8_t> dst     = convert_to_asymmetric<uint8_t>(dst_tmp, output_qinfo);
     return dst;
 }
 
 template <>
-SimpleTensor<int8_t> pooling_layer<int8_t>(const SimpleTensor<int8_t> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo)
+SimpleTensor<int8_t> pooling_layer<int8_t>(const SimpleTensor<int8_t> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> *indices)
 {
     SimpleTensor<float>  src_tmp = convert_from_asymmetric(src);
-    SimpleTensor<float>  dst_tmp = pooling_layer_internal<float>(src_tmp, info);
+    SimpleTensor<float>  dst_tmp = pooling_layer_internal<float>(src_tmp, info, indices);
     SimpleTensor<int8_t> dst     = convert_to_asymmetric<int8_t>(dst_tmp, output_qinfo);
     return dst;
 }
 
 template <>
-SimpleTensor<half> pooling_layer(const SimpleTensor<half> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo)
+SimpleTensor<half> pooling_layer(const SimpleTensor<half> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> *indices)
 {
     ARM_COMPUTE_UNUSED(output_qinfo);
     if(src.data_type() == DataType::F16 && info.fp_mixed_precision)
     {
-        return pooling_layer_internal<half, float>(src, info);
+        return pooling_layer_internal<half, float>(src, info, indices);
     }
 
-    return pooling_layer_internal<half>(src, info);
+    return pooling_layer_internal<half>(src, info, indices);
 }
 
-template SimpleTensor<float> pooling_layer(const SimpleTensor<float> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo);
+template SimpleTensor<float> pooling_layer(const SimpleTensor<float> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> *indices);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/PoolingLayer.h b/tests/validation/reference/PoolingLayer.h
index 92d97d548e..3ca7f28d5a 100644
--- a/tests/validation/reference/PoolingLayer.h
+++ b/tests/validation/reference/PoolingLayer.h
@@ -36,9 +36,9 @@ namespace validation
 namespace reference
 {
 template <typename T, typename ACC_T = T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
-SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const PoolingLayerInfo &info);
+SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const PoolingLayerInfo &info, SimpleTensor<uint32_t> *indices);
 template <typename T>
-SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo);
+SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> *indices);
 } // namespace reference
 } // namespace validation
 } // namespace test
-- 
cgit v1.2.1