From 2697fd8fa42425f7bfdd60dd486d4c2132b06523 Mon Sep 17 00:00:00 2001
From: Sang-Hoon Park
Date: Tue, 15 Oct 2019 16:49:24 +0100
Subject: COMPMID-2707: add keep_dims parameter to Reduction Operation

The added parameter is used to decide whether or not to keep the target
dimension of the reduction operation. ArgMinMax operations will always
remove the reduced dimension.

The following things are updated to support the parameter:

- [CL/NEON] functions and reference kernel
- [CL/NEON] ArgMinMax function to use the ReductionOperation function
- [CL/NEON] validation test suites for Reduction and ArgMinMax
  operations to validate the added parameter
- ReductionOperationFixture is modified NOT to pre-populate the output
  tensor and now relies on the underlying kernel/function
- The CL validation test suite for the Reduction operation is adjusted
  to remove excessive test cases with axis values beyond the input
  tensor's dimensions

Change-Id: I3e24d276ed469a4201f323001708f0c525f11c4f
Signed-off-by: Sang-Hoon Park
Reviewed-on: https://review.mlplatform.org/c/2167
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
Reviewed-by: Georgios Pinitas
---
 arm_compute/core/Utils.h                           |  10 ++
 arm_compute/core/utils/misc/ShapeCalculator.h      |  17 ++-
 .../runtime/CL/functions/CLArgMinMaxLayer.h        |  25 +++-
 .../runtime/CL/functions/CLReductionOperation.h    |  28 ++--
 .../runtime/NEON/functions/NEArgMinMaxLayer.h      |   8 +-
 .../runtime/NEON/functions/NEReductionOperation.h  |  32 ++--
 src/core/CL/kernels/CLReductionOperationKernel.cpp |  18 +--
 src/core/Utils.cpp                                 |  10 ++
 src/runtime/CL/functions/CLArgMinMaxLayer.cpp      |  23 ++-
 src/runtime/CL/functions/CLReductionOperation.cpp  | 161 +++++++++++++++++----
 src/runtime/NEON/functions/NEArgMinMaxLayer.cpp    |  24 +--
 .../NEON/functions/NEReductionOperation.cpp        |  80 +++++++++-
 tests/validation/CL/ArgMinMax.cpp                  |  28 ++--
 tests/validation/CL/ReductionOperation.cpp         |  56 +++++--
 tests/validation/NEON/ArgMinMax.cpp                |  16 +-
 tests/validation/NEON/ReductionOperation.cpp       |  36 +++--
 tests/validation/fixtures/ArgMinMaxFixture.h       |   4 +-
 .../fixtures/ReductionOperationFixture.h           |  34 ++---
 18 files changed, 437 insertions(+), 173 deletions(-)

diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 3f04ed9963..3939491bb2 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -881,6 +881,16 @@ std::pair<unsigned int, unsigned int> scaled_dimensions(unsigned int width, unsigned int height, unsigned int kernel_width, unsigned int kernel_height,
                                                         const PadStrideInfo &pad_stride_info,
                                                         const Size2D &dilation = Size2D(1U, 1U));
 
+/** Check if the given reduction operation should be handled in a serial way.
+ *
+ * @param[in] op   Reduction operation to perform
+ * @param[in] dt   Data type
+ * @param[in] axis Axis along which to reduce
+ *
+ * @return True if the given reduction operation should be handled in a serial way.
+ */
+bool needs_serialized_reduction(ReductionOperation op, DataType dt, unsigned int axis);
+
 /** Convert a tensor format into a string.
  *
  * @param[in] format @ref Format to be translated to string.
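For reference, the decision implemented by the new helper can be summarised with a short illustrative check. This sketch is not part of the patch; it simply mirrors the implementation added to src/core/Utils.cpp further down:

    #include <cassert>

    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/Utils.h"

    int main()
    {
        using arm_compute::DataType;
        using arm_compute::ReductionOperation;
        using arm_compute::needs_serialized_reduction;

        // Parallel (multi-stage) path: only SUM-like operations along axis 0
        // on a non-quantized type.
        assert(!needs_serialized_reduction(ReductionOperation::SUM, DataType::F32, 0));

        // Serial path: any other axis, ARG_IDX_MIN/MAX, MIN/MAX, or a quantized type.
        assert(needs_serialized_reduction(ReductionOperation::SUM, DataType::F32, 1));
        assert(needs_serialized_reduction(ReductionOperation::ARG_IDX_MAX, DataType::F32, 0));
        assert(needs_serialized_reduction(ReductionOperation::SUM, DataType::QASYMM8, 0));
        return 0;
    }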
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index c4c360842f..080d63f60d 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -1179,15 +1179,24 @@ inline TensorShape compute_tiled_shape(const TensorShape &input_shape, const Multiples &multiples)
 /** Calculate the reduced shape of a tensor given an axis
  *
- * @param[in] input Input tensor info
- * @param[in] axis  Axis on which to perform reduction
+ * @param[in] input     Input tensor info
+ * @param[in] axis      Axis on which to perform reduction
+ * @param[in] keep_dims (Optional) Whether to keep the dimension after the reduction operation. Defaults to true.
  *
  * @return the calculated shape
  */
-inline TensorShape compute_reduced_shape(const TensorShape &input, unsigned int axis)
+inline TensorShape compute_reduced_shape(const TensorShape &input, unsigned int axis, bool keep_dims = true)
 {
     TensorShape output_shape{ input };
-    output_shape.set(axis, 1);
+
+    if(!keep_dims)
+    {
+        output_shape.remove_dimension(axis);
+    }
+    else
+    {
+        output_shape.set(axis, 1);
+    }
 
     return output_shape;
 }
diff --git a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
index 2384ebcd37..28feee09ab 100644
--- a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
@@ -24,13 +24,16 @@
 #ifndef __ARM_COMPUTE_CLARGMINMAXLAYER_H__
 #define __ARM_COMPUTE_CLARGMINMAXLAYER_H__
 
-#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 
 namespace arm_compute
 {
+class ITensorInfo;
 class ICLTensor;
+class CLReductionOperation;
 
 /** Function to calculate the index of the minimum or maximum values in a
  *  tensor based on an axis.
@@ -39,17 +42,23 @@ class ICLTensor;
  *  responsibility to check that the results do not overflow in case the
  *  output data type is set to signed 32-bit integer (S32).
  */
-class CLArgMinMaxLayer : public ICLSimpleFunction
+class CLArgMinMaxLayer : public IFunction
 {
 public:
+    /** Default Constructor.
+     *
+     * @param[in] memory_manager (Optional) Memory manager.
+     */
+    CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
    /** Set the input and output tensors.
     *
-    * @param[in]  input  Input source tensor. Data types supported: F16/F32.
+    * @param[in]  input  Input source tensor, which may be written to if @ref CLReductionOperation
+    *                    manipulates its border for better performance. Data types supported: F16/F32.
     * @param[in]  axis   Axis to find max/min index.
     * @param[out] output Output source tensor. Data types supported: U32/S32.
     * @param[in]  op     Operation to perform: min or max
     */
-    void configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op);
+    void configure(ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op);
    /** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayer
     *
     * @param[in] input  Input source tensor info. Data types supported: F16/F32.
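To make the keep_dims semantics concrete, here is a small illustrative sketch (not part of the patch) of the shapes compute_reduced_shape() produces in the two modes:

    #include <cassert>

    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/utils/misc/ShapeCalculator.h"

    int main()
    {
        using arm_compute::TensorShape;
        using arm_compute::misc::shape_calculator::compute_reduced_shape;

        const TensorShape in{ 11U, 7U, 3U };

        // keep_dims = true (default): the reduced axis collapses to size 1.
        const TensorShape kept = compute_reduced_shape(in, 1); // (11, 1, 3)
        assert(kept.num_dimensions() == 3 && kept[1] == 1);

        // keep_dims = false: the reduced axis is removed entirely.
        const TensorShape dropped = compute_reduced_shape(in, 1, false); // (11, 3)
        assert(dropped.num_dimensions() == 2 && dropped[1] == 3);
        return 0;
    }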
@@ -60,6 +69,12 @@ public:
     * @return a status
     */
    static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    std::unique_ptr<CLReductionOperation> _reduction_function;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_CLARGMINMAXLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h
index f71313f235..405e1177fd 100644
--- a/arm_compute/runtime/CL/functions/CLReductionOperation.h
+++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -53,35 +54,42 @@ public:
    /** Set the input and output tensors.
     *
-    * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32.
-    * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
-    * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
-    * @param[in]  op     Reduction operation to perform.
+    * @param[in]  input     Source tensor. Data types supported: QASYMM8/F16/F32.
+    * @param[out] output    Destination tensor. Data types and data layouts supported: Same as @p input.
+    * @param[in]  axis      Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
+    * @param[in]  op        Reduction operation to perform.
+    * @param[in]  keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
     */
-    void configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op);
+    void configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
    /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperation.
     *
-    * @param[in] input  Source tensor info. Data types supported: QASYMM8/F16/F32.
-    * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
-    * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
-    * @param[in] op     Reduction operation to perform.
+    * @param[in] input     Source tensor info. Data types supported: QASYMM8/F16/F32.
+    * @param[in] output    Destination tensor info. Data types and data layouts supported: Same as @p input.
+    * @param[in] axis      Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
+    * @param[in] op        Reduction operation to perform.
+    * @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
     *
     * @return a status
     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
 
    // Inherited methods overridden:
    void run() override;
 
 private:
+    ICLTensor *configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output);
+
    MemoryGroup                             _memory_group;
    std::vector<CLTensor>                   _results_vector;
    std::vector<CLReductionOperationKernel> _reduction_kernels_vector;
    std::vector<CLFillBorderKernel>         _border_handlers_vector;
+    CLReshapeLayerKernel                    _reshape_kernel;
+    ReductionOperation                      _op;
    unsigned int                            _num_of_stages;
    unsigned int                            _reduction_axis;
    bool                                    _is_serial;
+    bool                                    _is_reshape_required;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_CLREDUCTIONOPERATION_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
index 85bf7d92c9..b0e2d783b3 100644
--- a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
@@ -24,8 +24,6 @@
 #ifndef __ARM_COMPUTE_NEARGMINMAXLAYER_H__
 #define __ARM_COMPUTE_NEARGMINMAXLAYER_H__
 
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/INESimpleFunction.h"
@@ -33,6 +31,7 @@
 namespace arm_compute
 {
 class ITensor;
+class NEReductionOperation;
 
 /** Function to calculate the index of the minimum or maximum values in a
  *  tensor based on an axis.
@@ -74,10 +73,7 @@ public:
    void run() override;
 
 private:
-    MemoryGroup                _memory_group;
-    NEReductionOperationKernel _reduction_kernel;
-    NEFillBorderKernel         _fill_border_kernel;
-    bool                       _run_fill_border;
+    std::unique_ptr<NEReductionOperation> _reduction_function;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_NEARGMINMAXLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEReductionOperation.h b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
index 5bc7059b62..1e72c4f74d 100644
--- a/arm_compute/runtime/NEON/functions/NEReductionOperation.h
+++ b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,7 +28,9 @@
 
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
 
 namespace arm_compute
 {
@@ -44,35 +46,41 @@ class NEReductionOperation : public IFunction
 {
 public:
    /** Default constructor */
-    NEReductionOperation();
+    NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
    /** Set the input and output tensors.
     *
-    * @param[in]  input  Source tensor. Data type supported: QASYMM8/F16/F32. Data layouts supported: NCHW. (Written to only for border_size != 0)
-    * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
-    * @param[in]  axis   Dimension along which to reduce. Supported reduction axis : 0
-    * @param[in]  op     Reduction operation to perform.
+    * @param[in]  input     Source tensor. Data type supported: QASYMM8/F16/F32. Data layouts supported: NCHW. (Written to only for border_size != 0)
+    * @param[out] output    Destination tensor. Data types and data layouts supported: same as @p input.
+    * @param[in]  axis      Dimension along which to reduce. Supported reduction axis : 0
+    * @param[in]  op        Reduction operation to perform.
+    * @param[in]  keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
     */
-    void configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op);
+    void configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
    /** Static function to check if given info will lead to a valid configuration of @ref NEReductionOperation.
     *
-    * @param[in] input  Source tensor info. Data type supported: QASYMM8/F16/F32. Data layouts supported: NCHW. (Written to only for border_size != 0)
-    * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input.
-    * @param[in] axis   Dimension along which to reduce. Supported reduction axis : 0
-    * @param[in] op     Reduction operation to perform.
+    * @param[in] input     Source tensor info. Data type supported: QASYMM8/F16/F32. Data layouts supported: NCHW. (Written to only for border_size != 0)
+    * @param[in] output    Destination tensor info. Data types and data layouts supported: same as @p input.
+    * @param[in] axis      Dimension along which to reduce. Supported reduction axis : 0
+    * @param[in] op        Reduction operation to perform.
+    * @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
     *
     * @return a status
     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
 
    // Inherited methods overridden:
    void run() override;
 
 private:
+    MemoryGroup                _memory_group;
    NEReductionOperationKernel _reduction_kernel;
    NEFillBorderKernel         _fill_border_kernel;
+    NEReshapeLayerKernel       _reshape_kernel;
+    Tensor                     _output_internal;
    size_t                     _window_split;
    int                        _reduction_axis;
+    bool                       _is_reshape_required;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_NEREDUCTIONOPERATION_H__ */
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 8e92b591d1..a085ab1683 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -33,6 +33,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include "support/ToolchainSupport.h"
 
@@ -80,17 +81,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width)
 std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis, ReductionOperation op)
 {
    // Output tensor auto initialization if not yet initialized
-    TensorShape output_shape{ input->tensor_shape() };
-    output_shape.set(axis, 1);
-    const bool is_arg_min_max   = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
-    DataType   output_data_type = is_arg_min_max ? DataType::U32 : input->data_type();
+    const bool        is_arg_min_max   = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
+    const TensorShape output_shape     = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, !is_arg_min_max);
+    const DataType    output_data_type = is_arg_min_max ? DataType::U32 : input->data_type();
    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
 
    const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16;
    Window             win                               = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
    bool               window_changed                    = false;
-    const bool         is_serial_op                      = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::MIN
-                                                            || op == ReductionOperation::MAX || is_data_type_quantized(input->data_type()));
+    const bool         is_serial_op                      = needs_serialized_reduction(op, input->data_type(), axis);
 
    switch(axis)
    {
@@ -198,8 +197,8 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width)
    // Create kernel
    cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
    std::string kernel_axis_name;
-    const bool  is_serial_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::MIN || op == ReductionOperation::MAX
-                                || is_data_type_quantized(input->info()->data_type()));
+    const bool  is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis);
+
    switch(axis)
    {
        case 0:
@@ -264,8 +263,7 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &queue)
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    const bool is_serial_op = (_op == ReductionOperation::ARG_IDX_MAX || _op == ReductionOperation::ARG_IDX_MIN || _op == ReductionOperation::MIN || _op == ReductionOperation::MAX
-                               || is_data_type_quantized(_input->info()->data_type()));
+    const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis);
    switch(_reduction_axis)
    {
        case 0:
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index 7e1af0e27d..fa335d757b 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -427,6 +427,16 @@ std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsigned int width, unsigned int height, unsigned int kernel_width, unsigned int kernel_height,
    return std::make_pair(w, h);
 }
 
+bool arm_compute::needs_serialized_reduction(ReductionOperation op, DataType dt, unsigned int axis)
+{
+    const bool is_arg_min_max    = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
+    const bool is_min_max        = (op == ReductionOperation::MAX || op == ReductionOperation::MIN);
+    const bool is_quantized_type = is_data_type_quantized(dt);
+    const bool is_first_dim      = (axis == 0);
+
+    return !is_first_dim || is_arg_min_max || is_min_max || is_quantized_type;
+}
+
 #ifdef ARM_COMPUTE_ASSERTS_ENABLED
 void arm_compute::print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim)
 {
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
index a6393c57c1..fd172d5f2c 100644
--- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,26 +23,33 @@
  */
 #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"
+#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
 
-#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
 
 namespace arm_compute
 {
-void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _reduction_function(support::cpp14::make_unique<CLReductionOperation>(std::move(memory_manager)))
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel>();
-    k->configure(input, output, axis, op);
-    _kernel = std::move(k);
+}
+
+void CLArgMinMaxLayer::configure(ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+{
+    _reduction_function->configure(input, output, axis, op, false);
 }
 
 Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
 {
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
-    return CLReductionOperationKernel::validate(input, output, axis, op);
+    return CLReductionOperation::validate(input, output, axis, op, false);
+}
+
+void CLArgMinMaxLayer::run()
+{
+    _reduction_function->run();
 }
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 38f0a7523c..447c15b1e8 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -26,15 +26,17 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 unsigned int calculate_number_of_stages(const ITensorInfo *input, unsigned int axis)
@@ -56,17 +58,52 @@ unsigned int calculate_number_of_stages(const ITensorInfo *input, unsigned int axis)
 }
 } // namespace
 
 CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_serial()
+    : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _reshape_kernel(), _op(), _num_of_stages(), _reduction_axis(), _is_serial(),
+      _is_reshape_required(false)
 {
 }
 
-Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
 {
-    const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
-    bool               is_serial     = is_data_type_quantized(input->data_type()) || axis != 0;
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+
+    const unsigned int num_of_stages       = calculate_number_of_stages(input, axis);
+    const bool         is_serial           = needs_serialized_reduction(op, input->data_type(), axis);
+    const bool         is_arg_min_max      = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
+    const bool         is_reshape_required = !keep_dims || is_arg_min_max;
+
+    if(is_reshape_required)
+    {
+        const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
+    }
+
+    auto *output_internal = output;
+
+    TensorInfo output_before_reshape;
+    const auto input_shape        = input->tensor_shape();
+    const auto input_data_type    = input->data_type();
+    const auto input_num_channels = input->num_channels();
+    const auto input_qinfo        = input->quantization_info();
+    const auto output_data_type   = is_arg_min_max ? DataType::U32 : output->data_type();
+
+    auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo)
+    {
+        ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo);
+    };
+
+    if(is_reshape_required)
+    {
+        auto shape_before_reshape = input_shape;
+        shape_before_reshape.set(axis, 1);
+        initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channels, input_qinfo);
+        output_internal = &output_before_reshape;
+    }
+
    if(is_serial)
    {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output_internal, axis, op));
    }
    else
    {
@@ -74,14 +111,13 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
        std::vector<TensorInfo> sums_vector(num_of_stages - 1);
 
        // Create intermediate tensor info
-        TensorShape shape{ input->tensor_shape() };
+        TensorShape shape{ input_shape };
+
+        shape.set(0, ceil(shape.x() / 128.f));
 
        for(unsigned int i = 0; i < num_of_stages - 1; i++)
        {
-            shape.set(0, ceil(shape.x() / 128.f));
-            sums_vector[i].set_data_type(input->data_type());
-            sums_vector[i].set_tensor_shape(shape);
-            sums_vector[i].set_num_channels(input->num_channels());
+            initialize_tensorinfo(sums_vector[i], shape, input_data_type, input_num_channels, input_qinfo);
        }
 
        ReductionOperation first_kernel_op;
@@ -130,17 +166,72 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
 
        // Validate ReductionOperation on the last stage
        const unsigned int last_stage = num_of_stages - 1;
-        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[last_stage - 1], output, axis, last_kernel_op, input->dimension(0)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[last_stage - 1], output_internal, axis, last_kernel_op, input->dimension(0)));
+    }
+
+    if(is_reshape_required)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(output_internal, output));
    }
 
    return Status{};
 }
 
-void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output)
+{
+ if(!_is_reshape_required && _is_serial) + { + return output; + } + + auto intermediate_result_vector_size = _is_serial ? 1 : _num_of_stages; + const auto is_arg_min_max = (_op == ReductionOperation::ARG_IDX_MAX || _op == ReductionOperation::ARG_IDX_MIN); + + if(!_is_reshape_required) + { + --intermediate_result_vector_size; + } + + _results_vector.resize(intermediate_result_vector_size); + auto shape = input->info()->tensor_shape(); + + shape.set(_reduction_axis, _is_serial ? 1 : ceil(shape.x() / 128.f)); + + for(auto &v : _results_vector) + { + if(&v == &_results_vector.back() && _is_reshape_required) + { + shape.set(_reduction_axis, 1); + } + v.allocator()->init(input->info()->clone()->set_tensor_shape(shape)); + } + + if(is_arg_min_max) + { + _results_vector.back().info()->set_data_type(DataType::U32).set_is_resizable(true).reset_padding(); + } + + return _is_reshape_required ? &_results_vector.back() : output; +} + +void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) { - _num_of_stages = calculate_number_of_stages(input->info(), axis); - _reduction_axis = axis; - _is_serial = is_data_type_quantized(input->info()->data_type()) || axis != 0; + _op = op; + _num_of_stages = calculate_number_of_stages(input->info(), axis); + _reduction_axis = axis; + _is_serial = needs_serialized_reduction(op, input->info()->data_type(), axis); + const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); + _is_reshape_required = !keep_dims || is_arg_min_max; + + auto *output_internal = configure_intermediate_result_vector(input, output); + + // ArgMinMax might not give initialized output tensor, so initialize here. + if(_is_reshape_required) + { + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + const auto output_data_type = is_arg_min_max ? 
DataType::U32 : input->info()->data_type(); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); + } // Configure reduction operation kernels _reduction_kernels_vector.resize(_num_of_stages); @@ -148,20 +239,16 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign // Create temporary tensors if(_is_serial) { - _reduction_kernels_vector[0].configure(input, output, axis, op, 0); + if(_is_reshape_required) + { + _memory_group.manage(&_results_vector.back()); + } + + _reduction_kernels_vector[0].configure(input, output_internal, axis, op, 0); } else { _border_handlers_vector.resize(_num_of_stages); - _results_vector.resize(_num_of_stages - 1); - TensorShape shape{ input->info()->tensor_shape() }; - for(unsigned int i = 0; i < _num_of_stages - 1; i++) - { - shape.set(0, ceil(shape.x() / 128.f)); - _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape)); - } - - // Apply ReductionOperation only on first kernel _memory_group.manage(&_results_vector[0]); ReductionOperation first_kernel_op; @@ -262,10 +349,22 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign // Apply ReductionOperation on the last stage const unsigned int last_stage = _num_of_stages - 1; const unsigned int input_width = input->info()->dimension(0); - _reduction_kernels_vector[last_stage].configure(&_results_vector[last_stage - 1], output, axis, last_kernel_op, input_width); + + if(_is_reshape_required) + { + _memory_group.manage(&_results_vector.back()); + } + + _reduction_kernels_vector[last_stage].configure(&_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width); _border_handlers_vector[last_stage].configure(&_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue); _results_vector[last_stage - 1].allocator()->allocate(); } + + if(_is_reshape_required) + { + _reshape_kernel.configure(&_results_vector.back(), output); + _results_vector.back().allocator()->allocate(); + } } void CLReductionOperation::run() @@ -284,4 +383,10 @@ void CLReductionOperation::run() CLScheduler::get().enqueue(_reduction_kernels_vector[i], false); } } + + if(_is_reshape_required) + { + CLScheduler::get().enqueue(_reshape_kernel, false); + } } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp index 6863bb0b3b..ab2d6f0c1f 100644 --- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp +++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp @@ -23,47 +23,35 @@ */ #include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" namespace arm_compute { NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernel(), _fill_border_kernel(), _run_fill_border(false) + : _reduction_function(support::cpp14::make_unique()) { + ARM_COMPUTE_UNUSED(memory_manager); } void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op) { - _reduction_kernel.configure(input, output, 
axis, op); - - if(axis == 0) - { - _fill_border_kernel.configure(input, _reduction_kernel.border_size(), BorderMode::REPLICATE); - _run_fill_border = true; - } + _reduction_function->configure(input, output, axis, op, false); } Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation"); - ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, axis, op)); - return Status{}; + return NEReductionOperation::validate(input, output, axis, op, false); } void NEArgMinMaxLayer::run() { - MemoryGroupResourceScope scope_mg(_memory_group); - - if(_run_fill_border) - { - NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY); - } - NEScheduler::get().schedule(&_reduction_kernel, Window::DimY); + _reduction_function->run(); } } // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp index dc6cf59019..09cd765d4b 100644 --- a/src/runtime/NEON/functions/NEReductionOperation.cpp +++ b/src/runtime/NEON/functions/NEReductionOperation.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" namespace arm_compute @@ -52,25 +53,78 @@ size_t reduction_window_split_dimension(unsigned int axis) } } // namespace -NEReductionOperation::NEReductionOperation() - : _reduction_kernel(), _fill_border_kernel(), _window_split(0), _reduction_axis() +NEReductionOperation::NEReductionOperation(std::shared_ptr memory_manager) + : _memory_group(memory_manager), _reduction_kernel(), _fill_border_kernel(), _reshape_kernel(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false) { } -Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) +Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) { - ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, axis, op)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); + + const auto is_reshape_required = !keep_dims; + + auto *output_internal = output; + + TensorInfo info_before_reshape; + + if(is_reshape_required) + { + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); + + auto shape_before_reshape = input->tensor_shape(); + shape_before_reshape.set(axis, 1); + + const auto input_num_channles = input->num_channels(); + const auto input_qinfo = input->quantization_info(); + const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); + const auto output_data_type = is_arg_min_max ? 
DataType::U32 : output->data_type(); + + info_before_reshape.set_data_type(output_data_type).set_tensor_shape(shape_before_reshape).set_num_channels(input_num_channles).set_quantization_info(input_qinfo); + + output_internal = &info_before_reshape; + } + + ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output_internal, axis, op)); + + if(is_reshape_required) + { + ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(output_internal, output)); + } return Status{}; } -void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op) +void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op)); + + _is_reshape_required = !keep_dims; + + auto *output_internal = output; + const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); + + if(_is_reshape_required) + { + const auto output_internal_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); + const auto output_external_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + const auto output_data_type = is_arg_min_max ? DataType::U32 : input->info()->data_type(); + const auto num_channels = input->info()->num_channels(); + const auto qinfo = input->info()->quantization_info(); + + _output_internal.allocator()->init(input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_internal_shape).reset_padding().set_is_resizable(true).set_num_channels( + num_channels).set_quantization_info(qinfo)); + _memory_group.manage(&_output_internal); + output_internal = &_output_internal; + auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_external_shape).reset_padding().set_is_resizable(true)); + } + + ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op, keep_dims)); // Configure reduction kernel - _reduction_kernel.configure(input, output, axis, op); + _reduction_kernel.configure(input, output_internal, axis, op); _window_split = reduction_window_split_dimension(axis); _reduction_axis = axis; @@ -150,7 +204,13 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned i default: ARM_COMPUTE_ERROR("Reduction Operation unsupported"); } - _fill_border_kernel.configure(input, fill_border_size, BorderMode::CONSTANT, pixelValue); + _fill_border_kernel.configure(input, fill_border_size, (is_arg_min_max ? 
BorderMode::REPLICATE : BorderMode::CONSTANT), pixelValue); + } + + if(_is_reshape_required) + { + _reshape_kernel.configure(output_internal, output); + _output_internal.allocator()->allocate(); } } @@ -161,5 +221,9 @@ void NEReductionOperation::run() NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY); } NEScheduler::get().schedule(&_reduction_kernel, _window_split); + if(_is_reshape_required) + { + NEScheduler::get().schedule(&_reshape_kernel, Window::DimY); + } } } // namespace arm_compute diff --git a/tests/validation/CL/ArgMinMax.cpp b/tests/validation/CL/ArgMinMax.cpp index 6de09bed25..845fdbf493 100644 --- a/tests/validation/CL/ArgMinMax.cpp +++ b/tests/validation/CL/ArgMinMax.cpp @@ -25,7 +25,9 @@ #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h" +#include "arm_compute/runtime/CL/functions/CLReductionOperation.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "tests/CL/CLAccessor.h" #include "tests/datasets/ShapeDatasets.h" #include "tests/datasets/SplitDataset.h" @@ -49,16 +51,18 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32), // Invalid axis TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32), // Invalid output shape TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32) // Invalid operation + TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32), // Invalid operation + TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32) // Not allowed keeping the dimension }), - framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::U32), - TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::F32) + framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(27U, 3U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(27U, 3U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 16U, 2U), 1, DataType::U32), + TensorInfo(TensorShape(32U, 16U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::U32) })), - framework::dataset::make("Axis", { 4, 0, 2, 0 })), - framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MAX, ReductionOperation::ARG_IDX_MAX, ReductionOperation::ARG_IDX_MAX, ReductionOperation::MEAN_SUM })), - framework::dataset::make("Expected", { false, false, true, false })), + framework::dataset::make("Axis", { 4, 0, 2, 0, 2 })), + framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MAX, ReductionOperation::ARG_IDX_MAX, ReductionOperation::ARG_IDX_MAX, ReductionOperation::MEAN_SUM, ReductionOperation::ARG_IDX_MAX })), + framework::dataset::make("Expected", { false, false, true, false, false })), input_info, output_info, axis, operation, expected) { const Status status = CLArgMinMaxLayer::validate(&input_info.clone()->set_is_resizable(false), axis, &output_info.clone()->set_is_resizable(false), operation); @@ -76,13 +80,13 @@ DATA_TEST_CASE(Configuration, CLTensor ref_src = create_tensor(shape, data_type); CLTensor dst; + constexpr int axis = 1; + // Create and Configure function CLArgMinMaxLayer arg_min_max_layer; - arg_min_max_layer.configure(&ref_src, 1, &dst, ReductionOperation::ARG_IDX_MAX); + 
arg_min_max_layer.configure(&ref_src, axis, &dst, ReductionOperation::ARG_IDX_MAX); - // Validate valid region - TensorShape output_shape = shape; - output_shape.set(1, 1); + const auto output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(shape, axis, false); const ValidRegion valid_region = shape_to_valid_region(output_shape); validate(dst.info()->valid_region(), valid_region); } diff --git a/tests/validation/CL/ReductionOperation.cpp b/tests/validation/CL/ReductionOperation.cpp index 9a3cd996fa..1dec020d18 100644 --- a/tests/validation/CL/ReductionOperation.cpp +++ b/tests/validation/CL/ReductionOperation.cpp @@ -57,6 +57,7 @@ const auto ReductionOperations = framework::dataset::make("ReductionOperation", }); +const auto KeepDimensions = framework::dataset::make("KeepDims", { true, false }); } // namespace TEST_SUITE(CL) @@ -64,29 +65,34 @@ TEST_SUITE(ReductionOperation) // *INDENT-OFF* // clang-format off -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( framework::dataset::make("InputInfo", { TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Mismatching data type input/output TensorInfo(TensorShape(128U, 64U), 3, DataType::F32), // Number of Input channels != 1 TensorInfo(TensorShape(128U, 64U), 1, DataType::S16), // DataType != QASYMM8/F16/F32 TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis >= num_max_dimensions TensorInfo(TensorShape(128U, 64U), 1, DataType::QASYMM8), // Axis == 0 and SUM_SQUARE and QASYMM8 - TensorInfo(TensorShape(128U, 64U), 1, DataType::F32) + TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), + TensorInfo(TensorShape(128U, 64U), 1, DataType::F32) // Kept Dimension when keep_dims = false + }), framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(1U, 64U), 1, DataType::F16), TensorInfo(TensorShape(1U, 64U), 1, DataType::F32), TensorInfo(TensorShape(1U, 64U), 1, DataType::S16), TensorInfo(TensorShape(1U, 64U), 1, DataType::F32), TensorInfo(TensorShape(1U, 64U), 1, DataType::QASYMM8), + TensorInfo(TensorShape(1U, 64U), 1, DataType::F32), TensorInfo(TensorShape(1U, 64U), 1, DataType::F32) })), - framework::dataset::make("Axis", { 0U, 0U, 0U, static_cast(TensorShape::num_max_dimensions), 1U, 0U })), - framework::dataset::make("Expected", { false, false, false, false, false, true })), - input_info, output_info, axis, expected) + framework::dataset::make("Axis", { 0U, 0U, 0U, static_cast(TensorShape::num_max_dimensions), 1U, 0U, 0U })), + framework::dataset::make("KeepDims", { true, true, true, true, true, true, false })), + framework::dataset::make("Expected", { false, false, false, false, false, true , false })), + input_info, output_info, axis, keep_dims, expected) { bool is_valid = bool(CLReductionOperation::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(true), axis, - ReductionOperation::SUM_SQUARE)); + ReductionOperation::SUM_SQUARE, + keep_dims)); ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); } // clang-format on @@ -97,28 +103,54 @@ using CLReductionOperationFixture = ReductionOperationFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), ReductionOperations)) +FIXTURE_DATA_TEST_CASE(RunSmall2D, CLReductionOperationFixture, framework::DatasetMode::PRECOMMIT, + 
combine(combine(combine(combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1 })), ReductionOperations), KeepDimensions)) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f16); +} +FIXTURE_DATA_TEST_CASE(RunSmall3D, CLReductionOperationFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(datasets::Small3DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2 })), ReductionOperations), KeepDimensions)) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f16); +} +FIXTURE_DATA_TEST_CASE(RunSmall4D, CLReductionOperationFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), ReductionOperations), + KeepDimensions)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16); } FIXTURE_DATA_TEST_CASE(RunLarge, CLReductionOperationFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), ReductionOperations)) + combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), ReductionOperations), KeepDimensions)) { // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0, tolerance_f16); } TEST_SUITE_END() // F16 TEST_SUITE(FP32) -FIXTURE_DATA_TEST_CASE(RunSmall, CLReductionOperationFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), ReductionOperations)) +FIXTURE_DATA_TEST_CASE(RunSmall2D, CLReductionOperationFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1 })), ReductionOperations), KeepDimensions)) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32); +} +FIXTURE_DATA_TEST_CASE(RunSmall3D, CLReductionOperationFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(datasets::Small3DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2 })), ReductionOperations), KeepDimensions)) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32); +} +FIXTURE_DATA_TEST_CASE(RunSmall4D, CLReductionOperationFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), ReductionOperations), + KeepDimensions)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE(RunLarge, CLReductionOperationFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), ReductionOperations)) + combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 
})), ReductionOperations), KeepDimensions)) { // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0, tolerance_f32); diff --git a/tests/validation/NEON/ArgMinMax.cpp b/tests/validation/NEON/ArgMinMax.cpp index 71fb39a30d..642a69ba5f 100644 --- a/tests/validation/NEON/ArgMinMax.cpp +++ b/tests/validation/NEON/ArgMinMax.cpp @@ -24,9 +24,11 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" #include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "tests/NEON/Accessor.h" #include "tests/datasets/ShapeDatasets.h" #include "tests/datasets/SplitDataset.h" @@ -54,7 +56,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( }), framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32), TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::U32), + TensorInfo(TensorShape(32U, 16U, 2U), 1, DataType::U32), TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::F32) })), framework::dataset::make("Axis", { 4, 0, 2, 0 })), @@ -74,17 +76,17 @@ DATA_TEST_CASE(Configuration, shape, data_type) { // Create tensors - Tensor ref_src = create_tensor(shape, data_type); - Tensor dst; + Tensor ref_src = create_tensor(shape, data_type); + Tensor dst; + const int axis = 1; // Create and Configure function NEArgMinMaxLayer arg_min_max_layer; - arg_min_max_layer.configure(&ref_src, 1, &dst, ReductionOperation::ARG_IDX_MAX); + arg_min_max_layer.configure(&ref_src, axis, &dst, ReductionOperation::ARG_IDX_MAX); // Validate valid region - TensorShape output_shape = shape; - output_shape.set(1, 1); - const ValidRegion valid_region = shape_to_valid_region(output_shape); + const auto expected_output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(shape, axis, false); + const ValidRegion valid_region = shape_to_valid_region(expected_output_shape); validate(dst.info()->valid_region(), valid_region); } diff --git a/tests/validation/NEON/ReductionOperation.cpp b/tests/validation/NEON/ReductionOperation.cpp index 5b697a5efa..3a7f707d23 100644 --- a/tests/validation/NEON/ReductionOperation.cpp +++ b/tests/validation/NEON/ReductionOperation.cpp @@ -66,6 +66,8 @@ const auto QuantizationInfos = framework::dataset::make("QuantizationInfo", const auto Axises = framework::dataset::make("Axis", { 0, 1, 2, 3 }); +const auto KeepDims = framework::dataset::make("KeepDims", { true, false }); + } // namespace TEST_SUITE(NEON) @@ -73,27 +75,31 @@ TEST_SUITE(ReductionOperation) // *INDENT-OFF* // clang-format off -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( framework::dataset::make("InputInfo", { TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Mismatching data type input/output TensorInfo(TensorShape(128U, 64U), 2, DataType::F32), // Number of Input channels != 1 TensorInfo(TensorShape(128U, 64U), 1, DataType::S16), // DataType != F32 TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis >= num_max_dimensions - TensorInfo(TensorShape(128U, 64U), 1, DataType::F32) + TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), + TensorInfo(TensorShape(128U, 64U), 1, DataType::F32) // Kept dimension when 
keep_dims = false }), framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(1U, 64U), 1, DataType::F16), TensorInfo(TensorShape(1U, 64U), 1, DataType::F32), TensorInfo(TensorShape(1U, 64U), 1, DataType::S16), TensorInfo(TensorShape(1U, 64U), 1, DataType::F32), + TensorInfo(TensorShape(1U, 64U), 1, DataType::F32), TensorInfo(TensorShape(1U, 64U), 1, DataType::F32) })), - framework::dataset::make("Axis", { 0U, 0U, 0U, static_cast(TensorShape::num_max_dimensions), 0U })), - framework::dataset::make("Expected", { false, false, false, false, true })), - input_info, output_info, axis, expected) + framework::dataset::make("Axis", { 0U, 0U, 0U, static_cast(TensorShape::num_max_dimensions), 0U, 0U })), + framework::dataset::make("KeepDims", { true, true, true, true, true, false})), + framework::dataset::make("Expected", { false, false, false, false, true, false })), + input_info, output_info, axis, keep_dims, expected) { bool is_valid = bool(NEReductionOperation::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(true), axis, - ReductionOperation::SUM_SQUARE)); + ReductionOperation::SUM_SQUARE, + keep_dims)); ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); } // clang-format on @@ -104,13 +110,13 @@ using NEReductionOperationFixture = ReductionOperationFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), Axises), ReductionOperations)) + combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), Axises), ReductionOperations), KeepDims)) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE(RunLarge, NEReductionOperationFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::F32)), Axises), ReductionOperations)) + combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::F32)), Axises), ReductionOperations), KeepDims)) { // Validate output validate(Accessor(_target), _reference, rel_tolerance_f32, 0, tolerance_f32); @@ -122,17 +128,19 @@ using NEReductionOperationQuantizedFixture = ReductionOperationQuantizedFixture< TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmall, NEReductionOperationQuantizedFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), Axises), - ReductionOperations), - QuantizationInfos)) + combine(combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), Axises), + ReductionOperations), + QuantizationInfos), + KeepDims)) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } FIXTURE_DATA_TEST_CASE(RunLarge, NEReductionOperationQuantizedFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), Axises), - ReductionOperations), - QuantizationInfos)) + combine(combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), Axises), + ReductionOperations), + QuantizationInfos), + KeepDims)) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); diff --git a/tests/validation/fixtures/ArgMinMaxFixture.h 
b/tests/validation/fixtures/ArgMinMaxFixture.h index ed6b51abe5..f8fe4ff1ee 100644 --- a/tests/validation/fixtures/ArgMinMaxFixture.h +++ b/tests/validation/fixtures/ArgMinMaxFixture.h @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/Tensor.h" #include "tests/AssetsLibrary.h" #include "tests/Globals.h" @@ -121,8 +122,7 @@ protected: // Fill reference fill(src); - TensorShape output_shape = src_shape; - output_shape.set(axis, 1); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(src_shape, axis, false); return reference::reduction_operation(src, output_shape, axis, op); } diff --git a/tests/validation/fixtures/ReductionOperationFixture.h b/tests/validation/fixtures/ReductionOperationFixture.h index d01f41abf0..867c08ec3a 100644 --- a/tests/validation/fixtures/ReductionOperationFixture.h +++ b/tests/validation/fixtures/ReductionOperationFixture.h @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/Tensor.h" #include "tests/AssetsLibrary.h" #include "tests/Globals.h" @@ -45,11 +46,15 @@ class ReductionOperationValidationFixture : public framework::Fixture { public: template - void setup(TensorShape shape, DataType data_type, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info) + void setup(TensorShape shape, DataType data_type, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info, bool keep_dims = false) { - const TensorShape output_shape = get_output_shape(shape, axis); - _target = compute_target(shape, output_shape, data_type, axis, op, quantization_info); - _reference = compute_reference(shape, output_shape, data_type, axis, op, quantization_info); + const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); + _keep_dims = keep_dims && !is_arg_min_max; + + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(shape, axis, _keep_dims); + + _target = compute_target(shape, data_type, axis, op, quantization_info); + _reference = compute_reference(shape, output_shape, data_type, axis, op, quantization_info); } protected: @@ -70,15 +75,15 @@ protected: } } - TensorType compute_target(const TensorShape &src_shape, const TensorShape &dst_shape, DataType data_type, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info) + TensorType compute_target(const TensorShape &src_shape, DataType data_type, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info) { // Create tensors TensorType src = create_tensor(src_shape, data_type, 1, quantization_info); - TensorType dst = create_tensor(dst_shape, data_type, 1, quantization_info); + TensorType dst; // Create and configure function FunctionType reduction_func; - reduction_func.configure(&src, &dst, axis, op); + reduction_func.configure(&src, &dst, axis, op, _keep_dims); ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); @@ -114,12 +119,7 @@ protected: SimpleTensor _reference{}; private: - TensorShape get_output_shape(TensorShape shape, unsigned int axis) - { - TensorShape output_shape(shape); - output_shape.set(axis, 1); - return output_shape; - } + bool _keep_dims{ false }; }; 
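Because the fixture above no longer pre-populates the destination tensor, output auto-initialisation is now exercised end to end. The following sketch (illustrative only, not part of the patch; names and shapes are hypothetical) shows the behaviour the fixture relies on:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLReductionOperation.h"

    int main()
    {
        using namespace arm_compute;
        CLScheduler::get().default_init(); // required before configuring CL functions

        CLTensor src{};
        CLTensor dst{}; // deliberately left uninitialised, as the updated fixture does
        src.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));

        CLReductionOperation reduce;
        reduce.configure(&src, &dst, /* axis */ 0, ReductionOperation::SUM, /* keep_dims */ false);

        // configure() auto-initialises dst: shape (4) (axis 0 reduced and removed),
        // data type F32, matching compute_reduced_shape(src_shape, 0, false).
        return 0;
    }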
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
@@ -127,9 +127,9 @@ class ReductionOperationQuantizedFixture : public ReductionOperationValidationFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
    template <typename...>
-    void setup(TensorShape shape, DataType data_type, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info = QuantizationInfo())
+    void setup(TensorShape shape, DataType data_type, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info = QuantizationInfo(), bool keep_dims = false)
    {
-        ReductionOperationValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, axis, op, quantization_info);
+        ReductionOperationValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, axis, op, quantization_info, keep_dims);
    }
 };
 
@@ -138,9 +138,9 @@ class ReductionOperationFixture : public ReductionOperationValidationFixture<TensorType, AccessorType, FunctionType, T>
 public:
    template <typename...>
-    void setup(TensorShape shape, DataType data_type, unsigned int axis, ReductionOperation op)
+    void setup(TensorShape shape, DataType data_type, unsigned int axis, ReductionOperation op, bool keep_dims = false)
    {
-        ReductionOperationValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, axis, op, QuantizationInfo());
+        ReductionOperationValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, axis, op, QuantizationInfo(), keep_dims);
    }
 };
 } // namespace validation
--
cgit v1.2.1
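Finally, to illustrate the user-visible effect (not part of the patch): CLArgMinMaxLayer now always drops the reduced axis, i.e. it follows the keep_dims = false path of CLReductionOperation. The shapes below follow the valid case added to the CL ArgMinMax validate tests (input (32, 16, 16, 2), axis 2, output (32, 16, 2) with data type U32):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"

    int main()
    {
        using namespace arm_compute;
        CLScheduler::get().default_init();

        CLTensor input{};
        CLTensor indices{};
        input.allocator()->init(TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32));

        CLArgMinMaxLayer argmax;
        argmax.configure(&input, /* axis */ 2, &indices, ReductionOperation::ARG_IDX_MAX);

        // indices is auto-initialised to shape (32, 16, 2), data type U32: the
        // reduced axis is always removed for ArgMinMax operations.
        input.allocator()->allocate();
        indices.allocator()->allocate();
        argmax.run();
        CLScheduler::get().sync();
        return 0;
    }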