6 files changed, 83 insertions, 37 deletions
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 3f04ed9963..3939491bb2 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -881,6 +881,16 @@ std::pair<unsigned int, unsigned int> scaled_dimensions(unsigned int width, unsi
                                                         const PadStrideInfo &pad_stride_info,
                                                         const Size2D        &dilation = Size2D(1U, 1U));
 
+/** Check whether the given reduction operation should be executed serially.
+ *
+ * @param[in] op   Reduction operation to perform
+ * @param[in] dt   Data type
+ * @param[in] axis Axis along which to reduce
+ *
+ * @return True if the given reduction operation should be handled serially, false otherwise.
+ */
+bool needs_serialized_reduction(ReductionOperation op, DataType dt, unsigned int axis);
+
 /** Convert a tensor format into a string.
  *
  * @param[in] format @ref Format to be translated to string.
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index c4c360842f..080d63f60d 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -1179,15 +1179,24 @@ inline TensorShape compute_tiled_shape(const TensorShape &input_shape, const Mul
 
 /** Calculate the reduced shape of a tensor given an axis
  *
- * @param[in] input Input tensor info
- * @param[in] axis  Axis on which to perform reduction
+ * @param[in] input     Input tensor shape
+ * @param[in] axis      Axis on which to perform reduction
+ * @param[in] keep_dims (Optional) If true, the reduced axis is kept with size 1; if false, it is removed from the shape. Defaults to true.
  *
  * @return the calculated shape
  */
-inline TensorShape compute_reduced_shape(const TensorShape &input, unsigned int axis)
+inline TensorShape compute_reduced_shape(const TensorShape &input, unsigned int axis, bool keep_dims = true)
 {
     TensorShape output_shape{ input };
-    output_shape.set(axis, 1);
+
+    if(keep_dims)
+    {
+        output_shape.set(axis, 1);
+    }
+    else
+    {
+        output_shape.remove_dimension(axis);
+    }
 
     return output_shape;
 }
diff --git a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
index 2384ebcd37..28feee09ab 100644
--- a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
@@ -24,13 +24,16 @@
 #ifndef __ARM_COMPUTE_CLARGMINMAXLAYER_H__
 #define __ARM_COMPUTE_CLARGMINMAXLAYER_H__
 
-#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 
 namespace arm_compute
 {
+class ITensorInfo;
 class ICLTensor;
+class CLReductionOperation;
 
 /** Function to calculate the index of the minimum or maximum values in a
  *  tensor based on an axis.
@@ -39,17 +42,23 @@ class ICLTensor;
  *       responsibility to check that the results do not overflow in case the
  *       output data type is set to signed 32-bit integer (S32).
  */
-class CLArgMinMaxLayer : public ICLSimpleFunction
+class CLArgMinMaxLayer : public IFunction
 {
 public:
+    /** Default Constructor.
+     *
+     * @param[in] memory_manager (Optional) Memory manager.
+     */
+    CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Input source tensor. Data types supported: F16/F32.
+     * @param[in]  input  Input source tensor. It may be written to if @ref CLReductionOperation
+     *                    manipulates its border for better performance. Data types supported: F16/F32.
      * @param[in]  axis   Axis to find max/min index.
      * @param[out] output Output source tensor. Data types supported: U32/S32.
      * @param[in]  op     Operation to perform: min or max
      */
-    void configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op);
+    void configure(ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op);
     /** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayer
      *
      * @param[in] input  Input source tensor info. Data types supported: F16/F32.
@@ -60,6 +69,12 @@ public:
      * @return a status
      */
     static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    std::unique_ptr<CLReductionOperation> _reduction_function; // NOTE(review): pointee type is only forward-declared in this header; confirm the destructor is defined where it is complete
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_CLARGMINMAXLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h
index f71313f235..405e1177fd 100644
--- a/arm_compute/runtime/CL/functions/CLReductionOperation.h
+++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -53,35 +54,42 @@ public:
 
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32.
-     * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
-     * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
-     * @param[in]  op     Reduction operation to perform.
+     * @param[in]  input     Source tensor. Data types supported: QASYMM8/F16/F32.
+     * @param[out] output    Destination tensor. Data types and data layouts supported: Same as @p input.
+     * @param[in]  axis      Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
+     * @param[in]  op        Reduction operation to perform.
+     * @param[in]  keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
      */
-    void configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op);
+    void configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperation.
      *
-     * @param[in] input  Source tensor info. Data types supported: QASYMM8/F16/F32.
-     * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
-     * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
-     * @param[in] op     Reduction operation to perform.
+     * @param[in] input     Source tensor info. Data types supported: QASYMM8/F16/F32.
+     * @param[in] output    Destination tensor info. Data types and data layouts supported: Same as @p input.
+     * @param[in] axis      Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
+     * @param[in] op        Reduction operation to perform.
+     * @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
 
     // Inherited methods overridden:
     void run() override;
 
 private:
+    ICLTensor *configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output);
+
     MemoryGroup                             _memory_group;
     std::vector<CLTensor>                   _results_vector;
     std::vector<CLReductionOperationKernel> _reduction_kernels_vector;
     std::vector<CLFillBorderKernel>         _border_handlers_vector;
+    CLReshapeLayerKernel                    _reshape_kernel;
+    ReductionOperation                      _op;
     unsigned int                            _num_of_stages;
     unsigned int                            _reduction_axis;
     bool                                    _is_serial;
+    bool                                    _is_reshape_required;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_CLREDUCTIONOPERATION_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
index 85bf7d92c9..b0e2d783b3 100644
--- a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
@@ -24,8 +24,6 @@
 #ifndef __ARM_COMPUTE_NEARGMINMAXLAYER_H__
 #define __ARM_COMPUTE_NEARGMINMAXLAYER_H__
 
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/INESimpleFunction.h"
@@ -33,6 +31,7 @@
 namespace arm_compute
 {
 class ITensor;
+class NEReductionOperation;
 
 /** Function to calculate the index of the minimum or maximum values in a
  *  tensor based on an axis.
@@ -74,10 +73,7 @@ public:
     void run() override;
 
 private:
-    MemoryGroup                _memory_group;
-    NEReductionOperationKernel _reduction_kernel;
-    NEFillBorderKernel         _fill_border_kernel;
-    bool                       _run_fill_border;
+    std::unique_ptr<NEReductionOperation> _reduction_function; // NOTE(review): pointee type is only forward-declared in this header; confirm the destructor is defined where it is complete
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_NEARGMINMAXLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEReductionOperation.h b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
index 5bc7059b62..1e72c4f74d 100644
--- a/arm_compute/runtime/NEON/functions/NEReductionOperation.h
+++ b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,7 +28,9 @@
 
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
 
 namespace arm_compute
 {
@@ -44,35 +46,41 @@ class NEReductionOperation : public IFunction
 {
 public:
     /** Default constructor */
-    NEReductionOperation();
+    NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data type supported: QASYMM8/F16/F32. Data layouts supported: NCHW. (Written to only for border_size != 0)
-     * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
-     * @param[in]  axis   Dimension along which to reduce. Supported reduction axis : 0
-     * @param[in]  op     Reduction operation to perform.
+     * @param[in]  input     Source tensor. Data type supported: QASYMM8/F16/F32. Data layouts supported: NCHW. (Written to only for border_size != 0)
+     * @param[out] output    Destination tensor. Data types and data layouts supported: same as @p input.
+     * @param[in]  axis      Dimension along which to reduce. Supported reduction axis : 0
+     * @param[in]  op        Reduction operation to perform.
+     * @param[in]  keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
      */
-    void configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op);
+    void configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEReductionOperation.
      *
-     * @param[in] input  Source tensor info. Data type supported: QASYMM8/F16/F32. Data layouts supported: NCHW. (Written to only for border_size != 0)
-     * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input.
-     * @param[in] axis   Dimension along which to reduce. Supported reduction axis : 0
-     * @param[in] op     Reduction operation to perform.
+     * @param[in] input     Source tensor info. Data type supported: QASYMM8/F16/F32. Data layouts supported: NCHW. (Written to only for border_size != 0)
+     * @param[in] output    Destination tensor info. Data types and data layouts supported: same as @p input.
+     * @param[in] axis      Dimension along which to reduce. Supported reduction axis : 0
+     * @param[in] op        Reduction operation to perform.
+     * @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
 
     // Inherited methods overridden:
     void run() override;
 
 private:
+    MemoryGroup                _memory_group;
     NEReductionOperationKernel _reduction_kernel;
     NEFillBorderKernel         _fill_border_kernel;
+    NEReshapeLayerKernel       _reshape_kernel;
+    Tensor                     _output_internal;
     size_t                     _window_split;
     int                        _reduction_axis;
+    bool                       _is_reshape_required;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_NEREDUCTIONOPERATION_H__ */