18 files changed, 113 insertions, 9 deletions
diff --git a/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
index 21d026e0a1..2dd20e9588 100644
--- a/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
+++ b/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -72,6 +72,7 @@ private:
     const ICLTensor *_input;
     ICLTensor       *_output;
     PadStrideInfo    _info;
+    DataLayout       _data_layout;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
index 081b01aad3..faf97e45dc 100644
--- a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -94,6 +94,7 @@ public:
     const ICLTensor *_biases;
     const ICLTensor *_weights;
     ICLTensor       *_output;
+    DataLayout       _data_layout;
     BorderSize       _border_size;
     int              _conv_stride_x;
     int              _conv_stride_y;
diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
index 0647f5dcec..00cb416e90 100644
--- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h
+++ b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
@@ -103,6 +103,7 @@ public:
 public:
     const ICLTensor *_input;
     ICLTensor       *_output;
+    DataLayout       _data_layout;
     std::pair<unsigned int, unsigned int> _convolved_dims;
     unsigned int  _num_elems_processed_per_iteration;
     Size2D        _kernel_dims;
diff --git a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
index db1a756229..68a99039d8 100644
--- a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
@@ -75,6 +75,7 @@ public:
     const ICLTensor *_input;
     ICLTensor       *_output;
     PoolingLayerInfo _pool_info;
+    DataLayout       _data_layout;
     BorderSize       _border_size;
     unsigned int     _num_elems_processed_per_iteration;
 };
diff --git a/arm_compute/core/CL/kernels/CLScaleKernel.h b/arm_compute/core/CL/kernels/CLScaleKernel.h
index ff72af29fc..1ada3cde85 100644
--- a/arm_compute/core/CL/kernels/CLScaleKernel.h
+++ b/arm_compute/core/CL/kernels/CLScaleKernel.h
@@ -75,6 +75,7 @@ public:
 
 public:
     InterpolationPolicy _interpolationPolicy = InterpolationPolicy::BILINEAR;
+    DataLayout          _data_layout         = DataLayout::UNKNOWN;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_CLSCALEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h b/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h
index dcd4f1bdb4..c8c69002c4 100644
--- a/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h
@@ -73,6 +73,7 @@ private:
     const ICLTensor *_input;
     ICLTensor       *_output;
     Size2D           _info;
+    DataLayout       _data_layout;
     unsigned int     _num_elems_processed_per_iteration_input_x;
 };
 } // namespace arm_compute
diff --git a/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h
index bc05a0ebf1..30bd3abb43 100644
--- a/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h
+++ b/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h
@@ -93,6 +93,7 @@ private:
     BorderSize       _border_size;
     const ICLTensor *_input;
     ICLTensor       *_output;
+    DataLayout       _data_layout;
     int              _num_tiles_x;
     int              _num_tiles_y;
     unsigned int     _step_z;
diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h
index 87b1fdf64c..8d526e96c0 100644
--- a/arm_compute/core/Helpers.h
+++ b/arm_compute/core/Helpers.h
@@ -766,6 +766,20 @@ inline T wrap_around(T x, T m)
     return x >= 0 ? x % m : (x % m + m) % m;
 }
 
+/** Wrap every coordinate into the range [0, max_value) for a positive max_value, turning negative axes into positive ones
+ *
+ * @param[in,out] coords    Array of coordinates, converted in place; the same object is returned by reference.
+ * @param[in]     max_value Modulus handed to wrap_around for each coordinate (e.g. the number of dimensions); must not be 0
+ */
+inline Coordinates &convert_negative_axis(Coordinates &coords, int max_value)
+{
+    for(unsigned int i = 0; i < coords.num_dimensions(); ++i)
+    {
+        coords[i] = wrap_around(coords[i], max_value);
+    }
+    return coords;
+}
+
 /** Given an integer value, this function returns the next power of two
  *
  * @param[in] x Input value
diff --git a/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
index a62da049a5..b34f6d3ebf 100644
--- a/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
@@ -74,6 +74,7 @@ private:
     const ITensor *_input;       /**< Source tensor */
     ITensor       *_output;      /**< Destination tensor */
     int32_t        _block_shape; /**< Block shape */
+    DataLayout     _data_layout; /**< Data layout of the operation */
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
index f76521f770..689da857a7 100644
--- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
+++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -131,6 +131,7 @@ private:
     unsigned int  _kernel_height;
     bool          _has_bias;
     Size2D        _dilation;
+    DataLayout    _data_layout;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_NEIM2COLKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
index 5f45a90cef..5b143250e9 100644
--- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
@@ -194,6 +194,7 @@ private:
     const ITensor   *_input;
     ITensor         *_output;
     PoolingLayerInfo _pool_info;
+    DataLayout       _data_layout;
     unsigned int     _num_elems_processed_per_iteration;
     BorderSize       _border_size;
     bool             _is_square;
diff --git a/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h b/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h
index c9ecdd26f8..68bc1737c8 100644
--- a/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h
@@ -75,6 +75,7 @@ private:
     const ITensor *_input;       /**< Source tensor */
     ITensor       *_output;      /**< Destination tensor */
     int32_t        _block_shape; /**< Block shape */
+    DataLayout     _data_layout; /**< Data layout of the operation */
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H__ */
diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h
index a56227996b..be42fe9a87 100644
--- a/arm_compute/core/Window.h
+++ b/arm_compute/core/Window.h
@@ -48,7 +48,7 @@ public:
 
     /** Default constructor: create a window containing a single element. */
     constexpr Window()
-        : _dims()
+        : _dims(), _is_broadcasted(utility::generate_array<bool, Coordinates::num_max_dimensions, false>::value) // every broadcast flag starts as false
     {
     }
     /** Copy constructor
@@ -170,6 +170,20 @@ public:
      */
     void set(size_t dimension, const Dimension &dim);
 
+    /** Mark a dimension as broadcasted and reset it to Dimension(0, 0, 0)
+     *
+     * @param[in] dimension Index of the dimension to mark (must be lower than Coordinates::num_max_dimensions)
+     */
+    void set_broadcasted(size_t dimension);
+
+    /** Return whether a dimension has been marked as broadcasted
+     *
+     * @param[in] dimension Index of the requested dimension (must be lower than Coordinates::num_max_dimensions)
+     *
+     * @return true if the broadcast flag of the dimension is set (see set_broadcasted()), false otherwise
+     */
+    bool is_broadcasted(size_t dimension) const;
+
     /** Use the tensor's dimensions to fill the window dimensions.
      *
      * @param[in] shape           @ref TensorShape to copy the dimensions from.
@@ -419,6 +433,7 @@ private:
 
 private:
     std::array<Dimension, Coordinates::num_max_dimensions> _dims;
+    std::array<bool, Coordinates::num_max_dimensions>      _is_broadcasted;
 };
 } // namespace arm_compute
 #include "Window.inl"
diff --git a/arm_compute/core/Window.inl b/arm_compute/core/Window.inl
index eeef3df7b0..589d6bfafc 100644
--- a/arm_compute/core/Window.inl
+++ b/arm_compute/core/Window.inl
@@ -24,11 +24,12 @@
 namespace arm_compute
 {
 inline Window::Window(const Window &src)
-    : _dims()
+    : _dims(), _is_broadcasted(utility::generate_array<bool, Coordinates::num_max_dimensions, false>::value)
 {
     for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
     {
         set(i, src[i]);
+        _is_broadcasted[i] = src.is_broadcasted(i); // carry over the per-dimension broadcast flag from the source window
     }
 }
 
@@ -51,6 +52,19 @@ inline void Window::set(size_t dimension, const Window::Dimension &dim)
     _dims[dimension] = dim;
 }
 
+inline void Window::set_broadcasted(size_t dimension)
+{
+    ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions); // reject indices outside the flag array
+    set(dimension, Dimension(0, 0, 0)); // same reset that broadcast_if_dimension_le_one() used to apply directly
+    _is_broadcasted[dimension] = true; // record the broadcast so is_broadcasted() can report it later
+}
+
+inline bool Window::is_broadcasted(size_t dimension) const
+{
+    ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions); // reject indices outside the flag array
+    return _is_broadcasted[dimension];
+}
+
 inline Window Window::collapse_if_possible(const Window &full_window, const size_t first,
                                            const size_t last, bool *has_collapsed) const
 {
@@ -110,7 +124,7 @@ inline Window Window::broadcast_if_dimension_le_one(const TensorShape &shape) co
     {
         if(shape[d] <= 1)
         {
-            broadcastWin.set(d, Dimension(0, 0, 0));
+            broadcastWin.set_broadcasted(d);
         }
     }
     return broadcastWin;
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 65a2a1edf4..698a2b7a45 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -39,6 +39,42 @@ namespace misc
 {
 namespace shape_calculator
 {
+/** Calculate the output tensor shape for the reduce mean operation
+ *
+ * @param[in] input          Input tensor; its info() supplies the shape and the number of dimensions
+ * @param[in] reduction_axis Axes to reduce; each value is wrapped by the number of input dimensions before use
+ * @param[in] keep_dims      If true the reduced dimensions are kept with size 1, otherwise they are removed
+ *
+ * @return the calculated shape
+ */
+inline TensorShape calculate_reduce_mean_shape(ITensor *input, const Coordinates &reduction_axis, bool keep_dims)
+{
+    const int   reduction_ops = reduction_axis.num_dimensions();
+    Coordinates axis_local    = reduction_axis;
+    const int   input_dims    = input->info()->num_dimensions();
+    convert_negative_axis(axis_local, input_dims);
+    TensorShape out_shape = input->info()->tensor_shape();
+    // Drop the reduced dimensions from the shape when they are not kept
+    if(!keep_dims)
+    {
+        // We have to sort the reduction axis vectors in order for remove_dimension
+        // to work properly
+        std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+        for(int i = 0; i < reduction_ops; ++i)
+        {
+            out_shape.remove_dimension(axis_local[i] - i); // offset by i: presumably each earlier removal shifts the remaining dimensions down by one
+        }
+        return out_shape;
+    }
+    else
+    {
+        for(int i = 0; i < reduction_ops; ++i)
+        {
+            out_shape.set(axis_local[i], 1);
+        }
+        return out_shape;
+    }
+}
 /** Calculate the output tensor shape of a vector input given the convolution dimensions
  *
  * @param[in] input       Input tensor shape
diff --git a/arm_compute/core/utils/misc/Utility.h b/arm_compute/core/utils/misc/Utility.h
index 8dd9afd5cd..2325644e72 100644
--- a/arm_compute/core/utils/misc/Utility.h
+++ b/arm_compute/core/utils/misc/Utility.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,6 +53,20 @@ struct index_sequence_generator<0u, S...> : index_sequence<S...>
 
 template <std::size_t N>
 using index_sequence_t = typename index_sequence_generator<N>::type;
+
+template <typename T, std::size_t N, T val, T... vals>
+struct generate_array : generate_array < T, N - 1, val, val, vals... > // recursive step: add one more copy of val to the pack and decrement N
+{
+};
+
+template <typename T, T val, T... vals>
+struct generate_array<T, 0, val, vals...> // end of recursion: vals... holds the copies of val accumulated so far
+{
+    static constexpr std::array<T, sizeof...(vals)> value{ vals... }; // array built from the accumulated copies of val
+};
+
+template <typename T, T val, T... vals>
+constexpr std::array<T, sizeof...(vals)> generate_array<T, 0, val, vals...>::value; // out-of-class definition of the static member, required before C++17 when it is odr-used
 /** @endcond */
 
 namespace detail
diff --git a/arm_compute/runtime/CL/functions/CLReduceMean.h b/arm_compute/runtime/CL/functions/CLReduceMean.h
index 9c087eadf1..6836ba3f58 100644
--- a/arm_compute/runtime/CL/functions/CLReduceMean.h
+++ b/arm_compute/runtime/CL/functions/CLReduceMean.h
@@ -71,7 +71,7 @@ private:
     std::vector<CLReductionOperation> _reduction_kernels;
     std::vector<CLTensor>             _reduced_outs;
     CLReshapeLayer                    _reshape;
-    unsigned int                      _reduction_ops;
+    int                               _reduction_ops;
     bool                              _keep_dims;
 };
 } // namespace arm_compute
diff --git a/arm_compute/runtime/NEON/functions/NEReduceMean.h b/arm_compute/runtime/NEON/functions/NEReduceMean.h
index fdd8edfe87..245f7577ce 100644
--- a/arm_compute/runtime/NEON/functions/NEReduceMean.h
+++ b/arm_compute/runtime/NEON/functions/NEReduceMean.h
@@ -72,7 +72,7 @@ private:
     std::vector<NEReductionOperation> _reduction_kernels;
     std::vector<Tensor>               _reduced_outs;
     NEReshapeLayer                    _reshape;
-    unsigned int                      _reduction_ops;
+    int                               _reduction_ops;
     bool                              _keep_dims;
 };
 } // namespace arm_compute