16 files changed, 241 insertions, 233 deletions
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 2d28a496c9..d81ad46b29 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -98,7 +98,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons
 
     for(unsigned int n = 0; n < info->num_dimensions(); ++n)
     {
-        offset_first_element += window[n].start() * strides[n];
+        offset_first_element += (window.is_broadcasted(n) ? 0 : window[n].start()) * strides[n];
     }
 
     unsigned int idx_start = idx;
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index 295fb5c997..177f05f3ca 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,7 +35,7 @@
 using namespace arm_compute;
 
 CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel()
-    : _input(nullptr), _output(nullptr), _info()
+    : _input(nullptr), _output(nullptr), _info(), _data_layout(DataLayout::UNKNOWN)
 {
 }
 
@@ -72,13 +72,14 @@ void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTe
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    _input  = input;
-    _output = output;
-    _info   = info;
-
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), info));
 
+    _input       = input;
+    _output      = output;
+    _info        = info;
+    _data_layout = input->info()->data_layout();
+
     // Create kernel
     CLBuildOptions build_opts;
     build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
@@ -99,10 +100,8 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const DataLayout data_layout = _input->info()->data_layout();
-
-    const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
 
     const int out_start_x = _info.pad().first;
     const int out_end_x   = _output->info()->dimension(idx_w) - _info.pad().first + _info.stride().first - 1;
@@ -112,7 +111,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
     const int out_end_y   = _output->info()->dimension(idx_h) - _info.pad().second + _info.stride().second - 1;
     const int out_step_y  = _info.stride().second;
 
-    switch(data_layout)
+    switch(_data_layout)
     {
         case DataLayout::NCHW:
         {
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index dc4c431c5d..21685dcf0e 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -377,7 +377,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 } // namespace
 
 CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel()
-    : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_stride_x(0), _conv_stride_y(0)
+    : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _border_size(0), _conv_stride_x(0), _conv_stride_y(0)
 {
 }
 
@@ -390,10 +390,10 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
-    const DataLayout data_layout = input->info()->data_layout();
-    const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int        height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const int        channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    _data_layout          = input->info()->data_layout();
+    const int width_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const int height_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
 
     const unsigned int kernel_size = weights->info()->dimension(width_idx);
     const DataType     data_type   = input->info()->data_type();
@@ -419,11 +419,11 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
     _conv_stride_x = std::get<0>(conv_info.stride());
     _conv_stride_y = std::get<1>(conv_info.stride());
 
-    if(data_layout == DataLayout::NHWC)
+    if(_data_layout == DataLayout::NHWC)
     {
         _border_size = BorderSize(conv_info.pad_left(), 0, conv_info.pad_right(), 0);
     }
-    else if(data_layout == DataLayout::NCHW)
+    else if(_data_layout == DataLayout::NCHW)
     {
         _border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
     }
@@ -441,15 +441,15 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
 
     std::stringstream kernel_name;
     kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
-    if(data_layout == DataLayout::NHWC)
+    if(_data_layout == DataLayout::NHWC)
     {
-        kernel_name << "_" << lower_string(string_from_data_layout(data_layout));
+        kernel_name << "_" << lower_string(string_from_data_layout(_data_layout));
     }
 
     CLBuildOptions build_options;
     build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));
 
-    const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, data_layout);
+    const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout);
 
     if(run_optimized_for_bifrost)
     {
@@ -466,9 +466,9 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
         build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
         build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx))));
         build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));
-        if(data_layout == DataLayout::NHWC)
+        if(_data_layout == DataLayout::NHWC)
         {
-            const bool run_optimized_for_bifrost_nhwc = can_run_optimized_kernel_for_bifrost_nhwc(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, data_layout);
+            const bool run_optimized_for_bifrost_nhwc = can_run_optimized_kernel_for_bifrost_nhwc(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout);
             build_options.add_option(std::string("-DDATA_LAYOUT_NHWC=1"));
             build_options.add_option(std::string("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(height_idx))));
             build_options.add_option(std::string("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(width_idx))));
@@ -538,7 +538,7 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(height_idx));
     _config_id += "_";
-    _config_id += lower_string(string_from_data_layout(data_layout));
+    _config_id += lower_string(string_from_data_layout(_data_layout));
 }
 
 Status CLDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -562,9 +562,8 @@ void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue
     win_in.adjust(Window::DimX, -_border_size.left, true);
     win_in.adjust(Window::DimY, -_border_size.top, true);
 
-    const DataLayout data_layout = _input->info()->data_layout();
-    const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int        height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int width_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
 
     win_in.set_dimension_step(width_idx, window[width_idx].step() * _conv_stride_x);
     win_in.set_dimension_step(height_idx, window[height_idx].step() * _conv_stride_y);
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 10d6e68cd9..24f22c31a5 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -287,7 +287,7 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size
 } // namespace
 
 CLIm2ColKernel::CLIm2ColKernel()
-    : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups()
+    : _input(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups()
 {
 }
 
@@ -297,9 +297,10 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
 
-    const DataLayout   data_layout  = input->info()->data_layout();
-    const unsigned int width_idx    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int height_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    _data_layout = input->info()->data_layout();
+
+    const unsigned int width_idx    = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
     const unsigned int input_width  = input->info()->dimension(width_idx);
     const unsigned int input_height = input->info()->dimension(height_idx);
 
@@ -336,7 +337,7 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
     _config_id += "_";
-    _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
+    _config_id += lower_string(string_from_data_layout(_data_layout));
 }
 
 Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
@@ -369,7 +370,7 @@ void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
     Window slice_in  = first_slice_3d;
     Window slice_out = window_output.first_slice_window_2D();
 
-    if(_input->info()->data_layout() == DataLayout::NHWC)
+    if(_data_layout == DataLayout::NHWC)
     {
         const Window tmp_win     = window.collapse_if_possible(ICLKernel::window(), 3);
         const int    num_batches = tmp_win[3].end();
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 8eaf5bf76f..032d451aad 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -172,7 +172,7 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso
 } // namespace
 
 CLPoolingLayerKernel::CLPoolingLayerKernel()
-    : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0), _num_elems_processed_per_iteration(1)
+    : _input(nullptr), _output(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _border_size(0), _num_elems_processed_per_iteration(1)
 {
 }
 
@@ -185,13 +185,18 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
+    // Set instance variables
+    _input       = input;
+    _output      = output;
+    _pool_info   = pool_info;
+    _data_layout = input->info()->data_layout();
+
     int                 pool_stride_x   = 0;
     int                 pool_stride_y   = 0;
     const PoolingType   pool_type       = pool_info.pool_type();
-    DataLayout          data_layout     = input->info()->data_layout();
-    const int           idx_width       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int           idx_height      = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const int           idx_channel     = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const int           idx_width       = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const int           idx_height      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const int           idx_channel     = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
     const int           pool_size_x     = pool_info.is_global_pooling() ? input->info()->dimension(idx_width) : pool_info.pool_size().width;
     const int           pool_size_y     = pool_info.is_global_pooling() ? input->info()->dimension(idx_height) : pool_info.pool_size().height;
     const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
@@ -218,11 +223,6 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
     auto_init(input->info(), output->info(), pool_info);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info));
 
-    // Set instance variables
-    _input     = input;
-    _output    = output;
-    _pool_info = pool_info;
-
     const DataType data_type = input->info()->data_type();
 
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
@@ -237,7 +237,7 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
     build_opts.add_option_if(data_type == DataType::F16, "-DFP16");
 
     // Create kernel
-    switch(data_layout)
+    switch(_data_layout)
     {
         case DataLayout::NCHW:
         {
@@ -286,7 +286,7 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
     ICLKernel::configure_internal(std::get<1>(win_config));
 
-    if(data_layout == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         CLPoolingConfig pooling_config     = std::get<2>(win_config);
         _num_elems_processed_per_iteration = pooling_config.first;
@@ -302,7 +302,7 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
     _config_id = "pooling_layer_";
     _config_id += lower_string(string_from_data_type(data_type));
     _config_id += "_";
-    _config_id += lower_string(string_from_data_layout(data_layout));
+    _config_id += lower_string(string_from_data_layout(_data_layout));
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(idx_width));
     _config_id += "_";
@@ -333,7 +333,7 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
     // Collapse window
     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
 
-    switch(_input->info()->data_layout())
+    switch(_data_layout)
     {
         case DataLayout::NCHW:
         {
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index 488313fd12..82c5c8a446 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -160,11 +160,12 @@ const ICLTensor *CLScaleKernel::output() const
 
 void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy)
 {
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy));
+
     _input               = input;
     _output              = output;
     _interpolationPolicy = policy;
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy));
+    _data_layout         = input->info()->data_layout();
 
     float wr = 0.f;
     float hr = 0.f;
@@ -172,10 +173,9 @@ void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, Interpo
 
     const bool call_quantized_kernel = is_data_type_quantized_asymmetric(input->info()->data_type()) && policy == InterpolationPolicy::BILINEAR;
 
-    DataLayout data_layout = input->info()->data_layout();
-    const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const bool is_nhwc     = data_layout == DataLayout::NHWC;
+    const int  idx_width  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const int  idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const bool is_nhwc    = _data_layout == DataLayout::NHWC;
 
     // Compute the ratio between source width/height and destination width/height
     const unsigned int input_width   = input->info()->dimension(idx_width);
@@ -215,7 +215,7 @@ void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, Interpo
     std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
     std::string kernel_name = "scale_" + interpolation_name;
     kernel_name += call_quantized_kernel ? "_quantized_" : "_";
-    kernel_name += lower_string(string_from_data_layout(data_layout));
+    kernel_name += lower_string(string_from_data_layout(_data_layout));
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     unsigned int idx = is_nhwc ? 2 * num_arguments_per_4D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
@@ -249,7 +249,7 @@ void CLScaleKernel::run(const Window &window, cl::CommandQueue &queue)
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    switch(_input->info()->data_layout())
+    switch(_data_layout)
     {
         case DataLayout::NCHW:
         {
diff --git a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
index ce5ed86332..2ccd540788 100644
--- a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
@@ -37,7 +37,7 @@
 namespace arm_compute
 {
 CLUpsampleLayerKernel::CLUpsampleLayerKernel()
-    : _input(nullptr), _output(nullptr), _info(), _num_elems_processed_per_iteration_input_x()
+    : _input(nullptr), _output(nullptr), _info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration_input_x()
 {
 }
 
@@ -71,13 +71,12 @@ void CLUpsampleLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
     _input                                     = input;
     _output                                    = output;
     _info                                      = info;
+    _data_layout                               = input->info()->data_layout();
     _num_elems_processed_per_iteration_input_x = 1;
 
-    const DataLayout data_layout = input->info()->data_layout();
-
     TensorShape output_shape = misc::shape_calculator::compute_upsample_shape(*input->info(), info);
     auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
-    output->info()->set_data_layout(data_layout);
+    output->info()->set_data_layout(_data_layout);
 
     unsigned int num_elems_processed_per_iteration_x = 16;
     const int    output_width_x                      = output->info()->dimension(0);
@@ -88,7 +87,7 @@ void CLUpsampleLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
 
     Window win{};
 
-    switch(data_layout)
+    switch(_data_layout)
     {
         case DataLayout::NCHW:
         {
@@ -140,8 +139,7 @@ void CLUpsampleLayerKernel::run(const Window &window, cl::CommandQueue &queue)
     Window slice_out        = collapsed_window.first_slice_window_3D();
     Window slice_in         = collapsed_window.first_slice_window_3D();
 
-    DataLayout data_layout = _input->info()->data_layout();
-    switch(data_layout)
+    switch(_data_layout)
     {
         case DataLayout::NCHW:
             slice_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration_input_x));
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
index 1c31ceba99..6125790491 100644
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -99,7 +99,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 } // namespace
 
 CLWinogradInputTransformKernel::CLWinogradInputTransformKernel()
-    : _border_size(0), _input(nullptr), _output(nullptr), _num_tiles_x(0), _num_tiles_y(0), _step_z(1)
+    : _border_size(0), _input(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _num_tiles_x(0), _num_tiles_y(0), _step_z(1)
 {
 }
 
@@ -116,16 +116,17 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
     const PadStrideInfo conv_info        = winograd_info.convolution_info;
     const Size2D        output_tile_size = winograd_info.output_tile_size;
     const Size2D        kernel_size      = winograd_info.kernel_size;
-    const DataLayout    data_layout      = input->info()->data_layout();
 
-    const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    _data_layout = input->info()->data_layout();
+
+    const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
 
     // Compute number of elements to process in the X and Y direction
     const int num_elements_x = input->info()->dimension(idx_w) - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right();
     const int num_elements_y = input->info()->dimension(idx_h) - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom();
 
-    if(data_layout == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         // Check if we need to extend the right or bottom border
         const unsigned int extra_border_right  = ((num_elements_x % output_tile_size.width) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.width - 1);
@@ -166,7 +167,7 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL");
     build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL");
-    if(data_layout == DataLayout::NHWC)
+    if(_data_layout == DataLayout::NHWC)
     {
         build_opts.add_option_if(total_batches > 1, "-DNUM_TILES_Y=" + support::cpp11::to_string(_num_tiles_y));
         build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1)));
@@ -184,7 +185,7 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
     const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height);
 
     // Check optimized kernel if output_dims == 2x2
-    if((tile_max_dim == 2) && (data_layout == DataLayout::NCHW))
+    if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW))
     {
         _step_z = (_input->info()->dimension(2) % 2) != 0 ? 1 : 2;
     }
@@ -192,7 +193,7 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
     // Append stepz and data layout
     kernel_name += "_stepz";
     kernel_name += support::cpp11::to_string(_step_z);
-    kernel_name += "_" + lower_string(string_from_data_layout(data_layout));
+    kernel_name += "_" + lower_string(string_from_data_layout(_data_layout));
 
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
@@ -212,7 +213,7 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
     _config_id += "_";
     _config_id += support::cpp11::to_string(conv_info.pad_top());
     _config_id += "_";
-    _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
+    _config_id += lower_string(string_from_data_layout(_data_layout));
 }
 
 Status CLWinogradInputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
@@ -229,11 +230,10 @@ void CLWinogradInputTransformKernel::run(const Window &window, cl::CommandQueue
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    const DataLayout data_layout   = _input->info()->data_layout();
-    const size_t     idx_w         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const size_t     idx_h         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const size_t     idx_c         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-    const size_t     total_batches = window.shape().total_size_upper(3);
+    const size_t idx_w         = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h         = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const size_t idx_c         = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+    const size_t total_batches = window.shape().total_size_upper(3);
 
     // Collapse window
     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
@@ -241,7 +241,7 @@ void CLWinogradInputTransformKernel::run(const Window &window, cl::CommandQueue
     Window slice = window_collapsed.first_slice_window_3D();
     slice.set(idx_w, Window::Dimension(0, _num_tiles_x, 1));
     slice.set(idx_h, Window::Dimension(0, _num_tiles_y, 1));
-    if(data_layout == DataLayout::NHWC)
+    if(_data_layout == DataLayout::NHWC)
     {
         slice.set(idx_h, Window::Dimension(0, _num_tiles_y * total_batches, 1));
     }
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
index df631c3c03..98b0c106db 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
@@ -63,7 +63,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
 } // namespace
 
 NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel()
-    : _input(nullptr), _output(nullptr), _block_shape()
+    : _input(nullptr), _output(nullptr), _block_shape(), _data_layout(DataLayout::UNKNOWN)
 {
 }
 
@@ -80,6 +80,7 @@ void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output,
     _input       = input;
     _output      = output;
     _block_shape = block_shape;
+    _data_layout = input->info()->data_layout();
 
     // Configure kernel window
     Window win = calculate_max_window(*input->info(), Steps());
@@ -99,7 +100,7 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
 
-    const int idx_channel  = get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+    const int idx_channel  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
     const int depth_size   = _input->info()->dimension(idx_channel);
     const int r            = (depth_size / (_block_shape * _block_shape));
     const int element_size = _input->info()->element_size();
@@ -112,7 +113,7 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
     slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
 
     // Main loop for NCHW and NHWC
-    if(_input->info()->data_layout() == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         Window slice_in = window.first_slice_window_2D();
         do
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 0641d6cfa3..27c3d66b4f 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -265,10 +265,9 @@ void NEIm2ColKernel::run_im2col(const Window &window)
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    const DataLayout   data_layout = _input->info()->data_layout();
-    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const unsigned int width_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
 
     const int input_w        = _input->info()->dimension(width_idx);
     const int input_h        = _input->info()->dimension(height_idx);
@@ -344,7 +343,7 @@ void NEIm2ColKernel::run_im2col(const Window &window)
 }
 
 NEIm2ColKernel::NEIm2ColKernel()
-    : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false), _dilation(1U, 1U)
+    : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false), _dilation(1U, 1U), _data_layout(DataLayout::UNKNOWN)
 {
 }
 
@@ -355,9 +354,9 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
     ARM_COMPUTE_UNUSED(num_groups);
 
-    const DataLayout   data_layout = input->info()->data_layout();
-    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    _data_layout                  = input->info()->data_layout();
+    const unsigned int width_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
 
     _input          = input;
     _output         = output;
@@ -370,7 +369,7 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size
                                         _conv_info, _dilation);
     _has_bias = has_bias;
 
-    if(data_layout == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         switch(_input->info()->data_type())
         {
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 62c9ca0d5e..14de4a19d8 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -321,7 +321,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 } // namespace
 
 NEPoolingLayerKernel::NEPoolingLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
+    : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
 {
 }
 
@@ -364,14 +364,15 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h));
 
     // Set instance variables
-    _input     = input;
-    _output    = output;
-    _pool_info = pool_info;
-    _is_square = (pool_size.x() == pool_size.y());
+    _input       = input;
+    _output      = output;
+    _pool_info   = pool_info;
+    _data_layout = input->info()->data_layout();
+    _is_square   = (pool_size.x() == pool_size.y());
 
     // Get data type
     const DataType data_type = input->info()->data_type();
-    const bool     is_nchw   = data_layout == DataLayout::NCHW;
+    const bool     is_nchw   = _data_layout == DataLayout::NCHW;
 
     if(data_type == DataType::QASYMM8)
     {
@@ -1574,7 +1575,12 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const
         // Calculate square-root in case of l2 pooling
         if(pooling_type == PoolingType::L2)
         {
-            vres = vmulq_f32(vres, vinvsqrtq_f32(vres));
+            float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
+                                   static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
+                                   static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
+                                   static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
+                                 };
+            vres = l2_res;
         }
 
         // Store result
@@ -1835,7 +1841,7 @@ void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
     const bool         exclude_padding = _pool_info.exclude_padding();
 
     Window window_input(window);
-    if(_input->info()->data_layout() == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         // Set step for input in x and y direction for the input
         unsigned int window_x_inc = 0;
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index ffa4fa3565..16cd6f77b4 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -742,23 +742,8 @@ struct RedOpYZW
 
             for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
             {
-                T *in_ptr;
-                switch(axis)
-                {
-                    case 1:
-                        in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim)));
-                        break;
-                    case 2:
-                        in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim)));
-                        break;
-                    case 3:
-                        in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim)));
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Not supported");
-                }
+                const T   *in_ptr       = reinterpret_cast<T *>(input.ptr() + in_info.strides_in_bytes()[axis] * dim);
                 const auto vec_elements = wrapper::vloadq(in_ptr);
-
                 switch(op)
                 {
                     case ReductionOperation::SUM:
@@ -907,23 +892,8 @@ struct RedOpYZW_qasymm8
 
             for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
             {
-                uint8_t *in_ptr;
-                switch(axis)
-                {
-                    case 1:
-                        in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, index_dim));
-                        break;
-                    case 2:
-                        in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, index_dim));
-                        break;
-                    case 3:
-                        in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, index_dim));
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Not supported");
-                }
-                const auto vec_elements = wrapper::vloadq(in_ptr);
-
+                const uint8_t *in_ptr       = input.ptr() + in_info.strides_in_bytes()[axis] * index_dim;
+                const auto     vec_elements = wrapper::vloadq(in_ptr);
                 switch(op)
                 {
                     case ReductionOperation::SUM:
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index a2a44fca18..5b8e196a2c 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -909,7 +909,7 @@ void NEScaleKernel::scale_area_nchw(const Window &window)
 void NEScaleKernel::scale_nhwc(const Window &window)
 {
     // Get data layout and width/height indices
-    const DataLayout data_layout  = _input->info()->data_layout();
+    const DataLayout data_layout  = DataLayout::NHWC;
     const int        idx_channels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
     const int        idx_width    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const int        idx_height   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
index 4803365013..ffd2dc14bf 100644
--- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
@@ -66,7 +66,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
 } // namespace
 
 NESpaceToDepthLayerKernel::NESpaceToDepthLayerKernel()
-    : _input(nullptr), _output(nullptr), _block_shape()
+    : _input(nullptr), _output(nullptr), _block_shape(), _data_layout(DataLayout::UNKNOWN)
 {
 }
 
@@ -82,6 +82,7 @@ void NESpaceToDepthLayerKernel::configure(const ITensor *input, ITensor *output,
     _input       = input;
     _block_shape = block_shape;
     _output      = output;
+    _data_layout = input->info()->data_layout();
 
     // Configure kernel window
     Window win = calculate_max_window(*output->info(), Steps());
@@ -100,9 +101,8 @@ void NESpaceToDepthLayerKernel::run(const Window &window, const ThreadInfo &info
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
 
-    const DataLayout data_layout  = _input->info()->data_layout();
-    const int        channel_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-    const int        element_size = _input->info()->element_size();
+    const int channel_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+    const int element_size = _input->info()->element_size();
 
     const size_t channel_size = _input->info()->dimension(channel_idx);
 
@@ -111,7 +111,7 @@ void NESpaceToDepthLayerKernel::run(const Window &window, const ThreadInfo &info
     int batch_id = 0;
 
     // Main loop for NCHW and NHWC
-    if(_output->info()->data_layout() == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         do
         {
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index a3634cd46e..c5de43da35 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -26,20 +26,81 @@
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
 namespace arm_compute
 {
+namespace
+{
+Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+{
+    ARM_COMPUTE_UNUSED(keep_dims);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+    const unsigned int reduction_ops = reduction_axis.num_dimensions();
+    const int          input_dims    = input->num_dimensions();
+    Coordinates        axis_local    = reduction_axis;
+
+    for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
+    {
+        // axis: the dimensions to reduce. Each entry must be in the range [-rank(input_tensor), rank(input_tensor)).
+        ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
+        ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
+    }
+
+    if(output->tensor_shape().total_size() != 0)
+    {
+        // Only validate if not using auto_init for the output tensor
+        TensorShape out_shape = input->tensor_shape();
+        // Validate output_shape only if not using auto_init
+        convert_negative_axis(axis_local, input_dims);
+        std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+        for(unsigned int i = 0; i < reduction_ops; ++i)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+            ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
+            if(output->total_size() > 0 && keep_dims)
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+            }
+            if(keep_dims)
+            {
+                out_shape.set(axis_local[i], 1);
+            }
+            else
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i]));
+                const unsigned int remove_index = axis_local[i] - i;
+                ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions());
+                out_shape.remove_dimension(remove_index);
+            }
+        }
+        const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+    }
+    return Status{};
+}
+}
 CLReduceMean::CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims()
 {
 }
 void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
+    // Output auto initialization if not yet initialized
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input, reduction_axis, keep_dims);
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     _reduction_ops = reduction_axis.num_dimensions();
     _reduction_kernels.resize(_reduction_ops);
@@ -49,14 +110,10 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis
     Coordinates axis_local = reduction_axis;
     const int   input_dims = input->info()->num_dimensions();
 
-    // Convert negative axis
-    for(unsigned int i = 0; i < _reduction_ops; ++i)
-    {
-        axis_local[i] = wrap_around(axis_local[i], input_dims);
-    }
+    convert_negative_axis(axis_local, input_dims);
 
     // Perform reduction for every axis
-    for(unsigned int i = 0; i < _reduction_ops; ++i)
+    for(int i = 0; i < _reduction_ops; ++i)
     {
         TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
         out_shape.set(axis_local[i], 1);
@@ -75,7 +132,7 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis
     }
 
     // Allocate intermediate tensors
-    for(unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+    for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
     {
         _reduced_outs[i].allocator()->allocate();
     }
@@ -88,7 +145,7 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis
         // We have to sort the reduction axis vectors in order for remove_dimension
         // to work properly
         std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
-        for(unsigned int i = 0; i < _reduction_ops; ++i)
+        for(int i = 0; i < _reduction_ops; ++i)
         {
             out_shape.remove_dimension(axis_local[i] - i);
         }
@@ -99,55 +156,16 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis
 
 Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
-
-    TensorShape out_shape = input->tensor_shape();
-
-    Coordinates        axis_sorted   = reduction_axis;
-    const unsigned int reduction_ops = reduction_axis.num_dimensions();
-    const int          input_dims    = input->num_dimensions();
-
-    // Convert negative axis
-    for(unsigned int i = 0; i < reduction_ops; ++i)
-    {
-        axis_sorted[i] = wrap_around(axis_sorted[i], input_dims);
-    }
-
-    std::sort(axis_sorted.begin(), axis_sorted.begin() + reduction_ops);
-    for(unsigned int i = 0; i < reduction_ops; ++i)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(axis_sorted[i] > 3);
-        ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_sorted[i]) > input->num_dimensions() - 1);
-        if(output->total_size() > 0 && keep_dims)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_sorted[i]) != 1);
-        }
-        if(keep_dims)
-        {
-            out_shape.set(axis_sorted[i], 1);
-        }
-        else
-        {
-            out_shape.remove_dimension(axis_sorted[i] - i);
-        }
-    }
-
-    const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
-
-    return Status{};
+    return validate_config(input, reduction_axis, keep_dims, output);
 }
 
 void CLReduceMean::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    for(unsigned int i = 0; i < _reduction_ops; ++i)
+    for(auto &kernel : _reduction_kernels)
     {
-        _reduction_kernels[i].run();
+        kernel.run();
     }
 
     if(!_keep_dims)
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 0b145f034d..96ec8b8587 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -24,80 +24,97 @@
 #include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
 
 #include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
 
 NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims()
 {
 }
 
-Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
 {
     ARM_COMPUTE_UNUSED(keep_dims);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
 
-    TensorShape        out_shape     = input->tensor_shape();
     const unsigned int reduction_ops = reduction_axis.num_dimensions();
     const int          input_dims    = input->num_dimensions();
     Coordinates        axis_local    = reduction_axis;
 
-    // Convert negative axis
-    for(unsigned int i = 0; i < reduction_ops; ++i)
+    for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
     {
-        axis_local[i] = wrap_around(axis_local[i], input_dims);
+        // axis: the dimensions to reduce. Each entry must be in the range [-rank(input_tensor), rank(input_tensor)).
+        ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
+        ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
     }
 
-    std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
-    for(unsigned int i = 0; i < reduction_ops; ++i)
+    if(output->tensor_shape().total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
-        ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
-        if(output->total_size() > 0 && keep_dims)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
-        }
-        if(keep_dims)
-        {
-            out_shape.set(axis_local[i], 1);
-        }
-        else
+        // Only validate if not using auto_init for the output tensor
+        TensorShape out_shape = input->tensor_shape();
+        // Validate output_shape only if not using auto_init
+        convert_negative_axis(axis_local, input_dims);
+        std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+        for(unsigned int i = 0; i < reduction_ops; ++i)
         {
-            out_shape.remove_dimension(axis_local[i] - i);
+            ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+            ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
+            if(output->total_size() > 0 && keep_dims)
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+            }
+            if(keep_dims)
+            {
+                out_shape.set(axis_local[i], 1);
+            }
+            else
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i]));
+                const unsigned int remove_index = axis_local[i] - i;
+                ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions());
+                out_shape.remove_dimension(remove_index);
+            }
         }
+        const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
     }
-    const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
-
     return Status{};
 }
 
+Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+{
+    return validate_config(input, reduction_axis, keep_dims, output);
+}
+
 void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
+    // Output auto initialization if not yet initialized
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input, reduction_axis, keep_dims);
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     _reduction_ops = reduction_axis.num_dimensions();
     _reduction_kernels.resize(_reduction_ops);
     _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
     _keep_dims = keep_dims;
 
-    Coordinates        axis_local    = reduction_axis;
-    const int          input_dims    = input->info()->num_dimensions();
-    const unsigned int reduction_ops = reduction_axis.num_dimensions();
+    Coordinates axis_local = reduction_axis;
+    const int   input_dims = input->info()->num_dimensions();
 
-    // Convert negative axis
-    for(unsigned int i = 0; i < reduction_ops; ++i)
-    {
-        axis_local[i] = wrap_around(axis_local[i], input_dims);
-    }
+    convert_negative_axis(axis_local, input_dims);
 
     // Perform reduction for every axis
-    for(unsigned int i = 0; i < _reduction_ops; ++i)
+    for(int i = 0; i < _reduction_ops; ++i)
     {
         TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
         out_shape.set(axis_local[i], 1);
@@ -116,7 +133,7 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
     }
 
     // Allocate intermediate tensors
-    for(unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+    for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
     {
         _reduced_outs[i].allocator()->allocate();
     }
@@ -125,11 +142,10 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
     if(!keep_dims)
     {
         TensorShape out_shape = input->info()->tensor_shape();
-
         // We have to sort the reduction axis vectors in order for remove_dimension
         // to work properly
         std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
-        for(unsigned int i = 0; i < _reduction_ops; ++i)
+        for(int i = 0; i < _reduction_ops; ++i)
         {
             out_shape.remove_dimension(axis_local[i] - i);
         }
@@ -141,10 +157,9 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
 void NEReduceMean::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
-
-    for(unsigned int i = 0; i < _reduction_ops; ++i)
+    for(auto &kernel : _reduction_kernels)
     {
-        _reduction_kernels[i].run();
+        kernel.run();
     }
 
     if(!_keep_dims)
@@ -152,3 +167,5 @@ void NEReduceMean::run()
         _reshape.run();
     }
 }
+
+} // namespace arm_compute