-rw-r--r--  arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h | 8
-rw-r--r--  arm_compute/core/Utils.h | 10
-rw-r--r--  arm_compute/core/utils/misc/ShapeCalculator.h | 39
-rw-r--r--  arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h | 17
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h | 17
-rw-r--r--  src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp | 79
-rw-r--r--  src/core/CPP/kernels/CPPFlipWeightsKernel.cpp | 27
-rw-r--r--  src/core/Utils.cpp | 9
-rw-r--r--  src/runtime/CL/functions/CLDeconvolutionLayer.cpp | 60
-rw-r--r--  src/runtime/NEON/functions/NEDeconvolutionLayer.cpp | 30
-rw-r--r--  tests/datasets/SmallConvolutionLayerDataset.h | 3
-rw-r--r--  tests/validation/CL/DeconvolutionLayer.cpp | 27
-rw-r--r--  tests/validation/Helpers.cpp | 9
-rw-r--r--  tests/validation/Helpers.h | 8
-rw-r--r--  tests/validation/NEON/DeconvolutionLayer.cpp | 10
-rw-r--r--  tests/validation/fixtures/ConvolutionLayerFixture.h | 3
-rw-r--r--  tests/validation/fixtures/DeconvolutionLayerFixture.h | 73
-rw-r--r--  tests/validation/fixtures/NormalizePlanarYUVLayerFixture.h | 9
-rw-r--r--  tests/validation/fixtures/ReduceMeanFixture.h | 7
19 files changed, 281 insertions, 164 deletions
diff --git a/arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h b/arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h
index 801934159d..04567ed959 100644
--- a/arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h
@@ -53,7 +53,7 @@ public:
/** Set the input and output of the kernel.
*
- * @param[in] input The input tensor to flip. Data types supported: QASYMM8/F16/F32
+ * @param[in] input The input tensor to flip. Data types supported: QASYMM8/F16/F32. Data layouts supported: NCHW/NHWC.
* @param[out] output The output tensor. Data types supported: Same as @p input
*/
void configure(const ITensor *input, ITensor *output);
@@ -64,17 +64,15 @@ public:
/** Function to perform flipping.
*
* @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
*/
template <typename T>
- void flip_weights(const Window &window_input, const Window &window);
+ void flip_weights(const Window &window_input);
/** Common signature for all the specialised Flip functions
*
* @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
*/
- using FlipWeightsFunction = void (CPPFlipWeightsKernel::*)(const Window &window_input, const Window &window);
+ using FlipWeightsFunction = void (CPPFlipWeightsKernel::*)(const Window &window_input);
private:
const ITensor *_input;
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 7ee24e2736..cfd273618c 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -815,16 +815,6 @@ inline DataType data_type_for_convolution_matrix(const int16_t *conv, size_t siz
*/
PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info);
-/** Returns expected shape for the deconvolution output tensor.
- *
- * @param[in] out_dims widht and height of the output tensor, these values can be obtained with the function deconvolution_output_dimensions.
- * @param[in] input Shape of the input tensor.
- * @param[in] weights Shape of the weights tensor.
- *
- * @return Deconvolution output tensor shape.
- */
-TensorShape deconvolution_output_shape(const std::pair<unsigned int, unsigned int> &out_dims, TensorShape input, TensorShape weights);
-
/** Returns expected width and height of the deconvolution's output tensor.
*
* @param[in] in_width Width of input tensor (Number of columns)
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index f68401c1b9..11d20c919f 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -229,26 +229,49 @@ inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input,
return output_shape;
}
-inline TensorShape compute_deconvolution_shape(const ITensorInfo &input, const ITensorInfo &weights, unsigned int sx, unsigned int sy, unsigned int inner_border_right, unsigned int inner_border_top,
- std::pair<unsigned int, unsigned int> &out_dims)
+inline TensorShape compute_deconvolution_upsampled_shape(const ITensorInfo &input, const ITensorInfo &weights, unsigned int sx, unsigned int sy, unsigned int inner_border_right,
+ unsigned int inner_border_top,
+ std::pair<unsigned int, unsigned int> &out_dims, unsigned int &padx, unsigned int &pady)
{
+ const DataLayout data_layout = input.data_layout();
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
// Find the upsampled dimensions
- unsigned int out_x = (input.dimension(0) - 1) * sx + inner_border_right + 1;
- unsigned int out_y = (input.dimension(1) - 1) * sy + inner_border_top + 1;
+ unsigned int out_x = (input.dimension(idx_w) - 1) * sx + inner_border_right + 1;
+ unsigned int out_y = (input.dimension(idx_h) - 1) * sy + inner_border_top + 1;
// Find the padding needed for the convolution with stride 1 in order to match output shape
- unsigned int padx = out_dims.first - (out_x - weights.dimension(0) + 1);
- unsigned int pady = out_dims.second - (out_y - weights.dimension(1) + 1);
+ padx = out_dims.first - (out_x - weights.dimension(idx_w) + 1);
+ pady = out_dims.second - (out_y - weights.dimension(idx_h) + 1);
out_x += padx;
out_y += pady;
TensorShape scale_out_shape(input.tensor_shape());
- scale_out_shape.set(0, out_x);
- scale_out_shape.set(1, out_y);
+ scale_out_shape.set(idx_w, out_x);
+ scale_out_shape.set(idx_h, out_y);
return scale_out_shape;
}
+inline TensorShape compute_deconvolution_output_shape(const std::pair<unsigned int, unsigned int> &out_dims, const ITensorInfo &input, const ITensorInfo &weights)
+{
+ const TensorShape input_shape{ input.tensor_shape() };
+ const TensorShape weights_shape{ weights.tensor_shape() };
+
+ const DataLayout data_layout = input.data_layout();
+ const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ TensorShape out_shape{ input_shape };
+ out_shape.set(width_idx, out_dims.first);
+ out_shape.set(height_idx, out_dims.second);
+ out_shape.set(channel_idx, weights_shape[batch_idx]);
+ return out_shape;
+}
+
inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, bool batch_size_on_z,
unsigned int num_groups = 1)
{
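
For context, here is a minimal usage sketch of the two helpers introduced above. It is not part of the patch; the NCHW layout, the F32 data type and the 2x2x3 input / 3x3x3x5 weights shapes are illustrative assumptions.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/Utils.h"
    #include "arm_compute/core/utils/misc/ShapeCalculator.h"

    using namespace arm_compute;
    using namespace arm_compute::misc::shape_calculator;

    void deconvolution_shape_example()
    {
        // 2x2 input with 3 channels; 3x3 kernels producing 5 output channels; stride 2, no padding
        const TensorInfo input(TensorShape(2U, 2U, 3U), 1, DataType::F32);
        const TensorInfo weights(TensorShape(3U, 3U, 3U, 5U), 1, DataType::F32);

        // Final spatial size: (2 - 1) * 2 + 3 = 5 in both width and height
        auto out_dims = deconvolution_output_dimensions(2U, 2U, 3U, 3U, 0U, 0U, 2U, 2U);

        // (5, 5, 5): width and height from out_dims, channels from the number of kernels
        const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, input, weights);

        // Upsampled shape for the internal stride-1 convolution:
        // out_x = (2 - 1) * 2 + 1 = 3, padx = 5 - (3 - 3 + 1) = 4, so the scaled tensor is 7x7
        // and a 3x3 stride-1 convolution on it yields the expected 5x5 output.
        unsigned int padx = 0;
        unsigned int pady = 0;
        const TensorShape scale_out_shape =
            compute_deconvolution_upsampled_shape(input, weights, 2U, 2U, 0U, 0U, out_dims, padx, pady);
        (void)output_shape;
        (void)scale_out_shape;
    }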
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
index 6716cd6fdd..39cbe0cafa 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
@@ -46,8 +46,12 @@ class ICLTensor;
* specified value where a < stride - 1, that increases the padding top and right of the input image.
*
* The relation between input to output is as follows:
- * width_output = round((width_input − 1) ∗ (stride_x - 1) − 2 ∗ padding_x + kernel_x + inner_border_right )
- * height_output = round((height_input − 1) ∗ (stride_y - 1) − 2 ∗ padding_y + kernel_y + inner_border_top )
+ * \f[
+ * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
+ * \f]
+ * \f[
+ * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
+ * \f]
*
* where:
* width_input is the size of the first input dimension.
@@ -55,9 +59,16 @@ class ICLTensor;
* width_output is the size of the first output dimension.
* height_output is the size of the second output dimension.
* kernel_x and kernel_y are the convolution sizes in x and y.
- * inner_border_right and inner_border_top the number of zeros added to the right and top edges of the input.
* stride_x and stride_y is the input stride of the first and second dimension.
*
+ * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. Therefore, it will be necessary to use the weights in the
+ * reverse order to perform an actual convolution. This is achieved by using the @ref CPPFlipWeightsKernel.
+ *
+ * This function calls the following OpenCL kernels/functions:
+ *
+ * -# @ref CLDeconvolutionLayerUpsample
+ * -# @ref CLConvolutionLayer
+ *
*/
class CLDeconvolutionLayer : public IFunction
{
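
A quick worked instance of the formulas above, using illustrative numbers: with width_input = 4, stride_x = 2, padding_x = 0 and kernel_x = 3, the first formula gives width_output = (4 - 1) * 2 - 2 * 0 + 3 = 9; the height formula behaves identically.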
diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
index 0cca555621..73870093b7 100644
--- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
@@ -46,8 +46,12 @@ namespace arm_compute
* specified value where a < stride - 1 that increases the padding top and right of the input image.
*
* The relation between input to output is as follows:
- * width_output = round((width_input − 1) ∗ (stride_x - 1) − 2 ∗ padding_x + kernel_x + inner_border_right )
- * height_output = round((height_input − 1) ∗ (stride_y - 1) − 2 ∗ padding_y + kernel_y + inner_border_top )
+ * \f[
+ * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
+ * \f]
+ * \f[
+ * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
+ * \f]
*
* where
* width is the size of the first input dimension.
@@ -55,12 +59,15 @@ namespace arm_compute
* width_output is the size of the first output dimension.
* height_output is the size of the second output dimension.
* kernel_x and kernel_y are the convolution sizes in x and y.
- * inner_border_right and inner_border_top the number of zeros added to the top and right edges of the input.
* stride_x and stride_y is the input stride of the first and second dimension.
*
- * This function calls the following NEON kernels:
+ * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. Therefore, it will be necessary to use the weights in the
+ * reverse order to perform an actual convolution. This is achieved by using the @ref CPPFlipWeightsKernel.
*
- * -# @ref NEDirectConvolutionLayer
+ * This function calls the following NEON kernels/functions:
+ *
+ * -# @ref CPPUpsample
+ * -# @ref NEConvolutionLayer
*
*/
class NEDeconvolutionLayer : public IFunction
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index be3a926b96..dd7d79002d 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -45,11 +45,19 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) == 0);
+
+ const DataLayout data_layout = input->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_w) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
- for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c));
+ for(size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
}
@@ -93,28 +101,61 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+ const DataLayout data_layout = _input->info()->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
const int out_start_x = _info.pad().first;
- const int out_end_x = _output->info()->dimension(0) - _inner_border.right - _info.pad().first + _info.stride().first - 1;
+ const int out_end_x = _output->info()->dimension(idx_w) - _inner_border.right - _info.pad().first + _info.stride().first - 1;
const int out_step_x = _info.stride().first;
const int out_start_y = _inner_border.top + _info.pad().second;
- const int out_end_y = _output->info()->dimension(1) - _info.pad().second + _info.stride().second - 1;
+ const int out_end_y = _output->info()->dimension(idx_h) - _info.pad().second + _info.stride().second - 1;
const int out_step_y = _info.stride().second;
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
- Window slice_out = collapsed.first_slice_window_3D();
- slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x));
- slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y));
-
- Window slice_in = collapsed.first_slice_window_3D();
-
- do
+ switch(data_layout)
{
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out);
+ case DataLayout::NCHW:
+ {
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
+ Window slice_out = collapsed.first_slice_window_3D();
+ slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x));
+ slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y));
+
+ Window slice_in = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ }
+ while(collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+ // NOTE: not collapsing in NHWC
+ Window slice_out = window.first_slice_window_3D();
+ slice_out.set(Window::DimY, Window::Dimension(out_start_x, out_end_x, out_step_x));
+ slice_out.set(Window::DimZ, Window::Dimension(out_start_y, out_end_y, out_step_y));
+
+ Window slice_in = window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ }
+ while(window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data layout");
}
- while(collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
}
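
For reference, a small sketch of the layout-to-index mapping the kernel now relies on. The resolved indices reflect the library's storage conventions (NCHW tensors are stored as (W, H, C, N), NHWC tensors as (C, W, H, N)); nothing in this sketch is added by the patch itself.

    #include "arm_compute/core/Helpers.h"
    #include "arm_compute/core/Types.h"

    using namespace arm_compute;

    void layout_index_example()
    {
        const size_t w_nchw = get_data_layout_dimension_index(DataLayout::NCHW, DataLayoutDimension::WIDTH);  // 0
        const size_t w_nhwc = get_data_layout_dimension_index(DataLayout::NHWC, DataLayoutDimension::WIDTH);  // 1
        const size_t h_nhwc = get_data_layout_dimension_index(DataLayout::NHWC, DataLayoutDimension::HEIGHT); // 2

        // This is why the NHWC branch above assigns the x/y output ranges to Window::DimY/DimZ,
        // while the NCHW branch keeps them on Window::DimX/DimY.
        (void)w_nchw;
        (void)w_nhwc;
        (void)h_nhwc;
    }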
diff --git a/src/core/CPP/kernels/CPPFlipWeightsKernel.cpp b/src/core/CPP/kernels/CPPFlipWeightsKernel.cpp
index 741218e4f7..2d4c0ce5c8 100644
--- a/src/core/CPP/kernels/CPPFlipWeightsKernel.cpp
+++ b/src/core/CPP/kernels/CPPFlipWeightsKernel.cpp
@@ -42,25 +42,36 @@ CPPFlipWeightsKernel::CPPFlipWeightsKernel()
}
template <typename T>
-void CPPFlipWeightsKernel::flip_weights(const Window &window_input, const Window &window)
+void CPPFlipWeightsKernel::flip_weights(const Window &window_input)
{
// Create iterators
Iterator in(_input, window_input);
- Iterator out(_output, window);
+ const DataLayout data_layout = _input->info()->data_layout();
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int kernel_size = _input->info()->dimension(0);
+ const int kernel_width = _input->info()->dimension(idx_w);
+ const int kernel_height = _input->info()->dimension(idx_h);
execute_window_loop(window_input, [&](const Coordinates & id)
{
- *((reinterpret_cast<T *>(out.ptr()) + kernel_size * (kernel_size - id.y() - 1) + (kernel_size - id.x() - 1))) = *(reinterpret_cast<const T *>(in.ptr()));
+ const unsigned int x = kernel_width - id[idx_w] - 1;
+ const unsigned int y = kernel_height - id[idx_h] - 1;
+ Coordinates output_coord(id);
+ output_coord.set(idx_w, x);
+ output_coord.set(idx_h, y);
+ *(reinterpret_cast<T *>(_output->ptr_to_element(output_coord))) = *(reinterpret_cast<const T *>(in.ptr()));
},
- in, out);
+ in);
}
void CPPFlipWeightsKernel::configure(const ITensor *input, ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
_input = input;
_output = output;
@@ -98,9 +109,5 @@ void CPPFlipWeightsKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- Window out_window{ window };
- out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
- out_window.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- (this->*_func)(window, out_window);
+ (this->*_func)(window);
}
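
The per-plane mapping the kernel computes is equivalent to the following reference loop. This is a hypothetical sketch over one contiguous W x H plane; flip_plane, src and dst are illustrative names rather than library code, and the real kernel additionally carries the channel/batch coordinates over unchanged for both NCHW and NHWC.

    // Mirror every element in both the width and the height dimension.
    void flip_plane(const float *src, float *dst, int width, int height)
    {
        for(int y = 0; y < height; ++y)
        {
            for(int x = 0; x < width; ++x)
            {
                dst[(height - 1 - y) * width + (width - 1 - x)] = src[y * width + x];
            }
        }
    }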
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index a6a5771ec1..41fc87e87a 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -323,15 +323,6 @@ PadStrideInfo arm_compute::calculate_same_pad(TensorShape input_shape, TensorSha
return PadStrideInfo(strides.first, strides.second, same_pad_left, same_pad_right, same_pad_top, same_pad_bottom, DimensionRoundingType::CEIL);
}
-TensorShape arm_compute::deconvolution_output_shape(const std::pair<unsigned int, unsigned int> &out_dims, TensorShape input, TensorShape weights)
-{
- TensorShape out_shape(input);
- out_shape.set(0, out_dims.first);
- out_shape.set(1, out_dims.second);
- out_shape.set(2, weights[3]);
- return out_shape;
-}
-
const std::pair<unsigned int, unsigned int> arm_compute::deconvolution_output_dimensions(
unsigned int in_width, unsigned int in_height, unsigned int kernel_width, unsigned int kernel_height, unsigned int padx, unsigned int pady,
unsigned int stride_x, unsigned int stride_y)
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 26d44e9c96..951d1ec4f0 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -53,8 +53,16 @@ Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+ const DataLayout data_layout = input->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
const unsigned int stride_x = info.stride().first;
@@ -63,10 +71,10 @@ Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf
ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_right > stride_x - 1, "inner_border_right must be smaller than stride_x");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_top > stride_y - 1, "inner_border_top must be smaller than stride_y");
- auto out_dims = deconvolution_output_dimensions(input->dimension(0), input->dimension(1), weights->dimension(0), weights->dimension(1),
+ auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
info.pad().first, info.pad().second, stride_x, stride_y);
- const TensorShape output_shape = deconvolution_output_shape(out_dims, input->tensor_shape(), weights->tensor_shape());
+ const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
@@ -80,15 +88,17 @@ Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
}
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], "Output's width is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], "Output's height is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], "Output's depth is invalid.");
- TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_deconvolution_shape(*input, *weights, stride_x, stride_y, inner_border_right,
- inner_border_top,
- out_dims)));
+ unsigned int padx = 0;
+ unsigned int pady = 0;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
+ TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(inner_border_right, inner_border_top), info));
@@ -105,17 +115,22 @@ void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const
const unsigned int stride_x = info.stride().first;
const unsigned int stride_y = info.stride().second;
+ const DataLayout data_layout = input->info()->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
_weights = weights;
- _weights_flipped.allocator()->init(TensorInfo(weights->info()->tensor_shape(), 1, weights->info()->data_type()));
+ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
_flip_weights.configure(weights, &_weights_flipped);
- auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
+ auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h),
info.pad().first, info.pad().second, stride_x, stride_y);
- const TensorShape output_shape = deconvolution_output_shape(out_dims, input->info()->tensor_shape(), weights->info()->tensor_shape());
+ const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
@@ -125,20 +140,13 @@ void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const
_memory_group.manage(&_scaled_output);
_memory_group.manage(&_weights_flipped);
- // Find the upsampled dimensions
- unsigned int out_x = (input->info()->dimension(0) - 1) * stride_x + inner_border_right + 1;
- unsigned int out_y = (input->info()->dimension(1) - 1) * stride_y + inner_border_top + 1;
-
- // Find the padding needed for the convolution with stride 1 in order to match output shape
- unsigned int padx = out_dims.first - (out_x - weights->info()->dimension(0) + 1);
- unsigned int pady = out_dims.second - (out_y - weights->info()->dimension(1) + 1);
- out_x += padx;
- out_y += pady;
+ // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
+ unsigned int padx = 0;
+ unsigned int pady = 0;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
- TensorShape scale_out_shape(input->info()->tensor_shape());
- scale_out_shape.set(0, out_x);
- scale_out_shape.set(1, out_y);
TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ scale_out_info.set_data_layout(data_layout);
_scaled_output.allocator()->init(scale_out_info);
// configure scale function
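
An end-to-end usage sketch of the function (illustrative, not from the patch), reusing the shapes from the worked example above. The configure() signature with explicit inner-border arguments and a defaulted memory manager is assumed for this release; filling the tensors is elided.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"

    using namespace arm_compute;

    void run_cl_deconvolution()
    {
        CLScheduler::get().default_init();

        CLTensor src, weights, bias, dst;
        src.allocator()->init(TensorInfo(TensorShape(2U, 2U, 3U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 5U), 1, DataType::F32));
        bias.allocator()->init(TensorInfo(TensorShape(5U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(5U, 5U, 5U), 1, DataType::F32));

        CLDeconvolutionLayer deconv;
        deconv.configure(&src, &weights, &bias, &dst, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), 0, 0);

        src.allocator()->allocate();
        weights.allocator()->allocate();
        bias.allocator()->allocate();
        dst.allocator()->allocate();

        // ... map and fill src, weights and bias here ...
        deconv.run();
        CLScheduler::get().sync();
    }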
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 6ca60c66a4..cbe7c51662 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -76,15 +76,17 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- const TensorShape output_shape = deconvolution_output_shape(out_dims, input->tensor_shape(), weights->tensor_shape());
+ const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
+
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
}
- TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_deconvolution_shape(*input, *weights, stride_x, stride_y, inner_border_right,
- inner_border_top,
- out_dims)));
+ unsigned int padx = 0;
+ unsigned int pady = 0;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
+ TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
@@ -116,7 +118,7 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con
auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
info.pad().first, info.pad().second, stride_x, stride_y);
- const TensorShape output_shape = deconvolution_output_shape(out_dims, input->info()->tensor_shape(), weights->info()->tensor_shape());
+ const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
@@ -125,19 +127,11 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con
_memory_group.manage(&_scaled_output);
- // Find the upsampled dimensions
- unsigned int out_x = (input->info()->dimension(0) - 1) * stride_x + inner_border_right + 1;
- unsigned int out_y = (input->info()->dimension(1) - 1) * stride_y + inner_border_top + 1;
-
- // Find the padding needed for the convolution with stride 1 in order to match output shape
- unsigned int padx = out_dims.first - (out_x - weights->info()->dimension(0) + 1);
- unsigned int pady = out_dims.second - (out_y - weights->info()->dimension(1) + 1);
- out_x += padx;
- out_y += pady;
+ // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
+ unsigned int padx = 0;
+ unsigned int pady = 0;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
- TensorShape scale_out_shape(input->info()->tensor_shape());
- scale_out_shape.set(0, out_x);
- scale_out_shape.set(1, out_y);
TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
_scaled_output.allocator()->init(scale_out_info);
@@ -171,4 +165,4 @@ void NEDeconvolutionLayer::prepare()
_conv_f.prepare();
_is_prepared = true;
}
-}
\ No newline at end of file
+}
diff --git a/tests/datasets/SmallConvolutionLayerDataset.h b/tests/datasets/SmallConvolutionLayerDataset.h
index ca4abd1671..bbfc760bf3 100644
--- a/tests/datasets/SmallConvolutionLayerDataset.h
+++ b/tests/datasets/SmallConvolutionLayerDataset.h
@@ -164,6 +164,9 @@ public:
add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 5U), PadStrideInfo(3, 2, 1, 3, 0, 2, DimensionRoundingType::FLOOR));
add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(10U, 11U, 16U, 5U), PadStrideInfo(3, 2, 1, 0, 1, 0, DimensionRoundingType::FLOOR));
add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(10U, 11U, 16U, 5U), PadStrideInfo(3, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
+ // TODO (micgio01) - COMPMID-1604: investigate issue in GLES and re-enable the following dataset
+ // Single output channel
+ //add_config(TensorShape(5U, 4U, 3U, 2U), TensorShape(4U, 4U, 3U, 1U), TensorShape(1U), TensorShape(2U, 1U, 1U, 2U), PadStrideInfo(1, 1, 0, 0, 0, 0, DimensionRoundingType::FLOOR));
}
};
diff --git a/tests/validation/CL/DeconvolutionLayer.cpp b/tests/validation/CL/DeconvolutionLayer.cpp
index 84a2b01797..7727d9029d 100644
--- a/tests/validation/CL/DeconvolutionLayer.cpp
+++ b/tests/validation/CL/DeconvolutionLayer.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/CLTensorAllocator.h"
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"
@@ -45,7 +46,7 @@ namespace
{
constexpr AbsoluteTolerance<float> tolerance_fp32(0.001f); /**< Tolerance for floating point tests */
RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.2)); /**< Tolerance value for comparing reference's for DataType::F16 */
-constexpr AbsoluteTolerance<float> tolerance_qasymm8(1.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+constexpr AbsoluteTolerance<float> tolerance_qasymm8(0.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
constexpr float tolerance_num = 0.07f; /**< Tolerance number */
const auto data4x4 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 5) * framework::dataset::make("StrideY", 1, 5) * framework::dataset::make("PadX", 0, 3)
@@ -57,6 +58,7 @@ const auto data3x3 = datasets::SmallDeconvolutionShapes() * framework::dataset::
const auto data1x1 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 1)
* framework::dataset::make("PadY", 0, 1) * framework::dataset::make("ax", 0) * framework::dataset::make("ay", 0) * framework::dataset::make("NumKernels", { 1, 3 });
+const auto data_layouts_dataset = framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC });
} // namespace
TEST_SUITE(CL)
@@ -72,7 +74,7 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, (combine(datasets::Sm
const TensorShape weights_shape(kernel_size_x, kernel_size_y, input_shape.z(), num_kernels);
const TensorShape bias_shape(num_kernels);
auto out_dim = deconvolution_output_dimensions(input_shape.x(), input_shape.y(), kernel_size_x, kernel_size_y, 1, 1, 1, 1);
- TensorShape output_shape = deconvolution_output_shape(out_dim, input_shape, weights_shape);
+ TensorShape output_shape = compute_deconvolution_output_shape(out_dim, TensorInfo(input_shape, 1, data_type), TensorInfo(weights_shape, 1, data_type));
// Create tensors
CLTensor src = create_tensor<CLTensor>(input_shape, data_type, 1);
@@ -169,7 +171,7 @@ TEST_SUITE(Float)
TEST_SUITE(FP32)
TEST_SUITE(W4x4)
-FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture4x4<float>, framework::DatasetMode::ALL, combine(data4x4, framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture4x4<float>, framework::DatasetMode::ALL, combine(combine(data4x4, framework::dataset::make("DataType", DataType::F32)), data_layouts_dataset))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_fp32);
@@ -177,7 +179,7 @@ FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture4x4<float>, framework::Da
TEST_SUITE_END()
TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(data3x3, framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(combine(data3x3, framework::dataset::make("DataType", DataType::F32)), data_layouts_dataset))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_fp32);
@@ -185,7 +187,7 @@ FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture3x3<float>, framework::Da
TEST_SUITE_END()
TEST_SUITE(W1x1)
-FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture1x1<float>, framework::DatasetMode::ALL, combine(data1x1, framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture1x1<float>, framework::DatasetMode::ALL, combine(combine(data1x1, framework::dataset::make("DataType", DataType::F32)), data_layouts_dataset))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_fp32);
@@ -197,7 +199,7 @@ TEST_SUITE_END()
TEST_SUITE(FP16)
TEST_SUITE(W4x4)
-FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture4x4<half>, framework::DatasetMode::ALL, combine(data4x4, framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture4x4<half>, framework::DatasetMode::ALL, combine(combine(data4x4, framework::dataset::make("DataType", DataType::F16)), data_layouts_dataset))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
@@ -205,7 +207,7 @@ FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture4x4<half>, framework::Dat
TEST_SUITE_END()
TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(data3x3, framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(combine(data3x3, framework::dataset::make("DataType", DataType::F16)), data_layouts_dataset))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
@@ -213,7 +215,7 @@ FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture3x3<half>, framework::Dat
TEST_SUITE_END()
TEST_SUITE(W1x1)
-FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture1x1<half>, framework::DatasetMode::ALL, combine(data1x1, framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture1x1<half>, framework::DatasetMode::ALL, combine(combine(data1x1, framework::dataset::make("DataType", DataType::F16)), data_layouts_dataset))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
@@ -236,7 +238,8 @@ TEST_SUITE(Quantized)
TEST_SUITE(QASYMM8)
TEST_SUITE(W4x4)
-FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture4x4<uint8_t>, framework::DatasetMode::ALL, combine(combine(data4x4, framework::dataset::make("DataType", DataType::QASYMM8)),
+FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture4x4<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(data4x4, framework::dataset::make("DataType", DataType::QASYMM8)),
+ data_layouts_dataset),
framework::dataset::make("QuantizationInfo", QuantizationInfo(2.f / 255.f, 0))))
{
// Validate output
@@ -245,7 +248,8 @@ FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture4x4<uint8_t>, fr
TEST_SUITE_END()
TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::ALL, combine(combine(data3x3, framework::dataset::make("DataType", DataType::QASYMM8)),
+FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(data3x3, framework::dataset::make("DataType", DataType::QASYMM8)),
+ data_layouts_dataset),
framework::dataset::make("QuantizationInfo", QuantizationInfo(2.f / 255.f, 0))))
{
// Validate output
@@ -254,7 +258,8 @@ FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture3x3<uint8_t>, fr
TEST_SUITE_END()
TEST_SUITE(W1x1)
-FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture1x1<uint8_t>, framework::DatasetMode::ALL, combine(combine(data1x1, framework::dataset::make("DataType", DataType::QASYMM8)),
+FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture1x1<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(data1x1, framework::dataset::make("DataType", DataType::QASYMM8)),
+ data_layouts_dataset),
framework::dataset::make("QuantizationInfo", QuantizationInfo(2.f / 255.f, 0))))
{
// Validate output
diff --git a/tests/validation/Helpers.cpp b/tests/validation/Helpers.cpp
index fd034b649e..eab6d5629f 100644
--- a/tests/validation/Helpers.cpp
+++ b/tests/validation/Helpers.cpp
@@ -302,6 +302,15 @@ void zeros(SimpleTensor<T> &in, const Coordinates &anchor, const TensorShape &sh
}
}
+std::pair<int, int> get_quantized_bounds(const QuantizationInfo &quant_info, float min, float max)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(min > max, "min must be lower than or equal to max");
+
+ const int min_bound = quant_info.quantize(min, RoundingPolicy::TO_NEAREST_UP);
+ const int max_bound = quant_info.quantize(max, RoundingPolicy::TO_NEAREST_UP);
+ return std::pair<int, int>(min_bound, max_bound);
+}
+
template void get_tile(const SimpleTensor<float> &in, SimpleTensor<float> &roi, const Coordinates &coord);
template void get_tile(const SimpleTensor<half> &in, SimpleTensor<half> &roi, const Coordinates &coord);
template void zeros(SimpleTensor<float> &in, const Coordinates &anchor, const TensorShape &shape);
diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h
index 779ecdca11..4d1d21440d 100644
--- a/tests/validation/Helpers.h
+++ b/tests/validation/Helpers.h
@@ -231,6 +231,14 @@ void get_tile(const SimpleTensor<T> &in, SimpleTensor<T> &tile, const Coordinate
*/
template <typename T>
void zeros(SimpleTensor<T> &in, const Coordinates &anchor, const TensorShape &shape);
+
+/** Helper function to compute quantized min and max bounds
+ *
+ * @param[in] quant_info Quantization info to be used for conversion
+ * @param[in] min Floating point minimum value to be quantized
+ * @param[in] max Floating point maximum value to be quantized
+ */
+std::pair<int, int> get_quantized_bounds(const QuantizationInfo &quant_info, float min, float max);
} // namespace validation
} // namespace test
} // namespace arm_compute
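
A short sketch of how the new helper is meant to be used by the quantized fixtures further below; the QuantizationInfo values are illustrative, and library->fill refers to the test framework's AssetsLibrary.

    #include <random>
    #include <utility>

    #include "tests/validation/Helpers.h"

    using namespace arm_compute;
    using namespace arm_compute::test::validation;

    void quantized_fill_bounds_example()
    {
        // Restrict random QASYMM8 inputs to the quantized representation of [-1, 1]
        const QuantizationInfo    quant_info(2.f / 255.f, 0);
        const std::pair<int, int> bounds = get_quantized_bounds(quant_info, -1.f, 1.f);

        // Mirrors the fixtures: the distribution is then passed to library->fill(tensor, distribution, seed)
        std::uniform_int_distribution<uint8_t> distribution(bounds.first, bounds.second);
        (void)distribution;
    }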
diff --git a/tests/validation/NEON/DeconvolutionLayer.cpp b/tests/validation/NEON/DeconvolutionLayer.cpp
index eb643b8e7c..1b74400676 100644
--- a/tests/validation/NEON/DeconvolutionLayer.cpp
+++ b/tests/validation/NEON/DeconvolutionLayer.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
@@ -53,6 +54,7 @@ const auto data3x3 = datasets::SmallDeconvolutionShapes() * framework::dataset::
const auto data1x1 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 1)
* framework::dataset::make("PadY", 0, 1) * framework::dataset::make("ax", 0) * framework::dataset::make("ay", 0) * framework::dataset::make("NumKernels", { 1, 3 });
+const auto data_layouts_dataset = framework::dataset::make("DataLayout", { DataLayout::NCHW });
} // namespace
TEST_SUITE(NEON)
@@ -68,7 +70,7 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, (combine(datasets::Sm
const TensorShape weights_shape(kernel_size_x, kernel_size_y, input_shape.z(), num_kernels);
const TensorShape bias_shape(num_kernels);
auto out_dim = deconvolution_output_dimensions(input_shape.x(), input_shape.y(), kernel_size_x, kernel_size_y, 1, 1, 1, 1);
- TensorShape output_shape = deconvolution_output_shape(out_dim, input_shape, weights_shape);
+ TensorShape output_shape = compute_deconvolution_output_shape(out_dim, TensorInfo(input_shape, 1, data_type), TensorInfo(weights_shape, 1, data_type));
// Create tensors
Tensor src = create_tensor<Tensor>(input_shape, data_type, 1);
@@ -172,7 +174,7 @@ TEST_SUITE(Float)
TEST_SUITE(FP32)
TEST_SUITE(W4x4)
-FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture4x4<float>, framework::DatasetMode::ALL, combine(data4x4, framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture4x4<float>, framework::DatasetMode::ALL, combine(combine(data4x4, framework::dataset::make("DataType", DataType::F32)), data_layouts_dataset))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_fp32);
@@ -181,7 +183,7 @@ TEST_SUITE_END()
TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(data3x3, framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(combine(data3x3, framework::dataset::make("DataType", DataType::F32)), data_layouts_dataset))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_fp32);
@@ -189,7 +191,7 @@ FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture3x3<float>, framework::Da
TEST_SUITE_END()
TEST_SUITE(W1x1)
-FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture1x1<float>, framework::DatasetMode::ALL, combine(data1x1, framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture1x1<float>, framework::DatasetMode::ALL, combine(combine(data1x1, framework::dataset::make("DataType", DataType::F32)), data_layouts_dataset))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_fp32);
diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h
index 3b420eac09..795b9de6cd 100644
--- a/tests/validation/fixtures/ConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/ConvolutionLayerFixture.h
@@ -77,7 +77,8 @@ protected:
{
case DataType::QASYMM8:
{
- std::uniform_int_distribution<uint8_t> distribution(0, 3);
+ std::pair<int, int> bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f);
+ std::uniform_int_distribution<uint8_t> distribution(bounds.first, bounds.second);
library->fill(tensor, distribution, i);
break;
}
diff --git a/tests/validation/fixtures/DeconvolutionLayerFixture.h b/tests/validation/fixtures/DeconvolutionLayerFixture.h
index d3a7be74b0..85c7ed5604 100644
--- a/tests/validation/fixtures/DeconvolutionLayerFixture.h
+++ b/tests/validation/fixtures/DeconvolutionLayerFixture.h
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "tests/AssetsLibrary.h"
#include "tests/Globals.h"
#include "tests/IAccessor.h"
@@ -39,6 +40,8 @@ namespace test
{
namespace validation
{
+using namespace arm_compute::misc::shape_calculator;
+
template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
class DeconvolutionLayerFixtureBase : public framework::Fixture
{
@@ -48,12 +51,15 @@ public:
public:
template <typename...>
void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info,
- const std::pair<unsigned int, unsigned int> &inner_border, DataType data_type, QuantizationInfo quantization_info)
+ const std::pair<unsigned int, unsigned int> &inner_border, DataType data_type, DataLayout data_layout, QuantizationInfo quantization_info)
{
- _data_type = data_type;
+ _data_type = data_type;
+ _bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
+ _data_layout = data_layout;
+ _quantization_info = quantization_info;
- _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, inner_border, data_type, quantization_info);
- _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, inner_border, data_type, quantization_info);
+ _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, inner_border);
+ _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, inner_border);
}
protected:
@@ -64,7 +70,8 @@ protected:
{
case DataType::QASYMM8:
{
- std::uniform_int_distribution<uint8_t> distribution(0, 3);
+ std::pair<int, int> bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f);
+ std::uniform_int_distribution<uint8_t> distribution(bounds.first, bounds.second);
library->fill(tensor, distribution, i);
break;
}
@@ -86,14 +93,21 @@ protected:
}
}
- TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape,
- const PadStrideInfo &info, const std::pair<unsigned int, unsigned int> &inner_border, DataType data_type, QuantizationInfo quantization_info)
+ TensorType compute_target(TensorShape input_shape, TensorShape weights_shape, const TensorShape bias_shape, TensorShape output_shape,
+ const PadStrideInfo &info, const std::pair<unsigned int, unsigned int> &inner_border)
{
+ if(_data_layout == DataLayout::NHWC)
+ {
+ permute(input_shape, PermutationVector(2U, 0U, 1U));
+ permute(weights_shape, PermutationVector(2U, 0U, 1U));
+ permute(output_shape, PermutationVector(2U, 0U, 1U));
+ }
+
// Create tensors
- TensorType src = create_tensor<TensorType>(input_shape, data_type, 1, quantization_info);
- TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1, quantization_info);
- TensorType bias = create_tensor<TensorType>(bias_shape, is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type, 1, quantization_info);
- TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1, quantization_info);
+ TensorType src = create_tensor<TensorType>(input_shape, _data_type, 1, _quantization_info, _data_layout);
+ TensorType weights = create_tensor<TensorType>(weights_shape, _data_type, 1, _quantization_info, _data_layout);
+ TensorType bias = create_tensor<TensorType>(bias_shape, _bias_data_type, 1, _quantization_info, _data_layout);
+ TensorType dst = create_tensor<TensorType>(output_shape, _data_type, 1, _quantization_info, _data_layout);
// Create and configure function
FunctionType conv;
@@ -127,12 +141,12 @@ protected:
}
SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape,
- const PadStrideInfo &info, const std::pair<unsigned int, unsigned int> inner_border, DataType data_type, QuantizationInfo quantization_info)
+ const PadStrideInfo &info, const std::pair<unsigned int, unsigned int> inner_border)
{
// Create reference
- SimpleTensor<T> src{ input_shape, data_type, 1, quantization_info };
- SimpleTensor<T> weights{ weights_shape, data_type, 1, quantization_info };
- SimpleTensor<TBias> bias{ bias_shape, data_type, 1, quantization_info };
+ SimpleTensor<T> src{ input_shape, _data_type, 1, _quantization_info };
+ SimpleTensor<T> weights{ weights_shape, _data_type, 1, _quantization_info };
+ SimpleTensor<TBias> bias{ bias_shape, _bias_data_type, 1, _quantization_info };
// Fill reference
fill(src, 0);
@@ -142,9 +156,12 @@ protected:
return reference::deconvolution_layer<T>(src, weights, bias, output_shape, info, inner_border);
}
- TensorType _target{};
- SimpleTensor<T> _reference{};
- DataType _data_type{};
+ TensorType _target{};
+ SimpleTensor<T> _reference{};
+ DataType _data_type{};
+ DataType _bias_data_type{};
+ DataLayout _data_layout{};
+ QuantizationInfo _quantization_info{};
};
template <typename TensorType, typename AccessorType, typename FunctionType, typename T, unsigned int kernel_size_x, unsigned int kernel_size_y>
@@ -153,16 +170,18 @@ class DeconvolutionValidationFixture : public DeconvolutionLayerFixtureBase<Tens
public:
template <typename...>
void setup(TensorShape input_shape, unsigned int sx, unsigned int sy, unsigned int padx, unsigned int pady,
- unsigned int inner_border_right, unsigned int inner_border_top, unsigned int num_kernels, DataType data_type)
+ unsigned int inner_border_right, unsigned int inner_border_top, unsigned int num_kernels, DataType data_type, DataLayout data_layout)
{
ARM_COMPUTE_ERROR_ON_MSG(kernel_size_x != kernel_size_y, "Only square kernels supported");
const TensorShape weights_shape(kernel_size_x, kernel_size_y, input_shape.z(), num_kernels);
const TensorShape bias_shape(num_kernels);
const PadStrideInfo info(sx, sy, padx, pady, DimensionRoundingType::CEIL);
const std::pair<unsigned int, unsigned int> inner_border(inner_border_right, inner_border_top);
- auto out_dim = deconvolution_output_dimensions(input_shape.x(), input_shape.y(), kernel_size_x, kernel_size_y, padx, pady, sx, sy);
- TensorShape output_shape = deconvolution_output_shape(out_dim, input_shape, weights_shape);
- DeconvolutionLayerFixtureBase<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, inner_border, data_type, QuantizationInfo());
+ auto out_dim = deconvolution_output_dimensions(input_shape.x(), input_shape.y(), kernel_size_x, kernel_size_y, padx, pady, sx, sy);
+ TensorInfo input_info(input_shape, 1, data_type);
+ TensorInfo weights_info(weights_shape, 1, data_type);
+ TensorShape output_shape = compute_deconvolution_output_shape(out_dim, input_info, weights_info);
+ DeconvolutionLayerFixtureBase<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, inner_border, data_type, data_layout, QuantizationInfo());
}
};
@@ -172,16 +191,18 @@ class DeconvolutionValidationQuantizedFixture : public DeconvolutionLayerFixture
public:
template <typename...>
void setup(TensorShape input_shape, unsigned int sx, unsigned int sy, unsigned int padx, unsigned int pady,
- unsigned int inner_border_right, unsigned int inner_border_top, unsigned int num_kernels, DataType data_type, QuantizationInfo quantization_info)
+ unsigned int inner_border_right, unsigned int inner_border_top, unsigned int num_kernels, DataType data_type, DataLayout data_layout, QuantizationInfo quantization_info)
{
ARM_COMPUTE_ERROR_ON_MSG(kernel_size_x != kernel_size_y, "Only square kernels supported");
const TensorShape weights_shape(kernel_size_x, kernel_size_y, input_shape.z(), num_kernels);
const TensorShape bias_shape(num_kernels);
const PadStrideInfo info(sx, sy, padx, pady, DimensionRoundingType::CEIL);
const std::pair<unsigned int, unsigned int> inner_border(inner_border_right, inner_border_top);
- auto out_dim = deconvolution_output_dimensions(input_shape.x(), input_shape.y(), kernel_size_x, kernel_size_y, padx, pady, sx, sy);
- TensorShape output_shape = deconvolution_output_shape(out_dim, input_shape, weights_shape);
- DeconvolutionLayerFixtureBase<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, inner_border, data_type, quantization_info);
+ auto out_dim = deconvolution_output_dimensions(input_shape.x(), input_shape.y(), kernel_size_x, kernel_size_y, padx, pady, sx, sy);
+ TensorInfo input_info(input_shape, 1, data_type, quantization_info);
+ TensorInfo weights_info(weights_shape, 1, data_type, quantization_info);
+ TensorShape output_shape = compute_deconvolution_output_shape(out_dim, input_info, weights_info);
+ DeconvolutionLayerFixtureBase<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, inner_border, data_type, data_layout, quantization_info);
}
};
diff --git a/tests/validation/fixtures/NormalizePlanarYUVLayerFixture.h b/tests/validation/fixtures/NormalizePlanarYUVLayerFixture.h
index 3bb935e49f..93e4e64830 100644
--- a/tests/validation/fixtures/NormalizePlanarYUVLayerFixture.h
+++ b/tests/validation/fixtures/NormalizePlanarYUVLayerFixture.h
@@ -68,11 +68,10 @@ protected:
}
else if(is_data_type_quantized_asymmetric(_data_type))
{
- const QuantizationInfo quant_info = src_tensor.quantization_info();
- const int min_bound = quant_info.quantize(-1.f, RoundingPolicy::TO_NEAREST_UP);
- const int max_bound = quant_info.quantize(1.f, RoundingPolicy::TO_NEAREST_UP);
- std::uniform_int_distribution<> distribution(min_bound, max_bound);
- std::uniform_int_distribution<> distribution_std(quant_info.quantize(0.1f, RoundingPolicy::TO_NEAREST_UP), max_bound);
+ const QuantizationInfo quant_info = src_tensor.quantization_info();
+ std::pair<int, int> bounds = get_quantized_bounds(quant_info, -1.f, 1.0f);
+ std::uniform_int_distribution<> distribution(bounds.first, bounds.second);
+ std::uniform_int_distribution<> distribution_std(quant_info.quantize(0.1f, RoundingPolicy::TO_NEAREST_UP), bounds.second);
library->fill(src_tensor, distribution, 0);
library->fill(mean_tensor, distribution, 1);
library->fill(std_tensor, distribution_std, 2);
diff --git a/tests/validation/fixtures/ReduceMeanFixture.h b/tests/validation/fixtures/ReduceMeanFixture.h
index 6debd4a038..8692213641 100644
--- a/tests/validation/fixtures/ReduceMeanFixture.h
+++ b/tests/validation/fixtures/ReduceMeanFixture.h
@@ -32,6 +32,7 @@
#include "tests/IAccessor.h"
#include "tests/framework/Asserts.h"
#include "tests/framework/Fixture.h"
+#include "tests/validation/Helpers.h"
#include "tests/validation/reference/ReductionOperation.h"
#include "tests/validation/reference/ReshapeLayer.h"
@@ -63,10 +64,8 @@ protected:
}
else
{
- const QuantizationInfo quant_info = tensor.quantization_info();
- const int min_bound = quant_info.quantize(-1.f, RoundingPolicy::TO_NEAREST_UP);
- const int max_bound = quant_info.quantize(1.f, RoundingPolicy::TO_NEAREST_UP);
- std::uniform_int_distribution<> distribution(min_bound, max_bound);
+ std::pair<int, int> bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f);
+ std::uniform_int_distribution<> distribution(bounds.first, bounds.second);
library->fill(tensor, distribution, 0);
}