aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h2
-rw-r--r--src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp177
-rw-r--r--src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp60
-rw-r--r--tests/validation/CL/DepthwiseConvolutionLayer.cpp31
-rw-r--r--tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp12
-rw-r--r--tests/validation/NEON/DepthwiseConvolutionLayer.cpp25
-rw-r--r--tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h37
-rw-r--r--tests/validation/reference/DepthwiseConvolutionLayer.cpp29
8 files changed, 262 insertions, 111 deletions
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index fe65ac1a43..84d3594426 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -82,6 +82,8 @@ private:
bool _is_quantized;
bool _is_optimized;
bool _are_weights_reshaped;
+ bool _is_nchw;
+ bool _is_first_run;
};
/** Basic function to execute a generic depthwise convolution. This function calls the following NEON kernels:
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 08d8f8ce56..edda2cd9da 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -44,6 +44,7 @@ namespace
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8,
DataType::QS16, DataType::F16,
DataType::QS32, DataType::S32, DataType::F32);
@@ -68,6 +69,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
}
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)));
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
}
else
@@ -79,6 +81,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
if((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
if(is_data_type_fixed_point(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && output->data_type() != DataType::QS8, "Wrong data type for output");
@@ -101,6 +105,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
{
+ ARM_COMPUTE_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+
bool window_changed = false;
unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->data_type());
@@ -138,8 +144,16 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
else
{
- AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
- window_changed = update_window_and_padding(win, input_access, bias_access);
+ if(input->data_layout() == DataLayout::NCHW)
+ {
+ AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
+ window_changed = update_window_and_padding(win, input_access, bias_access);
+ }
+ else
+ {
+ AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration);
+ window_changed = update_window_and_padding(win, input_access, bias_access);
+ }
}
input_access.set_valid_region(win, ValidRegion(Coordinates(), input->tensor_shape()));
@@ -253,6 +267,7 @@ template <typename T1, typename T2, bool in_place, bool has_bias>
void output_stage(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
+ ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
ARM_COMPUTE_UNUSED(result_shift);
ARM_COMPUTE_UNUSED(result_offset_after_shift);
@@ -303,6 +318,66 @@ void output_stage(ITensor *input, const ITensor *bias, const Window &window, ITe
}
}
+template <typename T1, typename T2, bool in_place, bool has_bias>
+void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+ ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
+ ARM_COMPUTE_UNUSED(result_shift);
+ ARM_COMPUTE_UNUSED(result_offset_after_shift);
+
+ Window window_bias = window;
+ window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ window_bias.set(3, Window::Dimension(0, 0, 0));
+
+ Iterator in(input, window);
+ Iterator bi(bias, window_bias);
+
+ if(in_place) // In place accumulate
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<T1 *>(in.ptr());
+ const auto bias_ptr = reinterpret_cast<T2 *>(bi.ptr());
+
+ // Accumulate bias
+ if(has_bias)
+ {
+ internal_vst1q(in_ptr, internal_vqaddq(internal_vld1q(in_ptr), internal_vld1q(bias_ptr)));
+ }
+ else
+ {
+ internal_vst1q(in_ptr, internal_vld1q(in_ptr));
+ }
+ },
+ in, bi);
+ }
+ else // Out of place accumulate
+ {
+ Iterator out(output, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<T1 *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T2 *>(out.ptr());
+ const auto bias_ptr = reinterpret_cast<T2 *>(bi.ptr());
+
+ // Accumulate bias
+ if(has_bias)
+ {
+ internal_vst1q(out_ptr, internal_vqaddq(internal_vld1q(in_ptr), internal_vld1q(bias_ptr)));
+ }
+ else
+ {
+ internal_vst1q(out_ptr, internal_vld1q(in_ptr));
+ }
+ },
+ in, bi);
+ }
+}
+
// QASYMM8 specializations
template <>
void output_stage<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
@@ -415,61 +490,79 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const
INEKernel::configure(win_config.second);
// Set appropriate function
- switch(input->info()->data_type())
+ if(input->info()->data_layout() == DataLayout::NCHW)
{
- case DataType::QS8:
+ switch(input->info()->data_type())
{
- if(bias == nullptr)
+ case DataType::QS8:
{
- _func = (output == nullptr) ? &output_stage<qint8_t, qint8_t, true, false> : &output_stage<qint8_t, qint8_t, false, false>;
+ if(bias == nullptr)
+ {
+ _func = (output == nullptr) ? &output_stage<qint8_t, qint8_t, true, false> : &output_stage<qint8_t, qint8_t, false, false>;
+ }
+ else
+ {
+ _func = (output == nullptr) ? &output_stage<qint8_t, qint8_t, true, true> : &output_stage<qint8_t, qint8_t, false, true>;
+ }
+ break;
}
- else
+ case DataType::QS16:
{
- _func = (output == nullptr) ? &output_stage<qint8_t, qint8_t, true, true> : &output_stage<qint8_t, qint8_t, false, true>;
+ if(bias != nullptr && bias->info()->data_type() == DataType::QS8)
+ {
+ _func = (output == nullptr) ? &output_stage<qint16_t, qint8_t, true, true> : &output_stage<qint16_t, qint8_t, false, true>;
+ }
+ else if(bias == nullptr)
+ {
+ _func = (output == nullptr) ? &output_stage<qint16_t, qint8_t, true, false> : &output_stage<qint16_t, qint8_t, false, false>;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+ break;
}
- break;
- }
- case DataType::QS16:
- {
- if(bias != nullptr && bias->info()->data_type() == DataType::QS8)
+ case DataType::QS32:
{
- _func = (output == nullptr) ? &output_stage<qint16_t, qint8_t, true, true> : &output_stage<qint16_t, qint8_t, false, true>;
+ _func = (output == nullptr) ? &output_stage<qint32_t, qint16_t, true, true> : &output_stage<qint32_t, qint16_t, false, true>;
+ break;
}
- else if(bias == nullptr)
+ case DataType::S32:
{
- _func = (output == nullptr) ? &output_stage<qint16_t, qint8_t, true, false> : &output_stage<qint16_t, qint8_t, false, false>;
+ _func = (bias == nullptr) ? &output_stage<int32_t, uint8_t, false, false> : &output_stage<int32_t, uint8_t, false, true>;
+ break;
}
- else
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
{
- ARM_COMPUTE_ERROR("Not implemented");
+ _func = (output == nullptr) ? &output_stage<float16_t, float16_t, true, true> : &output_stage<float16_t, float16_t, false, true>;
+ break;
}
- break;
- }
- case DataType::QS32:
- {
- _func = (output == nullptr) ? &output_stage<qint32_t, qint16_t, true, true> : &output_stage<qint32_t, qint16_t, false, true>;
- break;
- }
- case DataType::S32:
- {
- _func = (bias == nullptr) ? &output_stage<int32_t, uint8_t, false, false> : &output_stage<int32_t, uint8_t, false, true>;
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- _func = (output == nullptr) ? &output_stage<float16_t, float16_t, true, true> : &output_stage<float16_t, float16_t, false, true>;
- break;
- }
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- {
- _func = (output == nullptr) ? &output_stage<float, float, true, true> : &output_stage<float, float, false, true>;
- break;
+ case DataType::F32:
+ {
+ _func = (output == nullptr) ? &output_stage<float, float, true, true> : &output_stage<float, float, false, true>;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ }
}
- default:
+ }
+ else
+ {
+ switch(input->info()->data_type())
{
- ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ case DataType::F32:
+ {
+ _func = (output == nullptr) ? &output_stage_nhwc<float, float, true, true> : &output_stage_nhwc<float, float, false, true>;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ }
}
}
}
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index f28ed715f6..8691fb9f76 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -37,7 +37,7 @@ using namespace arm_compute::misc::shape_calculator;
NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
: _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _accumulator(), _input_nhwc(), _weights_hwio(), _output_nhwc(), _has_bias(false),
- _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false)
+ _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true)
{
}
@@ -52,30 +52,38 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we
_has_bias = biases != nullptr;
_is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
conv_info,
- input->info()->data_type());
+ input->info()->data_type(),
+ input->info()->data_layout());
_are_weights_reshaped = false;
+ _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
+
+ ARM_COMPUTE_ERROR_ON(!_is_optimized && !_is_nchw);
if(_is_optimized)
{
- // Configure the function to transform the input tensor from NCHW -> NHWC
- _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
-
- // Configure the function to transform the weights tensor from IHW -> HWI
- _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
+ if(_is_nchw)
+ {
+ // Configure the function to transform the input tensor from NCHW -> NHWC
+ _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
- // Configure optimized depthwise
- _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, DataLayout::NHWC);
+ // Configure the function to transform the weights tensor from IHW -> HWI
+ _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
- // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
- _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U));
+ // Configure optimized depthwise
+ _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, DataLayout::NHWC);
- // Allocate tensors
- _input_nhwc.allocator()->allocate();
- _weights_hwio.allocator()->allocate();
- _output_nhwc.allocator()->allocate();
+ // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+ _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U));
- // Create convolver (deferred)
- _dwc_kernel.generate_convolver();
+ // Allocate tensors
+ _input_nhwc.allocator()->allocate();
+ _weights_hwio.allocator()->allocate();
+ _output_nhwc.allocator()->allocate();
+ }
+ else
+ {
+ _dwc_kernel.configure(input, weights, output, conv_info, DataLayout::NHWC);
+ }
}
else
{
@@ -116,8 +124,15 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we
void NEDepthwiseConvolutionLayer3x3::run()
{
+ if(_is_first_run && _is_optimized)
+ {
+ _is_first_run = false;
+ // Create convolver (deferred)
+ _dwc_kernel.generate_convolver();
+ }
+
// Permute weights in HWIO format if the optimized kernel will be executed
- if(!_are_weights_reshaped && _is_optimized)
+ if(!_are_weights_reshaped && _is_optimized && _is_nchw)
{
_are_weights_reshaped = true;
_permute_weights.run();
@@ -126,8 +141,11 @@ void NEDepthwiseConvolutionLayer3x3::run()
// Handle input
if(_is_optimized)
{
- // Permute input to NHWC format execution
- _permute_input.run();
+ if(_is_nchw)
+ {
+ // Permute input to NHWC format execution
+ _permute_input.run();
+ }
}
else
{
@@ -139,7 +157,7 @@ void NEDepthwiseConvolutionLayer3x3::run()
NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
// Permute output to ACL's native NCHW format in case of NHWC execution
- if(_is_optimized)
+ if(_is_optimized && _is_nchw)
{
_permute_output.run();
}
diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
index 8ac882cc60..1779ff1aee 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
@@ -54,14 +54,17 @@ template <typename T>
using CLDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<CLTensor, CLAccessor, CLDepthwiseConvolutionLayer, T>;
TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallDepthwiseConvolutionLayerDataset(), framework::dataset::make("DataType",
- DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+ framework::dataset::make("DataType",
+ DataType::F32)),
+ framework::dataset::make("DataLayout", DataLayout::NCHW)))
{
validate(CLAccessor(_target), _reference, tolerance_f32);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
framework::dataset::make("DataType",
- DataType::F32)))
+ DataType::F32)),
+ framework::dataset::make("DataLayout", DataLayout::NCHW)))
{
validate(CLAccessor(_target), _reference, tolerance_f32);
}
@@ -73,15 +76,17 @@ using CLDepthwiseConvolutionLayerFixture3x3 = DepthwiseConvolutionLayerValidatio
TEST_SUITE(Float)
TEST_SUITE(F16)
TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
framework::dataset::make("DataType",
- DataType::F16)))
+ DataType::F16)),
+ framework::dataset::make("DataLayout", DataLayout::NCHW)))
{
validate(CLAccessor(_target), _reference, tolerance_f16);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
framework::dataset::make("DataType",
- DataType::F16)))
+ DataType::F16)),
+ framework::dataset::make("DataLayout", DataLayout::NCHW)))
{
validate(CLAccessor(_target), _reference, tolerance_f16);
}
@@ -90,15 +95,17 @@ TEST_SUITE_END()
TEST_SUITE(FP32)
TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
framework::dataset::make("DataType",
- DataType::F32)))
+ DataType::F32)),
+ framework::dataset::make("DataLayout", DataLayout::NCHW)))
{
validate(CLAccessor(_target), _reference, tolerance_f32);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
framework::dataset::make("DataType",
- DataType::F32)))
+ DataType::F32)),
+ framework::dataset::make("DataLayout", DataLayout::NCHW)))
{
validate(CLAccessor(_target), _reference, tolerance_f32);
}
diff --git a/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp b/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp
index cacf6962ee..2baa93e413 100644
--- a/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,15 +55,17 @@ using GCDepthwiseConvolutionLayerFixture3x3 = DepthwiseConvolutionLayerValidatio
TEST_SUITE(Float)
TEST_SUITE(FP16)
TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, GCDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunSmall, GCDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
framework::dataset::make("DataType",
- DataType::F16)))
+ DataType::F16)),
+ framework::dataset::make("DataLayout", DataLayout::NCHW)))
{
validate(GCAccessor(_target), _reference, tolerance_fp16, tolerance_num);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, GCDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunLarge, GCDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
framework::dataset::make("DataType",
- DataType::F16)))
+ DataType::F16)),
+ framework::dataset::make("DataLayout", DataLayout::NCHW)))
{
validate(GCAccessor(_target), _reference, tolerance_fp16, tolerance_num);
}
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
index 0cdd4c0296..49e146c084 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
@@ -95,15 +95,17 @@ TEST_SUITE(F32)
TEST_SUITE(Generic)
template <typename T>
using NEDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
framework::dataset::make("DataType",
- DataType::F32)))
+ DataType::F32)),
+ framework::dataset::make("DataLayout", DataLayout::NCHW)))
{
validate(Accessor(_target), _reference, tolerance_f32);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
framework::dataset::make("DataType",
- DataType::F32)))
+ DataType::F32)),
+ framework::dataset::make("DataLayout", DataLayout::NCHW)))
{
validate(Accessor(_target), _reference, tolerance_f32);
}
@@ -112,21 +114,24 @@ TEST_SUITE_END()
TEST_SUITE(W3x3)
template <typename T>
using NEDepthwiseConvolutionLayerFixture3x3 = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer3x3, T>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
framework::dataset::make("DataType",
- DataType::F32)))
+ DataType::F32)),
+ framework::dataset::make("DataLayout", DataLayout::NCHW)))
{
validate(Accessor(_target), _reference, tolerance_f32);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
framework::dataset::make("DataType",
- DataType::F32)))
+ DataType::F32)),
+ framework::dataset::make("DataLayout", DataLayout::NCHW)))
{
validate(Accessor(_target), _reference, tolerance_f32);
}
-FIXTURE_DATA_TEST_CASE(RunOptimized, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(datasets::OptimizedDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunOptimized, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(combine(datasets::OptimizedDepthwiseConvolutionLayerDataset3x3(),
framework::dataset::make("DataType",
- DataType::F32)))
+ DataType::F32)),
+ framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
{
validate(Accessor(_target), _reference, tolerance_f32);
}
diff --git a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
index df5436fcf7..ccdd443999 100644
--- a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
@@ -52,15 +52,22 @@ public:
public:
template <typename...>
- void setup(TensorShape in_shape, TensorShape weights_shape, TensorShape out_shape, PadStrideInfo pad_stride_info, DataType data_type, QuantizationInfo quantization_info)
+ void setup(TensorShape in_shape, TensorShape weights_shape, TensorShape out_shape, PadStrideInfo pad_stride_info, DataType data_type, QuantizationInfo quantization_info, DataLayout data_layout)
{
_quantization_info = quantization_info;
_data_type = data_type;
const TensorShape biases_shape(weights_shape[2]);
const DataType bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
- _target = compute_target(in_shape, weights_shape, biases_shape, out_shape, pad_stride_info, data_type, bias_data_type, quantization_info);
- _reference = compute_reference(in_shape, weights_shape, biases_shape, out_shape, pad_stride_info, data_type, bias_data_type, quantization_info);
+ if(data_layout == DataLayout::NHWC)
+ {
+ permute(in_shape, PermutationVector(2U, 0U, 1U));
+ permute(weights_shape, PermutationVector(2U, 0U, 1U));
+ permute(out_shape, PermutationVector(2U, 0U, 1U));
+ }
+
+ _target = compute_target(in_shape, weights_shape, biases_shape, out_shape, pad_stride_info, data_type, bias_data_type, quantization_info, data_layout);
+ _reference = compute_reference(in_shape, weights_shape, biases_shape, out_shape, pad_stride_info, data_type, bias_data_type, quantization_info, data_layout);
}
protected:
@@ -94,13 +101,13 @@ protected:
}
TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &biases_shape, const TensorShape &output_shape, PadStrideInfo &pad_stride_info,
- const DataType data_type, const DataType bias_data_type, const QuantizationInfo quantization_info)
+ const DataType data_type, const DataType bias_data_type, const QuantizationInfo quantization_info, const DataLayout data_layout)
{
// Create tensors
- TensorType src = create_tensor<TensorType>(input_shape, data_type, 1, 0, quantization_info);
- TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1, 0, quantization_info);
- TensorType biases = create_tensor<TensorType>(biases_shape, bias_data_type, 1, 0, quantization_info);
- TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1, 0, quantization_info);
+ TensorType src = create_tensor<TensorType>(input_shape, data_type, 1, 0, quantization_info, data_layout);
+ TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1, 0, quantization_info, data_layout);
+ TensorType biases = create_tensor<TensorType>(biases_shape, bias_data_type, 1, 0, quantization_info, data_layout);
+ TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1, 0, quantization_info, data_layout);
// Create Depthwise Convolution configure function
FunctionType dwc;
@@ -134,11 +141,11 @@ protected:
}
SimpleTensor<T> compute_reference(const TensorShape &in_shape, const TensorShape &weights_shape, const TensorShape &biases_shape, const TensorShape &out_shape, const PadStrideInfo &pad_stride_info,
- const DataType data_type, const DataType bias_data_type, QuantizationInfo quantization_info)
+ const DataType data_type, const DataType bias_data_type, const QuantizationInfo quantization_info, const DataLayout data_layout)
{
- SimpleTensor<T> src{ in_shape, data_type, 1, 0, quantization_info };
- SimpleTensor<T> weights{ weights_shape, data_type, 1, 0, quantization_info };
- SimpleTensor<TBias> biases{ biases_shape, bias_data_type, 1, 0, quantization_info };
+ SimpleTensor<T> src{ in_shape, data_type, 1, 0, quantization_info, data_layout };
+ SimpleTensor<T> weights{ weights_shape, data_type, 1, 0, quantization_info, data_layout };
+ SimpleTensor<TBias> biases{ biases_shape, bias_data_type, 1, 0, quantization_info, data_layout };
fill(src, 0);
fill(weights, 1);
@@ -158,10 +165,10 @@ class DepthwiseConvolutionLayerValidationFixture : public DepthwiseConvolutionLa
{
public:
template <typename...>
- void setup(TensorShape in_shape, TensorShape weights_shape, TensorShape out_shape, PadStrideInfo pad_stride_info, DataType data_type)
+ void setup(TensorShape in_shape, TensorShape weights_shape, TensorShape out_shape, PadStrideInfo pad_stride_info, DataType data_type, DataLayout data_layout)
{
DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(in_shape, weights_shape, out_shape, pad_stride_info,
- data_type, QuantizationInfo());
+ data_type, QuantizationInfo(), data_layout);
}
};
@@ -173,7 +180,7 @@ public:
void setup(TensorShape in_shape, TensorShape weights_shape, TensorShape out_shape, PadStrideInfo pad_stride_info, DataType data_type, QuantizationInfo quantization_info)
{
DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(in_shape, weights_shape, out_shape, pad_stride_info,
- data_type, quantization_info);
+ data_type, quantization_info, DataLayout::NCHW);
}
};
} // namespace validation
diff --git a/tests/validation/reference/DepthwiseConvolutionLayer.cpp b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
index b2a7067709..ab61b7dd65 100644
--- a/tests/validation/reference/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
@@ -24,6 +24,7 @@
#include "DepthwiseConvolutionLayer.h"
#include "ConvolutionLayer.h"
+#include "Permute.h"
#include "Utils.h"
#include "tests/validation/FixedPoint.h"
@@ -50,11 +51,8 @@ namespace reference
*
*/
template <typename T, typename TB>
-SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info)
+void depthwise_convolution_nchw(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, SimpleTensor<T> &dst, const PadStrideInfo &conv_info)
{
- // Create reference
- SimpleTensor<T> dst{ dst_shape, src.data_type(), 1, src.fixed_point_position() };
-
// Compute reference
const int filter_width = weights.shape().x();
const int filter_height = weights.shape().y();
@@ -108,8 +106,6 @@ SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTe
}
}
}
-
- return dst;
}
template <>
@@ -195,6 +191,27 @@ SimpleTensor<uint8_t> depthwise_convolution(const SimpleTensor<uint8_t> &src, co
return dst;
}
+template <typename T, typename TB>
+SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info)
+{
+ SimpleTensor<T> dst{ dst_shape, src.data_type(), 1, src.fixed_point_position() };
+
+ if(src.data_layout() == DataLayout::NHWC && src.data_type() == DataType::F32)
+ {
+ SimpleTensor<T> src_nchw = reference::permute<T>(src, PermutationVector(1U, 2U, 0U));
+ SimpleTensor<T> weights_nchw = reference::permute<T>(weights, PermutationVector(1U, 2U, 0U));
+ SimpleTensor<T> dst_nchw = reference::permute<T>(dst, PermutationVector(1U, 2U, 0U));
+
+ depthwise_convolution_nchw<T, TB>(src_nchw, weights_nchw, biases, dst_nchw, conv_info);
+
+ return reference::permute<T>(dst_nchw, PermutationVector(2U, 0U, 1U));
+ }
+
+ depthwise_convolution_nchw<T, TB>(src, weights, biases, dst, conv_info);
+
+ return dst;
+}
+
template SimpleTensor<float> depthwise_convolution(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &biases, const TensorShape &dst_shape,
const PadStrideInfo &conv_info);