From 30271c779c36a2abe6995c4454674d92bbc1f91f Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Mon, 24 Jun 2019 14:56:34 +0100
Subject: COMPMID-2156: Optimized dilated convolution for NEON.

Change-Id: I3a8abe8cc9637c8983d9bd69dcbaee1a15eac8d0
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1492
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez <pablo.tello@arm.com>
---
 SConscript                                         |   7 +-
 .../kernels/convolution/depthwise/depthwise.hpp    |  41 +-
 .../convolution/depthwise/depthwise_dilated.hpp    | 156 ++++
 .../convolution/depthwise/depthwise_quantized.hpp  |  38 +-
 .../depthwise/depthwise_quantized_dilated.hpp      |  88 ++
 .../kernels/convolution/depthwise/impl_base.hpp    |  32 +-
 .../kernels/convolution/depthwise/impl_dilated.hpp | 295 ++++++
 arm_compute/graph/backends/FunctionHelpers.h       |   2 +-
 .../NEON/functions/NEDepthwiseConvolutionLayer.h   | 119 +++
 .../NEDepthwiseConvolutionAssemblyDispatch.h       |  39 +-
 docs/00_introduction.dox                           |   1 +
 .../NEON/kernels/convolution/common/padding.cpp    |   4 +-
 .../NEON/kernels/convolution/common/qasymm8.cpp    |   2 +-
 .../convolution/depthwise/depthwise_dilated.cpp    |  32 +
 .../depthwise/depthwise_dilated_qa8_qa8.cpp        | 142 +++
 .../depthwise/depthwise_pack_parameters.cpp        |   2 +-
 .../convolution/depthwise/impl_fp16_fp16.hpp       |  29 +-
 .../convolution/depthwise/impl_fp32_fp32.hpp       |  28 +-
 .../kernels/convolution/depthwise/impl_qa8_qa8.hpp | 997 ++++++++++++++++-----
 src/graph/backends/CL/CLFunctionsFactory.cpp       |   4 +-
 src/graph/backends/NEON/NEFunctionFactory.cpp      |   4 +-
 .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 327 +++++++
 .../NEDepthwiseConvolutionAssemblyDispatch.cpp     | 127 +--
 tests/datasets/DepthwiseConvolutionLayerDataset.h  |   1 +
 .../validation/NEON/DepthwiseConvolutionLayer.cpp  |   8 +-
 25 files changed, 2192 insertions(+), 333 deletions(-)
 create mode 100644 arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
 create mode 100644 arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
 create mode 100644 arm_compute/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp
 create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp
 create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp

diff --git a/SConscript b/SConscript
index 45c4ccc414..a170a4a7c1 100644
--- a/SConscript
+++ b/SConscript
@@ -205,10 +205,13 @@ if env['neon']:
 
     core_files += Glob('src/core/NEON/kernels/arm_gemm/*.cpp')
 
-    # build winograd sources for either v7a / v8a
+    # build winograd/depthwise sources for either v7a / v8a
     core_files += Glob('src/core/NEON/kernels/convolution/*/*.cpp')
     core_files += Glob('src/core/NEON/kernels/convolution/winograd/*/*.cpp')
-    arm_compute_env.Append(CPPPATH = ["arm_compute/core/NEON/kernels/convolution/winograd/","arm_compute/core/NEON/kernels/convolution/common/" , "arm_compute/core/NEON/kernels/assembly/"])
+    arm_compute_env.Append(CPPPATH = ["arm_compute/core/NEON/kernels/convolution/common/",
+                                      "arm_compute/core/NEON/kernels/convolution/winograd/",
+                                      "arm_compute/core/NEON/kernels/convolution/depthwise/",
+                                      "arm_compute/core/NEON/kernels/assembly/"])
 
     graph_files += Glob('src/graph/backends/NEON/*.cpp')
 
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
index e0cb616a3d..a4a833d90a 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
@@ -25,8 +25,8 @@
 #pragma once
 
 #include <arm_neon.h>
-#include "arm_compute/core/NEON/kernels/convolution/common/activation.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"
+#include "activation.hpp"
+#include "padding.hpp"
 
 namespace depthwise
 {
@@ -127,6 +127,23 @@ class DepthwiseConvolutionBase : public IDepthwiseConvolution
       unsigned int padding_right
     );
 
+    /** Create a new depthwise convolution engine.
+     *
+     * @param[in] n_batches Number of batches in the input and output tensors.
+     * @param[in] n_input_rows Number of rows in input tensor.
+     * @param[in] n_input_cols Number of columns in input tensor.
+     * @param[in] n_channels Number of channels in input and output tensors.
+     */
+    DepthwiseConvolutionBase(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int n_output_rows, int n_output_cols,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
     // Cannot copy or move a DepthwiseConvolution.
     DepthwiseConvolutionBase(DepthwiseConvolutionBase&) = delete;
     DepthwiseConvolutionBase operator=(DepthwiseConvolutionBase&) = delete;
@@ -417,6 +434,16 @@ class DepthwiseConvolution<
       unsigned int padding_right
     );
 
+    DepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int n_output_rows, int n_output_cols,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
   protected:
     template <nck::ActivationFunction Activation>
     void execute_tile(
@@ -488,6 +515,16 @@ class DepthwiseConvolution<
       unsigned int padding_right
     );
 
+    DepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int n_output_rows, int n_output_cols,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
   protected:
     template <nck::ActivationFunction Activation>
     void execute_tile(
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
new file mode 100644
index 0000000000..e0d7f0c7f1
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <deque>
+#include <functional>
+#include <memory>
+
+#include "depthwise.hpp"
+
+namespace depthwise
+{
+
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols,
+  typename TIn, typename TBias, typename TOut
+>
+class DilatedDepthwiseConvolution : public IDepthwiseConvolution
+{
+  public:
+    /** Create a new dilated depthwise convolution engine.
+     */
+    DilatedDepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int dilation_factor,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+    /** Create a new dilated depthwise convolution engine.
+     */
+    DilatedDepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int dilation_factor, int n_output_rows, int n_output_cols,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+    // Cannot copy or move a DilatedDepthwiseConvolution.
+    DilatedDepthwiseConvolution(DilatedDepthwiseConvolution&) = delete;
+    DilatedDepthwiseConvolution operator=(DilatedDepthwiseConvolution&) = delete;
+
+    /* Set input tensor and stride. */
+    void set_input(const void *inptr) override;
+    void set_input(const void *inptr, int column_stride) override;
+    void set_input(const void *inptr, int row_stride, int column_stride) override;
+    void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
+
+    /* Set output tensor and stride. */
+    void set_output(void *outptr) override;
+    void set_output(void *outptr, int column_stride) override;
+    void set_output(void *outptr, int row_stride, int column_stride) override;
+    void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
+
+    static int get_output_size(
+      int dim_size,
+      unsigned int padding_before,
+      unsigned int padding_after,
+      int dilation_factor
+    );
+
+    int output_size(
+      int dim_size, unsigned int padding_before, unsigned int padding_after
+    ) const override;
+
+    /* Weights and biases are re-ordered to improve memory access patterns. Use
+     * these methods to determine the size of the re-pack buffer and to set the
+     * address (and implicitly reorder the weights and biases into) the buffer.
+     */
+    size_t get_packed_params_size(void) const override;
+    void set_packed_params_buffer(void *) override;
+
+    void pack_params(const void *weights, const void *biases=nullptr) const override;
+    void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const override;
+    void pack_params(
+      void *buffer,
+      const void* weights,
+      unsigned int weight_row_stride,
+      unsigned int weight_col_stride,
+      const void *biases=nullptr
+    ) const override;
+
+    /* Working space is used to pad tensors on the fly. Before running any
+     * inference check the amount of space required, allocate and provide a
+     * pointer to the convolution engine.
+     */
+    size_t get_working_space_size(unsigned int nthreads=1) const override;
+    void set_working_space(void *) override;
+
+    unsigned int get_window(void) const override;
+    void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
+
+  protected:
+    /** Protected constructor which also accepts a function to construct a new
+     * subconvolution
+     */
+    DilatedDepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int dilation_factor, int n_output_rows, int n_output_cols,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right,
+      std::function<IDepthwiseConvolution *(int, int, int, int, int, int, nck::ActivationFunction, unsigned int, unsigned int, unsigned int, unsigned int)> subconvfn
+    );
+
+    const int _dilation_factor;
+    const int _n_input_rows, _n_input_cols, _n_channels;
+    const int _padding_top, _padding_left;
+    const int _n_output_rows, _n_output_cols;
+
+    /* Dilated depthwise convolution is performed through repeated calls to
+     * non-dilated convolutions. If the dilation factor is $n$, then we perform
+     * $n^2$ depthwise convolutions (an $n \times n$ grid, stored in `_convs`).
+     */
+    using BaseDepthwise = DepthwiseConvolution<
+      OutputTileRows, OutputTileCols,
+      KernelRows, KernelCols,
+      StrideRows, StrideCols,
+      TIn, TBias, TOut
+    >;
+    std::deque<std::deque<std::unique_ptr<IDepthwiseConvolution>>> _convs;
+};
+
+}  // namespace depthwise
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
index e34023faf1..b65ced6f35 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,8 +23,8 @@
  */
 
 #pragma once
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp"
+#include "depthwise.hpp"
+#include "qasymm8.hpp"
 
 namespace depthwise
 {
@@ -70,6 +70,33 @@ class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase<
 
     QAsymm8DepthwiseConvolution(
       int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int n_output_rows, int n_output_cols,
+      nck::ActivationFunction activation,
+      const qasymm8::QAsymm8Params& weight_quantisation,
+      const qasymm8::QAsymm8Params& input_quantisation,
+      const qasymm8::QAsymm8Params& output_quantisation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+    QAsymm8DepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      nck::ActivationFunction activation,
+      const qasymm8::QAsymm8Params& weight_quantisation,
+      const qasymm8::QAsymm8Params& input_quantisation,
+      const qasymm8::QAsymm8Params& output_quantisation,
+      const qasymm8::QAsymm8RescaleParams& rescale_parameters,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+    QAsymm8DepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int n_output_rows, int n_output_cols,
       nck::ActivationFunction activation,
       const qasymm8::QAsymm8Params& weight_quantisation,
       const qasymm8::QAsymm8Params& input_quantisation,
@@ -82,6 +109,11 @@ class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase<
     );
 
   protected:
+    static nck::ActivationFunction get_activation_fn(
+      nck::ActivationFunction activation,
+      const qasymm8::QAsymm8Params& output_quantisation
+    );
+
     uint8_t _input_padding_value(void) const;
 
     void _pack_params(
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
new file mode 100644
index 0000000000..cf1c6f581f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+#include "depthwise_dilated.hpp"
+#include "depthwise_quantized.hpp"
+
+namespace depthwise {
+
+template <unsigned int OutputTileRows, unsigned int OutputTileCols,
+          unsigned int KernelRows, unsigned int KernelCols,
+          unsigned int StrideRows, unsigned int StrideCols>
+class QAsymm8DilatedDepthwiseConvolution
+    : public DilatedDepthwiseConvolution<
+          OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
+          StrideCols, uint8_t, int32_t, uint8_t> {
+public:
+  /** Create a new dilated depthwise convolution engine.
+   */
+  QAsymm8DilatedDepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int dilation_factor, nck::ActivationFunction activation,
+      const qasymm8::QAsymm8Params &weight_quantisation,
+      const qasymm8::QAsymm8Params &input_quantisation,
+      const qasymm8::QAsymm8Params &output_quantisation,
+      unsigned int padding_top, unsigned int padding_left,
+      unsigned int padding_bottom, unsigned int padding_right);
+
+  /** Create a new dilated depthwise convolution engine.
+   */
+  QAsymm8DilatedDepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int dilation_factor, int n_output_rows, int n_output_cols,
+      nck::ActivationFunction activation,
+      const qasymm8::QAsymm8Params &weight_quantisation,
+      const qasymm8::QAsymm8Params &input_quantisation,
+      const qasymm8::QAsymm8Params &output_quantisation,
+      unsigned int padding_top, unsigned int padding_left,
+      unsigned int padding_bottom, unsigned int padding_right);
+
+  /** Create a new dilated depthwise convolution engine.
+   */
+  QAsymm8DilatedDepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int dilation_factor, nck::ActivationFunction activation,
+      const qasymm8::QAsymm8Params &weight_quantisation,
+      const qasymm8::QAsymm8Params &input_quantisation,
+      const qasymm8::QAsymm8Params &output_quantisation,
+      const qasymm8::QAsymm8RescaleParams &rescale_parameters,
+      unsigned int padding_top, unsigned int padding_left,
+      unsigned int padding_bottom, unsigned int padding_right);
+
+  /** Create a new dilated depthwise convolution engine.
+   */
+  QAsymm8DilatedDepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int dilation_factor, int n_output_rows, int n_output_cols,
+      nck::ActivationFunction activation,
+      const qasymm8::QAsymm8Params &weight_quantisation,
+      const qasymm8::QAsymm8Params &input_quantisation,
+      const qasymm8::QAsymm8Params &output_quantisation,
+      const qasymm8::QAsymm8RescaleParams& rescale_parameters,
+      unsigned int padding_top, unsigned int padding_left,
+      unsigned int padding_bottom, unsigned int padding_right);
+};
+
+}  // namespace depthwise
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
index 493b2991dc..b102a24250 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
@@ -32,9 +32,9 @@
 
 #include <algorithm>
 #include <cstdint>
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"
+#include "depthwise.hpp"
+#include "padding.hpp"
+#include "utils.hpp"
 
 #pragma once
 
@@ -95,6 +95,28 @@ MEMBERFN()::DepthwiseConvolutionBase(
   const unsigned int padding_left,
   const unsigned int padding_bottom,
   const unsigned int padding_right
+) : DepthwiseConvolutionBase(
+      n_batches, n_input_rows, n_input_cols, n_channels,
+      get_output_size(n_input_rows, padding_top, padding_bottom),
+      get_output_size(n_input_cols, padding_left, padding_right),
+      activation,
+      padding_top, padding_left, padding_bottom, padding_right
+    )
+{
+}
+
+MEMBERFN()::DepthwiseConvolutionBase(
+  const int n_batches,
+  const int n_input_rows,
+  const int n_input_cols,
+  const int n_channels,
+  const int n_output_rows,
+  const int n_output_cols,
+  ActivationFunction activation,
+  const unsigned int padding_top,
+  const unsigned int padding_left,
+  const unsigned int padding_bottom,
+  const unsigned int padding_right
 ) : _input(nullptr), _output(nullptr),
     _packed_parameters(nullptr),
     _working_space(nullptr),
@@ -102,8 +124,8 @@ MEMBERFN()::DepthwiseConvolutionBase(
     _n_input_rows(n_input_rows),
     _n_input_cols(n_input_cols),
     _n_channels(n_channels),
-    _n_output_rows(get_output_size(n_input_rows, padding_top, padding_bottom)),
-    _n_output_cols(get_output_size(n_input_cols, padding_left, padding_right)),
+    _n_output_rows(n_output_rows),
+    _n_output_cols(n_output_cols),
     _n_tile_rows(iceildiv(_n_output_rows, output_tile_rows)),
     _n_tile_cols(iceildiv(_n_output_cols, output_tile_cols)),
     _padding_top(padding_top),
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp
new file mode 100644
index 0000000000..2ef965ba4b
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "depthwise_dilated.hpp"
+#include "utils.hpp"
+
+#define MEMBERFN(TOUT)                                                         \
+  template <unsigned int OutputTileRows, unsigned int OutputTileColumns,       \
+            unsigned int KernelRows, unsigned int KernelColumns,               \
+            unsigned int StrideRows, unsigned int StrideColumns, typename TIn, \
+            typename TBias, typename TOut>                                     \
+  TOUT DilatedDepthwiseConvolution<OutputTileRows, OutputTileColumns,          \
+                                   KernelRows, KernelColumns, StrideRows,      \
+                                   StrideColumns, TIn, TBias, TOut>
+
+namespace depthwise {
+
+MEMBERFN()
+::DilatedDepthwiseConvolution(const int n_batches, const int n_input_rows,
+                              const int n_input_cols, const int n_channels,
+                              const int dilation_factor,
+                              nck::ActivationFunction activation,
+                              const unsigned int padding_top,
+                              const unsigned int padding_left,
+                              const unsigned int padding_bottom,
+                              const unsigned int padding_right)
+    : DilatedDepthwiseConvolution(
+          n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
+          DilatedDepthwiseConvolution::get_output_size(
+              n_input_rows, padding_top, padding_bottom, dilation_factor),
+          DilatedDepthwiseConvolution::get_output_size(
+              n_input_cols, padding_left, padding_right, dilation_factor),
+          activation, padding_top, padding_left, padding_bottom,
+          padding_right) {}
+
+MEMBERFN()
+::DilatedDepthwiseConvolution(const int n_batches, const int n_input_rows,
+                              const int n_input_cols, const int n_channels,
+                              const int dilation_factor,
+                              const int n_output_rows, const int n_output_cols,
+                              nck::ActivationFunction activation,
+                              const unsigned int padding_top,
+                              const unsigned int padding_left,
+                              const unsigned int, // padding_bottom
+                              const unsigned int  // padding_right
+                              )
+    : DilatedDepthwiseConvolution(
+          n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
+          n_output_rows, n_output_cols, activation, padding_top, padding_left,
+          0, 0,
+          // Function which creates a new (standard) depthwise convolution
+          [](const int n_batches, const int n_input_rows,
+             const int n_input_cols, const int n_channels,
+             const int n_output_rows, const int n_output_cols,
+             const nck::ActivationFunction activation,
+             const unsigned int padding_top, const unsigned int padding_left,
+             const unsigned int padding_bottom,
+             const unsigned int padding_right) -> IDepthwiseConvolution * {
+            return new DepthwiseConvolution<
+                OutputTileRows, OutputTileColumns, KernelRows, KernelColumns,
+                StrideRows, StrideColumns, TIn, TBias, TOut>(
+                n_batches, n_input_rows, n_input_cols, n_channels,
+                n_output_rows, n_output_cols, activation, padding_top,
+                padding_left, padding_bottom, padding_right);
+          }) {}
+
+MEMBERFN()
+::DilatedDepthwiseConvolution(
+    const int n_batches, const int n_input_rows, const int n_input_cols,
+    const int n_channels, const int dilation_factor, const int n_output_rows,
+    const int n_output_cols, nck::ActivationFunction activation,
+    const unsigned int padding_top, const unsigned int padding_left,
+    const unsigned int, // padding_bottom (unused: output size is given)
+    const unsigned int, // padding_right (unused: output size is given)
+    std::function<IDepthwiseConvolution *(
+        int, int, int, int, int, int, nck::ActivationFunction, unsigned int,
+        unsigned int, unsigned int, unsigned int)>
+        subconvfn // Factory returning a new'd (non-dilated) sub-convolution
+    )
+    : _dilation_factor(dilation_factor), _n_input_rows(n_input_rows),
+      _n_input_cols(n_input_cols), _n_channels(n_channels),
+      _padding_top(static_cast<int>(padding_top)),
+      _padding_left(static_cast<int>(padding_left)),
+      _n_output_rows(n_output_rows), _n_output_cols(n_output_cols),
+      _convs(_dilation_factor) {
+  // Build the dilation_factor x dilation_factor grid of base convolutions
+  for (int i = 0; i < _dilation_factor; i++) {
+    // Row properties. NOTE: the _n_* locals below shadow the class members.
+    const int row_top =
+        i * StrideRows - _padding_top; // -ve values are in the padding
+    const int row_pad_top =
+        row_top < 0 ? iceildiv(-row_top, dilation_factor) : 0;
+
+    const int _n_input_rows = iceildiv(n_input_rows - i, dilation_factor);
+    const int _n_output_rows = iceildiv(n_output_rows - i, dilation_factor);
+
+    for (int j = 0; j < _dilation_factor; j++) {
+      // Column properties (the _n_*_cols locals also shadow class members)
+      const int col_left =
+          j * StrideColumns - padding_left; // -ve values are in the padding
+      const int col_pad_left =
+          col_left < 0 ? iceildiv(-col_left, dilation_factor) : 0;
+
+      const int _n_input_cols = iceildiv(n_input_cols - j, dilation_factor);
+      const int _n_output_cols = iceildiv(n_output_cols - j, dilation_factor);
+
+      // Create a new depthwise convolution engine and append it to row i of
+      // the grid. The engine comes from the delegate function we received as
+      // an argument; the unique_ptr element takes ownership of the pointer.
+      _convs[i].emplace_back(subconvfn(
+          n_batches, _n_input_rows, _n_input_cols, n_channels, _n_output_rows,
+          _n_output_cols, activation,
+          // Note: since we have computed the output tensor size we don't need
+          // to explicitly provide bottom and right padding values to the
+          // depthwise convolution.
+          row_pad_top, col_pad_left, 0, 0));
+    }
+  }
+}
+
+MEMBERFN(void)::set_input(const void *const inptr) {
+  set_input(inptr, _n_channels);
+}
+
+MEMBERFN(void)::set_input(const void *const inptr, const int ldcol) {
+  set_input(inptr, _n_input_cols * ldcol, ldcol);
+}
+
+MEMBERFN(void)
+::set_input(const void *const inptr, const int ldrow, const int ldcol) {
+  set_input(inptr, _n_input_rows * ldrow, ldrow, ldcol);
+}
+
+MEMBERFN(void)
+::set_input(const void *const inptr, const int ldbatch, const int ldrow,
+            const int ldcol) {
+  // Compute dilated strides
+  const int ldrow_dilated = ldrow * _dilation_factor;
+  const int ldcol_dilated = ldcol * _dilation_factor;
+
+  // Pass input parameters on to base convolutions
+  for (int i = 0; i < _dilation_factor; i++) {
+    const int top_pos =
+        i * StrideRows - _padding_top +
+        ((static_cast<int>(i * StrideRows) < _padding_top)
+             ? iceildiv(_padding_top - i * StrideRows, _dilation_factor) *
+                   _dilation_factor
+             : 0);
+    const TIn *const inptr_i =
+        static_cast<const TIn *>(inptr) + top_pos * ldrow;
+
+    for (int j = 0; j < _dilation_factor; j++) {
+      int left_pos = j * StrideColumns - _padding_left;
+      while (left_pos < 0)
+        left_pos += _dilation_factor;
+
+      // Modify the pointer to point to the first element of the dilated input
+      // tensor, then set the input for this convolution engine.
+      const void *const inptr_ij = inptr_i + left_pos * ldcol;
+      _convs[i][j]->set_input(inptr_ij, ldbatch, ldrow_dilated, ldcol_dilated);
+    }
+  }
+}
+
+MEMBERFN(void)::set_output(void *const outptr) {
+  set_output(outptr, _n_channels);
+}
+
+MEMBERFN(void)::set_output(void *const outptr, const int ldcol) {
+  set_output(outptr, _n_output_cols * ldcol, ldcol);
+}
+
+MEMBERFN(void)
+::set_output(void *const outptr, const int ldrow, const int ldcol) {
+  set_output(outptr, _n_output_rows * ldrow, ldrow, ldcol);
+}
+
+MEMBERFN(void)
+::set_output(void *const outptr, const int ldbatch, const int ldrow,
+             const int ldcol) {
+  // Compute dilated strides
+  const int ldrow_dilated = ldrow * _dilation_factor;
+  const int ldcol_dilated = ldcol * _dilation_factor;
+
+  // Pass output parameters on to base convolutions
+  for (int i = 0; i < _dilation_factor; i++) {
+    for (int j = 0; j < _dilation_factor; j++) {
+      // Offset the pointer to output element (i, j), the first one written by
+      // this convolution engine, then set the output for that engine.
+      void *const outptr_ij =
+          static_cast<TOut *>(outptr) + i * ldrow + j * ldcol;
+      _convs[i][j]->set_output(outptr_ij, ldbatch, ldrow_dilated,
+                               ldcol_dilated);
+    }
+  }
+}
+
+MEMBERFN(int)
+::get_output_size(const int dim_size, const unsigned int padding_before,
+                  const unsigned int padding_after, const int dilation_factor) {
+  const int input_size =
+      dim_size + static_cast<int>(padding_before + padding_after); // padded
+  const int window_size = (KernelRows - 1) * dilation_factor + 1; // dilated
+  return iceildiv(input_size - window_size + 1, StrideRows); // NOTE(review): Kernel/StrideRows also used for column sizes - confirm
+}
+
+MEMBERFN(int)
+::output_size(const int dim_size, const unsigned int padding_before,
+              const unsigned int padding_after) const {
+  return get_output_size(dim_size, padding_before, padding_after,
+                         _dilation_factor);
+}
+
+MEMBERFN(size_t)::get_packed_params_size(void) const {
+  return _convs[0][0]->get_packed_params_size();
+}
+
+MEMBERFN(void)::set_packed_params_buffer(void *buffer) {
+  // Set the buffer for all convolution engines
+  for (auto &&row : _convs) {
+    for (auto &&conv : row) {
+      conv->set_packed_params_buffer(buffer);
+    }
+  }
+}
+
+MEMBERFN(void)
+::pack_params(const void *const weights, const void *const biases) const {
+  _convs[0][0]->pack_params(weights, biases);
+}
+
+MEMBERFN(void)
+::pack_params(void *const buffer, const void *const weights,
+              const void *const biases) const {
+  _convs[0][0]->pack_params(buffer, weights, biases);
+}
+
+MEMBERFN(void)
+::pack_params(void *const buffer, const void *const weights,
+              const unsigned int ldrow, const unsigned int ldcol,
+              const void *const biases) const {
+  _convs[0][0]->pack_params(buffer, weights, ldrow, ldcol, biases);
+}
+
+MEMBERFN(size_t)::get_working_space_size(unsigned int nthreads) const {
+  return _convs[0][0]->get_working_space_size(nthreads);
+}
+
+MEMBERFN(void)::set_working_space(void *const ws) {
+  // Use the same working space set for all contained depthwise engines.
+  for (auto &&row : _convs) {
+    for (auto &&conv : row) {
+      conv->set_working_space(ws);
+    }
+  }
+}
+
+MEMBERFN(unsigned int)::get_window(void) const {
+  return _convs[0][0]->get_window();
+}
+
+MEMBERFN(void)
+::run(const unsigned int start, const unsigned int stop,
+      const unsigned int threadid) {
+  // Run each contained convolution in turn
+  for (auto &&row : _convs) {
+    for (auto &&conv : row) {
+      conv->run(start, stop, threadid);
+    }
+  }
+}
+
+} // namespace depthwise
diff --git a/arm_compute/graph/backends/FunctionHelpers.h b/arm_compute/graph/backends/FunctionHelpers.h
index 785f6dc3b9..fbf8d17f67 100644
--- a/arm_compute/graph/backends/FunctionHelpers.h
+++ b/arm_compute/graph/backends/FunctionHelpers.h
@@ -523,7 +523,7 @@ std::unique_ptr<IFunction> create_depthwise_convolution_layer(DepthwiseConvoluti
     std::string                func_name;
     if(dwc_algorithm == DepthwiseConvolutionMethod::Optimized3x3)
     {
-        std::tie(func, func_name) = create_named_function<typename DepthwiseConvolutionLayerFunctions::DepthwiseConvolutionLayer3x3>(
+        std::tie(func, func_name) = create_named_function<typename DepthwiseConvolutionLayerFunctions::OptimizedDepthwiseConvolutionLayer>(
                                         std::string("DepthwiseConvolutionLayer3x3"),
                                         input, weights, biases, output, conv_info, depth_multiplier, fused_act);
     }
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index 396e2368c3..81bf53ace6 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -42,6 +42,7 @@
 
 namespace arm_compute
 {
+// Forward declarations
 class ITensor;
 
 /** Basic function to execute a depthwise convolution for kernel size 3x3xC. This function calls the following NEON kernels:
@@ -157,6 +158,124 @@ private:
     bool                                      _is_prepared;
 };
 
+/** Basic function to execute optimized depthwise convolution routines. This function calls the following NEON kernels:
+ *
+ * @note At the moment only 3x3 and 5x5 convolutions with stride 1 or 2 are supported
+ *
+ * -# @ref NEFillBorderKernel if pad_x or pad_y > 0 and no assembly kernel implementation is present
+ * -# @ref NEDepthwiseConvolutionLayer3x3Kernel if 3x3 and no assembly kernel implementation is present
+ * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
+ * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of output is required
+ * -# @ref NEActivationLayer if fused activation is required
+ *
+ */
+class NEDepthwiseConvolutionLayerOptimized : public IFunction
+{
+public:
+    /** Default constructor */
+    NEDepthwiseConvolutionLayerOptimized(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthwiseConvolutionLayerOptimized(const NEDepthwiseConvolutionLayerOptimized &) = delete;
+    /** Default move constructor */
+    NEDepthwiseConvolutionLayerOptimized(NEDepthwiseConvolutionLayerOptimized &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthwiseConvolutionLayerOptimized &operator=(const NEDepthwiseConvolutionLayerOptimized &) = delete;
+    /** Default move assignment operator */
+    NEDepthwiseConvolutionLayerOptimized &operator=(NEDepthwiseConvolutionLayerOptimized &&) = default;
+    /** Initialize the function's source, destination, kernels and border_size.
+     *
+     * @param[in, out] input            Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+     * @param[in]      weights          Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
+     * @param[in]      biases           Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                                  Data type supported: Same as @p input.
+     * @param[out]     output           Destination tensor. Data type supported: same as @p input.
+     * @param[in]      conv_info        Padding and stride information to use for the convolution.
+     * @param[in]      depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in]      act_info         (Optional) Activation layer information in case of a fused activation.
+     * @param[in]      dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     */
+    void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+                   unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayerOptimized
+     *
+     * @param[in] input            Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+     * @param[in] weights          Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
+     * @param[in] biases           Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                             Data type supported: Same as @p input.
+     * @param[in] output           Destination tensor. Data type supported: same as @p input.
+     * @param[in] conv_info        Padding and stride information to use for the convolution.
+     * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in] act_info         (Optional) Activation layer information in case of a fused activation.
+     * @param[in] dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                           unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+
+    // Inherited methods overridden:
+    void run() override;
+    void prepare() override;
+
+private:
+    /** Configure the kernels/functions for the generic pipeline.
+     *
+     * @param[in, out] input            Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+     * @param[in]      weights          Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
+     * @param[in]      biases           Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                                  Data type supported: Same as @p input.
+     * @param[out]     output           Destination tensor. Data type supported: same as @p input.
+     * @param[in]      conv_info        Padding and stride information to use for the convolution.
+     * @param[in]      depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth.
+     * @param[in]      act_info         Activation layer information in case of a fused activation.
+     * @param[in]      dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     *
+     */
+    void configure_generic(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+                           unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation = Size2D(1U, 1U));
+    /** Configure the kernels/functions for the optimized pipeline.
+     *
+     * @param[in]  input            Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+     * @param[in]  weights          Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
+     * @param[in]  biases           Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                              Data type supported: Same as @p input.
+     * @param[out] output           Destination tensor. Data type supported: same as @p input.
+     * @param[in]  conv_info        Padding and stride information to use for the convolution.
+     * @param[in]  depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth.
+     * @param[in]  act_info         Activation layer information in case of a fused activation.
+     */
+    void configure_optimized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation = Size2D(1U, 1U));
+    /** Run generic kernel */
+    void run_generic();
+    /** Run optimized function */
+    void run_optimized();
+
+private:
+    MemoryGroup                               _memory_group;
+    NEDepthwiseConvolutionLayer3x3Kernel      _dwc_kernel;
+    NEDepthwiseConvolutionAssemblyDispatch    _dwc_optimized_func;
+    NEDirectConvolutionLayerOutputStageKernel _output_stage_kernel;
+    NEFillBorderKernel                        _border_handler;
+    NEPermute                                 _permute_input;
+    NEPermute                                 _permute_weights;
+    NEPermute                                 _permute_output;
+    NEActivationLayer                         _activationlayer_function;
+    Tensor                                    _accumulator;
+    Tensor                                    _permuted_input;
+    Tensor                                    _permuted_weights;
+    Tensor                                    _permuted_output;
+    const ITensor                            *_original_weights;
+    bool                                      _has_bias;
+    bool                                      _is_quantized;
+    bool                                      _is_optimized;
+    bool                                      _is_nchw;
+    bool                                      _permute;
+    bool                                      _is_activationlayer_enabled;
+    bool                                      _is_prepared;
+};
+
 /** Basic function to execute a generic depthwise convolution. This function calls the following NEON kernels:
  *
  * -# @ref NEDepthwiseIm2ColKernel
diff --git a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
index 7d2cff7315..b88e750fa9 100644
--- a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
+++ b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
@@ -30,9 +30,6 @@
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/Tensor.h"
 
-#include "arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
-
 namespace arm_compute
 {
 /** Depthwise convolution assembly kernel glue */
@@ -52,38 +49,44 @@ public:
     NEDepthwiseConvolutionAssemblyDispatch &operator=(const NEDepthwiseConvolutionAssemblyDispatch &) = delete;
     /** Default move assignment operator */
     NEDepthwiseConvolutionAssemblyDispatch &operator=(NEDepthwiseConvolutionAssemblyDispatch &&) = default;
+    /** Default destructor */
+    ~NEDepthwiseConvolutionAssemblyDispatch();
     /** Initialize the function's source, destination, kernels and border_size.
      *
      * @note Supports only NHWC format
      *
      * @param[in]  input            Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
-     * @param[in]  weights          Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
+     * @param[in]  weights          Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
      * @param[in]  bias             (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
      *                              Data type supported: Same as @p input.
      * @param[out] output           Destination tensor. Data type supported: same as @p input.
      * @param[in]  conv_info        Padding and stride information to use for the convolution.
      * @param[in]  depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
      * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation.
+     * @param[in]  dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      */
     void configure(const ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
-                   const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+                   const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+                   const Size2D &dilation = Size2D(1, 1));
     /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionAssemblyDispatch
      *
      * @note Supports only NHWC format
      *
      * @param[in]  input            Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
-     * @param[in]  weights          Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
+     * @param[in]  weights          Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
      * @param[in]  bias             (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
      *                              Data type supported: Same as @p input.
      * @param[out] output           Destination tensor. Data type supported: same as @p input.
      * @param[in]  conv_info        Padding and stride information to use for the convolution.
      * @param[in]  depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
      * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation.
+     * @param[in]  dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      *
      * @return An error status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output,
-                           const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+                           const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+                           const Size2D &dilation = Size2D(1, 1));
     /** Check if the optimized kernel can be used for the given kernel sizes and strides
      *
      * @warning Even if this return true the inputs and outputs might need to get permuted as the only layout supported is NHWC
@@ -103,16 +106,18 @@ public:
     void prepare() override;
 
 private:
-    MemoryGroup                                       _memory_group;
-    const ITensor                                    *_input;
-    const ITensor                                    *_weights;
-    const ITensor                                    *_bias;
-    ITensor                                          *_output;
-    Tensor                                            _packed_weights;
-    Tensor                                            _workspace;
-    bool                                              _is_prepared;
-    std::unique_ptr<depthwise::IDepthwiseConvolution> _dwc_assembly_kernel;
-    NEDepthwiseConvolutionAssemblyKernelWrapper       _dwc_acl_kernel;
+    struct LocalImpl;
+
+private:
+    MemoryGroup                _memory_group;
+    const ITensor             *_input;
+    const ITensor             *_weights;
+    const ITensor             *_bias;
+    ITensor                   *_output;
+    Tensor                     _packed_weights;
+    Tensor                     _workspace;
+    bool                       _is_prepared;
+    std::unique_ptr<LocalImpl> _pImpl;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H__ */
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index 8aa43201ad..f216519e58 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -241,6 +241,7 @@ v19.08 Public major release
  - Various optimisations.
  - Deprecated functions/interfaces
     - Altered @ref QuantizationInfo interface to support per-channel quantization.
+    - The @ref NEDepthwiseConvolutionLayer3x3 will be replaced by @ref NEDepthwiseConvolutionLayerOptimized to accommodate future optimizations.
 
 v19.05 Public major release
  - Various bug fixes.
diff --git a/src/core/NEON/kernels/convolution/common/padding.cpp b/src/core/NEON/kernels/convolution/common/padding.cpp
index b50067b4e0..88b37b8a83 100644
--- a/src/core/NEON/kernels/convolution/common/padding.cpp
+++ b/src/core/NEON/kernels/convolution/common/padding.cpp
@@ -24,8 +24,8 @@
 #include <cstring>
 #include <cstdint>
 
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"
+#include "arm.hpp"
+#include "padding.hpp"
 
 namespace padding
 {
diff --git a/src/core/NEON/kernels/convolution/common/qasymm8.cpp b/src/core/NEON/kernels/convolution/common/qasymm8.cpp
index 1de9ebf28a..64e3156bff 100644
--- a/src/core/NEON/kernels/convolution/common/qasymm8.cpp
+++ b/src/core/NEON/kernels/convolution/common/qasymm8.cpp
@@ -28,7 +28,7 @@
 #include <cmath>
 #include <limits>
 
-#include "arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp"
+#include "qasymm8.hpp"
 
 namespace qasymm8
 {
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp
new file mode 100644
index 0000000000..3e2bbbb61a
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "impl_dilated.hpp"
+
+template class depthwise::DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;
+template class depthwise::DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
+template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>;
+template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
+template class depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>;
+template class depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float, float>;
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp
new file mode 100644
index 0000000000..879e06158d
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "depthwise_quantized_dilated.hpp"
+#include "impl_dilated.hpp"
+
+namespace depthwise {
+
+template <unsigned int OutputTileRows, unsigned int OutputTileCols,
+          unsigned int KernelRows, unsigned int KernelCols,
+          unsigned int StrideRows, unsigned int StrideCols>
+QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
+                                   KernelCols, StrideRows, StrideCols>::
+    QAsymm8DilatedDepthwiseConvolution(
+        int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+        int dilation_factor, nck::ActivationFunction activation,
+        const qasymm8::QAsymm8Params &weight_quantisation,
+        const qasymm8::QAsymm8Params &input_quantisation,
+        const qasymm8::QAsymm8Params &output_quantisation,
+        unsigned int padding_top, unsigned int padding_left,
+        unsigned int padding_bottom, unsigned int padding_right)
+    : QAsymm8DilatedDepthwiseConvolution(
+          n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
+          QAsymm8DilatedDepthwiseConvolution::get_output_size(
+              n_input_rows, padding_top, padding_bottom, dilation_factor),
+          QAsymm8DilatedDepthwiseConvolution::get_output_size(
+              n_input_cols, padding_left, padding_right, dilation_factor),
+          activation, weight_quantisation, input_quantisation,
+          output_quantisation, padding_top, padding_left, padding_bottom,
+          padding_right) {}
+
+template <unsigned int OutputTileRows, unsigned int OutputTileCols,
+          unsigned int KernelRows, unsigned int KernelCols,
+          unsigned int StrideRows, unsigned int StrideCols>
+QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
+                                   KernelCols, StrideRows, StrideCols>::
+    QAsymm8DilatedDepthwiseConvolution(
+        int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+        int dilation_factor, int n_output_rows, int n_output_cols,
+        nck::ActivationFunction activation,
+        const qasymm8::QAsymm8Params &weight_quantisation,
+        const qasymm8::QAsymm8Params &input_quantisation,
+        const qasymm8::QAsymm8Params &output_quantisation,
+        unsigned int padding_top, unsigned int padding_left,
+        unsigned int padding_bottom, unsigned int padding_right)
+    : QAsymm8DilatedDepthwiseConvolution(
+          n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
+          n_output_rows, n_output_cols, activation, weight_quantisation,
+          input_quantisation, output_quantisation,
+          qasymm8::QAsymm8RescaleParams::make_rescale_params(
+              weight_quantisation, input_quantisation, output_quantisation),
+          padding_top, padding_left, padding_bottom, padding_right) {}
+
+template <unsigned int OutputTileRows, unsigned int OutputTileCols,
+          unsigned int KernelRows, unsigned int KernelCols,
+          unsigned int StrideRows, unsigned int StrideCols>
+QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
+                                   KernelCols, StrideRows, StrideCols>::
+    QAsymm8DilatedDepthwiseConvolution(
+        int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+        int dilation_factor, nck::ActivationFunction activation,
+        const qasymm8::QAsymm8Params &weight_quantisation,
+        const qasymm8::QAsymm8Params &input_quantisation,
+        const qasymm8::QAsymm8Params &output_quantisation,
+        const qasymm8::QAsymm8RescaleParams &rescale_parameters,
+        unsigned int padding_top, unsigned int padding_left,
+        unsigned int padding_bottom, unsigned int padding_right)
+    : QAsymm8DilatedDepthwiseConvolution(
+          n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
+          QAsymm8DilatedDepthwiseConvolution::get_output_size(
+              n_input_rows, padding_top, padding_bottom, dilation_factor),
+          QAsymm8DilatedDepthwiseConvolution::get_output_size(
+              n_input_cols, padding_left, padding_right, dilation_factor),
+          activation, weight_quantisation, input_quantisation,
+          output_quantisation, rescale_parameters, padding_top, padding_left,
+          padding_bottom, padding_right) {}
+
+template <unsigned int OutputTileRows, unsigned int OutputTileCols,
+          unsigned int KernelRows, unsigned int KernelCols,
+          unsigned int StrideRows, unsigned int StrideCols>
+QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
+                                   KernelCols, StrideRows, StrideCols>::
+    QAsymm8DilatedDepthwiseConvolution(
+        int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+        int dilation_factor, int n_output_rows, int n_output_cols,
+        nck::ActivationFunction activation,
+        const qasymm8::QAsymm8Params &weight_quantisation,
+        const qasymm8::QAsymm8Params &input_quantisation,
+        const qasymm8::QAsymm8Params &output_quantisation,
+        const qasymm8::QAsymm8RescaleParams &rescale_parameters,
+        unsigned int padding_top, unsigned int padding_left,
+        unsigned int padding_bottom, unsigned int padding_right)
+    : DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
+                                  KernelCols, StrideRows, StrideCols, uint8_t,
+                                  int32_t, uint8_t>(
+          n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
+          n_output_rows, n_output_cols, activation, padding_top, padding_left,
+          padding_bottom, padding_right,
+          [weight_quantisation, input_quantisation, output_quantisation,
+           rescale_parameters](
+              const int n_batches, const int n_input_rows,
+              const int n_input_cols, const int n_channels,
+              const int n_output_rows, const int n_output_cols,
+              const nck::ActivationFunction activation,
+              const unsigned int padding_top, const unsigned int padding_left,
+              const unsigned int padding_bottom,
+              const unsigned int padding_right) -> IDepthwiseConvolution * {
+            return new QAsymm8DepthwiseConvolution<
+                OutputTileRows, OutputTileCols, KernelRows, KernelCols,
+                StrideRows, StrideCols>(
+                n_batches, n_input_rows, n_input_cols, n_channels,
+                n_output_rows, n_output_cols, activation, weight_quantisation,
+                input_quantisation, output_quantisation, rescale_parameters,
+                padding_top, padding_left, padding_bottom, padding_right);
+          }) {}
+
+} // namespace depthwise
+
+template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1>;
+template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2>;
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
index 692086c74a..f86f1bad73 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+#include "impl_base.hpp"
 
 // TODO Move to common utilities somewhere
 template <size_t Size> struct DType { };
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
index cbdb19a067..87d2bfd8e6 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
@@ -30,8 +30,8 @@
  * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+#include "arm.hpp"
+#include "impl_base.hpp"
 
 #pragma once
 
@@ -63,6 +63,31 @@ DepthwiseConvolution<
 {
 }
 
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols, StrideRows, StrideCols,
+  float16_t, float16_t, float16_t
+>::DepthwiseConvolution(
+  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+  int n_output_rows, int n_output_cols,
+  ActivationFunction activation,
+  unsigned int padding_top,
+  unsigned int padding_left,
+  unsigned int padding_bottom,
+  unsigned int padding_right
+) : Base(
+      n_batches, n_input_rows, n_input_cols, n_channels,
+      n_output_rows, n_output_cols, activation,
+      padding_top, padding_left, padding_bottom, padding_right
+    )
+{
+}
+
 template <
   unsigned int OutputTileRows, unsigned int OutputTileCols,
   unsigned int KernelRows, unsigned int KernelCols,
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
index 264576137c..e19e4c668c 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
@@ -30,8 +30,8 @@
  * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  */
 
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+#include "arm.hpp"
+#include "impl_base.hpp"
 
 #pragma once
 
@@ -63,6 +63,30 @@ DepthwiseConvolution<
 {
 }
 
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols, StrideRows, StrideCols,
+  float, float, float
+>::DepthwiseConvolution(  // fp32 constructor overload which also takes the output dimensions
+  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+  int n_output_rows, int n_output_cols,  // output size is passed in by the caller
+  ActivationFunction activation,
+  unsigned int padding_top,
+  unsigned int padding_left,
+  unsigned int padding_bottom,
+  unsigned int padding_right
+) : Base(  // every argument is forwarded unchanged to the base class
+      n_batches, n_input_rows, n_input_cols, n_channels,
+      n_output_rows, n_output_cols, activation,
+      padding_top, padding_left, padding_bottom, padding_right
+    )
+{  // no additional state to initialise
+}
 
 template <
   unsigned int OutputTileRows, unsigned int OutputTileCols,
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
index 5546d37e59..bda875dfe1 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
@@ -32,15 +32,38 @@
 
 #include <limits>
 
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp"
+#include "arm.hpp"
+#include "impl_base.hpp"
+#include "depthwise_quantized.hpp"
 
 #pragma once
 
+// Comment out the following define to use floating-point based
+// requantisation; leave it defined to use fixed-point requantisation.
+#define FIXED_POINT_REQUANTISATION 1
+
 using namespace neon_convolution_kernels;
 using namespace qasymm8;
 
+// Saturation helpers: limit a value to the range representable by T.
+template <typename T>
+struct clamp_to_limits
+{
+  // Returns v limited to [lowest(T), max(T)]; the result keeps the type U.
+  template <typename U>
+  static inline U clamp(const U& v)
+  {
+    const U lo = static_cast<U>(std::numeric_limits<T>::lowest());
+    const U hi = static_cast<U>(std::numeric_limits<T>::max());
+    return std::min(std::max(v, lo), hi);
+  }
+
+  // Saturates v to the range of T and converts it to T. The cast names T
+  // explicitly instead of relying on an implicit narrowing at the return.
+  template <typename U>
+  static inline T clamp_and_cast(const U& v) { return static_cast<T>(clamp(v)); }
+};
+
 template <typename T>
 inline T saturating_doubling_high_mul(const T&, const int32_t&);
 
@@ -87,103 +110,214 @@ inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
 namespace depthwise
 {
 template <
-        unsigned int OutputTileRows, unsigned int OutputTileCols,
-        unsigned int KernelRows, unsigned int KernelCols,
-        unsigned int StrideRows, unsigned int StrideCols
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
 >
 QAsymm8DepthwiseConvolution<
-        OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
 >::QAsymm8DepthwiseConvolution(
-        int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-        const ActivationFunction activation,
-        const QAsymm8Params& weight_quantisation,
-        const QAsymm8Params& input_quantisation,
-        const QAsymm8Params& output_quantisation,
-        unsigned int padding_top,
-        unsigned int padding_left,
-        unsigned int padding_bottom,
-        unsigned int padding_right
-                              ) : QAsymm8DepthwiseConvolution(
-        n_batches, n_input_rows, n_input_cols, n_channels,
-        activation, weight_quantisation, input_quantisation, output_quantisation,
-        QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
-        padding_top, padding_left, padding_bottom, padding_right
-)
+  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+  const ActivationFunction activation,
+  const QAsymm8Params& weight_quantisation,
+  const QAsymm8Params& input_quantisation,
+  const QAsymm8Params& output_quantisation,
+  unsigned int padding_top,
+  unsigned int padding_left,
+  unsigned int padding_bottom,
+  unsigned int padding_right
+) : QAsymm8DepthwiseConvolution(  // delegates to the overload taking explicit rescale parameters
+    n_batches, n_input_rows, n_input_cols, n_channels,
+    activation, weight_quantisation, input_quantisation, output_quantisation,
+    QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),  // rescale parameters built from the three quantisation infos
+    padding_top, padding_left, padding_bottom, padding_right
+  )
 {
 }
 
 template <
-        unsigned int OutputTileRows, unsigned int OutputTileCols,
-        unsigned int KernelRows, unsigned int KernelCols,
-        unsigned int StrideRows, unsigned int StrideCols
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
 >
 QAsymm8DepthwiseConvolution<
-        OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
 >::QAsymm8DepthwiseConvolution(
-        int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-        const ActivationFunction activation,
-        const QAsymm8Params& weight_quantisation,
-        const QAsymm8Params& input_quantisation,
-        const QAsymm8Params& output_quantisation,
-        const QAsymm8RescaleParams& rescale_params,
-        unsigned int padding_top,
-        unsigned int padding_left,
-        unsigned int padding_bottom,
-        unsigned int padding_right
-                              ) : Base(
-        n_batches, n_input_rows, n_input_cols, n_channels, activation,
-        padding_top, padding_left, padding_bottom, padding_right
-),
-                                  _weights_quant(weight_quantisation),
-                                  _inputs_quant(input_quantisation),
-                                  _output_quant(output_quantisation),
-                                  rescale_parameters(rescale_params)
+  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+  int n_output_rows, int n_output_cols,  // output size is passed in by the caller
+  const ActivationFunction activation,
+  const QAsymm8Params& weight_quantisation,
+  const QAsymm8Params& input_quantisation,
+  const QAsymm8Params& output_quantisation,
+  unsigned int padding_top,
+  unsigned int padding_left,
+  unsigned int padding_bottom,
+  unsigned int padding_right
+) : QAsymm8DepthwiseConvolution(  // delegates to the output-size overload taking explicit rescale parameters
+    n_batches, n_input_rows, n_input_cols, n_channels,
+    n_output_rows, n_output_cols,
+    activation, weight_quantisation, input_quantisation, output_quantisation,
+    QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),  // rescale parameters built from the three quantisation infos
+    padding_top, padding_left, padding_bottom, padding_right
+  )
 {
 }
 
 template <
-        unsigned int OutputTileRows, unsigned int OutputTileCols,
-        unsigned int KernelRows, unsigned int KernelCols,
-        unsigned int StrideRows, unsigned int StrideCols
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+QAsymm8DepthwiseConvolution<
+  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::QAsymm8DepthwiseConvolution(  // overload with caller-supplied rescale parameters, no explicit output size
+  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+  const ActivationFunction activation,
+  const QAsymm8Params& weight_quantisation,
+  const QAsymm8Params& input_quantisation,
+  const QAsymm8Params& output_quantisation,
+  const QAsymm8RescaleParams& rescale_params,
+  unsigned int padding_top,
+  unsigned int padding_left,
+  unsigned int padding_bottom,
+  unsigned int padding_right
+) : Base(
+    n_batches, n_input_rows, n_input_cols, n_channels,
+    get_activation_fn(activation, output_quantisation),  // Base receives the activation returned by get_activation_fn, not necessarily the requested one
+    padding_top, padding_left, padding_bottom, padding_right
+  ),
+  _weights_quant(weight_quantisation),  // quantisation infos and rescale parameters are stored as given
+  _inputs_quant(input_quantisation),
+  _output_quant(output_quantisation),
+  rescale_parameters(rescale_params)
+{
+}
+
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+QAsymm8DepthwiseConvolution<
+  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::QAsymm8DepthwiseConvolution(  // overload with caller-supplied rescale parameters and explicit output size
+  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+  int n_output_rows, int n_output_cols,  // output size is passed in by the caller
+  const ActivationFunction activation,
+  const QAsymm8Params& weight_quantisation,
+  const QAsymm8Params& input_quantisation,
+  const QAsymm8Params& output_quantisation,
+  const QAsymm8RescaleParams& rescale_params,
+  unsigned int padding_top,
+  unsigned int padding_left,
+  unsigned int padding_bottom,
+  unsigned int padding_right
+) : Base(
+    n_batches, n_input_rows, n_input_cols, n_channels,
+    n_output_rows, n_output_cols,
+    get_activation_fn(activation, output_quantisation),  // Base receives the activation returned by get_activation_fn, not necessarily the requested one
+    padding_top, padding_left, padding_bottom, padding_right
+  ),
+  _weights_quant(weight_quantisation),  // quantisation infos and rescale parameters are stored as given
+  _inputs_quant(input_quantisation),
+  _output_quant(output_quantisation),
+  rescale_parameters(rescale_params)
+{
+}
+
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+ActivationFunction QAsymm8DepthwiseConvolution<
+  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::get_activation_fn(
+  const ActivationFunction activation,
+  const QAsymm8Params& output_quant
+)
+{
+  // Simplify the requested activation using the output quantisation: any
+  // clamp which the quantised output range already enforces is dropped.
+  switch (activation)
+  {
+    case ActivationFunction::ReLU:
+      // When real zero quantises to 0 the lower clamp cannot change any
+      // representable output, so the activation is redundant.
+      return (output_quant.quantize(0) == 0) ? ActivationFunction::None
+                                             : ActivationFunction::ReLU;
+
+    case ActivationFunction::ReLU6:
+      if (output_quant.dequantize(255) <= 6.0f)
+      {
+        // The largest quantised value does not exceed 6, so the upper clamp
+        // is redundant: drop the activation entirely when the lower clamp is
+        // redundant too, otherwise relax it to a ReLU.
+        return (output_quant.quantize(0) == 0) ? ActivationFunction::None
+                                               : ActivationFunction::ReLU;
+      }
+      return ActivationFunction::ReLU6;
+
+    default:
+      // Nothing to simplify for any other activation.
+      return activation;
+  }
+}
+
+template <  // _input_padding_value: returns the input quantisation offset (_inputs_quant.offset)
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
 >
 uint8_t QAsymm8DepthwiseConvolution<
-        OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
 >::_input_padding_value(void) const
 {
   return _inputs_quant.offset;
 }
 
 template <
-        unsigned int OutputTileRows, unsigned int OutputTileCols,
-        unsigned int KernelRows, unsigned int KernelCols,
-        unsigned int StrideRows, unsigned int StrideCols
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
 >
 void QAsymm8DepthwiseConvolution<
-        OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
 >::_pack_params(
-        void * const buffer,
-        const void * const weights,
-        const unsigned int weight_row_stride,
-        const unsigned int weight_col_stride,
-        const void * const biases
-               ) const
+  void * const buffer,
+  const void * const weights,
+  const unsigned int weight_row_stride,
+  const unsigned int weight_col_stride,
+  const void * const biases
+) const
 {
   const uint8_t *wptr = static_cast<const uint8_t *>(weights);
   const int32_t *bptr = static_cast<const int32_t *>(biases);
   uint8_t *outptr = static_cast<uint8_t *>(buffer);
 
-  // We set the vector length to use doubles on both Aarch64 and Aarch32.  NOTE
-  // For SVE set this to half the vector length.
+  // We set the vector length to use quad registers on Aarch64 and only doubles
+  // on Aarch32. NOTE For SVE set this to the actual vector length.
+#if defined(__aarch64__)
+  unsigned int veclen = 16;
+#else
+#if defined(__arm__)
   unsigned int veclen = 8;
+#endif
+#endif
+
+  // Compute the rank 0 offset arising from the quantisation parameters.
+  const int32_t rank0_offset = (KernelRows * KernelCols *
+                                static_cast<int32_t>(_weights_quant.offset) *
+                                static_cast<int32_t>(_inputs_quant.offset));
 
   // While there are channels left to process, pack a vector length of them at
   // a time and reduce the size of vector used as the size of the tensor
   // decreases.
   for (
-          unsigned int n_channels = this->n_channels(); n_channels;
-          n_channels -= veclen,
-                  outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
-          )
+    unsigned int n_channels = this->n_channels(); n_channels;
+    n_channels -= veclen,
+    outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
+  )
   {
     // NOTE Ignore this section if using SVE, the vector length remains the
     // same and we just don't fill a full register for the tail.
@@ -201,8 +335,8 @@ void QAsymm8DepthwiseConvolution<
     // Copy a vector length of elements
     for (unsigned int n = 0; n < veclen && n < n_channels; n++)
     {
-      const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
-      out_bptr[n] = bias;
+      int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
+      uint32_t weight_sum = 0;
 
       for (unsigned int i = 0; i < KernelRows; i++)
       {
@@ -211,140 +345,297 @@ void QAsymm8DepthwiseConvolution<
         {
           uint8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
           row_outptr[j*veclen + n] = w;
+          weight_sum += static_cast<uint32_t>(w);
         }
       }
       wptr++;
+
+      // Include in the bias contributions from the quantisation offset
+      int32_t rank1_offset = static_cast<int32_t>(
+        static_cast<uint32_t>(_inputs_quant.offset) * weight_sum
+      );
+      out_bptr[n] = bias + rank0_offset - rank1_offset;
     }
   }
 }
 
 template <
-        unsigned int OutputTileRows, unsigned int OutputTileCols,
-        unsigned int KernelRows, unsigned int KernelCols,
-        unsigned int StrideRows, unsigned int StrideCols,
-        typename FInput, typename FOutput
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
 >
-static inline void tilefn(
-        int n_channels,
-        const void* packed_params,
-        FInput &get_input_ptr,
-        FOutput &get_output_ptr,
-        const int32_t clamp_max,
-        const int32_t clamp_min,
-        const uint8_t input_offset,
-        const uint8_t weight_offset,
-        const uint8_t output_offset,
-        const int32_t requant_multiplier,
-        const int32_t requant_shift
-                         )
+template<ActivationFunction Activation>
+void QAsymm8DepthwiseConvolution<
+  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::execute_tile(
+  int n_channels,
+  const void* packed_params,
+  const uint8_t* inptr,
+  const unsigned int in_row_stride,
+  const unsigned int in_col_stride,
+  uint8_t* outptr,
+  const unsigned int out_row_stride,
+  const unsigned int out_col_stride
+)
 {
-  constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows;
-  constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols;
-
-  // Offset into channels
-  int channel = 0;
+  // Activation parameters (unused if Activation is None)
+  const uint8_t aqmin = _output_quant.offset;
+  const uint8_t aqmax = (Activation == ActivationFunction::ReLU6) ?
+    std::min<uint8_t>(255u, _output_quant.quantize(6.0f)) : 255u;
 
   // Byte type pointer to weights and biases
   const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);
 
-  for (; n_channels >= 8; n_channels -= 8, channel += 8)
+#if defined(__aarch64__)  // Under Aarch64 only use quad registers
+  for (; n_channels >= 16; n_channels -= 16)
+  {
+    // Load biases
+    const int32x4_t biases[4] = {
+      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
+      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
+      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
+      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12)
+    };
+    wbptr += 16*sizeof(int32_t);
+
+    // Load weights
+    uint8x16_t weights[KernelRows][KernelCols];
+    for (unsigned int i = 0; i < KernelRows; i++)
+    {
+      for (unsigned int j = 0; j < KernelCols; j++)
+      {
+        weights[i][j] = vld1q_u8(wbptr);
+        wbptr += 16;
+      }
+    }
+
+    // Load the input activations
+    uint8x16_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
+    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
+    {
+      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
+      {
+        inputs[i][j] = vld1q_u8(inptr + i*in_row_stride + j*in_col_stride);
+      }
+    }
+    inptr += 16;
+
+    // Perform the convolution
+    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
+    {
+      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
+      {
+        // Two sets of operations are required, we perform the
+        // multiply-accumulates for the convolution proper but must also sum
+        // the tile elements to account for the _weight_ offset.
+        uint32x4_t accs[4];
+        for (unsigned int i = 0; i < 4; i++)
+        {
+          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
+        }
+
+        for (unsigned int wi = 0; wi < KernelRows; wi++)
+        {
+          for (unsigned int wj = 0; wj < KernelCols; wj++)
+          {
+            // Get relevant weight and activation pixel
+            const uint8x16_t w = weights[wi][wj];
+            const uint8x16_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
+
+            // Perform multiplication and accumulation
+            const uint16x8_t muls[2] = {
+              vmull_u8(vget_low_u8(w), vget_low_u8(x)),
+              vmull_u8(vget_high_u8(w), vget_high_u8(x))
+            };
+
+            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
+            const uint16x8_t sum_elems[2] = {
+              vmull_u8(vget_low_u8(x), woffset),
+              vmull_u8(vget_high_u8(x), woffset)
+            };
+
+            const uint32x4_t tmps[4] = {
+              vsubl_u16(vget_low_u16(muls[0]), vget_low_u16(sum_elems[0])),
+              vsubl_u16(vget_high_u16(muls[0]), vget_high_u16(sum_elems[0])),
+              vsubl_u16(vget_low_u16(muls[1]), vget_low_u16(sum_elems[1])),
+              vsubl_u16(vget_high_u16(muls[1]), vget_high_u16(sum_elems[1])),
+            };
+            for (unsigned int i = 0; i < 4; i++)
+            {
+              accs[i] = vaddq_u32(accs[i], tmps[i]);
+            }
+          }
+        }
+
+        // Rescale the accumulator and add in the new offset.
+        uint32x4_t final_accs[4];
+        for (unsigned int i = 0; i < 4; i++)
+        {
+#ifdef FIXED_POINT_REQUANTISATION
+          const int32x4_t y = rounding_divide_by_exp2(
+            saturating_doubling_high_mul(
+              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
+            ),
+            rescale_parameters.shift
+          );
+          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
+          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
+#else  // floating point requantisation
+          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
+          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
+          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
+          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
+          final_accs[i] = vcvtq_u32_f32(fp_acc);
+#endif
+        }
+
+        uint8x16_t output = vcombine_u8(
+          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))),
+          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[2]), vqmovn_u32(final_accs[3])))
+        );
+
+        // Apply the activation function
+        if (Activation == ActivationFunction::ReLU ||
+            Activation == ActivationFunction::ReLU6)
+        {
+          output = vmaxq_u8(output, vdupq_n_u8(aqmin));
+        }
+        if (Activation == ActivationFunction::ReLU6)
+        {
+          output = vminq_u8(output, vdupq_n_u8(aqmax));
+        }
+
+        vst1q_u8(outptr + oi*out_row_stride + oj*out_col_stride, output);
+      }
+    }
+    outptr += 16;
+  }
+#endif  // defined(__aarch64__)
+  for (; n_channels >= 8; n_channels -= 8)
   {
     const int32x4_t biases[2] = {
-            vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
-            vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
+      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
+      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
     };
     wbptr += 8*sizeof(int32_t);
 
-    int16x8_t weights[KernelRows][KernelCols];
-    const uint8x8_t woffset = vdup_n_u8(weight_offset);
+    uint8x8_t weights[KernelRows][KernelCols];
     for (unsigned int i = 0; i < KernelRows; i++)
     {
       for (unsigned int j = 0; j < KernelCols; j++)
       {
-        const uint8x8_t w = vld1_u8(wbptr);
-        weights[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(w, woffset));
+        weights[i][j] = vld1_u8(wbptr);
         wbptr += 8;
       }
     }
 
-    int16x8_t inputs[InnerTileRows][InnerTileCols];
-    const uint8x8_t ioffset = vdup_n_u8(input_offset);
-    for (unsigned int i = 0; i < InnerTileRows; i++)
+    uint8x8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
+    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
     {
-      for (unsigned int j = 0; j < InnerTileCols; j++)
+      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
       {
-        const auto x = vld1_u8(get_input_ptr(i, j, channel));
-        inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset));
+        inputs[i][j] = vld1_u8(inptr + i*in_row_stride + j*in_col_stride);
       }
     }
+    inptr += 8;
 
     for (unsigned int oi = 0; oi < OutputTileRows; oi++)
     {
       for (unsigned int oj = 0; oj < OutputTileCols; oj++)
       {
-        int32x4_t accs[2];
+        uint32x4_t accs[2];
         for (unsigned int i = 0; i < 2; i++)
         {
-          accs[i] = biases[i];
+          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
         }
 
         for (unsigned int wi = 0; wi < KernelRows; wi++)
         {
           for (unsigned int wj = 0; wj < KernelCols; wj++)
           {
-            const auto w = weights[wi][wj];
-            const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
-            accs[0] = vmlal_s16(accs[0], vget_low_s16(w), vget_low_s16(x));
-            accs[1] = vmlal_s16(accs[1], vget_high_s16(w), vget_high_s16(x));
+            const uint8x8_t w = weights[wi][wj];
+            const uint8x8_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
+
+            const uint16x8_t muls = vmull_u8(w, x);
+            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
+            const uint16x8_t sum_elems = vmull_u8(x, woffset);
+
+            const uint32x4_t tmps[2] = {
+              vsubl_u16(vget_low_u16(muls), vget_low_u16(sum_elems)),
+              vsubl_u16(vget_high_u16(muls), vget_high_u16(sum_elems)),
+            };
+            for (unsigned int i = 0; i < 2; i++)
+            {
+              accs[i] = vaddq_u32(accs[i], tmps[i]);
+            }
           }
         }
 
-        int32x4_t final_accs[2];
+        uint32x4_t final_accs[2];
         for (unsigned int i = 0; i < 2; i++)
         {
+#ifdef FIXED_POINT_REQUANTISATION
           const int32x4_t y = rounding_divide_by_exp2(
-                  saturating_doubling_high_mul(accs[i], requant_multiplier),
-                  requant_shift);
-          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset));
-          final_accs[i] = vaddq_s32(y, offset);
-          final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
-          final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
+            saturating_doubling_high_mul(
+              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
+            ),
+            rescale_parameters.shift
+          );
+          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
+          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
+#else  // floating point requantisation
+          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
+          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
+          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
+          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
+          final_accs[i] = vcvtq_u32_f32(fp_acc);
+#endif
         }
 
-        const auto elems_s16 = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
-                                         vreinterpretq_s16_s32(final_accs[1]));
-        const int8x16_t elems = vreinterpretq_s8_s16(elems_s16.val[0]);
-        const uint8x8_t output =
-                vget_low_u8(vreinterpretq_u8_s8(vuzpq_s8(elems, elems).val[0]));
-        vst1_u8(get_output_ptr(oi, oj, channel), output);
+        uint8x8_t output = vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1])));
+
+        // Apply the activation function
+        if (Activation == ActivationFunction::ReLU ||
+            Activation == ActivationFunction::ReLU6)
+        {
+          output = vmax_u8(output, vdup_n_u8(aqmin));
+        }
+        if (Activation == ActivationFunction::ReLU6)
+        {
+          output = vmin_u8(output, vdup_n_u8(aqmax));
+        }
+
+        vst1_u8(outptr + oi*out_row_stride + oj*out_col_stride, output);
       }
     }
+    outptr += 8;
   }
-  for (; n_channels; n_channels--, channel++)
+  for (; n_channels; n_channels--)
   {
     // Load bias
     const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
     wbptr += sizeof(int32_t);
 
     // Load weights
-    int16_t weights[KernelRows][KernelCols];
+    uint8_t weights[KernelRows][KernelCols];
     for (unsigned int i = 0; i < KernelRows; i++)
     {
       for (unsigned int j = 0; j < KernelCols; j++)
       {
-        weights[i][j] = *(wbptr++) - weight_offset;
+        weights[i][j] = *(wbptr++);
       }
     }
 
     // Load the input activations
-    int16_t inputs[InnerTileRows][InnerTileCols];
-    for (unsigned int i = 0; i < InnerTileRows; i++)
+    uint8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
+    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
     {
-      for (unsigned int j = 0; j < InnerTileCols; j++)
+      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
       {
-        inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
+        inputs[i][j] = *(inptr + i*in_row_stride + j*in_col_stride);
       }
     }
+    inptr++;
 
     // Perform the convolution
     for (unsigned int oi = 0; oi < OutputTileRows; oi++)
@@ -352,135 +643,377 @@ static inline void tilefn(
       for (unsigned int oj = 0; oj < OutputTileCols; oj++)
       {
         int32_t acc = bias;
+        uint32_t element_sum = 0;
 
         for (unsigned int wi = 0; wi < KernelRows; wi++)
         {
           for (unsigned int wj = 0; wj < KernelCols; wj++)
           {
             const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
-            acc += w * x;
+            acc += static_cast<int32_t>(static_cast<uint32_t>(w) * static_cast<uint32_t>(x));
+            element_sum += static_cast<uint32_t>(x);
           }
         }
 
+        acc -= static_cast<int32_t>(element_sum) * static_cast<int32_t>(_weights_quant.offset);
+
         // Requantize
+#ifdef FIXED_POINT_REQUANTISATION
         acc = rounding_divide_by_exp2(
-                saturating_doubling_high_mul(acc, requant_multiplier),
-                requant_shift);
-        acc += output_offset;
-        acc = std::max(acc, clamp_min);
-        acc = std::min(acc, clamp_max);
-        uint8_t output = static_cast<uint8_t>(acc);
-        *(get_output_ptr(oi, oj, channel)) = output;
+            saturating_doubling_high_mul(acc, rescale_parameters.multiplier),
+            rescale_parameters.shift
+        );
+        acc += _output_quant.offset;
+        uint8_t output = clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(acc);
+#else  // floating point requantization
+        float fp_acc = static_cast<float>(acc);
+        fp_acc *= rescale_parameters.rescale;
+        fp_acc += static_cast<float>(_output_quant.offset);
+        fp_acc = std::max<float>(fp_acc, 0.0f);
+        uint8_t output = static_cast<uint8_t>(std::min<int32_t>(static_cast<int32_t>(fp_acc), 255));
+#endif
+
+        // Apply the activation function
+        if (Activation == ActivationFunction::ReLU ||
+            Activation == ActivationFunction::ReLU6)
+        {
+          output = std::max(output, aqmin);
+        }
+        if (Activation == ActivationFunction::ReLU6)
+        {
+          output = std::min(output, aqmax);
+        }
+
+        *(outptr + oi*out_row_stride + oj*out_col_stride) = output;
       }
     }
+    outptr++;
   }
 }
 
 template <
-        unsigned int OutputTileRows, unsigned int OutputTileCols,
-        unsigned int KernelRows, unsigned int KernelCols,
-        unsigned int StrideRows, unsigned int StrideCols,
-        typename FInput, typename FOutput
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
 >
-static inline void execute_tilefn(
-        int n_channels,
-        const void* packed_params,
-        const nck::ActivationFunction actfn,
-        FInput &get_input_ptr,
-        FOutput &get_output_ptr,
-        const QAsymm8Params &input_quant,
-        const QAsymm8Params &weight_quant,
-        const QAsymm8Params &output_quant,
-        const QAsymm8RescaleParams &requant
-                                 ) {
-  // Compute min/max clamp values
-  int32_t clamp_min = std::numeric_limits<uint8_t>::min();
-  int32_t clamp_max = std::numeric_limits<uint8_t>::max();
-
-  if (actfn == nck::ActivationFunction::ReLU ||
-      actfn == nck::ActivationFunction::ReLU6) {
-    const int32_t bottom_rail = output_quant.offset;
-    clamp_min = std::max(clamp_min, bottom_rail);
+template<ActivationFunction Activation>
+void QAsymm8DepthwiseConvolution<
+  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::execute_tile(  // computes one output tile over n_channels channels using per-pixel pointer tables
+  int n_channels,
+  const void* packed_params,
+  const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+  uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+  // Activation parameters (unused if Activation is None)
+  const uint8_t aqmin = _output_quant.offset;  // lower clamp applied for ReLU/ReLU6
+  const uint8_t aqmax = (Activation == ActivationFunction::ReLU6) ?
+    std::min<uint8_t>(255u, _output_quant.quantize(6.0f)) : 255u;
+
+  // Byte type pointer to weights and biases
+  const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);
+
+  // Channel offset added to every input/output pointer
+  int n = 0;
+
+#if defined(__aarch64__)  // Under Aarch64 only use quad registers
+  for (; n_channels >= 16; n_channels -= 16, n += 16)
+  {
+    // Load biases
+    const int32x4_t biases[4] = {
+      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
+      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
+      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
+      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12)
+    };
+    wbptr += 16*sizeof(int32_t);  // 16 int32 biases precede this group's weights
+
+    // Load weights
+    uint8x16_t weights[KernelRows][KernelCols];
+    for (unsigned int i = 0; i < KernelRows; i++)
+    {
+      for (unsigned int j = 0; j < KernelCols; j++)
+      {
+        weights[i][j] = vld1q_u8(wbptr);
+        wbptr += 16;  // one byte per channel for each kernel element
+      }
+    }
+
+    // Load the input activations
+    uint8x16_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
+    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
+    {
+      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
+      {
+        inputs[i][j] = vld1q_u8(inptrs[i][j] + n);
+      }
+    }
+
+    // Perform the convolution
+    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
+    {
+      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
+      {
+        // Two sets of operations are required, we perform the
+        // multiply-accumulates for the convolution proper but must also sum
+        // the tile elements to account for the _weight_ offset.
+        uint32x4_t accs[4];
+        for (unsigned int i = 0; i < 4; i++)
+        {
+          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);  // unsigned lanes here; reinterpreted as signed at requantisation
+        }
+
+        for (unsigned int wi = 0; wi < KernelRows; wi++)
+        {
+          for (unsigned int wj = 0; wj < KernelCols; wj++)
+          {
+            // Get relevant weight and activation pixel
+            const uint8x16_t w = weights[wi][wj];
+            const uint8x16_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
+
+            // Perform multiplication and accumulation
+            const uint16x8_t muls[2] = {
+              vmull_u8(vget_low_u8(w), vget_low_u8(x)),
+              vmull_u8(vget_high_u8(w), vget_high_u8(x))
+            };
+
+            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
+            const uint16x8_t sum_elems[2] = {  // x * weight offset, the term subtracted from w * x below
+              vmull_u8(vget_low_u8(x), woffset),
+              vmull_u8(vget_high_u8(x), woffset)
+            };
+
+            const uint32x4_t tmps[4] = {  // w*x - x*w_offset widened to 32 bits (unsigned, wraps when negative)
+              vsubl_u16(vget_low_u16(muls[0]), vget_low_u16(sum_elems[0])),
+              vsubl_u16(vget_high_u16(muls[0]), vget_high_u16(sum_elems[0])),
+              vsubl_u16(vget_low_u16(muls[1]), vget_low_u16(sum_elems[1])),
+              vsubl_u16(vget_high_u16(muls[1]), vget_high_u16(sum_elems[1])),
+            };
+            for (unsigned int i = 0; i < 4; i++)
+            {
+              accs[i] = vaddq_u32(accs[i], tmps[i]);
+            }
+          }
+        }
+
+        // Rescale the accumulator and add in the new offset.
+        uint32x4_t final_accs[4];
+        for (unsigned int i = 0; i < 4; i++)
+        {
+#ifdef FIXED_POINT_REQUANTISATION
+          const int32x4_t y = rounding_divide_by_exp2(
+            saturating_doubling_high_mul(
+              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
+            ),
+            rescale_parameters.shift
+          );
+          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
+          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
+#else  // floating point requantisation
+          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
+          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
+          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
+          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));  // clamp at 0 before the unsigned conversion
+          final_accs[i] = vcvtq_u32_f32(fp_acc);
+#endif
+        }
+
+        uint8x16_t output = vcombine_u8(  // saturating narrows, 32 -> 16 -> 8 bits
+          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))),
+          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[2]), vqmovn_u32(final_accs[3])))
+        );
+
+        // Apply the activation function
+        if (Activation == ActivationFunction::ReLU ||
+            Activation == ActivationFunction::ReLU6)
+        {
+          output = vmaxq_u8(output, vdupq_n_u8(aqmin));
+        }
+        if (Activation == ActivationFunction::ReLU6)
+        {
+          output = vminq_u8(output, vdupq_n_u8(aqmax));
+        }
+
+        vst1q_u8(outptrs[oi][oj] + n, output);
+      }
+    }
   }
+#endif  // defined(__aarch64__)
+  for (; n_channels >= 8; n_channels -= 8, n += 8)  // 8 channels per iteration using 64-bit vectors
+  {
+    const int32x4_t biases[2] = {
+      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
+      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
+    };
+    wbptr += 8*sizeof(int32_t);
+
+    uint8x8_t weights[KernelRows][KernelCols];
+    for (unsigned int i = 0; i < KernelRows; i++)
+    {
+      for (unsigned int j = 0; j < KernelCols; j++)
+      {
+        weights[i][j] = vld1_u8(wbptr);
+        wbptr += 8;
+      }
+    }
+
+    uint8x8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
+    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
+    {
+      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
+      {
+        inputs[i][j] = vld1_u8(inptrs[i][j] + n);
+      }
+    }
+
+    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
+    {
+      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
+      {
+        uint32x4_t accs[2];
+        for (unsigned int i = 0; i < 2; i++)
+        {
+          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
+        }
+
+        for (unsigned int wi = 0; wi < KernelRows; wi++)
+        {
+          for (unsigned int wj = 0; wj < KernelCols; wj++)
+          {
+            const uint8x8_t w = weights[wi][wj];
+            const uint8x8_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
+
+            const uint16x8_t muls = vmull_u8(w, x);
+            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
+            const uint16x8_t sum_elems = vmull_u8(x, woffset);
+
+            const uint32x4_t tmps[2] = {  // w*x - x*w_offset widened to 32 bits (unsigned, wraps when negative)
+              vsubl_u16(vget_low_u16(muls), vget_low_u16(sum_elems)),
+              vsubl_u16(vget_high_u16(muls), vget_high_u16(sum_elems)),
+            };
+            for (unsigned int i = 0; i < 2; i++)
+            {
+              accs[i] = vaddq_u32(accs[i], tmps[i]);
+            }
+          }
+        }
+
+        uint32x4_t final_accs[2];
+        for (unsigned int i = 0; i < 2; i++)
+        {
+#ifdef FIXED_POINT_REQUANTISATION
+          const int32x4_t y = rounding_divide_by_exp2(
+            saturating_doubling_high_mul(
+              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
+            ),
+            rescale_parameters.shift
+          );
+          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
+          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
+#else  // floating point requantisation
+          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
+          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
+          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
+          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));  // clamp at 0 before the unsigned conversion
+          final_accs[i] = vcvtq_u32_f32(fp_acc);
+#endif
+        }
+
+        uint8x8_t output = vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1])));  // saturating narrows
+
+        // Apply the activation function
+        if (Activation == ActivationFunction::ReLU ||
+            Activation == ActivationFunction::ReLU6)
+        {
+          output = vmax_u8(output, vdup_n_u8(aqmin));
+        }
+        if (Activation == ActivationFunction::ReLU6)
+        {
+          output = vmin_u8(output, vdup_n_u8(aqmax));
+        }
 
-  if (actfn == nck::ActivationFunction::ReLU6) {
-    const int32_t top_rail = output_quant.quantize(6.0f);
-    clamp_max = std::min(clamp_max, top_rail);
+        vst1_u8(outptrs[oi][oj] + n, output);
+      }
+    }
   }
+  for (; n_channels; n_channels--, n++)  // scalar tail for the remaining channels
+  {
+    // Load bias
+    const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
+    wbptr += sizeof(int32_t);
 
-  // Call the tile execution method
-  tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
-          StrideCols>(n_channels, packed_params, get_input_ptr, get_output_ptr,
-                      clamp_max, clamp_min, input_quant.offset,
-                      weight_quant.offset, output_quant.offset,
-                      requant.multiplier, requant.shift);
-}
+    // Load weights
+    uint8_t weights[KernelRows][KernelCols];
+    for (unsigned int i = 0; i < KernelRows; i++)
+    {
+      for (unsigned int j = 0; j < KernelCols; j++)
+      {
+        weights[i][j] = *(wbptr++);
+      }
+    }
 
-template <
-        unsigned int OutputTileRows, unsigned int OutputTileCols,
-        unsigned int KernelRows, unsigned int KernelCols,
-        unsigned int StrideRows, unsigned int StrideCols
->
-template <nck::ActivationFunction Activation>
-void QAsymm8DepthwiseConvolution<
-        OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
-        int n_channels,
-        const void* packed_params,
-        const uint8_t* inptr,
-        unsigned int in_row_stride,
-        unsigned int in_col_stride,
-        uint8_t* outptr,
-        unsigned int out_row_stride,
-        unsigned int out_col_stride
-               ) {
-  // Construct methods to get pointers
-  const auto get_input_ptr = [inptr, in_row_stride, in_col_stride](
-          const int i, const int j, const int channel) {
-      return inptr + i * in_row_stride + j * in_col_stride + channel;
-  };
-
-  const auto get_output_ptr = [outptr, out_row_stride, out_col_stride](
-          const int i, const int j, const int channel) {
-      return outptr + i * out_row_stride + j * out_col_stride + channel;
-  };
-
-  execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-          StrideRows, StrideCols>(
-          n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
-          _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
-}
+    // Load the input activations
+    uint8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
+    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
+    {
+      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
+      {
+        inputs[i][j] = *(inptrs[i][j] + n);
+      }
+    }
 
-template <
-        unsigned int OutputTileRows, unsigned int OutputTileCols,
-        unsigned int KernelRows, unsigned int KernelCols,
-        unsigned int StrideRows, unsigned int StrideCols
->
-template <nck::ActivationFunction Activation>
-void QAsymm8DepthwiseConvolution<
-        OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
-        int n_channels,
-        const void* packed_params,
-        const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-        uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-               ) {
-  // Construct methods to get pointers
-  const auto get_input_ptr = [inptrs](const int i, const int j,
-                                      const int channel) {
-      return inptrs[i][j] + channel;
-  };
-
-  const auto get_output_ptr = [outptrs](const int i, const int j,
-                                        const int channel) {
-      return outptrs[i][j] + channel;
-  };
-
-  // Call the tile execution method
-  execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-          StrideRows, StrideCols>(
-          n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
-          _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
+    // Perform the convolution
+    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
+    {
+      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
+      {
+        int32_t acc = bias;
+        uint32_t element_sum = 0;
+
+        for (unsigned int wi = 0; wi < KernelRows; wi++)
+        {
+          for (unsigned int wj = 0; wj < KernelCols; wj++)
+          {
+            const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
+            acc += static_cast<int32_t>(static_cast<uint32_t>(w) * static_cast<uint32_t>(x));
+            element_sum += static_cast<uint32_t>(x);
+          }
+        }
+
+        acc -= static_cast<int32_t>(element_sum) * static_cast<int32_t>(_weights_quant.offset);  // remove sum(x) * weight offset
+
+        // Requantize
+#ifdef FIXED_POINT_REQUANTISATION
+        acc = rounding_divide_by_exp2(
+            saturating_doubling_high_mul(acc, rescale_parameters.multiplier),
+            rescale_parameters.shift
+        );
+        acc += _output_quant.offset;
+        uint8_t output = clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(acc);
+#else  // floating point requantisation
+        float fp_acc = static_cast<float>(acc);
+        fp_acc *= rescale_parameters.rescale;
+        fp_acc += static_cast<float>(_output_quant.offset);
+        fp_acc = std::max<float>(fp_acc, 0.0f);
+        uint8_t output = static_cast<uint8_t>(std::min<int32_t>(static_cast<int32_t>(fp_acc), 255));  // int cast truncates; capped at 255
+#endif
+
+        // Apply the activation function
+        if (Activation == ActivationFunction::ReLU ||
+            Activation == ActivationFunction::ReLU6)
+        {
+          output = std::max(output, aqmin);
+        }
+        if (Activation == ActivationFunction::ReLU6)
+        {
+          output = std::min(output, aqmax);
+        }
+
+        *(outptrs[oi][oj] + n) = output;
+      }
+    }
+  }
 }
 
 }  // namespace depthwise
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
index 9f8064e924..c14100ab42 100644
--- a/src/graph/backends/CL/CLFunctionsFactory.cpp
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -59,8 +59,8 @@ struct CLConvolutionLayerFunctions
 /** Collection of CL depthwise convolution functions */
 struct CLDepthwiseConvolutionLayerFunctions
 {
-    using GenericDepthwiseConvolutionLayer = CLDepthwiseConvolutionLayer;
-    using DepthwiseConvolutionLayer3x3     = CLDepthwiseConvolutionLayer3x3;
+    using GenericDepthwiseConvolutionLayer   = CLDepthwiseConvolutionLayer;
+    using OptimizedDepthwiseConvolutionLayer = CLDepthwiseConvolutionLayer3x3; // on CL the "optimized" alias maps to the 3x3 layer
 };
 
 /** Collection of CL element-wise functions */
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
index c31072661b..d4892f53a6 100644
--- a/src/graph/backends/NEON/NEFunctionFactory.cpp
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -65,8 +65,8 @@ struct NEConvolutionLayerFunctions
 /** Collection of CL depthwise convolution functions */
 struct NEDepthwiseConvolutionLayerFunctions
 {
-    using GenericDepthwiseConvolutionLayer = NEDepthwiseConvolutionLayer;
-    using DepthwiseConvolutionLayer3x3     = NEDepthwiseConvolutionLayer3x3;
+    using GenericDepthwiseConvolutionLayer   = NEDepthwiseConvolutionLayer;
+    using OptimizedDepthwiseConvolutionLayer = NEDepthwiseConvolutionLayerOptimized; // NOTE(review): the struct's doc comment says "CL" but these are NEON types
 };
 
 /** Collection of CL element-wise functions */
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 43288ec4c6..45cc2d2762 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -363,6 +363,333 @@ void NEDepthwiseConvolutionLayer3x3::prepare()
     }
 }
 
+NEDepthwiseConvolutionLayerOptimized::NEDepthwiseConvolutionLayerOptimized(std::shared_ptr<IMemoryManager> memory_manager) // manager is handed to both the memory group and the optimized function
+    : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(),
+      _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false),
+      _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false) // placeholder flags; configure() assigns the real values
+{
+}
+
+void NEDepthwiseConvolutionLayerOptimized::configure_generic(ITensor                   *input,
+                                                             const ITensor             *weights,
+                                                             const ITensor             *biases,
+                                                             ITensor                   *output,
+                                                             const PadStrideInfo       &conv_info,
+                                                             unsigned int               depth_multiplier,
+                                                             const ActivationLayerInfo &act_info,
+                                                             const Size2D              &dilation)
+{
+    ARM_COMPUTE_UNUSED(act_info); // the activation function is configured by configure(), not here
+
+    PixelValue zero_value(0.f); // border fill value; replaced by the input quantization offset when quantized
+
+    // Initialize the intermediate accumulator tensor in case of quantized input
+    if(_is_quantized)
+    {
+        TensorShape accum_shape  = output->info()->tensor_shape();
+        DataLayout  accum_layout = output->info()->data_layout();
+        if(!_is_nchw) // the kernel below is given NCHW tensors, so describe the accumulator in NCHW too
+        {
+            permute(accum_shape, PermutationVector(1U, 2U, 0U));
+            accum_layout = DataLayout::NCHW;
+        }
+
+        _memory_group.manage(&_accumulator);
+        _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
+        _accumulator.info()->set_data_layout(accum_layout);
+        zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().uniform().offset));
+    }
+
+    if(!_is_nchw)
+    {
+        _memory_group.manage(&_permuted_input);
+        _memory_group.manage(&_permuted_output);
+
+        // Configure the function to transform the input tensor from NHWC -> NCHW
+        _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+        // Configure the function to transform the weights tensor from HWI -> IHW
+        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
+
+        // Configure depthwise
+        _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation);
+
+        // Configure border handler
+        _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+
+        // Allocate tensors
+        _permuted_input.allocator()->allocate();
+    }
+    else
+    {
+        // Configure depthwise convolution kernel
+        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation);
+
+        // Configure border handler
+        _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+    }
+
+    // Configure biases accumulation
+    if(_is_quantized)
+    {
+        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
+        const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = (output->info()->total_size() == 0) ? iq_info : output->info()->quantization_info().uniform(); // falls back to the input quantization when output total_size() is 0
+
+        float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale; // requantization scale: input_scale * weights_scale / output_scale
+        int   output_multiplier;
+        int   output_shift;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, oq_info.offset);
+        _accumulator.allocator()->allocate();
+    }
+    else if(_has_bias) // non-quantized: output stage gets only the tensor and the biases (no separate destination passed)
+    {
+        _output_stage_kernel.configure(_is_nchw ? output : &_permuted_output, biases);
+    }
+
+    // Permute output
+    if(!_is_nchw)
+    {
+        // Configure the function to transform the convolved output to NHWC
+        _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+        _permuted_output.allocator()->allocate();
+    }
+}
+
+void NEDepthwiseConvolutionLayerOptimized::configure_optimized(const ITensor             *input,
+                                                               const ITensor             *weights,
+                                                               const ITensor             *biases,
+                                                               ITensor                   *output,
+                                                               const PadStrideInfo       &conv_info,
+                                                               unsigned int               depth_multiplier,
+                                                               const ActivationLayerInfo &act_info,
+                                                               const Size2D              &dilation)
+{
+    ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
+    const bool          is_relu         = arm_compute::utils::info_helpers::is_relu(act_info);
+    const bool          is_relu6        = arm_compute::utils::info_helpers::is_relu6(act_info);
+    _is_activationlayer_enabled         = act_info.enabled() && !(is_relu || is_relu6); // a separate activation layer is kept only for activations other than ReLU/ReLU6
+    if(!_is_activationlayer_enabled)
+    {
+        act_info_to_use = act_info; // ReLU/ReLU6 (or a disabled activation) is forwarded to the optimized function
+    }
+
+    if(_is_nchw) // the optimized function is given NHWC tensors, so NCHW data is permuted around it
+    {
+        _memory_group.manage(&_permuted_input);
+        _memory_group.manage(&_permuted_output);
+
+        // Configure the function to transform the input tensor from NCHW -> NHWC
+        _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+        _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+
+        // Configure the function to transform the weights tensor from IHW -> HWI
+        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+
+        _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
+
+        // Configure optimized depthwise
+        _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use, dilation);
+
+        // Configure the function to transform the convolved output to ACL's native ordering format NCHW
+        _permuted_output.info()->set_data_layout(DataLayout::NHWC); // NOTE(review): repeats the set_data_layout call made before configure — confirm it is still needed
+        _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+
+        // Allocate tensors
+        _permuted_input.allocator()->allocate();
+        _permuted_output.allocator()->allocate();
+    }
+    else
+    {
+        _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use, dilation);
+    }
+}
+
+void NEDepthwiseConvolutionLayerOptimized::configure(ITensor       *input,
+                                                     const ITensor *weights,
+                                                     const ITensor *biases,
+                                                     ITensor *output, const PadStrideInfo &conv_info,
+                                                     unsigned int               depth_multiplier,
+                                                     const ActivationLayerInfo &act_info,
+                                                     const Size2D              &dilation)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+    // idx_w and idx_h only used for validation. NOTE(review): unlike validate(), dilation >= 1 is not checked here — confirm callers validate first
+    const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_UNUSED(idx_w);
+    ARM_COMPUTE_UNUSED(idx_h);
+
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right()); // dilated kernel width must fit in the padded input width
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom()); // dilated kernel height must fit in the padded input height
+
+    _original_weights = weights;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _has_bias         = biases != nullptr;
+    _is_optimized     = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(),
+                                                                                       weights->info(),
+                                                                                       conv_info,
+                                                                                       depth_multiplier,
+                                                                                       dilation);
+    _is_nchw                    = input->info()->data_layout() == DataLayout::NCHW;
+    _permute                    = _is_optimized == _is_nchw; // true for optimized+NCHW or generic+NHWC, the two cases where the configure_* helpers permute
+    _is_prepared                = false;
+    _is_activationlayer_enabled = act_info.enabled(); // may be cleared by configure_optimized() when ReLU/ReLU6 is forwarded to the optimized function
+
+    // Configure appropriate pipeline
+    if(_is_optimized)
+    {
+        configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+    }
+    else
+    {
+        configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+    }
+
+    // Configure activation
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.configure(output, nullptr, act_info); // nullptr destination: presumably runs in place on output — confirm
+    }
+}
+
+Status NEDepthwiseConvolutionLayerOptimized::validate(const ITensorInfo         *input,
+                                                      const ITensorInfo         *weights,
+                                                      const ITensorInfo         *biases,
+                                                      const ITensorInfo         *output,
+                                                      const PadStrideInfo       &conv_info,
+                                                      unsigned int               depth_multiplier,
+                                                      const ActivationLayerInfo &act_info,
+                                                      const Size2D              &dilation)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
+    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right()); // dilated kernel width must fit in the padded input width
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom()); // dilated kernel height must fit in the padded input height
+
+    if(biases != nullptr)
+    {
+        const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
+    }
+
+    if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation)) // generic path: validate the 3x3 kernel, plus the output stage when quantized
+    {
+        const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+        TensorInfo accumulator  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); // S32 copy of the output info, used as the kernel destination when quantized
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier, dilation));
+
+        if(is_quantized)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output));
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
+    }
+
+    // Validate the activation layer (checked whenever an activation is enabled, on either path)
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
+    }
+
+    return Status{};
+}
+
+void NEDepthwiseConvolutionLayerOptimized::run_generic()
+{ // Non-assembly pipeline: border fill, depthwise kernel, optional output stage, optional output permute
+    // Fill border by scheduling the border handler kernel
+    NEScheduler::get().schedule(&_border_handler, Window::DimX);
+
+    // Execute depthwise convolution kernel
+    NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
+
+    // Run the output stage kernel when a bias is present or the data is quantized
+    if(_has_bias || _is_quantized)
+    {
+        NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
+    }
+
+    // Permute output when the input layout is not NCHW (presumably this path computes in NCHW - confirm in configure_generic)
+    if(!_is_nchw)
+    {
+        _permute_output.run();
+    }
+}
+
+void NEDepthwiseConvolutionLayerOptimized::run_optimized()
+{ // Assembly pipeline: run the optimized function, then permute the output if needed
+    // Run assembly function (_dwc_optimized_func)
+    _dwc_optimized_func.run();
+
+    // Permute output when the input layout is NCHW (the opposite condition to run_generic())
+    if(_is_nchw)
+    {
+        _permute_output.run();
+    }
+}
+
+void NEDepthwiseConvolutionLayerOptimized::run()
+{ // Executes the configured pipeline: prepare, optional input permute, convolution, optional activation
+    prepare(); // does nothing once _is_prepared has been set
+
+    MemoryGroupResourceScope scope_mg(_memory_group); // scoped object bound to _memory_group; presumably holds its resources until run() returns - confirm
+
+    // Permute input when _permute is set (assigned during configuration as _is_optimized == _is_nchw)
+    if(_permute)
+    {
+        _permute_input.run();
+    }
+
+    _is_optimized ? run_optimized() : run_generic(); // pick the assembly or the generic pipeline
+
+    // Run activation as a separate function when it was enabled at configuration time
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.run();
+    }
+}
+
+void NEDepthwiseConvolutionLayerOptimized::prepare()
+{ // One-time weight preparation; guarded by _is_prepared so repeated calls are no-ops
+    if(!_is_prepared)
+    {
+        // Permute weights into _permuted_weights, then flag the original weights as unused
+        if(_permute)
+        {
+            _permuted_weights.allocator()->allocate();
+            _permute_weights.run();
+            _original_weights->mark_as_unused();
+        }
+
+        // Prepare optimized function, then free the permuted weights if nothing reports them as used
+        if(_is_optimized)
+        {
+            _dwc_optimized_func.prepare();
+            if(!_permuted_weights.is_used())
+            {
+                _permuted_weights.allocator()->free();
+            }
+        }
+
+        _is_prepared = true; // later calls skip everything above
+    }
+}
+
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _permute_input(),
       _permute_weights(), _permute_output(), _activationlayer_function(), _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _permuted_input(), _permuted_weights(),
diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
index 5f57bbfe23..b28aaa715f 100644
--- a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
@@ -26,7 +26,9 @@
 
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
@@ -42,19 +44,22 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor
                                                                    const ITensor      *weights,
                                                                    ITensor            *output,
                                                                    PadStrideInfo       conv_info,
-                                                                   ActivationLayerInfo act_info)
+                                                                   ActivationLayerInfo act_info,
+                                                                   const Size2D       &dilation)
 {
+    ARM_COMPUTE_UNUSED(dilation);
     const DataType    data_type = input->info()->data_type();
     const TensorShape shape     = input->info()->tensor_shape();
 
-    const int n_batches      = shape[3];
-    const int in_rows        = shape.z();
-    const int in_cols        = shape.y();
-    const int n_channels     = shape.x();
-    const int padding_top    = conv_info.pad_top();
-    const int padding_left   = conv_info.pad_left();
-    const int padding_bottom = conv_info.pad_bottom();
-    const int padding_right  = conv_info.pad_right();
+    const int n_batches       = shape[3];
+    const int in_rows         = shape.z();
+    const int in_cols         = shape.y();
+    const int n_channels      = shape.x();
+    const int dilation_factor = dilation.x();
+    const int padding_top     = conv_info.pad_top();
+    const int padding_left    = conv_info.pad_left();
+    const int padding_bottom  = conv_info.pad_bottom();
+    const int padding_right   = conv_info.pad_right();
 
     const unsigned int stride_x = conv_info.stride().first;
 
@@ -95,11 +100,11 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor
         switch(stride_x)
         {
             case 1:
-                return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
-                           n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+                return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
+                           n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
             case 2:
-                return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
-                           n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+                return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
+                           n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
             default:
                 return nullptr;
         }
@@ -115,11 +120,11 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor
                 switch(stride_x)
                 {
                     case 1:
-                        return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>>(
-                                   n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+                        return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>>(
+                                   n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
                     case 2:
-                        return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>>(
-                                   n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+                        return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>>(
+                                   n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
                     default:
                         return nullptr;
                 }
@@ -131,11 +136,11 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor
                 switch(stride_x)
                 {
                     case 1:
-                        return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>>(
-                                   n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+                        return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>>(
+                                   n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
                     case 2:
-                        return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>>(
-                                   n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+                        return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>>(
+                                   n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
                     default:
                         return nullptr;
                 }
@@ -148,21 +153,30 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor
 }
 } // namespace
 
+struct NEDepthwiseConvolutionAssemblyDispatch::LocalImpl // Implementation state that the dispatch function reaches through _pImpl
+{
+    std::unique_ptr<depthwise::IDepthwiseConvolution> _dwc_assembly_kernel{ nullptr }; // convolver produced by create_convolver() in configure()
+    NEDepthwiseConvolutionAssemblyKernelWrapper       _dwc_acl_kernel{}; // wrapper configured with the assembly kernel and passed to the scheduler in run()
+};
+
 #ifndef DOXYGEN_SKIP_THIS
 NEDepthwiseConvolutionAssemblyDispatch::NEDepthwiseConvolutionAssemblyDispatch(std::shared_ptr<arm_compute::IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), _packed_weights(), _workspace(), _is_prepared(false), _dwc_assembly_kernel(nullptr),
-      _dwc_acl_kernel()
+    : _memory_group(std::move(memory_manager)), _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), _packed_weights(), _workspace(), _is_prepared(false),
+      _pImpl(support::cpp14::make_unique<LocalImpl>()) // the assembly kernel and its wrapper now live inside LocalImpl instead of being direct members
 {
 }
 #endif /* DOXYGEN_SKIP_THIS */
 
+NEDepthwiseConvolutionAssemblyDispatch::~NEDepthwiseConvolutionAssemblyDispatch() = default; // defaulted here, after LocalImpl is defined; presumably so _pImpl can be destroyed with a complete type - confirm against the header
+
 void NEDepthwiseConvolutionAssemblyDispatch::configure(const ITensor             *input,
                                                        const ITensor             *weights,
                                                        const ITensor             *bias,
                                                        ITensor                   *output,
                                                        const PadStrideInfo       &conv_info,
                                                        unsigned int               depth_multiplier,
-                                                       const ActivationLayerInfo &act_info)
+                                                       const ActivationLayerInfo &act_info,
+                                                       const Size2D              &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_UNUSED(depth_multiplier);
@@ -172,10 +186,11 @@ void NEDepthwiseConvolutionAssemblyDispatch::configure(const ITensor
                                                                                 output->info(),
                                                                                 conv_info,
                                                                                 depth_multiplier,
-                                                                                act_info));
+                                                                                act_info,
+                                                                                dilation));
 
     // Output auto inizialitation if not yet initialized
-    const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+    const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
     auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
 
     _input       = input;
@@ -185,24 +200,24 @@ void NEDepthwiseConvolutionAssemblyDispatch::configure(const ITensor
     _is_prepared = false;
 
     // Create convolver
-    _dwc_assembly_kernel = create_convolver(input, weights, output, conv_info, act_info);
-    ARM_COMPUTE_ERROR_ON(_dwc_assembly_kernel == nullptr);
+    _pImpl->_dwc_assembly_kernel = create_convolver(input, weights, output, conv_info, act_info, dilation);
+    ARM_COMPUTE_ERROR_ON(_pImpl->_dwc_assembly_kernel == nullptr);
 
     // Create assembly kernel wrapper
-    _dwc_acl_kernel.configure(_dwc_assembly_kernel.get());
+    _pImpl->_dwc_acl_kernel.configure(_pImpl->_dwc_assembly_kernel.get());
 
     constexpr size_t alignment = 128;
 
     // Create workspace
     const unsigned int num_threads    = NEScheduler::get().num_threads();
-    const size_t       workspace_size = _dwc_assembly_kernel->get_working_space_size(num_threads);
+    const size_t       workspace_size = _pImpl->_dwc_assembly_kernel->get_working_space_size(num_threads);
     ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !");
     _workspace.allocator()->init(TensorInfo(TensorShape{ workspace_size }, 1, DataType::S8), alignment);
     _memory_group.manage(&_workspace);
     _workspace.allocator()->allocate();
 
     // Create packing tensor
-    const size_t pack_tensor_size = _dwc_assembly_kernel->get_packed_params_size();
+    const size_t pack_tensor_size = _pImpl->_dwc_assembly_kernel->get_packed_params_size();
     ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !");
     _packed_weights.allocator()->init(TensorInfo(TensorShape{ pack_tensor_size }, 1, DataType::S8), alignment);
 }
@@ -213,7 +228,8 @@ Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo
                                                         const ITensorInfo         *output,
                                                         const PadStrideInfo       &conv_info,
                                                         unsigned int               depth_multiplier,
-                                                        const ActivationLayerInfo &act_info)
+                                                        const ActivationLayerInfo &act_info,
+                                                        const Size2D              &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -227,6 +243,7 @@ Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != 3 || weights->dimension(height_idx) != 3);
     ARM_COMPUTE_RETURN_ERROR_ON(!((strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2))));
     ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() != dilation.y());
 
     const bool is_relu  = arm_compute::utils::info_helpers::is_relu(act_info);
     const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
@@ -243,7 +260,7 @@ Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo
     // Check output
     if(output->total_size() != 0)
     {
-        const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
@@ -283,17 +300,17 @@ bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITenso
     bool        supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2));
 
     // Check for supported padding
-    const auto    pad_top           = conv_info.pad_top();
-    const auto    pad_right         = conv_info.pad_right();
-    const auto    pad_bottom        = conv_info.pad_bottom();
-    const auto    pad_left          = conv_info.pad_left();
-    PadStrideInfo same_pad          = calculate_same_pad(in_shape, TensorShape(3U, 3U), conv_info);
-    bool          is_same_padding   = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
-    bool          is_valid_padding  = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
-    bool          supported_padding = is_same_padding || is_valid_padding;
-    bool          is_dilation_1     = dilation.x() == 1 && dilation.y() == 1;
-
-    return is_data_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_1;
+    const auto    pad_top               = conv_info.pad_top();
+    const auto    pad_right             = conv_info.pad_right();
+    const auto    pad_bottom            = conv_info.pad_bottom();
+    const auto    pad_left              = conv_info.pad_left();
+    PadStrideInfo same_pad              = calculate_same_pad(in_shape, TensorShape(3U, 3U), conv_info);
+    bool          is_same_padding       = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
+    bool          is_valid_padding      = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
+    bool          supported_padding     = is_same_padding || is_valid_padding;
+    bool          is_dilation_supported = (dilation.x() == dilation.y()) || (dilation == Size2D(1U, 1U));
+
+    return is_data_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_supported;
 }
 
 void NEDepthwiseConvolutionAssemblyDispatch::run()
@@ -305,7 +322,7 @@ void NEDepthwiseConvolutionAssemblyDispatch::run()
 
     // Setup inputs/outputs
     ARM_COMPUTE_ERROR_ON(_workspace.buffer() == nullptr);
-    _dwc_assembly_kernel->set_working_space(static_cast<void *>(_workspace.buffer()));
+    _pImpl->_dwc_assembly_kernel->set_working_space(static_cast<void *>(_workspace.buffer()));
 
     ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
     const int   input_element_size = _input->info()->element_size();
@@ -313,7 +330,7 @@ void NEDepthwiseConvolutionAssemblyDispatch::run()
     const int   input_row_stride   = _input->info()->strides_in_bytes().z() / input_element_size;
     const int   input_col_stride   = _input->info()->strides_in_bytes().y() / input_element_size;
     const void *input_ptr          = _input->buffer() + _input->info()->offset_first_element_in_bytes();
-    _dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride);
+    _pImpl->_dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride);
 
     ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
     const int output_element_size = _output->info()->element_size();
@@ -321,10 +338,10 @@ void NEDepthwiseConvolutionAssemblyDispatch::run()
     const int output_row_stride   = _output->info()->strides_in_bytes().z() / output_element_size;
     const int output_col_stride   = _output->info()->strides_in_bytes().y() / output_element_size;
     void     *output_ptr          = _output->buffer() + _output->info()->offset_first_element_in_bytes();
-    _dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride);
+    _pImpl->_dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride);
 
     // Schedule assembly kernel
-    NEScheduler::get().schedule(&_dwc_acl_kernel, Window::DimX);
+    NEScheduler::get().schedule(&_pImpl->_dwc_acl_kernel, Window::DimX);
 }
 
 void NEDepthwiseConvolutionAssemblyDispatch::prepare()
@@ -338,12 +355,12 @@ void NEDepthwiseConvolutionAssemblyDispatch::prepare()
         const int weights_element_size = _weights->info()->element_size();
         const int weights_row_stride   = _weights->info()->strides_in_bytes().z() / weights_element_size;
         const int weights_col_stride   = _weights->info()->strides_in_bytes().y() / weights_element_size;
-        _dwc_assembly_kernel->pack_params(_packed_weights.buffer(),
-                                          _weights->buffer() + _weights->info()->offset_first_element_in_bytes(),
-                                          weights_row_stride,
-                                          weights_col_stride,
-                                          (_bias != nullptr) ? _bias->buffer() : nullptr);
-        _dwc_assembly_kernel->set_packed_params_buffer(_packed_weights.buffer());
+        _pImpl->_dwc_assembly_kernel->pack_params(_packed_weights.buffer(),
+                                                  _weights->buffer() + _weights->info()->offset_first_element_in_bytes(),
+                                                  weights_row_stride,
+                                                  weights_col_stride,
+                                                  (_bias != nullptr) ? _bias->buffer() : nullptr);
+        _pImpl->_dwc_assembly_kernel->set_packed_params_buffer(_packed_weights.buffer());
 
         _weights->mark_as_unused();
         if(_bias != nullptr)
diff --git a/tests/datasets/DepthwiseConvolutionLayerDataset.h b/tests/datasets/DepthwiseConvolutionLayerDataset.h
index 4c78eb87ea..440cb88ac2 100644
--- a/tests/datasets/DepthwiseConvolutionLayerDataset.h
+++ b/tests/datasets/DepthwiseConvolutionLayerDataset.h
@@ -215,6 +215,7 @@ public:
         // Stride 2
         add_config(TensorShape(7U, 7U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL));
         add_config(TensorShape(7U, 7U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1, 1, 1, DimensionRoundingType::CEIL));
+        add_config(TensorShape(9U, 9U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1, 1, 1, DimensionRoundingType::CEIL), Size2D(2U, 2U));
     }
 };
 /** Dataset containing optimized, 3x3 depthwise convolution shapes. */
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
index 773ebdeacc..2ffe540fbc 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
@@ -156,7 +156,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false, false, true })),
                input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier,dilation, expected)
 {
-    bool is_valid = bool(NEDepthwiseConvolutionLayer3x3::validate(&input_info.clone()->set_is_resizable(false),
+    bool is_valid = bool(NEDepthwiseConvolutionLayerOptimized::validate(&input_info.clone()->set_is_resizable(false),
      &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, depth_multiplier, ActivationLayerInfo(), dilation));
     ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
 }
@@ -296,7 +296,7 @@ TEST_SUITE_END() // Generic
 
 TEST_SUITE(W3x3)
 template <typename T>
-using NEDepthwiseConvolutionLayerFixture3x3 = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer3x3, T>;
+using NEDepthwiseConvolutionLayerFixture3x3 = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayerOptimized, T>;
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
                                                                                                                     depth_multipliers),
                                                                                                                     framework::dataset::make("DataType",
@@ -409,7 +409,7 @@ TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // Generic
 TEST_SUITE(W3x3)
 template <typename T>
-using NEDepthwiseConvolutionLayerFixture3x3 = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer3x3, T>;
+using NEDepthwiseConvolutionLayerFixture3x3 = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayerOptimized, T>;
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
                                                                                                                    depth_multipliers),
                                                                                                                    framework::dataset::make("DataType",
@@ -480,7 +480,7 @@ TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 
 template <typename T>
-using NEDepthwiseConvolutionLayerQuantizedFixture3x3 = DepthwiseConvolutionLayerValidationQuantizedFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer3x3, T>;
+using NEDepthwiseConvolutionLayerQuantizedFixture3x3 = DepthwiseConvolutionLayerValidationQuantizedFixture<Tensor, Accessor, NEDepthwiseConvolutionLayerOptimized, T>;
 template <typename T>
 using NEDepthwiseConvolutionLayerQuantizedFixture = DepthwiseConvolutionLayerValidationQuantizedFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
 
-- 
cgit v1.2.1