From 881f2ded860fc1db23810076b699c4492556c376 Mon Sep 17 00:00:00 2001
From: Usama Arif <usama.arif@arm.com>
Date: Fri, 12 Apr 2019 10:29:17 +0100
Subject: COMPMID-2048: Add support for dilation in NEDepthwiseConvolution.

Change-Id: If9941e770779fbf918ba5ff0573da9378078b969
Signed-off-by: Usama Arif <usama.arif@arm.com>
Reviewed-on: https://review.mlplatform.org/c/999
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez <pablo.tello@arm.com>
---
 .../kernels/NEDepthwiseConvolutionLayer3x3Kernel.h |  15 +-
 .../core/NEON/kernels/NEDepthwiseIm2ColKernel.h    |  15 +-
 .../kernels/detail/NEDirectConvolutionDetail.h     | 376 ++++++++++++++++++++-
 .../NEON/functions/NEDepthwiseConvolutionLayer.h   |  12 +-
 .../NEDepthwiseConvolutionAssemblyDispatch.h       |   3 +-
 .../NEDepthwiseConvolutionLayer3x3Kernel.cpp       | 106 +++---
 src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp  |  27 +-
 .../CL/functions/CLDepthwiseConvolutionLayer.cpp   |  11 +
 .../NEON/functions/NEDepthwiseConvolutionLayer.cpp |  55 ++-
 .../NEDepthwiseConvolutionAssemblyDispatch.cpp     |   6 +-
 .../validation/NEON/DepthwiseConvolutionLayer.cpp  | 188 ++++++++++-
 11 files changed, 718 insertions(+), 96 deletions(-)

diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
index 87ca4da05b..c0381cb8d7 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
@@ -58,21 +58,25 @@ public:
      * @param[out] output           Destination tensor. Data type supported: Same as @p input.
      * @param[in]  conv_info        Padding and stride information to use for the convolution.
      * @param[in]  depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in]  dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     *
      */
-    void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1);
+    void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U));
     /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer3x3Kernel
      *
      * @note Supported data layouts: NCHW and NHWC
      *
-     * @param[in] input            Source tensor. DataType supported: QASYMM8/F16/F32.
-     * @param[in] weights          Weights tensor. This is a 3D tensor with dimensions [3, 3, IFM] for NCHW or [IFM, 3, 3] if NHWC data layout. Data type supported: Same as @p input.
-     * @param[in] output           Destination tensor. Data type supported: Same as @p input.
+     * @param[in] input            Source tensor info. DataType supported: QASYMM8/F16/F32.
+     * @param[in] weights          Weights tensor info. This is a 3D tensor with dimensions [3, 3, IFM] for NCHW or [IFM, 3, 3] if NHWC data layout. Data type supported: Same as @p input.
+     * @param[in] output           Destination tensor info. Data type supported: Same as @p input.
      * @param[in] conv_info        Padding and stride information to use for the convolution.
      * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in] dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
+                           const Size2D &dilation = Size2D(1U, 1U));
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
@@ -86,6 +90,7 @@ private:
     PadStrideInfo  _conv_info;
     unsigned int   _num_elems_written_per_iteration;
     unsigned int   _depth_multiplier;
+    Size2D         _dilation;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h
index de671361d6..3e123b4839 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,23 +62,27 @@ public:
      * @param[in]  conv_info        Contains padding and stride information described in @ref PadStrideInfo.
      * @param[in]  has_bias         Boolean that specifies if the depthwise convolution has bias.
      * @param[in]  depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in]  dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      */
-    void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias = false, unsigned int depth_multiplier = 1);
+    void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias = false, unsigned int depth_multiplier = 1,
+                   const Size2D &dilation = Size2D(1U, 1U));
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseIm2ColKernel
      *
-     * @param[in] input            The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+     * @param[in] input            The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM],
      *                             while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32
-     * @param[in] output           The output tensor. First 3 lower dimensions represent a transform of each 3D input,
+     * @param[in] output           The output tensor info. First 3 lower dimensions represent a transform of each 3D input,
      *                             while every dimension above 3 represents a batch. Data types supported: Same as @p input
      * @param[in] kernel_dims      The kernel dimensions (width and height).
      * @param[in] conv_info        Contains padding and stride information described in @ref PadStrideInfo.
      * @param[in] has_bias         Boolean that specifies if the depthwise convolution has bias.
      * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in] dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias = false, unsigned int depth_multiplier = 1);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias = false, unsigned int depth_multiplier = 1,
+                           const Size2D &dilation = Size2D(1U, 1U));
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
@@ -104,6 +108,7 @@ private:
     PadStrideInfo              _conv_info;
     bool                       _has_bias;
     unsigned int               _depth_multiplier;
+    Size2D                     _dilation;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_NEDEPTHWISEIM2COLKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
index e6dc43a47b..3547d2d110 100644
--- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
+++ b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -79,6 +79,125 @@ inline int32x4x3_t load_matrix_row(const uint8_t *ptr, int weights_offset = 0)
     return r;
 }
 
+/** Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.
+ *
+ * @param[in] in_top       Pointer to the first row of the input.
+ * @param[in] in_mid       Pointer to the second row of the input.
+ * @param[in] in_low       Pointer to the third row of the input.
+ * @param[in] m0           First row of the filter.
+ * @param[in] m1           Second row of the filter.
+ * @param[in] m2           Third row of the filter.
+ * @param[in] dilation_x   Dilation, in elements across x.
+ * @param[in] input_offset (Optional) Input quantization offset.
+ *
+ */
+inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
+                                                const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
+                                                const size_t dilation_x, int input_offset)
+{
+    ARM_COMPUTE_UNUSED(input_offset);
+
+    const float32x4x3_t vtop =
+    {
+        {
+            vld1q_f32(in_top),
+            vld1q_f32(in_top + dilation_x),
+            vld1q_f32(in_top + 2 * dilation_x)
+        }
+    };
+    const float32x4x3_t vmid =
+    {
+        {
+            vld1q_f32(in_mid),
+            vld1q_f32(in_mid + dilation_x),
+            vld1q_f32(in_mid + 2 * dilation_x)
+        }
+    };
+    const float32x4x3_t vlow =
+    {
+        {
+            vld1q_f32(in_low),
+            vld1q_f32(in_low + dilation_x),
+            vld1q_f32(in_low + 2 * dilation_x)
+        }
+    };
+    float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]);
+    out             = vmlaq_f32(out, vtop.val[1], m0.val[1]);
+    out             = vmlaq_f32(out, vtop.val[2], m0.val[2]);
+
+    out = vmlaq_f32(out, vmid.val[0], m1.val[0]);
+    out = vmlaq_f32(out, vmid.val[1], m1.val[1]);
+    out = vmlaq_f32(out, vmid.val[2], m1.val[2]);
+
+    out = vmlaq_f32(out, vlow.val[0], m2.val[0]);
+    out = vmlaq_f32(out, vlow.val[1], m2.val[1]);
+    out = vmlaq_f32(out, vlow.val[2], m2.val[2]);
+
+    return out;
+}
+
+/** Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.
+ *
+ * @param[in] in_top       Pointer to the first row of the input.
+ * @param[in] in_mid       Pointer to the second row of the input.
+ * @param[in] in_low       Pointer to the third row of the input.
+ * @param[in] m0           First row of the filter.
+ * @param[in] m1           Second row of the filter.
+ * @param[in] m2           Third row of the filter.
+ * @param[in] dilation_x   Dilation, in elements across x.
+ * @param[in] input_offset (Optional) Input quantization offset.
+ *
+ */
+template <unsigned int stridex>
+float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
+                                    const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
+                                    const size_t dilation_x, int input_offset = 0);
+
+template <>
+inline float32x4x2_t convolve_3x3_dilation<1>(const float *in_top, const float *in_mid, const float *in_low,
+                                              const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
+                                              const size_t dilation_x, int input_offset)
+{
+    ARM_COMPUTE_UNUSED(input_offset);
+
+    const float32x4x2_t out =
+    {
+        {
+            single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+            single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
+        }
+    };
+
+    return out;
+}
+
+template <>
+inline float32x4x2_t convolve_3x3_dilation<2>(const float *in_top, const float *in_mid, const float *in_low,
+                                              const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
+                                              const size_t dilation_x, int input_offset)
+{
+    ARM_COMPUTE_UNUSED(input_offset);
+
+    float32x4x2_t out = convolve_3x3_dilation<1>(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset);
+    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
+    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
+    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
+    return out;
+}
+
+template <>
+inline float32x4x2_t convolve_3x3_dilation<3>(const float *in_top, const float *in_mid, const float *in_low,
+                                              const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
+                                              const size_t dilation_x, int input_offset)
+{
+    ARM_COMPUTE_UNUSED(input_offset);
+
+    float32x4x2_t out = convolve_3x3_dilation<1>(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset);
+    ;
+    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
+    return out;
+}
+
 /** Perform a convolve3x3 on float32.
  *
  * @param[in] in_top       Pointer to the first row of the input.
@@ -183,6 +302,143 @@ inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, c
     return out;
 }
 
+/** Perform a 3x3 convolution for 4 consecutive elements on uint8_t when dilation.x() or dilation.y() is not 1.
+ *
+ * @param[in] in_top       Pointer to the first row of the input.
+ * @param[in] in_mid       Pointer to the second row of the input.
+ * @param[in] in_low       Pointer to the third row of the input.
+ * @param[in] m0           First row of the filter.
+ * @param[in] m1           Second row of the filter.
+ * @param[in] m2           Third row of the filter.
+ * @param[in] dilation_x   Dilation, in elements across x.
+ * @param[in] input_offset Input quantization offset.
+ *
+ */
+inline int32x4_t single_convolve_3x3_dilation(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low,
+                                              const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
+                                              size_t dilation_x, int input_offset)
+{
+    const int32x4_t v_input_offset = vdupq_n_s32(input_offset);
+
+    const uint8x8x3_t vtop =
+    {
+        {
+            vld1_u8(in_top),
+            vld1_u8(in_top + dilation_x),
+            vld1_u8(in_top + 2 * dilation_x)
+        }
+    };
+    const uint8x8x3_t vmid =
+    {
+        {
+            vld1_u8(in_mid),
+            vld1_u8(in_mid + dilation_x),
+            vld1_u8(in_mid + 2 * dilation_x)
+        }
+    };
+    const uint8x8x3_t vlow =
+    {
+        {
+            vld1_u8(in_low),
+            vld1_u8(in_low + dilation_x),
+            vld1_u8(in_low + 2 * dilation_x)
+        }
+    };
+
+    const int32x4x3_t vtop_s32 =
+    {
+        {
+            vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vtop.val[0])))), //convert from uint8x8 to uint16x8, to uint16x4(lower or bottom half) to int16x4 to int32x4
+            vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vtop.val[1])))),
+            vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vtop.val[2])))),
+        }
+    };
+    const int32x4x3_t vmid_s32 =
+    {
+        {
+            vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vmid.val[0])))),
+            vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vmid.val[1])))),
+            vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vmid.val[2])))),
+        }
+    };
+    const int32x4x3_t vlow_s32 =
+    {
+        {
+            vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vlow.val[0])))),
+            vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vlow.val[1])))),
+            vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vlow.val[2])))),
+        }
+    };
+
+    int32x4_t out = vmulq_s32(vtop_s32.val[0], m0.val[0]);
+    out           = vmlaq_s32(out, vtop_s32.val[1], m0.val[1]);
+    out           = vmlaq_s32(out, vtop_s32.val[2], m0.val[2]);
+
+    out = vmlaq_s32(out, vmid_s32.val[0], m1.val[0]);
+    out = vmlaq_s32(out, vmid_s32.val[1], m1.val[1]);
+    out = vmlaq_s32(out, vmid_s32.val[2], m1.val[2]);
+
+    out = vmlaq_s32(out, vlow_s32.val[0], m2.val[0]);
+    out = vmlaq_s32(out, vlow_s32.val[1], m2.val[1]);
+    out = vmlaq_s32(out, vlow_s32.val[2], m2.val[2]);
+
+    return out;
+}
+
+/** Perform a 3x3 convolution for 4 consecutive elements on uint8_t when dilation.x() or dilation.y() is not 1.
+ *
+ * @param[in] in_top       Pointer to the first row of the input.
+ * @param[in] in_mid       Pointer to the second row of the input.
+ * @param[in] in_low       Pointer to the third row of the input.
+ * @param[in] m0           First row of the filter.
+ * @param[in] m1           Second row of the filter.
+ * @param[in] m2           Third row of the filter.
+ * @param[in] dilation_x   Dilation, in elements across x.
+ * @param[in] input_offset Input quantization offset.
+ *
+ */
+template <unsigned int stridex>
+int32x4x2_t convolve_3x3_dilation(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low,
+                                  const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
+                                  const size_t dilation_x, int input_offset);
+
+template <>
+inline int32x4x2_t convolve_3x3_dilation<1>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
+                                            const size_t dilation_x, int input_offset)
+{
+    const int32x4x2_t out =
+    {
+        {
+            single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+            single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
+        }
+    };
+    return out;
+}
+
+template <>
+inline int32x4x2_t convolve_3x3_dilation<2>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low,
+                                            const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
+                                            const size_t dilation_x, int input_offset)
+{
+    int32x4x2_t out = convolve_3x3_dilation<1>(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset);
+
+    out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[0], 2), out.val[0], 1);
+    out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[1], 0), out.val[0], 2);
+    out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[1], 2), out.val[0], 3);
+    return out;
+}
+
+template <>
+inline int32x4x2_t convolve_3x3_dilation<3>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low,
+                                            const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
+                                            const size_t dilation_x, int input_offset)
+{
+    int32x4x2_t out = convolve_3x3_dilation<1>(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset);
+    out.val[0]      = vsetq_lane_s32(vgetq_lane_s32(out.val[0], 3), out.val[0], 1);
+    return out;
+}
+
 /** Perform a convolve3x3 on uint8_t
  *
  * @param[in] in_top       Pointer to the first row of the input.
@@ -390,6 +646,124 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset =
     return r;
 }
 
+/** Perform a 3x3 convolution for 8 consecutive elements on float16 when dilation.x() or dilation.y() is not 1.
+ *
+ * @param[in] in_top       Pointer to the first row of the input.
+ * @param[in] in_mid       Pointer to the second row of the input.
+ * @param[in] in_low       Pointer to the third row of the input.
+ * @param[in] m0           First row of the filter.
+ * @param[in] m1           Second row of the filter.
+ * @param[in] m2           Third row of the filter.
+ * @param[in] dilation_x   Dilation, in elements across x.
+ * @param[in] input_offset (Optional)Input quantization offset.
+ *
+ */
+inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
+                                                const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+                                                const size_t dilation_x, int input_offset = 0)
+{
+    ARM_COMPUTE_UNUSED(input_offset);
+    const float16x8x3_t vtop =
+    {
+        {
+            vld1q_f16(in_top),
+            vld1q_f16(in_top + dilation_x),
+            vld1q_f16(in_top + 2 * dilation_x)
+        }
+    };
+    const float16x8x3_t vmid =
+    {
+        {
+            vld1q_f16(in_mid),
+            vld1q_f16(in_mid + dilation_x),
+            vld1q_f16(in_mid + 2 * dilation_x)
+        }
+    };
+    const float16x8x3_t vlow =
+    {
+        {
+            vld1q_f16(in_low),
+            vld1q_f16(in_low + dilation_x),
+            vld1q_f16(in_low + 2 * dilation_x)
+        }
+    };
+    float16x8_t out = vmulq_f16(vtop.val[0], m0.val[0]);
+    out             = vaddq_f16(out, vmulq_f16(vtop.val[1], m0.val[1]));
+    out             = vaddq_f16(out, vmulq_f16(vtop.val[2], m0.val[2]));
+
+    out = vaddq_f16(out, vmulq_f16(vmid.val[0], m1.val[0]));
+    out = vaddq_f16(out, vmulq_f16(vmid.val[1], m1.val[1]));
+    out = vaddq_f16(out, vmulq_f16(vmid.val[2], m1.val[2]));
+
+    out = vaddq_f16(out, vmulq_f16(vlow.val[0], m2.val[0]));
+    out = vaddq_f16(out, vmulq_f16(vlow.val[1], m2.val[1]));
+    out = vaddq_f16(out, vmulq_f16(vlow.val[2], m2.val[2]));
+
+    return out;
+}
+
+/** Perform a 3x3 convolution for 16 consecutive elements on float16 when dilation.x() or dilation.y() is not 1.
+ *
+ * @param[in] in_top       Pointer to the first row of the input.
+ * @param[in] in_mid       Pointer to the second row of the input.
+ * @param[in] in_low       Pointer to the third row of the input.
+ * @param[in] m0           First row of the filter.
+ * @param[in] m1           Second row of the filter.
+ * @param[in] m2           Third row of the filter.
+ * @param[in] dilation_x   Dilation, in elements across x.
+ * @param[in] input_offset (Optional)Input quantization offset.
+ *
+ */
+template <unsigned int stridex>
+float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
+                                    const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+                                    const size_t dilation_x, int input_offset = 0);
+
+template <>
+inline float16x8x2_t convolve_3x3_dilation<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
+                                              const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+                                              const size_t dilation_x, int input_offset)
+{
+    const float16x8x2_t out =
+    {
+        {
+            single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+            single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)
+        }
+    };
+    return out;
+}
+
+template <>
+inline float16x8x2_t convolve_3x3_dilation<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
+                                              const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+                                              const size_t dilation_x, int input_offset)
+{
+    ARM_COMPUTE_UNUSED(input_offset);
+    float16x8x2_t out = convolve_3x3_dilation<1>(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset);
+    out.val[0]        = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
+    out.val[0]        = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2);
+    out.val[0]        = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 3);
+    out.val[0]        = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 4);
+    out.val[0]        = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 5);
+    out.val[0]        = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6);
+    out.val[0]        = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7);
+    return out;
+}
+
+template <>
+inline float16x8x2_t convolve_3x3_dilation<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
+                                              const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+                                              const size_t dilation_x, int input_offset)
+{
+    ARM_COMPUTE_UNUSED(input_offset);
+    float16x8x2_t out = convolve_3x3_dilation<1>(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset);
+    out.val[0]        = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
+    out.val[0]        = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
+    out.val[0]        = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 1), out.val[0], 3);
+    return out;
+}
+
 /** Perform a convolve3x3 on float16.
  *
  * @param[in] in_top Pointer to the first row of the input.
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index c60233664d..396e2368c3 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -73,7 +73,7 @@ public:
      * @param[in]      conv_info        Padding and stride information to use for the convolution.
      * @param[in]      depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
      * @param[in]      act_info         (Optional) Activation layer information in case of a fused activation.
-     * @param[in]      dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). Currently supports (1,1) only.
+     * @param[in]      dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      */
     void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                    unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
@@ -88,7 +88,7 @@ public:
      * @param[in] conv_info        Padding and stride information to use for the convolution.
      * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
      * @param[in] act_info         (Optional) Activation layer information in case of a fused activation.
-     * @param[in] dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). Currently supports (1,1) only.
+     * @param[in] dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      *
      * @return a status
      */
@@ -110,9 +110,11 @@ private:
      * @param[in]      conv_info        Padding and stride information to use for the convolution.
      * @param[in]      depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
      * @param[in]      act_info         Activation layer information in case of a fused activation.
+     * @param[in]      dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     *
      */
     void configure_generic(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
-                           unsigned int depth_multiplier, const ActivationLayerInfo &act_info);
+                           unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation = Size2D(1U, 1U));
     /** Configure the kernels/functions for the optimized pipeline.
      *
      * @param[in]  input            Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
@@ -186,7 +188,7 @@ public:
      * @param[in]      conv_info        Padding and stride information to use for the convolution.
      * @param[in]      depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
      * @param[in]      act_info         (Optional) Activation layer information in case of a fused activation.
-     * @param[in]      dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). Currently supports (1,1) only.
+     * @param[in]      dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      */
     void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                    unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
@@ -201,7 +203,7 @@ public:
      * @param[in] conv_info        Padding and stride information to use for the convolution.
      * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
      * @param[in] act_info         (Optional) Activation layer information in case of a fused activation.
-     * @param[in] dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). Currently supports (1,1) only.
+     * @param[in] dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      *
      * @return a status
      */
diff --git a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
index df8f29d2c7..7d2cff7315 100644
--- a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
+++ b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
@@ -92,10 +92,11 @@ public:
      * @param[in] weights          Weights tensor info.
      * @param[in] conv_info        Convolution layer metadata.
      * @param[in] depth_multiplier (Optional) Depth multiplier to be used.
+     * @param[in] dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      *
      * @return True if the assembly kernel could be used else false. Note that transformations of input/output could be needed.
      */
-    static bool is_optimized_supported(const ITensorInfo *input, const ITensorInfo *weights, PadStrideInfo conv_info, unsigned int depth_multiplier = 1);
+    static bool is_optimized_supported(const ITensorInfo *input, const ITensorInfo *weights, PadStrideInfo conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1, 1));
 
     // Inherited methods overridden:
     void run() override;
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index ec672e0fc4..fdafc2da90 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -49,7 +49,7 @@ class convolver_3x3
 {
 public:
     static void convolve(const Window &window, unsigned int num_elems_written_per_iteration,
-                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
     {
         const int input_offset   = -input->info()->quantization_info().offset;
         const int weights_offset = -weights->info()->quantization_info().offset;
@@ -57,6 +57,7 @@ public:
         const int          input_stride_x  = input->info()->strides_in_bytes().x();
         const int          input_stride_y  = input->info()->strides_in_bytes().y();
         const int          input_stride_z  = input->info()->strides_in_bytes().z();
+        const int          input_stride_w  = input->info()->strides_in_bytes()[3];
         const int          output_stride_y = output->info()->strides_in_bytes().y();
         const int          kernel_stride_y = weights->info()->strides_in_bytes().y();
         const int          kernel_stride_z = weights->info()->strides_in_bytes().z();
@@ -74,9 +75,10 @@ public:
 
         // setup input window for the iterator
         Window window_in = window;
-        // we just want execute_window_loop to iterate over the dimensions > 2, so we set the first 2 dimensions to 0
+        // Iteration of input is taken care of in execute_window_loop
         window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
         window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+        window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
 
         Window window_k = calculate_max_window(*weights->info(), Steps(1u));
 
@@ -91,7 +93,7 @@ public:
             int ih = 0;
             int oh = 0;
 
-            const uint8_t *input_ptr        = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y - (id.z() - id.z() / depth_multiplier) * input_stride_z;
+            const uint8_t *input_ptr        = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y + (id.z() / depth_multiplier) * input_stride_z + input_stride_w * id[3];
             const uint8_t *ptr_weights_base = weights_ptr + id.z() * kernel_stride_z;
 
             const auto ptr_weights_r0 = reinterpret_cast<const T1 *>(ptr_weights_base);
@@ -104,45 +106,54 @@ public:
             for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
             {
                 auto in_top = reinterpret_cast<const T1 *>(input_ptr + (ih + 0) * input_stride_y);
-                auto in_mid = reinterpret_cast<const T1 *>(input_ptr + (ih + 1) * input_stride_y);
-                auto in_low = reinterpret_cast<const T1 *>(input_ptr + (ih + 2) * input_stride_y);
-                auto p_out  = reinterpret_cast<T2 *>(out.ptr() + oh * output_stride_y);
+                auto in_mid = reinterpret_cast<const T1 *>(input_ptr + (ih + dilation.y()) * input_stride_y);
+                auto in_low = reinterpret_cast<const T1 *>(input_ptr + (ih + 2 * dilation.y()) * input_stride_y); //uint8
+                auto p_out  = reinterpret_cast<T2 *>(out.ptr() + oh * output_stride_y);                           //int32
 
                 for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
                     in_top += delta_input, in_mid += delta_input, in_low += delta_input,
                     p_out += num_elems_written_per_iteration)
                 {
-                    auto vres = detail::convolve_3x3<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, input_offset);
-                    detail::store_results<stridex>(p_out, vres);
+                    if(dilation == Size2D(1U, 1U))
+                    {
+                        auto vres = detail::convolve_3x3<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, input_offset);
+                        detail::store_results<stridex>(p_out, vres);
+                    }
+                    else
+                    {
+                        auto vres = detail::convolve_3x3_dilation<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, dilation.x(), input_offset);
+                        detail::store_results<stridex>(p_out, vres);
+                    }
                 }
             }
         },
-        in, out);
+        out);
     }
 };
 
 template <typename T1, typename T2>
 inline void convolve_3x3(const Window &window, unsigned int num_elems_written_per_iteration,
-                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+                         const ITensor *input, const ITensor *weights, ITensor *output,
+                         const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
 {
     const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
     switch(conv_stride_x)
     {
         case 1:
-            convolver_3x3<T1, T2, 1>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier);
+            convolver_3x3<T1, T2, 1>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier, dilation);
             break;
         case 2:
-            convolver_3x3<T1, T2, 2>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier);
+            convolver_3x3<T1, T2, 2>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier, dilation);
             break;
         case 3:
-            convolver_3x3<T1, T2, 3>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier);
+            convolver_3x3<T1, T2, 3>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier, dilation);
             break;
         default:
             ARM_COMPUTE_ERROR("Not implemented");
     }
 }
 
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -157,7 +168,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
 
     if(output->total_size() != 0)
     {
-        const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
 
         if(is_data_type_quantized_asymmetric(input->data_type()))
@@ -173,13 +184,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+                                                        const Size2D &dilation)
 {
     Window win;
     bool   window_changed = false;
 
     // Get convolved dimensions
-    const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+    const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
     const DataType    output_dt    = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
 
     // Output auto inizialitation if not yet initialized
@@ -197,15 +209,16 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     switch(input->data_type())
     {
         case DataType::QASYMM8:
-            num_elems_read_per_iteration = 16;
+            num_elems_read_per_iteration = 16 + 15 * (dilation.x() - 1);
             break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            num_elems_read_per_iteration = 24;
+            num_elems_written_per_iteration = 32 >> conv_stride_x;
+            num_elems_read_per_iteration    = 24 + 23 * (dilation.x() - 1);
             break;
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F32:
-            num_elems_read_per_iteration = 12;
+            num_elems_read_per_iteration = 12 + 11 * (dilation.x() - 1);
             break;
         default:
             ARM_COMPUTE_ERROR("Data type not supported.");
@@ -214,7 +227,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     // Configure kernel window
     win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
 
-    AccessWindowRectangle  input_access(input, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration, 3, conv_stride_x, conv_stride_y);
+    AccessWindowRectangle  input_access(input, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration, 3 + 2 * (dilation.y() - 1), conv_stride_x, conv_stride_y);
     AccessWindowStatic     weights_access(weights, 0, 0, 3, 3);
     AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
 
@@ -227,7 +240,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 } // namespace
 
 NEDepthwiseConvolutionLayer3x3Kernel::NEDepthwiseConvolutionLayer3x3Kernel()
-    : _border_size(0), _input(), _output(), _weights(), _conv_info(), _num_elems_written_per_iteration(0), _depth_multiplier(1)
+    : _border_size(0), _input(), _output(), _weights(), _conv_info(), _num_elems_written_per_iteration(0), _depth_multiplier(1), _dilation()
 {
 }
 
@@ -236,28 +249,41 @@ BorderSize NEDepthwiseConvolutionLayer3x3Kernel::border_size() const
     return _border_size;
 }
 
-void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+                                                     const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), output->info(), conv_info, depth_multiplier));
-
-    _input                           = input;
-    _output                          = output;
-    _weights                         = weights;
-    _conv_info                       = conv_info;
-    _depth_multiplier                = depth_multiplier;
-    _num_elems_written_per_iteration = 16 >> _conv_info.stride().first;
-    _border_size                     = BorderSize(_conv_info.pad_top(), _conv_info.pad_right(), _conv_info.pad_bottom(), _conv_info.pad_left());
-
-    auto win_config = validate_and_configure_window(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, dilation));
+
+    _input            = input;
+    _output           = output;
+    _weights          = weights;
+    _conv_info        = conv_info;
+    _depth_multiplier = depth_multiplier;
+    switch(input->info()->data_type())
+    {
+        case DataType::QASYMM8:
+        case DataType::F32:
+            _num_elems_written_per_iteration = 16 >> _conv_info.stride().first;
+            break;
+        case DataType::F16:
+            _num_elems_written_per_iteration = 32 >> _conv_info.stride().first;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data type not supported.");
+    }
+    _border_size    = BorderSize(_conv_info.pad_top(), _conv_info.pad_right(), _conv_info.pad_bottom(), _conv_info.pad_left());
+    _dilation       = dilation;
+    auto win_config = validate_and_configure_window(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, dilation);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
 }
 
-Status NEDepthwiseConvolutionLayer3x3Kernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+Status NEDepthwiseConvolutionLayer3x3Kernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+                                                      const Size2D &dilation)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info, depth_multiplier));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, depth_multiplier).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info, depth_multiplier, dilation));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, depth_multiplier, dilation).first);
     return Status{};
 }
 
@@ -272,14 +298,14 @@ void NEDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, const Threa
     {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            convolve_3x3<float16_t, float16_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
+            convolve_3x3<float16_t, float16_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation);
             break;
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F32:
-            convolve_3x3<float, float>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
+            convolve_3x3<float, float>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation);
             break;
         case DataType::QASYMM8:
-            convolve_3x3<uint8_t, int32_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
+            convolve_3x3<uint8_t, int32_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation);
             break;
         default:
             ARM_COMPUTE_ERROR("Not implemented");
diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
index 62373e348b..88f8b31a35 100644
--- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
@@ -38,7 +38,8 @@ using namespace arm_compute;
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier, const Size2D &dilation)
 {
     ARM_COMPUTE_UNUSED(conv_info);
     //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
@@ -48,6 +49,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(2) * depth_multiplier) != output->dimension(2));
     ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || dilation.y() < 1);
 
     return Status{};
 }
@@ -84,7 +86,7 @@ void NEDepthwiseIm2ColKernel::run_generic(const Window &window)
     Iterator out(_output, window_out);
 
     const int full_length   = input_w + pad_left + pad_right;
-    const int max_initial_x = stride_x * (((full_length - _kernel_dims.width) / stride_x) + 1);
+    const int max_initial_x = stride_x * (((full_length - (_kernel_dims.width + (_kernel_dims.width - 1) * (_dilation.x() - 1))) / stride_x) + 1);
 
     // Define pad value
     auto zero = static_cast<T>(0);
@@ -103,12 +105,12 @@ void NEDepthwiseIm2ColKernel::run_generic(const Window &window)
         // Get pointers
         const uint8_t *const input_ptr  = in.ptr() + id.z() / _depth_multiplier * input_stride_z;
         auto                 output_ptr = reinterpret_cast<T *>(out.ptr());
-        const int            height     = src_y + _kernel_dims.height;
-        const int            width      = src_x + _kernel_dims.width;
+        const int            height     = src_y + (_kernel_dims.height + (_kernel_dims.height - 1) * (_dilation.y() - 1));
+        const int            width      = src_x + (_kernel_dims.width + (_kernel_dims.width - 1) * (_dilation.x() - 1));
 
-        for(int y = src_y; y < height; ++y)
+        for(int y = src_y; y < height; y += _dilation.y())
         {
-            for(int x = src_x; x < width; ++x, ++output_ptr)
+            for(int x = src_x; x < width; x += _dilation.x(), ++output_ptr)
             {
                 if(x < 0 || x >= input_w || y < 0 || y >= input_h)
                 {
@@ -130,15 +132,16 @@ void NEDepthwiseIm2ColKernel::run_generic(const Window &window)
 }
 
 NEDepthwiseIm2ColKernel::NEDepthwiseIm2ColKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _kernel_dims(), _conv_info(), _has_bias(), _depth_multiplier(1)
+    : _func(nullptr), _input(nullptr), _output(nullptr), _kernel_dims(), _conv_info(), _has_bias(), _depth_multiplier(1), _dilation()
 {
 }
 
-void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier,
+                                        const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, depth_multiplier));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, depth_multiplier, dilation));
 
     _input            = input;
     _output           = output;
@@ -146,6 +149,7 @@ void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, c
     _conv_info        = conv_info;
     _has_bias         = has_bias;
     _depth_multiplier = depth_multiplier;
+    _dilation         = dilation;
 
     // Configure kernel window
     Window win = calculate_max_window(*input->info(), Steps());
@@ -172,10 +176,11 @@ void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, c
     INEKernel::configure(win);
 }
 
-Status NEDepthwiseIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+Status NEDepthwiseIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier,
+                                         const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, depth_multiplier));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, depth_multiplier, dilation));
     return Status{};
 }
 
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 23c3f81edc..8211104bda 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -49,6 +49,14 @@ void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    // idx_w and idx_h only used for validation
+    const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_UNUSED(idx_w);
+    ARM_COMPUTE_UNUSED(idx_h);
+
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
 
     const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
 
@@ -239,6 +247,9 @@ void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *w
     const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
 
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
     const bool can_run_optimised_3x3_kernel = (weights->info()->dimension(idx_w) == 3) && (weights->info()->dimension(idx_h) == 3);
 
     if(bool(can_run_optimised_3x3_kernel))
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 4c602b3640..5133756993 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -51,7 +51,8 @@ void NEDepthwiseConvolutionLayer3x3::configure_generic(ITensor
                                                        ITensor                   *output,
                                                        const PadStrideInfo       &conv_info,
                                                        unsigned int               depth_multiplier,
-                                                       const ActivationLayerInfo &act_info)
+                                                       const ActivationLayerInfo &act_info,
+                                                       const Size2D              &dilation)
 {
     ARM_COMPUTE_UNUSED(act_info);
 
@@ -87,8 +88,8 @@ void NEDepthwiseConvolutionLayer3x3::configure_generic(ITensor
         _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
         _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
 
-        // Configure optimized depthwise
-        _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier);
+        // Configure depthwise
+        _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation);
 
         // Configure border handler
         _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
@@ -99,7 +100,7 @@ void NEDepthwiseConvolutionLayer3x3::configure_generic(ITensor
     else
     {
         // Configure depthwise convolution kernel
-        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier);
+        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation);
 
         // Configure border handler
         _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
@@ -185,18 +186,25 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor       *input,
                                                const ActivationLayerInfo &act_info,
                                                const Size2D              &dilation)
 {
-    ARM_COMPUTE_ERROR_ON(dilation.x() != 1 || dilation.y() != 1);
-    ARM_COMPUTE_UNUSED(dilation);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
+    // idx_w and idx_h only used for validation
+    const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_UNUSED(idx_w);
+    ARM_COMPUTE_UNUSED(idx_h);
+
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
     _original_weights = weights;
     _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
     _has_bias         = biases != nullptr;
     _is_optimized     = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(),
                                                                                        weights->info(),
                                                                                        conv_info,
-                                                                                       depth_multiplier);
+                                                                                       depth_multiplier, dilation);
     _is_nchw                    = input->info()->data_layout() == DataLayout::NCHW;
     _permute                    = _is_optimized == _is_nchw;
     _is_prepared                = false;
@@ -209,7 +217,7 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor       *input,
     }
     else
     {
-        configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+        configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
     }
 
     // Configure activation
@@ -230,7 +238,11 @@ Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo         *input
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
-    ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() != 1 || dilation.y() != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
+    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
 
     if(biases != nullptr)
     {
@@ -239,7 +251,7 @@ Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo         *input
         ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
     }
 
-    if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier))
+    if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation))
     {
         const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
         TensorInfo accumulator  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
@@ -356,12 +368,17 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
 {
     const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
     ARM_COMPUTE_UNUSED(channel_idx);
-    ARM_COMPUTE_ERROR_ON(dilation.x() != 1 || dilation.y() != 1);
-    ARM_COMPUTE_UNUSED(dilation);
-
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_ERROR_ON((input->info()->dimension(channel_idx) * depth_multiplier) != weights->info()->dimension(channel_idx));
+    // idx_w and idx_h only used for validation
+    const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_UNUSED(idx_w);
+    ARM_COMPUTE_UNUSED(idx_h);
+
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
 
     _is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
 
@@ -392,7 +409,7 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
     bool append_bias = (biases != nullptr) && !_is_quantized;
 
     // Calculate output shape
-    TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+    TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
 
     // Output auto inizialitation if not yet initialized
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
@@ -420,7 +437,7 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
     shape_im2col.set(1, conv_size);
     shape_im2col.set(2, weights_z);
     _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
-    _im2col_kernel.configure(input_to_use, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+    _im2col_kernel.configure(input_to_use, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation);
 
     // Weights reshape configuration
     const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -491,11 +508,13 @@ Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
-    ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() != 1 || dilation.y() != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
 
     const unsigned int width_idx  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
 
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) + (weights->dimension(width_idx) - 1) * (dilation.x() - 1) > input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) + (weights->dimension(height_idx) - 1) * (dilation.y() - 1) > input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom());
     // Clone output to use auto init
     auto output_clone = output->clone();
 
@@ -522,7 +541,7 @@ Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
 
     const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
     const bool         append_bias  = (biases != nullptr) && !is_quantized;
-    TensorShape        output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+    TensorShape        output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
     const size_t       weights_w    = weights_to_use->dimension(0);
     const size_t       weights_h    = weights_to_use->dimension(1);
     const size_t       weights_z    = weights_to_use->dimension(2);
@@ -549,7 +568,7 @@ Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
     shape_im2col.set(1, conv_size);
     shape_im2col.set(2, weights_z);
     TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input_to_use, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input_to_use, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation));
 
     // Weights reshape configuration
     const TensorShape shape_weights_reshape(patch_size, weights_z);
diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
index d9b2bff810..049bf66689 100644
--- a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
@@ -254,7 +254,8 @@ Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo
 bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITensorInfo *input,
                                                                     const ITensorInfo *weights,
                                                                     PadStrideInfo      conv_info,
-                                                                    unsigned int       depth_multiplier)
+                                                                    unsigned int       depth_multiplier,
+                                                                    const Size2D      &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
 
@@ -290,8 +291,9 @@ bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITenso
     bool          is_same_padding   = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
     bool          is_valid_padding  = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
     bool          supported_padding = is_same_padding || is_valid_padding;
+    bool          is_dilation_1     = dilation.x() == 1 && dilation.y() == 1;
 
-    return is_data_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1);
+    return is_data_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_1;
 }
 
 void NEDepthwiseConvolutionAssemblyDispatch::run()
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
index 6c0f590241..b61393f9ea 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
@@ -29,6 +29,7 @@
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/DepthwiseConvolutionLayerDataset.h"
+#include "tests/datasets/DilatedDepthwiseConvolutionLayerDataset.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
@@ -60,7 +61,7 @@ TEST_SUITE(DepthwiseConvLayer)
 
 // *INDENT-OFF*
 // clang-format off
-DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
+DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Mismatching data type input/weights
                                                        TensorInfo(TensorShape(32U, 18U, 3U), 1, DataType::F32),     // Mismatching input feature maps
                                                        TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Unsupported weights dimensions
@@ -69,6 +70,8 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                        TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Invalid biases size
                                                        TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Invalid biases dimensions
                                                        TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Invalid output size
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // patch size bigger than input width
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // dilation < 1
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                                      }),
                framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F16),
@@ -80,6 +83,8 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                          TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32),
                                                        })),
                framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
@@ -90,6 +95,8 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                         TensorInfo(TensorShape(2U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                       })),
                framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
@@ -100,6 +107,8 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                         TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
                                                       })),
                framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
                                                       PadStrideInfo(1, 1, 0, 0),
@@ -110,6 +119,8 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                       PadStrideInfo(1, 1, 0, 0),
                                                       PadStrideInfo(1, 1, 0, 0),
                                                       PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
                                                      })),
                framework::dataset::make("DepthMultiplier", { 1,
                                                              1,
@@ -120,21 +131,38 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                              1,
                                                              1,
                                                              1,
+                                                             1,
+                                                             1,
                                                             })),
-               framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, true })),
-               input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier, expected)
+               framework::dataset::make("Dilation", { Size2D(1U, 1U),
+                                                      Size2D(1U, 1U),
+                                                      Size2D(1U, 1U),
+                                                      Size2D(1U, 1U),
+                                                      Size2D(1U, 1U),
+                                                      Size2D(1U, 1U),
+                                                      Size2D(1U, 1U),
+                                                      Size2D(1U, 1U),
+                                                      Size2D(25U, 1U),
+                                                      Size2D(0U, 1U),
+                                                      Size2D(1U, 1U),
+                                                            })),
+               framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false, false, true })),
+               input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier,dilation, expected)
 {
-    bool is_valid = bool(NEDepthwiseConvolutionLayer3x3::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, depth_multiplier));
+    bool is_valid = bool(NEDepthwiseConvolutionLayer3x3::validate(&input_info.clone()->set_is_resizable(false),
+     &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, depth_multiplier, ActivationLayerInfo(), dilation));
     ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
 }
 
-DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
+DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
                 framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Mismatching data type input/weights
                                                         TensorInfo(TensorShape(27U, 13U, 3U), 1, DataType::F32),    // Mismatching input feature maps
                                                         TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Mismatching depth multiplier
                                                         TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Invalid biases size
                                                         TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Invalid biases dimensions
                                                         TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Invalid output size
+                                                        TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32),    // patch size bigger than input width
+                                                        TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32),    // dilation < 1
                                                         TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(32U, 13U, 8U), 1, DataType::QASYMM8),
                                                       }),
@@ -145,6 +173,8 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                           TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
                                                           TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
                                                           TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
+                                                          TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
+                                                          TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
                                                           TensorInfo(TensorShape(3U, 3U, 24U), 1, DataType::QASYMM8),
                                                         })),
                 framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
@@ -154,6 +184,8 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                          TensorInfo(TensorShape(2U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(16U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(16U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(16U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(24U), 1, DataType::S32),
                                                        })),
                 framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
@@ -163,6 +195,8 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                          TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(32U, 11U, 24U), 1, DataType::QASYMM8),
                                                        })),
                 framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
@@ -172,6 +206,8 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                        PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 1, 0),
                                                       })),
                 framework::dataset::make("DepthMultiplier", { 1,
@@ -181,12 +217,25 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                               1,
                                                               1,
                                                               2,
+                                                              2,
+                                                              2,
                                                               3,
                                                              })),
-                framework::dataset::make("Expected", { false, false, false, false, false, false, true, true })),
-                input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier, expected)
+                framework::dataset::make("Dilation", { Size2D(1U, 1U),
+                                                       Size2D(1U, 1U),
+                                                       Size2D(1U, 1U),
+                                                       Size2D(1U, 1U),
+                                                       Size2D(1U, 1U),
+                                                       Size2D(1U, 1U),
+                                                       Size2D(25U, 1U),
+                                                       Size2D(0U, 1U),
+                                                       Size2D(1U, 1U),
+                                                       Size2D(1U, 1U),
+                                                             })),
+                framework::dataset::make("Expected", { false, false, false, false, false, false,false, false, true, true })),
+                input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier,dilation, expected)
 {
-    bool is_valid = bool(NEDepthwiseConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, depth_multiplier));
+    bool is_valid = bool(NEDepthwiseConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, depth_multiplier, ActivationLayerInfo(), dilation));
     ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
@@ -213,6 +262,25 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, fram
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
+
+TEST_SUITE(Dilation)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+                                                                                                                 depth_multipliers),
+                                                                                                                 framework::dataset::make("DataType",
+                                                                                                                         DataType::F32)),
+                                                                                                                 framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+                                                                                                                     depth_multipliers),
+                                                                                                                     framework::dataset::make("DataType",
+                                                                                                                             DataType::F32)),
+                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // Generic
 
 TEST_SUITE(W3x3)
@@ -234,6 +302,26 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture3x3<float>, f
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
+TEST_SUITE(Dilation)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+                                                                                                                    depth_multipliers),
+                                                                                                                    framework::dataset::make("DataType",
+                                                                                                                            DataType::F32)),
+                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+                                                                                                                        depth_multipliers),
+                                                                                                                        framework::dataset::make("DataType",
+                                                                                                                                DataType::F32)),
+                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE_END() // Dilation
+
 FIXTURE_DATA_TEST_CASE(RunOptimizedSmall, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::PRECOMMIT,
                        combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
                                                framework::dataset::make("DepthMultiplier", 1)),
@@ -276,6 +364,26 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, frame
 {
     validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
 }
+
+TEST_SUITE(Dilation)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+                                                                                                                        depth_multipliers),
+                                                                                                                        framework::dataset::make("DataType",
+                                                                                                                                DataType::F16)),
+                                                                                                                framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+                                                                                                                    depth_multipliers),
+                                                                                                                    framework::dataset::make("DataType",
+                                                                                                                            DataType::F16)),
+                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+TEST_SUITE_END() // Dilation
+
 TEST_SUITE_END() // Generic
 TEST_SUITE(W3x3)
 template <typename T>
@@ -296,6 +404,28 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture3x3<half>, fr
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
+
+TEST_SUITE(Dilation)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+                                                                                                                   depth_multipliers),
+                                                                                                                   framework::dataset::make("DataType",
+                                                                                                                           DataType::F16)),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+                                                                                                                       depth_multipliers),
+                                                                                                                       framework::dataset::make("DataType",
+                                                                                                                               DataType::F16)),
+                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+TEST_SUITE_END() // Dilation
+
 FIXTURE_DATA_TEST_CASE(RunOptimizedSmall, NEDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::PRECOMMIT,
                        combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
                                                framework::dataset::make("DepthMultiplier", 1)),
@@ -337,6 +467,27 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uin
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
+
+TEST_SUITE(Dilation)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+                                                       depth_multipliers),
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+                                                       depth_multipliers),
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() //Dilation
 TEST_SUITE_END() // Generic
 TEST_SUITE(W3x3)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::PRECOMMIT,
@@ -376,6 +527,27 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture3x3<
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
+
+TEST_SUITE(Dilation)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers),
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+                                                       depth_multipliers),
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // W3x3
 TEST_SUITE_END() // QASYMM8
 TEST_SUITE_END() // Quantized
-- 
cgit v1.2.1