From f01201abec0a102f6e7a517971f83fef1eaffd50 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Wed, 6 Nov 2019 14:57:49 +0000 Subject: COMPMID-2305: NEDepthwiseConvolution 3x3: support for QUANT8_PER_CHANNEL_SYMM Change-Id: I9a917cff6a089ce6ae16fb4e6066a4194e2e9487 Signed-off-by: Giuseppe Rossini Reviewed-on: https://review.mlplatform.org/c/2241 Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins Reviewed-by: Pablo Marquez --- .../NEON/kernels/convolution/common/qsymm8.hpp | 76 ++++ .../convolution/depthwise/depthwise_quantized.hpp | 156 +++++++ .../NEON/kernels/convolution/common/qsymm8.cpp | 185 +++++++++ .../convolution/depthwise/depthwise_qs8_qs8.cpp | 31 ++ .../kernels/convolution/depthwise/impl_base.hpp | 3 +- .../kernels/convolution/depthwise/impl_qa8_qa8.hpp | 48 --- .../depthwise/impl_qa8_qs8_per_channel.hpp | 457 +++++++++++++++++++++ .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 9 +- .../NEDepthwiseConvolutionAssemblyDispatch.cpp | 95 ++++- .../validation/NEON/DepthwiseConvolutionLayer.cpp | 27 ++ 10 files changed, 1030 insertions(+), 57 deletions(-) create mode 100644 arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp create mode 100644 src/core/NEON/kernels/convolution/common/qsymm8.cpp create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp create mode 100644 src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp diff --git a/arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp b/arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp new file mode 100644 index 0000000000..41bfbe4d8a --- /dev/null +++ b/arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#pragma once
+#include <cstdint>
+#include <vector>
+#include "qasymm8.hpp"
+
+
+namespace qsymm8 {
+
+struct QSymm8Params {
+  int8_t quantize(float value) const;
+  float dequantize(int8_t value) const;
+
+  float scale;
+};
+
+struct QSymm8RescaleParams {
+  static QSymm8RescaleParams
+  make_rescale_params(const QSymm8Params &weight_quant,
+                      const QSymm8Params &input_quant,
+                      const QSymm8Params &output_quant);
+
+  QSymm8RescaleParams(int32_t shift, int32_t multiplier, float rescale);
+
+  const int32_t shift, multiplier;
+  const float rescale;
+};
+
+struct QSymm8PerChannelParams {
+  int8_t quantize(float value, float scale) const;
+  float dequantize(int8_t value, float scale) const;
+
+  std::vector<float> scales;
+};
+
+struct QSymm8PerChannelRescaleParams {
+  static QSymm8PerChannelRescaleParams
+  make_rescale_params(const QSymm8PerChannelParams &weight_quant,
+                      const QSymm8PerChannelParams &input_quant,
+                      const QSymm8PerChannelParams &output_quant);
+
+  static QSymm8PerChannelRescaleParams
+  make_rescale_params(const QSymm8PerChannelParams &weight_quant,
+                      const qasymm8::QAsymm8Params &input_quant,
+                      const qasymm8::QAsymm8Params &output_quant);
+
+  QSymm8PerChannelRescaleParams(std::vector<int32_t>& shift, std::vector<int32_t>& multiplier, std::vector<float>& rescale);
+
+  std::vector<int32_t> shifts, multipliers;
+  std::vector<float> rescales;
+};
+
+} // namespace qsymm8
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
index f8db4db6cc..ef3adc4c0c 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
@@ -25,6 +25,68 @@
 #pragma once
 #include "depthwise.hpp"
 #include "qasymm8.hpp"
+#include "qsymm8.hpp"
+#pragma once
+
+using namespace neon_convolution_kernels;
+using namespace qasymm8;
+
+template <typename T, typename U>
+inline T saturating_doubling_high_mul(const T&, const U&);
+
+template <>
+inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32x4_t& b)
+{
+  return vqrdmulhq_s32(a, b);
+}
+
+template <>
+inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
+{
+  return vqrdmulhq_n_s32(a, b);
+}
+
+template <>
+inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
+{
+  return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
+}
+
+template <typename T, typename U>
+inline T rounding_divide_by_exp2(const T& x, const U exponent);
+
+template <>
+inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int32x4_t shift)
+{
+  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
+  const int32x4_t fixed = vqaddq_s32(x, fixup);
+  return vrshlq_s32(fixed, shift);
+}
+
+template <>
+inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
+{
+  const int32x4_t shift = vdupq_n_s32(-exponent);
+  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
+  const int32x4_t fixed = vqaddq_s32(x, fixup);
+  return vrshlq_s32(fixed, shift);
+}
+
+template <>
+inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
+{
+  const int32x2_t shift = vdup_n_s32(-exponent);
+  const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
+  const int32x2_t fixed = vqadd_s32(x, fixup);
+  return vrshl_s32(fixed, shift);
+}
+
+template <>
+inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
+{
+  const int32x2_t xs = vdup_n_s32(x);
+  return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
+}
 
 namespace depthwise {
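Note: the helpers added above implement the gemmlowp-style fixed-point requantization used by these quantized depthwise kernels: an int32 accumulator is scaled by a quantized multiplier via a saturating doubling high multiply, then divided by a power of two with rounding. The standalone sketch below shows the scalar arithmetic the NEON intrinsics map to; the reference functions and example constants are illustrative only and are not part of this patch.

    // Scalar reference for the requantization performed by
    // saturating_doubling_high_mul() + rounding_divide_by_exp2() above.
    // Illustrative only; values are made up.
    #include <cstdint>
    #include <iostream>

    // (a * b * 2) >> 31, rounded and saturated: the scalar counterpart of vqrdmulh.
    static int32_t sat_doubling_high_mul(int32_t a, int32_t b)
    {
        if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX; // only overflow case
        const int64_t prod  = static_cast<int64_t>(a) * static_cast<int64_t>(b);
        const int32_t nudge = prod >= 0 ? (1 << 30) : (1 - (1 << 30));
        return static_cast<int32_t>((prod + nudge) >> 31);
    }

    // Divide by 2^exponent with round-to-nearest (gemmlowp's scalar reference rounding).
    static int32_t rounding_rshift(int32_t x, int exponent)
    {
        const int32_t mask      = (int32_t(1) << exponent) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        return (x >> exponent) + (remainder > threshold ? 1 : 0);
    }

    int main()
    {
        // Example values only: a raw int32 accumulator and a (multiplier, shift)
        // pair of the kind produced by make_rescale_params().
        const int32_t acc = 23500, multiplier = 1518500250, shift = 10;
        const int32_t requantized = rounding_rshift(sat_doubling_high_mul(acc, multiplier), shift);
        std::cout << requantized << '\n'; // scaled-down accumulator, before offset and clamping
    }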
@@ -145,4 +207,98 @@ class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase< const qasymm8::QAsymm8RescaleParams rescale_parameters; }; +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +class QSymm8HybridPerChannelDepthwiseConvolution : public DepthwiseConvolutionBase< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + uint8_t, int32_t, uint8_t, + QSymm8HybridPerChannelDepthwiseConvolution +> +{ + using Base = DepthwiseConvolutionBase< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + uint8_t, int32_t, uint8_t, + QSymm8HybridPerChannelDepthwiseConvolution + >; + friend Base; + using InputType = typename Base::InputType; + using OutputType = typename Base::OutputType; + + public: + QSymm8HybridPerChannelDepthwiseConvolution( + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + nck::ActivationFunction activation, + const qsymm8::QSymm8PerChannelParams& weight_quantisation, + const qasymm8::QAsymm8Params& input_quantisation, + const qasymm8::QAsymm8Params& output_quantisation, + unsigned int padding_top, + unsigned int padding_left, + unsigned int padding_bottom, + unsigned int padding_right + ); + + QSymm8HybridPerChannelDepthwiseConvolution( + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + nck::ActivationFunction activation, + const qsymm8::QSymm8PerChannelParams& weight_quantisation, + const qasymm8::QAsymm8Params& input_quantisation, + const qasymm8::QAsymm8Params& output_quantisation, + const qsymm8::QSymm8PerChannelRescaleParams& rescale_parameters, + unsigned int padding_top, + unsigned int padding_left, + unsigned int padding_bottom, + unsigned int padding_right + ); + + size_t get_packed_params_size(void) const override + { + return this->n_channels() * (sizeof(int8_t)*KernelRows*KernelCols + 3*sizeof(int32_t)); + + } + + protected: + uint8_t _input_padding_value(void) const; + + void _pack_params( + void *buffer, + const void *weights, + unsigned int weight_row_stride, + unsigned int weight_col_stride, + const void *biases=nullptr + ) const; + + template + void execute_tile( + int n_channels, + const void* packed_params, + const uint8_t* inptr, + unsigned int in_row_stride, + unsigned int in_col_stride, + uint8_t* outptr, + unsigned int out_row_stride, + unsigned int out_col_stride + ); + + template + void execute_tile( + int n_channels, + const void* packed_params, + const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], + uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols] + ); + + private: + // Quantization parameters + const qsymm8::QSymm8PerChannelParams _weights_quant; + const qasymm8::QAsymm8Params _input_quant, _output_quant; + const qsymm8::QSymm8PerChannelRescaleParams _rescale_parameters; +}; + } // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/common/qsymm8.cpp b/src/core/NEON/kernels/convolution/common/qsymm8.cpp new file mode 100644 index 0000000000..e50263acaa --- /dev/null +++ b/src/core/NEON/kernels/convolution/common/qsymm8.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2019 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+
+#include "qsymm8.hpp"
+
+namespace qsymm8 {
+#if(__ANDROID__ || BARE_METAL)
+template <typename T> T round(T val) { return ::round(val); }
+template <typename T> T exp2(T val) { return ::exp2(val); }
+template <typename T> T log2(T val) { return ::log2(val); }
+#else /* (__ANDROID__ || BARE_METAL) */
+template <typename T> T round(T val) { return std::round(val); }
+template <typename T> T exp2(T val) { return std::exp2(val); }
+template <typename T> T log2(T val) { return std::log2(val); }
+#endif /* (__ANDROID__ || BARE_METAL) */
+
+// Symmetric quantization
+int8_t QSymm8Params::quantize(float value) const
+{
+  const float transformed = value / scale;
+  return static_cast<int8_t>(round(std::max(-128.0f, std::min(127.0f, transformed))));
+}
+
+float QSymm8Params::dequantize(const int8_t value) const
+{
+  return scale * (static_cast<float>(value));
+}
+
+QSymm8RescaleParams QSymm8RescaleParams::make_rescale_params(
+  const QSymm8Params& weight_quant,
+  const QSymm8Params& input_quant,
+  const QSymm8Params& output_quant
+)
+{
+  // Based on the gemmlowp approach: https://github.com/google/gemmlowp/blob/master/doc/quantization_example.cc
+  const float rescale = weight_quant.scale * input_quant.scale / output_quant.scale;
+  const float shiftf = round(log2(0.5f / rescale));
+  const float multf = exp2(31.0f + shiftf)*rescale;
+
+  int64_t shift = static_cast<int64_t>(shiftf);
+  int64_t mult = static_cast<int64_t>(multf);
+
+  if (mult == (1ll << 31))
+  {
+    mult /= 2;
+    shift--;
+  }
+
+  assert(shift >= 0);
+  assert(mult <= std::numeric_limits<int32_t>::max());
+
+  return QSymm8RescaleParams(
+    static_cast<int32_t>(shift),
+    static_cast<int32_t>(mult),
+    rescale
+  );
+}
+
+QSymm8RescaleParams::QSymm8RescaleParams(int32_t shift, int32_t multi, float rescale)
+  : shift(shift), multiplier(multi), rescale(rescale)
+{
+}
+
+// Symmetric per-channel quantization
+int8_t QSymm8PerChannelParams::quantize(float value, float scale) const
+{
+  const float transformed = value / scale;
+  return static_cast<int8_t>(round(std::max(-128.0f, std::min(127.0f, transformed))));
+}
+
+float QSymm8PerChannelParams::dequantize(const int8_t value, float scale) const
+{
+  return scale * (static_cast<float>(value));
+}
+
+QSymm8PerChannelRescaleParams QSymm8PerChannelRescaleParams::make_rescale_params(
+  const QSymm8PerChannelParams& weight_quant,
+  const QSymm8PerChannelParams& input_quant,
+  const QSymm8PerChannelParams& output_quant
+)
+{
+  std::vector<int32_t> shifts;
+  std::vector<int32_t> mults;
+  std::vector<float> rescales;
+
+  for(size_t s = 0; s < input_quant.scales.size(); s++)
+  {
+    // Based on the gemmlowp approach: https://github.com/google/gemmlowp/blob/master/doc/quantization_example.cc
+    const float rescale = weight_quant.scales[s] * input_quant.scales[s] / output_quant.scales[s];
+    const float shiftf = round(log2(0.5f / rescale));
+    const float multf = exp2(31.0f + shiftf)*rescale;
+
+    int64_t shift = static_cast<int64_t>(shiftf);
+    int64_t mult = static_cast<int64_t>(multf);
+
+    if (mult == (1ll << 31))
+    {
+      mult /= 2;
+      shift--;
+    }
+
+    assert(shift >= 0);
+    assert(mult <= std::numeric_limits<int32_t>::max());
+
+    shifts.push_back(static_cast<int32_t>(shift));
+    mults.push_back(static_cast<int32_t>(mult));
+    rescales.push_back(rescale);
+  }
+
+  return QSymm8PerChannelRescaleParams(shifts, mults, rescales);
+}
+
+QSymm8PerChannelRescaleParams QSymm8PerChannelRescaleParams::make_rescale_params(
+  const QSymm8PerChannelParams& weight_quant,
+  const qasymm8::QAsymm8Params& input_quant,
+  const qasymm8::QAsymm8Params& output_quant
+)
+{
+  std::vector<int32_t> shifts;
+  std::vector<int32_t> mults;
+  std::vector<float> rescales;
+
+  for(size_t s = 0; s < weight_quant.scales.size(); s++)
+  {
+    // Based on the gemmlowp approach: https://github.com/google/gemmlowp/blob/master/doc/quantization_example.cc
+    const float rescale = weight_quant.scales[s] * input_quant.scale / output_quant.scale;
+    const float shiftf = round(log2(0.5f / rescale));
+    const float multf = exp2(31.0f + shiftf)*rescale;
+
+    int64_t shift = static_cast<int64_t>(shiftf);
+    int64_t mult = static_cast<int64_t>(multf);
+
+    if (mult == (1ll << 31))
+    {
+      mult /= 2;
+      shift--;
+    }
+
+    assert(shift >= 0);
+    assert(mult <= std::numeric_limits<int32_t>::max());
+
+    shifts.push_back(static_cast<int32_t>(shift));
+    mults.push_back(static_cast<int32_t>(mult));
+    rescales.push_back(rescale);
+  }
+
+  return QSymm8PerChannelRescaleParams(shifts, mults, rescales);
+}
+
+QSymm8PerChannelRescaleParams::QSymm8PerChannelRescaleParams(std::vector<int32_t>& shifts, std::vector<int32_t>& multipliers, std::vector<float>& rescales)
+  : shifts(shifts), multipliers(multipliers), rescales(rescales)
+{
+}
+
+
+} // namespace qsymm8
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp
new file mode 100644
index 0000000000..88d8e9f112
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "impl_qa8_qs8_per_channel.hpp" + +namespace depthwise { +template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 1, 1>; +template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 2, 2>; +template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 1, 1>; +template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 2, 2>; +} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp index b102a24250..22231cf019 100644 --- a/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp +++ b/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp @@ -292,6 +292,7 @@ MEMBERFN(void)::run( // Parallelise over blocks of channels const auto start_channel = CHANNEL_BLOCK * start; const auto stop_channel = std::min(_n_channels, CHANNEL_BLOCK * stop); + const auto params_size_per_channel = this->get_packed_params_size()/_n_channels; // Compute top and bottom padding for input and output const int input_pad_top = _padding_top; @@ -325,7 +326,7 @@ MEMBERFN(void)::run( // Get the offset into the packed parameters const auto params_ptr = static_cast(_packed_parameters) + - start_channel*(sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias)); + start_channel*params_size_per_channel; // Process the row process_tile_row( diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp index e8f44b6bfd..81eb7b306c 100644 --- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp +++ b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp @@ -36,54 +36,6 @@ #include "impl_base.hpp" #include "depthwise_quantized.hpp" -#pragma once - -using namespace neon_convolution_kernels; -using namespace qasymm8; - -template -inline T saturating_doubling_high_mul(const T&, const int32_t&); - -template <> -inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b) -{ - return vqrdmulhq_n_s32(a, b); -} - -template <> -inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b) -{ - return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0); -} - -template -inline T rounding_divide_by_exp2(const T& x, const int exponent); - -template <> -inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent) -{ - const int32x4_t shift = vdupq_n_s32(-exponent); - const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31); - const int32x4_t fixed = vqaddq_s32(x, fixup); - return vrshlq_s32(fixed, shift); -} - -template <> -inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent) -{ - const int32x2_t shift = vdup_n_s32(-exponent); - const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31); - const int32x2_t fixed = vqadd_s32(x, fixup); - return vrshl_s32(fixed, shift); -} - -template <> -inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent) -{ - const int32x2_t xs = vdup_n_s32(x); - return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0); -} - namespace depthwise { template < diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp new file mode 100644 index 0000000000..b27430c242 --- /dev/null +++ b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + * + * NOTE: Header to be included by implementation files only. + * + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + */ + +#include + +#include "arm.hpp" +#include "impl_base.hpp" +#include "depthwise_quantized.hpp" + +#pragma once + +namespace { + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols, + typename FInput, typename FOutput +> +static inline void tilefn_hybrid( + int n_channels, + const void* packed_params, + FInput &get_input_ptr, + FOutput &get_output_ptr, + int32_t clamp_min, + int32_t clamp_max, + uint8_t input_offset, + uint8_t output_offset +) +{ + constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows; + constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols; + + // Offset into channels + int channel = 0; + + // Byte type pointer to weights and biases + const int8_t *wbptr = static_cast(packed_params); + + for (; n_channels >= 8; n_channels -= 8, channel += 8) + { + const int32x4_t biases[2] = { + vld1q_s32(reinterpret_cast(wbptr)), + vld1q_s32(reinterpret_cast(wbptr) + 4), + }; + const int32x4_t multipliers[2] = { + vld1q_s32(reinterpret_cast(wbptr) + 8), + vld1q_s32(reinterpret_cast(wbptr) + 12), + }; + const int32x4_t shifts[2] = { + vld1q_s32(reinterpret_cast(wbptr) + 16), + vld1q_s32(reinterpret_cast(wbptr) + 20), + }; + wbptr += 24*sizeof(int32_t); + + int16x8_t weights[KernelRows][KernelCols]; + for (unsigned int i = 0; i < KernelRows; i++) + { + for (unsigned int j = 0; j < KernelCols; j++) + { + const auto w = vld1_s8(wbptr); + weights[i][j] = reinterpret_cast(vmovl_s8(w)); + wbptr += 8; + } + } + + int16x8_t inputs[InnerTileRows][InnerTileCols]; + const uint8x8_t ioffset = vdup_n_u8(input_offset); + for (unsigned int i = 0; i < InnerTileRows; i++) + { + for (unsigned int j = 0; j < InnerTileCols; j++) + { + const auto x = vld1_u8(get_input_ptr(i, j, channel)); + inputs[i][j] = reinterpret_cast(vsubl_u8(x, ioffset)); + } + } + + for (unsigned int oi = 0; oi < OutputTileRows; oi++) + { + for (unsigned int oj = 0; oj < OutputTileCols; oj++) + { + int32x4_t accs[2]; + for (unsigned int i = 0; i < 2; i++) + { + accs[i] = biases[i]; + } + + for (unsigned int wi = 0; wi 
< KernelRows; wi++) + { + for (unsigned int wj = 0; wj < KernelCols; wj++) + { + const auto w = weights[wi][wj]; + const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj]; + accs[0] = vmlal_s16(accs[0], vget_low_s16(w), vget_low_s16(x)); + accs[1] = vmlal_s16(accs[1], vget_high_s16(w), vget_high_s16(x)); + } + } + + int32x4_t final_accs[2]; + for (unsigned int i = 0; i < 2; i++) + { + const int32x4_t y = rounding_divide_by_exp2( + saturating_doubling_high_mul(accs[i], multipliers[i]), + shifts[i]); + const int32x4_t offset = reinterpret_cast(vdupq_n_u32(output_offset)); + final_accs[i] = vaddq_s32(y, offset); + final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min)); + final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max)); + } + + const auto elems_s16 = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]), + vreinterpretq_s16_s32(final_accs[1])); + const int8x16_t elems = vreinterpretq_s8_s16(elems_s16.val[0]); + const uint8x8_t output = + vget_low_u8(vreinterpretq_u8_s8(vuzpq_s8(elems, elems).val[0])); + + vst1_u8(get_output_ptr(oi, oj, channel), output); + } + } + } + + for (; n_channels; n_channels--, channel++) + { + // Load bias + const int32_t bias = *reinterpret_cast(wbptr); + const int32_t multiplier = *reinterpret_cast(wbptr + sizeof(int32_t)); + const int32_t shift = *reinterpret_cast(wbptr + 2*sizeof(int32_t)); + + wbptr += 3*sizeof(int32_t); + + // Load weights + int16_t weights[KernelRows][KernelCols]; + for (unsigned int i = 0; i < KernelRows; i++) + { + for (unsigned int j = 0; j < KernelCols; j++) + { + weights[i][j] = *(wbptr++); + } + } + + // Load the input activations + int16_t inputs[InnerTileRows][InnerTileCols]; + for (unsigned int i = 0; i < InnerTileRows; i++) + { + for (unsigned int j = 0; j < InnerTileCols; j++) + { + inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset; + } + } + + // Perform the convolution + for (unsigned int oi = 0; oi < OutputTileRows; oi++) + { + for (unsigned int oj = 0; oj < OutputTileCols; oj++) + { + int32_t acc = bias; + + for (unsigned int wi = 0; wi < KernelRows; wi++) + { + for (unsigned int wj = 0; wj < KernelCols; wj++) + { + const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj]; + acc += w * x; + } + } + + // Requantize + acc = rounding_divide_by_exp2( + saturating_doubling_high_mul(acc, multiplier), + -shift); + acc += output_offset; + acc = std::max(acc, clamp_min); + acc = std::min(acc, clamp_max); + uint8_t output = static_cast(acc); + *(get_output_ptr(oi, oj, channel)) = output; + } + } + } +} + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols, + typename FInput, typename FOutput +> +static inline void execute_tilefn_hybrid( + int n_channels, + const void* packed_params, + const ActivationFunction actfn, + const qasymm8::QAsymm8Params &input_quant, + const qasymm8::QAsymm8Params &output_quant, + FInput &get_input_ptr, + FOutput &get_output_ptr) { + + // Compute min/max clamp values + int32_t clamp_min = std::numeric_limits::min(); + int32_t clamp_max = std::numeric_limits::max(); + + if (actfn == ActivationFunction::ReLU) { + clamp_min = output_quant.offset; + } + + // Disabling Relu6 for now + if (actfn == ActivationFunction::ReLU6) { + const int32_t top_rail = output_quant.quantize(6.0f); + clamp_max = std::min(clamp_max, top_rail); + } + + // Call the tile execution method + tilefn_hybrid(n_channels, packed_params, get_input_ptr, 
get_output_ptr, clamp_min, clamp_max, input_quant.offset, output_quant.offset); +} +} + + + +namespace depthwise { +using namespace qsymm8; +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +QSymm8HybridPerChannelDepthwiseConvolution< + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols +>::QSymm8HybridPerChannelDepthwiseConvolution( + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + const ActivationFunction activation, + const QSymm8PerChannelParams& weight_quantisation, + const qasymm8::QAsymm8Params& input_quantisation, + const qasymm8::QAsymm8Params& output_quantisation, + unsigned int padding_top, + unsigned int padding_left, + unsigned int padding_bottom, + unsigned int padding_right +) : QSymm8HybridPerChannelDepthwiseConvolution( + n_batches, n_input_rows, n_input_cols, n_channels, + activation, weight_quantisation, input_quantisation, output_quantisation, + QSymm8PerChannelRescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation), + padding_top, padding_left, padding_bottom, padding_right + ) +{ +} + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +QSymm8HybridPerChannelDepthwiseConvolution< + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols +>::QSymm8HybridPerChannelDepthwiseConvolution( + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + const ActivationFunction activation, + const QSymm8PerChannelParams& weight_quantisation, + const qasymm8::QAsymm8Params& input_quantisation, + const qasymm8::QAsymm8Params& output_quantisation, + const QSymm8PerChannelRescaleParams& rescale_params, + unsigned int padding_top, + unsigned int padding_left, + unsigned int padding_bottom, + unsigned int padding_right +) : Base( + n_batches, n_input_rows, n_input_cols, n_channels, activation, + padding_top, padding_left, padding_bottom, padding_right + ), + _weights_quant(weight_quantisation), + _input_quant(input_quantisation), + _output_quant(output_quantisation), + _rescale_parameters(rescale_params) +{ +} + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +uint8_t QSymm8HybridPerChannelDepthwiseConvolution< + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols +>::_input_padding_value(void) const +{ + return _input_quant.offset; +} + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +void QSymm8HybridPerChannelDepthwiseConvolution< + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols +>::_pack_params( + void * const buffer, + const void * const weights, + const unsigned int weight_row_stride, + const unsigned int weight_col_stride, + const void * const biases +) const +{ + const int8_t *wptr = static_cast(weights); + const int32_t *bptr = static_cast(biases); + const int32_t *mptr = static_cast(_rescale_parameters.multipliers.data()); + const int32_t *sptr = static_cast(_rescale_parameters.shifts.data()); + int8_t *outptr = static_cast(buffer); + + // We set the vector length to use doubles on both 
Aarch64 and Aarch32. NOTE + // For SVE set this to half the vector length. + unsigned int veclen = 8; + + // While there are channels left to process, pack a vector length of them at + // a time and reduce the size of vector used as the size of the tensor + // decreases. + for ( + unsigned int n_channels = this->n_channels(); n_channels; + n_channels -= veclen, + outptr += veclen*(3*sizeof(int32_t) + this->kernel_rows*this->kernel_cols) + ) + { + // NOTE Ignore this section if using SVE, the vector length remains the + // same and we just don't fill a full register for the tail. + while (n_channels < veclen) + { + // Reduce the vector length to either 8 or 1 (scalar) + // TODO Support more vector lengths in `execute_tile`. + veclen = (veclen == 16) ? 8 : 1; + } + + // Get pointers to bias and weight portions of the output structure. + int32_t *out_bptr = reinterpret_cast(outptr); + int32_t *out_mptr = reinterpret_cast(outptr + veclen*sizeof(int32_t)); + int32_t *out_sptr = reinterpret_cast(outptr + 2*veclen*sizeof(int32_t)); + int8_t *out_wptr = outptr + 3*veclen*sizeof(int32_t); + + // Copy a vector length of elements + for (unsigned int n = 0; n < veclen && n < n_channels; n++) + { + const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0; + const int32_t multiplier = (mptr != nullptr) ? *(mptr++) : 0; + const int32_t shift = (sptr != nullptr) ? *(sptr++) : 0; + + out_bptr[n] = bias; + out_mptr[n] = multiplier; + out_sptr[n] = -shift; + + for (unsigned int i = 0; i < KernelRows; i++) + { + int8_t *row_outptr = out_wptr + i*KernelCols*veclen; + for (unsigned int j = 0; j < KernelCols; j++) + { + int8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride); + row_outptr[j*veclen + n] = w; + } + } + wptr++; + } + } +} + + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +template +void QSymm8HybridPerChannelDepthwiseConvolution< + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols +>::execute_tile( + int n_channels, + const void* packed_params, + const uint8_t* inptr, + unsigned int in_row_stride, + unsigned int in_col_stride, + uint8_t* outptr, + unsigned int out_row_stride, + unsigned int out_col_stride +) { + + // Construct methods to get pointers + const auto get_input_ptr = [inptr, in_row_stride, in_col_stride]( + const int i, const int j, const int channel) { + return inptr + i * in_row_stride + j * in_col_stride + channel; + }; + + const auto get_output_ptr = [outptr, out_row_stride, out_col_stride]( + const int i, const int j, const int channel) { + return outptr + i * out_row_stride + j * out_col_stride + channel; + }; + + execute_tilefn_hybrid( + n_channels, packed_params, Activation, _input_quant, _output_quant, get_input_ptr, get_output_ptr); +} + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +template +void QSymm8HybridPerChannelDepthwiseConvolution< + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols +>::execute_tile( + int n_channels, + const void* packed_params, + const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], + uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols] +) { + // Construct methods to get pointers + const auto get_input_ptr = [inptrs](const int i, const int j, + const int channel) { + return inptrs[i][j] + channel; + 
}; + + const auto get_output_ptr = [outptrs](const int i, const int j, + const int channel) { + return outptrs[i][j] + channel; + }; + + // Call the tile execution method + execute_tilefn_hybrid( + n_channels, packed_params, Activation, _input_quant, _output_quant, get_input_ptr, get_output_ptr); +} + +} // namespace depthwise diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index 6cf7b97e66..5e47dd56ae 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -40,7 +40,10 @@ Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + if(!is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + } ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1); const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); @@ -55,7 +58,7 @@ Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); } - const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); + const bool is_quantized = (!is_data_type_quantized_per_channel(weights->data_type())) && is_data_type_quantized_asymmetric(input->data_type()); if(is_quantized) { @@ -67,7 +70,6 @@ Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo ARM_COMPUTE_UNUSED(multiplier); ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f); } - if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation)) { TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); @@ -88,7 +90,6 @@ Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo { ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info)); } - return Status{}; } } // namespace diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp index 92ad93e4a7..c564e22d46 100644 --- a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp +++ b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp @@ -84,6 +84,48 @@ std::unique_ptr get_qasymm8_convolver(int kern } } +std::unique_ptr get_qsymm8_perchannel_convolver(int kernel_size, int stride_x, + int n_batches, int in_rows, int in_cols, int n_channels, + neon_convolution_kernels::ActivationFunction activation, + const qsymm8::QSymm8PerChannelParams &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, + const qsymm8::QSymm8PerChannelRescaleParams &rescale_params, + int padding_top, int padding_left, int padding_bottom, int padding_right) +{ + switch(kernel_size) + { + case 3: + { + switch(stride_x) + { + case 1: + return arm_compute::support::cpp14::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, 
iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return arm_compute::support::cpp14::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + case 5: + { + switch(stride_x) + { + case 1: + return arm_compute::support::cpp14::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return arm_compute::support::cpp14::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + default: + return nullptr; + } +} + #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC std::unique_ptr get_fp16_convolver(int kernel_size, int stride_x, int n_batches, int in_rows, int in_cols, int n_channels, @@ -187,6 +229,9 @@ std::unique_ptr create_convolver(const ITensor const int padding_bottom = conv_info.pad_bottom(); const int padding_right = conv_info.pad_right(); + const bool is_uniform_quantized = (data_type == DataType::QASYMM8) && (weights->info()->data_type() == DataType::QASYMM8); + const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL); + const unsigned int stride_x = conv_info.stride().first; const unsigned int kernel_size = weights->info()->tensor_shape().y(); @@ -202,7 +247,7 @@ std::unique_ptr create_convolver(const ITensor } // Create quantized convolver - if(data_type == DataType::QASYMM8) + if(is_uniform_quantized) { const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); const UniformQuantizationInfo weights_qinfo = weights->info()->quantization_info().uniform(); @@ -226,6 +271,40 @@ std::unique_ptr create_convolver(const ITensor return get_qasymm8_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); } + else if(is_perchannel_quantized) + { + const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); + const QuantizationInfo weights_qinfo = weights->info()->quantization_info(); + const UniformQuantizationInfo output_qinfo = output->info()->quantization_info().uniform(); + + // Check that quantization info are in the range [0, 255] + ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); + ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); + const qasymm8::QAsymm8Params iqinfo{ static_cast(input_qinfo.offset), input_qinfo.scale }; + const qsymm8::QSymm8PerChannelParams wqinfo{ weights_qinfo.scale() }; + const qasymm8::QAsymm8Params oqinfo{ static_cast(output_qinfo.offset), output_qinfo.scale }; + + // Calculate rescale parameters + std::vector fmultipliers; + std::vector qmultipliers; + std::vector qshifts; + + for(auto const s : wqinfo.scales) + { + const float fmultipler = iqinfo.scale * s / oqinfo.scale; + int qmultiplier = 0; + int qshift = 0; + quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); + fmultipliers.push_back(fmultipler); + qmultipliers.push_back(qmultiplier); + qshifts.push_back(qshift); + } + + qsymm8::QSymm8PerChannelRescaleParams 
rescale_params(qshifts, qmultipliers, fmultipliers); + + return get_qsymm8_perchannel_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, activation, + wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + } else { // Create float convolver @@ -328,7 +407,10 @@ Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + if(weights->data_type() != DataType::QSYMM8_PER_CHANNEL) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); // Validate convolver @@ -378,7 +460,7 @@ bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITenso // Check data type const DataType data_type = weights->data_type(); - bool is_data_type_valid = is_data_type_float(data_type) || is_data_type_quantized_asymmetric(data_type); + bool is_data_type_valid = is_data_type_float(data_type) || is_data_type_quantized_asymmetric(data_type) || data_type == DataType::QSYMM8_PER_CHANNEL; // Check weighs size std::set supported_kernel_sizes = { 3, 5 }; @@ -402,7 +484,12 @@ bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITenso bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0); bool supported_padding = is_same_padding || is_valid_padding; // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported - bool is_dilation_supported = (dilation == Size2D(1U, 1U)) || ((dilation.x() == dilation.y()) && strides.first == 1); + bool is_dilation_supported = ((dilation == Size2D(1U, 1U)) || ((dilation.x() == dilation.y()) && strides.first == 1)); + + if(data_type == DataType::QSYMM8_PER_CHANNEL) + { + is_dilation_supported = is_dilation_supported && (dilation == Size2D(1U, 1U)); + } return is_data_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_supported; } diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp index 37d2373d7b..6d8c083c3f 100644 --- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp +++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp @@ -680,6 +680,33 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedSymmetricPe } TEST_SUITE_END() // Dilation TEST_SUITE_END() // Generic + +TEST_SUITE(Optimized) +FIXTURE_DATA_TEST_CASE(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(), + framework::dataset::make("DepthMultiplier", 1)), + framework::dataset::make("InputDataType", DataType::QASYMM8)), + framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })), + framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + ActivationFunctionsDataset)) +{ + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunLarge3x3, 
NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(), + framework::dataset::make("DepthMultiplier", 1)), + framework::dataset::make("InputDataType", DataType::QASYMM8)), + framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })), + framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + ActivationFunctionsDataset)) +{ + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // Optimized TEST_SUITE_END() // QSYMM8_PER_CHANNEL TEST_SUITE_END() // Quantized -- cgit v1.2.1
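Note: both QSymm8PerChannelRescaleParams::make_rescale_params() and the assembly dispatch (via quantization::calculate_quantized_multiplier_less_than_one) reduce each per-channel float rescale factor to an integer (multiplier, shift) pair that the kernels apply at run time. The standalone sketch below mirrors that derivation; the scales used are made up for illustration and are not taken from the patch or its tests.

    // Sketch of how a per-channel (multiplier, shift) pair is derived from the
    // float scales, mirroring QSymm8PerChannelRescaleParams::make_rescale_params().
    // Example scales are illustrative only.
    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main()
    {
        const float input_scale  = 0.3f;                        // QASYMM8 input
        const float output_scale = 0.5f;                        // QASYMM8 output
        const std::vector<float> weight_scales = {0.02f, 0.07f}; // QSYMM8_PER_CHANNEL weights

        for (float w : weight_scales)
        {
            const float rescale = w * input_scale / output_scale; // one value per channel
            const float shiftf  = std::round(std::log2(0.5f / rescale));
            const float multf   = std::exp2(31.0f + shiftf) * rescale;

            int64_t shift = static_cast<int64_t>(shiftf);
            int64_t mult  = static_cast<int64_t>(multf);
            if (mult == (1ll << 31)) { mult /= 2; shift--; }     // keep multiplier below 2^31

            assert(shift >= 0 && mult <= INT32_MAX);
            // At run time each channel's accumulator is multiplied by `mult`
            // (saturating doubling high multiply) and rounding-shifted right by `shift`.
            std::printf("rescale=%f multiplier=%lld shift=%lld\n",
                        rescale, (long long)mult, (long long)shift);
        }
    }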