From df4cf57c7394265b27d051cb1cf0152c53659126 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Wed, 9 Oct 2019 15:32:39 +0100 Subject: COMPMID-2306: CLDepthwiseConvolution: support for QUANT8_PER_CHANNEL_SYMM Change-Id: I18c886400daa2dcba0b91011bc4e503d807a4732 Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/2143 Comments-Addressed: Arm Jenkins Reviewed-by: Giorgio Arena Tested-by: Arm Jenkins --- arm_compute/core/CL/CLHelpers.h | 16 ++++++ .../CLDepthwiseConvolutionLayer3x3NCHWKernel.h | 58 ++++++++++++-------- .../CLDepthwiseConvolutionLayer3x3NHWCKernel.h | 56 ++++++++++++-------- .../CLDepthwiseConvolutionLayerNativeKernel.h | 61 ++++++++++++++-------- .../ICLDepthwiseConvolutionLayer3x3Kernel.h | 33 +++++++----- .../NEDepthwiseConvolutionLayerNativeKernel.h | 12 +++-- arm_compute/core/utils/quantization/AsymmHelpers.h | 17 +++++- .../CL/functions/CLDepthwiseConvolutionLayer.h | 37 +++++++++---- .../NEON/functions/NEDepthwiseConvolutionLayer.h | 33 +++++++----- 9 files changed, 215 insertions(+), 108 deletions(-) (limited to 'arm_compute') diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h index 1d647a86b0..9130e05121 100644 --- a/arm_compute/core/CL/CLHelpers.h +++ b/arm_compute/core/CL/CLHelpers.h @@ -50,6 +50,22 @@ static constexpr unsigned int max_cl_vector_width = 16; */ std::string get_cl_type_from_data_type(const DataType &dt); +/** Translates a tensor data type to the appropriate OpenCL promoted type. + * + * @param[in] dt @ref DataType to be used to get the promoted OpenCL type. + * + * @return The string specifying the OpenCL type to be used. + */ +std::string get_cl_promoted_type_from_data_type(const DataType &dt); + +/** Translates the element size to an unsigned integer data type + * + * @param[in] element_size Size in bytes of an element. + * + * @return The string specifying the OpenCL type to be used. + */ +std::string get_cl_unsigned_type_from_element_size(size_t element_size); + /** Translates a tensor data type to the appropriate OpenCL select type. * * @param[in] dt @ref DataType to be translated to OpenCL select type. diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h index 3b7fc7b7dc..a2f61ee058 100644 --- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h +++ b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h @@ -39,35 +39,47 @@ public: CLDepthwiseConvolutionLayer3x3NCHWKernel(); /** Initialize the function's source, destination, conv and border_size. * - * @param[in] input Source tensor. DataType supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[out] output Destination tensor. Data type supported: Same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for QASYMM8 supported. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). 
+ * @param[in] input Source tensor. DataType supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for QASYMM8 supported. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 */ - void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - ActivationLayerInfo act_info, const Size2D &dilation) override; + void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U), + const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override; /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NCHWKernel * - * @param[in] input Source tensor info. DataType supported: F16/F32/QASYMM8. - * @param[in] weights Weights tensor info. A 3D tensor with dimensions [3, 3, IFM]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[in] output Destination tensor. Data type supported: Same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported. - * @param[in] gpu_target (Optional) GPU target to validate the kernel for. Defaults to midgard. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] input Source tensor info. DataType supported: F16/F32/QASYMM8. + * @param[in] weights Weights tensor info. A 3D tensor with dimensions [3, 3, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. 
Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8. + * @param[in] output Destination tensor. Data type supported: Same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported. + * @param[in] gpu_target (Optional) GPU target to validate the kernel for. Defaults to midgard. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] output_multipliers (Optional) Output multipliers tensor info for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - ActivationLayerInfo act_info = ActivationLayerInfo(), GPUTarget gpu_target = GPUTarget::MIDGARD, const Size2D &dilation = Size2D(1U, 1U)); + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), GPUTarget gpu_target = GPUTarget::MIDGARD, + const Size2D &dilation = Size2D(1U, 1U), const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); void run(const Window &window, cl::CommandQueue &queue) override; BorderSize border_size() const override; diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h index 7d0ecec13e..e8cca954e2 100644 --- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h +++ b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h @@ -40,34 +40,46 @@ public: /** Default move assignment operator. */ /** Initialize the function's source, destination, conv and border_size. * - * @param[in] input Source tensor. DataType supported: QASYMM8. - * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, 3, 3]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[out] output Destination tensor. Data type supported: Same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] input Source tensor. 
DataType supported: QASYMM8. + * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, 3, 3]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 */ - void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - ActivationLayerInfo act_info, const Size2D &dilation) override; + void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U), + const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override; /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NHWCKernel * - * @param[in] input Source tensor info. DataType supported: QASYMM8. - * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, 3, 3]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[in] output Destination tensor info. Data type supported: Same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] input Source tensor info. DataType supported: QASYMM8. + * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, 3, 3]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8. + * @param[in] output Destination tensor info. 
Data type supported: Same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] output_multipliers (Optional) Output multipliers tensor info for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U), + const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h index 31ec871123..8e8df9c1f6 100644 --- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h +++ b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h @@ -49,36 +49,48 @@ public: CLDepthwiseConvolutionLayerNativeKernel &operator=(CLDepthwiseConvolutionLayerNativeKernel &&) = default; /** Initialize the function's source, destination and parameters * - * @param[in] input Source tensor. Data type supported: QASYMM8/FP32/FP16. Data layout supported: NHWC - * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[out] output Destination tensor. Data type supported: Same as @p input. - * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread - * @param[in] dwc_info Depthwise convolution layer info - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] input Source tensor. Data type supported: QASYMM8/FP32/FP16. Data layout supported: NHWC + * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M]. 
+ * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread + * @param[in] dwc_info Depthwise convolution layer info + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 */ - void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, - const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U)); + void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info, + const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U), + const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel * - * @param[in] input Source tensor info. Data type supported: QASYMM8/FP32/FP16. Data layout supported: NHWC - * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, N, M]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[in] output Destination tensor info. Data type supported: Same as @p input. - * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread - * @param[in] dwc_info Depthwise convolution layer info - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] input Source tensor info. Data type supported: QASYMM8/FP32/FP16. Data layout supported: NHWC + * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, N, M]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. 
+ * Data type supported: Same as @p input, S32 when input is QASYMM8. + * @param[in] output Destination tensor info. Data type supported: Same as @p input. + * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread + * @param[in] dwc_info Depthwise convolution layer info + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 * * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info, - const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U)); + const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U), + const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; @@ -89,6 +101,9 @@ private: const ICLTensor *_biases; ICLTensor *_output; unsigned int _depth_multiplier; + const ICLTensor *_output_multipliers; + const ICLTensor *_output_shifts; + bool _is_quantized; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h index 92eca89fd8..a6b4510115 100644 --- a/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h +++ b/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h @@ -37,7 +37,7 @@ class ICLDepthwiseConvolutionLayer3x3Kernel : public ICLKernel public: /** Default constructor */ ICLDepthwiseConvolutionLayer3x3Kernel() - : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_y(1) + : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_y(1), _output_multipliers(), _output_shifts(), _is_quantized(false) { } /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -50,18 +50,24 @@ public: ICLDepthwiseConvolutionLayer3x3Kernel &operator=(ICLDepthwiseConvolutionLayer3x3Kernel &&) = default; /** Initialize the function's source, destination, conv and border_size. * - * @param[in] input Source tensor. DataType supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[out] output Destination tensor. 
Data type supported: Same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for QASYMM8 supported. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] input Source tensor. DataType supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM]. + * Data type supported: Same as @p input, QASYMM8/QSYMM8_PER_CHANNEL when input is QASYMM8. + * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 */ - virtual void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, - ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)) = 0; + virtual void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U), + const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) = 0; protected: BorderSize _border_size; @@ -70,6 +76,9 @@ protected: const ICLTensor *_weights; const ICLTensor *_biases; unsigned int _conv_stride_y; + const ICLTensor *_output_multipliers; + const ICLTensor *_output_shifts; + bool _is_quantized; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h index a0205f1ea6..6db1a767d8 100644 --- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h @@ -58,8 +58,10 @@ public: * @note Supported data layouts: NHWC * * @param[in] input Source tensor. DataType supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H]. 
Data type supported: Same as @p input. - * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[out] output Destination tensor. Data type supported: Same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. @@ -73,8 +75,10 @@ public: * @note Supported data layouts: NHWC * * @param[in] input Source tensor info. DataType supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor info. This is a 3D tensor with dimensions [IFM, W, H]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. Data type supported: Same as @p input. + * @param[in] weights Weights tensor info. This is a 3D tensor with dimensions [IFM, W, H]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] output Destination tensor info. Data type supported: Same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h index bc5b9dbdba..8ec4a331f6 100644 --- a/arm_compute/core/utils/quantization/AsymmHelpers.h +++ b/arm_compute/core/utils/quantization/AsymmHelpers.h @@ -25,6 +25,7 @@ #define __ARM_COMPUTE_QUANTIZATION_ASYMM_HELPERS_H__ #include "arm_compute/core/Error.h" +#include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" namespace arm_compute @@ -60,9 +61,23 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier, int *quant Status calculate_quantized_multiplier_greater_than_one(float multiplier, int *quantized_multiplier, int *left_shift); /** Get minimum and maximum values for the input quantized data type * - * @ return min and max values for the quantized data type + * @return min and max values for the quantized data type */ std::pair get_min_max_values_from_quantized_data_type(DataType data_type); +/** Compute quantized per-channel multipliers and shifts. As many multipliers + * and shifts as output channels are computed. If weights are not quantized + * per-channel, multipliers and shifts will end up being the same for each + * channel. + * + * @param[in] input Input tensor. + * @param[in] weights Weights tensor. + * @param[in] output Output tensor. + * @param[out] output_multipliers_ptr Pointer to the buffer where to store per-channel multipliers. + * @param[out] output_shifts_ptr Pointer to the buffer where to store per-channel shifts. 
+ *
+ * @note The computed multipliers and shifts are written to @p output_multipliers_ptr and @p output_shifts_ptr; the function has no return value.
+ */
+void compute_quantized_multipliers_and_shifts(const ITensor *input, const ITensor *weights, const ITensor *output, int32_t *output_multipliers_ptr, int32_t *output_shifts_ptr);
} // namespace quantization
} // namespace arm_compute
#endif /* __ARM_COMPUTE_IO_FILE_HANDLER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
index b8b11f08b2..e15f62f779 100644
--- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
@@ -58,7 +58,8 @@ public:
 /** Initialize the function's source, destination, weights and convolution information.
 *
 * @param[in, out] input Source tensor. Data type supported: QASYMM8/FP16/FP32. Data layout supported: NHWC, NCHW
- * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input.
+ * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+ * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
 * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
 * Data type supported: Same as @p input, S32 when input is QASYMM8.
 * @param[out] output Destination tensor. Data type supported: same as @p input.
@@ -73,7 +74,8 @@ public:
 /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer
 *
 * @param[in] input Source tensor info. Data type supported: QASYMM8/FP16/FP32. Data layout supported: NHWC, NCHW
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input.
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+ * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
 * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
 * Data type supported: Same as @p input, S32 when input is QASYMM8.
 * @param[in] output Destination tensor. Data type supported: same as @p input.
@@ -95,7 +97,8 @@ private:
 /** Static function to choose the best depthwise convolution function for @ref CLDepthwiseConvolutionLayer
 *
 * @param[in] input Source tensor info. Data type supported: QASYMM8/FP16/FP32. Data layout supported: NHWC, NCHW
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input.
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+ * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
 * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
 * Data type supported: Same as @p input, S32 when input is QASYMM8.
 * @param[in] output Destination tensor. Data type supported: same as @p input.
@@ -135,7 +138,8 @@ private:
 /** Initialize the function's source, destination, conv and border_size.
 *
 * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor. 
A 3D tensor with shape [3, 3, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input. * @param[out] output Destination tensor. Data type supported: same as @p input. @@ -150,7 +154,8 @@ private: /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3 * * @param[in] input Source tensor info. Data type supported: QASYMM8 for all layouts, F16/F32 for NCHW. - * @param[in] weights Weights tensor info. A 3D tensor with shape [3, 3, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor info. A 3D tensor with shape [3, 3, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] output Destination tensor. Data type supported: same as @p input. @@ -184,10 +189,15 @@ private: CLTensor _permuted_input; CLTensor _permuted_weights; CLTensor _permuted_output; + CLTensor _output_multipliers; + CLTensor _output_shifts; const ITensor *_original_weights; + const ITensor *_input; + const ITensor *_output; bool _needs_permute; bool _needs_weights_reshape; bool _is_prepared; + bool _is_quantized; }; /** Basic function to execute a generic depthwise convolution. This function calls the following OpenCL kernels: @@ -212,7 +222,8 @@ private: /** Initialize the function's source, destination, weights and convolution information. * * @param[in, out] input Source tensor. Data type supported: QASYMM8/F32. (Written to only for border filling). - * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[out] output Destination tensor. Data type supported: same as @p input. @@ -227,7 +238,8 @@ private: /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerGeneric * * @param[in] input Source tensor info. Data type supported: QASYMM8/F32. - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] output Destination tensor. Data type supported: same as @p input. 
@@ -261,10 +273,15 @@ private: CLTensor _permuted_input; CLTensor _permuted_weights; CLTensor _permuted_output; + CLTensor _output_multipliers; + CLTensor _output_shifts; const ITensor *_original_weights; + const ITensor *_input; + const ITensor *_output; bool _needs_permute; bool _is_prepared; + bool _is_quantized; }; std::shared_ptr _memory_manager; @@ -298,7 +315,8 @@ public: /** Initialize the function's source, destination, conv and border_size. * * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor. A 3D tensor with shape [3, 3, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. A 3D tensor with shape [3, 3, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input. * @param[out] output Destination tensor. Data type supported: same as @p input. @@ -314,7 +332,8 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3 * * @param[in] input Source tensor info. Data type supported: QASYMM8 for all layouts, F16/F32 for NCHW. - * @param[in] weights Weights tensor info. A 3D tensor with shape [3, 3, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor info. A 3D tensor with shape [3, 3, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] output Destination tensor. Data type supported: same as @p input. diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h index 8fe9644963..efe9cdfbf0 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h @@ -56,9 +56,10 @@ public: * * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32 * @param[out] output Destination tensor. Data type supported: same as @p input. - * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. @@ -71,9 +72,10 @@ public: * * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 * @param[in] output Destination tensor. Data type supported: same as @p input. - * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. 
+ * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. @@ -92,9 +94,10 @@ private: /** Static function to choose the best depthwise convolution function for @ref NEDepthwiseConvolutionLayer * * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32 - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] output Destination tensor. Data type supported: same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. @@ -136,7 +139,7 @@ private: * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[out] output Destination tensor. Data type supported: same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. @@ -151,7 +154,7 @@ private: * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] output Destination tensor. Data type supported: same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. @@ -173,7 +176,7 @@ private: * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[in] weights Weights tensor. 
These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[out] output Destination tensor. Data type supported: same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. @@ -188,7 +191,7 @@ private: * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[out] output Destination tensor. Data type supported: same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. @@ -246,7 +249,8 @@ private: * * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[out] output Destination tensor. Data type supported: same as @p input. - * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] conv_info Padding and stride information to use for the convolution. @@ -261,7 +265,8 @@ private: * * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[in] output Destination tensor. Data type supported: same as @p input. - * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] conv_info Padding and stride information to use for the convolution. @@ -328,7 +333,7 @@ public: * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[out] output Destination tensor. Data type supported: same as @p input. 
* @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. @@ -344,7 +349,7 @@ public: * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] output Destination tensor. Data type supported: same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. -- cgit v1.2.1
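Illustrative usage (not part of this patch): a minimal sketch of driving the new per-channel path from the runtime function. Shapes, scales, and offsets are made-up placeholders, and the sketch assumes the post-patch CLDepthwiseConvolutionLayer interface with the default depth_multiplier, activation, and dilation.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"

#include <vector>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    constexpr unsigned int ifm = 8; // placeholder channel count

    CLTensor input, weights, biases, output;

    // Per-tensor asymmetric quantization on input/output (placeholder scales/offsets).
    input.allocator()->init(TensorInfo(TensorShape(16U, 16U, ifm), 1, DataType::QASYMM8, QuantizationInfo(0.02f, 128)));
    output.allocator()->init(TensorInfo(TensorShape(16U, 16U, ifm), 1, DataType::QASYMM8, QuantizationInfo(0.05f, 128)));

    // Symmetric per-channel quantization on the 3x3 weights: one scale per filter (IFM).
    const std::vector<float> weight_scales(ifm, 0.01f);
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, ifm), 1, DataType::QSYMM8_PER_CHANNEL, QuantizationInfo(weight_scales)));

    // Biases must be S32 when the input is QASYMM8.
    biases.allocator()->init(TensorInfo(TensorShape(ifm), 1, DataType::S32));

    CLDepthwiseConvolutionLayer dwc;
    dwc.configure(&input, &weights, &biases, &output, PadStrideInfo(1, 1, 1, 1));

    input.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill input/weights/biases with quantized data ...

    dwc.run();
    CLScheduler::get().sync();
    return 0;
}

The same tensor infos can be checked up front with the static CLDepthwiseConvolutionLayer::validate() overload before configure() is called.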
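At the kernel level, the new quantization::compute_quantized_multipliers_and_shifts() helper fills the S32 output_multipliers/output_shifts buffers from the tensors' quantization info. Below is a hypothetical standalone rendering of the per-channel computation it performs, built only on calculate_quantized_multiplier_less_than_one() from the same header; the function name and the loop are illustrative, not the library implementation.

#include "arm_compute/core/utils/quantization/AsymmHelpers.h"

#include <cstdint>
#include <vector>

using namespace arm_compute;

// One (multiplier, shift) pair is produced per output channel. When the
// weights are quantized per tensor, weight_scales holds a single repeated
// value and every channel receives the same pair.
void example_per_channel_requant(float input_scale, float output_scale,
                                 const std::vector<float> &weight_scales,
                                 std::vector<int32_t>     &multipliers,
                                 std::vector<int32_t>     &shifts)
{
    multipliers.resize(weight_scales.size());
    shifts.resize(weight_scales.size());
    for(size_t c = 0; c < weight_scales.size(); ++c)
    {
        // Real requantization factor folding input, weight and output scales;
        // assumed to be in (0, 1), as is typical for quantized convolutions.
        const float real_multiplier = input_scale * weight_scales[c] / output_scale;
        int         mult            = 0;
        int         shift           = 0;
        // Declared in AsymmHelpers.h: converts a float in (0, 1) into a
        // fixed-point multiplier and a right shift.
        quantization::calculate_quantized_multiplier_less_than_one(real_multiplier, &mult, &shift);
        multipliers[c] = mult;
        shifts[c]      = shift;
    }
    // These values would back the S32 output_multipliers / output_shifts
    // tensors passed to the kernels' configure()/validate() methods above.
}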