5 files changed, 42 insertions, 41 deletions
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
index 6c31fa4fb1..8a4cf7abeb 100644
--- a/arm_compute/core/NEON/NEKernels.h
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -53,8 +53,8 @@
 #include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
 #include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h"
 #include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
 #include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
index 4529120f02..cd482ddbdf 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
@@ -58,7 +58,7 @@ public:
      *                       The 3rd dimension must be the same as the input's volume 3rd dimension.
      *                       Data type supported:Same as @p input.
      * @param[out] output    Output tensor.
-     *                       The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input.
+     *                       The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: QS16/QS32/F16/F32
      * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
      */
     void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info);
@@ -70,7 +70,7 @@ public:
      *                      The 3rd dimension must be the same as the input's volume 3rd dimension.
      *                      Data type supported:Same as @p input.
      * @param[in] output    Output tensor.
-     *                      The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input.
+     *                      The 3rd dimension must be equal to the 4th dimension of the @p kernels tensor. Data types supported: QS16/QS32/F16/F32
      * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
      *
      * @return a status
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
index 05ade1c5dd..46d52fc182 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
@@ -21,64 +21,64 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERBIASACCUMULATEKERNEL_H__
-#define __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERBIASACCUMULATEKERNEL_H__
+#ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H__
+#define __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H__
 
 #include "arm_compute/core/NEON/INEKernel.h"
 
 namespace arm_compute
 {
 class ITensor;
-/** NEON kernel to accumulate the biases to each element of the input tensor
+/** NEON kernel to accumulate the biases, if provided, or downscale in case of quantized input.
  *
  * @note We assume bias to be shared
  */
-class NEDirectConvolutionLayerBiasAccumulateKernel : public INEKernel
+class NEDirectConvolutionLayerOutputStageKernel : public INEKernel
 {
 public:
     /** Default constructor */
-    NEDirectConvolutionLayerBiasAccumulateKernel();
+    NEDirectConvolutionLayerOutputStageKernel();
     /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDirectConvolutionLayerBiasAccumulateKernel(const NEDirectConvolutionLayerBiasAccumulateKernel &) = delete;
+    NEDirectConvolutionLayerOutputStageKernel(const NEDirectConvolutionLayerOutputStageKernel &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDirectConvolutionLayerBiasAccumulateKernel &operator=(const NEDirectConvolutionLayerBiasAccumulateKernel &) = delete;
+    NEDirectConvolutionLayerOutputStageKernel &operator=(const NEDirectConvolutionLayerOutputStageKernel &) = delete;
     /** Allow instances of this class to be moved */
-    NEDirectConvolutionLayerBiasAccumulateKernel(NEDirectConvolutionLayerBiasAccumulateKernel &&) = default;
+    NEDirectConvolutionLayerOutputStageKernel(NEDirectConvolutionLayerOutputStageKernel &&) = default;
     /** Allow instances of this class to be moved */
-    NEDirectConvolutionLayerBiasAccumulateKernel &operator=(NEDirectConvolutionLayerBiasAccumulateKernel &&) = default;
+    NEDirectConvolutionLayerOutputStageKernel &operator=(NEDirectConvolutionLayerOutputStageKernel &&) = default;
     /** Default destructor */
-    ~NEDirectConvolutionLayerBiasAccumulateKernel() = default;
+    ~NEDirectConvolutionLayerOutputStageKernel() = default;
     /** Set the accumulate buffer and the biases of the kernel.
      *
      * @param[in, out] input  Input to add the bias to. If @p output is not specified then accumulation is done in-place.
-     *                        Data type supported: QS8/QS16/F16/F32
-     * @param[in]      bias   The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input
+     *                        Data type supported: QS16/QS32/F16/F32
+     * @param[in]      bias   (Optional) The shared bias tensor to add. It must be a 1D tensor. Defaults to nullptr. Data type supported: Same as @p input
      * @param[out]     output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
-     *                         Data type supported: Same as @p input
+     *                         Data type supported: QS8/QS16/F16/F32
      */
-    void configure(ITensor *input, const ITensor *bias, ITensor *output = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerBiasAccumulateKernel
+    void configure(ITensor *input, const ITensor *bias = nullptr, ITensor *output = nullptr);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerOutputStageKernel
      *
      * @param[in] input  Input to add the bias to. If @p output is not specified then accumulation is done in-place.
-     *                   Data type supported: QS8/QS16/F16/F32
-     * @param[in] bias   The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input
+     *                   Data type supported: QS16/QS32/F16/F32
+     * @param[in] bias   (Optional) The shared bias tensor to add. It must be a 1D tensor. Defaults to nullptr. Data type supported: Same as @p input
      * @param[in] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
-     *                         Data type supported: Same as @p input
+     *                   Data type supported: QS8/QS16/F16/F32
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output = nullptr);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *bias = nullptr, const ITensorInfo *output = nullptr);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
 
 private:
-    using BiasAccumulateKernel = void(ITensor *input, const ITensor *bias, const Window window, ITensor *output);
+    using OutputStageKernel = void(ITensor *input, const ITensor *bias, const Window window, ITensor *output);
 
 private:
-    BiasAccumulateKernel *_func;
-    ITensor              *_input;
-    const ITensor        *_bias;
-    ITensor              *_output;
+    OutputStageKernel *_func;
+    ITensor           *_input;
+    const ITensor     *_bias;
+    ITensor           *_output;
 };
 } // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERBIASACCUMULATEKERNEL_H__ */
+#endif /*__ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index 659594fe11..6208c20227 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -28,7 +28,7 @@
 #include "arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h"
 #include "arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h"
 #include "arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h"
 #include "arm_compute/core/Types.h"
@@ -67,10 +67,10 @@ public:
     void run() override;
 
 private:
-    NEDepthwiseConvolutionLayer3x3Kernel         _kernel;
-    NEDirectConvolutionLayerBiasAccumulateKernel _bias_kernel;
-    NEFillBorderKernel                           _border_handler;
-    bool                                         _has_bias;
+    NEDepthwiseConvolutionLayer3x3Kernel      _kernel;
+    NEDirectConvolutionLayerOutputStageKernel _output_stage_kernel;
+    NEFillBorderKernel                        _border_handler;
+    bool                                      _has_bias;
 };
 
 /** Basic function to execute a generic depthwise convolution. This function calls the following NEON kernels:
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
index 09a54968bb..e1aa839802 100644
--- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -24,8 +24,8 @@
 #ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H__
 #define __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H__
 
-#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h"
 #include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -42,7 +42,7 @@ namespace arm_compute
  *  This function calls the following NEON kernels:
  *
  * -# @ref NEFillBorderKernel for the input
- * -# @ref NEDirectConvolutionLayerBiasAccumulateKernel
+ * -# @ref NEDirectConvolutionLayerOutputStageKernel
  * -# @ref NEDirectConvolutionLayerKernel
  */
 class NEDirectConvolutionLayer : public IFunction
@@ -93,12 +93,13 @@ public:
     void run() override;
 
 private:
-    MemoryGroup                                  _memory_group;
-    NEDirectConvolutionLayerBiasAccumulateKernel _accumulate_bias_kernel;
-    NEDirectConvolutionLayerKernel               _conv_kernel;
-    NEFillBorderKernel                           _input_border_handler;
-    Tensor                                       _accumulator;
-    bool                                         _has_bias;
+    MemoryGroup                               _memory_group;
+    NEDirectConvolutionLayerOutputStageKernel _output_stage_kernel;
+    NEDirectConvolutionLayerKernel            _conv_kernel;
+    NEFillBorderKernel                        _input_border_handler;
+    Tensor                                    _accumulator;
+    bool                                      _has_bias;
+    bool                                      _is_fixed_point;
 };
 }
 #endif /* __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H__ */