From f87cc7f6fef95f9b022725304118796a6a764a7c Mon Sep 17 00:00:00 2001 From: Pablo Tello Date: Wed, 26 Jul 2017 10:28:40 +0100 Subject: COMPMID-417: Port NEDirectConvolution 1x1 to QS16. Change-Id: Icae6a5091e836d0aca24375f43cca9e6d3a2090f Reviewed-on: http://mpd-gerrit.cambridge.arm.com/81662 Reviewed-by: Moritz Pflanzer Tested-by: Kaizen Reviewed-by: Anthony Barbier --- arm_compute/core/NEON/NEFixedPoint.h | 11 ++ arm_compute/core/NEON/NEFixedPoint.inl | 14 +++ .../NEDirectConvolutionLayerBiasAccumulateKernel.h | 2 +- .../NEON/kernels/NEDirectConvolutionLayerKernel.h | 2 +- arm_compute/core/Types.h | 1 + arm_compute/core/Utils.h | 3 + .../NEON/functions/NEDirectConvolutionLayer.h | 2 +- ...EDirectConvolutionLayerBiasAccumulateKernel.cpp | 84 ++++++++++---- .../kernels/NEDirectConvolutionLayerKernel.cpp | 128 ++++++++++++++++----- .../NEON/functions/NEDirectConvolutionLayer.cpp | 41 +++++-- tests/validation/NEON/DirectConvolutionLayer.cpp | 62 ++++++---- 11 files changed, 263 insertions(+), 87 deletions(-) diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h index 08f680801d..3de226112e 100644 --- a/arm_compute/core/NEON/NEFixedPoint.h +++ b/arm_compute/core/NEON/NEFixedPoint.h @@ -48,6 +48,7 @@ using qint16x8x3_t = int16x8x3_t; /**< 16 bit fixed point vector with 24 element using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 elements */ using qint32x2_t = int32x2_t; /**< 32 bit fixed point vector with 2 elements */ using qint32x4_t = int32x4_t; /**< 32 bit fixed point vector with 4 elements */ +using qint32x4x2_t = int32x4x2_t; /**< 32 bit fixed point vector with 8 elements */ /** Get the lower half of a 16 elements vector * @@ -673,6 +674,16 @@ qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position); */ qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position); +/** 16 bit fixed point vector long multiply (4 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 32 bit fixed point long vector multiplication. + */ +qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position); + /** 8 bit fixed point vector multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c). 
* * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl index c879d3e275..dd1066d6bc 100644 --- a/arm_compute/core/NEON/NEFixedPoint.inl +++ b/arm_compute/core/NEON/NEFixedPoint.inl @@ -624,6 +624,20 @@ inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position) return vqrshlq_s16(res, fixed_point_position_s16); } +inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position) +{ + const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position); + + // Initialize the temporary results with a constant used to round up the result + qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1)); + + // Vector multiply-accumulate long + tmp = vmull_s16(a, b); + + // Shift right by fixed_point_position + return vqshlq_s32(tmp, fixed_point_position_s32); +} + inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position) { const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h index f098e18655..87788ba389 100644 --- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h +++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h @@ -51,7 +51,7 @@ public: /** Set the accumulate buffer and the biases of the kernel. * * @param[in, out] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. - * Data type supported: QS8/F32 + * Data type supported: QS8/QS16/F16/F32 * @param[in] bias The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input * @param[out] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) * Data type supported: Same as @p input diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h index 5612e1ae62..e0dac9858b 100644 --- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h @@ -49,7 +49,7 @@ public: /** Set the input, weights, and output tensors. * * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/F32. + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. * The 3rd dimension must be the same as the input's volume 3rd dimension. * Data type supported:Same as @p input. 
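
The vmull_qs16 added above widens two QS16 vectors into a QS32 result and rescales the products by fixed_point_position, mirroring the existing vmull_qs8. Below is a minimal scalar sketch of the same arithmetic, assuming a plain Q-format layout; it is an illustration, not the library implementation, and the rounding and saturation applied by the NEON shift are left out.

#include <cstdint>
#include <iostream>

// Scalar model of one lane of vmull_qs16: widen the operands, multiply, then
// shift the product right by fixed_point_position to restore the QS scaling.
// Rounding and saturation performed by the NEON shift are omitted here.
int32_t scalar_mull_qs16(int16_t a, int16_t b, int fixed_point_position)
{
    const int64_t product = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    return static_cast<int32_t>(product >> fixed_point_position);
}

int main()
{
    const int     fpp = 14;              // 14 fractional bits: raw 0x4000 represents 1.0
    const int16_t x   = 3 << (fpp - 1);  // 1.5 in that format
    const int32_t y   = scalar_mull_qs16(x, x, fpp);
    std::cout << static_cast<double>(y) / (1 << fpp) << std::endl; // prints 2.25
    return 0;
}

With fixed_point_position = 14 the sketch multiplies 1.5 by 1.5 and recovers 2.25 from the widened QS32 result.
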
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index 765cae4ad4..2d3b3d6f66 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -68,6 +68,7 @@ enum class DataType QS16, U32, S32, + QS32, U64, S64, F16, diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h index 4ecd464cdb..af788beeb7 100644 --- a/arm_compute/core/Utils.h +++ b/arm_compute/core/Utils.h @@ -100,6 +100,7 @@ inline size_t data_size_from_type(DataType data_type) case DataType::F32: case DataType::U32: case DataType::S32: + case DataType::QS32: return 4; case DataType::F64: case DataType::U64: @@ -173,6 +174,7 @@ inline size_t element_size_from_data_type(DataType dt) case DataType::U32: case DataType::S32: case DataType::F32: + case DataType::QS32: return 4; default: ARM_COMPUTE_ERROR("Undefined element size for given data type"); @@ -645,6 +647,7 @@ inline bool is_data_type_fixed_point(DataType dt) { case DataType::QS8: case DataType::QS16: + case DataType::QS32: return true; default: return false; diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h index a66cab3013..872fae3a6b 100644 --- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h @@ -48,7 +48,7 @@ public: NEDirectConvolutionLayer(); /** Set the input, weights, biases and output tensors. * - * @param[in, out] input Input tensor. Data types supported: QS8/F16/F32. + * @param[in, out] input Input tensor. Data types supported: QS8/QS16/F16/F32. * @param[in] weights Set of kernels to convolve the input volume. * The 3rd dimension must be the same as the input's volume 3rd dimension. * Data type supported: Same as @p input. 
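
The new QS32 entry in DataType is the 32 bit signed fixed point format used further down as the accumulation type for the QS16 convolution; as with QS8 and QS16, a raw value encodes raw / 2^fixed_point_position, and the Utils.h changes register its 4 byte element size and mark it as a fixed point type. A small sketch of that interpretation follows, using hypothetical helper names that are not part of the library.

#include <cstdint>
#include <iostream>

// Hypothetical helpers (not library functions) showing how a raw QS32 value
// relates to the real number it encodes for a given fixed_point_position.
float qs32_to_float(int32_t raw, int fixed_point_position)
{
    return static_cast<float>(raw) / static_cast<float>(1 << fixed_point_position);
}

int32_t float_to_qs32(float value, int fixed_point_position)
{
    return static_cast<int32_t>(value * static_cast<float>(1 << fixed_point_position));
}

int main()
{
    const int     fpp = 5; // the tests below exercise fixed point positions 4 and 5
    const int32_t raw = float_to_qs32(3.25f, fpp);
    std::cout << raw << " encodes " << qs32_to_float(raw, fpp) << std::endl; // 104 encodes 3.25
    return 0;
}
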
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp index fb16c8dcc1..12ef064803 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp @@ -54,6 +54,11 @@ inline qint16x8_t internal_vld1q(const qint16_t *in) return vld1q_qs16(in); } +inline qint32x4_t internal_vld1q(const qint32_t *in) +{ + return vld1q_s32(in); +} + // Internal store inline void internal_vst1q(float *p, const float32x4_t &v) { @@ -72,6 +77,16 @@ inline void internal_vst1q(qint16_t *p, const qint16x8_t &v) vst1q_qs16(p, v); } +inline void internal_vst1q(qint32_t *p, const qint32x4_t &v) +{ + vst1q_s32(p, v); +} + +inline void internal_vst1q(qint16_t *p, const qint32x4_t &v) +{ + vst1_qs16(p, vqmovn_qs32(v)); +} + // Internal vdup inline float32x4_t internal_vdupq_n(float v) { @@ -86,6 +101,11 @@ inline qint16x8_t internal_vdupq_n(qint16_t v) return vdupq_n_qs16(v); } +inline qint32x4_t internal_vdupq_n(qint32_t v) +{ + return vdupq_n_qs32(v); +} + // Internal vadd inline float32x4_t internal_vqaddq(const float32x4_t &x, const float32x4_t &y) { @@ -99,6 +119,10 @@ inline qint16x8_t internal_vqaddq(const qint16x8_t &x, const qint16x8_t &y) { return vqaddq_qs16(x, y); } +inline qint32x4_t internal_vqaddq(const qint32x4_t &x, const qint32x4_t &y) +{ + return vqaddq_qs32(x, y); +} #ifdef ARM_COMPUTE_ENABLE_FP16 inline float16x8_t internal_vld1q(const float16_t *in) @@ -162,8 +186,8 @@ NEDirectConvolutionLayerBiasAccumulateKernel::NEDirectConvolutionLayerBiasAccumu void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, const ITensor *bias, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32); ARM_COMPUTE_ERROR_ON(input->info()->fixed_point_position() != bias->info()->fixed_point_position()); if(output != nullptr) { @@ -198,27 +222,47 @@ void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, con INEKernel::configure(win); // Set appropriate function - if(input->info()->data_type() == DataType::F32) + switch(input->info()->data_type()) { - _func = (output == nullptr) ? &accumulate_bias : &accumulate_bias; - } + case DataType::QS8: + { + _func = (output == nullptr) ? &accumulate_bias : &accumulate_bias; + break; + } + case DataType::QS16: + { + if(bias->info()->data_type() == DataType::QS8) + { + _func = (output == nullptr) ? &accumulate_bias : &accumulate_bias; + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } + break; + } + case DataType::QS32: + { + _func = (output == nullptr) ? &accumulate_bias : &accumulate_bias; + break; + } #ifdef ARM_COMPUTE_ENABLE_FP16 - else if(input->info()->data_type() == DataType::F16) - { - _func = (output == nullptr) ? &accumulate_bias : &accumulate_bias; - } + case DataType::F16: + { + _func = (output == nullptr) ? 
&accumulate_bias : &accumulate_bias; + break; + } #endif /* ARM_COMPUTE_ENABLE_FP16 */ - else if(input->info()->data_type() == DataType::QS8) - { - _func = (output == nullptr) ? &accumulate_bias : &accumulate_bias; - } - else if(input->info()->data_type() == DataType::QS16 && bias->info()->data_type() == DataType::QS8) - { - _func = (output == nullptr) ? &accumulate_bias : &accumulate_bias; - } - else - { - ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); + case DataType::F32: + { + _func = (output == nullptr) ? &accumulate_bias : &accumulate_bias; + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); + break; + } } } diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp index 09d8dd5642..43292d1b22 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp @@ -39,6 +39,34 @@ using namespace arm_compute; namespace { +template +qint16x8_t internal_vld1q(const qint16_t *in); + +template <> +qint16x8_t internal_vld1q<1>(const qint16_t *in) +{ + return vld1q_qs16(in); +} + +template <> +qint16x8_t internal_vld1q<2>(const qint16_t *in) +{ + const int16x8x2_t tmp = vld2q_s16(in); + return tmp.val[0]; +} + +template <> +qint16x8_t internal_vld1q<3>(const qint16_t *in) +{ + const int16x8x3_t tmp = vld3q_s16(in); + return tmp.val[0]; +} + +inline qint16x8_t internal_vdupq_n(qint16_t v) +{ + return vdupq_n_qs16(v); +} + #ifdef ARM_COMPUTE_ENABLE_FP16 template float16x8_t internal_vld1q(const float16_t *in); @@ -109,6 +137,28 @@ float32x4_t internal_vld1q<3>(const float *in) return tmp.val[0]; } +inline float32x4_t internal_vdupq_n(float v) +{ + return vdupq_n_f32(v); +} + +inline void internal_vst1q(float *p, const float32x4_t &v) +{ + vst1q_f32(p, v); +} + +float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position) +{ + ARM_COMPUTE_UNUSED(fixed_point_position); + return vmulq_f32(x, y); +} + +inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position) +{ + ARM_COMPUTE_UNUSED(fixed_point_position); + return vmlaq_f32(x, y, z); +} + template qint8x8_t internal_vld1q(const qint8_t *in); @@ -132,28 +182,19 @@ qint8x8_t internal_vld1q<3>(const qint8_t *in) return tmp.val[0]; } -template -qint16x8_t internal_vld1q(const qint16_t *in); - -template <> -qint16x8_t internal_vld1q<1>(const qint16_t *in) -{ - return vld1q_s16(in); -} - -inline float32x4_t internal_vdupq_n(float v) +inline qint8x8_t internal_vdupq_n(qint8_t v) { - return vdupq_n_f32(v); + return vdup_n_qs8(v); } -inline qint8x8_t internal_vdupq_n(qint8_t v) +inline qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position) { - return vdup_n_qs8(v); + return vmull_qs8(x, y, fixed_point_position); } -inline void internal_vst1q(float *p, const float32x4_t &v) +inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position) { - vst1q_f32(p, v); + return vqmlal_qs8(x, y, z, fixed_point_position); } inline void internal_vst1q(qint16_t *p, const qint16x8_t &v) @@ -161,26 +202,50 @@ inline void internal_vst1q(qint16_t *p, const qint16x8_t &v) vst1q_qs16(p, v); } -float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position) +inline void internal_vst1q(int *p, const qint32x4x2_t &v) { - 
ARM_COMPUTE_UNUSED(fixed_point_position); - return vmulq_f32(x, y); + vst1q_s32(p, v.val[0]); + vst1q_s32(p + 4, v.val[1]); } -qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position) +template +qint32x4x2_t internal_vld1q(const qint32_t *in); + +template <> +qint32x4x2_t internal_vld1q<1>(const qint32_t *in) { - return vmull_qs8(x, y, fixed_point_position); + const qint32x4x2_t r = + { + { + vld1q_s32(in), + vld1q_s32(in + 4) + } + }; + return r; } -inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position) +inline qint32x4x2_t internal_vmull(const qint16x8_t &x, const qint16x8_t &y, int fixed_point_position) { - ARM_COMPUTE_UNUSED(fixed_point_position); - return vmlaq_f32(x, y, z); + const qint32x4x2_t r = + { + { + vmull_qs16(vget_low_s16(x), vget_low_s16(y), fixed_point_position), + vmull_qs16(vget_high_s16(x), vget_high_s16(y), fixed_point_position), + } + }; + return r; } -inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position) +inline qint32x4x2_t internal_vmlal(const qint32x4x2_t &x, const qint16x8_t &y, const qint16x8_t &z, int fixed_point_position) { - return vqmlal_qs8(x, y, z, fixed_point_position); + const qint32x4x2_t r = + { + { + vqmlal_qs16(x.val[0], vget_low_s16(y), vget_low_s16(z), fixed_point_position), + vqmlal_qs16(x.val[1], vget_high_s16(y), vget_high_s16(z), fixed_point_position) + } + }; + return r; } template @@ -216,8 +281,7 @@ public: window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - + Window window_k = calculate_max_window(*weights->info(), Steps(1u)); Iterator out(output, window_out); Iterator in(input, window_in); Iterator k(weights, window_k); @@ -887,9 +951,9 @@ BorderSize NEDirectConvolutionLayerKernel::border_size() const void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::QS16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F16, DataType::QS16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32); ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())), "Pad > 0 not supported for 1x1 weights"); ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1), @@ -919,6 +983,7 @@ void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITens case DataType::F16: #endif /* ARM_COMPUTE_ENABLE_FP16 */ case DataType::QS8: + case DataType::QS16: _num_elems_written_per_iteration = 8; break; case DataType::F32: @@ -995,6 +1060,9 @@ void NEDirectConvolutionLayerKernel::run(const Window &window) case DataType::QS8: convolve_1x1(window, _num_elems_read_per_iteration, 
_num_elems_written_per_iteration, _input, _weights, _output, _conv_info); break; + case DataType::QS16: + convolve_1x1(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info); + break; case DataType::F32: convolve_1x1(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info); break; diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp index d5f03fcc41..0380e8cdb4 100644 --- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp @@ -40,7 +40,7 @@ NEDirectConvolutionLayer::NEDirectConvolutionLayer() void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); // Free accumulator if(_accumulator.buffer() != nullptr) @@ -49,17 +49,36 @@ void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, } // Allocate the intermediate accumulator tensor in case of fixed point input - if(output->info()->data_type() == DataType::QS8) + switch(output->info()->data_type()) { - _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position())); - _conv_kernel.configure(input, weights, &_accumulator, conv_info); - _accumulate_bias_kernel.configure(&_accumulator, bias, output); - _accumulator.allocator()->allocate(); - } - else - { - _conv_kernel.configure(input, weights, output, conv_info); - _accumulate_bias_kernel.configure(output, bias); + case DataType::QS8: + { + _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position())); + _conv_kernel.configure(input, weights, &_accumulator, conv_info); + _accumulate_bias_kernel.configure(&_accumulator, bias, output); + _accumulator.allocator()->allocate(); + break; + } + case DataType::QS16: + { + _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS32, output->info()->fixed_point_position())); + _conv_kernel.configure(input, weights, &_accumulator, conv_info); + _accumulate_bias_kernel.configure(&_accumulator, bias, output); + _accumulator.allocator()->allocate(); + break; + } + case DataType::F16: + case DataType::F32: + { + _conv_kernel.configure(input, weights, output, conv_info); + _accumulate_bias_kernel.configure(output, bias); + break; + } + default: + { + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } } // Add zero padding XY diff --git a/tests/validation/NEON/DirectConvolutionLayer.cpp b/tests/validation/NEON/DirectConvolutionLayer.cpp index effb898428..7022d656e9 100644 --- a/tests/validation/NEON/DirectConvolutionLayer.cpp +++ b/tests/validation/NEON/DirectConvolutionLayer.cpp @@ -48,11 +48,11 @@ using namespace arm_compute::test::validation; namespace { -const float tolerance_fp32 = 1e-3f; /**< Tolerance for floating point tests */ +const float tolerance_qs = 1.f; /**< Tolerance for 8 bit fixed point tests */ #ifdef ARM_COMPUTE_ENABLE_FP16 const float tolerance_fp16 = 0.01f; /**< Tolerance for half precision floating point tests */ #endif /* ARM_COMPUTE_ENABLE_FP16 */ -const float 
tolerance_qs8 = 1; /**< Tolerance for fixed point tests */ +const float tolerance_fp32 = 1e-3f; /**< Tolerance for floating point tests */ /** Compute NEON direct convolution layer function. * @@ -91,18 +91,30 @@ Tensor compute_convolution_layer(const TensorShape &src_shape, const TensorShape BOOST_TEST(!dst.info()->is_resizable()); // Fill tensors - if(dt == DataType::F16 || dt == DataType::F32) - { - std::uniform_real_distribution<> distribution(-1.f, 1.f); - library->fill(Accessor(src), distribution, 0); - library->fill(Accessor(weights), distribution, 1); - library->fill(Accessor(bias), distribution, 2); - } - else + switch(dt) { - library->fill_tensor_uniform(Accessor(src), 0); - library->fill_tensor_uniform(Accessor(weights), 1); - library->fill_tensor_uniform(Accessor(bias), 2); + case DataType::F16: + case DataType::F32: + { + std::uniform_real_distribution<> distribution(-1.f, 1.f); + library->fill(Accessor(src), distribution, 0); + library->fill(Accessor(weights), distribution, 1); + library->fill(Accessor(bias), distribution, 2); + break; + } + case DataType::QS8: + case DataType::QS16: + { + library->fill_tensor_uniform(Accessor(src), 0); + library->fill_tensor_uniform(Accessor(weights), 1); + library->fill_tensor_uniform(Accessor(bias), 2); + break; + } + default: + { + ARM_COMPUTE_ERROR("Data type not supported."); + break; + } } // Compute function @@ -221,8 +233,10 @@ BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE(Quantized) BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) BOOST_DATA_TEST_CASE(W1x1, - DirectConvolutionShapes() * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::make({ 1, 4, 8, 16 }) * boost::unit_test::data::make({ 4, 5 }), - input_shape, sx, sy, num_kernels, fixed_point_position) + DirectConvolutionShapes() * boost::unit_test::data::make({ DataType::QS8, DataType::QS16 }) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3, + 1) + * boost::unit_test::data::make({ 1, 4, 8, 16 }) * boost::unit_test::data::make({ 4, 5 }), + input_shape, dt, sx, sy, num_kernels, fixed_point_position) { const unsigned int kernel_size = 1; const PadStrideInfo conv_info(sx, sy, 0, 0, DimensionRoundingType::FLOOR); @@ -230,18 +244,20 @@ BOOST_DATA_TEST_CASE(W1x1, const TensorShape b_shape(static_cast(num_kernels)); const TensorShape d_shape(get_output_shape(input_shape, w_shape, conv_info)); - Tensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, DataType::QS8, conv_info, fixed_point_position); + Tensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, fixed_point_position); - RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, DataType::QS8, conv_info, fixed_point_position); + RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, fixed_point_position); // Validate output - validate(Accessor(dst), ref); + validate(Accessor(dst), ref, tolerance_qs); } BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) -BOOST_DATA_TEST_CASE(W3x3, DirectConvolutionShapes() * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(0, 2, 1) +BOOST_DATA_TEST_CASE(W3x3, DirectConvolutionShapes() * boost::unit_test::data::make(DataType::QS8) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3, + 1) + * boost::unit_test::data::xrange(0, 2, 
1) * boost::unit_test::data::xrange(0, 2, 1) * boost::unit_test::data::make({ 1, 4, 8, 16 }) * boost::unit_test::data::make({ 4, 5 }), - input_shape, sx, sy, px, py, num_kernels, fixed_point_position) + input_shape, dt, sx, sy, px, py, num_kernels, fixed_point_position) { const unsigned int kernel_size = 3; const PadStrideInfo conv_info(sx, sy, px, py, DimensionRoundingType::FLOOR); @@ -249,12 +265,12 @@ BOOST_DATA_TEST_CASE(W3x3, DirectConvolutionShapes() * boost::unit_test::data::x const TensorShape b_shape(static_cast(num_kernels)); const TensorShape d_shape(get_output_shape(input_shape, w_shape, conv_info)); - Tensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, DataType::QS8, conv_info, fixed_point_position); + Tensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, fixed_point_position); - RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, DataType::QS8, conv_info, fixed_point_position); + RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, fixed_point_position); // Validate output - validate(Accessor(dst), ref, tolerance_qs8); + validate(Accessor(dst), ref, tolerance_qs); } BOOST_AUTO_TEST_SUITE_END() -- cgit v1.2.1
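
Taken together, the QS16 1x1 path added by this patch works in three stages: NEDirectConvolutionLayerKernel widens each QS16 input/weight product into QS32 accumulators (the internal_vmull and internal_vmlal overloads over qint32x4x2_t), NEDirectConvolutionLayer allocates a QS32 intermediate tensor to hold them, and NEDirectConvolutionLayerBiasAccumulateKernel adds the bias and narrows the result back to QS16 with saturation (internal_vst1q storing through vqmovn_qs32). The following scalar sketch models one output element under those assumptions; the helper name is made up, and saturation is approximated with std::clamp rather than the saturating NEON intrinsics used by the kernels.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Scalar model of one output element of the QS16 1x1 path: every input/weight
// product is widened and rescaled into a 32 bit (QS32) accumulator, the bias
// (same fixed point position) is added, and the result is narrowed to QS16.
int16_t convolve_1x1_qs16_scalar(const std::vector<int16_t> &input,   // one x/y position across IFM
                                 const std::vector<int16_t> &weights, // 1x1 kernel across IFM
                                 int16_t bias, int fixed_point_position)
{
    int64_t acc = 0; // conceptually the QS32 accumulator tensor
    for(size_t ifm = 0; ifm < input.size(); ++ifm)
    {
        const int64_t product = static_cast<int64_t>(input[ifm]) * weights[ifm];
        acc += product >> fixed_point_position; // as vmull_qs16 / vqmlal_qs16 rescale each product
    }
    acc += bias;                                          // bias accumulate stage
    acc = std::clamp<int64_t>(acc, INT16_MIN, INT16_MAX); // narrow with saturation (vqmovn_qs32)
    return static_cast<int16_t>(acc);
}

int main()
{
    const int fpp = 4;
    const std::vector<int16_t> in      = { 16, 32, 48 };  // 1.0, 2.0, 3.0 in QS16 with 4 fractional bits
    const std::vector<int16_t> weights = { 16, 16, 16 };  // 1.0 each
    const int16_t bias                 = 8;               // 0.5
    const int16_t out = convolve_1x1_qs16_scalar(in, weights, bias, fpp);
    std::cout << static_cast<float>(out) / (1 << fpp) << std::endl; // prints 6.5
    return 0;
}

The validation tests above exercise exactly this path for 1x1 kernels with QS8 and QS16 inputs, comparing against the reference implementation with a tolerance of one fixed point step.
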