-rw-r--r--  arm_compute/core/FixedPoint.h                              11
-rw-r--r--  arm_compute/core/NEON/NEFixedPoint.h                       32
-rw-r--r--  arm_compute/core/NEON/NEFixedPoint.inl                     61
-rw-r--r--  arm_compute/core/NEON/kernels/NEActivationLayerKernel.h     8
-rw-r--r--  arm_compute/runtime/NEON/functions/NEActivationLayer.h      2
-rw-r--r--  src/core/NEON/kernels/NEActivationLayerKernel.cpp          149
-rw-r--r--  src/core/Utils.cpp                                           2
-rw-r--r--  tests/validation/Helpers.h                                   2
-rw-r--r--  tests/validation/NEON/ActivationLayer.cpp                   38
-rw-r--r--  tests/validation/TensorOperations.h                          2
10 files changed, 244 insertions(+), 63 deletions(-)
diff --git a/arm_compute/core/FixedPoint.h b/arm_compute/core/FixedPoint.h
index 5eb4c55c41..774125ec7d 100644
--- a/arm_compute/core/FixedPoint.h
+++ b/arm_compute/core/FixedPoint.h
@@ -251,7 +251,16 @@ qint16_t sdiv_qs16(qint16_t a, qint16_t b, int fixed_point_position);
*
* @return The result of the 8 bit fixed point exponential.
*/
-qint8_t sexp_qs8(qint8_t a, int fixed_point_position);
+qint8_t sqexp_qs8(qint8_t a, int fixed_point_position);
+
+/** 16 bit fixed point scalar exponential
+ *
+ * @param[in] a                    16 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point exponential.
+ */
+qint16_t sqexp_qs16(qint16_t a, int fixed_point_position);
/** 16 bit fixed point scalar exponential
*
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
index e3eb5d4638..e30509cd0a 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -176,6 +176,14 @@ void vst1q_qs8(qint8_t *addr, qint8x16_t b);
*/
void vst1q_qs16(qint16_t *addr, qint16x8_t b);
+/** Store two 16 bit fixed point vectors to memory (8x2 elements)
+ *
+ * @param[in] addr Memory address where the 16 bit fixed point vectors should be stored
+ * @param[in] b    16 bit fixed point vectors to store
+ *
+ */
+void vst2q_qs16(qint16_t *addr, qint16x8x2_t b);
+
/** 16 bit fixed point vector saturating narrow (8 elements)
*
* @param[in] a 16 bit fixed point vector to convert
@@ -1122,7 +1130,7 @@ qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position);
*
* @return The calculated Hyperbolic Tangent.
*/
-qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position);
+qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position);
/** Calculate hyperbolic tangent for fixed point 16 bit (4 elements)
*
@@ -1131,7 +1139,7 @@ qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position);
*
* @return The calculated Hyperbolic Tangent.
*/
-qint16x4_t vtanh_qs16(qint16x4_t a, int fixed_point_position);
+qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position);
/** Calculate hyperbolic tangent for fixed point 8bit (16 elements)
*
@@ -1140,7 +1148,16 @@ qint16x4_t vtanh_qs16(qint16x4_t a, int fixed_point_position);
*
* @return The calculated Hyperbolic Tangent.
*/
-qint8x16_t vtanhq_qs8(qint8x16_t a, int fixed_point_position);
+qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate hyperbolic tangent for fixed point 16bit (8 elements)
+ *
+ * @param[in] a 16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position);
/** Calculate saturating n power for fixed point 8bit (16 elements).
*
@@ -1162,15 +1179,6 @@ qint8x8_t vqpowq_qs8(qint8x8_t a, qint8x16_t b, int fixed_point_position);
* @return The lane-by-lane maximum -> float32x4x2
*/
float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b);
-
-/** Calculate hyperbolic tangent for fixed point 8bit (8 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The calculated Hyperbolic Tangent.
- */
-qint16x8_t vtanhq_qs16(qint16x8_t a, int fixed_point_position);
}
#include "arm_compute/core/NEON/NEFixedPoint.inl"
#endif /* __ARM_COMPUTE_NEFIXEDPOINT_H__ */
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index 92af82cf71..b241dd5069 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -200,6 +200,11 @@ inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
vst1q_s16(addr, b);
}
+inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
+{
+ vst2q_s16(addr, b);
+}
+
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
return vqmovn_s16(a);
@@ -1641,15 +1646,15 @@ inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
// Find shift value. Number must be in (0.5, 2) range.
- qint8x8_t shift_value = vneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
+ qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
// Add one when the shift value is negative in order to get the correct result when we shift right with 1
qint8x8_t temp = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
temp = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
- qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
+ qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));
- temp = vshl_s8(a, shift_value);
+ temp = vqshl_s8(a, shift_value);
// Initial guess
qint8x8_t x = temp;
@@ -1660,7 +1665,7 @@ inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- return vshl_s8(x, shift_value2);
+ return vqshl_s8(x, shift_value2);
}
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
@@ -1668,15 +1673,15 @@ inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);
// Find shift value. Number must be in (0.5, 2) range.
- qint16x4_t shift_value = vneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
+ qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
// Add one when the shift value is negative in order to get the correct result when we shift right with 1
qint16x4_t temp = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
temp = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
- qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));
+ qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));
- temp = vshl_s16(a, shift_value);
+ temp = vqshl_s16(a, shift_value);
// Initial guess
qint16x4_t x = temp;
@@ -1753,15 +1758,15 @@ inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
// Find shift value. Number must be in (0.5, 2) range.
- qint8x16_t shift_value = vnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
+ qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
// Add one when the shift value is negative in order to get the correct result when we shift right with 1
qint8x16_t temp = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
temp = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
- qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));
+ qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));
- temp = vshlq_s8(a, shift_value);
+ temp = vqshlq_s8(a, shift_value);
// Initial guess
qint8x16_t x = temp;
@@ -1780,13 +1785,13 @@ inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);
// Find shift value. Number must be in (0.5, 2) range.
- qint16x8_t shift_value = vnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
+ qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
// Add one when the shift value is negative in order to get the correct result when we shift right with 1
qint16x8_t temp = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
temp = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
- qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));
+ qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1));
temp = vqshlq_s16(a, shift_value);
@@ -1804,7 +1809,7 @@ inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
return vqshlq_s16(x, shift_value2);
}
-inline qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position)
+inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
{
const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
@@ -1817,7 +1822,7 @@ inline qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position)
return tanh;
}
-inline qint16x4_t vtanh_qs16(qint16x4_t a, int fixed_point_position)
+inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
{
const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
@@ -1830,7 +1835,7 @@ inline qint16x4_t vtanh_qs16(qint16x4_t a, int fixed_point_position)
return tanh;
}
-inline qint8x16_t vtanhq_qs8(qint8x16_t a, int fixed_point_position)
+inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
{
const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
@@ -1843,6 +1848,19 @@ inline qint8x16_t vtanhq_qs8(qint8x16_t a, int fixed_point_position)
return tanh;
}
+inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
+{
+ const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
+ const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
+
+ qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
+ qint16x8_t num = vqsubq_qs16(exp2x, const_one);
+ qint16x8_t den = vqaddq_qs16(exp2x, const_one);
+ qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
+
+ return tanh;
+}
+
inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
@@ -1859,17 +1877,4 @@ inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
};
return res;
}
-
-inline qint16x8_t vtanhq_qs16(qint16x8_t a, int fixed_point_position)
-{
- const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
- const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
-
- qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
- qint16x8_t num = vqsubq_qs16(exp2x, const_one);
- qint16x8_t den = vqaddq_qs16(exp2x, const_one);
- qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
-
- return tanh;
-}
}
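
Note: the new vqtanhq_qs16 follows the same scheme as the existing 8-bit variants: the hyperbolic tangent is evaluated through the exponential identity tanh(x) = (e^(2x) - 1) / (e^(2x) + 1), with every intermediate kept in the tensor's Q format via vqexpq_qs16, vqsubq_qs16, vqaddq_qs16 and vqrecipq_qs16. As a point of reference, a minimal floating-point sketch of the same identity (illustration only, not part of the library):

    #include <cmath>

    // Scalar, floating-point restatement of the identity used by vqtanhq_qs16:
    // tanh(x) = (e^(2x) - 1) / (e^(2x) + 1)
    float tanh_via_exp(float x)
    {
        const float exp2x = std::exp(2.0f * x);
        return (exp2x - 1.0f) / (exp2x + 1.0f);
    }
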
diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
index 539bca587a..e995f1e5e0 100644
--- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
@@ -50,7 +50,7 @@ public:
* @note If the output tensor is a nullptr, the activation function will be performed in-place
*
* @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QS8/F32.
+ * of the activation function. Data types supported: QS8/QS16/F32.
* @param[out] output Destination tensor. Data type supported: same as @p input
* @param[in] activation_info Activation layer information.
*/
@@ -78,6 +78,12 @@ private:
*/
template <ActivationLayerInfo::ActivationFunction F, typename T>
typename std::enable_if<std::is_same<T, qint8_t>::value, void>::type activation(const Window &window);
+ /** Function to apply an activation function on a tensor.
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ template <ActivationLayerInfo::ActivationFunction F, typename T>
+ typename std::enable_if<std::is_same<T, qint16_t>::value, void>::type activation(const Window &window);
private:
ITensor *_input;
diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
index b1a211553d..f3cd305910 100644
--- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
@@ -44,7 +44,7 @@ public:
* @note If the output tensor is a nullptr, the activation function will be performed in-place
*
* @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QS8/F32.
+ * of the activation function. Data types supported: QS8/QS16/F32.
* @param[out] output Destination tensor. Data type supported: same as @p input
* @param[in] activation_info Activation layer parameters.
*/
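
Note: with QS16 now accepted by the runtime function, an activation can be configured on a 16-bit fixed-point tensor in the usual way. A minimal sketch, assuming 5 fractional bits and an illustrative 16x16 shape (names and values are not taken from the patch):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // QS16 tensor with 5 bits of fractional precision (illustrative shape).
        Tensor src;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::QS16, 5));

        // Passing nullptr as output runs the activation in-place; TANH computes a * tanh(b * x), here with a = b = 1.
        NEActivationLayer act;
        act.configure(&src, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));

        src.allocator()->allocate();
        // ... fill src with QS16 data ...
        act.run();

        return 0;
    }
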
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index 492d197925..f530413453 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -47,7 +47,7 @@ NEActivationLayerKernel::NEActivationLayerKernel()
void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32);
_input = input;
_act_info = activation_info;
@@ -78,7 +78,6 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat
{ ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, float> },
{ ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, float> },
};
-
// Activation functions : QS8
static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qs8 =
{
@@ -92,15 +91,31 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat
{ ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, qint8_t> },
{ ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, qint8_t> },
};
+ // Activation functions : QS16
+ static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qs16 =
+ {
+ { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS, qint16_t> },
+ { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR, qint16_t> },
+ { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, qint16_t> },
+ { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, qint16_t> },
+ { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, qint16_t> },
+ { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, qint16_t> },
+ { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, qint16_t> },
+ { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, qint16_t> },
+ { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, qint16_t> },
+ };
switch(input->info()->data_type())
{
- case DataType::F32:
- _func = act_map_f32[activation_info.activation()];
- break;
case DataType::QS8:
_func = act_map_qs8[activation_info.activation()];
break;
+ case DataType::QS16:
+ _func = act_map_qs16[activation_info.activation()];
+ break;
+ case DataType::F32:
+ _func = act_map_f32[activation_info.activation()];
+ break;
default:
ARM_COMPUTE_ERROR("Unsupported data type.");
}
@@ -262,9 +277,9 @@ typename std::enable_if<std::is_same<T, float>::value, void>::type NEActivationL
template <ActivationLayerInfo::ActivationFunction F, typename T>
typename std::enable_if<std::is_same<T, int8_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
{
- Iterator input(_input, window);
- Iterator output(_output, window);
- int fixed_point_position = _input->info()->fixed_point_position();
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+ const int fixed_point_position = _input->info()->fixed_point_position();
static const qint8x16_t CONST_0 = vdupq_n_qs8(0);
const qint8x16_t CONST_1 = vdupq_n_qs8(sqcvt_qs8_f32(1.f, fixed_point_position));
@@ -291,7 +306,7 @@ typename std::enable_if<std::is_same<T, int8_t>::value, void>::type NEActivation
tmp = vqmlaq_qs8(b, a, in, fixed_point_position);
break;
case ActivationFunction::LOGISTIC:
- tmp = vrecipq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(vnegq_s8(in), fixed_point_position)), fixed_point_position);
+ tmp = vqrecipq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(vnegq_s8(in), fixed_point_position)), fixed_point_position);
break;
case ActivationFunction::RELU:
tmp = vmaxq_qs8(CONST_0, in);
@@ -300,13 +315,13 @@ typename std::enable_if<std::is_same<T, int8_t>::value, void>::type NEActivation
tmp = vlogq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(in, fixed_point_position)), fixed_point_position);
break;
case ActivationFunction::SQRT:
- tmp = vrecipq_qs8(vinvsqrtq_qs8(in, fixed_point_position), fixed_point_position);
+ tmp = vqrecipq_qs8(vqinvsqrtq_qs8(in, fixed_point_position), fixed_point_position);
break;
case ActivationFunction::SQUARE:
tmp = vqmulq_qs8(in, in, fixed_point_position);
break;
case ActivationFunction::TANH:
- tmp = vtanhq_qs8(in, fixed_point_position);
+ tmp = vqmulq_qs8(a, vqtanhq_qs8(vqmulq_qs8(b, in, fixed_point_position), fixed_point_position), fixed_point_position);
break;
default:
break;
@@ -317,6 +332,118 @@ typename std::enable_if<std::is_same<T, int8_t>::value, void>::type NEActivation
input, output);
}
+template <ActivationLayerInfo::ActivationFunction F, typename T>
+typename std::enable_if<std::is_same<T, int16_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+ const int fixed_point_position = _input->info()->fixed_point_position();
+
+ static const qint16x8_t CONST_0 = vdupq_n_qs16(0);
+ const qint16x8_t CONST_1 = vdupq_n_qs16(sqcvt_qs16_f32(1.f, fixed_point_position));
+ const qint16x8_t a = vdupq_n_qs16(sqcvt_qs16_f32(_act_info.a(), fixed_point_position));
+ const qint16x8_t b = vdupq_n_qs16(sqcvt_qs16_f32(_act_info.b(), fixed_point_position));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ const qint16x8x2_t in = vld2q_s16(input_ptr);
+ qint16x8x2_t tmp = { {} };
+
+ switch(F)
+ {
+ case ActivationFunction::ABS:
+ tmp =
+ {
+ {
+ vqabsq_qs16(in.val[0]),
+ vqabsq_qs16(in.val[1]),
+ }
+ };
+ break;
+ case ActivationFunction::BOUNDED_RELU:
+ tmp =
+ {
+ {
+ vminq_qs16(a, vmaxq_qs16(CONST_0, in.val[0])),
+ vminq_qs16(a, vmaxq_qs16(CONST_0, in.val[1])),
+ }
+ };
+ break;
+ case ActivationFunction::LINEAR:
+ tmp =
+ {
+ {
+ vqmlaq_qs16(b, a, in.val[0], fixed_point_position),
+ vqmlaq_qs16(b, a, in.val[1], fixed_point_position),
+ }
+ };
+ break;
+ case ActivationFunction::LOGISTIC:
+ tmp =
+ {
+ {
+ vqrecipq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(vnegq_s16(in.val[0]), fixed_point_position)), fixed_point_position),
+ vqrecipq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(vnegq_s16(in.val[1]), fixed_point_position)), fixed_point_position),
+ }
+ };
+ break;
+ case ActivationFunction::RELU:
+ tmp =
+ {
+ {
+ vmaxq_qs16(CONST_0, in.val[0]),
+ vmaxq_qs16(CONST_0, in.val[1]),
+ }
+ };
+ break;
+ case ActivationFunction::SOFT_RELU:
+ tmp =
+ {
+ {
+ vlogq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(in.val[0], fixed_point_position)), fixed_point_position),
+ vlogq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(in.val[1], fixed_point_position)), fixed_point_position),
+ }
+ };
+ break;
+ case ActivationFunction::SQRT:
+ tmp =
+ {
+ {
+ vqrecipq_qs16(vqinvsqrtq_qs16(in.val[0], fixed_point_position), fixed_point_position),
+ vqrecipq_qs16(vqinvsqrtq_qs16(in.val[1], fixed_point_position), fixed_point_position),
+ }
+ };
+ break;
+ case ActivationFunction::SQUARE:
+ tmp =
+ {
+ {
+ vqmulq_qs16(in.val[0], in.val[0], fixed_point_position),
+ vqmulq_qs16(in.val[1], in.val[1], fixed_point_position),
+ }
+ };
+ break;
+ case ActivationFunction::TANH:
+ tmp =
+ {
+ {
+ vqmulq_qs16(a, vqtanhq_qs16(vqmulq_qs16(b, in.val[0], fixed_point_position), fixed_point_position), fixed_point_position),
+ vqmulq_qs16(a, vqtanhq_qs16(vqmulq_qs16(b, in.val[1], fixed_point_position), fixed_point_position), fixed_point_position),
+ }
+ };
+ break;
+ default:
+ break;
+ }
+
+ vst2q_qs16(output_ptr, tmp);
+ },
+ input, output);
+}
+
void NEActivationLayerKernel::run(const Window &window)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
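
Note: the QS16 path mirrors the QS8 one but processes 16 values per iteration: vld2q_s16 de-interleaves them into the two halves of a qint16x8x2_t, the selected function is applied to each half, and the new vst2q_qs16 wrapper re-interleaves the result on store. A condensed sketch of that pattern, RELU case only (helper name is illustrative):

    #include "arm_compute/core/NEON/NEFixedPoint.h"

    using namespace arm_compute;

    // Condensed sketch of the QS16 loop body, RELU case only: 16 elements in, 16 elements out.
    inline void relu_qs16_x16(const qint16_t *input_ptr, qint16_t *output_ptr)
    {
        const qint16x8_t   zero = vdupq_n_qs16(0);
        const qint16x8x2_t in   = vld2q_s16(input_ptr); // de-interleave into two 8-element vectors
        qint16x8x2_t       out  =
        {
            {
                vmaxq_qs16(zero, in.val[0]),
                vmaxq_qs16(zero, in.val[1]),
            }
        };
        vst2q_qs16(output_ptr, out); // re-interleave and store the 16 results
    }
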
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index f6230c0199..11b41aa178 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -286,6 +286,7 @@ void arm_compute::print_consecutive_elements(std::ostream &s, DataType dt, const
case DataType::U16:
print_consecutive_elements_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n, stream_width, element_delim);
break;
+ case DataType::QS16:
case DataType::S16:
print_consecutive_elements_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n, stream_width, element_delim);
break;
@@ -316,6 +317,7 @@ int arm_compute::max_consecutive_elements_display_width(std::ostream &s, DataTyp
return max_consecutive_elements_display_width_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n);
case DataType::U16:
return max_consecutive_elements_display_width_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n);
+ case DataType::QS16:
case DataType::S16:
return max_consecutive_elements_display_width_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n);
case DataType::U32:
diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h
index d92699d93e..a551da731e 100644
--- a/tests/validation/Helpers.h
+++ b/tests/validation/Helpers.h
@@ -90,7 +90,7 @@ std::pair<T, T> get_activation_layer_test_bounds(ActivationLayerInfo::Activation
break;
case ActivationLayerInfo::ActivationFunction::SQRT:
// Reduce range as sqrt should take a non-negative number
- bounds.first = (is_float) ? 0 : 1 << (fixed_point_position);
+ bounds.first = (is_float) ? 0 : 1;
break;
default:
break;
diff --git a/tests/validation/NEON/ActivationLayer.cpp b/tests/validation/NEON/ActivationLayer.cpp
index 40be32278d..71dfcdc4e2 100644
--- a/tests/validation/NEON/ActivationLayer.cpp
+++ b/tests/validation/NEON/ActivationLayer.cpp
@@ -193,10 +193,11 @@ BOOST_DATA_TEST_CASE(Configuration, boost::unit_test::data::make({ false, true }
BOOST_AUTO_TEST_SUITE(Float)
BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
-BOOST_DATA_TEST_CASE(RunSmall, boost::unit_test::data::make({ false, true }) * SmallShapes() * CNNFloatDataTypes() * ActivationFunctions(), in_place, shape, dt, act_function)
+BOOST_DATA_TEST_CASE(RunSmall, boost::unit_test::data::make({ false, true }) * SmallShapes() * CNNFloatDataTypes() * ActivationFunctions() * boost::unit_test::data::make({ 0.5f, 1.f }),
+ in_place, shape, dt, act_function, alpha_beta)
{
// Create activation layer info
- ActivationLayerInfo act_info(act_function, 1.f, 1.f);
+ ActivationLayerInfo act_info(act_function, alpha_beta, alpha_beta);
// Compute function
Tensor dst = compute_activation_layer(in_place, shape, dt, act_info);
@@ -209,10 +210,11 @@ BOOST_DATA_TEST_CASE(RunSmall, boost::unit_test::data::make({ false, true }) * S
}
BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
-BOOST_DATA_TEST_CASE(RunLarge, boost::unit_test::data::make({ false, true }) * LargeShapes() * CNNFloatDataTypes() * ActivationFunctions(), in_place, shape, dt, act_function)
+BOOST_DATA_TEST_CASE(RunLarge, boost::unit_test::data::make({ false, true }) * LargeShapes() * CNNFloatDataTypes() * ActivationFunctions() * boost::unit_test::data::make({ 0.5f, 1.f }),
+ in_place, shape, dt, act_function, alpha_beta)
{
// Create activation layer info
- ActivationLayerInfo act_info(act_function, 1.f, 1.f);
+ ActivationLayerInfo act_info(act_function, alpha_beta, alpha_beta);
// Compute function
Tensor dst = compute_activation_layer(in_place, shape, dt, act_info);
@@ -229,12 +231,13 @@ BOOST_AUTO_TEST_SUITE_END()
* cause overflowing issues in most of the transcendentals functions.
*/
BOOST_AUTO_TEST_SUITE(Quantized)
+BOOST_AUTO_TEST_SUITE(QS8)
BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
-BOOST_DATA_TEST_CASE(RunSmall, boost::unit_test::data::make({ false, true }) * SmallShapes() * ActivationFunctions() * boost::unit_test::data::xrange(3, 6, 1),
- in_place, shape, act_function, fixed_point_position)
+BOOST_DATA_TEST_CASE(RunSmall, boost::unit_test::data::make({ false, true }) * SmallShapes() * ActivationFunctions() * boost::unit_test::data::xrange(3, 6, 1) * boost::unit_test::data::make({ 0.5f, 1.f }),
+ in_place, shape, act_function, fixed_point_position, alpha_beta)
{
// Create activation layer info
- ActivationLayerInfo act_info(act_function, 1.f, 1.f);
+ ActivationLayerInfo act_info(act_function, alpha_beta, alpha_beta);
// Compute function
Tensor dst = compute_activation_layer(in_place, shape, DataType::QS8, act_info, fixed_point_position);
@@ -247,6 +250,27 @@ BOOST_DATA_TEST_CASE(RunSmall, boost::unit_test::data::make({ false, true }) * S
}
BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE(QS16)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, boost::unit_test::data::make({ false, true }) * SmallShapes() * ActivationFunctions() * boost::unit_test::data::xrange(3, 6, 1) * boost::unit_test::data::make({ 0.5f, 1.f }),
+ in_place, shape, act_function, fixed_point_position, alpha_beta)
+{
+ // Create activation layer info
+ ActivationLayerInfo act_info(act_function, alpha_beta, alpha_beta);
+
+ // Compute function
+ Tensor dst = compute_activation_layer(in_place, shape, DataType::QS16, act_info, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_activation_layer(shape, DataType::QS16, act_info, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, activation_layer_tolerance(act_function, fixed_point_position));
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE_END()
+
BOOST_AUTO_TEST_SUITE_END()
BOOST_AUTO_TEST_SUITE_END()
#endif /* DOXYGEN_SKIP_THIS */
diff --git a/tests/validation/TensorOperations.h b/tests/validation/TensorOperations.h
index 0d752ee6fc..adac70901d 100644
--- a/tests/validation/TensorOperations.h
+++ b/tests/validation/TensorOperations.h
@@ -930,7 +930,7 @@ void activation_layer(const Tensor<T> &in, Tensor<T> &out, ActivationLayerInfo a
out[i] = mul(x, x).raw();
break;
case ActivationLayerInfo::ActivationFunction::TANH:
- out[i] = tanh(x).raw();
+ out[i] = mul(a, tanh(mul(b, x))).raw();
break;
default:
ARM_COMPUTE_ERROR("Activation function not recognised");