From 5dfeae62f89eefdc241887c3e67cd1c04ec0b6a7 Mon Sep 17 00:00:00 2001 From: Michel Iwaniec Date: Wed, 29 Nov 2017 10:48:23 +0000 Subject: IVGCVSW-820: Add QASYMM8 support to NeonActivationLayerKernel Change-Id: Ic3881e97b4fcbae0ac287a1e010cfc6f0fd8d7d1 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/112139 Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com Reviewed-by: Georgios Pinitas --- arm_compute/core/NEON/NEAsymm.h | 20 ++++++- arm_compute/core/NEON/NEAsymm.inl | 36 +++++++++++- .../core/NEON/kernels/NEActivationLayerKernel.h | 7 +++ arm_compute/core/QAsymm8.h | 33 +++++++++++ arm_compute/core/QAsymm8.inl | 41 ++++++++++++++ arm_compute/core/Rounding.h | 46 ++++++++++++++++ arm_compute/core/Types.h | 25 +++------ arm_compute/core/Utils.h | 10 +--- src/core/NEON/kernels/NEActivationLayerKernel.cpp | 56 ++++++++++++++++++- src/core/Rounding.cpp | 64 ++++++++++++++++++++++ src/core/Utils.cpp | 31 ----------- tests/validation/NEON/ActivationLayer.cpp | 30 +++++++++- 12 files changed, 337 insertions(+), 62 deletions(-) create mode 100644 arm_compute/core/QAsymm8.h create mode 100644 arm_compute/core/QAsymm8.inl create mode 100644 arm_compute/core/Rounding.h create mode 100644 src/core/Rounding.cpp diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h index d227d3ccbe..f0d7439d40 100644 --- a/arm_compute/core/NEON/NEAsymm.h +++ b/arm_compute/core/NEON/NEAsymm.h @@ -28,6 +28,12 @@ namespace arm_compute { +using qasymm8x8_t = uint8x8_t; /**< 8 bit quantized asymmetric vector with 8 elements */ +using qasymm8x8x2_t = uint8x8x2_t; /**< 8 bit quantized asymmetric vector with 16 elements */ +using qasymm8x8x3_t = uint8x8x3_t; /**< 8 bit quantized asymmetric vector with 24 elements */ +using qasymm8x8x4_t = uint8x8x4_t; /**< 8 bit quantized asymmetric vector with 32 elements */ +using qasymm8x16_t = uint8x16_t; /**< 8 bit quantized asymmetric vector with 16 elements */ + /** Round to the nearest division by a power-of-two using exponent * * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent @@ -38,6 +44,18 @@ namespace arm_compute * @return the nearest division by a power-of-two using exponent */ int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent); + +/** Perform a multiply-accumulate on all 16 components of a QASYMM8 vector + * + * vd*vs + vo + * + * @param[in] vd Input vector value in QASYMM8 format + * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes. + * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes. 
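+ *
+ * @note Internally the input is widened to F32, each lane computes vd * vs + vo, and the result is
+ *       converted back to QASYMM8 with saturation (see the implementation in NEAsymm.inl below).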
+ * + * @return A 16-component vector in QASYMM8 format, saturated to fit + */ +uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo); } // namespace arm_compute #include "arm_compute/core/NEON/NEAsymm.inl" -#endif // __ARM_COMPUTE_NEASYMM_H__ \ No newline at end of file +#endif // __ARM_COMPUTE_NEASYMM_H__ diff --git a/arm_compute/core/NEON/NEAsymm.inl b/arm_compute/core/NEON/NEAsymm.inl index bbce308b35..ce999a5413 100644 --- a/arm_compute/core/NEON/NEAsymm.inl +++ b/arm_compute/core/NEON/NEAsymm.inl @@ -30,4 +30,38 @@ inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent) const int32x4_t fixed_up_x = vqaddq_s32(x, fixup); return vrshlq_s32(fixed_up_x, shift_vec); } -} // namespace arm_compute \ No newline at end of file + +inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo) +{ + // Convert uint8 vectors to uint16 vectors + const uint8x8_t vd_low = vget_low_u8(vd); + const uint8x8_t vd_high = vget_high_u8(vd); + uint16x8_t vd_low_u16x8 = vmovl_u8(vd_low); + uint16x8_t vd_high_u16x8 = vmovl_u8(vd_high); + // Convert uint16 vectors to uint32 vectors + uint32x4_t A_u32x4 = vmovl_u16(vget_low_u16(vd_low_u16x8)); + uint32x4_t B_u32x4 = vmovl_u16(vget_high_u16(vd_low_u16x8)); + uint32x4_t C_u32x4 = vmovl_u16(vget_low_u16(vd_high_u16x8)); + uint32x4_t D_u32x4 = vmovl_u16(vget_high_u16(vd_high_u16x8)); + // Convert uint32 vectors to float32 vectors + float32x4_t A_f32x4 = vcvtq_f32_u32(A_u32x4); + float32x4_t B_f32x4 = vcvtq_f32_u32(B_u32x4); + float32x4_t C_f32x4 = vcvtq_f32_u32(C_u32x4); + float32x4_t D_f32x4 = vcvtq_f32_u32(D_u32x4); + // vd = vd*vs + vo + A_f32x4 = vmlaq_f32(vo, A_f32x4, vs); + B_f32x4 = vmlaq_f32(vo, B_f32x4, vs); + C_f32x4 = vmlaq_f32(vo, C_f32x4, vs); + D_f32x4 = vmlaq_f32(vo, D_f32x4, vs); + // Convert float32 vectors to uint32 vectors + A_u32x4 = vcvtq_u32_f32(A_f32x4); + B_u32x4 = vcvtq_u32_f32(B_f32x4); + C_u32x4 = vcvtq_u32_f32(C_f32x4); + D_u32x4 = vcvtq_u32_f32(D_f32x4); + // Convert uint32 vectors to uint16 vectors (with saturation) + vd_low_u16x8 = vcombine_u16(vqmovn_u32(A_u32x4), vqmovn_u32(B_u32x4)); + vd_high_u16x8 = vcombine_u16(vqmovn_u32(C_u32x4), vqmovn_u32(D_u32x4)); + // convert uint16 vectors to uint8 vectors (with saturation) + return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8)); +} +} // namespace arm_compute diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h index e8c032aaeb..1edda843de 100644 --- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/FixedPoint.h" #include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/QAsymm8.h" #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #include @@ -105,6 +106,12 @@ private: * @param[in] window Region on which to execute the kernel */ template + typename std::enable_if::value, void>::type activation(const Window &window); + /** Function to apply an activation function on a tensor. 
+ * + * @param[in] window Region on which to execute the kernel + */ + template typename std::enable_if::value, void>::type activation(const Window &window); private: diff --git a/arm_compute/core/QAsymm8.h b/arm_compute/core/QAsymm8.h new file mode 100644 index 0000000000..2fa4029807 --- /dev/null +++ b/arm_compute/core/QAsymm8.h @@ -0,0 +1,33 @@ +/* + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_QASYMM8_H__ +#define __ARM_COMPUTE_QASYMM8_H__ + +#include "arm_compute/core/Rounding.h" +#include + +namespace arm_compute +{ +using qasymm8_t = uint8_t; /**< 8 bit quantized asymmetric scalar value */ +} +#include "arm_compute/core/QAsymm8.inl" +#endif /* __ARM_COMPUTE_QASYMM8_H__ */ diff --git a/arm_compute/core/QAsymm8.inl b/arm_compute/core/QAsymm8.inl new file mode 100644 index 0000000000..611d68eb23 --- /dev/null +++ b/arm_compute/core/QAsymm8.inl @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include + +namespace arm_compute +{ +inline qasymm8_t sqcvt_qasymm8_f32(float value, float scale, int offset, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP) +{ + int quantized = arm_compute::round(value / scale, rounding_policy) + offset; + quantized = std::max(0, std::min(quantized, 255)); + return quantized; +} + +inline float scvt_f32_qasymm8(qasymm8_t value, float scale, int offset) +{ + float dequantized = (static_cast(value) - offset) * scale; + return dequantized; +} +} diff --git a/arm_compute/core/Rounding.h b/arm_compute/core/Rounding.h new file mode 100644 index 0000000000..f95058c567 --- /dev/null +++ b/arm_compute/core/Rounding.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_ROUNDING_H__ +#define __ARM_COMPUTE_ROUNDING_H__ + +namespace arm_compute +{ +/** Rounding method */ +enum class RoundingPolicy +{ + TO_ZERO, /**< Truncates the least significand values that are lost in operations. */ + TO_NEAREST_UP, /**< Rounds to nearest value; half rounds away from zero */ + TO_NEAREST_EVEN, /**< Rounds to nearest value; half rounds to nearest even */ +}; + +/** Return a rounded value of x. Rounding is done according to the rounding_policy. + * + * @param[in] x Float value to be rounded. + * @param[in] rounding_policy Policy determining how rounding is done. + * + * @return Rounded value of the argument x. + */ +int round(float x, RoundingPolicy rounding_policy); +} +#endif /*__ARM_COMPUTE_ROUNDING_H__ */ diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index beaec143ef..538449b40a 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -25,10 +25,13 @@ #define __ARM_COMPUTE_TYPES_H__ #include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/QAsymm8.h" +#include "arm_compute/core/Rounding.h" #include "arm_compute/core/Strides.h" #include "arm_compute/core/TensorShape.h" #include "support/Half.h" +#include #include #include #include @@ -102,17 +105,6 @@ constexpr float SCALE_PYRAMID_HALF = 0.5f; /* Constant value used to indicate a ORB scaled pyramid */ constexpr float SCALE_PYRAMID_ORB = 8.408964152537146130583778358414e-01; -/** Rounding method */ -enum class RoundingPolicy -{ - TO_ZERO, /**< Truncates the least significand values that are lost in operations. 
*/ - TO_NEAREST_UP, /**< Rounds to nearest value; half rounds away from zero */ - TO_NEAREST_EVEN, /**< Rounds to nearest value; half rounds to nearest even */ -}; - -//forward declare round function -int round(float, RoundingPolicy); - /** Quantization settings (used for QASYMM8 data type) */ struct QuantizationInfo { @@ -140,20 +132,17 @@ struct QuantizationInfo int offset; /**< offset */ /** Quantizes a value using the scale/offset in this QuantizationInfo */ - uint8_t quantize(float value, RoundingPolicy rounding_policy) const + qasymm8_t quantize(float value, RoundingPolicy rounding_policy) const { ARM_COMPUTE_ERROR_ON_MSG(scale == 0, "QuantizationInfo::quantize: scale == 0"); - int quantized = arm_compute::round(value / scale, rounding_policy) + offset; - quantized = std::max(0, std::min(quantized, 255)); - return quantized; + return sqcvt_qasymm8_f32(value, scale, offset, rounding_policy); } /** Dequantizes a value using the scale/offset in this QuantizationInfo */ - float dequantize(uint8_t value) const + float dequantize(qasymm8_t value) const { ARM_COMPUTE_ERROR_ON_MSG(scale == 0, "QuantizationInfo::dequantize: scale == 0"); - float dequantized = (static_cast(value) - offset) * scale; - return dequantized; + return scvt_f32_qasymm8(value, scale, offset); } /** Indicates whether this QuantizationInfo has valid settings or not */ diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h index 9397d507f8..f78add13f9 100644 --- a/arm_compute/core/Utils.h +++ b/arm_compute/core/Utils.h @@ -25,6 +25,7 @@ #define __ARM_COMPUTE_UTILS_H__ #include "arm_compute/core/Error.h" +#include "arm_compute/core/Rounding.h" #include "arm_compute/core/Types.h" #include @@ -62,15 +63,6 @@ constexpr auto DIV_CEIL(S val, T m) -> decltype((val + m - 1) / m) return (val + m - 1) / m; } -/** Return a rounded value of x. Rounding is done according to the rounding_policy. - * - * @param[in] x Float value to be rounded. - * @param[in] rounding_policy Policy determining how rounding is done. - * - * @return Rounded value of the argument x. 
- */ -int round(float x, RoundingPolicy rounding_policy); - /** Returns the arm_compute library build information * * Contains the version number and the build options used to build the library diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp index 6ea504a173..9670b7798c 100644 --- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp @@ -26,8 +26,10 @@ #include "arm_compute/core/FixedPoint.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEAsymm.h" #include "arm_compute/core/NEON/NEFixedPoint.h" #include "arm_compute/core/NEON/NEMath.h" +#include "arm_compute/core/QAsymm8.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" @@ -44,7 +46,7 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); // Checks performed when output is configured if((output != nullptr) && (output->total_size() != 0)) @@ -107,6 +109,9 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr)); + ARM_COMPUTE_ERROR_ON_MSG((input->info()->data_type() == DataType::QASYMM8) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU), + "For QASYMM8 only lower/upper bounded relu is supported"); + // Activation functions : FP32 static std::map act_map_f32 = { @@ -170,9 +175,17 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation }, { ActivationFunction::TANH, &NEActivationLayerKernel::activation }, }; + // Activation functions : QASYMM8 + static std::map act_map_qasymm8 = + { + { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation }, + }; switch(input->info()->data_type()) { + case DataType::QASYMM8: + _func = act_map_qasymm8[activation_info.activation()]; + break; case DataType::QS8: _func = act_map_qs8[activation_info.activation()]; break; @@ -545,6 +558,47 @@ typename std::enable_if::value, void>::type NEActivation input, output); } +template +typename std::enable_if::value, void>::type NEActivationLayerKernel::activation(const Window &window) +{ + Iterator input(_input, window); + Iterator output(_output, window); + const QuantizationInfo qi_in = _input->info()->quantization_info(); + const QuantizationInfo qi_out = _output->info()->quantization_info(); + const qasymm8x16_t a = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset)); + const qasymm8x16_t b = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset)); + // Initialise scale/offset for re-quantization + float s = qi_in.scale / qi_out.scale; + float o = -qi_in.offset * s + qi_out.offset; + float32x4_t vs = vdupq_n_f32(s); + float32x4_t vo = vdupq_n_f32(o); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = 
reinterpret_cast(output.ptr()); + + const qasymm8x16_t in = vld1q_u8(input_ptr); + qasymm8x16_t tmp = {}; + + switch(F) + { + case ActivationFunction::LU_BOUNDED_RELU: + // Perform activation + tmp = vminq_u8(a, vmaxq_u8(b, in)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8(tmp, vs, vo); + break; + default: + ARM_COMPUTE_ERROR("Function not implemented"); + break; + } + + vst1q_u8(output_ptr, tmp); + }, + input, output); +} + template typename std::enable_if::value, void>::type NEActivationLayerKernel::activation(const Window &window) { diff --git a/src/core/Rounding.cpp b/src/core/Rounding.cpp new file mode 100644 index 0000000000..fea635be97 --- /dev/null +++ b/src/core/Rounding.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/Rounding.h" + +#include "arm_compute/core/Error.h" +#include "support/ToolchainSupport.h" + +#include + +using namespace arm_compute; +using namespace std; + +int arm_compute::round(float x, RoundingPolicy rounding_policy) +{ + using namespace std; + int rounded = 0; + switch(rounding_policy) + { + case RoundingPolicy::TO_ZERO: + { + rounded = static_cast(x); + break; + } + case RoundingPolicy::TO_NEAREST_UP: + { + rounded = static_cast(support::cpp11::round(x)); + break; + } + case RoundingPolicy::TO_NEAREST_EVEN: + { + ARM_COMPUTE_ERROR("TO_NEAREST_EVEN rounding policy is not supported."); + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported rounding policy."); + break; + } + } + + return rounded; +} diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index af50bbbaf7..76d0b0f059 100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -390,34 +390,3 @@ int arm_compute::max_consecutive_elements_display_width(std::ostream &s, DataTyp } return 0; } - -int arm_compute::round(float x, RoundingPolicy rounding_policy) -{ - using namespace std; - int rounded = 0; - switch(rounding_policy) - { - case RoundingPolicy::TO_ZERO: - { - rounded = static_cast(x); - break; - } - case RoundingPolicy::TO_NEAREST_UP: - { - rounded = static_cast(support::cpp11::round(x)); - break; - } - case RoundingPolicy::TO_NEAREST_EVEN: - { - ARM_COMPUTE_ERROR("TO_NEAREST_EVEN rounding policy is not supported."); - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported rounding policy."); - break; - } - } - - return rounded; -} diff --git a/tests/validation/NEON/ActivationLayer.cpp b/tests/validation/NEON/ActivationLayer.cpp index 6ed482e4e7..8a918b2597 100644 --- a/tests/validation/NEON/ActivationLayer.cpp +++ b/tests/validation/NEON/ActivationLayer.cpp @@ -209,7 +209,7 @@ TEST_SUITE_END() template using NEActivationLayerFixedPointFixture = ActivationValidationFixedPointFixture; -TEST_SUITE(Quantized) +TEST_SUITE(FixedPoint) TEST_SUITE(QS8) // We test for fixed point precision [3,5] because [1,2] and [6,7] ranges cause // overflowing issues in most of the transcendentals functions. @@ -252,6 +252,34 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEActivationLayerFixedPointFixture, fr TEST_SUITE_END() TEST_SUITE_END() +template +using NEActivationLayerQuantizedFixture = ActivationValidationQuantizedFixture; + +/** Input data sets. 
*/
+const auto QuantizedActivationDataset = combine(combine(framework::dataset::make("InPlace", { false, true }), framework::dataset::make("ActivationFunction", { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU })),
+ framework::dataset::make("AlphaBeta", { 0.5f, 1.f }));
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), QuantizedActivationDataset),
+ framework::dataset::make("DataType",
+ DataType::QASYMM8)),
+ framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.1f, 128.0f) })))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance(_data_type, _function));
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEActivationLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), QuantizedActivationDataset),
+ framework::dataset::make("DataType",
+ DataType::QASYMM8)),
+ framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.1f, 128.0f) })))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance(_data_type, _function));
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
 TEST_SUITE_END()
 TEST_SUITE_END()
 } // namespace validation
--
cgit v1.2.1
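
A minimal host-side sketch of how the helpers introduced above compose for a lower/upper bounded ReLU in the quantized domain. This is not part of the patch: the scale, offset and bound values are assumed for illustration, input and output quantization are taken to be identical (so the vmlaq_qasymm8 re-quantization collapses to the identity), and the program must be linked against the library to resolve arm_compute::round.

#include "arm_compute/core/QAsymm8.h"

#include <algorithm>
#include <cstdio>

int main()
{
    using namespace arm_compute;

    // Assumed example quantization parameters: scale and zero-point offset
    const float scale  = 0.1f;
    const int   offset = 128;

    // Quantize the activation bounds a (upper) and b (lower), as the kernel does with _act_info.a()/b()
    const qasymm8_t a = sqcvt_qasymm8_f32(1.0f, scale, offset);
    const qasymm8_t b = sqcvt_qasymm8_f32(-1.0f, scale, offset);

    // Scalar equivalent of the vectorised LU_BOUNDED_RELU: min(a, max(b, x)) on quantized values
    const qasymm8_t x   = sqcvt_qasymm8_f32(0.75f, scale, offset);
    const qasymm8_t out = std::min(a, std::max(b, x));

    std::printf("x=%u -> out=%u (dequantized %.2f)\n",
                static_cast<unsigned>(x), static_cast<unsigned>(out),
                scvt_f32_qasymm8(out, scale, offset));
    return 0;
}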