From 79fa9a22022824735986f74557bf38095eb2284d Mon Sep 17 00:00:00 2001
From: Manuel Bottini
Date: Fri, 22 Feb 2019 17:54:22 +0000
Subject: COMPMID-2009: Add support for QASYMM8 in NEPixelWiseMultiplicationKernel

Change-Id: I58536e945d069c96a065b82cc14960f54afc6e1a
Signed-off-by: Manuel Bottini
Reviewed-on: https://review.mlplatform.org/c/781
Tested-by: Arm Jenkins
Reviewed-by: Pablo Marquez
---
 .../NEON/kernels/NEPixelWiseMultiplicationKernel.h | 38 ++++++---
 .../NEON/functions/NEPixelWiseMultiplication.h     | 31 +++++---
 .../kernels/NEPixelWiseMultiplicationKernel.cpp    | 90 ++++++++++++++++++----
 tests/validation/CL/PixelWiseMultiplication.cpp    |  4 +-
 tests/validation/NEON/PixelWiseMultiplication.cpp  | 48 +++++++++++-
 .../fixtures/PixelWiseMultiplicationFixture.h      | 15 +++-
 .../reference/PixelWiseMultiplication.cpp          |  4 +-
 7 files changed, 182 insertions(+), 48 deletions(-)

diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
index 41ea91495f..2a8e36b1de 100644
--- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,12 +56,12 @@ public:
      * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
      *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
      *
-     * @param[in]  input1          An input tensor. Data types supported: U8/S16/F16/F32
-     * @param[in]  input2          An input tensor. Data types supported: U8, S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
-     * @param[out] output          The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
+     * @param[in]  input1          An input tensor. Data types supported: U8/QASYMM8/S16/F16/F32
+     * @param[in]  input2          An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+     * @param[out] output          Output tensor. Data types supported: U8 (only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
      * @param[in]  scale           Scale to apply after multiplication.
      *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     * @param[in]  overflow_policy Overflow policy.
+     * @param[in]  overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8.
      * @param[in]  rounding_policy Rounding policy.
      */
     void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
@@ -70,12 +70,12 @@ public:
      * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
      *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
      *
-     * @param[in] input1          An input tensor. Data types supported: U8/S16/F16/F32
-     * @param[in] input2          An input tensor. Data types supported: U8, S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
-     * @param[in] output          The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
+     * @param[in] input1          An input tensor info. Data types supported: U8/QASYMM8/S16/F16/F32
+     * @param[in] input2          An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+     * @param[in] output          Output tensor info. Data types supported: U8 (only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
      * @param[in] scale           Scale to apply after multiplication.
      *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     * @param[in] overflow_policy Overflow policy.
+     * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8.
      * @param[in] rounding_policy Rounding policy.
      *
      * @return a status
@@ -92,6 +92,7 @@ private:
      * @param[in]  input1_ptr Pointer to the first input tensor.
      * @param[in]  input2_ptr Pointer to the second input tensor.
      * @param[out] output_ptr Pointer to the output tensor.
+     * @param[in]  scale      Integer scale factor.
      */
     using MulFunctionInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale);
     /** Common signature for all the specialised multiplication functions with float scaling factor
@@ -99,11 +100,26 @@ private:
      * @param[in]  input1_ptr Pointer to the first input tensor.
      * @param[in]  input2_ptr Pointer to the second input tensor.
      * @param[out] output_ptr Pointer to the output tensor.
+     * @param[in]  scale      Float scale factor.
      */
     using MulFunctionFloat = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale);
+    /** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor
+     *
+     * @param[in]  input1_ptr      Pointer to the first input tensor.
+     * @param[in]  input2_ptr      Pointer to the second input tensor.
+     * @param[out] output_ptr      Pointer to the output tensor.
+     * @param[in]  scale           Float scale factor.
+     * @param[in]  input1_qua_info Quantization info of tensor input1.
+     * @param[in]  input2_qua_info Quantization info of tensor input2.
+     * @param[in]  output_qua_info Quantization info of tensor output.
+     */
+    using MulFunctionQASYMM8 = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale,
+                                    const QuantizationInfo &input1_qua_info, const QuantizationInfo &input2_qua_info, const QuantizationInfo &output_qua_info);
 
-    MulFunctionFloat *_func_float;
-    MulFunctionInt   *_func_int;
+    MulFunctionFloat   *_func_float;
+    MulFunctionInt     *_func_int;
+    MulFunctionQASYMM8 *_func_qasymm8;
 
 private:
     const ITensor *_input1;

diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
index 371bb2e13e..869dd4e1d5 100644
--- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,24 +37,31 @@ class NEPixelWiseMultiplication : public INESimpleFunction
 public:
     /** Initialise the kernel's inputs, output and conversion policy.
      *
-     * @param[in, out] input1          An input tensor. Data types supported: U8/S16/F16/F32.
-     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[in, out] input2          An input tensor. Data types supported: same as @p input1.
-     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[out]     output          Output tensor. Data types supported: U8/S16/F16/F32.
+     * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+     *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+     *
+     * @param[in, out] input1          An input tensor. Data types supported: U8/QASYMM8/S16/F16/F32
+     *                                 This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2          An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+     *                                 This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output          Output tensor. Data types supported: U8 (only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
      * @param[in]      scale           Scale to apply after multiplication.
      *                                 Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     * @param[in]      overflow_policy Overflow policy.
+     * @param[in]      overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8.
      * @param[in]      rounding_policy Rounding policy.
      */
     void configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
     /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication
      *
-     * @param[in] input1          First tensor info input. Data types supported: U8/S16/F16/F32.
-     * @param[in] input2          Second tensor info input. Data types supported: U8/S16/F16/F32.
-     * @param[in] output          Output tensor info. Data types supported: U8/S16/F16/F32.
-     * @param[in] scale           Scale to apply after multiplication. Must be positive.
-     * @param[in] overflow_policy Overflow policy.
+     * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+     *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+     *
+     * @param[in] input1          An input tensor info. Data types supported: U8/QASYMM8/S16/F16/F32
+     * @param[in] input2          An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+     * @param[in] output          Output tensor info. Data types supported: U8 (only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
+     * @param[in] scale           Scale to apply after multiplication.
+     *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+     * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8.
      * @param[in] rounding_policy Rounding policy.
     *
     * @return a status

diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index a4f51436b4..e3166e02b6 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,8 +28,10 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
 #include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 
 #include <arm_neon.h>
@@ -42,12 +44,9 @@
 #include <arm_fp16.h> // needed for float16_t
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
-using namespace arm_compute;
-
 namespace arm_compute
 {
 class Coordinates;
-} // namespace arm_compute
 
 namespace
 {
@@ -63,15 +62,29 @@ inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *i
     ARM_COMPUTE_UNUSED(rounding_policy);
 
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
                                     "Output can only be U8 if both inputs are U8");
 
-    const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() == DataType::QASYMM8 && input2->data_type() != DataType::QASYMM8,
+                                    "Input2 must be QASYMM8 if input1 is QASYMM8");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() == DataType::QASYMM8 && input2->data_type() == DataType::QASYMM8 && overflow_policy == ConvertPolicy::WRAP,
+                                    "ConvertPolicy cannot be WRAP if datatype is QASYMM8");
+
+    if(output->total_size() > 0)
+    {
+        if(output->data_type() == DataType::QASYMM8)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+        }
+
+        const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+    }
 
     if(std::abs(scale - scale255_constant) < 0.00001f)
     {
@@ -159,6 +172,34 @@ inline uint16x8_t scale255_U16_U16(uint16x8_t in)
     return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
 }
 
+void mul_saturate_QASYMM8_QASYMM8_QASYMM8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale,
+                                            const QuantizationInfo &input1_qua_info, const QuantizationInfo &input2_qua_info, const QuantizationInfo &output_qua_info)
+{
+    const auto input1 = static_cast<const qasymm8_t *__restrict>(input1_ptr);
+    const auto input2 = static_cast<const qasymm8_t *__restrict>(input2_ptr);
+    const auto output = static_cast<qasymm8_t *__restrict>(output_ptr);
+
+    const qasymm8x16_t input1_q = vld1q_u8(input1);
+    const qasymm8x16_t input2_q = vld1q_u8(input2);
+
+    // Dequantize inputs
+    const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
+    const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
+
+    const QuantizationInfo tmp_qua_info = QuantizationInfo(output_qua_info.scale / scale, output_qua_info.offset); // fold @p scale into the output quantization step
+
+    const float32x4x4_t out_f32x4x4 =
+    {
+        vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
+        vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
+        vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
+        vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3])
+    };
+
+    const uint8x16_t result = vquantize(out_f32x4x4, tmp_qua_info); // requantize: round to nearest, saturate to [0, 255]
+    vst1q_u8(output, result);
+}
+
 template <bool is_scale255, bool is_sat>
 void mul_U8_U8_U8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
 {
@@ -291,7 +332,6 @@ void mul_S16_S16_S16_n(const void *__restrict input1_ptr, const void *__restrict
     vst2q_s16(output, result);
 }
 
-template <bool is_scale255, bool is_sat>
 void mul_F32_F32_F32_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale)
 {
     const auto input1 = static_cast<const float *__restrict>(input1_ptr);
@@ -313,7 +353,6 @@ void mul_F32_F32_F32_n(const void *__restrict input1_ptr, const void *__restrict
     vst4q_f32(output, result);
 }
 
-template <bool is_scale255, bool is_sat>
 void mul_F16_F16_F16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale)
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -419,7 +458,7 @@ void mul_U8_S16_S16_n(const void *__restrict input1_ptr, const void *__restrict
 } // namespace
 
 NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
-    : _func_float(nullptr), _func_int(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
+    : _func_float(nullptr), _func_int(nullptr), _func_qasymm8(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
 {
 }
 
@@ -439,6 +478,7 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe
     _output         = output;
     _scale          = scale;
     _scale_exponent = 0;
+    _func_qasymm8   = nullptr;
     _func_int       = nullptr;
     _func_float     = nullptr;
 
@@ -464,7 +504,11 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe
     const DataType dt_output = output->info()->data_type();
     const bool     is_sat    = (overflow_policy == ConvertPolicy::SATURATE);
 
-    if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::U8 == dt_output)
+    if(dt_input1 == DataType::QASYMM8 && dt_input2 == DataType::QASYMM8)
+    {
+        _func_qasymm8 = &mul_saturate_QASYMM8_QASYMM8_QASYMM8_n;
+    }
+    else if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::U8 == dt_output)
     {
         if(is_scale255)
         {
@@ -521,12 +565,12 @@
     }
    else if(DataType::F16 == 
dt_input2 && DataType::F16 == dt_output)
    {
-        _func_float = &mul_F16_F16_F16_n<is_scale255, is_sat>;
+        _func_float = &mul_F16_F16_F16_n;
         _func_int   = nullptr;
     }
     else if(DataType::F32 == dt_input1 && DataType::F32 == dt_input2 && DataType::F32 == dt_output)
     {
-        _func_float = &mul_F32_F32_F32_n<is_scale255, is_sat>;
+        _func_float = &mul_F32_F32_F32_n;
         _func_int   = nullptr;
     }
     else
@@ -581,7 +625,18 @@ void NEPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo
     Iterator input2(_input2, slice_input2);
     Iterator output(_output, slice);
 
-    if(_func_int != nullptr)
+    if(_func_qasymm8 != nullptr)
+    {
+        execute_window_loop(collapsed, [&](const Coordinates & id)
+        {
+            (*_func_qasymm8)(input1.ptr(), input2.ptr(), output.ptr(), _scale,
+                             _input1->info()->quantization_info(), _input2->info()->quantization_info(), _output->info()->quantization_info());
+            collapsed.slide_window_slice_3D(slice_input1);
+            collapsed.slide_window_slice_3D(slice_input2);
+        },
+        input1, input2, output);
+    }
+    else if(_func_int != nullptr)
     {
         execute_window_loop(collapsed, [&](const Coordinates & id)
         {
@@ -610,3 +665,4 @@ BorderSize NEPixelWiseMultiplicationKernel::border_size() const
     const unsigned int border = std::min(num_elems_processed_per_iteration - 1U, replicateSize);
     return BorderSize(0, border, 0, 0);
 }
+} // namespace arm_compute

diff --git a/tests/validation/CL/PixelWiseMultiplication.cpp b/tests/validation/CL/PixelWiseMultiplication.cpp
index b61ec394ea..03ce4c9639 100644
--- a/tests/validation/CL/PixelWiseMultiplication.cpp
+++ b/tests/validation/CL/PixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -129,7 +129,7 @@ TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLPixelWiseMultiplicationQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        framework::dataset::make("DataType", DataType::QASYMM8)),
                        framework::dataset::make("Scale", { 1.f, 2.f })),
-                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                        framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_EVEN)),
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),

diff --git a/tests/validation/NEON/PixelWiseMultiplication.cpp b/tests/validation/NEON/PixelWiseMultiplication.cpp
index 77da473e0d..0cc97a2c26 100644
--- a/tests/validation/NEON/PixelWiseMultiplication.cpp
+++ b/tests/validation/NEON/PixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,6 +43,7 @@ const float scale_255 = 1.f / 255.f;
 const float scale_other = 1.f / 32768.f;
 
 #define DEFAULT_VALIDATE validate(Accessor(_target), _reference);
+#define QASYMM8_VALIDATE validate(Accessor(_target), _reference, AbsoluteTolerance<uint8_t>(1), 0.f);
 #define VALIDATE(TYPE, TOLERANCE) validate(Accessor(_target), _reference, AbsoluteTolerance<TYPE>(TOLERANCE), 0.f);
 #define WRAP_VALIDATE(TYPE, TOLERANCE) validate_wrap(Accessor(_target), _reference, AbsoluteTolerance<TYPE>(TOLERANCE), 0.f);
 
@@ -74,6 +75,24 @@ const float scale_other = 1.f / 32768.f;
     { \
         VALIDATE \
     }
+
+#define PIXEL_WISE_MULTIPLICATION_FIXTURE_QASYMM8_DATA_TEST_CASE(TEST_NAME, FIXTURE, MODE, SHAPES, DT1, DT2, SCALE, RP, VALIDATE) \
+    FIXTURE_DATA_TEST_CASE(TEST_NAME, NEPixelWiseMultiplication##FIXTURE, framework::DatasetMode::MODE, \
+                           combine(combine(combine(combine(combine(combine(combine(combine( \
+                           datasets::SHAPES, \
+                           framework::dataset::make("DataType1", DataType::DT1)), \
+                           framework::dataset::make("DataType2", DataType::DT2)), \
+                           framework::dataset::make("Scale", std::move(SCALE))), \
+                           framework::dataset::make("ConvertPolicy", ConvertPolicy::SATURATE)), \
+                           framework::dataset::make("RoundingPolicy", RoundingPolicy::RP)), \
+                           framework::dataset::make("QuantizationInfoIn1", QuantizationInfo(1.0, 0))), \
+                           framework::dataset::make("QuantizationInfoIn2", QuantizationInfo(1.0, 0))), \
+                           framework::dataset::make("QuantizationInfoOut", QuantizationInfo(100.0, 10)))) \
+    { \
+        VALIDATE \
+    }
+
+
 // *INDENT-ON*
 // clang-format on
 
@@ -105,6 +124,7 @@ void validate_configuration(TensorShape shape, DataType dt1, DataType dt2, float
 }
 } // namespace
 
+using NEPixelWiseMultiplicationToQASYMM8Fixture = PixelWiseMultiplicationQuatizedValidationFixture;
 template <typename T>
 using NEPixelWiseMultiplicationToU8Fixture = PixelWiseMultiplicationValidationFixture;
 template <typename T>
@@ -128,7 +148,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),      // Invalid scale
                TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),      // Invalid data type combination
                TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),     // Mismatching shapes
-               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),     // Mismatching data type
+               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),     // Mismatching data type
+               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), // Mismatching data type
        }),
        framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
               TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
       })),
       framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
              TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
      })),
      framework::dataset::make("Scale",{ scale_unity, scale_unity, scale_unity, -1.f, scale_unity, scale_unity, scale_unity})),
-     framework::dataset::make("Expected", { true, true, false, false, false, false, false })),
framework::dataset::make("Expected", { true, true, false, false, false, false, false })), + framework::dataset::make("Expected", { true, true, false, false, false, false, false, false })), input1_info, input2_info, output_info, scale, expected) { bool has_error = bool(NEPixelWiseMultiplication::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), scale, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO)); @@ -156,6 +179,25 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( // clang-format on // *INDENT-ON* +TEST_SUITE(QASYMM8toQASYMM8) + +TEST_SUITE(Scale255) +PIXEL_WISE_MULTIPLICATION_FIXTURE_QASYMM8_DATA_TEST_CASE(RunSmall, ToQASYMM8Fixture, PRECOMMIT, SmallShapes(), QASYMM8, QASYMM8, scale_255, TO_NEAREST_UP, WRAP_VALIDATE(uint8_t, 1)) +PIXEL_WISE_MULTIPLICATION_FIXTURE_QASYMM8_DATA_TEST_CASE(RunLarge, ToQASYMM8Fixture, NIGHTLY, LargeShapes(), QASYMM8, QASYMM8, scale_255, TO_NEAREST_UP, WRAP_VALIDATE(uint8_t, 1)) +TEST_SUITE_END() // Scale255 + +TEST_SUITE(ScaleUnity) +PIXEL_WISE_MULTIPLICATION_FIXTURE_QASYMM8_DATA_TEST_CASE(RunSmall, ToQASYMM8Fixture, PRECOMMIT, SmallShapes(), QASYMM8, QASYMM8, scale_unity, TO_ZERO, QASYMM8_VALIDATE) +PIXEL_WISE_MULTIPLICATION_FIXTURE_QASYMM8_DATA_TEST_CASE(RunLarge, ToQASYMM8Fixture, NIGHTLY, LargeShapes(), QASYMM8, QASYMM8, scale_unity, TO_ZERO, QASYMM8_VALIDATE) +TEST_SUITE_END() // ScaleUnity + +TEST_SUITE(ScaleOther) +PIXEL_WISE_MULTIPLICATION_FIXTURE_QASYMM8_DATA_TEST_CASE(RunSmall, ToQASYMM8Fixture, PRECOMMIT, SmallShapes(), QASYMM8, QASYMM8, scale_other, TO_ZERO, QASYMM8_VALIDATE) +PIXEL_WISE_MULTIPLICATION_FIXTURE_QASYMM8_DATA_TEST_CASE(RunLarge, ToQASYMM8Fixture, NIGHTLY, LargeShapes(), QASYMM8, QASYMM8, scale_other, TO_ZERO, QASYMM8_VALIDATE) +TEST_SUITE_END() // ScaleOther + +TEST_SUITE_END() // QASYMM8toQASYMM8 + TEST_SUITE(U8toU8) TEST_SUITE(Scale255) diff --git a/tests/validation/fixtures/PixelWiseMultiplicationFixture.h b/tests/validation/fixtures/PixelWiseMultiplicationFixture.h index 9927b75032..efdf5d078e 100644 --- a/tests/validation/fixtures/PixelWiseMultiplicationFixture.h +++ b/tests/validation/fixtures/PixelWiseMultiplicationFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -121,6 +121,19 @@ protected: SimpleTensor _reference{}; }; +template +class PixelWiseMultiplicationQuatizedValidationFixture : public PixelWiseMultiplicationGenericValidationFixture +{ +public: + template + void setup(const TensorShape &shape, DataType dt_in1, DataType dt_in2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, + QuantizationInfo in1_qua_info, QuantizationInfo in2_qua_info, QuantizationInfo out_qua_info) + { + PixelWiseMultiplicationGenericValidationFixture::setup(shape, shape, dt_in1, dt_in2, scale, convert_policy, rounding_policy, + in1_qua_info, in2_qua_info, out_qua_info); + } +}; + template class PixelWiseMultiplicationValidationFixture : public PixelWiseMultiplicationGenericValidationFixture { diff --git a/tests/validation/reference/PixelWiseMultiplication.cpp b/tests/validation/reference/PixelWiseMultiplication.cpp index d86f8aae0d..3470de241a 100644 --- a/tests/validation/reference/PixelWiseMultiplication.cpp +++ b/tests/validation/reference/PixelWiseMultiplication.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -18,7 +18,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * dst OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 #include "PixelWiseMultiplication.h"
-- 
cgit v1.2.1
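
A note on the quantized path (an illustrative sketch, not part of the commit): mul_saturate_QASYMM8_QASYMM8_QASYMM8_n dequantizes 16 lanes of each input to float, multiplies lane-wise, and requantizes with the user-supplied scale folded into the output quantization step (the tmp_qua_info above). The scalar sketch below mirrors that arithmetic one element at a time. QuantInfo and mul_qasymm8_scalar are hypothetical names, not library API, and the usual asymmetric convention real = qscale * (q - qoffset) is assumed.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Hypothetical stand-in for arm_compute::QuantizationInfo.
    struct QuantInfo
    {
        float qscale;  // quantization step
        int   qoffset; // zero point
    };

    // Scalar model of the kernel: dequantize, multiply, requantize.
    uint8_t mul_qasymm8_scalar(uint8_t a, uint8_t b, float scale,
                               const QuantInfo &in1, const QuantInfo &in2, const QuantInfo &out)
    {
        // Dequantize both operands: real = qscale * (q - qoffset).
        const float fa = in1.qscale * (static_cast<int>(a) - in1.qoffset);
        const float fb = in2.qscale * (static_cast<int>(b) - in2.qoffset);

        // Fold the user scale into the output step, as tmp_qua_info does in the
        // kernel: round(x / (s / scale)) == round(x * scale / s), so quantizing
        // with step s / scale is the same as scaling the product first.
        const float step = out.qscale / scale;

        // Requantize with round-to-nearest and saturation to [0, 255]; the
        // QASYMM8 path only accepts ConvertPolicy::SATURATE.
        const long q = std::lround(fa * fb / step) + out.qoffset;
        return static_cast<uint8_t>(std::min(std::max(q, 0L), 255L));
    }

For example, with both inputs quantized as { 0.5f, 0 }, an output of { 1.0f, 0 } and scale = 1.f, inputs a = b = 6 dequantize to 3.0 each, the product is 9.0, and the function returns 9. This also shows why the patch rejects ConvertPolicy::WRAP for QASYMM8: the requantization step always saturates.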