From 7a0212aae4fe6617eb31c734c24246c345556391 Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio <michele.digiorgio@arm.com>
Date: Tue, 14 Apr 2020 16:08:32 +0100
Subject: COMPMID-3236: Add support QSYMM16 into S32
 CLPixelwiseMultiplicationKernel

Change-Id: Ifc519f53f04fcb14ddb9c17f98cc687f34285c97
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3018
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 .../CL/kernels/CLPixelWiseMultiplicationKernel.h   | 20 +++++++++++++--
 .../CL/functions/CLPixelWiseMultiplication.h       | 20 +++++++++++++--
 src/core/CL/cl_kernels/pixelwise_mul_float.cl      | 20 +++++++--------
 src/core/CL/cl_kernels/pixelwise_mul_int.cl        | 16 ++++++------
 .../CL/kernels/CLPixelWiseMultiplicationKernel.cpp | 30 ++++++++++------------
 tests/validation/CL/PixelWiseMultiplication.cpp    | 20 ++++++++++++++-
 6 files changed, 86 insertions(+), 40 deletions(-)

diff --git a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
index eacdb44c09..2a54a4bc48 100644
--- a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
+++ b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
@@ -50,7 +50,15 @@ public:
      *
      * @param[in]  input1          An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
      * @param[in]  input2          An input tensor. Data types supported: same as @p input1.
-     * @param[out] output          The output tensor, Data types supported: same as @p input1. Note: U8 requires both inputs to be U8.
+     * @param[out] output          The output tensor, Data types supported:
+     *                             - U8, only if both input are U8
+     *                             - QASYMM8, only if both inputs are QASYMM8
+     *                             - QASYMM8_SIGNED, only if both inputs are QASYMM8_SIGNED
+     *                             - S16
+     *                             - QSYMM16, only if both inputs are QSYMM16
+     *                             - S32, only if both inputs are QSYMM16
+     *                             - F16
+     *                             - F32
      * @param[in]  scale           Scale to apply after multiplication.
      *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
      * @param[in]  overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
@@ -63,7 +71,15 @@ public:
      *
      * @param[in] input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
      * @param[in] input2          An input tensor info. Data types supported: same as @p input1.
-     * @param[in] output          The output tensor info, Data types supported: same as @p input1. Note: U8 requires both inputs to be U8.
+     * @param[in] output          The output tensor info, Data types supported:
+     *                            - U8, only if both input are U8
+     *                            - QASYMM8, only if both inputs are QASYMM8
+     *                            - QASYMM8_SIGNED, only if both inputs are QASYMM8_SIGNED
+     *                            - S16
+     *                            - QSYMM16, only if both inputs are QSYMM16
+     *                            - S32, only if both inputs are QSYMM16
+     *                            - F16
+     *                            - F32
      * @param[in] scale           Scale to apply after multiplication.
      *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
      * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
index a5ab829c83..47bb2bf4db 100644
--- a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
+++ b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
@@ -42,7 +42,15 @@ public:
      *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
      * @param[in, out] input2          An input tensor. Data types supported: same as @p input1.
      *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[out]     output          The output tensor, Data types supported: same as @p input1. Note: U8 requires both inputs to be U8.
+     * @param[out]     output          The output tensor. Data types supported:
+     *                                 - U8, only if both input are U8
+     *                                 - QASYMM8, only if both inputs are QASYMM8
+     *                                 - QASYMM8_SIGNED, only if both inputs are QASYMM8_SIGNED
+     *                                 - S16
+     *                                 - QSYMM16, only if both inputs are QSYMM16
+     *                                 - S32, only if both inputs are QSYMM16
+     *                                 - F16
+     *                                 - F32
      * @param[in]      scale           Scale to apply after multiplication.
      *                                 Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
      * @param[in]      overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
@@ -55,7 +63,15 @@ public:
      *
      * @param[in] input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
      * @param[in] input2          An input tensor info. Data types supported: same as @p input1.
-     * @param[in] output          The output tensor info, Data types supported: same as @p input1. Note: U8 requires both inputs to be U8.
+     * @param[in] output          The output tensor info. Data types supported:
+     *                            - U8, only if both input are U8
+     *                            - QASYMM8, only if both inputs are QASYMM8
+     *                            - QASYMM8_SIGNED, only if both inputs are QASYMM8_SIGNED
+     *                            - S16
+     *                            - QSYMM16, only if both inputs are QSYMM16
+     *                            - S32, only if both inputs are QSYMM16
+     *                            - F16
+     *                            - F32
      * @param[in] scale           Scale to apply after multiplication.
      *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
      * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
index aad4becc1a..163cb23582 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl
+++ b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
@@ -30,7 +30,7 @@
 #endif /* SATURATE */
 #define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
 
-#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT)
+#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT)
 
 #if defined(ACTIVATION_TYPE)
 #include "activation_float_helpers.h"
@@ -40,8 +40,8 @@
  *
  * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
  * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
- * @attention The data type of the intermediate result of the multiplication should passed as well using -DDATA_TYPE_RES.
- * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
+ * @attention The data type of the intermediate result of the multiplication should passed as well using -DACC_DATA_TYPE.
+ * e.g. If one of inputs is S16 -DACC_DATA_TYPE=int should be passed else -DACC_DATA_TYPE=short.
  * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided.
  *
  * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: U8, S16, F16, F32
@@ -82,18 +82,18 @@ __kernel void pixelwise_mul_float(
     Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
 
     // Load data
-    VEC_DATA_TYPE(DATA_TYPE_RES, 16)
-    in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
-    VEC_DATA_TYPE(DATA_TYPE_RES, 16)
-    in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 16)
+    in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 16)
+    in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
 
     // Perform multiplication
 #ifdef DATA_TYPE_FLOAT
     VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
-    res = CONVERT(in1_data * in2_data * (DATA_TYPE_RES)scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+    res = CONVERT(in1_data * in2_data * (ACC_DATA_TYPE)scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
 #else  /* DATA_TYPE_FLOAT */
     VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
-    res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data * in2_data) * scale), VEC_DATA_TYPE(DATA_TYPE_RES, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND);
+    res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data * in2_data) * scale), VEC_DATA_TYPE(ACC_DATA_TYPE, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND);
 #endif /* DATA_TYPE_FLOAT */
 
 #if defined(ACTIVATION_TYPE)
@@ -103,7 +103,7 @@ __kernel void pixelwise_mul_float(
     vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr);
 #endif // defined(ACTIVATION_TYPE)
 }
-#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT) */
+#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) */
 
 /** Performs a pixelwise multiplication of complex float values
  *
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
index d277c6c56f..097df82eaa 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl
+++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
@@ -35,13 +35,13 @@
 #define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
 #define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
 
-#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT)
+#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT)
 /** Performs a pixelwise multiplication with integer scale of integer inputs.
  *
  * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
  * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
- * @attention The data_type of the intermediate result of the multiplication should passed as well using -DDATA_TYPE_RES.
- * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
+ * @attention The data_type of the intermediate result of the multiplication should passed as well using -DACC_DATA_TYPE.
+ * e.g. If one of inputs is S16 -DACC_DATA_TYPE=int should be passed else -DACC_DATA_TYPE=short.
  *
  * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: U8/S16
  * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
@@ -81,15 +81,15 @@ __kernel void pixelwise_mul_int(
     Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
 
     // Load data
-    VEC_DATA_TYPE(DATA_TYPE_RES, 16)
-    in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
-    VEC_DATA_TYPE(DATA_TYPE_RES, 16)
-    in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 16)
+    in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 16)
+    in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
 
     // Perform multiplication and store result
     vstore16(MUL_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr);
 }
-#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT) */
+#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) */
 
 #if defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE)
 
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
index ff5afa3d95..2df3ff4f34 100644
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -28,17 +28,9 @@
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
 #include "support/StringSupport.h"
 
-#include <cmath>
-#include <cstdlib>
-#include <set>
-#include <string>
-
 namespace arm_compute
 {
 namespace
@@ -77,7 +69,7 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
                                                              1,
                                                              DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                              DataType::S16, DataType::QSYMM16, DataType::F16,
-                                                             DataType::F32);
+                                                             DataType::S32, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
                                         "Output can only be U8 if both inputs are U8");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8 && (input1->data_type() != DataType::QASYMM8 || input2->data_type() != DataType::QASYMM8),
@@ -86,6 +78,8 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
                                         "Output can only be QASYMM8_SIGNED if both inputs are QASYMM8_SIGNED");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QSYMM16 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
                                         "Output can only be QSYMM16 if both inputs are QSYMM16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
+                                        "Output can only be S32 if both inputs are QSYMM16");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
     }
 
@@ -177,22 +171,24 @@ void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const I
         scale_int = std::abs(exponent - 1);
     }
 
-    std::string compute_type;
+    std::string acc_type;
     // Check if it has float inputs and output
     if(is_data_type_float(input1->info()->data_type()) || is_data_type_float(input2->info()->data_type()))
     {
-        scale_int    = -1;
-        compute_type = (input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32) ? "float" : "half";
+        scale_int = -1;
+        acc_type  = (input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32) ? "float" : "half";
     }
     else
     {
-        if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+        if(input1->info()->element_size() == 2 || input2->info()->element_size() == 2)
         {
-            compute_type = "int";
+            // Use 32-bit accumulator for 16-bit input
+            acc_type = "int";
         }
         else
         {
-            compute_type = "ushort";
+            // Use 16-bit accumulator for 8-bit input
+            acc_type = "ushort";
         }
     }
 
@@ -205,7 +201,7 @@ void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const I
     build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
     build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    if(is_quantized)
+    if(is_quantized && (output->info()->data_type() != DataType::S32))
     {
         const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform();
         const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform();
@@ -227,7 +223,7 @@ void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const I
         kernel_name += (scale_int >= 0) ? "_int" : "_float";
         build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type()), "-DWRAP", "-DSATURATE");
         build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte");
-        build_opts.add_option("-DDATA_TYPE_RES=" + compute_type);
+        build_opts.add_option("-DACC_DATA_TYPE=" + acc_type);
         if(act_info.enabled())
         {
             build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
diff --git a/tests/validation/CL/PixelWiseMultiplication.cpp b/tests/validation/CL/PixelWiseMultiplication.cpp
index 310828c48d..ea686af812 100644
--- a/tests/validation/CL/PixelWiseMultiplication.cpp
+++ b/tests/validation/CL/PixelWiseMultiplication.cpp
@@ -137,7 +137,8 @@ PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunWithActivationSmallBroadcast
                                                  VALIDATE(float, 1.f))
 
 template <typename T>
-using CLPixelWiseMultiplicationQuantizedFixture = PixelWiseMultiplicationValidationQuantizedFixture<CLTensor, CLAccessor, CLPixelWiseMultiplication, T, T>;
+using CLPixelWiseMultiplicationQuantizedFixture   = PixelWiseMultiplicationValidationQuantizedFixture<CLTensor, CLAccessor, CLPixelWiseMultiplication, T, T>;
+using CLPixelWiseMultiplicationQSYMM16ToS32Fxture = PixelWiseMultiplicationValidationQuantizedFixture<CLTensor, CLAccessor, CLPixelWiseMultiplication, int16_t, int16_t, int32_t>;
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
@@ -208,6 +209,23 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLPixelWiseMultiplicationQuantizedFixture<int16
     validate(CLAccessor(_target), _reference, tolerance_qsymm16);
 }
 TEST_SUITE_END() // QSYMM16
+TEST_SUITE(QSYMM16ToS32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPixelWiseMultiplicationQSYMM16ToS32Fxture, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                               framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
+                                                                                       framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
+                                                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                                                       framework::dataset::make("Scale", { 1.f })),
+                                                               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                       framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_EVEN)),
+                                               framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0) })),
+                                       framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0) })),
+                               framework::dataset::make("OutQInfo", { QuantizationInfo(1.f, 0) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qsymm16);
+}
+TEST_SUITE_END() // QSYMM16ToS32
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // PixelWiseMultiplication
-- 
cgit v1.2.1