From f3dfa279d536906dac3e618244b2c1d33e5ff28a Mon Sep 17 00:00:00 2001
From: Michalis Spyrou <michalis.spyrou@arm.com>
Date: Tue, 21 Nov 2017 17:52:12 +0000
Subject: COMPMID-632 Assembly: Integrate gemmlowp assembly version

Integrate generic gemmlowp assembly version for u8.

Change-Id: I17ed4494c25a132b2bac581febe1544e49b4f352
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110114
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
---
 tests/validation/CPP/GEMMLowp.cpp | 28 +++++++++++++++++-----------
 tests/validation/CPP/GEMMLowp.h   | 11 +++++++----
 2 files changed, 24 insertions(+), 15 deletions(-)

(limited to 'tests/validation/CPP')
diff --git a/tests/validation/CPP/GEMMLowp.cpp b/tests/validation/CPP/GEMMLowp.cpp
index bf002cf2b5..35b8a6486e 100644
--- a/tests/validation/CPP/GEMMLowp.cpp
+++ b/tests/validation/CPP/GEMMLowp.cpp
@@ -63,19 +63,21 @@ void quantize_down_int32_to_uint8_scale(const SimpleTensor<T> *in, const SimpleT
 }
 } // namespace
 
-template <typename T>
-SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<T> &a, const SimpleTensor<T> &b, int32_t a_offset, int32_t b_offset)
+template <typename T_out, typename T_in>
+SimpleTensor<T_out> gemmlowp_matrix_multiply_core(const SimpleTensor<T_in> &a, const SimpleTensor<T_in> &b, int32_t a_offset, int32_t b_offset)
 {
-    TensorShape shape(b.shape()[0], a.shape()[1]);
+    static_assert(std::is_same<typename std::decay<T_out>::type, int32_t>::value, "Only int32_t is allowed for the output");
 
-    SimpleTensor<int32_t> c(shape, DataType::S32);
+    TensorShape         shape(b.shape()[0], a.shape()[1]);
+    DataType            dt = std::is_same<T_out, int32_t>::value ? DataType::S32 : DataType::U32;
+    SimpleTensor<T_out> c(shape, dt);
 
     const int K       = a.shape().x();
     const int b_width = b.shape().x();
     const int rows    = c.shape().y(); //M
     const int cols    = c.shape().x(); //N
 
-    std::vector<int32_t> acc;
+    std::vector<T_out> acc;
     acc.resize(cols);
 
     for(int i = 0; i < rows; ++i)
@@ -86,11 +88,11 @@ SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<T> &a, co
         }
         for(int k = 0; k < K; ++k)
         {
-            const int32_t tmp_a = a_offset + static_cast<int32_t>(a[k + i * K]);
+            const T_out tmp_a = a_offset + static_cast<T_out>(a[k + i * K]);
             for(int j = 0; j < b_width; ++j)
             {
-                const int32_t tmp_b       = b_offset + static_cast<int32_t>(b[j + k * b_width]);
-                const int32_t mult_as_int = tmp_a * tmp_b;
+                const T_out tmp_b       = b_offset + static_cast<T_out>(b[j + k * b_width]);
+                const T_out mult_as_int = tmp_a * tmp_b;
                 acc[j] += mult_as_int;
             }
         }
@@ -104,9 +106,10 @@ SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<T> &a, co
 }
 
 // used to validate assembly kernels which don't know anything about offsets
-SimpleTensor<int32_t> gemmlowp(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b)
+template <typename T1, typename T2>
+SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T2> &b)
 {
-    return gemmlowp_matrix_multiply_core(a, b, 0, 0);
+    return gemmlowp_matrix_multiply_core<T1, T2>(a, b, 0, 0);
 }
 
 template <typename T>
@@ -130,11 +133,14 @@ SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTe
     return dst;
 }
 
-template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, int32_t a_offset, int32_t b_offset);
 template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<int32_t> &a, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min,
                                                                            int32_t max);
 template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b, int32_t result_offset, int32_t result_mult_int,
                                                                            int32_t result_shift, int32_t min, int32_t max);
+template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, int32_t a_offset, int32_t b_offset);
+template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, int32_t a_offset, int32_t b_offset);
+template SimpleTensor<int32_t> gemmlowp(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b);
+template SimpleTensor<int32_t> gemmlowp(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/CPP/GEMMLowp.h b/tests/validation/CPP/GEMMLowp.h
index ee33d8e0c0..6c72b56e7a 100644
--- a/tests/validation/CPP/GEMMLowp.h
+++ b/tests/validation/CPP/GEMMLowp.h
@@ -35,13 +35,16 @@ namespace validation
 {
 namespace reference
 {
-SimpleTensor<int32_t> gemmlowp(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b);
-
 template <typename T>
-SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<T> &a, const SimpleTensor<T> &b, int32_t a_offset, int32_t b_offset);
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min = 0, int32_t max = 0);
+template <typename T1, typename T2>
+SimpleTensor<T1> gemmlowp_matrix_multiply_core(const SimpleTensor<T2> &a, const SimpleTensor<T2> &b, int32_t a_offset, int32_t b_offset);
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min = 0, int32_t max = 0);
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift);
+
+template <typename T1, typename T2>
+SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T2> &b);
 
 template <typename T>
 SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, int32_t result_offset, int32_t result_mult_int, int32_t result_shift,
-- 
cgit v1.2.1