From 578ab61332aede4e11b7d5d92dbd72993d807a08 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Fri, 23 Jun 2017 09:34:33 +0100 Subject: COMPMID-414 - Port CLConvolutionLayer to support 8 bit fixed point - CLGEMMMatrixAccumulateBiasesKernel Change-Id: Idba13b578dc564b8003ce2fa3392eea2af3ce806 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/78664 Tested-by: Kaizen Reviewed-by: Georgios Pinitas --- .../kernels/CLGEMMMatrixAccumulateBiasesKernel.h | 2 +- src/core/CL/CLKernelLibrary.cpp | 3 +- src/core/CL/cl_kernels/gemm.cl | 44 ++++++---------------- .../kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp | 12 +++--- 4 files changed, 20 insertions(+), 41 deletions(-) diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h index ea1db9f831..74a7a0e4a6 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h @@ -46,7 +46,7 @@ public: CLGEMMMatrixAccumulateBiasesKernel &operator=(CLGEMMMatrixAccumulateBiasesKernel &&) = default; /** Set the accumulate buffer and the biases of the kernel. * - * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32 + * @param[in, out] accum The accumulate tensor to convert. Data types supported: QS8/F16/F32 * @param[in] biases The shared biases tensor to append. It must be 1D tensor. 
Data types supported: Same as @p input */ void configure(ICLTensor *accum, const ICLTensor *biases); diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index 45a247db1a..6c64265785 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -151,8 +151,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = { "finalize", "optical_flow_pyramid_lk.cl" }, { "gaussian1x5_sub_x", "gaussian_pyramid.cl" }, { "gaussian5x1_sub_y", "gaussian_pyramid.cl" }, - { "gemm_accumulate_biases_f16", "gemm.cl" }, - { "gemm_accumulate_biases_f32", "gemm.cl" }, + { "gemm_accumulate_biases", "gemm.cl" }, { "gemm_interleave4x4_8bit", "gemm.cl" }, { "gemm_interleave4x4_16bit", "gemm.cl" }, { "gemm_interleave4x4_32bit", "gemm.cl" }, diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl index d80b5262a7..9bec8d5d92 100644 --- a/src/core/CL/cl_kernels/gemm.cl +++ b/src/core/CL/cl_kernels/gemm.cl @@ -247,6 +247,8 @@ __kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src), } /** This kernel accumulates each row with the biases vector + * + * @note The data type must be passed at compile time -DDATA_TYPE=type. e.g. -DDATA_TYPE=short * * @param[in, out] accum_ptr Pointer to the accumulate tensor. 
Supported data type: F32 * @param[in] accum_stride_x Stride of the accmulate tensor in X dimension (in bytes) @@ -259,48 +261,24 @@ __kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src), * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor */ -__kernel void gemm_accumulate_biases_f32( - IMAGE_DECLARATION(accum), - VECTOR_DECLARATION(biases)) -{ - Image accum = CONVERT_TO_IMAGE_STRUCT(accum); - Vector biases = CONVERT_TO_VECTOR_STRUCT(biases); - - float4 accum_value = vload4(0, (__global float *)accum.ptr); - float4 biases_value = vload4(0, (__global float *)biases.ptr); - accum_value = biases_value + accum_value; - - // Store result in the accummulate buffer - vstore4(accum_value, 0, (__global float *)accum.ptr); -} - -/** This kernel accumulates each row with the biases vector - * - * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F16 - * @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes) - * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] accum_stride_y Stride of the accumlulate tensor in Y dimension (in bytes) - * @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor - * @param[in] biases_ptr Pointer to the biases vector. Same as input. 
- * @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void gemm_accumulate_biases_f16( +#if(defined DATA_TYPE) +__kernel void gemm_accumulate_biases( IMAGE_DECLARATION(accum), VECTOR_DECLARATION(biases)) { Image accum = CONVERT_TO_IMAGE_STRUCT(accum); Vector biases = CONVERT_TO_VECTOR_STRUCT(biases); - half8 accum_value = vload8(0, (__global half *)accum.ptr); - half8 biases_value = vload8(0, (__global half *)biases.ptr); - accum_value = biases_value + accum_value; + VEC_DATA_TYPE(DATA_TYPE, 16) + accum_value = vload16(0, (__global DATA_TYPE *)accum.ptr); + VEC_DATA_TYPE(DATA_TYPE, 16) + biases_value = vload16(0, (__global DATA_TYPE *)biases.ptr); + accum_value = biases_value + accum_value; // Store result in the accummulate buffer - vstore8(accum_value, 0, (__global half *)accum.ptr); + vstore16(accum_value, 0, (__global DATA_TYPE *)accum.ptr); } +#endif // defined DATA_TYPE #if(defined WIDTH_MATRIX_B) /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp index 289873c23f..75c1a6e629 100644 --- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp @@ -43,20 +43,22 @@ CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel() void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32); + 
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(biases, accum); ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1); _biases = biases; _accum = accum; + std::set<std::string> build_opts; + build_opts.insert(("-DDATA_TYPE=" + get_cl_type_from_data_type(accum->info()->data_type()))); + // Create kernel - std::string data_type_name = lower_string(string_from_data_type(accum->info()->data_type())); - _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases_" + data_type_name)); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts)); // Configure kernel window - const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(accum->info()->data_type()); + const unsigned int num_elems_processed_per_iteration = 16; Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration)); -- cgit v1.2.1