From 4a578b923ed000c67fe0bc1433f945aea634ca9c Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Fri, 25 Jun 2021 12:13:49 +0100
Subject: Port the ClGemmLowp kernels to the new API

Ported kernels:
 - CLGEMMLowpMatrixMultiplyNativeKernel
 - CLGEMMLowpMatrixMultiplyReshapedKernel
 - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
 - CLGEMMLowpOffsetContributionKernel
 - CLGEMMLowpOffsetContributionOutputStageKernel
 - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
 - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
 - CLGEMMLowpQuantizeDownInt32ScaleKernel

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I9d5a744d6a2dd2f2726fdfb291bad000b6970de2
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5870
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 examples/gemm_tuner/cl_gemmlowp_reshaped.cpp       | 19 +++++++-------
 ...aped_rhs_only_fused_output_stage_fixedpoint.cpp | 30 +++++++++++++---------
 2 files changed, 28 insertions(+), 21 deletions(-)

(limited to 'examples')

diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp
index 3d3f7fef1e..3c8976fddd 100644
--- a/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp
+++ b/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp
@@ -33,7 +33,7 @@
 #include "arm_compute/runtime/CL/CLTuner.h"
 #include "examples/gemm_tuner/CommonGemmExampleOptions.h"
 #include "examples/gemm_tuner/GemmTunerHelpers.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
+#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h"
 #include "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
 #include "tests/CL/Helper.h"
 #include "utils/Utils.h"
@@ -168,8 +168,8 @@ GemmConfigs consume_gemm_configs(const GemmConfigOptions &options)
 
 } // namespace
 
-using CLGEMMReshapeLHSMatrix           = test::CLSynthetizeOperator<ClGemmReshapeLhsMatrixKernel>;
-using CLGEMMLowpMatrixMultiplyReshaped = test::CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyReshapedKernel>;
+using ClGemmReshapeLHSMatrix           = test::CLSynthetizeOperator<ClGemmReshapeLhsMatrixKernel>;
+using ClGemmLowpMatrixMultiplyReshaped = test::CLSynthetizeOperator<ClGemmLowpMatrixMultiplyReshapedKernel>;
 
 class CLGEMMLowpMatrixMultiplyReshapedExample : public Example
 {
@@ -269,20 +269,20 @@ public:
         // Validate argments
         if(!reshape_lhs.validate(lhs.info(), lhs_reshaped.info(), lhs_info, gemm_info.reinterpret_input_as_3d()))
         {
-            std::cerr << "Invalid arguments for CLGEMMReshapeLHSMatrixKernel." << std::endl;
+            std::cerr << "Invalid arguments for ClGemmReshapeLHSMatrixKernel." << std::endl;
             return false;
         }
 
         if(!gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, gemm_info))
         {
-            std::cerr << "Invalid arguments for CLGEMMLowpMatrixMultiplyReshapedKernel." << std::endl;
+            std::cerr << "Invalid arguments for ClGemmLowpMatrixMultiplyReshapedKernel." << std::endl;
             return false;
         }
 
         // Configure functions
         reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info);
 
-        gemm.configure(&lhs_reshaped, &rhs_reshaped, &dst, lhs_info, rhs_info, gemm_info);
+        gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, gemm_info);
 
         // Allocate tensors
         lhs.allocator()->allocate();
@@ -298,7 +298,8 @@ public:
         ITensorPack reshape_lsh_pack({ { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } });
         reshape_lhs.run(reshape_lsh_pack);
 
-        gemm.run();
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         // Make sure all the OpenCL jobs are done executing:
         CLScheduler::get().sync();
@@ -315,8 +316,8 @@ private:
     CLTensor                         rhs_reshaped{};
     CLTensor                         dst{};
     CLTuner                          tuner{};
-    CLGEMMReshapeLHSMatrix           reshape_lhs{};
-    CLGEMMLowpMatrixMultiplyReshaped gemm{};
+    ClGemmReshapeLHSMatrix           reshape_lhs{};
+    ClGemmLowpMatrixMultiplyReshaped gemm{};
 };
 
 /** Main test program for gemmlowp reshaped
diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp
index ca7b7a5f04..15c1b86c61 100644
--- a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp
+++ b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp
@@ -35,8 +35,8 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/CLTuner.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
+#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h"
+#include "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
 #include "tests/CL/Helper.h"
 #include "utils/Utils.h"
 #include "utils/command_line/CommandLineOptions.h"
@@ -47,6 +47,7 @@
 
 using namespace arm_compute;
 using namespace utils;
+using namespace arm_compute::opencl::kernels;
 using namespace arm_compute::misc::shape_calculator;
 using namespace gemm_tuner;
 
@@ -146,8 +147,8 @@ GemmConfigs consume_gemm_configs(const GemmConfigOptions &options)
 
 } // namespace
 
-using CLGEMMLowpMatrixMultiplyReshapedOnlyRHS = test::CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel>;
-using CLGEMMLowpMatrixAReduction              = test::CLSynthetizeFunction<CLGEMMLowpMatrixAReductionKernel>;
+using ClGemmLowpMatrixMultiplyReshapedOnlyRhs = test::CLSynthetizeOperator<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>;
+using ClGemmLowpMatrixAReduction              = test::CLSynthetizeOperator<ClGemmLowpMatrixAReductionKernel>;
 
 class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSFusedOutputStageFixedpointExample : public Example
 {
@@ -289,7 +290,7 @@ public:
             const TensorInfo info_vector_sum_row(compute_reductionB_shape(*lhs.info()), 1, DataType::S32);
             vector_sum_row.allocator()->init(info_vector_sum_row);
 
-            mtx_a_reduction = std::make_unique<CLGEMMLowpMatrixAReduction>();
+            mtx_a_reduction = std::make_unique<ClGemmLowpMatrixAReduction>();
 
             if(!mtx_a_reduction->validate(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{}))
             {
@@ -297,7 +298,7 @@ public:
                 return false;
             }
 
-            mtx_a_reduction->configure(&lhs, &vector_sum_row, GEMMLowpReductionKernelInfo{});
+            mtx_a_reduction->configure(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{});
         }
         // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
         if(gemm_info.a_offset != 0)
@@ -311,12 +312,14 @@ public:
         if(!gemm.validate(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(),
                           gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(), bias.info(), dst_multipliers.info(), dst_shifts.info()))
         {
-            std::cerr << "Invalid arguments for CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel." << std::endl;
+            std::cerr << "Invalid arguments for ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel." << std::endl;
             return false;
         }
 
         // Configure function
-        gemm.configure(&lhs, &rhs_reshaped, &dst, gemm_info, gemm_info.a_offset == 0 ? nullptr : &vector_sum_col, gemm_info.b_offset == 0 ? nullptr : &vector_sum_row, &bias, &dst_multipliers, &dst_shifts);
+        gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info,
+                       gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(), gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(),
+                       bias.info(), dst_multipliers.info(), dst_shifts.info());
 
         // Allocate tensors
         lhs.allocator()->allocate();
@@ -335,9 +338,12 @@ public:
     {
         if(mtx_a_reduction != nullptr)
         {
-            mtx_a_reduction->run();
+            ITensorPack red_pack({ { ACL_SRC, &lhs }, { ACL_DST, &vector_sum_row } }); // Reduction is configured with vector_sum_row as its destination
+            mtx_a_reduction->run(red_pack);
         }
-        gemm.run();
+        // The GEMM kernel is configured with rhs_reshaped, so bind that tensor (not rhs) as ACL_SRC_1
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs_reshaped }, { ACL_BIAS, &bias }, { ACL_VEC_COL_SUM, &vector_sum_col }, { ACL_VEC_ROW_SUM, &vector_sum_row }, { ACL_SHIFTS, &dst_shifts }, { ACL_MULTIPLIERS, &dst_multipliers }, { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         // Make sure all the OpenCL jobs are done executing:
         CLScheduler::get().sync();
@@ -358,8 +364,8 @@ private:
     CLTensor                                    dst_multipliers{};
     CLTensor                                    dst_shifts{};
     CLTuner                                     tuner{};
-    CLGEMMLowpMatrixMultiplyReshapedOnlyRHS     gemm{};
-    std::unique_ptr<CLGEMMLowpMatrixAReduction> mtx_a_reduction{ nullptr };
+    ClGemmLowpMatrixMultiplyReshapedOnlyRhs     gemm{};
+    std::unique_ptr<ClGemmLowpMatrixAReduction> mtx_a_reduction{ nullptr };
 };
 
 /** Main test program for gemmlowp reshaped rhs only with fused output stage fixedpoint
-- 
cgit v1.2.1