From 7cd26d4a1b14bc4bf7c61496803416ab3d84791f Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Wed, 9 Jan 2019 18:35:17 +0000
Subject: COMPMID-1867: Add NEON/SVE GEMM Hybrid kernels.

Change-Id: Ib40a9921e7f9a6a8be6c38872d6b3a0f24ed0cd3
Reviewed-on: https://review.mlplatform.org/515
Reviewed-by: Anthony Barbier
Tested-by: Arm Jenkins
---
 .../NEGEMMInterleavedMatrixMultiplyWrapper.h       | 130 ++++++++++++++++++---
 1 file changed, 111 insertions(+), 19 deletions(-)

diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h b/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h
index 46a05abcdb..e2b849aa3d 100644
--- a/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h
+++ b/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,8 +26,13 @@
 
 #include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/WindowIterator.h"
 
 namespace arm_compute
 {
@@ -84,7 +89,7 @@ public:
 };
 
 /** Equivalent to arm_gemm::GemmInterleaved's strategy::kernel() but using Compute Library types. */
-template <typename To, typename Tr>
+template <typename strategy>
 class NEGEMMInterleavedMatrixMultiplyWrapperTemplate : public NEGEMMInterleavedMatrixMultiplyWrapper
 {
 public:
@@ -94,7 +99,7 @@ public:
      * @param[in]     transformed_b Already reshaped matrix B.
      * @param[out]    tmp_c         Temporary buffer to be used to store intermediate results.
      * @param[in,out] c             Result matrix C.
-     * @param[in]     batch_window  Window containing iteration information for the M and batch dimensions.
+     * @param[in]     block_walker  Window containing iteration information for the M and batch dimensions.
      * @param[in]     block_sizes   Block sizes to use for the matrix multiplication (A & B must have been reshaped using these same block sizes).
      * @param[in]     params        M, N, K sizes.
      * @param[in]     is_pretransposed Is B also pretransposed ?
@@ -102,30 +107,117 @@ public:
      * @param[in]     beta          Beta value
      * @param[in]     max_num_threads Maximum number of threads that might be used for the calculations.
      */
-    void configure(const ITensor *prepared_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &batch_window, const BlockSizes &block_sizes,
-                   const INEGEMMWrapperKernel::Params &params, bool b_is_pretransposed, float alpha, float beta, unsigned int max_num_threads);
+    void configure(const ITensor *prepared_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker, const BlockSizes &block_sizes,
+                   const INEGEMMWrapperKernel::Params &params, bool b_is_pretransposed, float alpha, float beta, unsigned int max_num_threads)
+    {
+        _prepared_a         = prepared_a;
+        _transformed_b      = transformed_b;
+        _tmp_c              = tmp_c;
+        _c                  = c;
+        _block_walker       = block_walker;
+        _block_sizes        = block_sizes;
+        _params             = params;
+        _b_is_pretransposed = b_is_pretransposed;
+        _alpha              = alpha;
+        _beta               = beta;
+
+        auto_init_if_empty(*_tmp_c->info(), c->info()->clone()->set_tensor_shape(TensorShape{ _block_sizes.x_block * strategy::out_height(), max_num_threads }));
+    }
 
     // Inherited methods overridden:
-    void transform(const MatrixMultiplyWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset, const Coordinates &end_offset) override;
-    void create_workloads(std::vector<MatrixMultiplyWorkload> &workloads) override;
+    void transform(const MatrixMultiplyWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset, const Coordinates &end_offset) override
+    {
+        strategy strat(info.cpu_info);
+        TensorAccessor<typename strategy::operand_type> prepared_a(*_prepared_a);
+        TensorAccessor<typename strategy::operand_type> transformed_b(*_transformed_b);
+        TensorAccessor<typename strategy::result_type>  c(*_c);
+        TensorAccessor<typename strategy::result_type>  tmp_c(*_tmp_c);
+
+        int                              prev_batch = -1;
+        typename strategy::operand_type *a_ptr      = nullptr;
+        auto window_iterator = arm_compute::create_window_iterator(batch_window, start_offset, end_offset, [&](const Coordinates & id)
+        {
+            const unsigned int y     = id.x();
+            const unsigned int batch = id.y();
+            const unsigned int ymax  = std::min(_params.M, y + strategy::out_height());
+
+            // If it's the first block of a new batch then reset the pointer to A.
+            if(prev_batch != static_cast<int>(batch))
+            {
+                const unsigned int first_m = id.x();
+                a_ptr      = prepared_a(0, first_m, batch);
+                prev_batch = batch;
+            }
+
+            // Call matrix multiply assembly routine to process the block:
+            strat.kernel(a_ptr, transformed_b(wl._offset_transformed_b), tmp_c(0, info.thread_id), 1, wl._bblocks, wl._kern_k);
+            a_ptr += strategy::out_height() * wl._kern_k;
+
+            // Merge the result with the other blocks' results:
+            strat.transforms.Merge(c(0, 0, batch, wl._multi), tmp_c(0, info.thread_id), c.stride(1), y, ymax, wl._x0, wl._xmax, _alpha, (wl._k0 == 0 ? _beta : static_cast<typename strategy::result_type>(1)));
+        });
+        auto on_new_row_size = [&](unsigned int start, unsigned int end)
+        {
+            // Nothing to do
+        };
+        window_iterator.iterate_2D(on_new_row_size);
+    }
+    void create_workloads(std::vector<MatrixMultiplyWorkload> &workloads) override
+    {
+        unsigned int offset_transformed_b = 0;
+        unsigned int wl_index             = 0;
+        unsigned int num_buffers = 0, reshaped_block_size = 0;
+
+        if(!_b_is_pretransposed)
+        {
+            num_buffers         = _transformed_b->info()->tensor_shape()[1];
+            reshaped_block_size = _transformed_b->info()->tensor_shape()[0];
+        }
+        execute_window_loop(_block_walker, [&](const Coordinates & id)
+        {
+            const unsigned int x0    = id.x();
+            const unsigned int k0    = id.y();
+            const unsigned int multi = id.z();
+
+            const unsigned int xmax = std::min(x0 + _block_walker.x().step(), _params.N);
+            const unsigned int kmax = std::min(k0 + _block_walker.y().step(), _params.K);
+
+            // Figure out how many "K" the kernel will actually process.
+            const int kern_k  = ceil_to_multiple(kmax - k0, strategy::k_unroll());
+            const int bblocks = DIV_CEIL(xmax - x0, strategy::out_width());
+
+            workloads.push_back(MatrixMultiplyWorkload(offset_transformed_b, x0, xmax, k0, kmax, multi, kern_k, bblocks));
+
+            if(_b_is_pretransposed)
+            {
+                offset_transformed_b += bblocks * strategy::out_width() * kern_k;
+            }
+            else
+            {
+                // Rotate through the BufferManager's buffers:
+                wl_index++;
+                offset_transformed_b = (wl_index % num_buffers) * reshaped_block_size;
+            }
+        });
+    }
 
 private:
     const ITensor *_prepared_a{ nullptr };
-    const ITensor               *_transformed_b{ nullptr };
-    ITensor                     *_tmp_c{ nullptr };
-    ITensor                     *_c{ nullptr };
-    unsigned int                 _Nsize{ 0 };
-    unsigned int                 _Ksize{ 0 };
-    bool                         _transpose_b{ false };
-    BlockSizes                   _block_sizes{};
-    INEGEMMWrapperKernel::Params _params{};
-    Window                       _block_walker{};
-    bool                         _b_is_pretransposed{ false };
-    Tr                           _alpha{};
-    Tr                           _beta{};
+    const ITensor                 *_transformed_b{ nullptr };
+    ITensor                       *_tmp_c{ nullptr };
+    ITensor                       *_c{ nullptr };
+    unsigned int                   _Nsize{ 0 };
+    unsigned int                   _Ksize{ 0 };
+    bool                           _transpose_b{ false };
+    BlockSizes                     _block_sizes{};
+    INEGEMMWrapperKernel::Params   _params{};
+    Window                         _block_walker{};
+    bool                           _b_is_pretransposed{ false };
+    typename strategy::result_type _alpha{};
+    typename strategy::result_type _beta{};
 };
 } // namespace arm_compute
-- 
cgit v1.2.1
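
Note on the scratch buffer sizing in configure(): tmp_c is auto-initialised to
shape { x_block * strategy::out_height(), max_num_threads }, i.e. one private
row of intermediate results per thread. A minimal standalone sketch of that
sizing, using stand-in values for x_block and out_height() (the real values
come from the computed BlockSizes and the selected strategy):

#include <cstdio>

int main()
{
    // Hypothetical numbers; the real x_block comes from the computed
    // BlockSizes and out_height() from the selected assembly strategy.
    const unsigned int x_block         = 256; // N-dimension block size
    const unsigned int out_height      = 8;   // rows produced per kernel call
    const unsigned int max_num_threads = 4;

    // One private row of intermediate results per thread, so threads never
    // contend on the scratch buffer.
    std::printf("tmp_c elements: %u x %u\n", x_block * out_height, max_num_threads);
    return 0;
}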
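
The ternary passed to Merge() in transform() applies _beta only for the first
K block (wl._k0 == 0) and a factor of 1 afterwards, so beta scales the
original C exactly once while later K blocks accumulate onto the partial
result. A toy scalar sketch of that accumulation rule (a plain loop standing
in for the assembly kernel, not the library's API):

#include <cstddef>
#include <cstdio>

// Accumulate c = alpha * (a . b) + beta * c in K blocks, applying beta only
// on the first block; later blocks add onto the partial result with factor 1,
// mirroring (wl._k0 == 0 ? _beta : 1) in transform() above.
static void block_accumulate(const float *a, const float *b, float &c,
                             std::size_t K, std::size_t k_block,
                             float alpha, float beta)
{
    for(std::size_t k0 = 0; k0 < K; k0 += k_block)
    {
        const std::size_t kmax = (k0 + k_block < K) ? k0 + k_block : K;
        float             part = 0.f;
        for(std::size_t k = k0; k < kmax; ++k)
        {
            part += a[k] * b[k];
        }
        const float merge_beta = (k0 == 0) ? beta : 1.f; // beta applied once
        c = alpha * part + merge_beta * c;
    }
}

int main()
{
    const float a[4] = { 1.f, 2.f, 3.f, 4.f };
    const float b[4] = { 5.f, 6.f, 7.f, 8.f };
    float       c    = 10.f;
    block_accumulate(a, b, c, 4, 2, /*alpha=*/1.f, /*beta=*/0.5f);
    std::printf("c = %f\n", c); // 1*(5+12+21+32) + 0.5*10 = 75
    return 0;
}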
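
Similarly, create_workloads() rounds each K block up to the strategy's unroll
factor and counts how many out_width()-wide column blocks cover [x0, xmax). A
self-contained sketch of that arithmetic, with local stand-ins for the Compute
Library's ceil_to_multiple()/DIV_CEIL helpers and hypothetical strategy
constants:

#include <algorithm>
#include <cstdio>

// Local stand-ins mirroring the helpers used in create_workloads().
static unsigned int ceil_to_multiple(unsigned int value, unsigned int divisor)
{
    return ((value + divisor - 1) / divisor) * divisor;
}
static unsigned int div_ceil(unsigned int value, unsigned int divisor)
{
    return (value + divisor - 1) / divisor;
}

int main()
{
    // Hypothetical sizes: N=1000, K=500, walked in 256-wide x blocks and
    // 128-deep k blocks; out_width/k_unroll depend on the real strategy.
    const unsigned int N = 1000, K = 500, x_step = 256, k_step = 128;
    const unsigned int out_width = 12, k_unroll = 4;

    for(unsigned int k0 = 0; k0 < K; k0 += k_step)
    {
        const unsigned int kmax   = std::min(k0 + k_step, K);
        const unsigned int kern_k = ceil_to_multiple(kmax - k0, k_unroll); // K rounded up to the unroll factor
        for(unsigned int x0 = 0; x0 < N; x0 += x_step)
        {
            const unsigned int xmax    = std::min(x0 + x_step, N);
            const unsigned int bblocks = div_ceil(xmax - x0, out_width); // column blocks covering [x0, xmax)
            std::printf("k0=%u x0=%u -> kern_k=%u bblocks=%u\n", k0, x0, kern_k, bblocks);
        }
    }
    return 0;
}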