From ff0bccfb4697c591d569db9c2dc223f2e311a7d3 Mon Sep 17 00:00:00 2001
From: Anthony Barbier
Date: Fri, 30 Nov 2018 10:42:40 +0000
Subject: COMPMID-1497: Add support for interleaved B reshaping in gemm_interleaved

Change-Id: I2171e1bf707bdcfa221c18d7a8904979e110020d
Reviewed-on: https://review.mlplatform.org/326
Tested-by: Arm Jenkins
Reviewed-by: Pablo Marquez
Reviewed-by: Georgios Pinitas
---
 .../assembly/NEGEMMInterleavedWrapper.cpp | 355 +++++++++++++++++----
 1 file changed, 300 insertions(+), 55 deletions(-)

diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
index dcb2f856f0..fe998a0e42 100644
--- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
+++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
@@ -32,8 +32,149 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+
 namespace arm_compute
 {
+#ifndef NO_MULTI_THREADING
+class BufferManagerMultipleThreads final : public IBufferManager
+{
+public:
+    /** Number of buffers to ping-pong between */
+    static constexpr unsigned int NUM_BUFFERS = 3;
+
+    explicit BufferManagerMultipleThreads(unsigned int max_num_users)
+        : _max_num_users(max_num_users)
+    {
+    }
+    unsigned int num_buffers() const override
+    {
+        return NUM_BUFFERS;
+    }
+    /* - Lock the requested index if it's free and return true if it needs reshaping.
+     * - Return false without acquiring the lock if the buffer at the index is already reshaped / being reshaped.
+     * - Block if the corresponding buffer for the given index is still being used by a different index.
+     */
+    bool lock_to_reshape_if_needed(unsigned int index) override
+    {
+        Buffer &buf = get_buffer_from_index(index);
+        while(true)
+        {
+            if(buf.index == index && buf.state != State::FREE)
+            {
+                //Another thread is already reshaping / has reshaped this block: nothing to do
+                return false;
+            }
+            else
+            {
+                std::unique_lock<std::mutex> lock(buf.mutex);
+                //If the buffer is free then lock it for reshaping:
+                if(buf.state == State::FREE)
+                {
+                    buf.index = index;
+                    buf.state = State::BEING_RESHAPED;
+                    return true;
+                }
+                // Check again just in case it changed while we were acquiring the lock:
+                if(buf.index == index)
+                {
+                    //Another thread is already reshaping this block, nothing to do
+                    return false;
+                }
+                // buf.index != index: Buffer still being used by another block, need to wait
+                buf.sem.wait(lock);
+            }
+        }
+    }
+    /* Mark the buffer at the given index as reshaped and release the lock acquired via lock_to_reshape_if_needed() */
+    void mark_as_reshaped(unsigned int index) override
+    {
+        Buffer &buf = get_buffer_from_index(index);
+        {
+            std::lock_guard<std::mutex> lock(buf.mutex);
+            buf.users = _max_num_users;
+            buf.state = State::IN_USE;
+        }
+        buf.sem.notify_all();
+    }
+
+    /* Block until the buffer at the given index is reshaped */
+    void wait_for_reshaping(unsigned int index) override
+    {
+        Buffer &buf = get_buffer_from_index(index);
+        ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
+        // Check if it's already ready to use:
+        if(buf.state == State::IN_USE)
+            return;
+        std::unique_lock<std::mutex> lock(buf.mutex);
+        //Double check it didn't change while we were acquiring the lock:
+        if(buf.state == State::IN_USE)
+            return;
+        buf.sem.wait(lock);
+    }
+    /* Mark the buffer at the given index as not used by this thread anymore.
+     * Once all the threads have called this method then the buffer is marked as free again.
+     */
+    void mark_as_unused(unsigned int index) override
+    {
+        Buffer &buf = get_buffer_from_index(index);
+        ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
+        if(--buf.users == 0)
+        {
+            std::unique_lock<std::mutex> lock(buf.mutex);
+            buf.state = State::FREE;
+            lock.unlock();
+            buf.sem.notify_all();
+        }
+    }
+
+private:
+    enum class State
+    {
+        FREE,
+        BEING_RESHAPED,
+        IN_USE
+    };
+    struct Buffer
+    {
+        unsigned int            index{};
+        std::atomic_uint        users{};
+        State                   state{ State::FREE };
+        std::mutex              mutex{};
+        std::condition_variable sem{};
+    } _buffers[NUM_BUFFERS];
+    Buffer &get_buffer_from_index(unsigned int index)
+    {
+        return _buffers[index % NUM_BUFFERS];
+    }
+    unsigned int _max_num_users;
+};
+#endif /* NO_MULTI_THREADING */
+
+class BufferManagerSingleThread : public IBufferManager
+{
+public:
+    unsigned int num_buffers() const override
+    {
+        return 1;
+    }
+    bool lock_to_reshape_if_needed(unsigned int index) override
+    {
+        return true;
+    }
+    void mark_as_reshaped(unsigned int index) override
+    {
+    }
+    void wait_for_reshaping(unsigned int index) override
+    {
+    }
+    void mark_as_unused(unsigned int index) override
+    {
+    }
+};
+
 NEGEMMInterleavedWrapper::NEGEMMInterleavedWrapper(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager))
 {
@@ -72,6 +213,7 @@ void NEGEMMInterleavedWrapper::prepare()
         // Keep the smallest of the two:
         const unsigned int num_windows  = std::min(num_iterations, max_iterations);
         const TensorShape  window_shape = _batch_window.shape();
+        const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX);
 
         // Create a 1D window to dynamically split the batch window:
         Window win_1D;
@@ -80,28 +222,112 @@ void NEGEMMInterleavedWrapper::prepare()
         // Create one workload for each sub-window:
         for(unsigned int w = 0; w < num_windows; w++)
         {
-            Window             win          = win_1D.split_window(0, w, num_windows);
-            const Coordinates  start_offset = index2coords(window_shape, win.x().start());
-            const Coordinates  end_offset   = index2coords(window_shape, win.x().end() - 1);
-            const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX);
+            Window            win          = win_1D.split_window(0, w, num_windows);
+            const Coordinates start_offset = index2coords(window_shape, win.x().start());
+            const Coordinates end_offset   = index2coords(window_shape, win.x().end() - 1);
 
-            auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
+            if(_pretranspose_b)
             {
-                //For each block of rows in "M"
-                auto workload_mm = this->_mm_workloads.begin();
-                for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
+                auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
                 {
-                    // Transform one k_block from A:
-                    this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
-                    // Then perform the matrix multiplication for each x block along N:
-                    for(unsigned int i = 0; i < num_x_blocks; i++)
+                    //For each block of rows in "M"
+                    auto workload_mm = this->_mm_workloads.begin();
+                    for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
                     {
-                        ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
-                        this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+                        // Transform one k_block from A:
+                        this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
+                        // Then perform the matrix multiplication for each x block along N:
+                        for(unsigned int i = 0; i < num_x_blocks; i++)
+                        {
+                            ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
+                            this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+                        }
                     }
-                }
-            };
-            _workloads.push_back(workload);
+                };
+                _workloads.push_back(workload);
+            }
+            else
+            {
+                auto workload = [num_threads, start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
+                {
+                    //For each block of rows in "M"
+                    auto         workload_mm = this->_mm_workloads.begin();
+                    unsigned int workload_b  = 0;
+                    //If there is only one thread then only reshape the B blocks as you need them:
+                    unsigned int workload_b_next = num_threads == 1 ? this->_b_workloads.size() : 1;
+
+                    for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
+                    {
+                        // Transform one k_block from A:
+                        this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
+                        // Then perform the matrix multiplication for each x block along N:
+                        for(unsigned int i = 0; i < num_x_blocks; i++)
+                        {
+                            ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
+                            if(workload_b_next < this->_b_workloads.size())
+                            {
+                                //Lock on BufferManager: do we need to run the reshape?
+                                if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next))
+                                {
+                                    this->_prepare_b->transform(this->_b_workloads[workload_b_next], info);
+                                    this->_buffer_manager->mark_as_reshaped(workload_b_next);
+                                }
+                                workload_b_next++;
+                            }
+                            ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size());
+                            // Run the reshape if needed, otherwise wait for it to be ready:
+                            if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b))
+                            {
+                                this->_prepare_b->transform(this->_b_workloads[workload_b], info);
+                                this->_buffer_manager->mark_as_reshaped(workload_b);
+                            }
+                            this->_buffer_manager->wait_for_reshaping(workload_b);
+                            this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+                            this->_buffer_manager->mark_as_unused(workload_b);
+                            workload_b++;
+                        }
+                    }
+                };
+                _workloads.push_back(workload);
+            }
+        }
+        if(!_pretranspose_b && num_windows > 1 && num_windows % num_threads != 0)
+        {
+            //Make sure the number of workloads is a multiple of the number of threads to avoid deadlocks:
+            for(unsigned int leftover = num_windows % num_threads; leftover != num_threads; leftover++)
+            {
+                auto workload = [this](const ThreadInfo & info)
+                {
+                    unsigned int workload_b = 0;
+                    //If there is only one thread then only reshape the B blocks as you need them:
+                    unsigned int workload_b_next = 1;
+
+                    for(unsigned int iteration = 0; iteration < this->_mm_workloads.size(); iteration++)
+                    {
+                        if(workload_b_next < this->_b_workloads.size())
+                        {
+                            //Lock on BufferManager: do we need to run the reshape?
+                            if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next))
+                            {
+                                this->_prepare_b->transform(this->_b_workloads[workload_b_next], info);
+                                this->_buffer_manager->mark_as_reshaped(workload_b_next);
+                            }
+                            workload_b_next++;
+                        }
+                        ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size());
+                        // Run the reshape if needed, otherwise wait for it to be ready:
+                        if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b))
+                        {
+                            this->_prepare_b->transform(this->_b_workloads[workload_b], info);
+                            this->_buffer_manager->mark_as_reshaped(workload_b);
+                        }
+                        this->_buffer_manager->wait_for_reshaping(workload_b);
+                        this->_buffer_manager->mark_as_unused(workload_b);
+                        workload_b++;
+                    }
+                };
+                _workloads.push_back(workload);
+            }
         }
 
         _is_prepared = true;
@@ -158,62 +384,81 @@ void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITe
 
     if(!_pretranspose_b)
     {
+        _block_sizes = calculate_block_sizes_from_data_type(NEScheduler::get().cpu_info(), _params.M, _params.N, _params.K, input_type, use_dot);
+        _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
+        _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
+        // If the execution is single-threaded or has only one window then the buffer manager only needs 1 buffer; otherwise we will use NUM_BUFFERS buffers and ping-pong between them:
+        const unsigned int num_iterations = _batch_window.num_iterations_total();
+        if(NEScheduler::get().num_threads() == 1 || num_iterations == 1)
+        {
+            _buffer_manager = support::cpp14::make_unique<BufferManagerSingleThread>();
+        }
+        else
+        {
+#ifdef NO_MULTI_THREADING
+            ARM_COMPUTE_ERROR("Can't have more than 1 buffer without multiple threads");
+#else /* NO_MULTI_THREADING */
+            _buffer_manager = support::cpp14::make_unique<BufferManagerMultipleThreads>(NEScheduler::get().num_threads());
+#endif /* NO_MULTI_THREADING */
+        }
         // If B is transposed at every iteration then transformed_B can be managed:
         _memory_group.manage(&_transformed_b);
-        _block_sizes = calculate_block_sizes_from_data_type(NEScheduler::get().cpu_info(), _params.M, _params.N, _params.K, input_type, use_dot);
+        auto_init_if_empty(*_transformed_b.info(), _b->info()->clone()->set_tensor_shape(TensorShape(_block_sizes.x_block * _block_sizes.k_block, _buffer_manager->num_buffers())));
     }
     else
    {
         _tag += "_preB";
-        switch(input_type)
-        {
-            case DataType::F32:
-                _prepare_b = instantiate_prepareB<float>(_b, &_transformed_b, _params);
-                break;
+    }
+    switch(input_type)
+    {
+        case DataType::F32:
+            _prepare_b = instantiate_prepareB<float>(_b, &_transformed_b, _params);
+            break;
 #ifdef __aarch64__
-            case DataType::U8:
-            case DataType::QASYMM8:
-                if(use_dot)
-                {
-                    _prepare_b = instantiate_prepareB<uint8_t, true>(_b, &_transformed_b, _params);
-                }
-                else
-                {
-                    _prepare_b = instantiate_prepareB<uint8_t, false>(_b, &_transformed_b, _params);
-                }
-                break;
-            case DataType::S8:
-                if(use_dot)
-                {
-                    _prepare_b = instantiate_prepareB<int8_t, true>(_b, &_transformed_b, _params);
-                }
-                else
-                {
-                    _prepare_b = instantiate_prepareB<int8_t, false>(_b, &_transformed_b, _params);
-                }
-                break;
+        case DataType::U8:
+        case DataType::QASYMM8:
+            if(use_dot)
+            {
+                _prepare_b = instantiate_prepareB<uint8_t, true>(_b, &_transformed_b, _params);
+            }
+            else
+            {
+                _prepare_b = instantiate_prepareB<uint8_t, false>(_b, &_transformed_b, _params);
+            }
+            break;
+        case DataType::S8:
+            if(use_dot)
+            {
+                _prepare_b = instantiate_prepareB<int8_t, true>(_b, &_transformed_b, _params);
+            }
+            else
+            {
+                _prepare_b = instantiate_prepareB<int8_t, false>(_b, &_transformed_b, _params);
+            }
+            break;
#endif /* __aarch64__ */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-            case DataType::F16:
-                _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params);
-                break;
+        case DataType::F16:
+            _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params);
+            break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-            default:
-                ARM_COMPUTE_ERROR("DataType not supported");
-                break;
-        }
-        ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
+        default:
+            ARM_COMPUTE_ERROR("DataType not supported");
+            break;
+    }
+    ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
+    if(_pretranspose_b)
+    {
         _block_sizes = _prepare_b->block_sizes();
+        _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
+        _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
     }
 
     _block_walker.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_params.N, _block_sizes.x_block), _block_sizes.x_block));
     _block_walker.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_params.K, _block_sizes.k_block), _block_sizes.k_block));
     _block_walker.set(Window::DimZ, Window::Dimension(0, _params.multis));
 
-    _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
-    _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
-
     _transformed_a.allocator()->init(TensorInfo(TensorShape{ _block_sizes.k_block, _block_sizes.m_round, _params.batches }, 1, input_type), alignment);
     _memory_group.manage(&_transformed_a);
     _memory_group.manage(&_tmp_c);
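
Note on the reshaping protocol: each workload lambda above plays two roles per thread: it opportunistically reshapes the next B block while that block's buffer slot is free, then makes sure the current block is reshaped (doing the work itself if no other thread has claimed it) before consuming it. The sketch below is a minimal, self-contained restatement of that per-thread loop, not code from the patch: the interface is cut down to the five IBufferManager methods used here, and the names (BufferManagerSketch, process_b_blocks, reshape_b, multiply) are hypothetical.

    #include <functional>

    // Cut-down stand-in for the IBufferManager interface used above (hypothetical):
    struct BufferManagerSketch
    {
        virtual ~BufferManagerSketch() = default;
        virtual bool lock_to_reshape_if_needed(unsigned int index) = 0;
        virtual void mark_as_reshaped(unsigned int index)          = 0;
        virtual void wait_for_reshaping(unsigned int index)        = 0;
        virtual void mark_as_unused(unsigned int index)            = 0;
    };

    // One thread's walk over the B blocks, mirroring the workload lambdas in prepare():
    void process_b_blocks(BufferManagerSketch &mgr, unsigned int num_b_blocks, bool single_thread,
                          const std::function<void(unsigned int)> &reshape_b, // stands in for _prepare_b->transform(...)
                          const std::function<void(unsigned int)> &multiply)  // stands in for _matrix_multiply->transform(...)
    {
        // With a single thread there is nobody to reshape ahead for, so skip prefetching:
        unsigned int next = single_thread ? num_b_blocks : 1;
        for(unsigned int b = 0; b < num_b_blocks; b++)
        {
            if(next < num_b_blocks)
            {
                // Opportunistically reshape one block ahead if its slot is free:
                if(mgr.lock_to_reshape_if_needed(next))
                {
                    reshape_b(next);
                    mgr.mark_as_reshaped(next);
                }
                next++;
            }
            // Reshape the current block ourselves if nobody has claimed it yet...
            if(mgr.lock_to_reshape_if_needed(b))
            {
                reshape_b(b);
                mgr.mark_as_reshaped(b);
            }
            // ...then block until whichever thread owns it has finished:
            mgr.wait_for_reshaping(b);
            multiply(b);           // Consume the reshaped block
            mgr.mark_as_unused(b); // The slot is recycled once every thread has released it
        }
    }

Because mark_as_reshaped() arms a slot with max_num_users expected releases, every thread must walk the same block sequence for the user count to reach zero; that is why prepare() pads the workload list when num_windows is not a multiple of num_threads, keeping otherwise idle threads in the protocol instead of deadlocking the others.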
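Memory note: on the on-the-fly path, _transformed_b no longer holds the whole reshaped B; it is initialised to TensorShape(x_block * k_block, num_buffers()), i.e. one block-sized buffer per slot. As a purely illustrative example (these numbers are not from the patch): with x_block = 256, k_block = 128 and the multi-threaded manager's NUM_BUFFERS = 3, that is 256 * 128 * 3 = 98,304 elements regardless of the full N and K extents, with the three slots recycled in ping-pong fashion by the buffer manager.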