From 48b3ef89de5f21a0169d8416e3d54081f82c7bf8 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Mon, 14 Oct 2019 19:03:09 +0100
Subject: COMPMID-2577: Fuse bias addition and activation in gemm assembly kernels

Change-Id: I7f52112d2d05b1ea3d3f3d4b19b8eafab05d6c44
Signed-off-by: Georgios Pinitas
Reviewed-on: https://review.mlplatform.org/c/2141
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Pablo Marquez
---
 .../assembly/NEGEMMInterleavedWrapper.cpp | 430 ---------------------
 1 file changed, 430 deletions(-)
 delete mode 100644 src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp

diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
deleted file mode 100644
index 1aeab5b9cb..0000000000
--- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
+++ /dev/null
@@ -1,430 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-#include "src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h"
-
-#include <atomic>
-#include <condition_variable>
-#include <mutex>
-
-namespace arm_compute
-{
-#ifndef NO_MULTI_THREADING
-class BufferManagerMultipleThreads final : public IBufferManager
-{
-public:
-    /** Number of buffers to ping pong between */
-    static constexpr unsigned int NUM_BUFFERS = 3;
-
-    explicit BufferManagerMultipleThreads(unsigned int max_num_users)
-        : _buffers(), _max_num_users(max_num_users)
-    {
-    }
-    unsigned int num_buffers() const override
-    {
-        return NUM_BUFFERS;
-    }
-    /* - Lock the requested index if it's free and return true if it needs reshaping.
-     * - Return false without acquiring the lock if the buffer at the index is already reshaped / being reshaped.
-     * - Block if the corresponding buffer for the given index is still being used by a different index.
-     */
-    bool lock_to_reshape_if_needed(unsigned int index) override
-    {
-        Buffer &buf = get_buffer_from_index(index);
-        while(true)
-        {
-            if(buf.index == index && buf.state != State::FREE)
-            {
-                //Another thread already is reshaping / has reshaped this block: nothing to do
-                return false;
-            }
-            else
-            {
-                std::unique_lock<std::mutex> lock(buf.mutex);
-                //If the buffer is free then lock it for reshaping:
-                if(buf.state == State::FREE)
-                {
-                    buf.index = index;
-                    buf.state = State::BEING_RESHAPED;
-                    return true;
-                }
-                // Check again just in case it changed while we were acquiring the lock:
-                if(buf.index == index)
-                {
-                    //Another thread is reshaping this block already, nothing to do
-                    return false;
-                }
-                // buf.index != index: Buffer still being used by another block, need to wait
-                buf.sem.wait(lock);
-            }
-        }
-    }
-    /* Mark the buffer at the given index as reshaped and release the lock acquired via lock_to_reshape_if_needed() */
-    void mark_as_reshaped(unsigned int index) override
-    {
-        Buffer &buf = get_buffer_from_index(index);
-        {
-            std::lock_guard<std::mutex> lock(buf.mutex);
-            buf.users = _max_num_users;
-            buf.state = State::IN_USE;
-        }
-        buf.sem.notify_all();
-    }
-
-    /* Block until the buffer at the given index is reshaped */
-    void wait_for_reshaping(unsigned int index) override
-    {
-        Buffer &buf = get_buffer_from_index(index);
-        ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
-        // Check if it's already ready to use:
-        if(buf.state == State::IN_USE)
-        {
-            return;
-        }
-        std::unique_lock<std::mutex> lock(buf.mutex);
-        //Double check it didn't change while we were acquiring the lock:
-        if(buf.state == State::IN_USE)
-        {
-            return;
-        }
-        buf.sem.wait(lock);
-    }
-    /* Mark the buffer at the given index as not used by this thread anymore.
-     * Once all the threads have called this method then the buffer is marked as free again.
-     */
-    void mark_as_unused(unsigned int index) override
-    {
-        Buffer &buf = get_buffer_from_index(index);
-        ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
-        if(--buf.users == 0)
-        {
-            std::unique_lock<std::mutex> lock(buf.mutex);
-            buf.state = State::FREE;
-            lock.unlock();
-            buf.sem.notify_all();
-        }
-    }
-
-private:
-    enum class State
-    {
-        FREE,
-        BEING_RESHAPED,
-        IN_USE
-    };
-    struct Buffer
-    {
-        unsigned int            index{};
-        std::atomic_uint        users{};
-        State                   state{ State::FREE };
-        std::mutex              mutex{};
-        std::condition_variable sem{};
-    };
-    std::array<Buffer, NUM_BUFFERS> _buffers;
-    Buffer &get_buffer_from_index(unsigned int index)
-    {
-        return _buffers[index % NUM_BUFFERS];
-    }
-    unsigned int _max_num_users;
-};
-#endif /* NO_MULTI_THREADING */
-
-class BufferManagerSingleThread : public IBufferManager
-{
-public:
-    unsigned int num_buffers() const override
-    {
-        return 1;
-    }
-    bool lock_to_reshape_if_needed(unsigned int index) override
-    {
-        ARM_COMPUTE_UNUSED(index);
-        return true;
-    }
-    void mark_as_reshaped(unsigned int index) override
-    {
-        ARM_COMPUTE_UNUSED(index);
-    }
-    void wait_for_reshaping(unsigned int index) override
-    {
-        ARM_COMPUTE_UNUSED(index);
-    }
-    void mark_as_unused(unsigned int index) override
-    {
-        ARM_COMPUTE_UNUSED(index);
-    }
-};
-
-NEGEMMInterleavedWrapper::NEGEMMInterleavedWrapper(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
-    : _memory_group(std::move(memory_manager)),
-      _weights_manager(weights_manager)
-{
-}
-
-void NEGEMMInterleavedWrapper::run()
-{
-    prepare();
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-    NEScheduler::get().run_tagged_workloads(_workloads, _tag.c_str());
-}
-
-void NEGEMMInterleavedWrapper::prepare()
-{
-    ARM_COMPUTE_UNUSED(_weights_manager);
-    if(!_is_prepared)
-    {
-        if(_pretranspose_b)
-        {
-            _transformed_b.allocator()->allocate();
-            NEScheduler::get().schedule(_prepare_b.get(), Window::DimX);
-            _b->mark_as_unused();
-        }
-        else
-        {
-            _prepare_b->create_workloads(_b_workloads);
-        }
-        _transform_a->create_workloads(_a_workloads);
-        _matrix_multiply->create_workloads(_mm_workloads);
-
-        //Maximum number of workloads to create:
-        const unsigned int num_threads    = NEScheduler::get().num_threads();
-        const unsigned int max_iterations = num_threads == 1 ? 1 : num_threads;
-        //Maximum number of iterations the parameters allow:
-        const unsigned int num_iterations = _batch_window.num_iterations_total();
-        // Keep the smallest of the two:
-        const unsigned int num_windows  = std::min(num_iterations, max_iterations);
-        const TensorShape  window_shape = _batch_window.shape();
-        const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX);
-
-        // Create a 1D window to dynamically split the batch window:
-        Window win_1D;
-        win_1D.set(0, Window::Dimension(0, num_iterations));
-
-        // Create one workload for each sub-window:
-        for(unsigned int w = 0; w < num_windows; w++)
-        {
-            Window            win          = win_1D.split_window(0, w, num_windows);
-            const Coordinates start_offset = index2coords(window_shape, win.x().start());
-            const Coordinates end_offset   = index2coords(window_shape, win.x().end() - 1);
-
-            if(_pretranspose_b)
-            {
-                auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
-                {
-                    //For each block of rows in "M"
-                    auto workload_mm = this->_mm_workloads.begin();
-                    for(auto &workload_a : this->_a_workloads)
-                    {
-                        // Transform one k_block from A:
-                        this->_transform_a->transform(workload_a, info, this->_batch_window, start_offset, end_offset);
-                        // Then perform the matrix multiplication for each x block along N:
-                        for(unsigned int i = 0; i < num_x_blocks; i++)
-                        {
-                            ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
-                            this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
-                        }
-                    }
-                };
-                _workloads.emplace_back(workload);
-            }
-            else
-            {
-                auto workload = [num_threads, start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
-                {
-                    //For each block of rows in "M"
-                    auto workload_mm = this->_mm_workloads.begin();
-                    unsigned int workload_b = 0;
-                    //If there is only one thread then only reshape the B blocks as you need them:
-                    unsigned int workload_b_next = num_threads == 1 ? this->_b_workloads.size() : 1;
-
-                    for(auto &workload_a : this->_a_workloads)
-                    {
-                        // Transform one k_block from A:
-                        this->_transform_a->transform(workload_a, info, this->_batch_window, start_offset, end_offset);
-                        // Then perform the matrix multiplication for each x block along N:
-                        for(unsigned int i = 0; i < num_x_blocks; i++)
-                        {
-                            ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
-                            if(workload_b_next < this->_b_workloads.size())
-                            {
-                                //Lock on BufferManager: need to run it ?
-                                if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next))
-                                {
-                                    this->_prepare_b->transform(this->_b_workloads[workload_b_next], info);
-                                    this->_buffer_manager->mark_as_reshaped(workload_b_next);
-                                }
-                                workload_b_next++;
-                            }
-                            ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size());
-                            // Run if needed or wait
-                            if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b))
-                            {
-                                this->_prepare_b->transform(this->_b_workloads[workload_b], info);
-                                this->_buffer_manager->mark_as_reshaped(workload_b);
-                            }
-                            this->_buffer_manager->wait_for_reshaping(workload_b);
-                            this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
-                            this->_buffer_manager->mark_as_unused(workload_b);
-                            workload_b++;
-                        }
-                    }
-                };
-                _workloads.emplace_back(workload);
-            }
-        }
-        if(!_pretranspose_b && num_windows > 1 && num_windows % num_threads != 0)
-        {
-            //Make sure the number of workloads is a multiple of the number of threads to avoid dead locks:
-            for(unsigned int leftover = num_windows % num_threads; leftover != num_threads; leftover++)
-            {
-                auto workload = [this](const ThreadInfo & info)
-                {
-                    unsigned int workload_b = 0;
-                    //If there is only one thread then only reshape the B blocks as you need them:
-                    unsigned int workload_b_next = 1;
-
-                    for(unsigned int iteration = 0; iteration < this->_mm_workloads.size(); iteration++)
-                    {
-                        if(workload_b_next < this->_b_workloads.size())
-                        {
-                            //Lock on BufferManager: need to run it ?
-                            if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next))
-                            {
-                                this->_prepare_b->transform(this->_b_workloads[workload_b_next], info);
-                                this->_buffer_manager->mark_as_reshaped(workload_b_next);
-                            }
-                            workload_b_next++;
-                        }
-                        ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size());
-                        // Run if needed or wait
-                        if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b))
-                        {
-                            this->_prepare_b->transform(this->_b_workloads[workload_b], info);
-                            this->_buffer_manager->mark_as_reshaped(workload_b);
-                        }
-                        this->_buffer_manager->wait_for_reshaping(workload_b);
-                        this->_buffer_manager->mark_as_unused(workload_b);
-                        workload_b++;
-                    }
-                };
-                _workloads.emplace_back(workload);
-            }
-        }
-
-        _is_prepared = true;
-    }
-}
-
-void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, const GEMMInfo &gemm_info)
-{
-    _params         = INEGEMMWrapperKernel::extract_parameters(a, b, c, gemm_info);
-    _a              = a;
-    _b              = b;
-    _c              = c;
-    _pretranspose_b = gemm_info.pretranpose_B();
-
-    const DataType     input_type  = a->info()->data_type();
-    const CPUInfo     &ci          = NEScheduler::get().cpu_info();
-    const unsigned int num_threads = NEScheduler::get().num_threads();
-
-    const arm_gemm::KernelDescription gemm_kernel_info = get_gemm_info(input_type, ci, num_threads, _params, alpha, beta, _pretranspose_b);
-    ARM_COMPUTE_ERROR_ON(gemm_kernel_info.method != arm_gemm::GemmMethod::GEMM_INTERLEAVED);
-
-    // Forcing 128-byte alignment (required by 32-bit kernels)
-    const unsigned int alignment = 128;
-    _transformed_b.allocator()->init(TensorInfo{}, alignment);
-    _tmp_c.allocator()->init(TensorInfo{}, alignment);
-    _tag = "NEGEMMInterleaved_" + gemm_kernel_info.name;
-
-    // Get strategy
-    std::unique_ptr<detail::IInterleavedStrategy> strategy = detail::create_strategy(gemm_kernel_info.name);
-    ARM_COMPUTE_ERROR_ON(strategy == nullptr);
-
-    if(!_pretranspose_b)
-    {
-        _block_sizes = strategy->calculate_block_sizes_for_strategy(ci, _params);
-        _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
-        _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
-        // If the execution is single threaded or has only one window then the buffer manager only needs 1 buffer else we will use NUM_BUFFERS buffers and ping pong between them:
-        const unsigned int num_iterations = _batch_window.num_iterations_total();
-        if(NEScheduler::get().num_threads() == 1 || num_iterations == 1)
-        {
-            _buffer_manager = support::cpp14::make_unique<BufferManagerSingleThread>();
-        }
-        else
-        {
-#ifdef NO_MULTI_THREADING
-            ARM_COMPUTE_ERROR("Can't have more than 1 buffer without multiple threads");
-#else /* NO_MULTI_THREADING */
-            _buffer_manager = support::cpp14::make_unique<BufferManagerMultipleThreads>(NEScheduler::get().num_threads());
-#endif /* NO_MULTI_THREADING */
-        }
-        // If B is transposed at every iteration then transformed_B can be managed:
-        _memory_group.manage(&_transformed_b);
-        auto_init_if_empty(*_transformed_b.info(), _b->info()->clone()->set_tensor_shape(TensorShape(_block_sizes.x_block * _block_sizes.k_block, _buffer_manager->num_buffers())));
-    }
-    else
-    {
-        _tag += "_preB";
-    }
-
-    _prepare_b = strategy->instantiate_prepareB(b, &_transformed_b, _params, ci);
-    ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
-
-    if(_pretranspose_b)
-    {
-        _block_sizes = _prepare_b->block_sizes();
-        _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
-        _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
-    }
-
-    _block_walker.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_params.N, _block_sizes.x_block), _block_sizes.x_block));
-    _block_walker.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_params.K, _block_sizes.k_block), _block_sizes.k_block));
-    _block_walker.set(Window::DimZ, Window::Dimension(0, _params.multis));
-
-    _transformed_a.allocator()->init(TensorInfo(TensorShape{ _block_sizes.k_block, _block_sizes.m_round, _params.batches }, 1, input_type), alignment);
-    _memory_group.manage(&_transformed_a);
-    _memory_group.manage(&_tmp_c);
-
-    _transform_a     = strategy->instantiate_transformA(_a, &_transformed_a, _block_walker, _params, gemm_info);
-    _matrix_multiply = strategy->instantiate_matrix_multiply(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, alpha, beta, gemm_info, num_threads);
-    ARM_COMPUTE_ERROR_ON(_transform_a == nullptr);
-    ARM_COMPUTE_ERROR_ON(_matrix_multiply == nullptr);
-
-    _transformed_a.allocator()->allocate();
-    _tmp_c.allocator()->allocate();
-    if(!_pretranspose_b)
-    {
-        _transformed_b.allocator()->allocate();
-    }
-}
-} // namespace arm_compute
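
Note on the removed synchronisation scheme: BufferManagerMultipleThreads ping-pongs NUM_BUFFERS = 3 reshape buffers between threads. Each B block cycles through FREE -> BEING_RESHAPED -> IN_USE, and returns to FREE once every participating thread has released it via mark_as_unused(). The standalone sketch below reproduces that protocol under simplified, illustrative names (PingPongBuffers and its demo are assumptions for this note, not Compute Library API); it folds the hand-rolled double-checking of the deleted code into condition-variable predicate loops, which is equivalent but more compact.

    #include <array>
    #include <condition_variable>
    #include <cstdio>
    #include <mutex>
    #include <thread>
    #include <vector>

    class PingPongBuffers
    {
    public:
        explicit PingPongBuffers(unsigned int num_users) : _num_users(num_users) {}

        // True: caller must reshape block 'index'. False: already (being) reshaped.
        // Blocks while the slot still holds an older, in-use block.
        bool lock_to_reshape_if_needed(unsigned int index)
        {
            Buffer &buf = at(index);
            std::unique_lock<std::mutex> lock(buf.mutex);
            while(true)
            {
                if(buf.index == index && buf.state != State::FREE)
                {
                    return false; // another thread owns or owned this block
                }
                if(buf.state == State::FREE)
                {
                    buf.index = index;
                    buf.state = State::BEING_RESHAPED;
                    return true;
                }
                buf.cv.wait(lock); // slot busy with an older block: wait
            }
        }
        void mark_as_reshaped(unsigned int index) // publish to all consumers
        {
            Buffer &buf = at(index);
            {
                std::lock_guard<std::mutex> lock(buf.mutex);
                buf.users = _num_users;
                buf.state = State::IN_USE;
            }
            buf.cv.notify_all();
        }
        void wait_for_reshaping(unsigned int index) // block until published
        {
            Buffer &buf = at(index);
            std::unique_lock<std::mutex> lock(buf.mutex);
            buf.cv.wait(lock, [&] { return buf.index == index && buf.state == State::IN_USE; });
        }
        void mark_as_unused(unsigned int index) // last consumer frees the slot
        {
            Buffer &buf = at(index);
            std::lock_guard<std::mutex> lock(buf.mutex);
            if(--buf.users == 0)
            {
                buf.state = State::FREE;
                buf.cv.notify_all();
            }
        }

    private:
        enum class State { FREE, BEING_RESHAPED, IN_USE };
        struct Buffer
        {
            unsigned int index{};
            unsigned int users{};
            State state{ State::FREE };
            std::mutex mutex;
            std::condition_variable cv;
        };
        static constexpr unsigned int NUM_BUFFERS = 3; // slots to ping-pong between
        Buffer &at(unsigned int index) { return _buffers[index % NUM_BUFFERS]; }
        std::array<Buffer, NUM_BUFFERS> _buffers;
        unsigned int _num_users;
    };

    int main()
    {
        constexpr unsigned int num_threads = 2, num_blocks = 8;
        PingPongBuffers mgr(num_threads);
        std::vector<std::thread> pool;
        for(unsigned int t = 0; t < num_threads; ++t)
        {
            pool.emplace_back([&mgr, t] {
                for(unsigned int b = 0; b < num_blocks; ++b)
                {
                    if(mgr.lock_to_reshape_if_needed(b))
                    {
                        std::printf("thread %u reshapes block %u\n", t, b);
                        mgr.mark_as_reshaped(b);
                    }
                    mgr.wait_for_reshaping(b); // the peer may have done the reshape
                    mgr.mark_as_unused(b);     // the multiply would happen here
                }
            });
        }
        for(auto &th : pool)
        {
            th.join();
        }
        return 0;
    }

With three slots, one block can be consumed while the next is being reshaped and a third slot stands free, so producer threads rarely stall on each other.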
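
A second detail worth recording is the dead-lock guard near the end of prepare(): because every worker thread participates in the blocking reshape protocol, the workload count must be a multiple of the thread count; otherwise a thread that receives no workload never calls into the buffer manager and its peers wait forever. The deleted "leftover" loop appends reshape-only workloads to make up the difference. A small hypothetical helper (padded_workloads is not a Compute Library function) capturing just that arithmetic:

    #include <cassert>

    // Round the number of workloads up to a multiple of the thread count,
    // mirroring the "leftover" loop in prepare(). A single window needs no
    // padding because the single-window path never blocks on a peer.
    unsigned int padded_workloads(unsigned int num_windows, unsigned int num_threads)
    {
        if(num_windows <= 1 || num_windows % num_threads == 0)
        {
            return num_windows; // already balanced
        }
        return num_windows + (num_threads - num_windows % num_threads);
    }

    int main()
    {
        assert(padded_workloads(5, 4) == 8); // 3 reshape-only workloads appended
        assert(padded_workloads(8, 4) == 8); // already a multiple
        assert(padded_workloads(1, 4) == 1); // one window: no ping-pong needed
        return 0;
    }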
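
Finally, prepare() carves the batch window into per-thread sub-windows with win_1D.split_window(0, w, num_windows) and maps the linear start/end indices back to tensor coordinates with index2coords. The sketch below shows one contiguous way to split an iteration range; it is illustrative only (split_range is hypothetical, and the real Window::split_window may distribute the remainder differently):

    #include <cstdio>
    #include <utility>

    // Split [0, total) into 'count' contiguous sub-ranges; the first
    // 'total % count' sub-ranges receive one extra iteration.
    std::pair<unsigned int, unsigned int> split_range(unsigned int total, unsigned int id, unsigned int count)
    {
        const unsigned int base  = total / count;
        const unsigned int rem   = total % count;
        const unsigned int start = id * base + (id < rem ? id : rem);
        const unsigned int end   = start + base + (id < rem ? 1u : 0u);
        return { start, end };
    }

    int main()
    {
        // 10 batch-window iterations split across 3 workloads: [0,4) [4,7) [7,10)
        for(unsigned int w = 0; w < 3; ++w)
        {
            const auto r = split_range(10, w, 3);
            std::printf("workload %u: [%u, %u)\n", w, r.first, r.second);
        }
        return 0;
    }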