diff options
20 files changed, 1518 insertions, 101 deletions
diff --git a/arm_compute/core/CPP/ICPPKernel.h b/arm_compute/core/CPP/ICPPKernel.h index f41567ee11..ec05af20bd 100644 --- a/arm_compute/core/CPP/ICPPKernel.h +++ b/arm_compute/core/CPP/ICPPKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -49,7 +49,25 @@ public: * @param[in] window Region on which to execute the kernel. (Must be a region of the window returned by window()) * @param[in] info Info about executing thread and CPU. */ - virtual void run(const Window &window, const ThreadInfo &info) = 0; + virtual void run(const Window &window, const ThreadInfo &info) + { + ARM_COMPUTE_UNUSED(window); + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR("default implementation of legacy run() virtual member function invoked"); + } + + /** legacy compatibility layer for implementations which do not support thread_locator + * In these cases we simply narrow the interface down to the legacy version + * + * @param[in] window Region on which to execute the kernel. (Must be a region of the window returned by window()) + * @param[in] info Info about executing thread and CPU. + * @param[in] thread_locator Specifies "where" the current thread is in the multi-dimensional space + */ + virtual void run_nd(const Window &window, const ThreadInfo &info, const Window &thread_locator) + { + ARM_COMPUTE_UNUSED(thread_locator); + run(window, info); + } /** Name of the kernel * diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h b/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h index d612681c41..0e3dd74577 100644 --- a/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h +++ b/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,6 +24,7 @@ #ifndef ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H #define ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H +#include "arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp" #include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" @@ -65,15 +66,33 @@ public: { return _name.c_str(); } - // Inherited methods overridden: + + void run(const Window &window, const ThreadInfo &info) override { ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel))); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - auto first = window.x().start(); - auto last = window.x().end(); - _kernel->execute(first, last, info.thread_id); + + auto win=arm_gemm::to_ndcoord(window); + + arm_gemm::ndcoord_t thread_locator { }; + + _kernel->execute(win, thread_locator, info.thread_id); } + + // Inherited methods overridden: + void run_nd(const Window &window, const ThreadInfo &info, const Window &thread_locator) override + { + ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel))); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + + //convert between arm_compute and arm_gemm types + auto ndc_win = arm_gemm::to_ndcoord(window); + auto ndc_tlc = arm_gemm::to_ndcoord(thread_locator); + + _kernel->execute(ndc_win, ndc_tlc, info.thread_id); + } + /** Initialise the kernel's input and output. * * @param[in] kernel Pointer to an assembly kernel implementation. 
@@ -83,9 +102,9 @@ public: { ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel))); _kernel = kernel; - auto win_last = _kernel->get_window_size(); - Window win; - win.set(Window::DimX, Window::Dimension(0, win_last, 1)); + + Window win = to_window(kernel->get_window_size()); + INEKernel::configure(win); if(!kernel_name_tag.empty()) diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp index e89523981d..7723224ec8 100644 --- a/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp +++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -40,6 +40,7 @@ enum class GemmMethod GEMM_NATIVE, GEMM_HYBRID, GEMM_INTERLEAVED, + GEMM_INTERLEAVED_2D, QUANTIZE_WRAPPER, GEMM_HYBRID_QUANTIZED }; diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp new file mode 100644 index 0000000000..7dff01003d --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include "arm_compute/core/Window.h" +#include "arm_compute/core/Dimensions.h" +#include "src/core/NEON/kernels/arm_gemm/ndrange.hpp" + +#include <cassert> + +/* This file contains mapping between integral types used in arm_compute and arm_gemm + * These two codebases both require a degree of separation for the sake of modularity + * so maintain their own types which represent similar information. + */ + +namespace arm_gemm { + +//we want to unify the maximum number of dimensions used between arm_gemm and arm compute library +constexpr std::size_t ndrange_max = + arm_compute::Dimensions<unsigned int>::num_max_dimensions; + +using ndrange_t=NDRange<ndrange_max>; +using ndcoord_t=NDCoordinate<ndrange_max>; + +/* Converts an `arm_gemm::ndrange_t` to a `arm_compute::Window` + * + * As `NDRange<T>` does not encode start positions, we specify + * the start to be zero in the produced `arm_compute::Window` + * + * @param [ndr] the `arm_gemm::ndrange_t` we wish to convert into a `arm_compute::Window` + * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndr` + */ +inline arm_compute::Window to_window(const ndrange_t& ndr) { + arm_compute::Window win; + + for(unsigned int i = 0; i!=ndrange_max; ++i) { + //populate the window with the dimensions of the NDRange + win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i))); + } + + return win; +} + +/* + * Converts an `arm_gemm::ndcoord_t` to a `arm_compute::Window` + * + * @param [ndc] 
the `arm_gemm::ndcoord_t` we wish to convert into a `arm_compute::Window` + * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndc` + */ +inline arm_compute::Window to_window(const ndcoord_t& ndc) { + arm_compute::Window win; + + for(unsigned int i = 0; i!=ndrange_max; ++i) { + const auto start = ndc.get_position(i); + const auto size = ndc.get_size(i); + const auto stop = start + size; + + //populate the window with the dimensions of the NDRange + win.set(i, arm_compute::Window::Dimension(start, stop)); + } + + return win; +} + +/** Convert an `arm_compute::Window` to an `arm_gemm::NDRange` of the same max dimensions + * + * It should be noted that `arm_compute::Window` specifies a `start()` and an `end()` + * where as `arm_gemm::ndrange_t` only has a size, as a result we store the delta between the range + * + * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndrange_t` + * @return the resultant ndrange_t + */ +inline ndrange_t to_ndrange(const arm_compute::Window& win) { + return { + static_cast<unsigned int>(win[0].end() - win[0].start()), + static_cast<unsigned int>(win[1].end() - win[1].start()), + static_cast<unsigned int>(win[2].end() - win[2].start()), + static_cast<unsigned int>(win[3].end() - win[3].start()), + static_cast<unsigned int>(win[4].end() - win[4].start()), + static_cast<unsigned int>(win[5].end() - win[5].start()) + }; +} + +/** Convert an `arm_compute::Window` to an `arm_gemm::NDCoord` of the same max dimensions + * + * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndcoord_t` + * @return the resultant ndcoord_t + */ +inline ndcoord_t to_ndcoord(const arm_compute::Window& win) { + return { + { static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start()) }, + { static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start()) }, + { static_cast<unsigned int>(win[2].start()), 
static_cast<unsigned int>(win[2].end() - win[2].start()) }, + { static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start()) }, + { static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start()) }, + { static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start()) } + }; +} + +} //namespace arm_gemm diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp index d17fd5fe97..ea9b524e15 100644 --- a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp +++ b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,10 @@ */ #pragma once +#include "arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp" + #include <cstddef> +#include <cassert> #define UNUSED(x) (void)(x) @@ -51,10 +54,10 @@ public: void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0; - /* For threading, we divide the work into some number of units and work - * out internally what unit corresponds to what work. This returns the - * total number of units. */ - virtual unsigned int get_window_size() const = 0; + /** @returns an ndrange containing ranges of the compute space which can be + * broken up and parallelised over + */ + virtual ndrange_t get_window_size() const = 0; /* The maximum thread count is specified when the GEMM is created. Some * implementations need to know how many threads will actually run in @@ -73,9 +76,12 @@ public: /* Whether this GEMM can be dynamically scheduled or not. */ virtual bool supports_dynamic_scheduling() const { return false; } - /* Actually do the work. 
Provide a threadid to index any per-thread - * buffers, and a start/end range to indicate which work to do. */ - virtual void execute(unsigned int, unsigned int, int) = 0; + /** Main execute member function + * @param [in] work_range specifies the range of work we want to be computed, total range defined by get_window_size() + * @param [in] thread_locator where are we inside of the thread space + * @param [in] threadid a unique threadid + */ + virtual void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) = 0; /*** Working space interface (optional) ***/ /* Total number of bytes of temporary working space needed. If zero, it's not necessary to call set_working_space(). */ @@ -108,8 +114,7 @@ public: virtual ~IGemmCommon() { } }; -/* - * "Real" GemmCommon class which is templated on the operand and return types. +/* "Real" GemmCommon class which is templated on the operand and return types. * * In addition to correctly typed versions of the functions that operate on * operand and return data, this class provides a default implementation of @@ -178,4 +183,19 @@ public: } }; +template<typename GemmKernel> +inline +int unsigned get_total_window_size(const GemmKernel& kernel) +{ + auto window=kernel.get_window_size(); + + unsigned int total = 1; + for(unsigned i = 0; i != arm_gemm::ndrange_max; ++i) + { + total *= window.get_size(i); + } + + return total; +} + } // namespace arm_gemm diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h index f68294016a..a5e20ee627 100644 --- a/arm_compute/runtime/IScheduler.h +++ b/arm_compute/runtime/IScheduler.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,6 +27,7 @@ #include "arm_compute/core/CPP/CPPTypes.h" #include <functional> +#include <limits> namespace arm_compute { @@ -42,6 +43,13 @@ public: STATIC, /**< Split the workload evenly among the threads */ DYNAMIC, /**< Split the workload dynamically using a bucket system */ }; + + /** When arm_compute::IScheduler::Hints::_split_dimension is initialized with this value + * then the scheduler is free to break down the problem space over as many dimensions + * as it wishes + */ + static constexpr unsigned int split_dimensions_all = std::numeric_limits<unsigned>::max(); + /** Scheduler hints * * Collection of preferences set by the function regarding how to split a given workload diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp index 96e3ce832c..e3355ed2d5 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp @@ -26,6 +26,8 @@ #include "gemm_hybrid.hpp" #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" +#include "gemm_interleaved_2d.hpp" +#include "gemm_interleaved_pretransposed_2d.hpp" #include "gemm_native.hpp" #include "gemv_batched.hpp" #include "gemv_native_transposed.hpp" @@ -144,13 +146,31 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] = [](const GemmArgs &args) { return new GemmInterleaved<interleaved_fp32_mla_3VLx8, float, float>(args); } }, #endif // __ARM_FEATURE_SVE +//Pretranspose, 2D split +{ + GemmMethod::GEMM_INTERLEAVED_2D, + "sgemm_12x8", + [](const GemmArgs &args) { return args._pretransposed_hint; }, + [](const GemmArgs &args) { return args._pretransposed_hint; }, + [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<sgemm_12x8, float, float>(args); } +}, +//Transpose, 2D split, no blockmanager +{ + GemmMethod::GEMM_INTERLEAVED_2D, + "sgemm_12x8", + [](const GemmArgs &args) { return (!args._pretransposed_hint) && args._maxthreads >= 8; }, + 
[](const GemmArgs &args) { return (!args._pretransposed_hint) && args._maxthreads >= 8; }, + [](const GemmArgs &args) { return new GemmInterleaved2d<sgemm_12x8, float, float>(args); } +}, +//Tranpose, 1D split, with blockmanager { GemmMethod::GEMM_INTERLEAVED, "sgemm_12x8", - nullptr, - nullptr, + [](const GemmArgs &args) { return (!args._pretransposed_hint); }, + [](const GemmArgs &args) { return (!args._pretransposed_hint); }, [](const GemmArgs &args) { return new GemmInterleaved<sgemm_12x8, float, float>(args); } }, + #endif // __aarch64__ #ifdef __arm__ diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp index c3abb04db7..0cb3160de4 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -142,8 +142,8 @@ public: _window_range(iceildiv(args._Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmulti) { } // Interface implementation - Compulsory functions - unsigned int get_window_size() const override { - return _window_range.total_size(); + ndrange_t get_window_size() const override { + return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u }; } // This kernel can always be dynamically scheduled. 
@@ -151,8 +151,7 @@ public: return true; } - // Execute - void execute(unsigned int start, unsigned int end, int threadid) override { + void execute_1d(unsigned int start, unsigned int end, int threadid) { UNUSED(threadid); #ifdef CYCLE_PROFILING profiler prof; @@ -215,6 +214,17 @@ public: } } + // Execute + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + UNUSED(thread_locator); + + const auto start = work_range.get_position(0); + const auto size = work_range.get_size(0); + const auto stop = start + size; + + execute_1d(start, stop, threadid); + } + // Interface implementation - pretransposed bool B_is_pretransposed() const override { return true; diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp index 22b6960baf..3d7ad99d1e 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -149,8 +149,8 @@ public: _qp (qp), _nthreads(args._maxthreads) { } // Interface implementation - Compulsory functions - unsigned int get_window_size() const override { - return _window_range.total_size(); + ndrange_t get_window_size() const override { + return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u }; } // This kernel can always be dynamically scheduled. 
@@ -158,8 +158,7 @@ public: return true; } - // Execute - void execute(unsigned int start, unsigned int end, int threadid) override { + void execute_1d(unsigned int start, unsigned int end, int threadid) { #ifdef CYCLE_PROFILING profiler prof; #endif @@ -234,6 +233,17 @@ public: } } + // Execute + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + UNUSED(thread_locator); + + const auto start = work_range.get_position(0); + const auto size = work_range.get_size(0); + const auto stop = start + size; + + execute_1d(start, stop, threadid); + } + // Working space needed for intermediate result buffers. size_t get_working_size() const override { return (_nthreads * strategy::out_height() * _Nsize * sizeof(Tri)); diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp index efd984561d..4897bedf47 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -385,9 +385,9 @@ public: // out work in units of out_height. Factor batches into the window, but // not multi for now (as this would cause problems with the buffer // manager). - unsigned int get_window_size() const override { - // _Mround is a multiple of out_height by definition. - return (_Mround / strategy::out_height()) * _nbatches; + ndrange_t get_window_size() const override { + auto m_win_size = (_Mround / strategy::out_height()) * _nbatches; + return { m_win_size, 1u, 1u, 1u, 1u, 1u }; } // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads. 
@@ -399,7 +399,7 @@ public: } // Execute - void execute(unsigned int start, unsigned int end, int threadid) override { + void execute_1d(unsigned int start, unsigned int end, int threadid) { if (_pretransposed) { execute_internal<true>(start, end, threadid); } else { @@ -407,6 +407,16 @@ public: } } + //Execute + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + UNUSED(thread_locator); + + const auto start = work_range.get_position(0); + const auto stop = work_range.get_position_end(0); + + execute_1d(start, stop, threadid); + } + // Interface implementation - working space size_t get_working_size() const override { // In all cases, we need one A buffer plus a C buffer per thread. diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp new file mode 100644 index 0000000000..53f8e6c938 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp @@ -0,0 +1,449 @@ +/* + * Copyright (c) 2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include "arm_gemm.hpp" +#include "utils.hpp" + +#include "mergeresults.hpp" +#include "transform.hpp" + +#ifdef CYCLE_PROFILING +#include "profiler.hpp" +#endif + +#include <algorithm> +#include <cassert> + +// Some macros used to decide how much working space to allocate. +// Round allocations up to the next cache line. +#define ALLOC_ROUND 64 +#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND) + +// Implementation of the GemmCommon abstract class. +// +// This implementation interleaves the source matrices in blocks - good for +// larger matrices. +namespace arm_gemm { + +template<typename strategy, typename To, typename Tr> +class GemmInterleaved2d : public GemmCommon<To, Tr> { + typedef typename strategy::operand_type Toi; + typedef typename strategy::result_type Tri; + + /* const properties set by constructor */ + const CPUInfo * const _ci; + + const unsigned int _Msize; + const unsigned int _Nsize; + const unsigned int _Ksize; + + const unsigned int _nbatches; + const unsigned int _nmulti; + + const bool _trA; + const bool _trB; + + const Activation _act; + + const int _maxthreads; + int _nthreads; + + /* Blocking info */ + unsigned int _k_block=0; + unsigned int _x_block=0; + + unsigned int _Mround_div=0; + unsigned int _Mround=0; + unsigned int _Nround_div=0; + unsigned int _Nround=0; + + /* Working space, pretransposed buffer */ + void *_working_space=nullptr; + + /* We will need to walk through the blocks of B in a few contexts, so + * factor that out. */ + class blockwalker { + private: + /* Size loops, etc. 
based on our parent's configuration */ + const GemmInterleaved2d<strategy, To, Tr> &_parent; + + /* K, X and multi parameters for current iteration. */ + unsigned int _k0=0, _x0=0, _xmin=0, _xmax=0, _multi=0; + + unsigned int _index=0; + bool _done=false; + bool _newkblock=true; + bool _newmulti=true; + + public: + blockwalker(const GemmInterleaved2d<strategy, To, Tr> &parent) + : _parent(parent) + , _xmax { parent._Nsize } + { } + + blockwalker(const GemmInterleaved2d<strategy, To, Tr> &parent, unsigned int x0, unsigned int xmax) + : _parent(parent) + , _x0 { x0 } + , _xmin { x0 } + , _xmax { xmax } + { + assert(_x0 <= _xmax); + } + + unsigned int xmax() { + return std::min(_x0 + _parent._x_block, _xmax); + } + + unsigned int kmax() { + return std::min(_k0 + _parent._k_block, _parent._Ksize); + } + + /* Advance to the next block, return false at the end. */ + bool advance(void) { + if (_done) { + return false; + } + + _newkblock=false; + _x0 += _parent._x_block; + if (_x0 >= _xmax) { + _x0=_xmin; + _k0 += _parent._k_block; + if (_k0 >= _parent._Ksize) { + _k0=0; + _multi++; + if (_multi >= _parent._nmulti) { + _done=true; + return false; + } + _newmulti=true; + } + _newkblock=true; + } + _index++; + + return true; + } + + unsigned int k0(void) { return _k0; } + unsigned int x0(void) { return _x0; } + unsigned int multi(void) { return _multi; } + unsigned int index(void) { return _index; } + bool done(void) { return _done; } + bool newkblock(void) { return _newkblock; } + }; + + // A working size: One of these needed, regardless of thread count. Divided according to window. + size_t get_a_working_size() const { + return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches) * 2; + } + + // B working size: 0, 1 or 3 of these needed depending on pretransposed and threading settings. + size_t get_b_working_size() const { + return ROUND_UP(sizeof(Toi) * _x_block * _k_block); + } + + // C working size: One needed per thread. 
+ size_t get_c_working_size() const { + return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height()); + } + + void execute_transpose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int mthreadid, int nthreadid) { + UNUSED(mthreadid); + + strategy strat(_ci); + + /* Translate 'start' and 'end' into a position within the batches and rows. */ + const unsigned int window_per_batch = _Mround / strategy::out_height(); + unsigned int batch_0 = m_start / window_per_batch; + unsigned int batch_end = m_end / window_per_batch; + + /* Compute the M values to operate on */ + unsigned int m_0 = (m_start - (batch_0 * window_per_batch)) * strategy::out_height(); + unsigned int m_max = (m_end - (batch_end * window_per_batch)) * strategy::out_height(); + + unsigned int n_0 = std::min(this->_Nsize, strategy::out_width() * n_start); + unsigned int n_max = std::min(this->_Nsize, strategy::out_width() * n_end); + + blockwalker current(*this, n_0, n_max); + + /* get workspace as int8_t */ + assert(_working_space); + int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space); + + auto c_panel_start = working_space_bytes; + auto a_panel_start = c_panel_start + get_c_working_size() * _maxthreads; + auto b_panel_start = a_panel_start + get_a_working_size() * _maxthreads; + + auto c_panel = reinterpret_cast<Tri *>(c_panel_start + get_c_working_size() * threadid); + auto a_panel = reinterpret_cast<Toi *>(a_panel_start + get_a_working_size() * nthreadid); + auto b_panel = reinterpret_cast<Toi *>(b_panel_start + get_b_working_size() * threadid); + + + // newkblock() is always true on the first iteration, so this will be set properly on the first loop. 
+ + int kern_k = 0; + for (;!current.done();current.advance()) { + const int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width()); + /* + * The entirity of A^kblock is transpose upfront and computed against individual + * blocks of B (xblock) + * + * Therefore, we only need to retranspose when k_block progresses + */ + if (current.newkblock()) { + for (unsigned int batch = batch_0; batch <= batch_end; batch++) { + unsigned int first_m = (batch == batch_0) ? m_0 : 0; + unsigned int last_m = (batch == batch_end) ? m_max : _Msize; + + if (first_m >= last_m) + continue; + + auto a_thread_panel_in = this->_Aptr + + (batch * this->_A_batch_stride) + + (current.multi() * this->_A_multi_stride); + + auto a_thread_panel_out = a_panel + ((batch * _Mround + first_m) * _k_block); + + strat.transforms.PrepareA( + a_thread_panel_out, + a_thread_panel_in, + this->_lda, + first_m, + last_m, + current.k0(), + current.kmax(), + _trA); + } + + kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll()); + kern_k *= strat.k_unroll(); + } + + auto *b_panel_in = this->_Bptr + (current.multi() * this->_B_multi_stride); + + strat.transforms.PrepareB( + b_panel, //dst + b_panel_in, //src + this->_ldb, + current.x0(), //idx from + current.xmax(), //idx to + current.k0(), + current.kmax(), + _trB); + + //Iterate over the batches + for (unsigned int batch = batch_0; batch <= batch_end; batch++) { + unsigned int first_m = (batch == batch_0) ? m_0 : 0; + unsigned int last_m = (batch == batch_end) ? 
m_max : _Msize; + + if (first_m >= last_m) + continue; + + const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block; + + + //Iterate over the inerleaved rows of the packed A matrix + for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) { + unsigned int ymax = std::min(_Msize, y + strategy::out_height()); + + strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k); + a_ptr += (strategy::out_height() * kern_k); + + const bool first_pass = current.k0()==0; + const bool last_pass = current.kmax()==_Ksize; + + auto c_panel_out = this->_Cptr + + this->_C_batch_stride * batch + + this->_C_multi_stride * current.multi(); + + auto bias = (first_pass && this->_bias) + ? this->_bias + (current.multi() * this->_bias_multi_stride) + : nullptr; + + auto act = last_pass ? _act : Activation(); + + strat.transforms.Merge( + c_panel_out, + c_panel, + this->_ldc, + y, + ymax, + current.x0(), + current.xmax(), + bias, + act, + !first_pass); //Append + } + } + } + } +public: + GemmInterleaved2d(GemmInterleaved2d &) = delete; + GemmInterleaved2d & operator= (GemmInterleaved2d &) = delete; + + /* Constructor */ + /* Constructor */ + GemmInterleaved2d(const GemmArgs &args) + : _ci(args._ci) + , _Msize(args._Msize) + , _Nsize(args._Nsize) + , _Ksize(args._Ksize) + , _nbatches(args._nbatches) + , _nmulti(args._nmulti) + , _trA(args._trA) + , _trB(args._trB) + , _act(args._act) + , _maxthreads(args._maxthreads) + , _nthreads(args._maxthreads) + + // Work out the rounded size of M - needed for some buffers. 
+ , _Mround_div ( iceildiv(_Msize, strategy::out_height()) ) + , _Mround ( _Mround_div * strategy::out_height() ) + + , _Nround_div ( iceildiv(_Nsize, strategy::out_width()) ) + , _Nround ( _Nround_div * strategy::out_width() ) + { + const unsigned int L1_size = _ci->get_L1_cache_size(); + const unsigned int L2_size = _ci->get_L2_cache_size(); + + assert(_maxthreads > 0); + + // Work out blocking parameters, or override from provided GemmConfig + if (args._cfg && args._cfg->inner_block_size) { + _k_block = args._cfg->inner_block_size; + } else { + // k_block: Find out how much of the larger array can be loaded into half the cache. + // This should account for associative caches. + _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height()))); + + // Needs to be (at least a single) multiple of the K unroll level. + _k_block /= strategy::k_unroll(); + _k_block = std::max(_k_block, 1U) * strategy::k_unroll(); + + // Now tune to presented problem size; this is how many blocks we need. + unsigned int num_k_blocks = iceildiv(_Ksize, _k_block); + + // So divide the space equally into that many blocks. + _k_block = iceildiv(_Ksize, num_k_blocks); + + // And round UP to the K unroll level required. + _k_block = iceildiv(_k_block, strategy::k_unroll()); + _k_block *= strategy::k_unroll(); + } + + if (args._cfg && args._cfg->outer_block_size) { + _x_block = args._cfg->outer_block_size; + } else { + // x_block: Work out how many rows (of length k_block) will fit in the L2 + // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. + _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / + (sizeof(Toi) * _k_block); + + // Needs to be (at least a single) multiple of the kernel output width. + _x_block /= strategy::out_width(); + _x_block = std::max(_x_block, 1U) * strategy::out_width(); + + // And tune to the presented problem size. 
+ unsigned int num_x_blocks = iceildiv(_Nsize, _x_block); + _x_block = iceildiv(_Nsize, num_x_blocks); + + _x_block = iceildiv(_x_block, strategy::out_width()); + _x_block *= strategy::out_width(); + } + + // Work out the rounded size of M - needed for some buffers. + } + + // Interface implementation - Compulsory functions + ndrange_t get_window_size() const override { + unsigned m = (_Mround / strategy::out_height()) * _nbatches; + unsigned n = _Nround_div; + + return { m, n, 1u, 1u, 1u, 1u }; + } + + // set_nthreads: record the thread count to use, clamped to _maxthreads. + void set_nthreads(int nthreads) override { + _nthreads = std::min(nthreads, _maxthreads); + } + + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + /* + * This particular GEMM implementation can only be broken up over the M & N + * dimensions, we inform the framework of this limitation via the get_window_size function + */ + assert(ndrange_popcount(work_range) <= 2); + + const auto m_start = work_range.get_position(0); + const auto n_start = work_range.get_position(1); + const auto m_size = work_range.get_size(0); + const auto n_size = work_range.get_size(1); + const auto m_end = m_start + m_size; + const auto n_end = n_start + n_size; + + const auto m_threadid = thread_locator.get_position(0); + const auto n_threadid = thread_locator.get_position(1); + + execute_transpose(m_start, m_end, n_start, n_end, threadid, m_threadid, n_threadid); + } + + std::size_t get_working_size()const override { + /* + * Because we do not know how the scheduler will break up + * the task, we need to ensure that we allocate enough + * space to be able to handle the case where every thread + * is parallelised across B AND also every thread is parallelised across A + * + * If we parallelise across A, then we only need one buffer of A and 64 buffers of B + * If we parallelise across B, then we only need 64 buffer of B and + */ + return get_c_working_size()
* _maxthreads + + get_a_working_size() * _maxthreads + + get_b_working_size() * _maxthreads + + 64; //to account for cacheline alignment + } + + + void set_working_space(void *working_space) override { + // Make sure everything ends up cache line aligned + int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space); + intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space); + + size_t diff=0; + + if (working_space_int & 0x3F) { + diff = 0x40 - (working_space_int & 0x3F); + } + + working_space_bytes += diff; + + _working_space = reinterpret_cast<void *>(working_space_bytes); + } + + ~GemmInterleaved2d() override { } +}; + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp new file mode 100644 index 0000000000..eff4877198 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp @@ -0,0 +1,514 @@ +/* + * Copyright (c) 2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include "arm_gemm.hpp" +#include "utils.hpp" + +#include "mergeresults.hpp" +#include "transform.hpp" + +#ifdef CYCLE_PROFILING +#include "profiler.hpp" +#endif + +#include <algorithm> +#include <cassert> + +// Some macros used to decide how much working space to allocate. +// Round allocations up to the next cache line. +#define ALLOC_ROUND 64 +#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND) + +// Implementation of the GemmCommon abstract class. +// +// This implementation interleaves the source matrices in blocks - good for +// larger matrices. +namespace arm_gemm { + +template<typename strategy, typename To, typename Tr> +class GemmInterleavedPretransposed2d : public GemmCommon<To, Tr> { + typedef typename strategy::operand_type Toi; + typedef typename strategy::result_type Tri; + + /* const properties set by constructor */ + const CPUInfo * const _ci; + + const unsigned int _Msize; + const unsigned int _Nsize; + const unsigned int _Ksize; + + const unsigned int _nbatches; + const unsigned int _nmulti; + + const bool _trA; + const bool _trB; + + const Activation _act; + + const int _maxthreads; + int _nthreads; + + /* Blocking info */ + unsigned int _k_block=0; + unsigned int _x_block=0; + + unsigned int _Mround_div=0; + unsigned int _Mround=0; + unsigned int _Nround_div=0; + unsigned int _Nround=0; + + /* Working space, pretransposed buffer */ + const Toi *_B_transposed=nullptr; + void *_working_space=nullptr; + + /* We will need to walk through the blocks of B in a few contexts, so + * factor that out. */ + class blockwalker { + private: + /* Size loops, etc. 
based on our parent's configuration */ + const GemmInterleavedPretransposed2d<strategy, To, Tr> &_parent; + + /* K, X and multi parameters for current iteration. */ + unsigned int _k0=0, _x0=0, _xmin=0, _xmax=0, _multi=0; + + unsigned int _index=0; + bool _done=false; + bool _newkblock=true; + bool _newmulti=true; + + public: + blockwalker(const GemmInterleavedPretransposed2d<strategy, To, Tr> &parent) + : _parent(parent) + , _xmax { parent._Nsize } + { } + + blockwalker(const GemmInterleavedPretransposed2d<strategy, To, Tr> &parent, unsigned int x0, unsigned int xmax) + : _parent(parent) + , _x0 { x0 } + , _xmin { x0 } + , _xmax { xmax } + { + assert(_x0 <= _xmax); + } + + unsigned int xmax() { + return std::min(_x0 + _parent._x_block, _xmax); + } + + unsigned int kmax() { + return std::min(_k0 + _parent._k_block, _parent._Ksize); + } + + /* Advance to the next block, return false at the end. */ + bool advance(void) { + if (_done) { + return false; + } + + _newkblock=false; + _x0 += _parent._x_block; + if (_x0 >= _xmax) { + _x0=_xmin; + _k0 += _parent._k_block; + if (_k0 >= _parent._Ksize) { + _k0=0; + _multi++; + if (_multi >= _parent._nmulti) { + _done=true; + return false; + } + _newmulti=true; + } + _newkblock=true; + } + _index++; + + return true; + } + + unsigned int k0(void) { return _k0; } + unsigned int x0(void) { return _x0; } + unsigned int multi(void) { return _multi; } + unsigned int index(void) { return _index; } + bool done(void) { return _done; } + bool newkblock(void) { return _newkblock; } + }; + + // A working size: size of one buffer for the prepared A panels; get_working_size() reserves one per thread (_maxthreads of them). + size_t get_a_working_size() const { + return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches) * 2; + } + + // As B will be pretransposed we do not need to allocate any space for it + size_t get_b_working_size() const { + return 0; + } + + // C working size: One needed per thread.
+ size_t get_c_working_size() const { + return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height()); + } + + // Internal execute function. + // Handles the pretransposed case only: B is read from _B_transposed, which must already be set. + void execute_pretranspose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int mthreadid, int nthreadid) { + /* Make sure we've been set up correctly. */ + assert(_B_transposed); + assert(_working_space); + assert(this->_Aptr); + assert(this->_Cptr); + + UNUSED(mthreadid); + UNUSED(nthreadid); + +#ifdef CYCLE_PROFILING + profiler prof; +#endif + strategy strat(_ci); + + /* Translate 'start' and 'end' into a position within the batches and rows. */ + const unsigned int window_per_batch = _Mround / strategy::out_height(); + unsigned int batch_0 = m_start / window_per_batch; + unsigned int batch_end = m_end / window_per_batch; + + /* Compute the M values to operate on */ + unsigned int m_0 = (m_start - (batch_0 * window_per_batch)) * strategy::out_height(); + unsigned int m_max = (m_end - (batch_end * window_per_batch)) * strategy::out_height(); + + unsigned int n_0 = std::min(this->_Nsize, strategy::out_width() * n_start); + unsigned int n_max = std::min(this->_Nsize, strategy::out_width() * n_end); + + blockwalker current(*this, n_0, n_max); + + int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space); + + auto c_panel_start = working_space_bytes; + auto a_panel_start = c_panel_start + get_c_working_size() * _maxthreads; + + auto c_panel = reinterpret_cast<Tri *>(c_panel_start + get_c_working_size() * threadid); + auto a_panel = reinterpret_cast<Toi *>(a_panel_start + get_a_working_size() * threadid); + + /* B^t is stored in interleaved panels separated by their K-block component + * we want to store a pointer to the start of the current k-page + * then when we come to the next k-block we just add the size of the previous to + * this base pointer + */ + const Toi
*b_panel_start = _B_transposed; + // b_panels stores a pointer to the start of our current block inside of the k-block + const Toi *b_panel = b_panel_start; + + // newkblock() is always true on the first iteration, so this will be set properly on the first loop. + unsigned b_page_size = 0; + int kern_k = 0; + for (;!current.done();current.advance()) { + int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width()); + + if (current.newkblock()) { + kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll()); + kern_k *= strat.k_unroll(); + + unsigned b_thread_start_offset = iceildiv(current.x0(), strategy::out_width()); + + b_panel_start += b_page_size; + b_panel = b_panel_start + (b_thread_start_offset * strat.out_width() * kern_k); + b_page_size = _Nround * kern_k; + + for (unsigned int batch = batch_0; batch <= batch_end; batch++) { + unsigned int first_m = (batch == batch_0) ? m_0 : 0; + unsigned int last_m = (batch == batch_end) ? m_max : _Msize; + + if (first_m >= last_m) + continue; + + auto a_thread_panel_in = this->_Aptr + + (batch * this->_A_batch_stride) + + (current.multi() * this->_A_multi_stride); + + auto a_thread_panel_out = a_panel + ((batch * _Mround + first_m) * _k_block); + + strat.transforms.PrepareA( + a_thread_panel_out, + a_thread_panel_in, + this->_lda, + first_m, + last_m, + current.k0(), + current.kmax(), + _trA); + } + } + + /* Do the actual work. */ + for (unsigned int batch = batch_0; batch <= batch_end; batch++) { + unsigned int first_m = (batch == batch_0) ? m_0 : 0; + unsigned int last_m = (batch == batch_end) ? 
m_max : _Msize; + + const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block; + + if (first_m >= last_m) + continue; + + for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) { + unsigned int ymax = std::min(_Msize, y + strategy::out_height()); + + strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k); + a_ptr += (strategy::out_height() * kern_k); + + /* Only activate on last pass, only add bias on first pass, ask for accumulation on any non-first pass */ + const bool first_pass = current.k0()==0; + const bool last_pass = current.kmax()==_Ksize; + + auto c_panel_out = this->_Cptr + + this->_C_batch_stride * batch + + this->_C_multi_stride * current.multi(); + + auto bias = (first_pass && this->_bias) + ? this->_bias + (current.multi() * this->_bias_multi_stride) + : nullptr; + + auto act = last_pass ? _act : Activation(); + + strat.transforms.Merge( + c_panel_out, + c_panel, + this->_ldc, + y, + ymax, + current.x0(), + current.xmax(), + bias, + act, + !first_pass); //Append + } + } + + b_panel += (bblocks * strat.out_width() * kern_k); + } + } + +public: + GemmInterleavedPretransposed2d(GemmInterleavedPretransposed2d &) = delete; + GemmInterleavedPretransposed2d & operator= (GemmInterleavedPretransposed2d &) = delete; + + /* Constructor */ + GemmInterleavedPretransposed2d(const GemmArgs &args) + : _ci(args._ci) + , _Msize(args._Msize) + , _Nsize(args._Nsize) + , _Ksize(args._Ksize) + , _nbatches(args._nbatches) + , _nmulti(args._nmulti) + , _trA(args._trA) + , _trB(args._trB) + , _act(args._act) + , _maxthreads(args._maxthreads) + , _nthreads(args._maxthreads) + + // Work out the rounded size of M - needed for some buffers. 
+ , _Mround_div ( iceildiv(_Msize, strategy::out_height()) ) + , _Mround ( _Mround_div * strategy::out_height() ) + + , _Nround_div ( iceildiv(_Nsize, strategy::out_width()) ) + , _Nround ( _Nround_div * strategy::out_width() ) + { + + assert(args._pretransposed_hint); + assert(_maxthreads > 0); + + const unsigned int L1_size = _ci->get_L1_cache_size(); + const unsigned int L2_size = _ci->get_L2_cache_size(); + + // Work out blocking parameters, or override from provided GemmConfig + if (args._cfg && args._cfg->inner_block_size) { + _k_block = args._cfg->inner_block_size; + } else { + // k_block: Find out how much of the larger array can be loaded into half the cache. + // This should account for associative caches. + _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height()))); + + // Needs to be (at least a single) multiple of the K unroll level. + _k_block /= strategy::k_unroll(); + _k_block = std::max(_k_block, 1U) * strategy::k_unroll(); + + // Now tune to presented problem size; this is how many blocks we need. + unsigned int num_k_blocks = iceildiv(_Ksize, _k_block); + + // So divide the space equally into that many blocks. + _k_block = iceildiv(_Ksize, num_k_blocks); + + // And round UP to the K unroll level required. + _k_block = iceildiv(_k_block, strategy::k_unroll()); + _k_block *= strategy::k_unroll(); + } + + if (args._cfg && args._cfg->outer_block_size) { + _x_block = args._cfg->outer_block_size; + } else { + // x_block: Work out how many rows (of length k_block) will fit in the L2 + // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. + _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / + (sizeof(Toi) * _k_block); + + // Needs to be (at least a single) multiple of the kernel output width. 
+ _x_block /= strategy::out_width(); + _x_block = std::max(_x_block, 1U) * strategy::out_width(); + + // And tune to the presented problem size. + unsigned int num_x_blocks = iceildiv(_Nsize, _x_block); + _x_block = iceildiv(_Nsize, num_x_blocks); + + _x_block = iceildiv(_x_block, strategy::out_width()); + _x_block *= strategy::out_width(); + } + } + + // Interface implementation - Compulsory functions + ndrange_t get_window_size() const override { + unsigned m = (_Mround / strategy::out_height()) * _nbatches; + unsigned n = _Nround_div; + + return { m, n, 1u, 1u, 1u, 1u }; + } + + // set_nthreads: record the thread count to use, clamped to _maxthreads. + void set_nthreads(int nthreads) override { + _nthreads = std::min(nthreads, _maxthreads); + } + + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + /* This particular GEMM implementation can only be broken up over the M & N + * dimensions, we inform the framework of this limitation via the get_window_size function + */ + assert(ndrange_popcount(work_range) <= 2); + + const auto m_start = work_range.get_position(0); + const auto n_start = work_range.get_position(1); + const auto m_size = work_range.get_size(0); + const auto n_size = work_range.get_size(1); + const auto m_end = m_start + m_size; + const auto n_end = n_start + n_size; + + const auto m_threadid = thread_locator.get_position(0); + const auto n_threadid = thread_locator.get_position(1); + + execute_pretranspose(m_start, m_end, n_start, n_end, threadid, m_threadid, n_threadid); + } + + std::size_t get_working_size()const override { + /* Because we do not know how the scheduler will break up + * the task, we need to ensure that we allocate enough + * space to be able to handle the case where every thread + * is parallelised across B AND also every thread is parallelised across A + * + * If we parallelise across A, then we only need one buffer of A and 64 buffers of B + * If we parallelise across
B, then we only need 64 buffer of B and + */ + return get_c_working_size() * _maxthreads + + get_a_working_size() * _maxthreads + + 64; //to account for cacheline alignment + } + + + void set_working_space(void *working_space) override { + // Make sure everything ends up cache line aligned + int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space); + intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space); + + size_t diff=0; + + if (working_space_int & 0x3F) { + diff = 0x40 - (working_space_int & 0x3F); + } + + working_space_bytes += diff; + + _working_space = reinterpret_cast<void *>(working_space_bytes); + } + + // Interface implementation - pretransposed + bool B_is_pretransposed() const override { + return true; + } + + bool B_pretranspose_required() const override { + return _B_transposed==nullptr; + } + + // TODO: this could almost certainly be considerably simpler. + size_t get_B_pretransposed_array_size() const override { + size_t total=0; + blockwalker current(*this); + + do { + /* Figure out the size of each block. */ + unsigned int x_size = (current.xmax() - current.x0()); + unsigned int k_size = (current.kmax() - current.k0()); + + /* Round sizes up as needed. */ + x_size = iceildiv(x_size, strategy::out_width()); + x_size *= strategy::out_width(); + + k_size = iceildiv(k_size, strategy::k_unroll()); + k_size *= strategy::k_unroll(); + + total += x_size * k_size * sizeof(Toi); + } while (current.advance()); + + return total; + } + + void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { + blockwalker current(*this); + Toi *buffer = reinterpret_cast<Toi *>(in_buffer); + _B_transposed = buffer; + strategy strat(_ci); + + do { + /* Figure out the size of each block. */ + unsigned int x_size = (current.xmax() - current.x0()); + unsigned int k_size = (current.kmax() - current.k0()); + + /* Round sizes up as needed. 
*/ + x_size = iceildiv(x_size, strategy::out_width()); + x_size *= strategy::out_width(); + + k_size = iceildiv(k_size, strategy::k_unroll()); + k_size *= strategy::k_unroll(); + + strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb, + current.x0(), current.xmax(), current.k0(), current.kmax(), _trB); + + buffer += (x_size * k_size); + } while (current.advance()); + } + + void set_pretransposed_B_data(void *in_buffer) override { + _B_transposed = reinterpret_cast<Toi *>(in_buffer); + } + + ~GemmInterleavedPretransposed2d() override { } +}; + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp index fe6ebef045..c2f742b5cf 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -87,8 +87,8 @@ public: _window_range(iceildiv(_Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmultis) { } // Window is amount per multi multiplied by total number of multis. - unsigned int get_window_size() const override { - return _window_range.total_size(); + ndrange_t get_window_size() const override { + return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u }; } // Native GEMMs can always be dynamically scheduled (whether requested or not) @@ -97,7 +97,7 @@ public: } // Actually execute the GEMM. 
- void execute(unsigned int start, unsigned int end, int) override { + void execute_1d(unsigned int start, unsigned int end, int) { #ifdef CYCLE_PROFILING profiler prof; #endif @@ -139,6 +139,16 @@ public: } } while (p.next_dim1()); } + + //Execute + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + UNUSED(thread_locator); + + const auto start = work_range.get_position(0); + const auto stop = work_range.get_position_end(0); + + execute_1d(start, stop, threadid); + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp index be2f5614be..939788ed8d 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -58,7 +58,7 @@ public: UNUSED(ldc); } - unsigned int get_window_size() const override { + ndrange_t get_window_size() const override { return _subgemm->get_window_size(); } @@ -66,8 +66,8 @@ public: _subgemm->set_nthreads(nthreads); } - void execute(unsigned int start, unsigned int end, int threadid) override { - _subgemm->execute(start, end, threadid); + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + _subgemm->execute(work_range, thread_locator, threadid); } size_t get_working_size() const override { diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp index 49681ec404..190f4aa643 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -72,12 +72,12 @@ public: } // Window is number of out_width blocks times number of multis. - unsigned int get_window_size() const override { - return iceildiv(_Nsize, strategy::out_width()) * _nmultis; + ndrange_t get_window_size() const override { + return { iceildiv(_Nsize, strategy::out_width()) * _nmultis, 1u, 1u, 1u, 1u, 1u }; } // Actually execute the GEMV. - void execute(unsigned int start, unsigned int end, int) override { + void execute_1d(unsigned int start, unsigned int end, int) { #ifdef CYCLE_PROFILING profiler prof; #endif @@ -127,6 +127,17 @@ public: } } } + + // Execute + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + UNUSED(thread_locator); + + const auto start = work_range.get_position(0); + const auto size = work_range.get_size(0); + const auto stop = start + size; + + execute_1d(start, stop, threadid); + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp index 26fdfba8ff..7f52ac5a14 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -86,12 +86,12 @@ public: } // Window is number of out_width blocks, times number of multis. - unsigned int get_window_size() const override { - return iceildiv(_Nsize, strategy::out_width()) * _nmultis; + ndrange_t get_window_size() const override { + return { iceildiv(_Nsize, strategy::out_width()) * _nmultis, 1u, 1u, 1u, 1u, 1u }; } // Actually execute the GEMV. 
- void execute(unsigned int start, unsigned int end, int) override { + void execute_1d(unsigned int start, unsigned int end, int) { #ifdef CYCLE_PROFILING profiler prof; #endif @@ -145,6 +145,17 @@ public: } } + // Execute + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + UNUSED(thread_locator); + + const auto start = work_range.get_position(0); + const auto size = work_range.get_size(0); + const auto stop = start + size; + + execute_1d(start, stop, threadid); + } + /* Pretransposed interface implementation */ bool B_is_pretransposed() const override { return true; diff --git a/src/core/NEON/kernels/arm_gemm/ndrange.hpp b/src/core/NEON/kernels/arm_gemm/ndrange.hpp index 20824dfc8b..0c068db011 100644 --- a/src/core/NEON/kernels/arm_gemm/ndrange.hpp +++ b/src/core/NEON/kernels/arm_gemm/ndrange.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,16 +23,19 @@ */ #pragma once +#include <array> #include <algorithm> #include <initializer_list> +#include <cassert> + namespace arm_gemm { template<unsigned int D> class NDRange { private: - unsigned int m_sizes[D]; - unsigned int m_totalsizes[D]; + std::array<unsigned int, D> m_sizes {}; + std::array<unsigned int, D> m_totalsizes {}; class NDRangeIterator { private: @@ -81,8 +84,25 @@ private: }; public: + NDRange& operator=(const NDRange& rhs)=default; + NDRange(const NDRange& rhs) =default; + template <typename... T> - NDRange(T... ts) : m_sizes{ts...} { + NDRange(T... 
ts) + : m_sizes{ts...} + { + unsigned int t=1; + + for (unsigned int i=0; i<D; i++) { + t *= m_sizes[i]; + + m_totalsizes[i] = t; + } + } + + NDRange(const std::array<unsigned int, D>& n) + : m_sizes{n} + { unsigned int t=1; for (unsigned int i=0; i<D; i++) { @@ -105,4 +125,61 @@ public: } }; +/** NDCoordinate builds upon a range, but specifies a starting position + * in addition to a size which it inherits from NDRange + */ +template<unsigned int N> +class NDCoordinate : public NDRange<N> { + using int_t =unsigned int; + using ndrange_t = NDRange<N>; + + std::array<int_t, N> m_positions {}; +public: + NDCoordinate& operator=(const NDCoordinate& rhs)=default; + NDCoordinate(const NDCoordinate& rhs) =default; + NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>>& list) + { + std::array<int_t, N> sizes{}; // value-initialised: dimensions missing from list get size 0, never an indeterminate value + + std::size_t i = 0; + for(auto& p : list) { + m_positions[i]= p.first; + sizes[i++] = p.second; + } + + //update the parent's sizes + static_cast<ndrange_t&>(*this) = ndrange_t(sizes); + } + + int_t get_position(int_t d) const { + assert(d < m_positions.size()); + return m_positions[d]; + } + + void set_position(int_t d, int_t v) { + assert(d < m_positions.size()); // member size(), as in get_position; does not depend on C++17 std::size + assert(v < ndrange_t::get_size(d)); + + m_positions[d] = v; + } + + int_t get_position_end(int_t d) const { + return get_position(d) + NDRange<N>::get_size(d); + } +}; //class NDCoordinate + +/** @returns the number of dimensions in the NDRange whose size is not 1, + * i.e. the dimensions in which there is actual work that can be broken up + */ +template<unsigned int N> +std::size_t ndrange_popcount(const NDRange<N>& ndr) { + std::size_t count = 0; + + for(unsigned int d = 0; d != N; ++d) { + if(ndr.get_size(d) != 1) + ++count; + } + return count; +} + } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp index 345060f206..18f030fec0 100644 --- a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp +++
b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -148,7 +148,7 @@ public: set_child_arrays(); } - unsigned int get_window_size() const override { + ndrange_t get_window_size() const override { return _subgemm->get_window_size(); } @@ -158,8 +158,9 @@ public: _args._maxthreads = nthreads; } - void execute(unsigned int start, unsigned int end, int threadid) override { - _subgemm->execute(start, end, threadid); + // Execute + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + _subgemm->execute(work_range, thread_locator, threadid); if (!_args._pretransposed_hint) { col_sums_runtime(threadid); } diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp index e684eeee98..0a03497cb9 100644 --- a/src/runtime/CPP/CPPScheduler.cpp +++ b/src/runtime/CPP/CPPScheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -71,6 +71,61 @@ private: const unsigned int _end; }; +/** Given two dimensions and a maximum number of threads to utilise, calculate the best + * combination of threads that fit in (multiplied together) max_threads.
+ * + * This algorithm assumes that work in either of the dimensions is equally difficult + * to compute + * + * @returns [m_nthreads, n_nthreads] A pair of the threads that should be used in each dimension + */ +std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n) +{ + /* + * We want the same ratio of threads in M & N to the ratio of m and n problem size + * + * Therefore: mt/nt == m/n where mt*nt == max_threads + * + * max_threads/mt = nt & mt = nt * (m/n) + * mt^2 = max_threads * (m/n) + * mt = sqrt( max_threads * (m/n) ) + */ + //ratio of m to n in problem dimensions (taken as 0 when n == 0 so we never divide by zero) + const double ratio = (n == 0) ? 0.0 : m / static_cast<double>(n); + + // mt = sqrt(max_threads * (m / n) ) + const unsigned adjusted = std::round( + std::sqrt(max_threads * ratio)); + + //find the nearest factor of max_threads + for(unsigned i = 0; i!= adjusted; ++i) + { + //try down + const unsigned adj_down = adjusted - i; + if(max_threads % adj_down == 0) + { + return { adj_down, max_threads / adj_down }; + } + + //try up + const unsigned adj_up = adjusted + i; + if(max_threads % adj_up == 0) + { + return { adj_up, max_threads / adj_up }; + } + } + + //only reached when adjusted == 0 (the loop always ends at the factor 1 otherwise): give every thread to the largest dimension + if(m > n) + { + return{ static_cast<unsigned>(std::min<std::size_t>(m, max_threads)), 1 }; + } + else + { + return{ 1, static_cast<unsigned>(std::min<std::size_t>(n, max_threads)) }; + } +} + /** Execute workloads[info.thread_id] first, then call the feeder to get the index of the next workload to run. * * Will run workloads until the feeder reaches the end of its range.
@@ -314,50 +369,95 @@ void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints) ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); const Window &max_window = kernel->window(); - const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); - const unsigned int num_threads = std::min(num_iterations, _impl->_num_threads); - if(num_iterations == 0) + if(hints.split_dimension() == IScheduler::split_dimensions_all) { - return; - } + /* + * if the split dim is size_t max then this signals we should parallelise over + * all dimensions + */ + const std::size_t m = max_window.num_iterations(Window::DimX); + const std::size_t n = max_window.num_iterations(Window::DimY); + + //in c++17 this can be swapped for auto [ m_threads, n_threads ] = split_2d(... + unsigned m_threads, n_threads; + std::tie(m_threads, n_threads) = split_2d(_impl->_num_threads, m, n); + + std::vector<IScheduler::Workload> workloads; + for(unsigned int ni = 0; ni != n_threads; ++ni) + { + for(unsigned int mi = 0; mi != m_threads; ++mi) + { + workloads.push_back( + [ ni, mi, m_threads, n_threads, &max_window, &kernel ] + (const ThreadInfo & info) + { + //narrow the window to our mi-ni workload + Window win = max_window.split_window(Window::DimX, mi, m_threads) + .split_window(Window::DimY, ni, n_threads); - if(!kernel->is_parallelisable() || num_threads == 1) - { - ThreadInfo info; - info.cpu_info = &_cpu_info; - kernel->run(max_window, info); + win.validate(); + + Window thread_locator; + thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads)); + thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads)); + + thread_locator.validate(); + + kernel->run_nd(win, info, thread_locator); + } + ); + } + } + run_workloads(workloads); } else { - unsigned int num_windows = 0; - switch(hints.strategy()) + const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); + const unsigned int num_threads = 
std::min(num_iterations, _impl->_num_threads); + + if(num_iterations == 0) { - case StrategyHint::STATIC: - num_windows = num_threads; - break; - case StrategyHint::DYNAMIC: - { - const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold()); - // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder - num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations; - break; - } - default: - ARM_COMPUTE_ERROR("Unknown strategy"); + return; } - std::vector<IScheduler::Workload> workloads(num_windows); - for(unsigned int t = 0; t < num_windows; t++) + + if(!kernel->is_parallelisable() || num_threads == 1) { - //Capture 't' by copy, all the other variables by reference: - workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info) + ThreadInfo info; + info.cpu_info = &_cpu_info; + kernel->run(max_window, info); + } + else + { + unsigned int num_windows = 0; + switch(hints.strategy()) + { + case StrategyHint::STATIC: + num_windows = num_threads; + break; + case StrategyHint::DYNAMIC: + { + const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold()); + // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder + num_windows = num_iterations > granule_threshold ? 
granule_threshold : num_iterations; + break; + } + default: + ARM_COMPUTE_ERROR("Unknown strategy"); + } + std::vector<IScheduler::Workload> workloads(num_windows); + for(unsigned int t = 0; t < num_windows; t++) { - Window win = max_window.split_window(hints.split_dimension(), t, num_windows); - win.validate(); - kernel->run(win, info); - }; + //Capture 't' by copy, all the other variables by reference: + workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info) + { + Window win = max_window.split_window(hints.split_dimension(), t, num_windows); + win.validate(); + kernel->run(win, info); + }; + } + run_workloads(workloads); } - run_workloads(workloads); } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp index a3080e7f29..24bd7d7a8c 100644 --- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp +++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp @@ -280,8 +280,8 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, c //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001 { - const int window_size = _gemm_kernel_asm->get_window_size(); - if(window_size < args._maxthreads) + const unsigned int window_size = get_total_window_size(*_gemm_kernel_asm); + if(window_size < static_cast<unsigned int>(args._maxthreads)) { _gemm_kernel_asm->set_nthreads(window_size); } @@ -404,7 +404,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run() if(_workspace.buffer() != nullptr) { _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace.buffer())); - const unsigned int window_size = _gemm_kernel_asm->get_window_size(); + const unsigned int window_size = get_total_window_size(*_gemm_kernel_asm); unsigned int num_threads = NEScheduler::get().num_threads(); if(window_size < 
num_threads) { @@ -427,14 +427,21 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run() in1_ptr, ldb, multi_stride_b, out_ptr, ldd, batch_stride_d, multi_stride_d, bias, 0); - // Schedule assembly kernel IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX); if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && _d->info()->data_type() == DataType::F32) { const int granule_threshold = 200; scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold); + + } + else if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && _d->info()->data_type() == DataType::F32) + { + //GEMM_INTERLEAVED_2D supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions + const int granule_threshold = 200; + scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); } + NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint); } |