-rw-r--r--  arm_compute/core/CPP/ICPPKernel.h                                    |  22
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h |  35
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp                  |   3
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp    | 121
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/gemm_common.hpp               |  40
-rw-r--r--  arm_compute/runtime/IScheduler.h                                     |  10
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp                         |  24
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp                       |  20
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp             |  20
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp                  |  20
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp               | 449
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp | 514
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_native.hpp                       |  18
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemv_batched.hpp                      |   8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp            |  19
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp                |  19
-rw-r--r--  src/core/NEON/kernels/arm_gemm/ndrange.hpp                           |  85
-rw-r--r--  src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp                  |   9
-rw-r--r--  src/runtime/CPP/CPPScheduler.cpp                                     | 168
-rw-r--r--  src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp                |  15
20 files changed, 1518 insertions(+), 101 deletions(-)
diff --git a/arm_compute/core/CPP/ICPPKernel.h b/arm_compute/core/CPP/ICPPKernel.h
index f41567ee11..ec05af20bd 100644
--- a/arm_compute/core/CPP/ICPPKernel.h
+++ b/arm_compute/core/CPP/ICPPKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,7 +49,25 @@ public:
* @param[in] window Region on which to execute the kernel. (Must be a region of the window returned by window())
* @param[in] info Info about executing thread and CPU.
*/
- virtual void run(const Window &window, const ThreadInfo &info) = 0;
+ virtual void run(const Window &window, const ThreadInfo &info)
+ {
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR("default implementation of legacy run() virtual member function invoked");
+ }
+
+ /** Legacy compatibility layer for implementations which do not support thread_locator
+ * In these cases we simply narrow the interface down to the legacy version
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a region of the window returned by window())
+ * @param[in] info Info about executing thread and CPU.
+ * @param[in] thread_locator Specifies "where" the current thread is in the multi-dimensional space
+ */
+ virtual void run_nd(const Window &window, const ThreadInfo &info, const Window &thread_locator)
+ {
+ ARM_COMPUTE_UNUSED(thread_locator);
+ run(window, info);
+ }
/** Name of the kernel
*
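The default run_nd() above is what keeps existing kernels working: anything that only overrides run() can still be driven through the new n-dimensional entry point. A minimal sketch of that behaviour, assuming only the arm_compute headers touched by this patch (the LegacyKernel class and its body are hypothetical, not part of the change):

    #include "arm_compute/core/CPP/ICPPKernel.h"

    // Hypothetical kernel that predates run_nd(): it only overrides run().
    class LegacyKernel : public arm_compute::ICPPKernel
    {
    public:
        const char *name() const override
        {
            return "LegacyKernel";
        }

        void run(const arm_compute::Window &window, const arm_compute::ThreadInfo &info) override
        {
            (void)window;
            (void)info;
            // 1D work would happen here.
        }
    };

    // A scheduler that knows about thread locators can still call
    //   kernel.run_nd(window, info, thread_locator);
    // which, through the default implementation above, simply forwards to run(window, info).
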
diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h b/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
index d612681c41..0e3dd74577 100644
--- a/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
+++ b/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#ifndef ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H
#define ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H
+#include "arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp"
#include "arm_compute/core/NEON/INEKernel.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
@@ -65,15 +66,33 @@ public:
{
return _name.c_str();
}
- // Inherited methods overridden:
+
+
void run(const Window &window, const ThreadInfo &info) override
{
ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- auto first = window.x().start();
- auto last = window.x().end();
- _kernel->execute(first, last, info.thread_id);
+
+ auto win = arm_gemm::to_ndcoord(window);
+
+ arm_gemm::ndcoord_t thread_locator { };
+
+ _kernel->execute(win, thread_locator, info.thread_id);
}
+
+ // Inherited methods overridden:
+ void run_nd(const Window &window, const ThreadInfo &info, const Window &thread_locator) override
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+
+ //convert between arm_compute and arm_gemm types
+ auto ndc_win = arm_gemm::to_ndcoord(window);
+ auto ndc_tlc = arm_gemm::to_ndcoord(thread_locator);
+
+ _kernel->execute(ndc_win, ndc_tlc, info.thread_id);
+ }
+
/** Initialise the kernel's input and output.
*
* @param[in] kernel Pointer to an assembly kernel implementation.
@@ -83,9 +102,9 @@ public:
{
ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
_kernel = kernel;
- auto win_last = _kernel->get_window_size();
- Window win;
- win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+
+ Window win = to_window(kernel->get_window_size());
+
INEKernel::configure(win);
if(!kernel_name_tag.empty())
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
index e89523981d..7723224ec8 100644
--- a/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,6 +40,7 @@ enum class GemmMethod
GEMM_NATIVE,
GEMM_HYBRID,
GEMM_INTERLEAVED,
+ GEMM_INTERLEAVED_2D,
QUANTIZE_WRAPPER,
GEMM_HYBRID_QUANTIZED
};
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp
new file mode 100644
index 0000000000..7dff01003d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/Dimensions.h"
+#include "src/core/NEON/kernels/arm_gemm/ndrange.hpp"
+
+#include <cassert>
+
+/* This file contains the mapping between integral types used in arm_compute and arm_gemm.
+ * These two codebases both require a degree of separation for the sake of modularity,
+ * so they maintain their own types which represent similar information.
+ */
+
+namespace arm_gemm {
+
+//we want to unify the maximum number of dimensions used between arm_gemm and the Arm Compute Library
+constexpr std::size_t ndrange_max =
+ arm_compute::Dimensions<unsigned int>::num_max_dimensions;
+
+using ndrange_t=NDRange<ndrange_max>;
+using ndcoord_t=NDCoordinate<ndrange_max>;
+
+/* Converts an `arm_gemm::ndrange_t` to an `arm_compute::Window`
+ *
+ * As `NDRange<T>` does not encode start positions, we specify
+ * the start to be zero in the produced `arm_compute::Window`
+ *
+ * @param [ndr] the `arm_gemm::ndrange_t` we wish to convert into an `arm_compute::Window`
+ * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndr`
+ */
+inline arm_compute::Window to_window(const ndrange_t& ndr) {
+ arm_compute::Window win;
+
+ for(unsigned int i = 0; i!=ndrange_max; ++i) {
+ //populate the window with the dimensions of the NDRange
+ win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i)));
+ }
+
+ return win;
+}
+
+/*
+ * Converts an `arm_gemm::ndcoord_t` to an `arm_compute::Window`
+ *
+ * @param [ndc] the `arm_gemm::ndcoord_t` we wish to convert into an `arm_compute::Window`
+ * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndc`
+ */
+inline arm_compute::Window to_window(const ndcoord_t& ndc) {
+ arm_compute::Window win;
+
+ for(unsigned int i = 0; i!=ndrange_max; ++i) {
+ const auto start = ndc.get_position(i);
+ const auto size = ndc.get_size(i);
+ const auto stop = start + size;
+
+ //populate the window with the dimensions of the NDRange
+ win.set(i, arm_compute::Window::Dimension(start, stop));
+ }
+
+ return win;
+}
+
+/** Convert an `arm_compute::Window` to an `arm_gemm::NDRange` of the same max dimensions
+ *
+ * It should be noted that `arm_compute::Window` specifies a `start()` and an `end()`
+ * whereas `arm_gemm::ndrange_t` only has a size; as a result we store the difference between `end()` and `start()`
+ *
+ * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndrange_t`
+ * @return the resultant ndrange_t
+ */
+inline ndrange_t to_ndrange(const arm_compute::Window& win) {
+ return {
+ static_cast<unsigned int>(win[0].end() - win[0].start()),
+ static_cast<unsigned int>(win[1].end() - win[1].start()),
+ static_cast<unsigned int>(win[2].end() - win[2].start()),
+ static_cast<unsigned int>(win[3].end() - win[3].start()),
+ static_cast<unsigned int>(win[4].end() - win[4].start()),
+ static_cast<unsigned int>(win[5].end() - win[5].start())
+ };
+}
+
+/** Convert an `arm_compute::Window` to an `arm_gemm::NDCoord` of the same max dimensions
+ *
+ * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndcoord_t`
+ * @return the resultant ndcoord_t
+ */
+inline ndcoord_t to_ndcoord(const arm_compute::Window& win) {
+ return {
+ { static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start()) },
+ { static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start()) },
+ { static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start()) },
+ { static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start()) },
+ { static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start()) },
+ { static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start()) }
+ };
+}
+
+} //namespace arm_gemm
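A short usage sketch of the helpers above, assuming the Compute Library build environment (the dimension values are made up for illustration): each Window dimension's [start, end) range is stored in an ndcoord_t as a {start, size} pair, so the conversion round-trips.

    #include "arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp"

    void round_trip_example()
    {
        arm_compute::Window win;
        win.set(0, arm_compute::Window::Dimension(8, 24)); // M: [8, 24)
        win.set(1, arm_compute::Window::Dimension(0, 4));  // N: [0, 4)

        // Window -> NDCoordinate keeps {start, end - start} for every dimension...
        arm_gemm::ndcoord_t ndc = arm_gemm::to_ndcoord(win);

        // ...so converting back yields the same [start, stop) ranges again.
        arm_compute::Window back = arm_gemm::to_window(ndc);
        (void)back;
    }
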
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
index d17fd5fe97..ea9b524e15 100644
--- a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,10 @@
*/
#pragma once
+#include "arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp"
+
#include <cstddef>
+#include <cassert>
#define UNUSED(x) (void)(x)
@@ -51,10 +54,10 @@ public:
void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0;
- /* For threading, we divide the work into some number of units and work
- * out internally what unit corresponds to what work. This returns the
- * total number of units. */
- virtual unsigned int get_window_size() const = 0;
+ /** @returns an ndrange containing ranges of the compute space which can be
+ * broken up and parallelised over
+ */
+ virtual ndrange_t get_window_size() const = 0;
/* The maximum thread count is specified when the GEMM is created. Some
* implementations need to know how many threads will actually run in
@@ -73,9 +76,12 @@ public:
/* Whether this GEMM can be dynamically scheduled or not. */
virtual bool supports_dynamic_scheduling() const { return false; }
- /* Actually do the work. Provide a threadid to index any per-thread
- * buffers, and a start/end range to indicate which work to do. */
- virtual void execute(unsigned int, unsigned int, int) = 0;
+ /** Main execute member function
+ * @param [in] work_range specifies the range of work to be computed; the total range is defined by get_window_size()
+ * @param [in] thread_locator where the current thread is within the thread space
+ * @param [in] threadid a unique thread id
+ */
+ virtual void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) = 0;
/*** Working space interface (optional) ***/
/* Total number of bytes of temporary working space needed. If zero, it's not necessary to call set_working_space(). */
@@ -108,8 +114,7 @@ public:
virtual ~IGemmCommon() { }
};
-/*
- * "Real" GemmCommon class which is templated on the operand and return types.
+/* "Real" GemmCommon class which is templated on the operand and return types.
*
* In addition to correctly typed versions of the functions that operate on
* operand and return data, this class provides a default implementation of
@@ -178,4 +183,19 @@ public:
}
};
+template<typename GemmKernel>
+inline
+unsigned int get_total_window_size(const GemmKernel& kernel)
+{
+ auto window = kernel.get_window_size();
+
+ unsigned int total = 1;
+ for(unsigned i = 0; i != arm_gemm::ndrange_max; ++i)
+ {
+ total *= window.get_size(i);
+ }
+
+ return total;
+}
+
} // namespace arm_gemm
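To illustrate how a caller is expected to drive the new execute() interface, here is a hedged sketch (not part of the patch; run_slice and the thread id are placeholder names). It hands one call the first half of dimension 0 of the compute space and the whole of dimension 1, leaving the remaining dimensions degenerate:

    #include "arm_compute/core/NEON/kernels/assembly/gemm_common.hpp"

    #include <algorithm>

    void run_slice(arm_gemm::IGemmCommon &gemm, int threadid)
    {
        using namespace arm_gemm;

        const ndrange_t full = gemm.get_window_size();

        // First half of dimension 0, all of dimension 1; unused dimensions stay {0, 1}.
        const unsigned int half = std::max(1u, full.get_size(0) / 2);

        const ndcoord_t work_range     { { 0u, half }, { 0u, full.get_size(1) }, { 0u, 1u }, { 0u, 1u }, { 0u, 1u }, { 0u, 1u } };
        const ndcoord_t thread_locator { { 0u, 1u },   { 0u, 1u },               { 0u, 1u }, { 0u, 1u }, { 0u, 1u }, { 0u, 1u } };

        gemm.execute(work_range, thread_locator, threadid);
    }
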
diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
index f68294016a..a5e20ee627 100644
--- a/arm_compute/runtime/IScheduler.h
+++ b/arm_compute/runtime/IScheduler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,7 @@
#include "arm_compute/core/CPP/CPPTypes.h"
#include <functional>
+#include <limits>
namespace arm_compute
{
@@ -42,6 +43,13 @@ public:
STATIC, /**< Split the workload evenly among the threads */
DYNAMIC, /**< Split the workload dynamically using a bucket system */
};
+
+ /** When arm_compute::IScheduler::Hints::_split_dimension is initialized with this value
+ * the scheduler is free to break down the problem space over as many dimensions
+ * as it wishes
+ */
+ static constexpr unsigned int split_dimensions_all = std::numeric_limits<unsigned>::max();
+
/** Scheduler hints
*
* Collection of preferences set by the function regarding how to split a given workload
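A hedged sketch of how a function might ask for the fully flexible split introduced above. It assumes the existing Scheduler::get().schedule(kernel, hints) entry point and the Hints(split_dimension) constructor, neither of which is modified in this patch:

    #include "arm_compute/runtime/IScheduler.h"
    #include "arm_compute/runtime/Scheduler.h"

    void schedule_over_any_dimension(arm_compute::ICPPKernel *kernel)
    {
        using arm_compute::IScheduler;

        // Let the scheduler decide how to break up the problem space,
        // across as many dimensions as it sees fit.
        arm_compute::Scheduler::get().schedule(kernel, IScheduler::Hints(IScheduler::split_dimensions_all));
    }
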
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index 96e3ce832c..e3355ed2d5 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -26,6 +26,8 @@
#include "gemm_hybrid.hpp"
#include "gemm_implementation.hpp"
#include "gemm_interleaved.hpp"
+#include "gemm_interleaved_2d.hpp"
+#include "gemm_interleaved_pretransposed_2d.hpp"
#include "gemm_native.hpp"
#include "gemv_batched.hpp"
#include "gemv_native_transposed.hpp"
@@ -144,13 +146,31 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
[](const GemmArgs &args) { return new GemmInterleaved<interleaved_fp32_mla_3VLx8, float, float>(args); }
},
#endif // __ARM_FEATURE_SVE
+//Pretranspose, 2D split
+{
+ GemmMethod::GEMM_INTERLEAVED_2D,
+ "sgemm_12x8",
+ [](const GemmArgs &args) { return args._pretransposed_hint; },
+ [](const GemmArgs &args) { return args._pretransposed_hint; },
+ [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<sgemm_12x8, float, float>(args); }
+},
+//Transpose, 2D split, no blockmanager
+{
+ GemmMethod::GEMM_INTERLEAVED_2D,
+ "sgemm_12x8",
+ [](const GemmArgs &args) { return (!args._pretransposed_hint) && args._maxthreads >= 8; },
+ [](const GemmArgs &args) { return (!args._pretransposed_hint) && args._maxthreads >= 8; },
+ [](const GemmArgs &args) { return new GemmInterleaved2d<sgemm_12x8, float, float>(args); }
+},
+//Transpose, 1D split, with blockmanager
{
GemmMethod::GEMM_INTERLEAVED,
"sgemm_12x8",
- nullptr,
- nullptr,
+ [](const GemmArgs &args) { return (!args._pretransposed_hint); },
+ [](const GemmArgs &args) { return (!args._pretransposed_hint); },
[](const GemmArgs &args) { return new GemmInterleaved<sgemm_12x8, float, float>(args); }
},
+
#endif // __aarch64__
#ifdef __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index c3abb04db7..0cb3160de4 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -142,8 +142,8 @@ public:
_window_range(iceildiv(args._Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmulti) { }
// Interface implementation - Compulsory functions
- unsigned int get_window_size() const override {
- return _window_range.total_size();
+ ndrange_t get_window_size() const override {
+ return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u };
}
// This kernel can always be dynamically scheduled.
@@ -151,8 +151,7 @@ public:
return true;
}
- // Execute
- void execute(unsigned int start, unsigned int end, int threadid) override {
+ void execute_1d(unsigned int start, unsigned int end, int threadid) {
UNUSED(threadid);
#ifdef CYCLE_PROFILING
profiler prof;
@@ -215,6 +214,17 @@ public:
}
}
+ // Execute
+ void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+ UNUSED(thread_locator);
+
+ const auto start = work_range.get_position(0);
+ const auto size = work_range.get_size(0);
+ const auto stop = start + size;
+
+ execute_1d(start, stop, threadid);
+ }
+
// Interface implementation - pretransposed
bool B_is_pretransposed() const override {
return true;
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
index 22b6960baf..3d7ad99d1e 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -149,8 +149,8 @@ public:
_qp (qp), _nthreads(args._maxthreads) { }
// Interface implementation - Compulsory functions
- unsigned int get_window_size() const override {
- return _window_range.total_size();
+ ndrange_t get_window_size() const override {
+ return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u };
}
// This kernel can always be dynamically scheduled.
@@ -158,8 +158,7 @@ public:
return true;
}
- // Execute
- void execute(unsigned int start, unsigned int end, int threadid) override {
+ void execute_1d(unsigned int start, unsigned int end, int threadid) {
#ifdef CYCLE_PROFILING
profiler prof;
#endif
@@ -234,6 +233,17 @@ public:
}
}
+ // Execute
+ void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+ UNUSED(thread_locator);
+
+ const auto start = work_range.get_position(0);
+ const auto size = work_range.get_size(0);
+ const auto stop = start + size;
+
+ execute_1d(start, stop, threadid);
+ }
+
// Working space needed for intermediate result buffers.
size_t get_working_size() const override {
return (_nthreads * strategy::out_height() * _Nsize * sizeof(Tri));
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index efd984561d..4897bedf47 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -385,9 +385,9 @@ public:
// out work in units of out_height. Factor batches into the window, but
// not multi for now (as this would cause problems with the buffer
// manager).
- unsigned int get_window_size() const override {
- // _Mround is a multiple of out_height by definition.
- return (_Mround / strategy::out_height()) * _nbatches;
+ ndrange_t get_window_size() const override {
+ auto m_win_size = (_Mround / strategy::out_height()) * _nbatches;
+ return { m_win_size, 1u, 1u, 1u, 1u, 1u };
}
// set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads.
@@ -399,7 +399,7 @@ public:
}
// Execute
- void execute(unsigned int start, unsigned int end, int threadid) override {
+ void execute_1d(unsigned int start, unsigned int end, int threadid) {
if (_pretransposed) {
execute_internal<true>(start, end, threadid);
} else {
@@ -407,6 +407,16 @@ public:
}
}
+ //Execute
+ void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+ UNUSED(thread_locator);
+
+ const auto start = work_range.get_position(0);
+ const auto stop = work_range.get_position_end(0);
+
+ execute_1d(start, stop, threadid);
+ }
+
// Interface implementation - working space
size_t get_working_size() const override {
// In all cases, we need one A buffer plus a C buffer per thread.
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp
new file mode 100644
index 0000000000..53f8e6c938
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp
@@ -0,0 +1,449 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+#include <algorithm>
+#include <cassert>
+
+// Some macros used to decide how much working space to allocate.
+// Round allocations up to the next cache line.
+#define ALLOC_ROUND 64
+#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
+
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation interleaves the source matrices in blocks - good for
+// larger matrices.
+namespace arm_gemm {
+
+template<typename strategy, typename To, typename Tr>
+class GemmInterleaved2d : public GemmCommon<To, Tr> {
+ typedef typename strategy::operand_type Toi;
+ typedef typename strategy::result_type Tri;
+
+ /* const properties set by constructor */
+ const CPUInfo * const _ci;
+
+ const unsigned int _Msize;
+ const unsigned int _Nsize;
+ const unsigned int _Ksize;
+
+ const unsigned int _nbatches;
+ const unsigned int _nmulti;
+
+ const bool _trA;
+ const bool _trB;
+
+ const Activation _act;
+
+ const int _maxthreads;
+ int _nthreads;
+
+ /* Blocking info */
+ unsigned int _k_block=0;
+ unsigned int _x_block=0;
+
+ unsigned int _Mround_div=0;
+ unsigned int _Mround=0;
+ unsigned int _Nround_div=0;
+ unsigned int _Nround=0;
+
+ /* Working space, pretransposed buffer */
+ void *_working_space=nullptr;
+
+ /* We will need to walk through the blocks of B in a few contexts, so
+ * factor that out. */
+ class blockwalker {
+ private:
+ /* Size loops, etc. based on our parent's configuration */
+ const GemmInterleaved2d<strategy, To, Tr> &_parent;
+
+ /* K, X and multi parameters for current iteration. */
+ unsigned int _k0=0, _x0=0, _xmin=0, _xmax=0, _multi=0;
+
+ unsigned int _index=0;
+ bool _done=false;
+ bool _newkblock=true;
+ bool _newmulti=true;
+
+ public:
+ blockwalker(const GemmInterleaved2d<strategy, To, Tr> &parent)
+ : _parent(parent)
+ , _xmax { parent._Nsize }
+ { }
+
+ blockwalker(const GemmInterleaved2d<strategy, To, Tr> &parent, unsigned int x0, unsigned int xmax)
+ : _parent(parent)
+ , _x0 { x0 }
+ , _xmin { x0 }
+ , _xmax { xmax }
+ {
+ assert(_x0 <= _xmax);
+ }
+
+ unsigned int xmax() {
+ return std::min(_x0 + _parent._x_block, _xmax);
+ }
+
+ unsigned int kmax() {
+ return std::min(_k0 + _parent._k_block, _parent._Ksize);
+ }
+
+ /* Advance to the next block, return false at the end. */
+ bool advance(void) {
+ if (_done) {
+ return false;
+ }
+
+ _newkblock=false;
+ _x0 += _parent._x_block;
+ if (_x0 >= _xmax) {
+ _x0=_xmin;
+ _k0 += _parent._k_block;
+ if (_k0 >= _parent._Ksize) {
+ _k0=0;
+ _multi++;
+ if (_multi >= _parent._nmulti) {
+ _done=true;
+ return false;
+ }
+ _newmulti=true;
+ }
+ _newkblock=true;
+ }
+ _index++;
+
+ return true;
+ }
+
+ unsigned int k0(void) { return _k0; }
+ unsigned int x0(void) { return _x0; }
+ unsigned int multi(void) { return _multi; }
+ unsigned int index(void) { return _index; }
+ bool done(void) { return _done; }
+ bool newkblock(void) { return _newkblock; }
+ };
+
+ // A working size: One of these needed, regardless of thread count. Divided according to window.
+ size_t get_a_working_size() const {
+ return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches) * 2;
+ }
+
+ // B working size: 0, 1 or 3 of these needed depending on pretransposed and threading settings.
+ size_t get_b_working_size() const {
+ return ROUND_UP(sizeof(Toi) * _x_block * _k_block);
+ }
+
+ // C working size: One needed per thread.
+ size_t get_c_working_size() const {
+ return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
+ }
+
+ void execute_transpose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int mthreadid, int nthreadid) {
+ UNUSED(mthreadid);
+
+ strategy strat(_ci);
+
+ /* Translate 'start' and 'end' into a position within the batches and rows. */
+ const unsigned int window_per_batch = _Mround / strategy::out_height();
+ unsigned int batch_0 = m_start / window_per_batch;
+ unsigned int batch_end = m_end / window_per_batch;
+
+ /* Compute the M values to operate on */
+ unsigned int m_0 = (m_start - (batch_0 * window_per_batch)) * strategy::out_height();
+ unsigned int m_max = (m_end - (batch_end * window_per_batch)) * strategy::out_height();
+
+ unsigned int n_0 = std::min(this->_Nsize, strategy::out_width() * n_start);
+ unsigned int n_max = std::min(this->_Nsize, strategy::out_width() * n_end);
+
+ blockwalker current(*this, n_0, n_max);
+
+ /* get workspace as int8_t */
+ assert(_working_space);
+ int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
+
+ auto c_panel_start = working_space_bytes;
+ auto a_panel_start = c_panel_start + get_c_working_size() * _maxthreads;
+ auto b_panel_start = a_panel_start + get_a_working_size() * _maxthreads;
+
+ auto c_panel = reinterpret_cast<Tri *>(c_panel_start + get_c_working_size() * threadid);
+ auto a_panel = reinterpret_cast<Toi *>(a_panel_start + get_a_working_size() * nthreadid);
+ auto b_panel = reinterpret_cast<Toi *>(b_panel_start + get_b_working_size() * threadid);
+
+
+ // newkblock() is always true on the first iteration, so this will be set properly on the first loop.
+
+ int kern_k = 0;
+ for (;!current.done();current.advance()) {
+ const int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());
+ /*
+ * The entirety of A^kblock is transposed up front and computed against individual
+ * blocks of B (xblock)
+ *
+ * Therefore, we only need to retranspose when k_block progresses
+ */
+ if (current.newkblock()) {
+ for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
+ unsigned int first_m = (batch == batch_0) ? m_0 : 0;
+ unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
+
+ if (first_m >= last_m)
+ continue;
+
+ auto a_thread_panel_in = this->_Aptr
+ + (batch * this->_A_batch_stride)
+ + (current.multi() * this->_A_multi_stride);
+
+ auto a_thread_panel_out = a_panel + ((batch * _Mround + first_m) * _k_block);
+
+ strat.transforms.PrepareA(
+ a_thread_panel_out,
+ a_thread_panel_in,
+ this->_lda,
+ first_m,
+ last_m,
+ current.k0(),
+ current.kmax(),
+ _trA);
+ }
+
+ kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
+ kern_k *= strat.k_unroll();
+ }
+
+ auto *b_panel_in = this->_Bptr + (current.multi() * this->_B_multi_stride);
+
+ strat.transforms.PrepareB(
+ b_panel, //dst
+ b_panel_in, //src
+ this->_ldb,
+ current.x0(), //idx from
+ current.xmax(), //idx to
+ current.k0(),
+ current.kmax(),
+ _trB);
+
+ //Iterate over the batches
+ for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
+ unsigned int first_m = (batch == batch_0) ? m_0 : 0;
+ unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
+
+ if (first_m >= last_m)
+ continue;
+
+ const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;
+
+
+ //Iterate over the interleaved rows of the packed A matrix
+ for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) {
+ unsigned int ymax = std::min(_Msize, y + strategy::out_height());
+
+ strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
+ a_ptr += (strategy::out_height() * kern_k);
+
+ const bool first_pass = current.k0()==0;
+ const bool last_pass = current.kmax()==_Ksize;
+
+ auto c_panel_out = this->_Cptr
+ + this->_C_batch_stride * batch
+ + this->_C_multi_stride * current.multi();
+
+ auto bias = (first_pass && this->_bias)
+ ? this->_bias + (current.multi() * this->_bias_multi_stride)
+ : nullptr;
+
+ auto act = last_pass ? _act : Activation();
+
+ strat.transforms.Merge(
+ c_panel_out,
+ c_panel,
+ this->_ldc,
+ y,
+ ymax,
+ current.x0(),
+ current.xmax(),
+ bias,
+ act,
+ !first_pass); //Append
+ }
+ }
+ }
+ }
+public:
+ GemmInterleaved2d(GemmInterleaved2d &) = delete;
+ GemmInterleaved2d & operator= (GemmInterleaved2d &) = delete;
+
+ /* Constructor */
+ GemmInterleaved2d(const GemmArgs &args)
+ : _ci(args._ci)
+ , _Msize(args._Msize)
+ , _Nsize(args._Nsize)
+ , _Ksize(args._Ksize)
+ , _nbatches(args._nbatches)
+ , _nmulti(args._nmulti)
+ , _trA(args._trA)
+ , _trB(args._trB)
+ , _act(args._act)
+ , _maxthreads(args._maxthreads)
+ , _nthreads(args._maxthreads)
+
+ // Work out the rounded size of M - needed for some buffers.
+ , _Mround_div ( iceildiv(_Msize, strategy::out_height()) )
+ , _Mround ( _Mround_div * strategy::out_height() )
+
+ , _Nround_div ( iceildiv(_Nsize, strategy::out_width()) )
+ , _Nround ( _Nround_div * strategy::out_width() )
+ {
+ const unsigned int L1_size = _ci->get_L1_cache_size();
+ const unsigned int L2_size = _ci->get_L2_cache_size();
+
+ assert(_maxthreads > 0);
+
+ // Work out blocking parameters, or override from provided GemmConfig
+ if (args._cfg && args._cfg->inner_block_size) {
+ _k_block = args._cfg->inner_block_size;
+ } else {
+ // k_block: Find out how much of the larger array can be loaded into half the cache.
+ // This should account for associative caches.
+ _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+
+ // Needs to be (at least a single) multiple of the K unroll level.
+ _k_block /= strategy::k_unroll();
+ _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
+
+ // Now tune to presented problem size; this is how many blocks we need.
+ unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
+
+ // So divide the space equally into that many blocks.
+ _k_block = iceildiv(_Ksize, num_k_blocks);
+
+ // And round UP to the K unroll level required.
+ _k_block = iceildiv(_k_block, strategy::k_unroll());
+ _k_block *= strategy::k_unroll();
+ }
+
+ if (args._cfg && args._cfg->outer_block_size) {
+ _x_block = args._cfg->outer_block_size;
+ } else {
+ // x_block: Work out how many rows (of length k_block) will fit in the L2
+ // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+ _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
+ (sizeof(Toi) * _k_block);
+
+ // Needs to be (at least a single) multiple of the kernel output width.
+ _x_block /= strategy::out_width();
+ _x_block = std::max(_x_block, 1U) * strategy::out_width();
+
+ // And tune to the presented problem size.
+ unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
+ _x_block = iceildiv(_Nsize, num_x_blocks);
+
+ _x_block = iceildiv(_x_block, strategy::out_width());
+ _x_block *= strategy::out_width();
+ }
+
+ // Work out the rounded size of M - needed for some buffers.
+ }
+
+ // Interface implementation - Compulsory functions
+ ndrange_t get_window_size() const override {
+ unsigned m = (_Mround / strategy::out_height()) * _nbatches;
+ unsigned n = _Nround_div;
+
+ return { m, n, 1u, 1u, 1u, 1u };
+ }
+
+ // set_nthreads: pass on to buffer manager to avoid it waiting for non-existent threads.
+ void set_nthreads(int nthreads) override {
+ _nthreads = std::min(nthreads, _maxthreads);
+ }
+
+ void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+ /*
+ * This particular GEMM implementation can only be broken up over the M & N
+ * dimensions; we inform the framework of this limitation via the get_window_size() function
+ */
+ assert(ndrange_popcount(work_range) <= 2);
+
+ const auto m_start = work_range.get_position(0);
+ const auto n_start = work_range.get_position(1);
+ const auto m_size = work_range.get_size(0);
+ const auto n_size = work_range.get_size(1);
+ const auto m_end = m_start + m_size;
+ const auto n_end = n_start + n_size;
+
+ const auto m_threadid = thread_locator.get_position(0);
+ const auto n_threadid = thread_locator.get_position(1);
+
+ execute_transpose(m_start, m_end, n_start, n_end, threadid, m_threadid, n_threadid);
+ }
+
+ std::size_t get_working_size() const override {
+ /*
+ * Because we do not know how the scheduler will break up
+ * the task, we need to ensure that we allocate enough
+ * space to handle both the case where every thread is
+ * parallelised across B and the case where every thread is
+ * parallelised across A, so we conservatively reserve one
+ * A, one B and one C buffer per thread.
+ */
+ return get_c_working_size() * _maxthreads
+ + get_a_working_size() * _maxthreads
+ + get_b_working_size() * _maxthreads
+ + 64; //to account for cacheline alignment
+ }
+
+
+ void set_working_space(void *working_space) override {
+ // Make sure everything ends up cache line aligned
+ int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
+ intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space);
+
+ size_t diff=0;
+
+ if (working_space_int & 0x3F) {
+ diff = 0x40 - (working_space_int & 0x3F);
+ }
+
+ working_space_bytes += diff;
+
+ _working_space = reinterpret_cast<void *>(working_space_bytes);
+ }
+
+ ~GemmInterleaved2d() override { }
+};
+
+} // namespace arm_gemm
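Worked example (hypothetical sizes, not taken from the patch): for a strategy with out_height() == 8 and out_width() == 12, and a problem with _Msize == 128, _Nsize == 96 and _nbatches == 2, the constructor gives _Mround == 128 and _Nround_div == 8, so get_window_size() returns { 32, 8, 1, 1, 1, 1 } (16 M blocks times 2 batches in dimension 0, and 8 N blocks in dimension 1), and the scheduler may split up to 32 * 8 = 256 work items across those two dimensions.
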
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
new file mode 100644
index 0000000000..eff4877198
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+#include <algorithm>
+#include <cassert>
+
+// Some macros used to decide how much working space to allocate.
+// Round allocations up to the next cache line.
+#define ALLOC_ROUND 64
+#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
+
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation interleaves the source matrices in blocks - good for
+// larger matrices.
+namespace arm_gemm {
+
+template<typename strategy, typename To, typename Tr>
+class GemmInterleavedPretransposed2d : public GemmCommon<To, Tr> {
+ typedef typename strategy::operand_type Toi;
+ typedef typename strategy::result_type Tri;
+
+ /* const properties set by constructor */
+ const CPUInfo * const _ci;
+
+ const unsigned int _Msize;
+ const unsigned int _Nsize;
+ const unsigned int _Ksize;
+
+ const unsigned int _nbatches;
+ const unsigned int _nmulti;
+
+ const bool _trA;
+ const bool _trB;
+
+ const Activation _act;
+
+ const int _maxthreads;
+ int _nthreads;
+
+ /* Blocking info */
+ unsigned int _k_block=0;
+ unsigned int _x_block=0;
+
+ unsigned int _Mround_div=0;
+ unsigned int _Mround=0;
+ unsigned int _Nround_div=0;
+ unsigned int _Nround=0;
+
+ /* Working space, pretransposed buffer */
+ const Toi *_B_transposed=nullptr;
+ void *_working_space=nullptr;
+
+ /* We will need to walk through the blocks of B in a few contexts, so
+ * factor that out. */
+ class blockwalker {
+ private:
+ /* Size loops, etc. based on our parent's configuration */
+ const GemmInterleavedPretransposed2d<strategy, To, Tr> &_parent;
+
+ /* K, X and multi parameters for current iteration. */
+ unsigned int _k0=0, _x0=0, _xmin=0, _xmax=0, _multi=0;
+
+ unsigned int _index=0;
+ bool _done=false;
+ bool _newkblock=true;
+ bool _newmulti=true;
+
+ public:
+ blockwalker(const GemmInterleavedPretransposed2d<strategy, To, Tr> &parent)
+ : _parent(parent)
+ , _xmax { parent._Nsize }
+ { }
+
+ blockwalker(const GemmInterleavedPretransposed2d<strategy, To, Tr> &parent, unsigned int x0, unsigned int xmax)
+ : _parent(parent)
+ , _x0 { x0 }
+ , _xmin { x0 }
+ , _xmax { xmax }
+ {
+ assert(_x0 <= _xmax);
+ }
+
+ unsigned int xmax() {
+ return std::min(_x0 + _parent._x_block, _xmax);
+ }
+
+ unsigned int kmax() {
+ return std::min(_k0 + _parent._k_block, _parent._Ksize);
+ }
+
+ /* Advance to the next block, return false at the end. */
+ bool advance(void) {
+ if (_done) {
+ return false;
+ }
+
+ _newkblock=false;
+ _x0 += _parent._x_block;
+ if (_x0 >= _xmax) {
+ _x0=_xmin;
+ _k0 += _parent._k_block;
+ if (_k0 >= _parent._Ksize) {
+ _k0=0;
+ _multi++;
+ if (_multi >= _parent._nmulti) {
+ _done=true;
+ return false;
+ }
+ _newmulti=true;
+ }
+ _newkblock=true;
+ }
+ _index++;
+
+ return true;
+ }
+
+ unsigned int k0(void) { return _k0; }
+ unsigned int x0(void) { return _x0; }
+ unsigned int multi(void) { return _multi; }
+ unsigned int index(void) { return _index; }
+ bool done(void) { return _done; }
+ bool newkblock(void) { return _newkblock; }
+ };
+
+ // A working size: One of these needed, regardless of thread count. Divided according to window.
+ size_t get_a_working_size() const {
+ return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches) * 2;
+ }
+
+ // As B will be pretransposed we do not need to allocate any space for it
+ size_t get_b_working_size() const {
+ return 0;
+ }
+
+ // C working size: One needed per thread.
+ size_t get_c_working_size() const {
+ return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
+ }
+
+ // Internal execute function.
+ // This class only implements the pretransposed interface; B must have been packed via pretranspose_B_array() before execution.
+ void execute_pretranspose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int mthreadid, int nthreadid) {
+ /* Make sure we've been set up correctly. */
+ assert(_B_transposed);
+ assert(_working_space);
+ assert(this->_Aptr);
+ assert(this->_Cptr);
+
+ UNUSED(mthreadid);
+ UNUSED(nthreadid);
+
+#ifdef CYCLE_PROFILING
+ profiler prof;
+#endif
+ strategy strat(_ci);
+
+ /* Translate 'start' and 'end' into a position within the batches and rows. */
+ const unsigned int window_per_batch = _Mround / strategy::out_height();
+ unsigned int batch_0 = m_start / window_per_batch;
+ unsigned int batch_end = m_end / window_per_batch;
+
+ /* Compute the M values to operate on */
+ unsigned int m_0 = (m_start - (batch_0 * window_per_batch)) * strategy::out_height();
+ unsigned int m_max = (m_end - (batch_end * window_per_batch)) * strategy::out_height();
+
+ unsigned int n_0 = std::min(this->_Nsize, strategy::out_width() * n_start);
+ unsigned int n_max = std::min(this->_Nsize, strategy::out_width() * n_end);
+
+ blockwalker current(*this, n_0, n_max);
+
+ int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
+
+ auto c_panel_start = working_space_bytes;
+ auto a_panel_start = c_panel_start + get_c_working_size() * _maxthreads;
+
+ auto c_panel = reinterpret_cast<Tri *>(c_panel_start + get_c_working_size() * threadid);
+ auto a_panel = reinterpret_cast<Toi *>(a_panel_start + get_a_working_size() * threadid);
+
+ /* B^t is stored in interleaved panels separated by their K-block component,
+ * so we keep a pointer to the start of the current k-page; when we move on to
+ * the next k-block we simply add the size of the previous page to this base
+ * pointer.
+ */
+ const Toi *b_panel_start = _B_transposed;
+ // b_panels stores a pointer to the start of our current block inside of the k-block
+ const Toi *b_panel = b_panel_start;
+
+ // newkblock() is always true on the first iteration, so this will be set properly on the first loop.
+ unsigned b_page_size = 0;
+ int kern_k = 0;
+ for (;!current.done();current.advance()) {
+ int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());
+
+ if (current.newkblock()) {
+ kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
+ kern_k *= strat.k_unroll();
+
+ unsigned b_thread_start_offset = iceildiv(current.x0(), strategy::out_width());
+
+ b_panel_start += b_page_size;
+ b_panel = b_panel_start + (b_thread_start_offset * strat.out_width() * kern_k);
+ b_page_size = _Nround * kern_k;
+
+ for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
+ unsigned int first_m = (batch == batch_0) ? m_0 : 0;
+ unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
+
+ if (first_m >= last_m)
+ continue;
+
+ auto a_thread_panel_in = this->_Aptr
+ + (batch * this->_A_batch_stride)
+ + (current.multi() * this->_A_multi_stride);
+
+ auto a_thread_panel_out = a_panel + ((batch * _Mround + first_m) * _k_block);
+
+ strat.transforms.PrepareA(
+ a_thread_panel_out,
+ a_thread_panel_in,
+ this->_lda,
+ first_m,
+ last_m,
+ current.k0(),
+ current.kmax(),
+ _trA);
+ }
+ }
+
+ /* Do the actual work. */
+ for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
+ unsigned int first_m = (batch == batch_0) ? m_0 : 0;
+ unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
+
+ const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;
+
+ if (first_m >= last_m)
+ continue;
+
+ for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) {
+ unsigned int ymax = std::min(_Msize, y + strategy::out_height());
+
+ strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
+ a_ptr += (strategy::out_height() * kern_k);
+
+ /* Only activate on last pass, only add bias on first pass, ask for accumulation on any non-first pass */
+ const bool first_pass = current.k0()==0;
+ const bool last_pass = current.kmax()==_Ksize;
+
+ auto c_panel_out = this->_Cptr
+ + this->_C_batch_stride * batch
+ + this->_C_multi_stride * current.multi();
+
+ auto bias = (first_pass && this->_bias)
+ ? this->_bias + (current.multi() * this->_bias_multi_stride)
+ : nullptr;
+
+ auto act = last_pass ? _act : Activation();
+
+ strat.transforms.Merge(
+ c_panel_out,
+ c_panel,
+ this->_ldc,
+ y,
+ ymax,
+ current.x0(),
+ current.xmax(),
+ bias,
+ act,
+ !first_pass); //Append
+ }
+ }
+
+ b_panel += (bblocks * strat.out_width() * kern_k);
+ }
+ }
+
+public:
+ GemmInterleavedPretransposed2d(GemmInterleavedPretransposed2d &) = delete;
+ GemmInterleavedPretransposed2d & operator= (GemmInterleavedPretransposed2d &) = delete;
+
+ /* Constructor */
+ GemmInterleavedPretransposed2d(const GemmArgs &args)
+ : _ci(args._ci)
+ , _Msize(args._Msize)
+ , _Nsize(args._Nsize)
+ , _Ksize(args._Ksize)
+ , _nbatches(args._nbatches)
+ , _nmulti(args._nmulti)
+ , _trA(args._trA)
+ , _trB(args._trB)
+ , _act(args._act)
+ , _maxthreads(args._maxthreads)
+ , _nthreads(args._maxthreads)
+
+ // Work out the rounded size of M - needed for some buffers.
+ , _Mround_div ( iceildiv(_Msize, strategy::out_height()) )
+ , _Mround ( _Mround_div * strategy::out_height() )
+
+ , _Nround_div ( iceildiv(_Nsize, strategy::out_width()) )
+ , _Nround ( _Nround_div * strategy::out_width() )
+ {
+
+ assert(args._pretransposed_hint);
+ assert(_maxthreads > 0);
+
+ const unsigned int L1_size = _ci->get_L1_cache_size();
+ const unsigned int L2_size = _ci->get_L2_cache_size();
+
+ // Work out blocking parameters, or override from provided GemmConfig
+ if (args._cfg && args._cfg->inner_block_size) {
+ _k_block = args._cfg->inner_block_size;
+ } else {
+ // k_block: Find out how much of the larger array can be loaded into half the cache.
+ // This should account for associative caches.
+ _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+
+ // Needs to be (at least a single) multiple of the K unroll level.
+ _k_block /= strategy::k_unroll();
+ _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
+
+ // Now tune to presented problem size; this is how many blocks we need.
+ unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
+
+ // So divide the space equally into that many blocks.
+ _k_block = iceildiv(_Ksize, num_k_blocks);
+
+ // And round UP to the K unroll level required.
+ _k_block = iceildiv(_k_block, strategy::k_unroll());
+ _k_block *= strategy::k_unroll();
+ }
+
+ if (args._cfg && args._cfg->outer_block_size) {
+ _x_block = args._cfg->outer_block_size;
+ } else {
+ // x_block: Work out how many rows (of length k_block) will fit in the L2
+ // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+ _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
+ (sizeof(Toi) * _k_block);
+
+ // Needs to be (at least a single) multiple of the kernel output width.
+ _x_block /= strategy::out_width();
+ _x_block = std::max(_x_block, 1U) * strategy::out_width();
+
+ // And tune to the presented problem size.
+ unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
+ _x_block = iceildiv(_Nsize, num_x_blocks);
+
+ _x_block = iceildiv(_x_block, strategy::out_width());
+ _x_block *= strategy::out_width();
+ }
+ }
+
+ // Interface implementation - Compulsory functions
+ ndrange_t get_window_size() const override {
+ unsigned m = (_Mround / strategy::out_height()) * _nbatches;
+ unsigned n = _Nround_div;
+
+ return { m, n, 1u, 1u, 1u, 1u };
+ }
+
+ // set_nthreads: pass on to buffer manager to avoid it waiting for non-existent threads.
+ void set_nthreads(int nthreads) override {
+ _nthreads = std::min(nthreads, _maxthreads);
+ }
+
+ void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+ /* This particular GEMM implementation can only be broken up over the M & N
+ * dimensions; we inform the framework of this limitation via the get_window_size() function
+ */
+ assert(ndrange_popcount(work_range) <= 2);
+
+ const auto m_start = work_range.get_position(0);
+ const auto n_start = work_range.get_position(1);
+ const auto m_size = work_range.get_size(0);
+ const auto n_size = work_range.get_size(1);
+ const auto m_end = m_start + m_size;
+ const auto n_end = n_start + n_size;
+
+ const auto m_threadid = thread_locator.get_position(0);
+ const auto n_threadid = thread_locator.get_position(1);
+
+ execute_pretranspose(m_start, m_end, n_start, n_end, threadid, m_threadid, n_threadid);
+ }
+
+ std::size_t get_working_size() const override {
+ /* Because we do not know how the scheduler will break up
+ * the task, we need to ensure that we allocate enough
+ * space to handle both the case where every thread is
+ * parallelised across B and the case where every thread is
+ * parallelised across A; as B is pretransposed it needs no
+ * working space, so we conservatively reserve one A and one
+ * C buffer per thread.
+ */
+ return get_c_working_size() * _maxthreads
+ + get_a_working_size() * _maxthreads
+ + 64; //to account for cacheline alignment
+ }
+
+
+ void set_working_space(void *working_space) override {
+ // Make sure everything ends up cache line aligned
+ int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
+ intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space);
+
+ size_t diff=0;
+
+ if (working_space_int & 0x3F) {
+ diff = 0x40 - (working_space_int & 0x3F);
+ }
+
+ working_space_bytes += diff;
+
+ _working_space = reinterpret_cast<void *>(working_space_bytes);
+ }
+
+ // Interface implementation - pretransposed
+ bool B_is_pretransposed() const override {
+ return true;
+ }
+
+ bool B_pretranspose_required() const override {
+ return _B_transposed==nullptr;
+ }
+
+ // TODO: this could almost certainly be considerably simpler.
+ size_t get_B_pretransposed_array_size() const override {
+ size_t total=0;
+ blockwalker current(*this);
+
+ do {
+ /* Figure out the size of each block. */
+ unsigned int x_size = (current.xmax() - current.x0());
+ unsigned int k_size = (current.kmax() - current.k0());
+
+ /* Round sizes up as needed. */
+ x_size = iceildiv(x_size, strategy::out_width());
+ x_size *= strategy::out_width();
+
+ k_size = iceildiv(k_size, strategy::k_unroll());
+ k_size *= strategy::k_unroll();
+
+ total += x_size * k_size * sizeof(Toi);
+ } while (current.advance());
+
+ return total;
+ }
+
+ void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+ blockwalker current(*this);
+ Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
+ _B_transposed = buffer;
+ strategy strat(_ci);
+
+ do {
+ /* Figure out the size of each block. */
+ unsigned int x_size = (current.xmax() - current.x0());
+ unsigned int k_size = (current.kmax() - current.k0());
+
+ /* Round sizes up as needed. */
+ x_size = iceildiv(x_size, strategy::out_width());
+ x_size *= strategy::out_width();
+
+ k_size = iceildiv(k_size, strategy::k_unroll());
+ k_size *= strategy::k_unroll();
+
+ strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
+ current.x0(), current.xmax(), current.k0(), current.kmax(), _trB);
+
+ buffer += (x_size * k_size);
+ } while (current.advance());
+ }
+
+ void set_pretransposed_B_data(void *in_buffer) override {
+ _B_transposed = reinterpret_cast<Toi *>(in_buffer);
+ }
+
+ ~GemmInterleavedPretransposed2d() override { }
+};
+
+} // namespace arm_gemm
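For completeness, a hedged sketch of the pretranspose handshake a caller of this class is expected to perform before execute(); the raw malloc is illustrative only (the caller must keep the packed buffer alive for as long as the GEMM uses it), and prepare_pretransposed_b is a hypothetical helper name:

    #include "arm_compute/core/NEON/kernels/assembly/gemm_common.hpp"

    #include <cstdlib>

    template <typename To, typename Tr>
    void prepare_pretransposed_b(arm_gemm::GemmCommon<To, Tr> &gemm, const To *B, int ldb, int B_multi_stride)
    {
        if (gemm.B_pretranspose_required())
        {
            // Ask the GEMM how large the packed B buffer must be, pack B into it once,
            // and leave ownership of the buffer with the caller.
            const std::size_t bytes = gemm.get_B_pretransposed_array_size();
            void *packed_b = std::malloc(bytes);
            gemm.pretranspose_B_array(packed_b, B, ldb, B_multi_stride);
        }
    }
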
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
index fe6ebef045..c2f742b5cf 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,8 +87,8 @@ public:
_window_range(iceildiv(_Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmultis) { }
// Window is amount per multi multiplied by total number of multis.
- unsigned int get_window_size() const override {
- return _window_range.total_size();
+ ndrange_t get_window_size() const override {
+ return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u };
}
// Native GEMMs can always be dynamically scheduled (whether requested or not)
@@ -97,7 +97,7 @@ public:
}
// Actually execute the GEMM.
- void execute(unsigned int start, unsigned int end, int) override {
+ void execute_1d(unsigned int start, unsigned int end, int) {
#ifdef CYCLE_PROFILING
profiler prof;
#endif
@@ -139,6 +139,16 @@ public:
}
} while (p.next_dim1());
}
+
+ //Execute
+ void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+ UNUSED(thread_locator);
+
+ const auto start = work_range.get_position(0);
+ const auto stop = work_range.get_position_end(0);
+
+ execute_1d(start, stop, threadid);
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
index be2f5614be..939788ed8d 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,7 +58,7 @@ public:
UNUSED(ldc);
}
- unsigned int get_window_size() const override {
+ ndrange_t get_window_size() const override {
return _subgemm->get_window_size();
}
@@ -66,8 +66,8 @@ public:
_subgemm->set_nthreads(nthreads);
}
- void execute(unsigned int start, unsigned int end, int threadid) override {
- _subgemm->execute(start, end, threadid);
+ void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+ _subgemm->execute(work_range, thread_locator, threadid);
}
size_t get_working_size() const override {
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
index 49681ec404..190f4aa643 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -72,12 +72,12 @@ public:
}
// Window is number of out_width blocks times number of multis.
- unsigned int get_window_size() const override {
- return iceildiv(_Nsize, strategy::out_width()) * _nmultis;
+ ndrange_t get_window_size() const override {
+ return { iceildiv(_Nsize, strategy::out_width()) * _nmultis, 1u, 1u, 1u, 1u, 1u };
}
// Actually execute the GEMV.
- void execute(unsigned int start, unsigned int end, int) override {
+ void execute_1d(unsigned int start, unsigned int end, int) {
#ifdef CYCLE_PROFILING
profiler prof;
#endif
@@ -127,6 +127,17 @@ public:
}
}
}
+
+ // Execute
+ void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+ UNUSED(thread_locator);
+
+ const auto start = work_range.get_position(0);
+ const auto size = work_range.get_size(0);
+ const auto stop = start + size;
+
+ execute_1d(start, stop, threadid);
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index 26fdfba8ff..7f52ac5a14 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -86,12 +86,12 @@ public:
}
// Window is number of out_width blocks, times number of multis.
- unsigned int get_window_size() const override {
- return iceildiv(_Nsize, strategy::out_width()) * _nmultis;
+ ndrange_t get_window_size() const override {
+ return { iceildiv(_Nsize, strategy::out_width()) * _nmultis, 1u, 1u, 1u, 1u, 1u };
}
// Actually execute the GEMV.
- void execute(unsigned int start, unsigned int end, int) override {
+ void execute_1d(unsigned int start, unsigned int end, int) {
#ifdef CYCLE_PROFILING
profiler prof;
#endif
@@ -145,6 +145,17 @@ public:
}
}
+ // Execute
+ void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+ UNUSED(thread_locator);
+
+ const auto start = work_range.get_position(0);
+ const auto size = work_range.get_size(0);
+ const auto stop = start + size;
+
+ execute_1d(start, stop, threadid);
+ }
+
/* Pretransposed interface implementation */
bool B_is_pretransposed() const override {
return true;
diff --git a/src/core/NEON/kernels/arm_gemm/ndrange.hpp b/src/core/NEON/kernels/arm_gemm/ndrange.hpp
index 20824dfc8b..0c068db011 100644
--- a/src/core/NEON/kernels/arm_gemm/ndrange.hpp
+++ b/src/core/NEON/kernels/arm_gemm/ndrange.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,16 +23,19 @@
*/
#pragma once
+#include <array>
#include <algorithm>
#include <initializer_list>
+#include <cassert>
+
namespace arm_gemm {
template<unsigned int D>
class NDRange {
private:
- unsigned int m_sizes[D];
- unsigned int m_totalsizes[D];
+ std::array<unsigned int, D> m_sizes {};
+ std::array<unsigned int, D> m_totalsizes {};
class NDRangeIterator {
private:
@@ -81,8 +84,25 @@ private:
};
public:
+ NDRange& operator=(const NDRange& rhs)=default;
+ NDRange(const NDRange& rhs) =default;
+
template <typename... T>
- NDRange(T... ts) : m_sizes{ts...} {
+ NDRange(T... ts)
+ : m_sizes{ts...}
+ {
+ unsigned int t=1;
+
+ for (unsigned int i=0; i<D; i++) {
+ t *= m_sizes[i];
+
+ m_totalsizes[i] = t;
+ }
+ }
+
+ NDRange(const std::array<unsigned int, D>& n)
+ : m_sizes{n}
+ {
unsigned int t=1;
for (unsigned int i=0; i<D; i++) {
@@ -105,4 +125,61 @@ public:
}
};
+/** NDCoordinate builds upon a range, but specifies a starting position
+ * in addition to a size which it inherits from NDRange
+ */
+template<unsigned int N>
+class NDCoordinate : public NDRange<N> {
+ using int_t =unsigned int;
+ using ndrange_t = NDRange<N>;
+
+ std::array<int_t, N> m_positions {};
+public:
+ NDCoordinate& operator=(const NDCoordinate& rhs)=default;
+ NDCoordinate(const NDCoordinate& rhs) =default;
+ NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>>& list)
+ {
+ std::array<int_t, N> sizes;
+
+ std::size_t i = 0;
+ for(auto& p : list) {
+ m_positions[i]= p.first;
+ sizes[i++] = p.second;
+ }
+
+        //update the parent's sizes
+ static_cast<ndrange_t&>(*this) = ndrange_t(sizes);
+ }
+
+ int_t get_position(int_t d) const {
+ assert(d < m_positions.size());
+ return m_positions[d];
+ }
+
+ void set_position(int_t d, int_t v) {
+ assert(d < size(m_positions));
+ assert(v < ndrange_t::get_size(d));
+
+ m_positions[d] = v;
+ }
+
+ int_t get_position_end(int_t d) const {
+ return get_position(d) + NDRange<N>::get_size(d);
+ }
+}; //class NDCoordinate
+
+/** @returns the number of dimensions in the NDRange whose size is not 1,
+ * i.e. the dimensions which contain actual work that can be broken up
+ */
+template<unsigned int N>
+std::size_t ndrange_popcount(const NDRange<N>& ndr) {
+ std::size_t count = 0;
+
+ for(unsigned int d = 0; d != N; ++d) {
+ if(ndr.get_size(d) != 1)
+ ++count;
+ }
+ return count;
+}
+
} // namespace arm_gemm
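
A short usage sketch of the NDCoordinate and ndrange_popcount helpers added above, assuming the ndrange.hpp shown here is on the include path; the 6-dimensional shape and the values below are illustrative only.

#include "ndrange.hpp" // arm_gemm::NDCoordinate / ndrange_popcount (include path is illustrative)
#include <cstdio>

int main() {
    using namespace arm_gemm;

    // Each pair is { starting position, size }; only the first two dimensions carry
    // real work here (sizes 128 and 4), the rest are degenerate.
    NDCoordinate<6> work_range{ {0u, 128u}, {2u, 4u}, {0u, 1u}, {0u, 1u}, {0u, 1u}, {0u, 1u} };

    // The accessors used by the execute() adapters: [position, position_end) per dimension.
    std::printf("dim 0: [%u, %u)\n", work_range.get_position(0), work_range.get_position_end(0));
    std::printf("dim 1: [%u, %u)\n", work_range.get_position(1), work_range.get_position_end(1));

    // ndrange_popcount() reports how many dimensions have a size other than 1, which is
    // what a caller can use to decide between 1D and 2D work splitting.
    std::printf("dimensions with work: %zu\n", ndrange_popcount(work_range));
    return 0;
}
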
diff --git a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
index 345060f206..18f030fec0 100644
--- a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
+++ b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -148,7 +148,7 @@ public:
set_child_arrays();
}
- unsigned int get_window_size() const override {
+ ndrange_t get_window_size() const override {
return _subgemm->get_window_size();
}
@@ -158,8 +158,9 @@ public:
_args._maxthreads = nthreads;
}
- void execute(unsigned int start, unsigned int end, int threadid) override {
- _subgemm->execute(start, end, threadid);
+ // Execute
+ void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+ _subgemm->execute(work_range, thread_locator, threadid);
if (!_args._pretransposed_hint) {
col_sums_runtime(threadid);
}
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index e684eeee98..0a03497cb9 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -71,6 +71,61 @@ private:
const unsigned int _end;
};
+/** Given two dimensions and a maximum number of threads to utilise, calculate the best
+ * combination of threads that fit in (multiplied together) max_threads.
+ *
+ * This algorithm assumes that work in either of the dimensions is equally difficult
+ * to compute.
+ *
+ * @returns [m_nthreads, n_nthreads] A pair with the number of threads to use in each dimension
+ */
+std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n)
+{
+    /*
+     * We want the ratio of threads in M and N to match the ratio of the m and n problem sizes
+     *
+     * Therefore: mt/nt == m/n where mt*nt == max_threads
+     *
+     * nt = max_threads/mt and mt = nt * (m/n) = (max_threads/mt) * (m/n)
+     * mt^2 = max_threads * (m/n)
+     * mt = sqrt( max_threads * (m/n) )
+     */
+ //ratio of m to n in problem dimensions
+ double ratio = m / static_cast<double>(n);
+
+    // mt = sqrt(max_threads * (m / n) )
+ const unsigned adjusted = std::round(
+ std::sqrt(max_threads * ratio));
+
+ //find the nearest factor of max_threads
+ for(unsigned i = 0; i!= adjusted; ++i)
+ {
+ //try down
+ const unsigned adj_down = adjusted - i;
+ if(max_threads % adj_down == 0)
+ {
+ return { adj_down, max_threads / adj_down };
+ }
+
+ //try up
+ const unsigned adj_up = adjusted + i;
+ if(max_threads % adj_up == 0)
+ {
+ return { adj_up, max_threads / adj_up };
+ }
+ }
+
+    //we didn't find a factor, so bail out with the threads biased towards the largest dimension
+ if(m > n)
+ {
+ return{ std::min<unsigned>(m, max_threads), 1 };
+ }
+ else
+ {
+ return{ 1, std::min<unsigned>(n, max_threads) };
+ }
+}
+
/** Execute workloads[info.thread_id] first, then call the feeder to get the index of the next workload to run.
*
* Will run workloads until the feeder reaches the end of its range.
@@ -314,50 +369,95 @@ void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
const Window &max_window = kernel->window();
- const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
- const unsigned int num_threads = std::min(num_iterations, _impl->_num_threads);
- if(num_iterations == 0)
+ if(hints.split_dimension() == IScheduler::split_dimensions_all)
{
- return;
- }
+        /*
+         * If the split dimension is size_t max (IScheduler::split_dimensions_all), this
+         * signals that we should parallelise over all dimensions
+         */
+ const std::size_t m = max_window.num_iterations(Window::DimX);
+ const std::size_t n = max_window.num_iterations(Window::DimY);
+
+        //in C++17 this can be replaced with structured bindings: auto [ m_threads, n_threads ] = split_2d(...
+ unsigned m_threads, n_threads;
+ std::tie(m_threads, n_threads) = split_2d(_impl->_num_threads, m, n);
+
+ std::vector<IScheduler::Workload> workloads;
+ for(unsigned int ni = 0; ni != n_threads; ++ni)
+ {
+ for(unsigned int mi = 0; mi != m_threads; ++mi)
+ {
+ workloads.push_back(
+ [ ni, mi, m_threads, n_threads, &max_window, &kernel ]
+ (const ThreadInfo & info)
+ {
+ //narrow the window to our mi-ni workload
+ Window win = max_window.split_window(Window::DimX, mi, m_threads)
+ .split_window(Window::DimY, ni, n_threads);
- if(!kernel->is_parallelisable() || num_threads == 1)
- {
- ThreadInfo info;
- info.cpu_info = &_cpu_info;
- kernel->run(max_window, info);
+ win.validate();
+
+ Window thread_locator;
+ thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
+ thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
+
+ thread_locator.validate();
+
+ kernel->run_nd(win, info, thread_locator);
+ }
+ );
+ }
+ }
+ run_workloads(workloads);
}
else
{
- unsigned int num_windows = 0;
- switch(hints.strategy())
+ const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
+ const unsigned int num_threads = std::min(num_iterations, _impl->_num_threads);
+
+ if(num_iterations == 0)
{
- case StrategyHint::STATIC:
- num_windows = num_threads;
- break;
- case StrategyHint::DYNAMIC:
- {
- const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold());
- // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder
- num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations;
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unknown strategy");
+ return;
}
- std::vector<IScheduler::Workload> workloads(num_windows);
- for(unsigned int t = 0; t < num_windows; t++)
+
+ if(!kernel->is_parallelisable() || num_threads == 1)
{
- //Capture 't' by copy, all the other variables by reference:
- workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info)
+ ThreadInfo info;
+ info.cpu_info = &_cpu_info;
+ kernel->run(max_window, info);
+ }
+ else
+ {
+ unsigned int num_windows = 0;
+ switch(hints.strategy())
+ {
+ case StrategyHint::STATIC:
+ num_windows = num_threads;
+ break;
+ case StrategyHint::DYNAMIC:
+ {
+ const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold());
+ // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder
+ num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unknown strategy");
+ }
+ std::vector<IScheduler::Workload> workloads(num_windows);
+ for(unsigned int t = 0; t < num_windows; t++)
{
- Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
- win.validate();
- kernel->run(win, info);
- };
+ //Capture 't' by copy, all the other variables by reference:
+ workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info)
+ {
+ Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
+ win.validate();
+ kernel->run(win, info);
+ };
+ }
+ run_workloads(workloads);
}
- run_workloads(workloads);
}
}
} // namespace arm_compute
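
For reference, a standalone restatement of the split_2d() heuristic above, useful for checking the resulting thread split outside the scheduler; the helper name split_2d_sketch and the 8-thread, 1024x256 example values are illustrative, not part of the library.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <utility>

// Pick mt*nt == max_threads with mt/nt as close as possible to m/n,
// preferring exact factor pairs of max_threads.
static std::pair<unsigned, unsigned> split_2d_sketch(unsigned max_threads, std::size_t m, std::size_t n)
{
    const double   ratio = static_cast<double>(m) / static_cast<double>(n);
    const unsigned ideal = static_cast<unsigned>(std::round(std::sqrt(max_threads * ratio)));

    // Walk outwards from the ideal value until we hit a factor of max_threads.
    for(unsigned i = 0; i != ideal; ++i)
    {
        if(max_threads % (ideal - i) == 0) { return { ideal - i, max_threads / (ideal - i) }; }
        if(max_threads % (ideal + i) == 0) { return { ideal + i, max_threads / (ideal + i) }; }
    }

    // No factor found: give everything to the larger dimension, capped by its size.
    if(m > n) { return { static_cast<unsigned>(std::min<std::size_t>(m, max_threads)), 1u }; }
    return { 1u, static_cast<unsigned>(std::min<std::size_t>(n, max_threads)) };
}

int main()
{
    // 8 threads on a 1024x256 problem: ideal mt = sqrt(8 * 4) ~ 5.7, nearest factor of 8 is 4,
    // so the split is 4 threads over M and 2 over N.
    const auto split = split_2d_sketch(8, 1024, 256);
    std::printf("m_threads=%u n_threads=%u\n", split.first, split.second);
    return 0;
}
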
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index a3080e7f29..24bd7d7a8c 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -280,8 +280,8 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, c
//if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and
//the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
{
- const int window_size = _gemm_kernel_asm->get_window_size();
- if(window_size < args._maxthreads)
+ const unsigned int window_size = get_total_window_size(*_gemm_kernel_asm);
+ if(window_size < static_cast<unsigned int>(args._maxthreads))
{
_gemm_kernel_asm->set_nthreads(window_size);
}
@@ -404,7 +404,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run()
if(_workspace.buffer() != nullptr)
{
_gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace.buffer()));
- const unsigned int window_size = _gemm_kernel_asm->get_window_size();
+ const unsigned int window_size = get_total_window_size(*_gemm_kernel_asm);
unsigned int num_threads = NEScheduler::get().num_threads();
if(window_size < num_threads)
{
@@ -427,14 +427,21 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run()
in1_ptr, ldb, multi_stride_b,
out_ptr, ldd, batch_stride_d, multi_stride_d,
bias, 0);
-
// Schedule assembly kernel
IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);
if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && _d->info()->data_type() == DataType::F32)
{
const int granule_threshold = 200;
scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
+
+ }
+ else if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && _d->info()->data_type() == DataType::F32)
+ {
+        //GEMM_INTERLEAVED_2D supports 2D parallelism; IScheduler::split_dimensions_all signals that we should parallelise over all window dimensions
+ const int granule_threshold = 200;
+ scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
}
+
NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
}
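
A condensed sketch of the hint selection performed in Fallback::run() above, with a hypothetical is_2d_interleaved_f32 flag standing in for the _kernel_info.method and data-type checks; the include paths are assumed.

#include "arm_compute/runtime/IScheduler.h" // include paths are an assumption
#include "arm_compute/core/Window.h"

using namespace arm_compute;

IScheduler::Hints make_scheduling_hint(bool is_2d_interleaved_f32)
{
    const int granule_threshold = 200;
    if(is_2d_interleaved_f32)
    {
        // 2D-capable kernel: ask the scheduler to split over all window dimensions.
        return IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
    }
    // Default: dynamic 1D scheduling over DimX.
    return IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
}
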