author    David Mansell <David.Mansell@arm.com>        2020-07-08 13:28:45 +0100
committer Georgios Pinitas <georgios.pinitas@arm.com>  2020-07-23 11:55:45 +0000
commit    318c9f40770b2d1c06f8c0fe3f7929812503733e (patch)
tree      58926bc05cd825d885cf9af9553b1309068f36b6
parent    40aad9bbbae5308d7302e61e1372328c9b5daf99 (diff)
download  ComputeLibrary-318c9f40770b2d1c06f8c0fe3f7929812503733e.tar.gz
COMPMID-3578: Update FP32/int8 kernel selection.
Replace the current 'is_preferred()' mechanism with a new framework in which
kernels instead provide an estimated cycle count figure.  Compatibility with
the old mechanism is achieved via a wrapper which replaces a "true" result
with an estimate of 0, and a "false" result with UINT64_MAX.

This mechanism is then used to select between 'interleaved' and 'hybrid'
FP32 NEON kernels.  The estimate uses a simple model based on counting the
MACs performed and the bytes of data transferred (for rearrange/merge
operations) and dividing by fixed performance figures, which are provided
for A53, A55 and A73, plus 'default' figures (based on A76).

Separately, a new route for performing int8 GEMMs by using the int16 kernel
is provided.  This performs significantly (for uint8) or slightly (for int8)
better on A53 than the existing int8 route.  Optimized 8-to-16 bit
transforms are also included.

Change-Id: I53b2e59eb9368793c78c2081e17d2445361bcc47
Signed-off-by: David Mansell <David.Mansell@arm.com>
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/250120
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3609
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
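[Editor's note] At its core the new selection model is simple: each candidate kernel reports an estimated cycle count, obtained by counting the MACs it would execute and the bytes it would rearrange and merge, then dividing each by a fixed per-CPU throughput figure. The sketch below is not part of the patch; the names and structure mirror the code in the diff, but the shape is illustrative only:

    // Minimal sketch of the cost model behind estimate_cycles() in this patch.
    #include <cstdint>

    struct PerformanceParameters {
        float kernel_macs_cycle;    // MACs retired per cycle by the GEMM inner kernel
        float prepare_bytes_cycle;  // bytes/cycle sustained by the rearrange (prepare) stage
        float merge_bytes_cycle;    // bytes/cycle sustained by the merge (writeback) stage
    };

    // Convert work counts into a cycle estimate using the per-CPU figures.
    uint64_t estimate(uint64_t macs, uint64_t prepare_bytes, uint64_t merge_bytes,
                      const PerformanceParameters &p) {
        float cycles = static_cast<float>(macs)          / p.kernel_macs_cycle
                     + static_cast<float>(prepare_bytes) / p.prepare_bytes_cycle
                     + static_cast<float>(merge_bytes)   / p.merge_bytes_cycle;
        return static_cast<uint64_t>(cycles);
    }

Whichever supported candidate reports the lowest estimate is instantiated; an estimate of 0 short-circuits the search.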
-rw-r--r--   arm_compute/core/CPP/CPPTypes.h                                                 7
-rw-r--r--   src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp                                   18
-rw-r--r--   src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp                                 30
-rw-r--r--   src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp                         98
-rw-r--r--   src/core/NEON/kernels/arm_gemm/gemm_int8.cpp                                    7
-rw-r--r--   src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp                            84
-rw-r--r--   src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp          118
-rw-r--r--   src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp                                   7
-rw-r--r--   src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp            18
-rw-r--r--   src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp                      16
-rw-r--r--   src/core/NEON/kernels/arm_gemm/performance_parameters.hpp                      37
-rw-r--r--   src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp   224
-rw-r--r--   src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp   224
-rw-r--r--   src/core/NEON/kernels/arm_gemm/transforms/list.hpp                              2
-rw-r--r--   src/core/NEON/kernels/assembly/arm_gemm.hpp                                    11
-rw-r--r--   src/runtime/CPUUtils.cpp                                                        3
-rw-r--r--   utils/TypePrinter.h                                                             6
17 files changed, 802 insertions, 108 deletions
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index be6dfacb38..fd6bfc3907 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -45,7 +45,8 @@ enum class CPUModel
A53,
A55r0,
A55r1,
- X1
+ X1,
+ A73
};
/** Global memory policy.
@@ -99,6 +100,10 @@ inline std::string cpu_model_to_string(CPUModel val)
{
return std::string("X1");
}
+ case CPUModel::A73:
+ {
+ return std::string("A73");
+ }
default:
{
ARM_COMPUTE_ERROR("Invalid CPUModel.");
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index aa206e3f42..ddb438f06c 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -120,13 +120,13 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
[](const GemmArgs &args) { return (args._Nsize < 12); },
[](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mla_4x8, float, float>(args); }
},
-{
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"hybrid_fp32_mla_16x4",
[](const GemmArgs &args) { return (args._Ksize >= 4); },
- [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || (args._Msize < 16) || (args._nmulti > 1); },
+ [](const GemmArgs &args) { return GemmHybrid<hybrid_fp32_mla_16x4, float, float>::estimate_cycles(args, hybrid_fp32_mla_16x4::get_performance_parameters(args._ci)); },
[](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mla_16x4, float, float>(args); }
-},
+),
#ifdef __ARM_FEATURE_SVE
{
@@ -138,21 +138,21 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
},
#endif // __ARM_FEATURE_SVE
// Pretransposed, 2D split
-{
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED_2D,
"sgemm_12x8_2d",
nullptr,
- [](const GemmArgs &args) { return args._maxthreads >= 8; },
+ [](const GemmArgs &args) { return GemmInterleavedPretransposed2d<sgemm_12x8, float, float>::estimate_cycles(args, sgemm_12x8::get_performance_parameters(args._ci)); },
[](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<sgemm_12x8, float, float>(args); }
-},
+),
// 1D split (with pretransposed or not)
-{
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sgemm_12x8_1d",
nullptr,
- nullptr,
+ [](const GemmArgs &args) { return GemmInterleaved<sgemm_12x8, float, float>::estimate_cycles(args, sgemm_12x8::get_performance_parameters(args._ci)); },
[](const GemmArgs &args) { return new GemmInterleaved<sgemm_12x8, float, float>(args); }
-},
+),
#endif // __aarch64__
#ifdef __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index 353d681fe2..7a983ed6ac 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -23,17 +23,15 @@
*/
#pragma once
-#include <assert.h>
-
#include <algorithm>
+#include <cassert>
#include "arm_gemm.hpp"
#include "bias_adder.hpp"
#include "ndrange.hpp"
-#include "utils.hpp"
-
-#include "mergeresults.hpp"
+#include "performance_parameters.hpp"
#include "transform.hpp"
+#include "utils.hpp"
#ifdef CYCLE_PROFILING
#include "profiler.hpp"
@@ -252,6 +250,28 @@ public:
void set_pretransposed_B_data(void *in_buffer) override {
_B_transposed = reinterpret_cast<Toi *>(in_buffer);
}
+
+ // Estimate cycles for given problem given provided parameters
+ static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
+ // Note: Current hybrid kernels don't actually round up height (they
+ // have paths for each possible height). Might need to make this
+ // configurable in future.
+ uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
+
+ float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
+
+ // TODO: A bit of a kludge here: current hybrid kernels incur extra
+ // overhead where the width is not a multiple of kernel width. It's
+ // most noticeable where the overall width is quite low, so add 15%
+ // penalty for such widths.
+ if ((args._Nsize < strategy::out_width()) || (args._Nsize > strategy::out_width() && args._Nsize < 2*strategy::out_width())) {
+ mac_cycles *= 1.15f;
+ }
+
+ uint64_t total_cycles = mac_cycles;
+
+ return total_cycles;
+ }
};
} // namespace arm_gemm
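[Editor's note] To make the hybrid estimate above concrete, a worked example with hypothetical sizes (not taken from the patch): for hybrid_fp32_mla_16x4 (out_width() = 16, k_unroll() = 1) on a single-batch, single-multi problem with M=64, N=256, K=128, total_macs = 64 * 256 * 128 = 2,097,152. Dividing by the A53 figure of 1.419 MACs/cycle gives roughly 1.48 million cycles; the default (A76-based) figure of 6.25 MACs/cycle gives roughly 0.34 million. N is a multiple of the kernel width here, so the 15% narrow-width penalty does not apply.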
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index c726d7b0aa..261e7d2d9c 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -24,6 +24,7 @@
#include "arm_gemm.hpp"
+#include <cstdint>
#include <functional>
namespace arm_gemm {
@@ -37,7 +38,7 @@ struct GemmImplementation {
const GemmMethod method;
const char * name;
std::function<bool(const GemmArgs &, const OutputStage &)> is_supported;
- std::function<bool(const GemmArgs &, const OutputStage &)> is_recommended;
+ std::function<uint64_t(const GemmArgs &, const OutputStage &)> cycle_estimate;
std::function<GemmCommon<Top, Tret> *(const GemmArgs &, const OutputStage &)> instantiate;
bool do_is_supported(const GemmArgs &args, const OutputStage &os) const {
@@ -48,17 +49,27 @@ struct GemmImplementation {
}
}
- bool do_is_recommended(const GemmArgs &args, const OutputStage &os) const {
- if (is_recommended != nullptr) {
- return is_recommended(args, os);
+ uint64_t do_cycle_estimate(const GemmArgs &args, const OutputStage &os) const {
+ if (cycle_estimate != nullptr) {
+ return cycle_estimate(args, os);
} else {
- return true;
+ return 0;
}
}
+ GemmImplementation(const GemmImplementation &) = default;
+ GemmImplementation &operator= (const GemmImplementation &) = default;
+
GemmCommon<Top, Tret> *do_instantiate(const GemmArgs &args, const OutputStage &os) const {
return instantiate(args, os);
}
+
+ GemmImplementation(GemmMethod m, const char *n,
+ std::function<bool(const GemmArgs &, const OutputStage &)> is_supported, std::function<bool(const GemmArgs &, const OutputStage &)> is_recommended,
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs &, const OutputStage &)> instantiate) :
+ method(m), name(n), is_supported(is_supported),
+ cycle_estimate( [is_recommended](const GemmArgs &args, const OutputStage &os) { return (is_recommended == nullptr) ? 0 : (is_recommended(args, os) ? 0 : UINT64_MAX); } ),
+ instantiate(instantiate) { }
};
/* Slightly different version of above for straightforward GEMMs with no
@@ -69,7 +80,7 @@ struct GemmImplementation<Top, Tret, Nothing> {
const GemmMethod method;
const char * name;
std::function<bool(const GemmArgs &)> is_supported;
- std::function<bool(const GemmArgs &)> is_recommended;
+ std::function<uint64_t(const GemmArgs &)> cycle_estimate;
std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate;
bool do_is_supported(const GemmArgs &args, const Nothing &) const {
@@ -80,17 +91,42 @@ struct GemmImplementation<Top, Tret, Nothing> {
}
}
- bool do_is_recommended(const GemmArgs &args, const Nothing &) const {
- if (is_recommended != nullptr) {
- return is_recommended(args);
+ uint64_t do_cycle_estimate(const GemmArgs &args, const Nothing &) const {
+ if (cycle_estimate != nullptr) {
+ return cycle_estimate(args);
} else {
- return true;
+ return 0;
}
}
GemmCommon<Top, Tret> *do_instantiate(const GemmArgs &args, const Nothing &) const {
return instantiate(args);
}
+
+
+ static GemmImplementation with_estimate(GemmMethod m, const char *n,
+ std::function<bool(const GemmArgs &)> is_supported, std::function<uint64_t(const GemmArgs &)> cycle_estimate,
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate) {
+ GemmImplementation impl(m,n);
+
+ impl.is_supported=is_supported;
+ impl.cycle_estimate=cycle_estimate;
+ impl.instantiate=instantiate;
+
+ return impl;
+ }
+
+ GemmImplementation(GemmMethod m, const char * n) : method(m), name(n), is_supported(nullptr), cycle_estimate(nullptr), instantiate(nullptr) {}
+
+ GemmImplementation(GemmMethod m, const char *n,
+ std::function<bool(const GemmArgs &)> is_supported, std::function<bool(const GemmArgs &)> is_recommended,
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate) :
+ method(m), name(n), is_supported(is_supported),
+ cycle_estimate( [is_recommended](const GemmArgs &args) -> uint64_t { return (is_recommended == nullptr) ? 0 : (is_recommended(args) ? 0 : UINT64_MAX); } ),
+ instantiate(instantiate) { }
+
+ GemmImplementation(const GemmImplementation &) = default;
+ GemmImplementation &operator=(const GemmImplementation &) = default;
};
/* "Master" function implemented for each valid combination of types.
@@ -103,13 +139,11 @@ const GemmImplementation<Top, Tret, OutputStage> *gemm_implementation_list();
/*
* Select a GEMM implementation for the given arguments.
*
- * The logic here returns the first method on the list which supports the
+ * The logic here returns the method on the list which supports the
* requested problem parameters, matches the provided filters (method and/or
- * name string match) and recommends itself.
- *
- * If there is no such method, it will return the first method which
- * supports the requested parameters and passes the filters, regardless of
- * recommendation.
+ * name string match) and offers the lowest cycle estimate. A cycle
+ * estimate of '0' is treated as a special value, causing the corresponding
+ * method to be selected immediately.
*
* If no method supports the requested parameters and passes the filters,
* this function returns false and doesn't touch the provided pointer
@@ -121,6 +155,7 @@ bool find_implementation(const GemmArgs &args, const OutputStage &os, const Gemm
const GemmConfig *cfg = args._cfg;
const GemmImplementation<Top, Tret, OutputStage> *saved_impl = nullptr;
+ uint64_t best_estimate = 0;
for (const GemmImplementation<Top, Tret, OutputStage> *i = gemms; i->method != GemmMethod::DEFAULT; i++) {
/* Skip if this implementation doesn't support these args. */
@@ -138,27 +173,24 @@ bool find_implementation(const GemmArgs &args, const OutputStage &os, const Gemm
continue;
}
- /* At this point, if we don't have a saved implementation, save this
- * one. This is so that we always return something if a filter
- * matches, even if it doesn't recommend itself.
- */
- if (saved_impl == nullptr) {
- saved_impl=i;
- }
+ /* Test the cycle estimate */
+ uint64_t estimate = i->do_cycle_estimate(args, os);
- /* Check that this method recommends itself. */
- if (!i->do_is_recommended(args, os)) {
- continue;
+ /* Short circuit - if the estimate is zero, return this one immediately. */
+ if (estimate==0) {
+ impl=i;
+ return true;
}
- impl=i;
-
- return true;
+ /* Otherwise, remember this as our best so far if we don't yet have
+ * a valid candidate, or this estimate beats the previous best. */
+ if ((saved_impl == nullptr) || (estimate < best_estimate)) {
+ saved_impl = i;
+ best_estimate = estimate;
+ }
}
- /* We didn't find an option matching the filters that recommended
- * itself. But if we found something earlier that matched the filters
- * but wasn't recommended, return it here. */
+ /* Return whichever method gave the best estimate. */
if (saved_impl != nullptr) {
impl = saved_impl;
return true;
@@ -183,7 +215,7 @@ std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, cons
continue;
}
- res.push_back(KernelDescription(i->method, i->name, i==default_impl));
+ res.push_back(KernelDescription(i->method, i->name, i==default_impl, i->do_cycle_estimate(args, os)));
}
return res;
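[Editor's note] The compatibility wrapper lets legacy entries and estimate-based entries coexist in one method list. A hypothetical scan (names and numbers invented for illustration) shows how the values interact during find_implementation():

    // Possible do_cycle_estimate() results for four candidates:
    //   "legacy_recommended"      -> 0           (is_recommended returned true: selected immediately)
    //   "estimated_kernel_a"      -> 1200000     (with_estimate entry)
    //   "estimated_kernel_b"      -> 950000      (wins the min-comparison if no zero estimate appears)
    //   "legacy_not_recommended"  -> UINT64_MAX  (only chosen if it is the sole supported candidate)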
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index 3ee47492db..bddcc8dab1 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -122,6 +122,13 @@ static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
[](const GemmArgs &args) { return new GemmInterleaved<gemm_s8_12x8, int8_t, int32_t>(args); }
},
{
+ GemmMethod::GEMM_INTERLEAVED,
+ "gemm_s16_12x8",
+ nullptr,
+ [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53; },
+ [](const GemmArgs &args) { return new GemmInterleaved<gemm_s16_12x8, int8_t, int32_t>(args); },
+},
+{
GemmMethod::GEMM_INTERLEAVED_2D,
"gemm_s8_4x4_2d",
nullptr,
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 3b829491ca..c4dceef922 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -23,15 +23,14 @@
*/
#pragma once
-#include <stdio.h>
-#include <assert.h>
-
#include <algorithm>
+#include <cassert>
#include "arm_gemm.hpp"
#include "utils.hpp"
#include "mergeresults.hpp"
+#include "performance_parameters.hpp"
#include "transform.hpp"
#ifdef CYCLE_PROFILING
@@ -149,6 +148,33 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
}
+ static unsigned int get_k_block_size(const GemmArgs &args) {
+ if (args._cfg && args._cfg->inner_block_size) {
+ return args._cfg->inner_block_size;
+ }
+
+ const unsigned int L1_size = args._ci->get_L1_cache_size();
+ unsigned int k_block;
+
+ // k_block: Find out how much of the larger array can be loaded into half the cache.
+ // This should account for associative caches.
+ k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+
+ // Needs to be (at least a single) multiple of the K unroll level.
+ k_block /= strategy::k_unroll();
+ k_block = std::max(k_block, 1U) * strategy::k_unroll();
+
+ // Now tune to presented problem size; this is how many blocks we need.
+ unsigned int num_k_blocks = iceildiv(args._Ksize, k_block);
+
+ // So divide the space equally into that many blocks.
+ k_block = iceildiv(args._Ksize, num_k_blocks);
+
+ // And round UP to the K unroll level required.
+ k_block = roundup(k_block, strategy::k_unroll());
+
+ return k_block;
+ }
public:
GemmInterleaved(GemmInterleaved &) = delete;
@@ -158,35 +184,14 @@ public:
GemmInterleaved(const GemmArgs &args)
: _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
_nbatches(args._nbatches), _nmulti(args._nmulti),
- _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads) {
- const unsigned int L1_size = _ci->get_L1_cache_size();
+ _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
+ _k_block(get_k_block_size(args)) {
const unsigned int L2_size = _ci->get_L2_cache_size();
assert(_maxthreads > 0);
// Work out blocking parameters, or override from provided GemmConfig
- if (args._cfg && args._cfg->inner_block_size) {
- _k_block = args._cfg->inner_block_size;
- } else {
- // k_block: Find out how much of the larger array can be loaded into half the cache.
- // This should account for associative caches.
- _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
-
- // Needs to be (at least a single) multiple of the K unroll level.
- _k_block /= strategy::k_unroll();
- _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
-
- // Now tune to presented problem size; this is how many blocks we need.
- unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
-
- // So divide the space equally into that many blocks.
- _k_block = iceildiv(_Ksize, num_k_blocks);
-
- // And round UP to the K unroll level required.
- _k_block = iceildiv(_k_block, strategy::k_unroll());
- _k_block *= strategy::k_unroll();
- }
-
+ // TODO: Move outer block into a static function too.
if (args._cfg && args._cfg->outer_block_size) {
_x_block = args._cfg->outer_block_size;
} else {
@@ -422,6 +427,31 @@ public:
void set_pretransposed_B_data(void *in_buffer) override {
_B_transposed = reinterpret_cast<Toi *>(in_buffer);
}
+
+ // Estimate cycles for given problem given provided parameters
+ static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
+ unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));
+
+ uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
+ uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Ksize, strategy::k_unroll()) * sizeof(Toi);
+ uint64_t merge_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
+
+ float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
+ float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
+ float merge_cycles = static_cast<float>(merge_bytes) / params.merge_bytes_cycle;
+
+ float total_cycles = mac_cycles + prepare_cycles + merge_cycles;
+
+ // We can't thread over multis or width, which makes this a poor
+ // choice in many threaded cases. Penalize that here.
+ float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches) * 0.9f;
+
+ if (parallelism_available < args._maxthreads) {
+ total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);
+ }
+
+ return static_cast<uint64_t>(total_cycles);
+ }
};
} // namespace arm_gemm
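[Editor's note] Continuing the same hypothetical M=64, N=256, K=128 example for the interleaved sgemm_12x8 path (out_height() = 8, out_width() = 12, and assuming a 32 kB L1 so K fits in a single k_block): total_macs = 64 * 264 * 128 = 2,162,688, prepare_bytes = 64 * 128 * 4 = 32,768 and merge_bytes = 64 * 264 * 4 = 67,584. With the A53 figures (2.777 MACs/cycle, 0.987 and 0.898 bytes/cycle) that is roughly 0.78M + 0.03M + 0.08M ≈ 0.89M cycles; with maxthreads = 8 the parallelism term is ceil(64/8) * 0.9 = 7.2, so the total is scaled by 8/7.2 to roughly 0.99M cycles. On A53 this undercuts the ~1.48M hybrid estimate from the earlier example, which is exactly the kind of comparison the new selection logic performs.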
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
index ebe33ab271..bdccd05326 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
@@ -35,6 +35,7 @@
#include <algorithm>
#include <cassert>
+#include <cmath>
// Some macros used to decide how much working space to allocate.
// Round allocations up to the next cache line.
@@ -301,6 +302,36 @@ class GemmInterleavedPretransposed2d : public GemmCommon<To, Tr> {
}
}
+ static unsigned int get_k_block_size(const GemmArgs &args) {
+ // Work out blocking parameters, or override from provided GemmConfig
+ if (args._cfg && args._cfg->inner_block_size) {
+ return args._cfg->inner_block_size;
+ }
+
+ const unsigned int L1_size = args._ci->get_L1_cache_size();
+ unsigned int k_block;
+
+ // k_block: Find out how much of the larger array can be loaded into half the cache.
+ // This should account for associative caches.
+ k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+
+ // Needs to be (at least a single) multiple of the K unroll level.
+ k_block /= strategy::k_unroll();
+ k_block = std::max(k_block, 1U) * strategy::k_unroll();
+
+ // Now tune to presented problem size; this is how many blocks we need.
+ unsigned int numk_blocks = iceildiv(args._Ksize, k_block);
+
+ // So divide the space equally into that many blocks.
+ k_block = iceildiv(args._Ksize, numk_blocks);
+
+ // And round UP to the K unroll level required.
+ k_block = iceildiv(k_block, strategy::k_unroll());
+ k_block *= strategy::k_unroll();
+
+ return k_block;
+ }
+
public:
GemmInterleavedPretransposed2d(GemmInterleavedPretransposed2d &) = delete;
GemmInterleavedPretransposed2d & operator= (GemmInterleavedPretransposed2d &) = delete;
@@ -315,8 +346,8 @@ public:
, _nmulti(args._nmulti)
, _act(args._act)
, _maxthreads(args._maxthreads)
- , _nthreads(args._maxthreads)
-
+ , _nthreads(args._maxthreads)
+ , _k_block(get_k_block_size(args))
// Work out the rounded size of M - needed for some buffers.
, _Mround_div ( iceildiv(_Msize, strategy::out_height()) )
, _Mround ( _Mround_div * strategy::out_height() )
@@ -326,32 +357,8 @@ public:
{
assert(_maxthreads > 0);
- const unsigned int L1_size = _ci->get_L1_cache_size();
const unsigned int L2_size = _ci->get_L2_cache_size();
- // Work out blocking parameters, or override from provided GemmConfig
- if (args._cfg && args._cfg->inner_block_size) {
- _k_block = args._cfg->inner_block_size;
- } else {
- // k_block: Find out how much of the larger array can be loaded into half the cache.
- // This should account for associative caches.
- _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
-
- // Needs to be (at least a single) multiple of the K unroll level.
- _k_block /= strategy::k_unroll();
- _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
-
- // Now tune to presented problem size; this is how many blocks we need.
- unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
-
- // So divide the space equally into that many blocks.
- _k_block = iceildiv(_Ksize, num_k_blocks);
-
- // And round UP to the K unroll level required.
- _k_block = iceildiv(_k_block, strategy::k_unroll());
- _k_block *= strategy::k_unroll();
- }
-
if (args._cfg && args._cfg->outer_block_size) {
_x_block = args._cfg->outer_block_size;
} else {
@@ -381,6 +388,10 @@ public:
return { m, n };
}
+ bool supports_dynamic_scheduling() const override {
+ return true;
+ }
+
// set_nthreads: pass on to buffer manager to avoid it waiting for non-existent threads.
void set_nthreads(int nthreads) override {
_nthreads = std::min(nthreads, _maxthreads);
@@ -495,7 +506,60 @@ public:
_B_transposed = reinterpret_cast<Toi *>(in_buffer);
}
- ~GemmInterleavedPretransposed2d() override { }
+ // Estimate cycles for given problem given provided parameters
+ static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
+ unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));
+ unsigned int m_blocks = iceildiv(args._Msize, strategy::out_height()) * args._nbatches;
+ unsigned int n_blocks = iceildiv(args._Nsize, strategy::out_width());
+
+ uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
+ uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Ksize, strategy::k_unroll()) * sizeof(Toi);
+ uint64_t merge_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
+
+ // Wide problems incur extra preparation cost, as it is done per thread.
+ // Duplicate the logic the scheduler will later use to figure out how much that will affect us
+ float ratio = m_blocks / static_cast<float>(n_blocks);
+
+ unsigned int ideal_height = static_cast<unsigned int>(std::sqrt(args._maxthreads * ratio) + 0.5);
+ unsigned int height = 1;
+
+ if (ideal_height == 0) {
+ height = 1;
+ } else {
+ for (unsigned int adj=0; adj<ideal_height; adj++) {
+ const unsigned int round_down = ideal_height - adj;
+ if (args._maxthreads % round_down == 0) {
+ height = round_down;
+ break;
+ }
+
+ const unsigned int round_up = ideal_height + adj;
+ if (args._maxthreads % round_up == 0) {
+ height = round_up;
+ break;
+ }
+ }
+ }
+
+ // We've computed the height here - we need to multiply the amount of preparation effort by the width (which is total threads / height)
+ prepare_bytes *= (args._maxthreads / height);
+
+ float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
+ float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
+ float merge_cycles = static_cast<float>(merge_bytes) / params.merge_bytes_cycle;
+
+ float total_cycles = mac_cycles + prepare_cycles + merge_cycles;
+
+ // We can't thread over multis, which might be a problem in some
+ // threaded cases. Penalize that here.
+ float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches * iceildiv(args._Nsize, strategy::out_width())) * 0.9f;
+
+ if (parallelism_available < args._maxthreads) {
+ total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);
+ }
+
+ return static_cast<uint64_t>(total_cycles);
+ }
};
} // namespace arm_gemm
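[Editor's note] The height search in the 2D estimate mirrors the thread-grid choice the scheduler will later make. For example (hypothetical numbers): with maxthreads = 8, m_blocks = 10 and n_blocks = 40, ratio = 0.25 and ideal_height = int(sqrt(8 * 0.25) + 0.5) = 1, so the search settles on height = 1; the thread grid is then 1 x 8 and prepare_bytes is multiplied by 8, reflecting the code's assumption that the rearrange work is repeated across the width of the thread grid.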
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index caab2e2cc2..88726b1448 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -101,6 +101,13 @@ static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
[](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_u8u32_dot_4x6, uint8_t, uint32_t>(args); }
},
{
+ GemmMethod::GEMM_INTERLEAVED,
+ "gemm_u16_12x8",
+ nullptr,
+ [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53; },
+ [](const GemmArgs &args) { return new GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t>(args); },
+},
+{
GemmMethod::GEMM_HYBRID,
"hybrid_u8u32_dot_16x4",
[](const GemmArgs &args) { return args._ci->has_dotprod() && args._Ksize>=16; },
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
index 8d8ede8137..4147ab60dc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
@@ -25,7 +25,7 @@
#ifdef __aarch64__
-
+#include "../performance_parameters.hpp"
#include "../std_transforms_fixed.hpp"
namespace arm_gemm
@@ -75,6 +75,22 @@ public:
return true;
}
+ static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 2.866 };
+
+ case CPUModel::A53:
+ return { 1.419 };
+
+ case CPUModel::A73:
+ return { 2.551 };
+
+ default:
+ return { 6.25 };
+ }
+ }
+
StdTransformsFixed<operand_type, result_type, 4, 16, 1> transforms = {};
// Default to the generic kernel
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
index 5c3d6409b9..981ce34b49 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
@@ -67,6 +67,22 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 12> transforms = {};
+ static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 3.724, 1.416, 1.113 };
+
+ case CPUModel::A53:
+ return { 2.777, 0.987, 0.898 };
+
+ case CPUModel::A73:
+ return { 2.885, 1.429, 1.163 };
+
+ default:
+ return { 6.949, 4.149, 2.826 };
+ }
+ }
+
kern_type kernel=a64_sgemm_asimd_12x8;
sgemm_12x8(const CPUInfo *ci) {
diff --git a/src/core/NEON/kernels/arm_gemm/performance_parameters.hpp b/src/core/NEON/kernels/arm_gemm/performance_parameters.hpp
new file mode 100644
index 0000000000..059ab5f7df
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/performance_parameters.hpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+namespace arm_gemm {
+
+struct PerformanceParameters {
+ float kernel_macs_cycle;
+ float prepare_bytes_cycle = 0.0f;
+ float merge_bytes_cycle = 0.0f;
+
+ PerformanceParameters(float k) : kernel_macs_cycle(k) { }
+ PerformanceParameters(float k, float p, float m) : kernel_macs_cycle(k), prepare_bytes_cycle(p), merge_bytes_cycle(m) { }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp
new file mode 100644
index 0000000000..37344a82a9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
+
+#include <arm_neon.h>
+#include <cstdint>
+
+#include "../asmlib.hpp"
+
+template<>
+template<>
+inline void TransformImpl<8, 1, false, 2, 1, false>::Transform(int16_t *out, const int8_t *in, int ldin, int y0, int ymax, int k0, int kmax) {
+ int16_t *outptr = out;
+ const int8_t *inptr = in;
+ bool first = true;
+
+ int8_t zerobuff[32] = { 0 }; // 16 for asm loop plus up to 15 for overflow loop
+
+ for (int y=y0; y<ymax; y+=8) {
+ const int8_t *inptr0 = inptr + y * ldin + k0;
+ const int8_t *inptr1 = inptr0 + ldin;
+ const int8_t *inptr2 = inptr1 + ldin;
+ const int8_t *inptr3 = inptr2 + ldin;
+ const int8_t *inptr4 = inptr3 + ldin;
+ const int8_t *inptr5 = inptr4 + ldin;
+ const int8_t *inptr6 = inptr5 + ldin;
+ const int8_t *inptr7 = inptr6 + ldin;
+
+ prefetch_2x(inptr0);
+ prefetch_2x(inptr1);
+ prefetch_2x(inptr2);
+ prefetch_2x(inptr3);
+ prefetch_2x(inptr4);
+ prefetch_2x(inptr5);
+ prefetch_2x(inptr6);
+ prefetch_2x(inptr7);
+
+ int x=(kmax-k0);
+ for (;(x>15) || first;x-=16) {
+ /* Cope with ragged cases by copying from a buffer of zeroes instead */
+ /* 'first' forces this to always run at least once, needed if the total size is <=15. */
+ if ((y + 7) >= ymax) {
+ switch ((y + 7) - ymax) {
+ case 6:
+ inptr1 = zerobuff;
+ // fall through
+ case 5:
+ inptr2 = zerobuff;
+ // fall through
+ case 4:
+ inptr3 = zerobuff;
+ // fall through
+ case 3:
+ inptr4 = zerobuff;
+ // fall through
+ case 2:
+ inptr5 = zerobuff;
+ // fall through
+ case 1:
+ inptr6 = zerobuff;
+ // fall through
+ case 0:
+ inptr7 = zerobuff;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ if (first) {
+ if (x<=15) {
+ break;
+ }
+
+ first = false;
+ }
+
+ __asm __volatile (
+ // Load up 16 elements (1 source vector, 2 destination vectors) from each of 8 sources.
+ "LDR q0, [%[inptr0]], #16\n"
+ "LDR q2, [%[inptr1]], #16\n"
+ "SSHLL2 v1.8h, v0.16b, #0\n"
+ "SSHLL v0.8h, v0.8b, #0\n"
+ "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
+ "SSHLL2 v3.8h, v2.16b, #0\n"
+ "SSHLL v2.8h, v2.8b, #0\n"
+ "SSHLL2 v5.8h, v4.16b, #0\n"
+ "SSHLL v4.8h, v4.8b, #0\n"
+ "ZIP1 v16.8h, v0.8h, v4.8h\n" // q16=A0C0A1C1
+ ASM_PREFETCH("[%[inptr0], #128]")
+ "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
+ "SSHLL2 v7.8h, v6.16b, #0\n"
+ "SSHLL v6.8h, v6.8b, #0\n"
+ "ZIP1 v17.8h, v2.8h, v6.8h\n" // q17=B0D0B1D1
+ "LDR q8, [%[inptr4]], #16\n"
+ "LDR q10, [%[inptr5]], #16\n"
+ "SSHLL2 v9.8h, v8.16b, #0\n"
+ "SSHLL v8.8h, v8.8b, #0\n"
+ ASM_PREFETCH("[%[inptr1], #128]")
+ "LDR q12, [%[inptr6]], #16\n"
+ "SSHLL2 v11.8h, v10.16b, #0\n"
+ "SSHLL v10.8h, v10.8b, #0\n"
+ "SSHLL2 v13.8h, v12.16b, #0\n"
+ "SSHLL v12.8h, v12.8b, #0\n"
+ "ZIP1 v18.8h, v8.8h, v12.8h\n"
+ "LDR q14, [%[inptr7]], #16\n"
+ "SSHLL2 v15.8h, v14.16b, #0\n"
+ "SSHLL v14.8h, v14.8b, #0\n"
+ "ZIP1 v19.8h, v10.8h, v14.8h\n"
+
+ ASM_PREFETCH("[%[inptr2], #128]")
+ "ZIP1 v20.8h, v16.8h, v17.8h\n" // q20=A0B0C0D0A1B1C1D1
+ "ZIP1 v21.8h, v18.8h, v19.8h\n" // q21=E0F0G0H0E1F1G1H1
+ "ZIP2 v22.8h, v16.8h, v17.8h\n" // q22=A2B2C2D2A3B3C3D3
+ "ZIP2 v23.8h, v18.8h, v19.8h\n" // q23=E2F2G2H1E3F3G3H3
+ ASM_PREFETCH("[%[inptr3], #128]")
+
+ "ZIP2 v16.8h, v0.8h, v4.8h\n"
+ "ZIP2 v17.8h, v2.8h, v6.8h\n"
+ "TRN1 v24.2d, v20.2d, v21.2d\n"
+ "TRN2 v25.2d, v20.2d, v21.2d\n"
+
+ "ZIP2 v18.8h, v8.8h, v12.8h\n"
+ ASM_PREFETCH("[%[inptr4], #128]")
+ "ZIP2 v19.8h, v10.8h, v14.8h\n"
+ "STP q24, q25, [%[outptr]], #32\n" // Write back the first element of each source
+ "TRN1 v24.2d, v22.2d, v23.2d\n"
+ "TRN2 v25.2d, v22.2d, v23.2d\n"
+
+ "ZIP1 v20.8h, v16.8h, v17.8h\n"
+ "ZIP1 v21.8h, v18.8h, v19.8h\n"
+ ASM_PREFETCH("[%[inptr5], #128]")
+ "ZIP2 v22.8h, v16.8h, v17.8h\n"
+ "ZIP2 v23.8h, v18.8h, v19.8h\n"
+ "STP q24, q25, [%[outptr]], #32\n" // Write back the second element of each source
+
+ "ZIP1 v16.8h, v1.8h, v5.8h\n"
+ "ZIP1 v17.8h, v3.8h, v7.8h\n"
+ ASM_PREFETCH("[%[inptr6], #128]")
+ "TRN1 v24.2d, v20.2d, v21.2d\n"
+ "TRN2 v25.2d, v20.2d, v21.2d\n"
+
+ "ZIP1 v18.8h, v9.8h, v13.8h\n"
+ "ZIP1 v19.8h, v11.8h, v15.8h\n"
+ "STP q24, q25, [%[outptr]], #32\n" // Third element
+ "TRN1 v24.2d, v22.2d, v23.2d\n"
+ "TRN2 v25.2d, v22.2d, v23.2d\n"
+ ASM_PREFETCH("[%[inptr7], #128]")
+
+ "ZIP1 v20.8h, v16.8h, v17.8h\n"
+ "ZIP1 v21.8h, v18.8h, v19.8h\n"
+ "STP q24, q25, [%[outptr]], #32\n" // Fourth element
+ "ZIP2 v22.8h, v16.8h, v17.8h\n"
+ "ZIP2 v23.8h, v18.8h, v19.8h\n"
+
+ "ZIP2 v16.8h, v1.8h, v5.8h\n"
+ "ZIP2 v17.8h, v3.8h, v7.8h\n"
+ "TRN1 v24.2d, v20.2d, v21.2d\n"
+ "TRN2 v25.2d, v20.2d, v21.2d\n"
+
+ "ZIP2 v18.8h, v9.8h, v13.8h\n"
+ "ZIP2 v19.8h, v11.8h, v15.8h\n"
+ "STP q24, q25, [%[outptr]], #32\n" // Fifth element
+ "TRN1 v24.2d, v22.2d, v23.2d\n"
+ "TRN2 v25.2d, v22.2d, v23.2d\n"
+
+ "ZIP1 v20.8h, v16.8h, v17.8h\n"
+ "ZIP1 v21.8h, v18.8h, v19.8h\n"
+ "STP q24, q25, [%[outptr]], #32\n" // Sixth element
+ "TRN1 v24.2d, v20.2d, v21.2d\n"
+ "TRN2 v25.2d, v20.2d, v21.2d\n"
+
+ "ZIP2 v22.8h, v16.8h, v17.8h\n"
+ "ZIP2 v23.8h, v18.8h, v19.8h\n"
+ "STP q24, q25, [%[outptr]], #32\n" // Seventh element
+ "TRN1 v24.2d, v22.2d, v23.2d\n"
+ "TRN2 v25.2d, v22.2d, v23.2d\n"
+ "STP q24, q25, [%[outptr]], #32\n" // Eighth element
+ : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+ [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+ "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "memory"
+ );
+ }
+
+ for (;x>0;x--) {
+ *outptr++ = *inptr0++;
+ *outptr++ = *inptr1++;
+ *outptr++ = *inptr2++;
+ *outptr++ = *inptr3++;
+ *outptr++ = *inptr4++;
+ *outptr++ = *inptr5++;
+ *outptr++ = *inptr6++;
+ *outptr++ = *inptr7++;
+ }
+ }
+}
+
+#endif // __aarch64__ && __ARM_FP16_ARGS
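[Editor's note] The assembly above implements a widening 8-way interleave: each iteration loads 16 int8 values from each of eight rows, sign-extends them to int16 (SSHLL/SSHLL2), and stores them so that element i of all eight rows is written out before element i+1, matching the scalar tail loop at the end of the function. A plain scalar sketch of one full 16-wide step (illustrative only; it ignores the ragged-edge, prefetch and zero-padding handling the real transform performs):

    // Scalar reference for one 16-element step of the 8-way s8 -> s16 interleave (sketch only).
    #include <cstdint>

    void interleave_8way_s8_to_s16_step(int16_t *out, const int8_t *const in[8]) {
        for (int element = 0; element < 16; element++) {        // 16 source elements per row per step
            for (int row = 0; row < 8; row++) {                 // 8 rows interleaved together
                *out++ = static_cast<int16_t>(in[row][element]); // widen and interleave
            }
        }
    }

The u8-to-u16 variant that follows is identical apart from using USHLL/USHLL2 to zero-extend instead of sign-extend.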
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp
new file mode 100644
index 0000000000..a3a269c9cd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
+
+#include <arm_neon.h>
+#include <cstdint>
+
+#include "../asmlib.hpp"
+
+template<>
+template<>
+inline void TransformImpl<8, 1, false, 2, 1, false>::Transform(uint16_t *out, const uint8_t *in, int ldin, int y0, int ymax, int k0, int kmax) {
+ uint16_t *outptr = out;
+ const uint8_t *inptr = in;
+ bool first = true;
+
+ uint8_t zerobuff[32] = { 0 }; // 16 for asm loop plus up to 15 for overflow loop
+
+ for (int y=y0; y<ymax; y+=8) {
+ const uint8_t *inptr0 = inptr + y * ldin + k0;
+ const uint8_t *inptr1 = inptr0 + ldin;
+ const uint8_t *inptr2 = inptr1 + ldin;
+ const uint8_t *inptr3 = inptr2 + ldin;
+ const uint8_t *inptr4 = inptr3 + ldin;
+ const uint8_t *inptr5 = inptr4 + ldin;
+ const uint8_t *inptr6 = inptr5 + ldin;
+ const uint8_t *inptr7 = inptr6 + ldin;
+
+ prefetch_2x(inptr0);
+ prefetch_2x(inptr1);
+ prefetch_2x(inptr2);
+ prefetch_2x(inptr3);
+ prefetch_2x(inptr4);
+ prefetch_2x(inptr5);
+ prefetch_2x(inptr6);
+ prefetch_2x(inptr7);
+
+ int x=(kmax-k0);
+ for (;(x>15) || first;x-=16) {
+ /* Cope with ragged cases by copying from a buffer of zeroes instead */
+ /* 'first' forces this to always run at least once, needed if the total size is <=15. */
+ if ((y + 7) >= ymax) {
+ switch ((y + 7) - ymax) {
+ case 6:
+ inptr1 = zerobuff;
+ // fall through
+ case 5:
+ inptr2 = zerobuff;
+ // fall through
+ case 4:
+ inptr3 = zerobuff;
+ // fall through
+ case 3:
+ inptr4 = zerobuff;
+ // fall through
+ case 2:
+ inptr5 = zerobuff;
+ // fall through
+ case 1:
+ inptr6 = zerobuff;
+ // fall through
+ case 0:
+ inptr7 = zerobuff;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ if (first) {
+ if (x<=15) {
+ break;
+ }
+
+ first = false;
+ }
+
+ __asm __volatile (
+ // Load up 16 elements (1 source vector, 2 destination vectors) from each of 8 sources.
+ "LDR q0, [%[inptr0]], #16\n"
+ "LDR q2, [%[inptr1]], #16\n"
+ "USHLL2 v1.8h, v0.16b, #0\n"
+ "USHLL v0.8h, v0.8b, #0\n"
+ "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
+ "USHLL2 v3.8h, v2.16b, #0\n"
+ "USHLL v2.8h, v2.8b, #0\n"
+ "USHLL2 v5.8h, v4.16b, #0\n"
+ "USHLL v4.8h, v4.8b, #0\n"
+ "ZIP1 v16.8h, v0.8h, v4.8h\n" // q16=A0C0A1C1
+ ASM_PREFETCH("[%[inptr0], #128]")
+ "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
+ "USHLL2 v7.8h, v6.16b, #0\n"
+ "USHLL v6.8h, v6.8b, #0\n"
+ "ZIP1 v17.8h, v2.8h, v6.8h\n" // q17=B0D0B1D1
+ "LDR q8, [%[inptr4]], #16\n"
+ "LDR q10, [%[inptr5]], #16\n"
+ "USHLL2 v9.8h, v8.16b, #0\n"
+ "USHLL v8.8h, v8.8b, #0\n"
+ ASM_PREFETCH("[%[inptr1], #128]")
+ "LDR q12, [%[inptr6]], #16\n"
+ "USHLL2 v11.8h, v10.16b, #0\n"
+ "USHLL v10.8h, v10.8b, #0\n"
+ "USHLL2 v13.8h, v12.16b, #0\n"
+ "USHLL v12.8h, v12.8b, #0\n"
+ "ZIP1 v18.8h, v8.8h, v12.8h\n"
+ "LDR q14, [%[inptr7]], #16\n"
+ "USHLL2 v15.8h, v14.16b, #0\n"
+ "USHLL v14.8h, v14.8b, #0\n"
+ "ZIP1 v19.8h, v10.8h, v14.8h\n"
+
+ ASM_PREFETCH("[%[inptr2], #128]")
+ "ZIP1 v20.8h, v16.8h, v17.8h\n" // q20=A0B0C0D0A1B1C1D1
+ "ZIP1 v21.8h, v18.8h, v19.8h\n" // q21=E0F0G0H0E1F1G1H1
+ "ZIP2 v22.8h, v16.8h, v17.8h\n" // q22=A2B2C2D2A3B3C3D3
+ "ZIP2 v23.8h, v18.8h, v19.8h\n" // q23=E2F2G2H1E3F3G3H3
+ ASM_PREFETCH("[%[inptr3], #128]")
+
+ "ZIP2 v16.8h, v0.8h, v4.8h\n"
+ "ZIP2 v17.8h, v2.8h, v6.8h\n"
+ "TRN1 v24.2d, v20.2d, v21.2d\n"
+ "TRN2 v25.2d, v20.2d, v21.2d\n"
+
+ "ZIP2 v18.8h, v8.8h, v12.8h\n"
+ ASM_PREFETCH("[%[inptr4], #128]")
+ "ZIP2 v19.8h, v10.8h, v14.8h\n"
+ "STP q24, q25, [%[outptr]], #32\n" // Write back the first element of each source
+ "TRN1 v24.2d, v22.2d, v23.2d\n"
+ "TRN2 v25.2d, v22.2d, v23.2d\n"
+
+ "ZIP1 v20.8h, v16.8h, v17.8h\n"
+ "ZIP1 v21.8h, v18.8h, v19.8h\n"
+ ASM_PREFETCH("[%[inptr5], #128]")
+ "ZIP2 v22.8h, v16.8h, v17.8h\n"
+ "ZIP2 v23.8h, v18.8h, v19.8h\n"
+ "STP q24, q25, [%[outptr]], #32\n" // Write back the second element of each source
+
+ "ZIP1 v16.8h, v1.8h, v5.8h\n"
+ "ZIP1 v17.8h, v3.8h, v7.8h\n"
+ ASM_PREFETCH("[%[inptr6], #128]")
+ "TRN1 v24.2d, v20.2d, v21.2d\n"
+ "TRN2 v25.2d, v20.2d, v21.2d\n"
+
+ "ZIP1 v18.8h, v9.8h, v13.8h\n"
+ "ZIP1 v19.8h, v11.8h, v15.8h\n"
+ "STP q24, q25, [%[outptr]], #32\n" // Third element
+ "TRN1 v24.2d, v22.2d, v23.2d\n"
+ "TRN2 v25.2d, v22.2d, v23.2d\n"
+ ASM_PREFETCH("[%[inptr7], #128]")
+
+ "ZIP1 v20.8h, v16.8h, v17.8h\n"
+ "ZIP1 v21.8h, v18.8h, v19.8h\n"
+ "STP q24, q25, [%[outptr]], #32\n" // Fourth element
+ "ZIP2 v22.8h, v16.8h, v17.8h\n"
+ "ZIP2 v23.8h, v18.8h, v19.8h\n"
+
+ "ZIP2 v16.8h, v1.8h, v5.8h\n"
+ "ZIP2 v17.8h, v3.8h, v7.8h\n"
+ "TRN1 v24.2d, v20.2d, v21.2d\n"
+ "TRN2 v25.2d, v20.2d, v21.2d\n"
+
+ "ZIP2 v18.8h, v9.8h, v13.8h\n"
+ "ZIP2 v19.8h, v11.8h, v15.8h\n"
+ "STP q24, q25, [%[outptr]], #32\n" // Fifth element
+ "TRN1 v24.2d, v22.2d, v23.2d\n"
+ "TRN2 v25.2d, v22.2d, v23.2d\n"
+
+ "ZIP1 v20.8h, v16.8h, v17.8h\n"
+ "ZIP1 v21.8h, v18.8h, v19.8h\n"
+ "STP q24, q25, [%[outptr]], #32\n" // Sixth element
+ "TRN1 v24.2d, v20.2d, v21.2d\n"
+ "TRN2 v25.2d, v20.2d, v21.2d\n"
+
+ "ZIP2 v22.8h, v16.8h, v17.8h\n"
+ "ZIP2 v23.8h, v18.8h, v19.8h\n"
+ "STP q24, q25, [%[outptr]], #32\n" // Seventh element
+ "TRN1 v24.2d, v22.2d, v23.2d\n"
+ "TRN2 v25.2d, v22.2d, v23.2d\n"
+ "STP q24, q25, [%[outptr]], #32\n" // Eighth element
+ : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+ [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+ "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "memory"
+ );
+ }
+
+ for (;x>0;x--) {
+ *outptr++ = *inptr0++;
+ *outptr++ = *inptr1++;
+ *outptr++ = *inptr2++;
+ *outptr++ = *inptr3++;
+ *outptr++ = *inptr4++;
+ *outptr++ = *inptr5++;
+ *outptr++ = *inptr6++;
+ *outptr++ = *inptr7++;
+ }
+ }
+}
+
+#endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
index 2c698b2576..b825e1c358 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
@@ -28,6 +28,8 @@
#include "a64_interleave_8way_32bit.hpp"
#include "a64_interleave_8way_block4_8bit.hpp"
#include "a64_interleave_8way_half_to_float.hpp"
+#include "a64_interleave_8way_s8_to_s16.hpp"
+#include "a64_interleave_8way_u8_to_u16.hpp"
#include "a64_transpose_interleave_12way_16bit.hpp"
#include "a64_transpose_interleave_12way_half_to_float.hpp"
#include "a64_transpose_interleave_24way_16bit.hpp"
diff --git a/src/core/NEON/kernels/assembly/arm_gemm.hpp b/src/core/NEON/kernels/assembly/arm_gemm.hpp
index e3e15478fb..58db511547 100644
--- a/src/core/NEON/kernels/assembly/arm_gemm.hpp
+++ b/src/core/NEON/kernels/assembly/arm_gemm.hpp
@@ -48,12 +48,13 @@ enum class GemmMethod
struct KernelDescription
{
- GemmMethod method = GemmMethod::DEFAULT;
- std::string name = "";
- bool is_default = false;
+ GemmMethod method = GemmMethod::DEFAULT;
+ std::string name = "";
+ bool is_default = false;
+ uint64_t cycle_estimate = 0;
- KernelDescription(GemmMethod m, std::string n, bool d = false)
- : method(m), name(n), is_default(d)
+ KernelDescription(GemmMethod m, std::string n, bool d = false, uint64_t c = 0)
+ : method(m), name(n), is_default(d), cycle_estimate(c)
{
}
KernelDescription() noexcept
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
index 865e038488..4d6caaee01 100644
--- a/src/runtime/CPUUtils.cpp
+++ b/src/runtime/CPUUtils.cpp
@@ -126,6 +126,9 @@ CPUModel midr_to_model(const unsigned int midr)
case 0xd44: // X1
model = CPUModel::X1;
break;
+ case 0xd09: // A73
+ model = CPUModel::A73;
+ break;
case 0xd0a: // A75
if(variant != 0)
{
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 2f7177ad25..372d4e7629 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -2292,6 +2292,12 @@ inline ::std::ostream &operator<<(::std::ostream &os, const CPUModel &cpu_model)
case CPUModel::A55r1:
os << "A55r1";
break;
+ case CPUModel::A73:
+ os << "A73";
+ break;
+ case CPUModel::X1:
+ os << "X1";
+ break;
default:
ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
}