aboutsummaryrefslogtreecommitdiff
path: root/arm_compute
diff options
context:
space:
mode:
authorPablo Tello <pablo.tello@arm.com>2018-03-14 17:55:27 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:49:16 +0000
commit7fad9b1d00f3ee1488ba4038d1371f6ea219f8b7 (patch)
treeded71e1cfa8e0c085f8bce5dfc26a99786d60e52 /arm_compute
parent1562be3e8a449360a90af75f6f1481a30d41be75 (diff)
downloadComputeLibrary-7fad9b1d00f3ee1488ba4038d1371f6ea219f8b7.tar.gz
COMPMID-1021: CPUInfo refactoring.
Removed CPUTarget in favor of the CPUModel type. CPUInfo now holds a vector of N CPUs. CPUInfo auto-initialises upon construction with 1 GENERIC CPU. CPPScheduler fills CPUInfo's vector upon construction (runtime). IScheduler has a single CPUInfo obj and ThreadInfo always gets a pointer to it (to avoid copying the vector). Change-Id: I30f293258c959c87f6bac5eac8b963beb6a4d365 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/124626 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'arm_compute')
-rw-r--r--arm_compute/core/CPP/CPPTypes.h125
-rw-r--r--arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp14
-rw-r--r--arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp407
-rw-r--r--arm_compute/runtime/CPUUtils.h44
-rw-r--r--arm_compute/runtime/IScheduler.h10
-rw-r--r--arm_compute/runtime/NEON/AssemblyHelper.h80
6 files changed, 180 insertions, 500 deletions
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index 3abc0a2e88..8a9ada81c1 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -24,44 +24,115 @@
#ifndef __ARM_COMPUTE_CPP_TYPES_H__
#define __ARM_COMPUTE_CPP_TYPES_H__
+#include <vector>
+
namespace arm_compute
{
-/** Available CPU Targets */
-enum class CPUTarget
+/** CPU models - we only need to detect CPUs we have
+ * microarchitecture-specific code for.
+ *
+ * Architecture features are detected via HWCAPs.
+ */
+enum class CPUModel
{
- ARCH_MASK = 0x0F00,
- CPU_MODEL = 0x00FF,
- INTRINSICS = 0x0100,
- ARMV7 = 0x0200,
- ARMV8 = 0x0300,
- ARMV8_2 = 0x0400,
- A7x = 0x0070,
- A5x = 0x0050,
- DOT = 0x1000,
-
- A53 = (ARMV8 | A7x | 0x3),
- A55 = (ARMV8_2 | A5x | 0x5),
- A55_DOT = (A55 | DOT),
- A72 = (ARMV8 | A7x | 0x2),
- A73 = (ARMV8 | A7x | 0x3),
- A75 = (ARMV8_2 | A7x | 0x5),
- A75_DOT = (A75 | DOT),
+ GENERIC, /**< Any CPU without a dedicated microarchitecture-specific code path */
+ A53, /**< NOTE(review): presumably Cortex-A53 (MIDR part 0xd03 in the removed newgemm_lib.hpp) - confirm */
+ A55r0, /**< NOTE(review): presumably Cortex-A55 with variant 0 (MIDR part 0xd05 in the removed newgemm_lib.hpp) - confirm */
+ A55r1, /**< NOTE(review): presumably Cortex-A55 with non-zero variant (MIDR part 0xd05 in the removed newgemm_lib.hpp) - confirm */
};
-/** Information about a CPU. */
-struct CPUInfo
+class CPUInfo final
{
- CPUTarget CPU{ CPUTarget::INTRINSICS }; /**< CPU target. */
- int L1_size{ 0 }; /**< Size of L1 cache. */
- int L2_size{ 0 }; /**< Size of L2 cache. */
+public:
+ /** Constructor */
+ CPUInfo();
+
+ /** Copying and moving are disabled to avoid copying the vector of CPUs.
+ * CPUInfo is initialized once in the IScheduler and ThreadInfo gets a pointer to it.
+ */
+ CPUInfo &operator=(const CPUInfo &cpuinfo) = delete;
+ CPUInfo(const CPUInfo &cpuinfo) = delete;
+ CPUInfo &operator=(const CPUInfo &&cpuinfo) = delete;
+ CPUInfo(const CPUInfo &&cpuinfo) = delete;
+
+ /** Checks if the cpu model supports fp16.
+ *
+ * @return true if the cpu supports fp16, false otherwise
+ */
+ bool has_fp16() const;
+ /** Checks if the cpu model supports dot product.
+ *
+ * @return true if the cpu supports dot product, false otherwise
+ */
+ bool has_dotprod() const;
+ /** Gets the cpu model for a given cpuid.
+ *
+ * @param[in] cpuid the id of the cpu core to be retrieved.
+ *
+ * @return the @ref CPUModel of the cpuid queried.
+ */
+ CPUModel get_cpu_model(unsigned int cpuid) const;
+ /** Gets the current thread's cpu model
+ *
+ * @return Current thread's @ref CPUModel
+ */
+ CPUModel get_cpu_model() const;
+ /** Gets the L1 cache size
+ *
+ * @return the size of the L1 cache
+ */
+ unsigned int get_L1_cache_size() const;
+ /** Gets the L2 cache size
+ *
+ * @return the size of the L2 cache
+ */
+ unsigned int get_L2_cache_size() const;
+ /** Set the L1 cache size
+ *
+ * @param[in] size the new size to be set.
+ */
+ void set_L1_cache_size(unsigned int size);
+ /** Set the L2 cache size
+ *
+ * @param[in] size the new size to be set.
+ */
+ void set_L2_cache_size(unsigned int size);
+ /** Set fp16 support
+ *
+ * @param[in] fp16 whether the cpu supports fp16.
+ */
+ void set_fp16(const bool fp16);
+ /** Set dot product support
+ *
+ * @param[in] dotprod whether the cpu supports dot product.
+ */
+ void set_dotprod(const bool dotprod);
+ /** Set the cpumodel for a given cpu core
+ *
+ * @param[in] cpuid the id of the core to be set.
+ * @param[in] model the @ref CPUModel to be set.
+ */
+ void set_cpu_model(unsigned int cpuid, CPUModel model);
+ /** Set max number of cpus
+ *
+ * @param[in] cpu_count the number of CPUs in the system.
+ */
+ void set_cpu_num(unsigned int cpu_count);
+
+private:
+ std::vector<CPUModel> _percpu = {}; /**< Per-core CPU models (presumably indexed by cpuid - confirm in the implementation) */
+ bool _fp16 = false; /**< fp16 support flag, false by default */
+ bool _dotprod = false; /**< Dot product support flag, false by default */
+ unsigned int _L1_cache_size = 32768; /**< L1 cache size, 32768 by default (units not stated here) */
+ unsigned int _L2_cache_size = 262144; /**< L2 cache size, 262144 by default (units not stated here) */
};
/** Information about executing thread and CPU. */
struct ThreadInfo
{
- int thread_id{ 0 }; /**< Executing thread. */
- int num_threads{ 1 }; /**< Number of CPU threads. */
- CPUInfo cpu_info{}; /**< CPU information. */
+ int thread_id{ 0 }; /**< Executing thread. */
+ int num_threads{ 1 }; /**< Number of CPU threads. */
+ const CPUInfo *cpu_info{ nullptr }; /**< Pointer to the CPU information (held by pointer so the CPUInfo is not copied); nullptr by default. */
};
} // namespace arm_compute
#endif /* __ARM_COMPUTE_CPP_TYPES_H__ */
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
index a608566634..8d3db4adf2 100644
--- a/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
@@ -23,7 +23,15 @@
*/
#pragma once
-/* This file is used to configure integration-specific aspects of arm_gemm, this is the gemm-linux version */
+/* This file is used to configure integration-specific aspects of arm_gemm into ACL */
+
+#include "arm_compute/core/CPP/CPPTypes.h"
+
+namespace arm_gemm
+{
+using CPUModel = arm_compute::CPUModel;
+using CPUInfo = arm_compute::CPUInfo;
+} // namespace arm_gemm
+
+
-/* Our CPUInfo is defined in newgemm_lib.hpp */
-#include "newgemm_lib.hpp"
diff --git a/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp b/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp
deleted file mode 100644
index 0e232b6bc5..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp
+++ /dev/null
@@ -1,407 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <fcntl.h>
-#include <sched.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include <fstream>
-#include <iostream>
-#include <regex>
-#include <sstream>
-#include <thread>
-
-extern int l1_cache_size;
-extern int l2_cache_size;
-extern int force_cpu;
-
-#ifdef __ANDROID__
-inline unsigned long stoul( const std::string& str, std::size_t* pos = 0, int base = 10 )
-{
- char *end;
- const unsigned long ret = strtoul( str.c_str(), &end, base);
- *pos = end - str.c_str();
- return ret;
-}
-inline int stoi( const std::string& str, std::size_t* pos = 0, int base = 10 )
-{
- return atoi(str.c_str());
-}
-#endif
-
-
-#if ! defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
-#include <sys/auxv.h>
-
-/* Get HWCAP bits from asm/hwcap.h */
-#include <asm/hwcap.h>
-#endif /* !BARE_METAL */
-
-/* Make sure the bits we care about are defined, just in case asm/hwcap.h is
- * out of date (or for bare metal mode) */
-#ifndef HWCAP_ASIMDHP
-#define HWCAP_ASIMDHP (1 << 10)
-#endif
-
-#ifndef HWCAP_CPUID
-#define HWCAP_CPUID (1 << 11)
-#endif
-
-#ifndef HWCAP_ASIMDDP
-#define HWCAP_ASIMDDP (1 << 20)
-#endif
-
-#define CPUINFO_HACK
-
-//unsigned int get_cpu_impl();
-
-
-/* CPU models - we only need to detect CPUs we have
- * microarchitecture-specific code for.
- *
- * Architecture features are detected via HWCAPs.
- */
-enum class CPUModel {
- GENERIC = 0x0001,
- A53 = 0x0010,
- A55r0 = 0x0011,
- A55r1 = 0x0012,
-};
-
-class CPUInfo
-{
-private:
- struct PerCPUData {
- CPUModel model = CPUModel::GENERIC;
- uint32_t midr = 0;
- bool model_set = false;
- };
-
- std::vector<PerCPUData> _percpu={};
-
- bool _cpuid = false;
- bool _fp16 = false;
- bool _dotprod = false;
-
- unsigned int L1_cache_size = 32768;
- unsigned int L2_cache_size = 262144;
-
- /* Convert an MIDR register value to a CPUModel enum value. */
- CPUModel midr_to_model(const unsigned int midr) const {
- CPUModel model;
-
- // Unpack variant and CPU ID
- int variant = (midr >> 20) & 0xF;
- int cpunum = (midr >> 4) & 0xFFF;
-
- /* Only CPUs we have code paths for are detected. All other CPUs
- * can be safely classed as "GENERIC"
- */
-
- switch(cpunum) {
- case 0xd03:
- model = CPUModel::A53;
- break;
-
- case 0xd05:
- if (variant) {
- model = CPUModel::A55r1;
- } else {
- model = CPUModel::A55r0;
- }
- break;
-
- default:
- model = CPUModel::GENERIC;
- break;
- }
-
- return model;
- }
-
- /* If the CPUID capability is present, MIDR information is provided in
- /sys. Use that to populate the CPU model table. */
- void populate_models_cpuid() {
- for (unsigned long int i=0; i<_percpu.size(); i++) {
- std::stringstream str;
- str << "/sys/devices/system/cpu/cpu" << i << "/regs/identification/midr_el1";
- std::ifstream file;
-
- file.open(str.str(), std::ios::in);
-
- if (file.is_open()) {
- std::string line;
-
- if (bool(getline(file, line))) {
- const unsigned long midr = stoul(line, nullptr, 16);
-
- _percpu[i].midr = (midr & 0xffffffff);
- _percpu[i].model = midr_to_model(_percpu[i].midr);
- _percpu[i].model_set = true;
- }
- }
- }
- }
-
- /* If "long-form" cpuinfo is present, parse that to populate models. */
- void populate_models_cpuinfo() {
- std::regex proc_regex("^processor.*(\\d+)$");
- std::regex imp_regex("^CPU implementer.*0x(..)$");
- std::regex var_regex("^CPU variant.*0x(.)$");
- std::regex part_regex("^CPU part.*0x(...)$");
- std::regex rev_regex("^CPU revision.*(\\d+)$");
-
- std::ifstream file;
- file.open("/proc/cpuinfo", std::ios::in);
-
- if (file.is_open()) {
- std::string line;
- int midr=0;
- int curcpu=-1;
-
- while(bool(getline(file, line))) {
- std::smatch match;
-
- if (std::regex_match(line, match, proc_regex)) {
- std::string id = match[1];
- int newcpu=stoi(id, nullptr, 0);
-
- if (curcpu >= 0 && midr==0) {
- // Matched a new CPU ID without any description of the previous one - looks like old format.
- return;
- }
-
- if (curcpu >= 0) {
- _percpu[curcpu].midr = midr;
- _percpu[curcpu].model = midr_to_model(midr);
- _percpu[curcpu].model_set = true;
- }
-
- midr=0;
- curcpu=newcpu;
-
- continue;
- }
-
- if (std::regex_match(line, match, imp_regex)) {
- int impv = stoi(match[1], nullptr, 16);
- midr |= (impv << 24);
- continue;
- }
-
- if (std::regex_match(line, match, var_regex)) {
- int varv = stoi(match[1], nullptr, 16);
- midr |= (varv << 16);
- continue;
- }
-
- if (std::regex_match(line, match, part_regex)) {
- int partv = stoi(match[1], nullptr, 16);
- midr |= (partv << 4);
- continue;
- }
-
- if (std::regex_match(line, match, rev_regex)) {
- int regv = stoi(match[1], nullptr, 10);
- midr |= (regv);
- midr |= (0xf << 16);
- continue;
- }
- }
-
- if (curcpu >= 0) {
- _percpu[curcpu].midr = midr;
- _percpu[curcpu].model = midr_to_model(midr);
- _percpu[curcpu].model_set = true;
-
- }
- }
- }
-
- /* Identify the maximum valid CPUID in the system. This reads
- * /sys/devices/system/cpu/present to get the information. */
- int get_max_cpus() {
- int max_cpus = 1;
-
-#if ! defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
- std::ifstream CPUspresent;
- CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in);
- bool success = false;
-
- if (CPUspresent.is_open()) {
- std::string line;
-
- if (bool(getline(CPUspresent, line))) {
- /* The content of this file is a list of ranges or single values, e.g.
- * 0-5, or 1-3,5,7 or similar. As we are interested in the
- * max valid ID, we just need to find the last valid
- * delimiter ('-' or ',') and parse the integer immediately after that.
- */
- auto startfrom=line.begin();
-
- for (auto i=line.begin(); i<line.end(); ++i) {
- if (*i=='-' || *i==',') {
- startfrom=i+1;
- }
- }
-
- line.erase(line.begin(), startfrom);
-
- max_cpus = stoi(line, nullptr, 0) + 1;
- success = true;
- }
- }
-
- // Return std::thread::hardware_concurrency() as a fallback.
- if (!success) {
- max_cpus = std::thread::hardware_concurrency();
- }
-#endif // !BARE_METAL
-
- return max_cpus;
- }
-
-public:
- CPUInfo() {
-#if ! defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
- unsigned long hwcaps = getauxval(AT_HWCAP);
-
- if (hwcaps & HWCAP_CPUID) {
- _cpuid = true;
- }
-
- if (hwcaps & HWCAP_ASIMDHP) {
- _fp16 = true;
- }
-
- if (hwcaps & HWCAP_ASIMDDP) {
- _dotprod = true;
- }
-
-#ifdef __aarch64__
- /* Pre-4.15 kernels don't have the ASIMDDP bit.
- *
- * Although the CPUID bit allows us to read the feature register
- * directly, the kernel quite sensibly masks this to only show
- * features known by it to be safe to show to userspace. As a
- * result, pre-4.15 kernels won't show the relevant bit in the
- * feature registers either.
- *
- * So for now, use a whitelist of CPUs known to support the feature.
- */
- if (!_dotprod && _cpuid) {
- /* List of CPUs with dot product support: A55r1 A75r1 A75r2 */
- const unsigned int dotprod_whitelist_masks[] = { 0xfff0fff0, 0xfff0fff0, 0xfff0fff0, 0 };
- const unsigned int dotprod_whitelist_values[] = { 0x4110d050, 0x4110d0a0, 0x4120d0a0, 0 };
-
- unsigned long cpuid;
-
- __asm __volatile (
- "mrs %0, midr_el1\n"
- : "=r" (cpuid)
- :
- :
- );
-
- for (int i=0;dotprod_whitelist_values[i];i++) {
- if ((cpuid & dotprod_whitelist_masks[i]) == dotprod_whitelist_values[i]) {
- _dotprod = true;
- break;
- }
- }
- }
-#endif
- _percpu.resize(get_max_cpus());
-#endif
- if (_cpuid) {
- populate_models_cpuid();
- } else {
- populate_models_cpuinfo();
- }
- }
-
- void set_fp16(const bool fp16) {
- _fp16 = fp16;
- }
-
- void set_dotprod(const bool dotprod) {
- _dotprod = dotprod;
- }
-
- void set_cpu_model(unsigned long cpuid, CPUModel model) {
- if (_percpu.size() > cpuid) {
- _percpu[cpuid].model = model;
- _percpu[cpuid].model_set = true;
- }
- }
-
- bool has_fp16() const {
- return _fp16;
- }
-
- bool has_dotprod() const {
- return _dotprod;
- }
-
- CPUModel get_cpu_model(unsigned long cpuid) const {
- if (cpuid < _percpu.size()) {
- return _percpu[cpuid].model;
- }
-
- return CPUModel::GENERIC;
- }
-
- CPUModel get_cpu_model() const {
-#if defined(BARE_METAL) || (!defined(__arm__) && !defined( __aarch64__) )
- return get_cpu_model(0);
-#else
- return get_cpu_model(sched_getcpu());
-#endif
- }
-
- unsigned int get_L1_cache_size() const {
- return L1_cache_size;
- }
-
- void set_L1_cache_size(unsigned int size) {
- L1_cache_size = size;
- }
-
- unsigned int get_L2_cache_size() const {
- return L2_cache_size;
- }
-
- void set_L2_cache_size(unsigned int size) {
- L2_cache_size = size;
- }
-};
-
-CPUInfo *get_CPUInfo();
diff --git a/arm_compute/runtime/CPUUtils.h b/arm_compute/runtime/CPUUtils.h
new file mode 100644
index 0000000000..70211a5817
--- /dev/null
+++ b/arm_compute/runtime/CPUUtils.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_RUNTIME_CPU_UTILS_H__
+#define __ARM_COMPUTE_RUNTIME_CPU_UTILS_H__
+
+namespace arm_compute
+{
+class CPUInfo;
+/** This function will try to detect the CPU configuration on the system and will fill
+ * the cpuinfo object accordingly to reflect this.
+ *
+ * @param[out] cpuinfo @ref CPUInfo to be used to hold the system's cpu configuration.
+ */
+void get_cpu_configuration(CPUInfo &cpuinfo);
+/** Some systems have both big and small cores; this function computes the minimum number of cores
+ * that are exactly the same on the system. To maximize performance the library attempts to process
+ * workloads concurrently using as many threads as big cores are available on the system.
+ *
+ * @return The minimum number of common cores.
+ */
+unsigned int get_threads_hint();
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_RUNTIME_CPU_UTILS_H__ */
diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
index 1dd7c2cfb2..a0bcada722 100644
--- a/arm_compute/runtime/IScheduler.h
+++ b/arm_compute/runtime/IScheduler.h
@@ -59,17 +59,11 @@ public:
*/
virtual void schedule(ICPPKernel *kernel, unsigned int split_dimension) = 0;
- /** Sets the target CPU architecture.
- *
- * @param[in] target Target CPU.
- */
- void set_target(CPUTarget target);
-
/** Get CPU info.
*
* @return CPU info.
*/
- CPUInfo cpu_info() const;
+ CPUInfo &cpu_info();
/** Get a hint for the best possible number of execution threads
*
* @warning In case we can't work out the best number of threads,
@@ -80,7 +74,7 @@ public:
unsigned int num_threads_hint() const;
protected:
- CPUInfo _info{};
+ CPUInfo _cpu_info;
private:
unsigned int _num_threads_hint = {};
diff --git a/arm_compute/runtime/NEON/AssemblyHelper.h b/arm_compute/runtime/NEON/AssemblyHelper.h
index e2d27cf941..40f28587c2 100644
--- a/arm_compute/runtime/NEON/AssemblyHelper.h
+++ b/arm_compute/runtime/NEON/AssemblyHelper.h
@@ -127,70 +127,32 @@ inline void allocate_workspace(size_t workspace_size, Tensor &workspace, MemoryG
/** Create a wrapper kernel.
*
- * @param[in] a Input tensor A.
- * @param[in] b Input tensor B.
- * @param[in] c (Optional) Input tensor C.
- * @param[out] d Output tensor.
- * @param[in] alpha Alpha value.
- * @param[in] beta Beta value.
- *
- * @return the wrapper kernel.
- */
-template <typename T>
-std::unique_ptr<NEGEMMAssemblyWrapper<T>> create_wrapper_kernel(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta)
-{
- // rework this function, why are we checking data type and other things here ? should we create another function can_run_optimised_kernel() ?
-#if defined(__arm__)
- if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
- {
- return support::cpp14::make_unique<NEGEMMAssemblyWrapper<T>>();
- }
-#elif defined(__aarch64__)
- if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
- {
- return support::cpp14::make_unique<NEGEMMAssemblyWrapper<T>>();
- }
- else if(a->info()->data_type() == DataType::F16 && (c == nullptr || beta == 0.f))
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- return support::cpp14::make_unique<NEGEMMAssemblyWrapper<T>>();
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- }
-#endif /* defined(__arm__) || defined(__aarch64__) */
- return nullptr;
-}
-
-/** Setup assembly kernel.
- *
* @param[in] a Input tensor A.
* @param[in] b Input tensor B.
- * @param[in] c (Optional) Input tensor C.
- * @param[in] d Output tensor.
+ * @param[out] d Output tensor.
* @param[in] alpha Alpha value.
* @param[in] beta Beta value.
* @param[out] workspace Workspace tensor
* @param[in] memory_group Tensor memory group.
* @param[out] asm_glue Assembly glue kernel.
*
- * @return True if the assembly kernel is setup correctly.
+ * @return True if the assembly kernel is setup correctly.
*/
template <typename T>
-inline bool setup_assembly_kernel(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta,
+inline bool setup_assembly_kernel(const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta,
Tensor &workspace, MemoryGroup &memory_group, T &asm_glue)
{
- const ::CPUInfo *ci = get_CPUInfo();
- const int M = d->info()->tensor_shape().y();
- const int N = d->info()->tensor_shape().x();
- const int K = a->info()->tensor_shape().x();
- unsigned int num_threads = NEScheduler::get().num_threads();
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ const int M = d->info()->tensor_shape().y();
+ const int N = d->info()->tensor_shape().x();
+ const int K = a->info()->tensor_shape().x();
+ unsigned int num_threads = NEScheduler::get().num_threads();
// unique_ptr to a Gemm object
- std::unique_ptr<typename T::AssemblyGemm> asm_gemm(arm_gemm::gemm<typename T::TypeOperator, typename T::TypeResult>(*ci, M, N, K, false, false, alpha, beta, num_threads,
- false));
-
+ std::unique_ptr<typename T::AssemblyGemm>
+ asm_gemm(arm_gemm::gemm<typename T::TypeOperator, typename T::TypeResult>(ci, M, N, K, false, false, alpha, beta, num_threads, false));
// arm_compute wrapper for the Gemm object (see above)
- std::unique_ptr<NEGEMMAssemblyWrapper<typename T::AssemblyGemm>> acl_gemm_wrapper = create_wrapper_kernel<typename T::AssemblyGemm>(a, b, c, d, alpha, beta);
+ std::unique_ptr<NEGEMMAssemblyWrapper<typename T::AssemblyGemm>>
+ acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapper<typename T::AssemblyGemm>>();
if(acl_gemm_wrapper != nullptr && asm_gemm != nullptr)
{
acl_gemm_wrapper->configure(asm_gemm.get());
@@ -198,15 +160,23 @@ inline bool setup_assembly_kernel(const ITensor *a, const ITensor *b, const ITen
if(workspace_size)
{
// Allocate workspace
- allocate_workspace(workspace_size, workspace, memory_group, 4096, num_threads);
+ const unsigned int alignment = 4096;
+ allocate_workspace(workspace_size, workspace, memory_group, alignment, num_threads);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(workspace.buffer());
asm_gemm->set_working_space(reinterpret_cast<typename T::TypeResult *>(workspace.buffer()));
}
- const unsigned int window_size = asm_gemm->get_window_size();
- if(window_size < num_threads)
+
+ //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and
+ //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
{
- num_threads = window_size;
- asm_gemm->set_nthreads(num_threads);
+ const unsigned int window_size = asm_gemm->get_window_size();
+ if(window_size < num_threads)
+ {
+ num_threads = window_size;
+ asm_gemm->set_nthreads(num_threads);
+ }
}
+
asm_glue._gemm_kernel_asm = std::move(asm_gemm);
asm_glue._optimised_kernel = std::move(acl_gemm_wrapper);
// We need to setup the ptrs in the run() method