aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arm_compute/core/CPP/CPPTypes.h125
-rw-r--r--arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp14
-rw-r--r--arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp407
-rw-r--r--arm_compute/runtime/CPUUtils.h44
-rw-r--r--arm_compute/runtime/IScheduler.h10
-rw-r--r--arm_compute/runtime/NEON/AssemblyHelper.h80
-rwxr-xr-xscripts/clang_tidy_rules.py3
-rw-r--r--src/core/CPP/CPPTypes.cpp115
-rw-r--r--src/core/NEON/kernels/arm_gemm/misc.cpp147
-rw-r--r--src/runtime/CPP/CPPScheduler.cpp4
-rw-r--r--src/runtime/CPP/SingleThreadScheduler.cpp4
-rw-r--r--src/runtime/CPUUtils.cpp404
-rw-r--r--src/runtime/IScheduler.cpp190
-rw-r--r--src/runtime/NEON/functions/NEGEMM.cpp3
-rw-r--r--src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp18
-rw-r--r--src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp4
-rw-r--r--src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp8
-rw-r--r--src/runtime/OMP/OMPScheduler.cpp6
-rw-r--r--support/ToolchainSupport.h24
-rw-r--r--tests/networks/AlexNetNetwork.h2
-rw-r--r--tests/validation/fixtures/ConvolutionLayerFixture.h17
21 files changed, 748 insertions, 881 deletions
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index 3abc0a2e88..8a9ada81c1 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -24,44 +24,115 @@
#ifndef __ARM_COMPUTE_CPP_TYPES_H__
#define __ARM_COMPUTE_CPP_TYPES_H__
+#include <vector>
+
namespace arm_compute
{
-/** Available CPU Targets */
-enum class CPUTarget
+/** CPU models - we only need to detect CPUs we have
+ * microarchitecture-specific code for.
+ *
+ * Architecture features are detected via HWCAPs.
+ */
+enum class CPUModel
{
- ARCH_MASK = 0x0F00,
- CPU_MODEL = 0x00FF,
- INTRINSICS = 0x0100,
- ARMV7 = 0x0200,
- ARMV8 = 0x0300,
- ARMV8_2 = 0x0400,
- A7x = 0x0070,
- A5x = 0x0050,
- DOT = 0x1000,
-
- A53 = (ARMV8 | A7x | 0x3),
- A55 = (ARMV8_2 | A5x | 0x5),
- A55_DOT = (A55 | DOT),
- A72 = (ARMV8 | A7x | 0x2),
- A73 = (ARMV8 | A7x | 0x3),
- A75 = (ARMV8_2 | A7x | 0x5),
- A75_DOT = (A75 | DOT),
+ GENERIC,
+ A53,
+ A55r0,
+ A55r1,
};
-/** Information about a CPU. */
-struct CPUInfo
+class CPUInfo final
{
- CPUTarget CPU{ CPUTarget::INTRINSICS }; /**< CPU target. */
- int L1_size{ 0 }; /**< Size of L1 cache. */
- int L2_size{ 0 }; /**< Size of L2 cache. */
+public:
+ /** Constructor */
+ CPUInfo();
+
+ /** Disable copy constructor and assignment operator to avoid copying the vector of CPUs each time.
+ * CPUInfo is initialized once in the IScheduler and ThreadInfo will get a pointer to it.
+ */
+ CPUInfo &operator=(const CPUInfo &cpuinfo) = delete;
+ CPUInfo(const CPUInfo &cpuinfo) = delete;
+ CPUInfo &operator=(const CPUInfo &&cpuinfo) = delete;
+ CPUInfo(const CPUInfo &&cpuinfo) = delete;
+
+ /** Checks if the cpu model supports fp16.
+ *
+ * @return true if the cpu supports fp16, false otherwise
+ */
+ bool has_fp16() const;
+ /** Checks if the cpu model supports dot product.
+ *
+ * @return true if the cpu supports dot product, false otherwise
+ */
+ bool has_dotprod() const;
+ /** Gets the cpu model for a given cpuid.
+ *
+ * @param[in] cpuid the id of the cpu core to be retrieved.
+ *
+ * @return the @ref CPUModel of the cpuid queried.
+ */
+ CPUModel get_cpu_model(unsigned int cpuid) const;
+ /** Gets the current thread's cpu model
+ *
+ * @return Current thread's @ref CPUModel
+ */
+ CPUModel get_cpu_model() const;
+ /** Gets the L1 cache size
+ *
+ * @return the size of the L1 cache
+ */
+ unsigned int get_L1_cache_size() const;
+ /** Gets the L2 cache size
+ *
+ * @return the size of the L2 cache
+ */
+ unsigned int get_L2_cache_size() const;
+ /** Set the L1 cache size
+ *
+ * @param[in] size the new size to be set.
+ */
+ void set_L1_cache_size(unsigned int size);
+ /** Set the L2 cache size
+ *
+ * @param[in] size the new size to be set.
+ */
+ void set_L2_cache_size(unsigned int size);
+ /** Set fp16 support
+ *
+ * @param[in] fp16 whether the cpu supports fp16.
+ */
+ void set_fp16(const bool fp16);
+ /** Set dot product support
+ *
+ * @param[in] dotprod whether the cpu supports dot product.
+ */
+ void set_dotprod(const bool dotprod);
+ /** Set the cpumodel for a given cpu core
+ *
+ * @param[in] cpuid the id of the core to be set.
+ * @param[in] model the @ref CPUModel to be set.
+ */
+ void set_cpu_model(unsigned int cpuid, CPUModel model);
+ /** Set max number of cpus
+ *
+ * @param[in] cpu_count the number of CPUs in the system.
+ */
+ void set_cpu_num(unsigned int cpu_count);
+
+private:
+ std::vector<CPUModel> _percpu = {};
+ bool _fp16 = false;
+ bool _dotprod = false;
+ unsigned int _L1_cache_size = 32768;
+ unsigned int _L2_cache_size = 262144;
};
/** Information about executing thread and CPU. */
struct ThreadInfo
{
- int thread_id{ 0 }; /**< Executing thread. */
- int num_threads{ 1 }; /**< Number of CPU threads. */
- CPUInfo cpu_info{}; /**< CPU information. */
+ int thread_id{ 0 };
+ int num_threads{ 1 };
+ const CPUInfo *cpu_info{ nullptr };
};
} // namespace arm_compute
#endif /* __ARM_COMPUTE_CPP_TYPES_H__ */
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
index a608566634..8d3db4adf2 100644
--- a/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
@@ -23,7 +23,15 @@
*/
#pragma once
-/* This file is used to configure integration-specific aspects of arm_gemm, this is the gemm-linux version */
+/* This file is used to configure integration-specific aspects of arm_gemm into ACL */
+
+#include "arm_compute/core/CPP/CPPTypes.h"
+
+namespace arm_gemm
+{
+using CPUModel = arm_compute::CPUModel;
+using CPUInfo = arm_compute::CPUInfo;
+} // namespace arm_gemm
+
+
-/* Our CPUInfo is defined in newgemm_lib.hpp */
-#include "newgemm_lib.hpp"
diff --git a/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp b/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp
deleted file mode 100644
index 0e232b6bc5..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp
+++ /dev/null
@@ -1,407 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <fcntl.h>
-#include <sched.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include <fstream>
-#include <iostream>
-#include <regex>
-#include <sstream>
-#include <thread>
-
-extern int l1_cache_size;
-extern int l2_cache_size;
-extern int force_cpu;
-
-#ifdef __ANDROID__
-inline unsigned long stoul( const std::string& str, std::size_t* pos = 0, int base = 10 )
-{
- char *end;
- const unsigned long ret = strtoul( str.c_str(), &end, base);
- *pos = end - str.c_str();
- return ret;
-}
-inline int stoi( const std::string& str, std::size_t* pos = 0, int base = 10 )
-{
- return atoi(str.c_str());
-}
-#endif
-
-
-#if ! defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
-#include <sys/auxv.h>
-
-/* Get HWCAP bits from asm/hwcap.h */
-#include <asm/hwcap.h>
-#endif /* !BARE_METAL */
-
-/* Make sure the bits we care about are defined, just in case asm/hwcap.h is
- * out of date (or for bare metal mode) */
-#ifndef HWCAP_ASIMDHP
-#define HWCAP_ASIMDHP (1 << 10)
-#endif
-
-#ifndef HWCAP_CPUID
-#define HWCAP_CPUID (1 << 11)
-#endif
-
-#ifndef HWCAP_ASIMDDP
-#define HWCAP_ASIMDDP (1 << 20)
-#endif
-
-#define CPUINFO_HACK
-
-//unsigned int get_cpu_impl();
-
-
-/* CPU models - we only need to detect CPUs we have
- * microarchitecture-specific code for.
- *
- * Architecture features are detected via HWCAPs.
- */
-enum class CPUModel {
- GENERIC = 0x0001,
- A53 = 0x0010,
- A55r0 = 0x0011,
- A55r1 = 0x0012,
-};
-
-class CPUInfo
-{
-private:
- struct PerCPUData {
- CPUModel model = CPUModel::GENERIC;
- uint32_t midr = 0;
- bool model_set = false;
- };
-
- std::vector<PerCPUData> _percpu={};
-
- bool _cpuid = false;
- bool _fp16 = false;
- bool _dotprod = false;
-
- unsigned int L1_cache_size = 32768;
- unsigned int L2_cache_size = 262144;
-
- /* Convert an MIDR register value to a CPUModel enum value. */
- CPUModel midr_to_model(const unsigned int midr) const {
- CPUModel model;
-
- // Unpack variant and CPU ID
- int variant = (midr >> 20) & 0xF;
- int cpunum = (midr >> 4) & 0xFFF;
-
- /* Only CPUs we have code paths for are detected. All other CPUs
- * can be safely classed as "GENERIC"
- */
-
- switch(cpunum) {
- case 0xd03:
- model = CPUModel::A53;
- break;
-
- case 0xd05:
- if (variant) {
- model = CPUModel::A55r1;
- } else {
- model = CPUModel::A55r0;
- }
- break;
-
- default:
- model = CPUModel::GENERIC;
- break;
- }
-
- return model;
- }
-
- /* If the CPUID capability is present, MIDR information is provided in
- /sys. Use that to populate the CPU model table. */
- void populate_models_cpuid() {
- for (unsigned long int i=0; i<_percpu.size(); i++) {
- std::stringstream str;
- str << "/sys/devices/system/cpu/cpu" << i << "/regs/identification/midr_el1";
- std::ifstream file;
-
- file.open(str.str(), std::ios::in);
-
- if (file.is_open()) {
- std::string line;
-
- if (bool(getline(file, line))) {
- const unsigned long midr = stoul(line, nullptr, 16);
-
- _percpu[i].midr = (midr & 0xffffffff);
- _percpu[i].model = midr_to_model(_percpu[i].midr);
- _percpu[i].model_set = true;
- }
- }
- }
- }
-
- /* If "long-form" cpuinfo is present, parse that to populate models. */
- void populate_models_cpuinfo() {
- std::regex proc_regex("^processor.*(\\d+)$");
- std::regex imp_regex("^CPU implementer.*0x(..)$");
- std::regex var_regex("^CPU variant.*0x(.)$");
- std::regex part_regex("^CPU part.*0x(...)$");
- std::regex rev_regex("^CPU revision.*(\\d+)$");
-
- std::ifstream file;
- file.open("/proc/cpuinfo", std::ios::in);
-
- if (file.is_open()) {
- std::string line;
- int midr=0;
- int curcpu=-1;
-
- while(bool(getline(file, line))) {
- std::smatch match;
-
- if (std::regex_match(line, match, proc_regex)) {
- std::string id = match[1];
- int newcpu=stoi(id, nullptr, 0);
-
- if (curcpu >= 0 && midr==0) {
- // Matched a new CPU ID without any description of the previous one - looks like old format.
- return;
- }
-
- if (curcpu >= 0) {
- _percpu[curcpu].midr = midr;
- _percpu[curcpu].model = midr_to_model(midr);
- _percpu[curcpu].model_set = true;
- }
-
- midr=0;
- curcpu=newcpu;
-
- continue;
- }
-
- if (std::regex_match(line, match, imp_regex)) {
- int impv = stoi(match[1], nullptr, 16);
- midr |= (impv << 24);
- continue;
- }
-
- if (std::regex_match(line, match, var_regex)) {
- int varv = stoi(match[1], nullptr, 16);
- midr |= (varv << 16);
- continue;
- }
-
- if (std::regex_match(line, match, part_regex)) {
- int partv = stoi(match[1], nullptr, 16);
- midr |= (partv << 4);
- continue;
- }
-
- if (std::regex_match(line, match, rev_regex)) {
- int regv = stoi(match[1], nullptr, 10);
- midr |= (regv);
- midr |= (0xf << 16);
- continue;
- }
- }
-
- if (curcpu >= 0) {
- _percpu[curcpu].midr = midr;
- _percpu[curcpu].model = midr_to_model(midr);
- _percpu[curcpu].model_set = true;
-
- }
- }
- }
-
- /* Identify the maximum valid CPUID in the system. This reads
- * /sys/devices/system/cpu/present to get the information. */
- int get_max_cpus() {
- int max_cpus = 1;
-
-#if ! defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
- std::ifstream CPUspresent;
- CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in);
- bool success = false;
-
- if (CPUspresent.is_open()) {
- std::string line;
-
- if (bool(getline(CPUspresent, line))) {
- /* The content of this file is a list of ranges or single values, e.g.
- * 0-5, or 1-3,5,7 or similar. As we are interested in the
- * max valid ID, we just need to find the last valid
- * delimiter ('-' or ',') and parse the integer immediately after that.
- */
- auto startfrom=line.begin();
-
- for (auto i=line.begin(); i<line.end(); ++i) {
- if (*i=='-' || *i==',') {
- startfrom=i+1;
- }
- }
-
- line.erase(line.begin(), startfrom);
-
- max_cpus = stoi(line, nullptr, 0) + 1;
- success = true;
- }
- }
-
- // Return std::thread::hardware_concurrency() as a fallback.
- if (!success) {
- max_cpus = std::thread::hardware_concurrency();
- }
-#endif // !BARE_METAL
-
- return max_cpus;
- }
-
-public:
- CPUInfo() {
-#if ! defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
- unsigned long hwcaps = getauxval(AT_HWCAP);
-
- if (hwcaps & HWCAP_CPUID) {
- _cpuid = true;
- }
-
- if (hwcaps & HWCAP_ASIMDHP) {
- _fp16 = true;
- }
-
- if (hwcaps & HWCAP_ASIMDDP) {
- _dotprod = true;
- }
-
-#ifdef __aarch64__
- /* Pre-4.15 kernels don't have the ASIMDDP bit.
- *
- * Although the CPUID bit allows us to read the feature register
- * directly, the kernel quite sensibly masks this to only show
- * features known by it to be safe to show to userspace. As a
- * result, pre-4.15 kernels won't show the relevant bit in the
- * feature registers either.
- *
- * So for now, use a whitelist of CPUs known to support the feature.
- */
- if (!_dotprod && _cpuid) {
- /* List of CPUs with dot product support: A55r1 A75r1 A75r2 */
- const unsigned int dotprod_whitelist_masks[] = { 0xfff0fff0, 0xfff0fff0, 0xfff0fff0, 0 };
- const unsigned int dotprod_whitelist_values[] = { 0x4110d050, 0x4110d0a0, 0x4120d0a0, 0 };
-
- unsigned long cpuid;
-
- __asm __volatile (
- "mrs %0, midr_el1\n"
- : "=r" (cpuid)
- :
- :
- );
-
- for (int i=0;dotprod_whitelist_values[i];i++) {
- if ((cpuid & dotprod_whitelist_masks[i]) == dotprod_whitelist_values[i]) {
- _dotprod = true;
- break;
- }
- }
- }
-#endif
- _percpu.resize(get_max_cpus());
-#endif
- if (_cpuid) {
- populate_models_cpuid();
- } else {
- populate_models_cpuinfo();
- }
- }
-
- void set_fp16(const bool fp16) {
- _fp16 = fp16;
- }
-
- void set_dotprod(const bool dotprod) {
- _dotprod = dotprod;
- }
-
- void set_cpu_model(unsigned long cpuid, CPUModel model) {
- if (_percpu.size() > cpuid) {
- _percpu[cpuid].model = model;
- _percpu[cpuid].model_set = true;
- }
- }
-
- bool has_fp16() const {
- return _fp16;
- }
-
- bool has_dotprod() const {
- return _dotprod;
- }
-
- CPUModel get_cpu_model(unsigned long cpuid) const {
- if (cpuid < _percpu.size()) {
- return _percpu[cpuid].model;
- }
-
- return CPUModel::GENERIC;
- }
-
- CPUModel get_cpu_model() const {
-#if defined(BARE_METAL) || (!defined(__arm__) && !defined( __aarch64__) )
- return get_cpu_model(0);
-#else
- return get_cpu_model(sched_getcpu());
-#endif
- }
-
- unsigned int get_L1_cache_size() const {
- return L1_cache_size;
- }
-
- void set_L1_cache_size(unsigned int size) {
- L1_cache_size = size;
- }
-
- unsigned int get_L2_cache_size() const {
- return L2_cache_size;
- }
-
- void set_L2_cache_size(unsigned int size) {
- L2_cache_size = size;
- }
-};
-
-CPUInfo *get_CPUInfo();
diff --git a/arm_compute/runtime/CPUUtils.h b/arm_compute/runtime/CPUUtils.h
new file mode 100644
index 0000000000..70211a5817
--- /dev/null
+++ b/arm_compute/runtime/CPUUtils.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_RUNTIME_CPU_UTILS_H__
+#define __ARM_COMPUTE_RUNTIME_CPU_UTILS_H__
+
+namespace arm_compute
+{
+class CPUInfo;
+/** This function will try to detect the CPU configuration on the system and will fill
+ * the cpuinfo object accordingly to reflect this.
+ *
+ * @param[out] cpuinfo @ref CPUInfo to be used to hold the system's cpu configuration.
+ */
+void get_cpu_configuration(CPUInfo &cpuinfo);
+/** Some systems have both big and small cores, this function computes the minimum number of cores
+ * that are exactly the same on the system. To maximize performance the library attempts to process
+ * workloads concurrently using as many threads as big cores are available on the system.
+ *
+ * @return The minimum number of common cores.
+ */
+unsigned int get_threads_hint();
+}
+#endif /* __ARM_COMPUTE_RUNTIME_CPU_UTILS_H__ */
diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
index 1dd7c2cfb2..a0bcada722 100644
--- a/arm_compute/runtime/IScheduler.h
+++ b/arm_compute/runtime/IScheduler.h
@@ -59,17 +59,11 @@ public:
*/
virtual void schedule(ICPPKernel *kernel, unsigned int split_dimension) = 0;
- /** Sets the target CPU architecture.
- *
- * @param[in] target Target CPU.
- */
- void set_target(CPUTarget target);
-
/** Get CPU info.
*
* @return CPU info.
*/
- CPUInfo cpu_info() const;
+ CPUInfo &cpu_info();
/** Get a hint for the best possible number of execution threads
*
* @warning In case we can't work out the best number of threads,
@@ -80,7 +74,7 @@ public:
unsigned int num_threads_hint() const;
protected:
- CPUInfo _info{};
+ CPUInfo _cpu_info;
private:
unsigned int _num_threads_hint = {};
diff --git a/arm_compute/runtime/NEON/AssemblyHelper.h b/arm_compute/runtime/NEON/AssemblyHelper.h
index e2d27cf941..40f28587c2 100644
--- a/arm_compute/runtime/NEON/AssemblyHelper.h
+++ b/arm_compute/runtime/NEON/AssemblyHelper.h
@@ -127,70 +127,32 @@ inline void allocate_workspace(size_t workspace_size, Tensor &workspace, MemoryG
/** Create a wrapper kernel.
*
- * @param[in] a Input tensor A.
- * @param[in] b Input tensor B.
- * @param[in] c (Optional) Input tensor C.
- * @param[out] d Output tensor.
- * @param[in] alpha Alpha value.
- * @param[in] beta Beta value.
- *
- * @return the wrapper kernel.
- */
-template <typename T>
-std::unique_ptr<NEGEMMAssemblyWrapper<T>> create_wrapper_kernel(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta)
-{
- // rework this function, why are we checking data type and other things here ? should we create another function can_run_optimised_kernel() ?
-#if defined(__arm__)
- if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
- {
- return support::cpp14::make_unique<NEGEMMAssemblyWrapper<T>>();
- }
-#elif defined(__aarch64__)
- if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
- {
- return support::cpp14::make_unique<NEGEMMAssemblyWrapper<T>>();
- }
- else if(a->info()->data_type() == DataType::F16 && (c == nullptr || beta == 0.f))
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- return support::cpp14::make_unique<NEGEMMAssemblyWrapper<T>>();
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- }
-#endif /* defined(__arm__) || defined(__aarch64__) */
- return nullptr;
-}
-
-/** Setup assembly kernel.
- *
* @param[in] a Input tensor A.
* @param[in] b Input tensor B.
- * @param[in] c (Optional) Input tensor C.
- * @param[in] d Output tensor.
+ * @param[out] d Output tensor.
* @param[in] alpha Alpha value.
* @param[in] beta Beta value.
* @param[out] workspace Workspace tensor
* @param[in] memory_group Tensor memory group.
* @param[out] asm_glue Assembly glue kernel.
*
- * @return True if the assembly kernel is setup correctly.
+ * @return the wrapper kernel.
*/
template <typename T>
-inline bool setup_assembly_kernel(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta,
+inline bool setup_assembly_kernel(const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta,
Tensor &workspace, MemoryGroup &memory_group, T &asm_glue)
{
- const ::CPUInfo *ci = get_CPUInfo();
- const int M = d->info()->tensor_shape().y();
- const int N = d->info()->tensor_shape().x();
- const int K = a->info()->tensor_shape().x();
- unsigned int num_threads = NEScheduler::get().num_threads();
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ const int M = d->info()->tensor_shape().y();
+ const int N = d->info()->tensor_shape().x();
+ const int K = a->info()->tensor_shape().x();
+ unsigned int num_threads = NEScheduler::get().num_threads();
// unique_ptr to a Gemm object
- std::unique_ptr<typename T::AssemblyGemm> asm_gemm(arm_gemm::gemm<typename T::TypeOperator, typename T::TypeResult>(*ci, M, N, K, false, false, alpha, beta, num_threads,
- false));
-
+ std::unique_ptr<typename T::AssemblyGemm>
+ asm_gemm(arm_gemm::gemm<typename T::TypeOperator, typename T::TypeResult>(ci, M, N, K, false, false, alpha, beta, num_threads, false));
// arm_compute wrapper for the Gemm object (see above)
- std::unique_ptr<NEGEMMAssemblyWrapper<typename T::AssemblyGemm>> acl_gemm_wrapper = create_wrapper_kernel<typename T::AssemblyGemm>(a, b, c, d, alpha, beta);
+ std::unique_ptr<NEGEMMAssemblyWrapper<typename T::AssemblyGemm>>
+ acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapper<typename T::AssemblyGemm>>();
if(acl_gemm_wrapper != nullptr && asm_gemm != nullptr)
{
acl_gemm_wrapper->configure(asm_gemm.get());
@@ -198,15 +160,23 @@ inline bool setup_assembly_kernel(const ITensor *a, const ITensor *b, const ITen
if(workspace_size)
{
// Allocate workspace
- allocate_workspace(workspace_size, workspace, memory_group, 4096, num_threads);
+ const unsigned int alignment = 4096;
+ allocate_workspace(workspace_size, workspace, memory_group, alignment, num_threads);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(workspace.buffer());
asm_gemm->set_working_space(reinterpret_cast<typename T::TypeResult *>(workspace.buffer()));
}
- const unsigned int window_size = asm_gemm->get_window_size();
- if(window_size < num_threads)
+
+ //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and
+ //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
{
- num_threads = window_size;
- asm_gemm->set_nthreads(num_threads);
+ const unsigned int window_size = asm_gemm->get_window_size();
+ if(window_size < num_threads)
+ {
+ num_threads = window_size;
+ asm_gemm->set_nthreads(num_threads);
+ }
}
+
asm_glue._gemm_kernel_asm = std::move(asm_gemm);
asm_glue._optimised_kernel = std::move(acl_gemm_wrapper);
// We need to setup the ptrs in the run() method
diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py
index e5793e5061..6c5b8ca26b 100755
--- a/scripts/clang_tidy_rules.py
+++ b/scripts/clang_tidy_rules.py
@@ -56,6 +56,7 @@ def filter_clang_tidy_lines( lines ):
("NEMath.inl" in line and "statement expression not allowed at file scope" in line) or
("Utils.h" in line and "no member named 'unmap' in 'arm_compute::Tensor'" in line) or
("Utils.h" in line and "no member named 'map' in 'arm_compute::Tensor'" in line) or
+ ("CPUUtils.cpp" in line and "'asm/hwcap.h' file not found" in line) or
"3rdparty" in line):
print_context=False
continue
@@ -95,6 +96,8 @@ def filter_clang_tidy_lines( lines ):
("NEWinogradLayerKernel.cpp" in line and "use '= default' to define a trivial destructor" in line) or
("NEGEMMLowpMatrixMultiplyCore.cpp" in line and "constructor does not initialize these fields" in line) or
("NEGEMMLowpAssemblyMatrixMultiplyCore" in line and "constructor does not initialize these fields" in line) or
+ ("CPUUtils.cpp" in line and "consider replacing 'unsigned long' with 'uint64'" in line) or
+ ("CPUUtils.cpp" in line and "parameter 'cpusv' is unused" in line) or
"3rdparty" in line):
print_context=False
continue
diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp
new file mode 100644
index 0000000000..7459957f8f
--- /dev/null
+++ b/src/core/CPP/CPPTypes.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CPP/CPPTypes.h"
+
+#include "arm_compute/core/Error.h"
+
+#ifndef BARE_METAL
+#include <sched.h>
+#endif /* defined(BARE_METAL) */
+
+using namespace arm_compute;
+
+void CPUInfo::set_fp16(const bool fp16)
+{
+ _fp16 = fp16;
+}
+
+void CPUInfo::set_dotprod(const bool dotprod)
+{
+ _dotprod = dotprod;
+}
+
+void CPUInfo::set_cpu_model(unsigned int cpuid, CPUModel model)
+{
+ ARM_COMPUTE_ERROR_ON(cpuid >= _percpu.size());
+ if(_percpu.size() > cpuid)
+ {
+ _percpu[cpuid] = model;
+ }
+}
+
+bool CPUInfo::has_fp16() const
+{
+ return _fp16;
+}
+
+bool CPUInfo::has_dotprod() const
+{
+ return _dotprod;
+}
+
+CPUModel CPUInfo::get_cpu_model(unsigned int cpuid) const
+{
+ ARM_COMPUTE_ERROR_ON(cpuid >= _percpu.size());
+ if(cpuid < _percpu.size())
+ {
+ return _percpu[cpuid];
+ }
+ return CPUModel::GENERIC;
+}
+
+unsigned int CPUInfo::get_L1_cache_size() const
+{
+ return _L1_cache_size;
+}
+
+void CPUInfo::set_L1_cache_size(unsigned int size)
+{
+ _L1_cache_size = size;
+}
+
+unsigned int CPUInfo::get_L2_cache_size() const
+{
+ return _L2_cache_size;
+}
+
+void CPUInfo::set_L2_cache_size(unsigned int size)
+{
+ _L2_cache_size = size;
+}
+
+void CPUInfo::set_cpu_num(unsigned int cpu_count)
+{
+ _percpu.resize(cpu_count);
+}
+
+CPUInfo::CPUInfo()
+ : _percpu(1)
+{
+ // The core library knows nothing about the CPUs so we set only 1 CPU to be generic.
+ // The runtime NEScheduler will initialise this vector with the correct CPU models.
+ // See void get_cpu_configuration(CPUInfo &cpuinfo) in CPUUtils.h
+ _percpu[0] = CPUModel::GENERIC;
+}
+
+CPUModel CPUInfo::get_cpu_model() const
+{
+#if defined(BARE_METAL) || (!defined(__arm__) && !defined(__aarch64__))
+ return get_cpu_model(0);
+#else /* defined(BARE_METAL) || (!defined(__arm__) && !defined(__aarch64__)) */
+ return get_cpu_model(sched_getcpu());
+#endif /* defined(BARE_METAL) || (!defined(__arm__) && !defined(__aarch64__)) */
+}
diff --git a/src/core/NEON/kernels/arm_gemm/misc.cpp b/src/core/NEON/kernels/arm_gemm/misc.cpp
deleted file mode 100644
index b29cc58d5d..0000000000
--- a/src/core/NEON/kernels/arm_gemm/misc.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <newgemm_lib.hpp>
-
-unsigned int get_cpu_impl()
-{
-#ifndef BARE_METAL
- int fd = open("/proc/cpuinfo", 0);
- char buff[3000];
- char *pos;
- char *end;
- int foundid = 0;
- int variant = 0;
-
- int cpu = sched_getcpu();
-
- if(!fd)
- {
- return 0;
- }
-
- int charsread = read(fd, buff, 3000);
- pos = buff;
- end = buff + charsread;
-
- close(fd);
-
- /* So, to date I've encountered two formats for /proc/cpuinfo.
- *
- * One of them just lists processor : n for each processor (with no
- * other info), then at the end lists part information for the current
- * CPU.
- *
- * The other has an entire clause (including part number info) for each
- * CPU in the system, with "processor : n" headers.
- *
- * We can cope with either of these formats by waiting to see
- * "processor: n" (where n = our CPU ID), and then looking for the next
- * "CPU part" field.
- */
- while(pos < end)
- {
- if(foundid && !strncmp(pos, "CPU variant", 11))
- {
- pos += 13;
- char *resume = end; // Need to continue scanning after this
-
- for(char *ch = pos; ch < end; ch++)
- {
- if(*ch == '\n')
- {
- *ch = '\0';
- resume = ch + 1;
- break;
- }
- }
-
- variant = strtoul(pos, NULL, 0);
-
- pos = resume;
- }
-
- if(foundid && !strncmp(pos, "CPU part", 8))
- {
- /* Found part number */
- pos += 11;
- unsigned int num;
-
- for(char *ch = pos; ch < end; ch++)
- {
- if(*ch == '\n')
- {
- *ch = '\0';
- break;
- }
- }
-
- num = strtoul(pos, NULL, 0);
-
- return (num << 4) | (variant << 20);
- }
-
- if(!strncmp(pos, "processor", 9))
- {
- /* Found processor ID, see if it's ours. */
- pos += 11;
- int num;
-
- for(char *ch = pos; ch < end; ch++)
- {
- if(*ch == '\n')
- {
- *ch = '\0';
- break;
- }
- }
-
- num = strtol(pos, NULL, 0);
-
- if(num == cpu)
- {
- foundid = 1;
- }
- }
-
- while(pos < end)
- {
- char ch = *pos++;
- if(ch == '\n' || ch == '\0')
- {
- break;
- }
- }
- }
-#endif
-
- return 0;
-}
-
-CPUInfo *get_CPUInfo()
-{
- static CPUInfo ci;
-
- return &ci;
-}
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 168ed6e30f..92dce34c71 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/CPUUtils.h"
#include <condition_variable>
#include <iostream>
@@ -159,6 +160,7 @@ CPPScheduler::CPPScheduler()
: _num_threads(num_threads_hint()),
_threads(_num_threads - 1)
{
+ get_cpu_configuration(_cpu_info);
}
void CPPScheduler::set_num_threads(unsigned int num_threads)
@@ -178,7 +180,7 @@ void CPPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
/** [Scheduler example] */
ThreadInfo info;
- info.cpu_info = _info;
+ info.cpu_info = &_cpu_info;
const Window &max_window = kernel->window();
const unsigned int num_iterations = max_window.num_iterations(split_dimension);
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
index c8285b43a7..2adc14ce80 100644
--- a/src/runtime/CPP/SingleThreadScheduler.cpp
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@ void SingleThreadScheduler::schedule(ICPPKernel *kernel, unsigned int split_dime
{
ARM_COMPUTE_UNUSED(split_dimension);
ThreadInfo info;
- info.cpu_info = cpu_info();
+ info.cpu_info = &_cpu_info;
kernel->run(kernel->window(), info);
}
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
new file mode 100644
index 0000000000..7e8bf2bb3f
--- /dev/null
+++ b/src/runtime/CPUUtils.cpp
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPUUtils.h"
+
+#include "arm_compute/core/CPP/CPPTypes.h"
+#include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
+
+#include <array>
+#include <cstdlib>
+#include <cstring>
+#include <fcntl.h>
+#include <fstream>
+#include <map>
+#include <sched.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#ifndef BARE_METAL
+#include <regex>
+#include <thread>
+#endif /* BARE_METAL */
+
+#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+#include <sys/auxv.h>
+
+/* Get HWCAP bits from asm/hwcap.h */
+#include <asm/hwcap.h>
+#endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
+
+/* Make sure the bits we care about are defined, just in case asm/hwcap.h is
+ * out of date (or for bare metal mode) */
+#ifndef HWCAP_ASIMDHP
+#define HWCAP_ASIMDHP (1 << 10)
+#endif /* HWCAP_ASIMDHP */
+
+#ifndef HWCAP_CPUID
+#define HWCAP_CPUID (1 << 11)
+#endif /* HWCAP_CPUID */
+
+#ifndef HWCAP_ASIMDDP
+#define HWCAP_ASIMDDP (1 << 20)
+#endif /* HWCAP_ASIMDDP */
+
+namespace
+{
+using namespace arm_compute;
+
+#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+struct PerCPUData
+{
+ CPUModel model = CPUModel::GENERIC;
+ unsigned int midr = 0;
+ bool model_set = false;
+};
+
+/* Convert an MIDR register value to a CPUModel enum value. */
+CPUModel midr_to_model(const unsigned int midr)
+{
+ CPUModel model;
+
+ // Unpack variant and CPU ID
+ const int variant = (midr >> 20) & 0xF;
+ const int cpunum = (midr >> 4) & 0xFFF;
+
+ // Only CPUs we have code paths for are detected. All other CPUs can be safely classed as "GENERIC"
+ switch(cpunum)
+ {
+ case 0xd03:
+ model = CPUModel::A53;
+ break;
+
+ case 0xd05:
+ if(variant != 0)
+ {
+ model = CPUModel::A55r1;
+ }
+ else
+ {
+ model = CPUModel::A55r0;
+ }
+ break;
+
+ default:
+ model = CPUModel::GENERIC;
+ break;
+ }
+
+ return model;
+}
+
+void populate_models_cpuid(std::vector<PerCPUData> &cpusv)
+{
+ // If the CPUID capability is present, MIDR information is provided in /sys. Use that to populate the CPU model table.
+ uint32_t i = 0;
+ for(auto &c : cpusv)
+ {
+ std::stringstream str;
+ str << "/sys/devices/system/cpu/cpu" << i++ << "/regs/identification/midr_el1";
+ std::ifstream file;
+ file.open(str.str(), std::ios::in);
+ if(file.is_open())
+ {
+ std::string line;
+ if(bool(getline(file, line)))
+ {
+ const unsigned long midr = support::cpp11::stoul(line, nullptr, 16);
+ c.midr = (midr & 0xffffffff);
+ c.model = midr_to_model(c.midr);
+ c.model_set = true;
+ }
+ }
+ }
+}
+
+void populate_models_cpuinfo(std::vector<PerCPUData> &cpusv)
+{
+ // If "long-form" cpuinfo is present, parse that to populate models.
+ std::regex proc_regex("^processor.*(\\d+)$");
+ std::regex imp_regex("^CPU implementer.*0x(..)$");
+ std::regex var_regex("^CPU variant.*0x(.)$");
+ std::regex part_regex("^CPU part.*0x(...)$");
+ std::regex rev_regex("^CPU revision.*(\\d+)$");
+
+ std::ifstream file;
+ file.open("/proc/cpuinfo", std::ios::in);
+
+ if(file.is_open())
+ {
+ std::string line;
+ int midr = 0;
+ int curcpu = -1;
+
+ while(bool(getline(file, line)))
+ {
+ std::smatch match;
+
+ if(std::regex_match(line, match, proc_regex))
+ {
+ std::string id = match[1];
+ int newcpu = support::cpp11::stoi(id, nullptr, 0);
+
+ if(curcpu >= 0 && midr == 0)
+ {
+ // Matched a new CPU ID without any description of the previous one - looks like old format.
+ return;
+ }
+
+ if(curcpu >= 0)
+ {
+ cpusv[curcpu].midr = midr;
+ cpusv[curcpu].model = midr_to_model(midr);
+ cpusv[curcpu].model_set = true;
+ }
+
+ midr = 0;
+ curcpu = newcpu;
+
+ continue;
+ }
+
+ if(std::regex_match(line, match, imp_regex))
+ {
+ int impv = support::cpp11::stoi(match[1], nullptr, 16);
+ midr |= (impv << 24);
+ continue;
+ }
+
+ if(std::regex_match(line, match, var_regex))
+ {
+ int varv = support::cpp11::stoi(match[1], nullptr, 16);
+ midr |= (varv << 16);
+ continue;
+ }
+
+ if(std::regex_match(line, match, part_regex))
+ {
+ int partv = support::cpp11::stoi(match[1], nullptr, 16);
+ midr |= (partv << 4);
+ continue;
+ }
+
+ if(std::regex_match(line, match, rev_regex))
+ {
+ int regv = support::cpp11::stoi(match[1], nullptr, 10);
+ midr |= (regv);
+ midr |= (0xf << 16);
+ continue;
+ }
+ }
+
+ if(curcpu >= 0)
+ {
+ cpusv[curcpu].midr = midr;
+ cpusv[curcpu].model = midr_to_model(midr);
+ cpusv[curcpu].model_set = true;
+ }
+ }
+}
+
+int get_max_cpus()
+{
+ int max_cpus = 1;
+#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+ std::ifstream CPUspresent;
+ CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in);
+ bool success = false;
+
+ if(CPUspresent.is_open())
+ {
+ std::string line;
+
+ if(bool(getline(CPUspresent, line)))
+ {
+ /* The content of this file is a list of ranges or single values, e.g.
+ * 0-5, or 1-3,5,7 or similar. As we are interested in the
+ * max valid ID, we just need to find the last valid
+ * delimiter ('-' or ',') and parse the integer immediately after that.
+ */
+ auto startfrom = line.begin();
+
+ for(auto i = line.begin(); i < line.end(); ++i)
+ {
+ if(*i == '-' || *i == ',')
+ {
+ startfrom = i + 1;
+ }
+ }
+
+ line.erase(line.begin(), startfrom);
+
+ max_cpus = support::cpp11::stoi(line, nullptr, 0) + 1;
+ success = true;
+ }
+ }
+
+ // Return std::thread::hardware_concurrency() as a fallback.
+ if(!success)
+ {
+ max_cpus = std::thread::hardware_concurrency();
+ }
+#endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
+
+ return max_cpus;
+}
+#endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
+
+} // namespace
+
+namespace arm_compute
+{
+void get_cpu_configuration(CPUInfo &cpuinfo)
+{
+#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+ bool cpuid = false;
+ bool fp16_support = false;
+ bool dot_support = false;
+
+ const uint32_t hwcaps = getauxval(AT_HWCAP);
+
+ if((hwcaps & HWCAP_CPUID) != 0)
+ {
+ cpuid = true;
+ }
+
+ if((hwcaps & HWCAP_ASIMDHP) != 0)
+ {
+ fp16_support = true;
+ }
+
+ if((hwcaps & HWCAP_ASIMDDP) != 0)
+ {
+ dot_support = true;
+ }
+
+#ifdef __aarch64__
+ /* Pre-4.15 kernels don't have the ASIMDDP bit.
+ *
+ * Although the CPUID bit allows us to read the feature register
+ * directly, the kernel quite sensibly masks this to only show
+ * features known by it to be safe to show to userspace. As a
+ * result, pre-4.15 kernels won't show the relevant bit in the
+ * feature registers either.
+ *
+ * So for now, use a whitelist of CPUs known to support the feature.
+ */
+ if(!dot_support && cpuid)
+ {
+ /* List of CPUs with dot product support: A55r1 A75r1 A75r2 */
+ const unsigned int dotprod_whitelist_masks[] = { 0xfff0fff0, 0xfff0fff0, 0xfff0fff0, 0 };
+ const unsigned int dotprod_whitelist_values[] = { 0x4110d050, 0x4110d0a0, 0x4120d0a0, 0 };
+
+ unsigned long cpuid;
+
+ __asm __volatile(
+ "mrs %0, midr_el1\n"
+ : "=r"(cpuid)
+ :
+ : );
+
+ for(int i = 0; dotprod_whitelist_values[i] != 0; i++)
+ {
+ if((cpuid & dotprod_whitelist_masks[i]) == dotprod_whitelist_values[i])
+ {
+ dot_support = true;
+ break;
+ }
+ }
+ }
+#endif /* __aarch64__ */
+ const unsigned int max_cpus = get_max_cpus();
+ cpuinfo.set_cpu_num(max_cpus);
+ cpuinfo.set_fp16(fp16_support);
+ cpuinfo.set_dotprod(dot_support);
+ std::vector<PerCPUData> percpu(max_cpus);
+ if(cpuid)
+ {
+ populate_models_cpuid(percpu);
+ }
+ else
+ {
+ populate_models_cpuinfo(percpu);
+ }
+ int j(0);
+ for(const auto &v : percpu)
+ {
+ cpuinfo.set_cpu_model(j++, v.model);
+ }
+#else /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
+ ARM_COMPUTE_UNUSED(cpuinfo);
+#endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
+}
+
+unsigned int get_threads_hint()
+{
+ unsigned int num_threads_hint = 1;
+
+#ifndef BARE_METAL
+ std::map<std::string, unsigned int> cpu_part_occurrence_map;
+
+ // CPU part regex
+ std::regex cpu_part_rgx(R"(.*CPU part.+?(?=:).+?(?=\w+)(\w+).*)");
+ std::smatch cpu_part_match;
+
+ // Read cpuinfo and get occurrence of each core
+ std::ifstream cpuinfo;
+ cpuinfo.open("/proc/cpuinfo", std::ios::in);
+ if(cpuinfo.is_open())
+ {
+ std::string line;
+ while(bool(getline(cpuinfo, line)))
+ {
+ if(std::regex_search(line.cbegin(), line.cend(), cpu_part_match, cpu_part_rgx))
+ {
+ std::string cpu_part = cpu_part_match[1];
+ if(cpu_part_occurrence_map.find(cpu_part) != cpu_part_occurrence_map.end())
+ {
+ cpu_part_occurrence_map[cpu_part]++;
+ }
+ else
+ {
+ cpu_part_occurrence_map[cpu_part] = 1;
+ }
+ }
+ }
+ }
+
+ // Get min number of threads
+ auto min_common_cores = std::min_element(cpu_part_occurrence_map.begin(), cpu_part_occurrence_map.end(),
+ [](const std::pair<std::string, unsigned int> &p1, const std::pair<std::string, unsigned int> &p2)
+ {
+ return p1.second < p2.second;
+ });
+
+ // Set thread hint
+ num_threads_hint = cpu_part_occurrence_map.empty() ? std::thread::hardware_concurrency() : min_common_cores->second;
+#endif /* BARE_METAL */
+
+ return num_threads_hint;
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index 583cb40eca..54a2bd2182 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -23,202 +23,20 @@
*/
#include "arm_compute/runtime/IScheduler.h"
-#include <array>
-#include <cstdlib>
-#include <cstring>
-#include <fcntl.h>
-#include <fstream>
-#include <map>
-#include <sched.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#ifndef BARE_METAL
-#include <regex>
-#include <thread>
-#endif /* BARE_METAL */
-
-namespace
-{
-unsigned int get_threads_hint()
-{
- unsigned int num_threads_hint = 1;
-
-#ifndef BARE_METAL
- std::map<std::string, unsigned int> cpu_part_occurrence_map;
-
- // CPU part regex
- std::regex cpu_part_rgx(R"(.*CPU part.+?(?=:).+?(?=\w+)(\w+).*)");
- std::smatch cpu_part_match;
-
- // Read cpuinfo and get occurrence of each core
- std::ifstream cpuinfo;
- cpuinfo.open("/proc/cpuinfo", std::ios::in);
- if(cpuinfo.is_open())
- {
- std::string line;
- while(bool(getline(cpuinfo, line)))
- {
- if(std::regex_search(line.cbegin(), line.cend(), cpu_part_match, cpu_part_rgx))
- {
- std::string cpu_part = cpu_part_match[1];
- if(cpu_part_occurrence_map.find(cpu_part) != cpu_part_occurrence_map.end())
- {
- cpu_part_occurrence_map[cpu_part]++;
- }
- else
- {
- cpu_part_occurrence_map[cpu_part] = 1;
- }
- }
- }
- }
-
- // Get min number of threads
- auto min_common_cores = std::min_element(cpu_part_occurrence_map.begin(), cpu_part_occurrence_map.end(),
- [](const std::pair<std::string, unsigned int> &p1, const std::pair<std::string, unsigned int> &p2)
- {
- return p1.second < p2.second;
- });
-
- // Set thread hint
- num_threads_hint = cpu_part_occurrence_map.empty() ? std::thread::hardware_concurrency() : min_common_cores->second;
-#endif /* BARE_METAL */
-
- return num_threads_hint;
-}
-
-unsigned int get_cpu_impl()
-{
-#ifndef BARE_METAL
- int fd = open("/proc/cpuinfo", 0); // NOLINT
- std::array<char, 3000> buff{ {} };
- char *pos = nullptr;
- char *end = nullptr;
- bool foundid = false;
-
- int cpu = sched_getcpu();
-
- if(fd == -1)
- {
- return 0;
- }
-
- int charsread = read(fd, buff.data(), 3000);
- pos = buff.data();
- end = buff.data() + charsread;
-
- close(fd);
-
- /* So, to date I've encountered two formats for /proc/cpuinfo.
- *
- * One of them just lists processor : n for each processor (with no
- * other info), then at the end lists part information for the current
- * CPU.
- *
- * The other has an entire clause (including part number info) for each
- * CPU in the system, with "processor : n" headers.
- *
- * We can cope with either of these formats by waiting to see
- * "processor: n" (where n = our CPU ID), and then looking for the next
- * "CPU part" field.
- */
- while(pos < end)
- {
- if(foundid && strncmp(pos, "CPU part", 8) == 0)
- {
- /* Found part number */
- pos += 11;
-
- for(char *ch = pos; ch < end; ch++)
- {
- if(*ch == '\n')
- {
- *ch = '\0';
- break;
- }
- }
-
- return strtoul(pos, nullptr, 0);
- }
-
- if(strncmp(pos, "processor", 9) == 0)
- {
- /* Found processor ID, see if it's ours. */
- pos += 11;
-
- for(char *ch = pos; ch < end; ch++)
- {
- if(*ch == '\n')
- {
- *ch = '\0';
- break;
- }
- }
-
- int num = strtol(pos, nullptr, 0);
-
- if(num == cpu)
- {
- foundid = true;
- }
- }
-
- while(pos < end)
- {
- char ch = *pos++;
- if(ch == '\n' || ch == '\0')
- {
- break;
- }
- }
- }
-#endif /* BARE_METAL */
-
- return 0;
-}
-} // namespace
+#include "arm_compute/runtime/CPUUtils.h"
namespace arm_compute
{
IScheduler::IScheduler()
+ : _cpu_info()
{
// Work out the best possible number of execution threads
_num_threads_hint = get_threads_hint();
-
- // Work out the CPU implementation
- switch(get_cpu_impl())
- {
- case 0xd0f:
- _info.CPU = CPUTarget::A55_DOT;
- break;
- case 0xd03:
- _info.CPU = CPUTarget::A53;
- break;
- default:
-#ifdef __arm__
- _info.CPU = CPUTarget::ARMV7;
-#elif __aarch64__
- _info.CPU = CPUTarget::ARMV8;
-#else /* __arm__ || __aarch64__ */
- _info.CPU = CPUTarget::INTRINSICS;
-#endif /* __arm__ || __aarch64__ */
- break;
- }
-
- _info.L1_size = 31000;
- _info.L2_size = 500000;
-}
-
-void IScheduler::set_target(CPUTarget target)
-{
- _info.CPU = target;
}
-CPUInfo IScheduler::cpu_info() const
+CPUInfo &IScheduler::cpu_info()
{
- return _info;
+ return _cpu_info;
}
unsigned int IScheduler::num_threads_hint() const
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index c8cba8a174..18e6e919c3 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -65,7 +65,8 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
// Check if we need to reshape the matrix B only on the first run
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
- const bool run_optimised = setup_assembly_kernel(a, b, c, d, alpha, beta, _workspace, _memory_group, _asm_glue);
+
+ const bool run_optimised = a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f) && setup_assembly_kernel(a, b, d, alpha, beta, _workspace, _memory_group, _asm_glue);
// Check if the first input tensor is a vector.
// If so, all the kernels for reshaping the tensors can be skipped
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index b2dd0227a5..cdbd32373a 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -271,8 +271,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
const unsigned int fixed_point_position = input->info()->fixed_point_position();
const ITensor *biases_to_use = (_append_bias) ? biases : nullptr;
- bool run_optimised =
- (NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32) || (NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32);
+ bool run_optimised = dt == DataType::F32;
// Reshape weights if needed
if(run_optimised)
@@ -369,8 +368,10 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
// Configure matrix multiply
if(run_optimised)
{
- run_optimised = setup_assembly_kernel(&_input_im2col_reshaped, weights, nullptr, &_gemm_output, 1.f, 0.f, _workspace, _memory_group, _asm_glue);
- ARM_COMPUTE_ERROR_ON_MSG(run_optimised == false, "setup_assembly_kernel failed.");
+ if(!setup_assembly_kernel(&_input_im2col_reshaped, weights, &_gemm_output, 1.f, 0.f, _workspace, _memory_group, _asm_glue))
+ {
+ ARM_COMPUTE_ERROR("setup_assembly_kernel failed.");
+ }
}
else
{
@@ -450,17 +451,10 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
std::unique_ptr<ITensorInfo> reshaped_weights = weights->clone();
bool optimised_kernel = false;
-#if defined(__arm__)
- if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
- {
- optimised_kernel = true;
- }
-#elif defined(__aarch64__)
- if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
+ if(dt == DataType::F32)
{
optimised_kernel = true;
}
-#endif /* defined(__arm__) || defined(__aarch64__) */
// Reshape weights if needed
if(optimised_kernel)
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index a02872c3d5..27dd6c51d7 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -58,13 +58,13 @@ void NEGEMMLowpAssemblyMatrixMultiplyCore::configure(const ITensor *a, const ITe
{
case DataType::S8:
{
- run_optimised = setup_assembly_kernel(a, b, nullptr, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_signed);
+ run_optimised = setup_assembly_kernel(a, b, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_signed);
break;
}
case DataType::QASYMM8:
case DataType::U8:
{
- run_optimised = setup_assembly_kernel(a, b, nullptr, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_unsigned);
+ run_optimised = setup_assembly_kernel(a, b, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_unsigned);
break;
}
default:
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index b0a7bdd7bc..7372c6ca57 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -62,13 +62,13 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
{
case DataType::S8:
{
- _dot_product_path = setup_assembly_kernel(a, b, nullptr, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_signed);
+ _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_signed);
break;
}
case DataType::QASYMM8:
case DataType::U8:
{
- _dot_product_path = setup_assembly_kernel(a, b, nullptr, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_unsigned);
+ _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_unsigned);
break;
}
default:
@@ -156,10 +156,6 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
_tmp_a.allocator()->allocate();
_tmp_b.allocator()->allocate();
}
- else
- {
- _workspace.allocator()->allocate();
- }
if(_a_offset != 0)
{
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index 3b30f1e56b..795c96caf0 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/CPUUtils.h"
#include <omp.h>
@@ -41,6 +42,7 @@ OMPScheduler &OMPScheduler::get()
OMPScheduler::OMPScheduler() // NOLINT
: _num_threads(omp_get_max_threads())
{
+ get_cpu_configuration(_cpu_info);
}
unsigned int OMPScheduler::num_threads() const
@@ -59,7 +61,7 @@ void OMPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
ThreadInfo info;
- info.cpu_info = _info;
+ info.cpu_info = &_cpu_info;
const Window &max_window = kernel->window();
const unsigned int num_iterations = max_window.num_iterations(split_dimension);
@@ -71,7 +73,7 @@ void OMPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
}
else
{
- #pragma omp parallel private(info) num_threads(info.num_threads)
+ #pragma omp parallel firstprivate(info) num_threads(info.num_threads)
{
const int tid = omp_get_thread_num();
Window win = max_window.split_window(split_dimension, tid, info.num_threads);
diff --git a/support/ToolchainSupport.h b/support/ToolchainSupport.h
index cb156f39c7..88c17009cc 100644
--- a/support/ToolchainSupport.h
+++ b/support/ToolchainSupport.h
@@ -69,12 +69,17 @@ inline std::string to_string(T && value)
*
* @return Integer representation of @p str.
*/
-inline int stoi(const std::string &str)
+inline int stoi(const std::string &str, std::size_t *pos = 0, int base = 10)
{
- std::stringstream stream(str);
- int value = 0;
- stream >> value;
- return value;
+ unsigned int x;
+ std::stringstream ss;
+ if(base == 16)
+ {
+ ss << std::hex;
+ }
+ ss << str;
+ ss >> x;
+ return x;
}
/** Convert string values to unsigned long.
@@ -86,10 +91,15 @@ inline int stoi(const std::string &str)
*
* @return Unsigned long representation of @p str.
*/
-inline unsigned long stoul(const std::string &str)
+inline unsigned long stoul(const std::string &str, std::size_t *pos = 0, int base = 10)
{
- std::stringstream stream(str);
+ std::stringstream stream;
unsigned long value = 0;
+ if(base == 16)
+ {
+ stream << std::hex;
+ }
+ stream << str;
stream >> value;
return value;
}
diff --git a/tests/networks/AlexNetNetwork.h b/tests/networks/AlexNetNetwork.h
index a30b7f8f75..97991b0121 100644
--- a/tests/networks/AlexNetNetwork.h
+++ b/tests/networks/AlexNetNetwork.h
@@ -107,7 +107,7 @@ public:
{
auto reshape = [&](unsigned int width, unsigned int height, bool convolution_layer) -> TensorShape
{
- const bool is_optimised = std::is_same<ITensorType, ITensor>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV7 && data_type == DataType::F32;
+ const bool is_optimised = std::is_same<ITensorType, ITensor>::value && data_type == DataType::F32;
if(convolution_layer && is_optimised)
{
diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h
index 6a100acef3..3d073e3f79 100644
--- a/tests/validation/fixtures/ConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/ConvolutionLayerFixture.h
@@ -100,6 +100,8 @@ protected:
TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
bool reshape_weights, const Size2D &dilation)
{
+ const bool is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && _data_type == DataType::F32;
+
WeightsInfo weights_info(!reshape_weights, weights_shape.x(), weights_shape.y(), weights_shape[3]);
TensorShape reshaped_weights_shape(weights_shape);
@@ -107,12 +109,6 @@ protected:
{
// Check if its a "fully connected" convolution
const bool is_fully_connected_convolution = (output_shape.x() == 1 && output_shape.y() == 1);
- bool is_optimised = false;
-#if defined(__arm__)
- is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && _data_type == DataType::F32;
-#elif defined(__aarch64__)
- is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && _data_type == DataType::F32;
-#endif /* defined(__arm__) || defined(__aarch64__) */
reshaped_weights_shape.collapse(3);
@@ -167,14 +163,7 @@ protected:
if(!reshape_weights)
{
- const bool is_fully_connected_convolution = (output_shape.x() == 1 && output_shape.y() == 1);
- bool is_optimised = false;
-#if defined(__arm__)
- is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && _data_type == DataType::F32;
-#elif defined(__aarch64__)
- is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && _data_type == DataType::F32;
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
+ const bool is_fully_connected_convolution = (output_shape.x() == 1 && output_shape.y() == 1);
TensorShape tmp_weights_shape(weights_shape);
SimpleTensor<T> tmp_weights(tmp_weights_shape, _data_type, 1, _fractional_bits, _quantization_info);