aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt2
-rw-r--r--README.md24
-rw-r--r--SConscript4
-rw-r--r--arm_compute/core/CPP/CPPTypes.h7
-rw-r--r--arm_compute/function_info/ActivationLayerInfo.h14
-rw-r--r--arm_compute/runtime/OMP/OMPScheduler.h9
-rw-r--r--docs/user_guide/errata.dox10
-rw-r--r--docs/user_guide/release_version_and_change_log.dox7
-rw-r--r--src/common/cpuinfo/CpuInfo.cpp50
-rw-r--r--src/common/cpuinfo/CpuInfo.h9
-rw-r--r--src/core/CPP/CPPTypes.cpp14
-rw-r--r--src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp6
-rw-r--r--src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp6
-rw-r--r--src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp6
-rw-r--r--src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp6
-rw-r--r--src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp6
-rw-r--r--src/core/helpers/LUTManager.cpp27
-rw-r--r--src/core/helpers/LUTManager.h18
-rw-r--r--src/cpu/kernels/CpuActivationKernel.cpp18
-rw-r--r--src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp6
-rw-r--r--src/runtime/OMP/OMPScheduler.cpp35
-rw-r--r--tests/validation/NEON/UNIT/RuntimeContext.cpp15
22 files changed, 231 insertions, 68 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c67479ce41..f291534201 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,7 +28,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
list(APPEND CMAKE_MESSAGE_CONTEXT ArmCompute)
project(
ArmCompute
- VERSION 36.0.0
+ VERSION 38.0.0
DESCRIPTION
"The Arm Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A CPU and Arm® Mali™ GPU architectures"
LANGUAGES C CXX ASM)
diff --git a/README.md b/README.md
index 112f40225d..02dd05edac 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
<img src="https://raw.githubusercontent.com/ARM-software/ComputeLibrary/gh-pages/ACL_logo.png"/><br><br>
</div>
-# Compute Library ![](https://img.shields.io/badge/latest_release-24.04-green)
+# Compute Library ![](https://img.shields.io/badge/latest_release-24.06-green)
The Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A, Arm® Neoverse® and Arm® Mali™ GPUs architectures.<br>
@@ -37,7 +37,7 @@ Key Features:
<br>
## Documentation
-[![Documentation](https://img.shields.io/badge/documentation-24.04-green)](https://arm-software.github.io/ComputeLibrary/latest)
+[![Documentation](https://img.shields.io/badge/documentation-24.06-green)](https://arm-software.github.io/ComputeLibrary/latest)
> Note: The documentation includes the reference API, changelogs, build guide, contribution guide, errata, etc.
@@ -50,24 +50,24 @@ All the binaries can be downloaded from [here](https://github.com/ARM-software/C
| Platform | Operating System | Release archive (Download) |
| -------------- | ---------------- | -------------------------- |
-| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-armv7a-neon.tar.gz) |
-| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon.tar.gz) |
-| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon-cl.tar.gz) |
-| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon-cl.tar.gz) |
+| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-armv7a-neon.tar.gz) |
+| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-neon.tar.gz) |
+| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-neon-cl.tar.gz) |
+| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-neon-cl.tar.gz) |
<br>
| Architecture | Operating System | Release archive (Download) |
| ------------ | ---------------- | -------------------------- |
-| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-armv7a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-armv7a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-armv7a-neon-cl.tar.gz) |
-| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-android-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-android-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-android-arm64-v8a-neon-cl.tar.gz) |
-| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon-cl.tar.gz) |
-| arm64-v8.2-a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-android-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-android-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-android-arm64-v8.2-a-neon-cl.tar.gz) |
-| arm64-v8.2-a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8.2-a-neon-cl.tar.gz) |
+| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-armv7a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-armv7a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-armv7a-neon-cl.tar.gz) |
+| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-android-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-android-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-android-arm64-v8a-neon-cl.tar.gz) |
+| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-neon-cl.tar.gz) |
+| arm64-v8.2-a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-android-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-android-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-android-arm64-v8.2-a-neon-cl.tar.gz) |
+| arm64-v8.2-a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8.2-a-neon-cl.tar.gz) |
<br>
-Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.04-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.04)
+Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.06-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.06)
Pre-build binaries are generated with the following security / good coding practices related flags:
> -Wall, -Wextra, -Wformat=2, -Winit-self, -Wstrict-overflow=2, -Wswitch-default, -Woverloaded-virtual, -Wformat-security, -Wctor-dtor-privacy, -Wsign-promo, -Weffc++, -pedantic, -fstack-protector-strong
diff --git a/SConscript b/SConscript
index 80aa87cae8..325506ed40 100644
--- a/SConscript
+++ b/SConscript
@@ -32,8 +32,8 @@ import json
import codecs
import platform
-VERSION = "v0.0-unreleased"
-LIBRARY_VERSION_MAJOR = 36
+VERSION = "v24.06"
+LIBRARY_VERSION_MAJOR = 38
LIBRARY_VERSION_MINOR = 0
LIBRARY_VERSION_PATCH = 0
SONAME_VERSION = str(LIBRARY_VERSION_MAJOR) + "." + str(LIBRARY_VERSION_MINOR) + "." + str(LIBRARY_VERSION_PATCH)
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index 139d630fd7..c97751bc0c 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -170,7 +170,12 @@ public:
* @return Number of CPUs
*/
unsigned int get_cpu_num() const;
-
+ /** Return the maximum number of CPUs present excluding the little cores
+ * in case of an Android device
+ *
+ * @return Number of CPUs excluding little
+ */
+ unsigned int get_cpu_num_excluding_little() const;
/** Return the vector length in bytes for sme2
*
* @return Vector length if sme2 is enabled, otherwise returns 0.
diff --git a/arm_compute/function_info/ActivationLayerInfo.h b/arm_compute/function_info/ActivationLayerInfo.h
index 9390d0c54f..83b12d572e 100644
--- a/arm_compute/function_info/ActivationLayerInfo.h
+++ b/arm_compute/function_info/ActivationLayerInfo.h
@@ -121,6 +121,20 @@ public:
_lut_fp16 = lut;
}
#endif // __aarch64__
+
+ // The < and == are added to be able to use this data type as an attribute for LUTInfo
+ friend bool operator<(const ActivationLayerInfo &l, const ActivationLayerInfo &r)
+ {
+ const auto l_tup = std::make_tuple(l._act, l._a, l._b, l._enabled);
+ const auto r_tup = std::make_tuple(r._act, r._a, r._b, r._enabled);
+
+ return l_tup < r_tup;
+ }
+ bool operator==(const ActivationLayerInfo &l) const
+ {
+ return this->_act == l._act && this->_a == l._a && this->_b == l._b && this->_enabled == l._enabled;
+ }
+
private:
ActivationFunction _act = {ActivationLayerInfo::ActivationFunction::IDENTITY};
float _a = {};
diff --git a/arm_compute/runtime/OMP/OMPScheduler.h b/arm_compute/runtime/OMP/OMPScheduler.h
index b522b403a9..9b39714fea 100644
--- a/arm_compute/runtime/OMP/OMPScheduler.h
+++ b/arm_compute/runtime/OMP/OMPScheduler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_OMPSCHEDULER_H
-#define ARM_COMPUTE_OMPSCHEDULER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H
+#define ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H
#include "arm_compute/runtime/IScheduler.h"
@@ -79,6 +79,7 @@ protected:
private:
unsigned int _num_threads;
+ unsigned int _nonlittle_num_cpus;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_OMPSCHEDULER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H
diff --git a/docs/user_guide/errata.dox b/docs/user_guide/errata.dox
index 056e45a432..c195dc7851 100644
--- a/docs/user_guide/errata.dox
+++ b/docs/user_guide/errata.dox
@@ -1,5 +1,5 @@
///
-/// Copyright (c) 2019-2023 Arm Limited.
+/// Copyright (c) 2019-2024 Arm Limited.
///
/// SPDX-License-Identifier: MIT
///
@@ -30,6 +30,14 @@ namespace arm_compute
@section S7_1_errata Errata
+- (COMPMID-6904) Fix out-of-bound memory write for non-optimized FP16 GeMM kernel.
+ - Versions: >= v17.09 && < v24.06
+ - Oses: Linux, Android, MacOS, Windows.
+ - Conditions:
+ - Compile the latest Arm Compute Library for armv8.2-a or multi_isa
+ - Device with FP16 support
+ - GeMM with beta coefficient != 0 or 1
+
- (COMPMID-6493) Crash when running Arm Compute Library compiled for SVE2 on a computer that support SVE only.
- Versions: >= v21.02 && <=v23.08
- OSs: Linux, Android.
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index f493ff631e..16664c8d84 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -41,9 +41,16 @@ If there is more than one release in a month then an extra sequential number is
@section S2_2_changelog Changelog
+v24.06 Public minor release
+ - Enable FP16 in multiple Neon™ kernels for multi_isa + v8a
+ - Fix OpenMP® thread scheduling for large machine
+ - Optimize CPU activation functions using LUT-based implementation:
+ - Tanh function for FP16.
+
v24.05 Public major release
- Add @ref CLScatter operator for FP32/16, S32/16/8, U32/16/8 data types
- Various fixes to enable FP16 kernels in armv8a multi_isa builds.
+ - Updated logic in the OpenMP scheduler to exclude LITTLE cores.
v24.04 Public major release
- Add Bfloat16 data type support for @ref NEMatMul.
diff --git a/src/common/cpuinfo/CpuInfo.cpp b/src/common/cpuinfo/CpuInfo.cpp
index 809ab3e2c3..d46d8d7773 100644
--- a/src/common/cpuinfo/CpuInfo.cpp
+++ b/src/common/cpuinfo/CpuInfo.cpp
@@ -29,6 +29,7 @@
#include "support/StringSupport.h"
#include "support/ToolchainSupport.h"
+#include <map>
#include <sstream>
#if !defined(BARE_METAL)
@@ -269,6 +270,46 @@ int get_max_cpus()
}
return max_cpus;
}
+#if defined(__ANDROID__)
+std::vector<uint32_t> get_cpu_capacities()
+{
+ std::vector<uint32_t> cpu_capacities;
+ for (int i = 0; i < get_max_cpus(); ++i)
+ {
+ std::stringstream str;
+ str << "/sys/devices/system/cpu/cpu" << i << "/cpu_capacity";
+ std::ifstream file(str.str(), std::ios::in);
+ if (file.is_open())
+ {
+ std::string line;
+ if (bool(getline(file, line)))
+ {
+ cpu_capacities.emplace_back(support::cpp11::stoul(line));
+ }
+ }
+ }
+
+ return cpu_capacities;
+}
+
+uint32_t not_little_num_cpus_internal()
+{
+ std::vector<uint32_t> cpus_all = get_cpu_capacities();
+ std::vector<uint32_t> cpus_not_little;
+
+ std::vector<uint32_t>::iterator result = std::max_element(cpus_all.begin(), cpus_all.end());
+ uint32_t max_capacity = *result;
+ uint32_t threshold = max_capacity / 2;
+ for (unsigned int i = 0; i < cpus_all.size(); i++)
+ {
+ if (!(cpus_all[i] < threshold))
+ {
+ cpus_not_little.emplace_back(cpus_all[i]);
+ }
+ }
+ return cpus_not_little.size();
+}
+#endif /* defined(__ANDROID__) */
#elif defined(__aarch64__) && \
defined(__APPLE__) /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */
/** Query features through sysctlbyname
@@ -402,6 +443,15 @@ uint32_t CpuInfo::num_cpus() const
return _cpus.size();
}
+uint32_t CpuInfo::not_little_num_cpus() const
+{
+#if defined(__ANDROID__)
+ return not_little_num_cpus_internal();
+#else /* defined(__ANDROID__) */
+ return num_cpus();
+#endif /* defined(__ANDROID__) */
+}
+
uint32_t num_threads_hint()
{
unsigned int num_threads_hint = 1;
diff --git a/src/common/cpuinfo/CpuInfo.h b/src/common/cpuinfo/CpuInfo.h
index 953e4883c3..78d11e9610 100644
--- a/src/common/cpuinfo/CpuInfo.h
+++ b/src/common/cpuinfo/CpuInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_COMMON_CPUINFO_H
-#define SRC_COMMON_CPUINFO_H
+#ifndef ACL_SRC_COMMON_CPUINFO_CPUINFO_H
+#define ACL_SRC_COMMON_CPUINFO_CPUINFO_H
#include "src/common/cpuinfo/CpuIsaInfo.h"
#include "src/common/cpuinfo/CpuModel.h"
@@ -120,6 +120,7 @@ public:
CpuModel cpu_model(uint32_t cpuid) const;
CpuModel cpu_model() const;
uint32_t num_cpus() const;
+ uint32_t not_little_num_cpus() const;
private:
CpuIsaInfo _isa{};
@@ -135,4 +136,4 @@ private:
uint32_t num_threads_hint();
} // namespace cpuinfo
} // namespace arm_compute
-#endif /* SRC_COMMON_CPUINFO_H */
+#endif // ACL_SRC_COMMON_CPUINFO_CPUINFO_H
diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp
index f6761f27b0..ee39210fa5 100644
--- a/src/core/CPP/CPPTypes.cpp
+++ b/src/core/CPP/CPPTypes.cpp
@@ -140,10 +140,20 @@ unsigned int CPUInfo::get_L2_cache_size() const
unsigned long CPUInfo::get_sme2_vector_length() const
{
#ifdef ARM_COMPUTE_ENABLE_SME2
- return arm_gemm::utils::sme::get_vector_length<int8_t>();
+ if (this->has_sme2())
+ return arm_gemm::utils::sme::get_vector_length<int8_t>();
+ else
+ return 0;
#else // ARM_COMPUTE_ENABLE_SME2
return 0;
#endif // ARM_COMPUTE_ENABLE_SME2
}
-
+unsigned int CPUInfo::get_cpu_num_excluding_little() const
+{
+#if defined(__ANDROID__)
+ return _impl->info.not_little_num_cpus();
+#else /* defined(__ANDROID__) */
+ return get_cpu_num();
+#endif /* defined(__ANDROID__) */
+}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 717fd11485..153c36052a 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021, 2023 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,11 +78,11 @@ static const BatchNormalizationKernel available_kernels[] = {
REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)},
#endif /* !defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON)
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if ARM_COMPUTE_ENABLE_FP16
{"neon_fp16_batch_normalization",
[](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16; },
REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization)},
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
{"neon_fp32_batch_normalization",
[](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32; },
REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)},
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
index cb869838e2..694def1a3a 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -63,11 +63,11 @@ static const BoundingBoxTransformKernel available_kernels[] = {
{"fp32_neon_boundingboxtransform",
[](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F32; },
REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)},
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifdef ARM_COMPUTE_ENABLE_FP16
{"fp16_neon_boundingboxtransform",
[](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F16; },
REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)},
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // ARM_COMPUTE_ENABLE_FP16
#if defined(ARM_COMPUTE_ENABLE_NEON)
{"qu16_neon_boundingboxtransform",
[](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::QASYMM16; },
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
index 549319e49f..e23e3d020f 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,10 +61,10 @@ static const ComputeAllAnchorsKernel available_kernels[] = {
{"neon_qu16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::QSYMM16; },
REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)},
#endif //defined(ARM_COMPUTE_ENABLE_NEON)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifdef ARM_COMPUTE_ENABLE_FP16
{"neon_fp16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F16; },
REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)},
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // ARM_COMPUTE_ENABLE_FP16
{"neon_fp32_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F32; },
REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)},
};
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
index 0a1780f6ee..5883731088 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,10 +70,10 @@ struct InstanceNormKernel
static const InstanceNormKernel available_kernels[] = {
{"fp32_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F32; },
REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)},
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifdef ARM_COMPUTE_ENABLE_FP16
{"fp16_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F16; },
REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)},
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // ARM_COMPUTE_ENABLE_FP16
};
/** Micro-kernel selector
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
index 451031d696..cfe4ac9a4c 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -60,10 +60,10 @@ struct MeanStdDevNormKernel
static const std::vector<MeanStdDevNormKernel> available_kernels = {
{"fp32_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F32; },
REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)},
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifdef ARM_COMPUTE_ENABLE_FP16
{"fp16_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F16; },
REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)},
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // ARM_COMPUTE_ENABLE_FP16
{"qasymm8_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::QASYMM8; },
REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm)},
};
diff --git a/src/core/helpers/LUTManager.cpp b/src/core/helpers/LUTManager.cpp
index 06e35eed8c..2effffbe92 100644
--- a/src/core/helpers/LUTManager.cpp
+++ b/src/core/helpers/LUTManager.cpp
@@ -30,17 +30,38 @@ namespace arm_compute
namespace
{
-void init_lut_fp16(ActivationLayerInfo::LookupTable65536 *lut)
+float16_t activation(float16_t x, const LUTInfo &info)
+{
+ float16_t out = 0.f;
+ switch (info.act)
+ {
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ out = 1.f / (1.f + std::exp(-x));
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ {
+ out = static_cast<float16_t>(info.alpha * std::tanh(info.beta * x));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported Activation for 16-bit LUT table");
+ break;
+ }
+ return out;
+}
+
+void init_lut_fp16(ActivationLayerInfo::LookupTable65536 *lut, const LUTInfo &info)
{
union Element
{
uint16_t i = 0;
float16_t fp;
} item;
+
// Fill lut by iterating over all 16 bit values using the union.
while (true)
{
- (*lut)[item.i] = 1.f / (1.f + std::exp(-item.fp));
+ (*lut)[item.i] = activation(item.fp, info);
if (item.i == 65535)
break;
item.i++;
@@ -62,7 +83,7 @@ std::shared_ptr<ActivationLayerInfo::LookupTable65536> LUTManager::get_lut_table
// Not found, or pointer not valid
// We do not use make_shared to prevent the weak_ptr keeping the control block alive
std::shared_ptr<ActivationLayerInfo::LookupTable65536> ptr(new ActivationLayerInfo::LookupTable65536);
- init_lut_fp16(ptr.get());
+ init_lut_fp16(ptr.get(), info);
map_fp16[info] = ptr;
return ptr;
}
diff --git a/src/core/helpers/LUTManager.h b/src/core/helpers/LUTManager.h
index 4e13ead7e3..f3f4bf2832 100644
--- a/src/core/helpers/LUTManager.h
+++ b/src/core/helpers/LUTManager.h
@@ -38,19 +38,23 @@ namespace arm_compute
struct LUTInfo
{
ActivationLayerInfo::ActivationFunction act;
+ float alpha;
+ float beta;
DataType dt;
- QuantizationInfo qinfo;
+ UniformQuantizationInfo qinfo;
+
// Operators enable use of map with Lutinfo as key
friend bool operator<(const LUTInfo &l, const LUTInfo &r)
{
- return (l.act < r.act) || ((l.act == r.act) && (l.dt < r.dt)) ||
- ((l.act == r.act) && (l.dt == r.dt) && (l.qinfo.scale() < r.qinfo.scale())) ||
- ((l.act == r.act) && (l.dt == r.dt) && (l.qinfo.scale() == r.qinfo.scale()) &&
- (l.qinfo.offset() < l.qinfo.offset()));
+ const auto l_tup = std::make_tuple(l.act, l.alpha, l.beta, l.dt, l.qinfo.scale, l.qinfo.offset);
+ const auto r_tup = std::make_tuple(r.act, r.alpha, r.beta, r.dt, r.qinfo.scale, r.qinfo.offset);
+
+ return l_tup < r_tup;
}
- bool operator==(const LUTInfo &l)
+ bool operator==(const LUTInfo &l) const
{
- return this->act == l.act && this->dt == l.dt && this->qinfo == l.qinfo;
+ return this->act == l.act && this->alpha == l.alpha && this->beta == l.beta && this->dt == l.dt &&
+ this->qinfo == l.qinfo;
}
};
diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp
index 7cfa39b286..4253027231 100644
--- a/src/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/cpu/kernels/CpuActivationKernel.cpp
@@ -43,6 +43,13 @@ namespace kernels
{
namespace
{
+
+bool is_fp16_lut_supported(ActivationLayerInfo::ActivationFunction func)
+{
+ return func == ActivationLayerInfo::ActivationFunction::LOGISTIC ||
+ func == ActivationLayerInfo::ActivationFunction::TANH;
+}
+
static const std::vector<CpuActivationKernel::ActivationKernel> available_kernels = {
#ifdef ARM_COMPUTE_ENABLE_SVE
{"sve2_q8_activation_lut",
@@ -85,10 +92,7 @@ static const std::vector<CpuActivationKernel::ActivationKernel> available_kernel
REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)},
{"sve_fp16_activation_lut",
[](const ActivationDataTypeISASelectorData &data)
- {
- return data.dt == DataType::F16 && data.isa.fp16 && data.isa.sve &&
- data.f == ActivationLayerInfo::ActivationFunction::LOGISTIC;
- },
+ { return data.dt == DataType::F16 && data.isa.fp16 && data.isa.sve && is_fp16_lut_supported(data.f); },
REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation_lut)},
{"sve_fp16_activation",
[](const ActivationDataTypeISASelectorData &data)
@@ -299,10 +303,10 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac
activation_info.setLookupTable256(tmp_lut);
}
- if (src->data_type() == DataType::F16 &&
- activation_info.activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ if (std::string(uk->name) == "sve_fp16_activation_lut")
{
- const LUTInfo info = {activation_info.activation(), src->data_type(), src->quantization_info()};
+ const LUTInfo info = {activation_info.activation(), activation_info.a(), activation_info.b(), src->data_type(),
+ src->quantization_info().uniform()};
activation_info.setLookupTable65536((lut_manager.get_lut_table(info)));
}
#endif // __aarch64__
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp
index 60fda511e3..6a93be0618 100644
--- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -81,7 +81,7 @@ void vector_matrix_multiply_f16(
// window_end_x is computed above which may cause out-of-bound writes to the dst.
for (; x < (window_end_x - window_step_x); x += window_step_x)
{
- if (x > width_matrix_b)
+ if (x >= width_matrix_b)
{
return;
}
@@ -176,7 +176,7 @@ void vector_matrix_multiply_f16(
for (; x < window_end_x; ++x)
{
- if (x > width_matrix_b)
+ if (x >= width_matrix_b)
{
return;
}
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index d4d6193fce..baffa8cbb2 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2023 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,10 +32,21 @@
namespace arm_compute
{
+#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+ (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)
OMPScheduler::OMPScheduler() // NOLINT
- : _num_threads(omp_get_max_threads())
+ : _num_threads(cpu_info().get_cpu_num_excluding_little()),
+ _nonlittle_num_cpus(cpu_info().get_cpu_num_excluding_little())
{
}
+#else /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+ (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
+OMPScheduler::OMPScheduler() // NOLINT
+ : _num_threads(omp_get_max_threads()), _nonlittle_num_cpus(cpu_info().get_cpu_num_excluding_little())
+{
+}
+#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+ (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
unsigned int OMPScheduler::num_threads() const
{
@@ -45,7 +56,15 @@ unsigned int OMPScheduler::num_threads() const
void OMPScheduler::set_num_threads(unsigned int num_threads)
{
const unsigned int num_cores = omp_get_max_threads();
- _num_threads = (num_threads == 0) ? num_cores : num_threads;
+#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+ (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)
+ const unsigned int adjusted_num_threads = std::min(_nonlittle_num_cpus, num_threads);
+ _num_threads = (num_threads == 0) ? num_cores : adjusted_num_threads;
+#else /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+ (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
+ _num_threads = (num_threads == 0) ? num_cores : num_threads;
+#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+ (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
}
void OMPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
@@ -99,9 +118,15 @@ void OMPScheduler::run_workloads(std::vector<arm_compute::IScheduler::Workload>
}
ThreadInfo info;
- info.cpu_info = &cpu_info();
+ info.cpu_info = &cpu_info();
+
+#if !defined(__ANDROID__)
+ info.num_threads = _num_threads;
+#else /* !__ANDROID__ */
info.num_threads = num_threads_to_use;
-#pragma omp parallel for firstprivate(info) num_threads(num_threads_to_use) default(shared) proc_bind(close) \
+#endif /* __ANDROID__ */
+
+#pragma omp parallel for firstprivate(info) num_threads(info.num_threads) default(shared) proc_bind(close) \
schedule(static, 1)
for (unsigned int wid = 0; wid < amount_of_work; ++wid)
{
diff --git a/tests/validation/NEON/UNIT/RuntimeContext.cpp b/tests/validation/NEON/UNIT/RuntimeContext.cpp
index 819811943d..e0d45c639a 100644
--- a/tests/validation/NEON/UNIT/RuntimeContext.cpp
+++ b/tests/validation/NEON/UNIT/RuntimeContext.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,6 +48,19 @@ namespace validation
{
TEST_SUITE(NEON)
TEST_SUITE(UNIT)
+#if defined(ARM_COMPUTE_OPENMP_SCHEDULER) && !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+ (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)
+TEST_CASE(CpuCapacity, framework::DatasetMode::ALL)
+{
+ CPUInfo& ci = arm_compute::Scheduler::get().cpu_info();
+ const uint32_t nonlittle_num_cpus = ci.get_cpu_num_excluding_little();
+ const uint32_t num_threads = arm_compute::Scheduler::get().num_threads();
+
+ ARM_COMPUTE_EXPECT(num_threads<=nonlittle_num_cpus , framework::LogLevel::ERRORS);
+}
+#endif /* defined(ARM_COMPUTE_OPENMP_SCHEDULER) && !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+ (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
+
TEST_SUITE(RuntimeContext)
TEST_CASE(Scheduler, framework::DatasetMode::ALL)