14 files changed, 162 insertions, 47 deletions
diff --git a/src/common/cpuinfo/CpuInfo.cpp b/src/common/cpuinfo/CpuInfo.cpp
index 809ab3e2c3..d46d8d7773 100644
--- a/src/common/cpuinfo/CpuInfo.cpp
+++ b/src/common/cpuinfo/CpuInfo.cpp
@@ -29,6 +29,7 @@
 #include "support/StringSupport.h"
 #include "support/ToolchainSupport.h"
 
+#include <map>
 #include <sstream>
 
 #if !defined(BARE_METAL)
@@ -269,6 +270,46 @@ int get_max_cpus()
     }
     return max_cpus;
 }
+#if defined(__ANDROID__)
+std::vector<uint32_t> get_cpu_capacities()
+{
+    // Gathers the value parsed from every readable /sys/devices/system/cpu/cpuN/cpu_capacity file.
+    std::vector<uint32_t> cpu_capacities;
+    const int             max_cpus = get_max_cpus(); // hoisted: the bound does not depend on the loop body
+    for (int i = 0; i < max_cpus; ++i)
+    {
+        std::stringstream str;
+        str << "/sys/devices/system/cpu/cpu" << i << "/cpu_capacity";
+        std::ifstream file(str.str(), std::ios::in);
+        std::string   line;
+        // A CPU whose file cannot be opened or read adds nothing, so positions need not match CPU ids.
+        if (file.is_open() && bool(getline(file, line)))
+        {
+            cpu_capacities.emplace_back(support::cpp11::stoul(line));
+        }
+    }
+
+    return cpu_capacities;
+}
+
+uint32_t not_little_num_cpus_internal()
+{
+    // Counts the CPUs whose capacity is at least half of the largest capacity reported.
+    const std::vector<uint32_t> cpus_all = get_cpu_capacities();
+    if (cpus_all.empty())
+    {
+        // No capacity was readable; *max_element() on an empty range is undefined, so count every CPU.
+        return static_cast<uint32_t>(get_max_cpus());
+    }
+    const uint32_t threshold       = *std::max_element(cpus_all.begin(), cpus_all.end()) / 2;
+    uint32_t       not_little_cpus = 0;
+    for (const uint32_t capacity : cpus_all)
+    {
+        not_little_cpus += (capacity < threshold) ? 0U : 1U;
+    }
+    return not_little_cpus;
+}
+#endif /* defined(__ANDROID__) */
 #elif defined(__aarch64__) && \
     defined(__APPLE__) /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */
 /** Query features through sysctlbyname
@@ -402,6 +443,15 @@ uint32_t CpuInfo::num_cpus() const
     return _cpus.size();
 }
 
+uint32_t CpuInfo::not_little_num_cpus() const
+{
+#if !defined(__ANDROID__)
+    return num_cpus(); // off Android every CPU is counted
+#else  /* !defined(__ANDROID__) */
+    return not_little_num_cpus_internal(); // Android: capacity-based count
+#endif /* !defined(__ANDROID__) */
+}
+
 uint32_t num_threads_hint()
 {
     unsigned int num_threads_hint = 1;
diff --git a/src/common/cpuinfo/CpuInfo.h b/src/common/cpuinfo/CpuInfo.h
index 953e4883c3..78d11e9610 100644
--- a/src/common/cpuinfo/CpuInfo.h
+++ b/src/common/cpuinfo/CpuInfo.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_COMMON_CPUINFO_H
-#define SRC_COMMON_CPUINFO_H
+#ifndef ACL_SRC_COMMON_CPUINFO_CPUINFO_H
+#define ACL_SRC_COMMON_CPUINFO_CPUINFO_H
 
 #include "src/common/cpuinfo/CpuIsaInfo.h"
 #include "src/common/cpuinfo/CpuModel.h"
@@ -120,6 +120,7 @@ public:
     CpuModel cpu_model(uint32_t cpuid) const;
     CpuModel cpu_model() const;
     uint32_t num_cpus() const;
+    uint32_t not_little_num_cpus() const;
 
 private:
     CpuIsaInfo            _isa{};
@@ -135,4 +136,4 @@ private:
 uint32_t num_threads_hint();
 } // namespace cpuinfo
 } // namespace arm_compute
-#endif /* SRC_COMMON_CPUINFO_H */
+#endif // ACL_SRC_COMMON_CPUINFO_CPUINFO_H
diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp
index f6761f27b0..ee39210fa5 100644
--- a/src/core/CPP/CPPTypes.cpp
+++ b/src/core/CPP/CPPTypes.cpp
@@ -140,10 +140,20 @@ unsigned int CPUInfo::get_L2_cache_size() const
 unsigned long CPUInfo::get_sme2_vector_length() const
 {
 #ifdef ARM_COMPUTE_ENABLE_SME2
-    return arm_gemm::utils::sme::get_vector_length<int8_t>();
+    // When has_sme2() is false the length is reported as 0, the same value a build without SME2 returns.
+    if (!this->has_sme2())
+        return 0;
+    return arm_gemm::utils::sme::get_vector_length<int8_t>();
 #else  // ARM_COMPUTE_ENABLE_SME2
     return 0;
 #endif // ARM_COMPUTE_ENABLE_SME2
 }
-
+unsigned int CPUInfo::get_cpu_num_excluding_little() const
+{
+#if !defined(__ANDROID__)
+    return get_cpu_num(); // off Android the full CPU count is used
+#else  /* !defined(__ANDROID__) */
+    return _impl->info.not_little_num_cpus(); // Android: delegate to the cpuinfo query
+#endif /* !defined(__ANDROID__) */
+}
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 717fd11485..153c36052a 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021, 2023 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -78,11 +78,11 @@ static const BatchNormalizationKernel available_kernels[] = {
      REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)},
 #endif /* !defined(ARM_COMPUTE_ENABLE_SVE) */
 #if defined(ARM_COMPUTE_ENABLE_NEON)
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if ARM_COMPUTE_ENABLE_FP16
     {"neon_fp16_batch_normalization",
      [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16; },
      REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization)},
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
     {"neon_fp32_batch_normalization",
      [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32; },
      REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)},
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
index cb869838e2..694def1a3a 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,11 +63,11 @@ static const BoundingBoxTransformKernel available_kernels[] = {
     {"fp32_neon_boundingboxtransform",
      [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F32; },
      REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)},
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifdef ARM_COMPUTE_ENABLE_FP16
     {"fp16_neon_boundingboxtransform",
      [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F16; },
      REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)},
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // ARM_COMPUTE_ENABLE_FP16
 #if defined(ARM_COMPUTE_ENABLE_NEON)
     {"qu16_neon_boundingboxtransform",
      [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::QASYMM16; },
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
index 549319e49f..e23e3d020f 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,10 +61,10 @@ static const ComputeAllAnchorsKernel available_kernels[] = {
     {"neon_qu16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::QSYMM16; },
      REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)},
 #endif //defined(ARM_COMPUTE_ENABLE_NEON)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifdef ARM_COMPUTE_ENABLE_FP16
     {"neon_fp16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F16; },
      REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)},
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // ARM_COMPUTE_ENABLE_FP16
     {"neon_fp32_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F32; },
      REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)},
 };
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
index 0a1780f6ee..5883731088 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,10 +70,10 @@ struct InstanceNormKernel
 static const InstanceNormKernel available_kernels[] = {
     {"fp32_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F32; },
      REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)},
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifdef ARM_COMPUTE_ENABLE_FP16
     {"fp16_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F16; },
      REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)},
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // ARM_COMPUTE_ENABLE_FP16
 };
 
 /** Micro-kernel selector
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
index 451031d696..cfe4ac9a4c 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -60,10 +60,10 @@ struct MeanStdDevNormKernel
 static const std::vector<MeanStdDevNormKernel> available_kernels = {
     {"fp32_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F32; },
      REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)},
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifdef ARM_COMPUTE_ENABLE_FP16
     {"fp16_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F16; },
      REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)},
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // ARM_COMPUTE_ENABLE_FP16
     {"qasymm8_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::QASYMM8; },
      REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm)},
 };
diff --git a/src/core/helpers/LUTManager.cpp b/src/core/helpers/LUTManager.cpp
index 06e35eed8c..2effffbe92 100644
--- a/src/core/helpers/LUTManager.cpp
+++ b/src/core/helpers/LUTManager.cpp
@@ -30,17 +30,38 @@ namespace arm_compute
 namespace
 {
 
-void init_lut_fp16(ActivationLayerInfo::LookupTable65536 *lut)
+float16_t activation(float16_t x, const LUTInfo &info)
+{
+    // Evaluates the activation selected by info.act for a single FP16 input.
+    switch (info.act)
+    {
+        case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+            // 1 / (1 + e^-x), converted to the FP16 return type.
+            return 1.f / (1.f + std::exp(-x));
+        case ActivationLayerInfo::ActivationFunction::TANH:
+            // alpha * tanh(beta * x), with alpha and beta read from info.
+            return static_cast<float16_t>(info.alpha * std::tanh(info.beta * x));
+        default:
+            ARM_COMPUTE_ERROR("Unsupported Activation for 16-bit LUT table");
+            break;
+    }
+
+    // Only reachable if the error macro above returns control.
+    return 0.f;
+}
+
+void init_lut_fp16(ActivationLayerInfo::LookupTable65536 *lut, const LUTInfo &info)
 {
     union Element
     {
         uint16_t  i = 0;
         float16_t fp;
     } item;
+
     // Fill lut by iterating over all 16 bit values using the union.
     while (true)
     {
-        (*lut)[item.i] = 1.f / (1.f + std::exp(-item.fp));
+        (*lut)[item.i] = activation(item.fp, info);
         if (item.i == 65535)
             break;
         item.i++;
@@ -62,7 +83,7 @@ std::shared_ptr<ActivationLayerInfo::LookupTable65536> LUTManager::get_lut_table
         // Not found, or pointer not valid
         // We do not use make_shared to prevent the weak_ptr keeping the control block alive
         std::shared_ptr<ActivationLayerInfo::LookupTable65536> ptr(new ActivationLayerInfo::LookupTable65536);
-        init_lut_fp16(ptr.get());
+        init_lut_fp16(ptr.get(), info);
         map_fp16[info] = ptr;
         return ptr;
     }
diff --git a/src/core/helpers/LUTManager.h b/src/core/helpers/LUTManager.h
index 4e13ead7e3..f3f4bf2832 100644
--- a/src/core/helpers/LUTManager.h
+++ b/src/core/helpers/LUTManager.h
@@ -38,19 +38,23 @@ namespace arm_compute
 struct LUTInfo
 {
     ActivationLayerInfo::ActivationFunction act;
+    float                                   alpha;
+    float                                   beta;
     DataType                                dt;
-    QuantizationInfo                        qinfo;
+    UniformQuantizationInfo                 qinfo;
+
     // Operators enable use of map with Lutinfo as key
     friend bool operator<(const LUTInfo &l, const LUTInfo &r)
     {
-        return (l.act < r.act) || ((l.act == r.act) && (l.dt < r.dt)) ||
-               ((l.act == r.act) && (l.dt == r.dt) && (l.qinfo.scale() < r.qinfo.scale())) ||
-               ((l.act == r.act) && (l.dt == r.dt) && (l.qinfo.scale() == r.qinfo.scale()) &&
-                (l.qinfo.offset() < l.qinfo.offset()));
+        // Lexicographic comparison over all fields, in declaration order.
+        const auto lhs_key = std::make_tuple(l.act, l.alpha, l.beta, l.dt, l.qinfo.scale, l.qinfo.offset);
+        const auto rhs_key = std::make_tuple(r.act, r.alpha, r.beta, r.dt, r.qinfo.scale, r.qinfo.offset);
+        return lhs_key < rhs_key;
     }
-    bool operator==(const LUTInfo &l)
+    bool operator==(const LUTInfo &l) const
     {
-        return this->act == l.act && this->dt == l.dt && this->qinfo == l.qinfo;
+        const bool same_function = (act == l.act) && (alpha == l.alpha) && (beta == l.beta);
+        return same_function && (dt == l.dt) && (qinfo == l.qinfo);
     }
 };
 
diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp
index 7cfa39b286..4253027231 100644
--- a/src/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/cpu/kernels/CpuActivationKernel.cpp
@@ -43,6 +43,13 @@ namespace kernels
 {
 namespace
 {
+
+bool is_fp16_lut_supported(ActivationLayerInfo::ActivationFunction func)
+{
+    using ActFn = ActivationLayerInfo::ActivationFunction; // true only for LOGISTIC and TANH
+    return func == ActFn::LOGISTIC || func == ActFn::TANH;
+}
+
 static const std::vector<CpuActivationKernel::ActivationKernel> available_kernels = {
 #ifdef ARM_COMPUTE_ENABLE_SVE
     {"sve2_q8_activation_lut",
@@ -85,10 +92,7 @@ static const std::vector<CpuActivationKernel::ActivationKernel> available_kernel
      REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)},
     {"sve_fp16_activation_lut",
      [](const ActivationDataTypeISASelectorData &data)
-     {
-         return data.dt == DataType::F16 && data.isa.fp16 && data.isa.sve &&
-                data.f == ActivationLayerInfo::ActivationFunction::LOGISTIC;
-     },
+     { return data.dt == DataType::F16 && data.isa.fp16 && data.isa.sve && is_fp16_lut_supported(data.f); },
      REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation_lut)},
     {"sve_fp16_activation",
      [](const ActivationDataTypeISASelectorData &data)
@@ -299,10 +303,10 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac
         activation_info.setLookupTable256(tmp_lut);
     }
 
-    if (src->data_type() == DataType::F16 &&
-        activation_info.activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+    if (std::string(uk->name) == "sve_fp16_activation_lut")
     {
-        const LUTInfo info = {activation_info.activation(), src->data_type(), src->quantization_info()};
+        const LUTInfo info = {activation_info.activation(), activation_info.a(), activation_info.b(), src->data_type(),
+                              src->quantization_info().uniform()};
         activation_info.setLookupTable65536((lut_manager.get_lut_table(info)));
     }
 #endif // __aarch64__
diff --git a/src/cpu/kernels/CpuDequantizeKernel.cpp b/src/cpu/kernels/CpuDequantizeKernel.cpp
index 6154ad3e3b..5595ace998 100644
--- a/src/cpu/kernels/CpuDequantizeKernel.cpp
+++ b/src/cpu/kernels/CpuDequantizeKernel.cpp
@@ -85,7 +85,7 @@ void CpuDequantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
             break;
 #ifdef ARM_COMPUTE_ENABLE_FP16
         case DataType::F16:
-            _func = REGISTER_FP32_NEON(fp16_run_dequantization_core);
+            _func = REGISTER_FP16_NEON(fp16_run_dequantization_core);
             break;
 #endif /* ARM_COMPUTE_ENABLE_FP16 */
         default:
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp
index 60fda511e3..6a93be0618 100644
--- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,7 +81,7 @@ void vector_matrix_multiply_f16(
             // window_end_x is computed above which may cause out-of-bound writes to the dst.
             for (; x < (window_end_x - window_step_x); x += window_step_x)
             {
-                if (x > width_matrix_b)
+                if (x >= width_matrix_b)
                 {
                     return;
                 }
@@ -176,7 +176,7 @@ void vector_matrix_multiply_f16(
 
             for (; x < window_end_x; ++x)
             {
-                if (x > width_matrix_b)
+                if (x >= width_matrix_b)
                 {
                     return;
                 }
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index d4d6193fce..baffa8cbb2 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2023 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,10 +32,21 @@
 
 namespace arm_compute
 {
+#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)
 OMPScheduler::OMPScheduler() // NOLINT
-    : _num_threads(omp_get_max_threads())
+    : _num_threads(cpu_info().get_cpu_num_excluding_little()), // default: CPU count excluding little cores
+      _nonlittle_num_cpus(cpu_info().get_cpu_num_excluding_little()) // upper bound used by set_num_threads()
 {
 }
+#else  /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
+OMPScheduler::OMPScheduler() // NOLINT
+    : _num_threads(omp_get_max_threads()), _nonlittle_num_cpus(cpu_info().get_cpu_num_excluding_little())
+{
+}
+#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
 
 unsigned int OMPScheduler::num_threads() const
 {
@@ -45,7 +56,15 @@ unsigned int OMPScheduler::num_threads() const
 void OMPScheduler::set_num_threads(unsigned int num_threads)
 {
     const unsigned int num_cores = omp_get_max_threads();
-    _num_threads                 = (num_threads == 0) ? num_cores : num_threads;
+#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)
+    const unsigned int adjusted_num_threads = std::min(_nonlittle_num_cpus, num_threads); // cap at non-little CPUs
+    _num_threads                            = (num_threads == 0) ? num_cores : adjusted_num_threads;
+#else  /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
+    _num_threads = (num_threads == 0) ? num_cores : num_threads; // 0 selects omp_get_max_threads()
+#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
 }
 
 void OMPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
@@ -99,9 +118,15 @@ void OMPScheduler::run_workloads(std::vector<arm_compute::IScheduler::Workload>
     }
 
     ThreadInfo info;
-    info.cpu_info    = &cpu_info();
+    info.cpu_info = &cpu_info();
+
+#if !defined(__ANDROID__)
+    info.num_threads = _num_threads;
+#else  /* !__ANDROID__ */
     info.num_threads = num_threads_to_use;
-#pragma omp parallel for firstprivate(info) num_threads(num_threads_to_use) default(shared) proc_bind(close) \
+#endif /* __ANDROID__ */
+
+#pragma omp parallel for firstprivate(info) num_threads(info.num_threads) default(shared) proc_bind(close) \
     schedule(static, 1)
     for (unsigned int wid = 0; wid < amount_of_work; ++wid)
     {