aboutsummaryrefslogtreecommitdiff
path: root/arm_compute
diff options
context:
space:
mode:
Diffstat (limited to 'arm_compute')
-rw-r--r--arm_compute/core/CPP/CPPTypes.h19
-rw-r--r--arm_compute/core/QuantizationInfo.h72
-rw-r--r--arm_compute/function_info/ActivationLayerInfo.h14
-rw-r--r--arm_compute/function_info/GEMMInfo.h31
-rw-r--r--arm_compute/runtime/CL/functions/CLScatter.h7
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h15
-rw-r--r--arm_compute/runtime/OMP/OMPScheduler.h9
7 files changed, 133 insertions, 34 deletions
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index b080a86938..c97751bc0c 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPP_TYPES_H
-#define ARM_COMPUTE_CPP_TYPES_H
+#ifndef ACL_ARM_COMPUTE_CORE_CPP_CPPTYPES_H
+#define ACL_ARM_COMPUTE_CORE_CPP_CPPTYPES_H
#include "arm_compute/core/Error.h"
@@ -170,6 +170,17 @@ public:
* @return Number of CPUs
*/
unsigned int get_cpu_num() const;
+ /** Return the maximum number of CPUs present excluding the little cores
+ * in case of an Android device
+ *
+ * @return Number of CPUs excluding little
+ */
+ unsigned int get_cpu_num_excluding_little() const;
+ /** Return the vector length in bytes for sme2
+ *
+ * @return Vector length if sme2 is enabled, otherwise returns 0.
+ */
+ unsigned long get_sme2_vector_length() const;
private:
struct Impl;
@@ -184,4 +195,4 @@ struct ThreadInfo
const CPUInfo *cpu_info{nullptr};
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPP_TYPES_H */
+#endif // ACL_ARM_COMPUTE_CORE_CPP_CPPTYPES_H
diff --git a/arm_compute/core/QuantizationInfo.h b/arm_compute/core/QuantizationInfo.h
index 471b8c57ab..aecba3712e 100644
--- a/arm_compute/core/QuantizationInfo.h
+++ b/arm_compute/core/QuantizationInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2023 Arm Limited.
+ * Copyright (c) 2019-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_QUANTIZATION_INFO_H
-#define ARM_COMPUTE_QUANTIZATION_INFO_H
+#ifndef ACL_ARM_COMPUTE_CORE_QUANTIZATIONINFO_H
+#define ACL_ARM_COMPUTE_CORE_QUANTIZATIONINFO_H
#include "arm_compute/core/Rounding.h"
#include "arm_compute/core/utils/misc/Utility.h"
@@ -84,10 +84,12 @@ public:
*
* @note Used for asymmetric quantization
*
- * @param[in] scale Scale.
- * @param[in] offset Offset.
+ * @param[in] scale Scale.
+ * @param[in] offset Offset.
+ * @param[in] is_dynamic Whether this QuantizationInfo is dynamic, i.e. the scale and offset may change.
*/
- QuantizationInfo(float scale, int offset) : _scale(1, scale), _offset(1, offset)
+ QuantizationInfo(float scale, int offset, bool is_dynamic = false)
+ : _scale(1, scale), _offset(1, offset), _is_dynamic(is_dynamic)
{
}
/** Construct quantization info.
@@ -103,10 +105,12 @@ public:
*
* @note Used for asymmetric per channel quantization
*
- * @param[in] scale Scale.
- * @param[in] offset Offset.
+ * @param[in] scale Scale.
+ * @param[in] offset Offset.
+ * @param[in] is_dynamic Whether this QuantizationInfo is dynamic, i.e. the scale and offset may change.
*/
- QuantizationInfo(std::vector<float> scale, std::vector<int32_t> offset) : _scale(scale), _offset(offset)
+ QuantizationInfo(std::vector<float> scale, std::vector<int32_t> offset, bool is_dynamic = false)
+ : _scale(scale), _offset(offset), _is_dynamic(is_dynamic)
{
}
/** Scale vector accessor
@@ -125,6 +129,14 @@ public:
{
return _offset;
}
+ /** is_dynamic accessor
+ *
+ * @return If true, the scale and offset may change, so operators will need to read on every run
+ */
+ bool is_dynamic() const
+ {
+ return _is_dynamic;
+ }
/** Indicates whether this QuantizationInfo has valid settings or not
*
     * @return True if this has invalid settings.
@@ -149,6 +161,8 @@ public:
private:
std::vector<float> _scale; /**< Vector containing scaling factors */
std::vector<int32_t> _offset; /**< Vector containing zero offsets */
+ bool _is_dynamic =
+ false; /**< If true, the scale and offset may change, so operators will need to read on every run */
};
/** Check whether two quantization info are equal.
@@ -430,6 +444,19 @@ inline float dequantize(uint16_t value, float scale, int32_t offset)
return (static_cast<int>(value) - offset) * scale;
}
+/** Dequantize a value given a 32-bit asymmetric quantization scheme
+ *
+ * @param[in] value Value to dequantize
+ * @param[in] scale Scale to use for dequantization
+ * @param[in] offset Zero-offset to use for dequantization
+ *
+ * @return Dequantized value
+ */
+inline float dequantize(int32_t value, float scale, int32_t offset)
+{
+ return (static_cast<int>(value) - offset) * scale;
+}
+
/** Quantize a value given a 16-bit symmetric quantization scheme
*
* @param[in] value Value to quantize
@@ -536,6 +563,31 @@ inline float dequantize_qasymm16(uint16_t value, const QuantizationInfo &qinfo)
return dequantize_qasymm16(value, qinfo.uniform());
}
+/** Dequantize a value given a 32-bit asymmetric quantization scheme
+ *
+ * @param[in] value Value to dequantize
+ * @param[in] qinfo Quantization information to use for dequantizing
+ *
+ * @return Dequantized value
+ */
+inline float dequantize_s32(int32_t value, const UniformQuantizationInfo &qinfo)
+{
+ return (static_cast<int>(value) - qinfo.offset) * qinfo.scale;
+}
+
+/** Dequantize a value given a 32-bit asymmetric quantization scheme
+ *
+ * @param[in] value Value to dequantize
+ * @param[in] qinfo Quantization information to use for dequantizing
+ *
+ * @return Dequantized value
+ */
+
+inline float dequantize_s32(int32_t value, const QuantizationInfo &qinfo)
+{
+ return dequantize_s32(value, qinfo.uniform());
+}
+
/*
* In case of requantization of a quantized input tensor to an output tensor with another quantization
* instead of applying dequantization and then a quantization functions, we just compute new scale and
@@ -581,4 +633,4 @@ inline UniformQuantizationInfo compute_requantization_scale_offset(const Uniform
}
} // namespace arm_compute
-#endif /* ARM_COMPUTE_QUANTIZATION_INFO_H */
+#endif // ACL_ARM_COMPUTE_CORE_QUANTIZATIONINFO_H
diff --git a/arm_compute/function_info/ActivationLayerInfo.h b/arm_compute/function_info/ActivationLayerInfo.h
index 9390d0c54f..83b12d572e 100644
--- a/arm_compute/function_info/ActivationLayerInfo.h
+++ b/arm_compute/function_info/ActivationLayerInfo.h
@@ -121,6 +121,20 @@ public:
_lut_fp16 = lut;
}
#endif // __aarch64__
+
+ // The < and == are added to be able to use this data type as an attribute for LUTInfo
+ friend bool operator<(const ActivationLayerInfo &l, const ActivationLayerInfo &r)
+ {
+ const auto l_tup = std::make_tuple(l._act, l._a, l._b, l._enabled);
+ const auto r_tup = std::make_tuple(r._act, r._a, r._b, r._enabled);
+
+ return l_tup < r_tup;
+ }
+ bool operator==(const ActivationLayerInfo &l) const
+ {
+ return this->_act == l._act && this->_a == l._a && this->_b == l._b && this->_enabled == l._enabled;
+ }
+
private:
ActivationFunction _act = {ActivationLayerInfo::ActivationFunction::IDENTITY};
float _a = {};
diff --git a/arm_compute/function_info/GEMMInfo.h b/arm_compute/function_info/GEMMInfo.h
index a827c79fda..74fe30454e 100644
--- a/arm_compute/function_info/GEMMInfo.h
+++ b/arm_compute/function_info/GEMMInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2023 Arm Limited.
+ * Copyright (c) 2016-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -85,7 +85,8 @@ public:
_pretranspose_B(false),
_activation_info(),
_fixed_format(false),
- _weight_format(arm_compute::WeightFormat::UNSPECIFIED)
+ _weight_format(arm_compute::WeightFormat::UNSPECIFIED),
+ _accumulate(false)
{
}
/** Constructor
@@ -106,6 +107,7 @@ public:
     * @param[in]  fixed_format                 (Optional) Specify the selection of fixed format kernels for variable weights support in GEMM. These kernels expect the weights tensor to be in a memory format that is fixed by the kernel itself. For more information, see arm_compute::WeightFormat.
* @param[in] weight_format (Optional) arm_gemm:WeightFormat enumeration requested by the user. Default is arm_compute::WeightFormat::UNSPECIFIED.
* @param[in] pretranspose_B (Optional) Pretranspose matrix B (transposition of its lowest 2 dimensions), in addition to and before, any further transformations of B
+ * @param[in] accumulate (Optional) Whether to accumulate in destination or not
*/
GEMMInfo(bool is_a_reshaped,
bool is_b_reshaped,
@@ -120,7 +122,8 @@ public:
const ActivationLayerInfo &activation_info = ActivationLayerInfo(),
bool fixed_format = false,
arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED,
- bool pretranspose_B = false) noexcept
+ bool pretranspose_B = false,
+ bool accumulate = false) noexcept
: _is_a_reshaped(is_a_reshaped),
_is_b_reshaped(is_b_reshaped),
_reshape_b_only_on_first_run(reshape_b_only_on_first_run),
@@ -135,7 +138,8 @@ public:
_pretranspose_B(pretranspose_B),
_activation_info(activation_info),
_fixed_format(fixed_format),
- _weight_format(weight_format)
+ _weight_format(weight_format),
+ _accumulate(accumulate)
{
}
/** Flag which specifies if the matrix A has been reshaped
@@ -294,7 +298,14 @@ public:
{
return _fixed_format;
}
-
+ /** Flag which specifies if GEMM should accumulate the result in destination or not.
+ *
+ * @return True if GEMM is accumulating the result.
+ */
+ bool accumulate() const
+ {
+ return _accumulate;
+ }
/** Set fixed-format flag
*
* @param[in] fixed_format sets whether or not to use fixed-format kernels
@@ -303,12 +314,19 @@ public:
{
_fixed_format = fixed_format;
}
+ /** Set accumulate flag
+ *
+ * @param[in] accumulate sets whether or not to use accumulation
+ */
+ void set_accumulate(bool accumulate)
+ {
+ _accumulate = accumulate;
+ }
arm_compute::WeightFormat weight_format() const
{
return _weight_format;
}
-
/** Set weight format to be used
*
* @param[in] weight_format arm_compute::WeightFormat enumeration
@@ -334,6 +352,7 @@ private:
ActivationLayerInfo _activation_info;
bool _fixed_format;
arm_compute::WeightFormat _weight_format;
+ bool _accumulate;
};
} //namespace arm_compute
#endif // ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO_H
diff --git a/arm_compute/runtime/CL/functions/CLScatter.h b/arm_compute/runtime/CL/functions/CLScatter.h
index 1c90d208bd..973953624e 100644
--- a/arm_compute/runtime/CL/functions/CLScatter.h
+++ b/arm_compute/runtime/CL/functions/CLScatter.h
@@ -55,14 +55,15 @@ public:
~CLScatter();
/** Initialise the kernel's inputs and outputs
*
+ * @note Negative indices are treated as out of bounds.
+ *
* Valid data layouts:
* - All
*
- *
* @param[in] compile_context The compile context to be used.
* @param[in] src Source tensor. Values used to fill output. Can be nullptr when zero initialization is true.
* @param[in] updates Tensor containing values used to update output tensor. Data types supported: same as @p src
- * @param[in] indices Tensor containing Indices to change in the output Tensor. Data types supported : U32
+ * @param[in] indices Tensor containing Indices to change in the output Tensor. Data types supported : S32
* @param[out] output Destination tensor. Data types supported: same as @p src.
* @param[in] info Scatter info object.
*/
@@ -85,7 +86,7 @@ public:
*
* @param[in] src Source tensor.
* @param[in] updates Tensor containing values used for updating the output Tensor. Data types supported : same as @p src
- * @param[in] indices Tensor containing Indices to change in the output Tensor. Data types supported : U32
+ * @param[in] indices Tensor containing Indices to change in the output Tensor. Data types supported : S32
* @param[in] output Destination tensor. Data types supported: same as @p src.
* @param[in] info Scatter info containing type of scatter.
*
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index 824c4443ad..6d07675d3d 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021, 2023 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H
-#define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEGEMMLOWPMATRIXMULTIPLYCORE_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEGEMMLOWPMATRIXMULTIPLYCORE_H
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/GEMMInfo.h"
@@ -80,6 +80,7 @@ public:
* |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |S32 |
* |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 |
* |QASYMM8_SIGNED |QSYMM8 |S32 |S32 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |F32 |F32 |
*
* @note GEMM_LOWP: low precision GEMM kernel
* This kernel performs the following computations:
@@ -88,12 +89,12 @@ public:
* -# Convert b values from QASYMM8 to int32 add b_offset to each of them.
* -# Compute the matrix product of the resulting a * b in int32.
*
- * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise
+ * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED/F32 otherwise
*
* @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
* @param[in] b Second input tensor (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
- * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32
- * @param[out] output Output tensor. Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED
+ * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32/F32
+ * @param[out] output    Output tensor. Data type supported: S32/QASYMM8/QASYMM8_SIGNED/F32
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should be executed only for the first run
*/
@@ -120,4 +121,4 @@ private:
std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEGEMMLOWPMATRIXMULTIPLYCORE_H
diff --git a/arm_compute/runtime/OMP/OMPScheduler.h b/arm_compute/runtime/OMP/OMPScheduler.h
index b522b403a9..9b39714fea 100644
--- a/arm_compute/runtime/OMP/OMPScheduler.h
+++ b/arm_compute/runtime/OMP/OMPScheduler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_OMPSCHEDULER_H
-#define ARM_COMPUTE_OMPSCHEDULER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H
+#define ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H
#include "arm_compute/runtime/IScheduler.h"
@@ -79,6 +79,7 @@ protected:
private:
unsigned int _num_threads;
+ unsigned int _nonlittle_num_cpus;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_OMPSCHEDULER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H