aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>2023-12-05 14:27:31 +0000
committerMohmun02 <MohammedSuhail.Munshi@arm.com>2024-01-10 09:56:39 +0000
commit7467ba8fac0afb19d750b3bdda9ba95002634038 (patch)
treee47d4989251f03d13590e6b22d9bd228fd1efe34
parent7fe7791468978429ab02343a8485b51b39832027 (diff)
downloadComputeLibrary-7467ba8fac0afb19d750b3bdda9ba95002634038.tar.gz
Use look up table for fp16 activation
- Enables FP16 lut for logistic activation - Adds LUTManager to re-use lut where appropriate. Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com> Change-Id: I94667b63b452a8e58a1eb59cb0b5866178954523 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10864 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gunes Bayir <gunes.bayir@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--Android.bp1
-rw-r--r--arm_compute/function_info/ActivationLayerInfo.h32
-rw-r--r--docs/user_guide/release_version_and_change_log.dox2
-rw-r--r--filelist.json6
-rw-r--r--src/BUILD.bazel2
-rw-r--r--src/CMakeLists.txt2
-rw-r--r--src/core/helpers/LUTManager.cpp72
-rw-r--r--src/core/helpers/LUTManager.h73
-rw-r--r--src/cpu/kernels/CpuActivationKernel.cpp17
-rw-r--r--src/cpu/kernels/CpuActivationKernel.h9
-rw-r--r--src/cpu/kernels/activation/generic/sve/fp16.cpp29
-rw-r--r--src/cpu/kernels/activation/list.h9
-rw-r--r--src/cpu/kernels/lut/generic/sve/u16.cpp103
-rw-r--r--src/cpu/kernels/lut/list.h26
14 files changed, 360 insertions, 23 deletions
diff --git a/Android.bp b/Android.bp
index e2f86a4f46..f7d4d257d6 100644
--- a/Android.bp
+++ b/Android.bp
@@ -392,6 +392,7 @@ cc_library_static {
"src/core/Utils.cpp",
"src/core/Validate.cpp",
"src/core/Version.cpp",
+ "src/core/helpers/LUTManager.cpp",
"src/core/helpers/SoftmaxHelpers.cpp",
"src/core/helpers/Utils.cpp",
"src/core/helpers/WindowHelpers.cpp",
diff --git a/arm_compute/function_info/ActivationLayerInfo.h b/arm_compute/function_info/ActivationLayerInfo.h
index 195b67cf99..9390d0c54f 100644
--- a/arm_compute/function_info/ActivationLayerInfo.h
+++ b/arm_compute/function_info/ActivationLayerInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2023 Arm Limited.
+ * Copyright (c) 2016-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,13 +21,19 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO
-#define ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO
+#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO_H
+#define ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO_H
#include "arm_compute/core/CoreTypes.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/QuantizationInfo.h"
#include <array>
+#include <memory>
+
+#ifdef __aarch64__
+#include <arm_neon.h>
+#endif // __arch64__
namespace arm_compute
{
@@ -58,7 +64,10 @@ public:
typedef arm_compute::ActivationFunction ActivationFunction;
/** Lookup table */
- using LookupTable256 = std::array<qasymm8_t, 256>;
+#ifdef __aarch64__
+ using LookupTable256 = std::array<qasymm8_t, 256>;
+ using LookupTable65536 = std::array<float16_t, 65536>;
+#endif // __aarch64__
ActivationLayerInfo() = default;
/** Default Constructor
@@ -101,6 +110,16 @@ public:
{
_lut = std::move(lut);
}
+
+ const LookupTable65536 &lut_fp16() const
+ {
+ ARM_COMPUTE_ERROR_ON(_lut_fp16 == nullptr);
+ return *_lut_fp16;
+ }
+ void setLookupTable65536(std::shared_ptr<LookupTable65536> lut)
+ {
+ _lut_fp16 = lut;
+ }
#endif // __aarch64__
private:
ActivationFunction _act = {ActivationLayerInfo::ActivationFunction::IDENTITY};
@@ -109,8 +128,9 @@ private:
bool _enabled = {false};
#ifdef __aarch64__
- LookupTable256 _lut = {};
+ LookupTable256 _lut = {};
+ std::shared_ptr<LookupTable65536> _lut_fp16{nullptr};
#endif // __aarch64__
};
} // namespace arm_compute
-#endif /* ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO */
+#endif // ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO_H
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 894a6078ba..40ad09fd84 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -45,6 +45,8 @@ v24.01 Public major release
- Remove the legacy 'libarm_compute_core' library. This library is an artifact of Compute Library's legacy library architecture and no longer serves any purpose.
You should link only to the main `libarm_compute` library for core functionality.
- Expand GPUTarget list with Maliā„¢ G720 and G620.
+ - Optimize CPU activation functions using LUT-based implementation:
+ - Sigmoid function for FP16.
- New features
- Add support for FP16 in all multi_isa builds.
- Performance optimizations:
diff --git a/filelist.json b/filelist.json
index 0c9550905e..7c530f3f33 100644
--- a/filelist.json
+++ b/filelist.json
@@ -14,6 +14,7 @@
"src/core/Error.cpp",
"src/core/GPUTarget.cpp",
"src/core/Helpers.cpp",
+ "src/core/helpers/LUTManager.cpp",
"src/core/IAccessWindow.cpp",
"src/core/IKernel.cpp",
"src/core/ITensor.cpp",
@@ -1828,6 +1829,11 @@
"qasymm8": ["src/cpu/kernels/lut/generic/neon/u8.cpp"],
"qasymm8_signed": ["src/cpu/kernels/lut/generic/neon/u8.cpp"]
},
+ "sve": {
+ "fp16": ["src/cpu/kernels/lut/generic/sve/u16.cpp"],
+ "qasymm16": ["src/cpu/kernels/lut/generic/sve/u16.cpp"],
+ "qasymm16_signed": ["src/cpu/kernels/lut/generic/sve/u16.cpp"]
+ },
"sve2": {
"qasymm8": ["src/cpu/kernels/lut/generic/sve2/u8.cpp"],
"qasymm8_signed": ["src/cpu/kernels/lut/generic/sve2/u8.cpp"]
diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index f0c4b52688..9d5ae63484 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -335,6 +335,7 @@ filegroup(
"cpu/kernels/elementwise_unary/generic/sve/fp32.cpp",
"cpu/kernels/elementwise_unary/generic/sve/impl.cpp",
"cpu/kernels/elementwise_unary/generic/sve/integer.cpp",
+ "cpu/kernels/lut/generic/sve/u16.cpp",
"cpu/kernels/scale/sve/fp16.cpp",
"cpu/kernels/scale/sve/fp32.cpp",
"cpu/kernels/scale/sve/integer.cpp",
@@ -637,6 +638,7 @@ filegroup(
"core/Utils.cpp",
"core/Validate.cpp",
"core/Version.cpp",
+ "core/helpers/LUTManager.cpp",
"core/helpers/SoftmaxHelpers.cpp",
"core/helpers/Utils.cpp",
"core/helpers/WindowHelpers.cpp",
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 0124574765..be7a6ef188 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -312,6 +312,7 @@ target_sources(
cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
cpu/kernels/elementwise_unary/generic/sve/impl.cpp
cpu/kernels/elementwise_unary/generic/sve/integer.cpp
+ cpu/kernels/lut/generic/sve/u16.cpp
cpu/kernels/scale/sve/fp16.cpp
cpu/kernels/scale/sve/fp32.cpp
cpu/kernels/scale/sve/integer.cpp
@@ -628,6 +629,7 @@ target_sources(
core/Utils.cpp
core/Validate.cpp
core/Version.cpp
+ core/helpers/LUTManager.cpp
core/helpers/SoftmaxHelpers.cpp
core/helpers/Utils.cpp
core/helpers/WindowHelpers.cpp
diff --git a/src/core/helpers/LUTManager.cpp b/src/core/helpers/LUTManager.cpp
new file mode 100644
index 0000000000..2bf0098118
--- /dev/null
+++ b/src/core/helpers/LUTManager.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/helpers/LUTManager.h"
+
+namespace arm_compute
+{
+#ifdef __aarch64__
+namespace
+{
+void init_lut_fp16(ActivationLayerInfo::LookupTable65536 *lut)
+{
+ for (uint16_t i = 0; i < lut->size() - 1; ++i)
+ {
+ const float16_t *v = reinterpret_cast<float16_t *>(&i);
+ (*lut)[i] = 1.f / (1.f + std::exp(-*v));
+ }
+ // Final value should be filled outside of loop to avoid overflows.
+ const uint16_t i = lut->size() - 1;
+ const float16_t *v = reinterpret_cast<const float16_t *>(&i);
+ (*lut)[i] = 1.f / (1.f + std::exp(-*v));
+}
+} // namespace
+
+std::shared_ptr<ActivationLayerInfo::LookupTable65536> LUTManager::get_lut_table(LUTInfo info)
+{
+ const auto itr = map_fp16.find(info);
+ if (itr != map_fp16.end() && !itr->second.expired())
+ {
+ // Found and valid
+ return itr->second.lock(); // Return weak ptr as shared ptr
+ }
+ else
+ {
+ // Not found, or pointer not valid
+ const auto ptr = std::make_shared<ActivationLayerInfo::LookupTable65536>();
+ init_lut_fp16(ptr.get());
+ map_fp16[info] = ptr;
+ return ptr;
+ }
+}
+#endif // __aarch64__
+
+// Static function to get LutManager instance
+LUTManager &LUTManager::get_instance()
+{
+ static auto inst_ = std::make_unique<LUTManager>(); // The one, single instance.
+ return *inst_;
+}
+
+} // namespace arm_compute
diff --git a/src/core/helpers/LUTManager.h b/src/core/helpers/LUTManager.h
new file mode 100644
index 0000000000..4e13ead7e3
--- /dev/null
+++ b/src/core/helpers/LUTManager.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CORE_HELPERS_LUTMANAGER_H
+#define ACL_SRC_CORE_HELPERS_LUTMANAGER_H
+
+#include "arm_compute/core/CoreTypes.h"
+#include "arm_compute/core/QuantizationInfo.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include <map>
+#include <memory>
+
+namespace arm_compute
+{
+
+struct LUTInfo
+{
+ ActivationLayerInfo::ActivationFunction act;
+ DataType dt;
+ QuantizationInfo qinfo;
+ // Operators enable use of map with Lutinfo as key
+ friend bool operator<(const LUTInfo &l, const LUTInfo &r)
+ {
+ return (l.act < r.act) || ((l.act == r.act) && (l.dt < r.dt)) ||
+ ((l.act == r.act) && (l.dt == r.dt) && (l.qinfo.scale() < r.qinfo.scale())) ||
+ ((l.act == r.act) && (l.dt == r.dt) && (l.qinfo.scale() == r.qinfo.scale()) &&
+ (l.qinfo.offset() < l.qinfo.offset()));
+ }
+ bool operator==(const LUTInfo &l)
+ {
+ return this->act == l.act && this->dt == l.dt && this->qinfo == l.qinfo;
+ }
+};
+
+/* Class to handle getting look up table */
+class LUTManager
+{
+public:
+ LUTManager() = default;
+
+ static LUTManager &get_instance();
+#ifdef __aarch64__
+ std::shared_ptr<ActivationLayerInfo::LookupTable65536> get_lut_table(LUTInfo info);
+
+private:
+ std::map<LUTInfo, std::weak_ptr<ActivationLayerInfo::LookupTable65536>> map_fp16{};
+#endif // __aarch64__
+};
+
+} // namespace arm_compute
+#endif // ACL_SRC_CORE_HELPERS_LUTMANAGER_H
diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp
index 3f3d72e8df..7cfa39b286 100644
--- a/src/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/cpu/kernels/CpuActivationKernel.cpp
@@ -83,6 +83,13 @@ static const std::vector<CpuActivationKernel::ActivationKernel> available_kernel
data.f != ActivationLayerInfo::ActivationFunction::GELU;
},
REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)},
+ {"sve_fp16_activation_lut",
+ [](const ActivationDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.isa.fp16 && data.isa.sve &&
+ data.f == ActivationLayerInfo::ActivationFunction::LOGISTIC;
+ },
+ REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation_lut)},
{"sve_fp16_activation",
[](const ActivationDataTypeISASelectorData &data)
{
@@ -279,6 +286,9 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac
_name = std::string("CpuActivationKernel").append("/").append(uk->name);
#ifdef __aarch64__
+ // Initialise lut_manager
+ LUTManager &lut_manager = LUTManager::get_instance();
+
if ((src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED) &&
activation_info.activation() != ActivationFunction::RELU)
{
@@ -288,6 +298,13 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac
activation_info.a(), activation_info.b());
activation_info.setLookupTable256(tmp_lut);
}
+
+ if (src->data_type() == DataType::F16 &&
+ activation_info.activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ const LUTInfo info = {activation_info.activation(), src->data_type(), src->quantization_info()};
+ activation_info.setLookupTable65536((lut_manager.get_lut_table(info)));
+ }
#endif // __aarch64__
_act_info = activation_info;
diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h
index 4bad9fb3e8..c1487499d6 100644
--- a/src/cpu/kernels/CpuActivationKernel.h
+++ b/src/cpu/kernels/CpuActivationKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2023 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,12 +21,13 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H
-#define ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H
+#ifndef ACL_SRC_CPU_KERNELS_CPUACTIVATIONKERNEL_H
+#define ACL_SRC_CPU_KERNELS_CPUACTIVATIONKERNEL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "src/core/common/Macros.h"
+#include "src/core/helpers/LUTManager.h"
#include "src/cpu/ICpuKernel.h"
namespace arm_compute
@@ -103,4 +104,4 @@ private:
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H */
+#endif // ACL_SRC_CPU_KERNELS_CPUACTIVATIONKERNEL_H
diff --git a/src/cpu/kernels/activation/generic/sve/fp16.cpp b/src/cpu/kernels/activation/generic/sve/fp16.cpp
index 97399e01e0..19d9126556 100644
--- a/src/cpu/kernels/activation/generic/sve/fp16.cpp
+++ b/src/cpu/kernels/activation/generic/sve/fp16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2023 Arm Limited.
+ * Copyright (c) 2020-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,7 @@
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "src/core/NEON/SVEMath.h"
+#include "src/cpu/kernels/lut/list.h"
#include <arm_sve.h>
#include <cmath>
@@ -141,6 +142,32 @@ void sve_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayer
},
input, output);
}
+
+void sve_fp16_activation_lut(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::F16);
+ const auto window_start_x = window.x().start();
+ const auto window_end_x = window.x().end();
+ const auto size = window_end_x - window_start_x;
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const uint16_t *>(input.ptr());
+ auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
+ lut_u16_sve(reinterpret_cast<const uint16_t *>(act_info.lut_fp16().data()), 1U /* num_strings (UNUSED) */,
+ size, input_ptr + window_start_x, output_ptr + window_start_x);
+ },
+ input, output);
+}
} // namespace cpu
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/activation/list.h b/src/cpu/kernels/activation/list.h
index 6550ddfeca..8c24adc3fe 100644
--- a/src/cpu/kernels/activation/list.h
+++ b/src/cpu/kernels/activation/list.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2023 Arm Limited.
+ * Copyright (c) 2020-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H
-#define SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H
+#ifndef ACL_SRC_CPU_KERNELS_ACTIVATION_LIST_H
+#define ACL_SRC_CPU_KERNELS_ACTIVATION_LIST_H
namespace arm_compute
{
@@ -42,6 +42,7 @@ DECLARE_ACTIVATION_KERNEL(sve2_qasymm8_signed_activation);
DECLARE_ACTIVATION_KERNEL(neon_qsymm16_activation);
DECLARE_ACTIVATION_KERNEL(sve2_qsymm16_activation);
DECLARE_ACTIVATION_KERNEL(sve_fp16_activation);
+DECLARE_ACTIVATION_KERNEL(sve_fp16_activation_lut);
DECLARE_ACTIVATION_KERNEL(sve_fp32_activation);
DECLARE_ACTIVATION_KERNEL(neon_fp16_activation);
DECLARE_ACTIVATION_KERNEL(neon_fp32_activation);
@@ -50,4 +51,4 @@ DECLARE_ACTIVATION_KERNEL(neon_fp32_activation);
} // namespace cpu
} // namespace arm_compute
-#endif /* SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H */
+#endif // ACL_SRC_CPU_KERNELS_ACTIVATION_LIST_H
diff --git a/src/cpu/kernels/lut/generic/sve/u16.cpp b/src/cpu/kernels/lut/generic/sve/u16.cpp
new file mode 100644
index 0000000000..75b8dcaae2
--- /dev/null
+++ b/src/cpu/kernels/lut/generic/sve/u16.cpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Error.h"
+
+#include "src/cpu/kernels/lut/list.h"
+
+#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void lut_u16_sve(const uint16_t *table, size_t num_strings, size_t size, const uint16_t *input, uint16_t *output)
+{
+ int64_t cnth = svcnth();
+ int64_t tail = size & (4 * cnth - 1);
+ int64_t count = size - tail;
+ int64_t pos = 0;
+ ARM_COMPUTE_UNUSED(num_strings);
+ __asm __volatile("cbz %[count], 2f\n"
+ "mov z31.s, #0\n"
+ "cnth x7, ALL, MUL #4\n"
+ "cntb x8, ALL, MUL #4\n"
+ "ptrue p0.b\n"
+ "1:"
+ "ld1h z0.h, p0/z, [%[input]]\n"
+ "ld1h z1.h, p0/z, [%[input], #1, MUL VL]\n"
+ "ld1h z2.h, p0/z, [%[input], #2, MUL VL]\n"
+ "ld1h z3.h, p0/z, [%[input], #3, MUL VL]\n"
+ "add %[input], %[input], x8\n"
+
+ "zip1 z8.h, z0.h, z31.h\n"
+ "ld1h z8.s, p0/z, [%[table], z8.s, UXTW #1]\n"
+ "zip2 z0.h, z0.h, z31.h\n"
+ "ld1h z0.s, p0/z, [%[table], z0.s, UXTW #1]\n"
+ "uzp1 z0.h, z8.h, z0.h\n"
+ "st1h z0.h, p0, [%[output]]\n"
+
+ "zip1 z10.h, z1.h, z31.h\n"
+ "ld1h z10.s, p0/z, [%[table], z10.s, UXTW #1]\n"
+ "zip2 z1.h, z1.h, z31.h\n"
+ "ld1h z1.s, p0/z, [%[table], z1.s, UXTW #1]\n"
+ "uzp1 z1.h, z10.h, z1.h\n"
+ "st1h z1.h, p0, [%[output], #1, MUL VL]\n"
+
+ "zip1 z12.h, z2.h, z31.h\n"
+ "ld1h z12.s, p0/z, [%[table], z12.s, UXTW #1]\n"
+ "zip2 z2.h, z2.h, z31.h\n"
+ "ld1h z2.s, p0/z, [%[table], z2.s, UXTW #1]\n"
+ "uzp1 z2.h, z12.h, z2.h\n"
+ "st1h z2.h, p0, [%[output], #2, MUL VL]\n"
+
+ "zip1 z14.h, z3.h, z31.h\n"
+ "ld1h z14.s, p0/z, [%[table], z14.s, UXTW #1]\n"
+ "zip2 z3.h, z3.h, z31.h\n"
+ "ld1h z3.s, p0/z, [%[table], z3.s, UXTW #1]\n"
+ "uzp1 z3.h, z14.h, z3.h\n"
+ "st1h z3.h, p0, [%[output], #3, MUL VL]\n"
+
+ "add %[pos], %[pos], x7\n"
+ "add %[output], %[output], x8\n"
+ "cmp %[pos], %[count]\n"
+ "blt 1b\n"
+ "2:\n"
+ : [count] "+r"(count), [input] "+r"(input), [output] "+r"(output), [pos] "+r"(pos)
+ : [table] "r"(table)
+ : "memory", "cc", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12",
+ "z14", "z31", "p0", "p1", "z2", "z3", "z4", "x7", "x8");
+ for (int i = 0; i < tail; i++)
+ {
+ output[i] = table[input[i]];
+ }
+}
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // __aarch64__
diff --git a/src/cpu/kernels/lut/list.h b/src/cpu/kernels/lut/list.h
index da90346267..9acfe97728 100644
--- a/src/cpu/kernels/lut/list.h
+++ b/src/cpu/kernels/lut/list.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,8 +22,8 @@
* SOFTWARE.
*/
-#ifndef SRC_CORE_NEON_KERNELS_LUT_LIST_H
-#define SRC_CORE_NEON_KERNELS_LUT_LIST_H
+#ifndef ACL_SRC_CPU_KERNELS_LUT_LIST_H
+#define ACL_SRC_CPU_KERNELS_LUT_LIST_H
#include <cstddef>
#include <cstdint>
@@ -34,17 +34,27 @@ namespace cpu
{
#ifdef __aarch64__
-#define DECLARE_LUT_KERNEL(func_name) \
+#define DECLARE_LUT_U8_KERNEL(func_name) \
void func_name(const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, \
uint8_t *const *output)
-DECLARE_LUT_KERNEL(lut_u8_neon);
-DECLARE_LUT_KERNEL(lut_u8_sve2);
+DECLARE_LUT_U8_KERNEL(lut_u8_neon);
+DECLARE_LUT_U8_KERNEL(lut_u8_sve2);
+
+#undef DECLARE_LUT_U8_KERNEL
+
+#define DECLARE_LUT_U16_KERNEL(func_name) \
+ void func_name(const uint16_t *table, size_t num_strings, size_t string_length, const uint16_t *input, \
+ uint16_t *output)
+
+DECLARE_LUT_U16_KERNEL(lut_u16_neon);
+DECLARE_LUT_U16_KERNEL(lut_u16_sve);
+
+#undef DECLARE_LUT_U16_KERNEL
-#undef DECLARE_LUT_KERNEL
#endif // __aarch64__
} // namespace cpu
} // namespace arm_compute
-#endif // SRC_CORE_NEON_KERNELS_LUT_LIST_H
+#endif // ACL_SRC_CPU_KERNELS_LUT_LIST_H