author    Michalis Spyrou <michalis.spyrou@arm.com>    2020-11-22 00:49:42 +0000
committer Michalis Spyrou <michalis.spyrou@arm.com>    2020-12-14 16:02:26 +0000
commit    aa51a5ba9a3f05be08b94859b53c398edee5d2e3 (patch)
tree      b28829b483421b210cd7c8a256c7feafed736b36
parent    3737c7934da929003bda446291489cf352e43751 (diff)
COMPMID-3870: Create ActivationLayer SVE/SVE2
Adds support for ActivationLayer for SVE and SVE2.

Datatypes supported:
*FP32
*FP16
*QASYMM8
*QASYMM8_SIGNED
*QSYMM16

Change-Id: Ia3583891795cda4ca2f9fa27c440731a5c27710d
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4566
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--Android.bp19
-rw-r--r--SConscript10
-rwxr-xr-xscripts/check_bad_style.sh2
-rw-r--r--src/core/NEON/SVEAsymm.h262
-rw-r--r--src/core/NEON/SVEAsymm.inl105
-rw-r--r--src/core/NEON/SVEMath.h116
-rw-r--r--src/core/NEON/SVEMath.inl266
-rw-r--r--src/core/NEON/SVESymm.h127
-rw-r--r--src/core/NEON/kernels/NEActivationLayerKernel.cpp33
-rw-r--r--src/core/NEON/kernels/activation/impl/NEON/fp16.cpp (renamed from src/core/NEON/kernels/activation/impl/fp16_neon_activation.cpp)0
-rw-r--r--src/core/NEON/kernels/activation/impl/NEON/fp32.cpp (renamed from src/core/NEON/kernels/activation/impl/fp32_neon_activation.cpp)0
-rw-r--r--src/core/NEON/kernels/activation/impl/NEON/qasymm8.cpp (renamed from src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp)0
-rw-r--r--src/core/NEON/kernels/activation/impl/NEON/qasymm8_signed.cpp (renamed from src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp)0
-rw-r--r--src/core/NEON/kernels/activation/impl/NEON/qsymm16.cpp (renamed from src/core/NEON/kernels/activation/impl/qsymm16_neon_activation.cpp)0
-rw-r--r--src/core/NEON/kernels/activation/impl/SVE/fp16.cpp132
-rw-r--r--src/core/NEON/kernels/activation/impl/SVE/fp32.cpp133
-rw-r--r--src/core/NEON/kernels/activation/impl/SVE/qasymm8.cpp185
-rw-r--r--src/core/NEON/kernels/activation/impl/SVE/qasymm8_signed.cpp183
-rw-r--r--src/core/NEON/kernels/activation/impl/SVE/qsymm16.cpp122
-rw-r--r--src/core/NEON/kernels/activation/impl/list.h5
-rw-r--r--src/core/NEON/kernels/floor/impl/NEON/fp16.cpp (renamed from src/core/NEON/kernels/floor/impl/fp16_neon_floor.cpp)0
-rw-r--r--src/core/NEON/kernels/floor/impl/NEON/fp32.cpp (renamed from src/core/NEON/kernels/floor/impl/fp32_neon_floor.cpp)0
-rw-r--r--src/core/common/Registrars.h33
-rw-r--r--tests/validation/NEON/ActivationLayer.cpp40
24 files changed, 1752 insertions(+), 21 deletions(-)
diff --git a/Android.bp b/Android.bp
index 404c1d54c8..9f7f447fe3 100644
--- a/Android.bp
+++ b/Android.bp
@@ -343,11 +343,16 @@ cc_library_static {
"src/core/NEON/kernels/NEWeightsReshapeKernel.cpp",
"src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp",
"src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp",
- "src/core/NEON/kernels/activation/impl/fp16_neon_activation.cpp",
- "src/core/NEON/kernels/activation/impl/fp32_neon_activation.cpp",
- "src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp",
- "src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp",
- "src/core/NEON/kernels/activation/impl/qsymm16_neon_activation.cpp",
+ "src/core/NEON/kernels/activation/impl/NEON/fp16.cpp",
+ "src/core/NEON/kernels/activation/impl/NEON/fp32.cpp",
+ "src/core/NEON/kernels/activation/impl/NEON/qasymm8.cpp",
+ "src/core/NEON/kernels/activation/impl/NEON/qasymm8_signed.cpp",
+ "src/core/NEON/kernels/activation/impl/NEON/qsymm16.cpp",
+ "src/core/NEON/kernels/activation/impl/SVE/fp16.cpp",
+ "src/core/NEON/kernels/activation/impl/SVE/fp32.cpp",
+ "src/core/NEON/kernels/activation/impl/SVE/qasymm8.cpp",
+ "src/core/NEON/kernels/activation/impl/SVE/qasymm8_signed.cpp",
+ "src/core/NEON/kernels/activation/impl/SVE/qsymm16.cpp",
"src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp",
"src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp",
"src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp",
@@ -400,8 +405,8 @@ cc_library_static {
"src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp",
"src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp",
"src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp",
- "src/core/NEON/kernels/floor/impl/fp16_neon_floor.cpp",
- "src/core/NEON/kernels/floor/impl/fp32_neon_floor.cpp",
+ "src/core/NEON/kernels/floor/impl/NEON/fp16.cpp",
+ "src/core/NEON/kernels/floor/impl/NEON/fp32.cpp",
"src/core/PyramidInfo.cpp",
"src/core/Rounding.cpp",
"src/core/Size2D.cpp",
diff --git a/SConscript b/SConscript
index f1fe9b288c..656336d555 100644
--- a/SConscript
+++ b/SConscript
@@ -244,15 +244,15 @@ if env['neon']:
core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/sve_*/*.cpp')
if any(i in env['data_type_support'] for i in ['all', 'fp16']):
- core_files += Glob('src/core/NEON/kernels/*/impl/fp16_*.cpp')
+ core_files += Glob('src/core/NEON/kernels/*/impl/*/fp16.cpp')
if any(i in env['data_type_support'] for i in ['all', 'fp32']):
- core_files += Glob('src/core/NEON/kernels/*/impl/fp32_*.cpp')
+ core_files += Glob('src/core/NEON/kernels/*/impl/*/fp32.cpp')
if any(i in env['data_type_support'] for i in ['all', 'qasymm8']):
- core_files += Glob('src/core/NEON/kernels/*/impl/qasymm8_neon*.cpp')
+ core_files += Glob('src/core/NEON/kernels/*/impl/*/qasymm8.cpp')
if any(i in env['data_type_support'] for i in ['all', 'qasymm8_signed']):
- core_files += Glob('src/core/NEON/kernels/*/impl/qasymm8_signed_*.cpp')
+ core_files += Glob('src/core/NEON/kernels/*/impl/*/qasymm8_signed.cpp')
if any(i in env['data_type_support'] for i in ['all', 'qsymm16']):
- core_files += Glob('src/core/NEON/kernels/*/impl/qsymm16_*.cpp')
+ core_files += Glob('src/core/NEON/kernels/*/impl/*/qsymm16.cpp')
runtime_files += Glob('src/runtime/NEON/*.cpp')
runtime_files += Glob('src/runtime/NEON/functions/*.cpp')
diff --git a/scripts/check_bad_style.sh b/scripts/check_bad_style.sh
index acc496b00a..26524d7ec9 100755
--- a/scripts/check_bad_style.sh
+++ b/scripts/check_bad_style.sh
@@ -37,7 +37,7 @@ then
exit -1
fi
-grep -HnR --exclude-dir=assembly --exclude-dir=convolution --exclude-dir=arm_gemm "float32_t" $FILES | tee bad_style.log
+grep -HnR --exclude-dir=assembly --exclude-dir=convolution --exclude-dir=arm_gemm "/^float32_t/" $FILES | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
diff --git a/src/core/NEON/SVEAsymm.h b/src/core/NEON/SVEAsymm.h
new file mode 100644
index 0000000000..4b0ecd9eea
--- /dev/null
+++ b/src/core/NEON/SVEAsymm.h
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_SVEASYMM_H
+#define ARM_COMPUTE_SVEASYMM_H
+
+#if defined(__ARM_FEATURE_SVE2)
+#include "src/core/NEON/SVEMath.h"
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+/** Perform a multiply-accumulate on all components of a QASYMM8 vector
+ *
+ * vd*vs + vo
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] vd Input vector value in QASYMM8 format
+ * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all lanes.
+ * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all lanes.
+ *
+ * @return A vector in QASYMM8 format, saturated to fit
+ */
+svuint8_t svmla_qasymm8_z(svbool_t pg, svuint8_t vd, svfloat32_t vs, svfloat32_t vo);
+
+/** Perform a multiply-accumulate on all components of a QASYMM8_SIGNED vector
+ *
+ * vd*vs + vo
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] vd Input vector value in QASYMM8_SIGNED format
+ * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all lanes.
+ * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all lanes.
+ *
+ * @return A vector in QASYMM8_SIGNED format, saturated to fit
+ */
+svint8_t svmla_qasymm8_signed_z(svbool_t pg, svint8_t vd, svfloat32_t vs, svfloat32_t vo);
+
+/** Dequantize an sve vector following an asymmetric quantization scheme.
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] scale Quantization scaling factor.
+ * @param[in] offset Zero quantization offset.
+ *
+ * @return Dequantized values in an sve vector
+ */
+inline svfloat32x4_t svdequantize_z(svbool_t pg, const svuint8_t &qv, float scale, int32_t offset)
+{
+ const auto voffset = svdup_n_s32(offset);
+ const auto vscale = svdup_n_f32(scale);
+ const svfloat32x4_t vdequantized_input =
+ {
+ { {
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(qv))), voffset)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(qv))), voffset)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(qv))), voffset)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(qv))), voffset)), vscale),
+ }
+ }
+ };
+ return vdequantized_input;
+}
+
+/** Dequantize an sve vector
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return Dequantized values in an sve vector
+ */
+inline svfloat32x4_t svdequantize_z(svbool_t pg, const svuint8_t &qv, const UniformQuantizationInfo &qi)
+{
+ return svdequantize_z(pg, qv, qi.scale, qi.offset);
+}
+
+/** Dequantize an sve vector holding signed asymmetric quantized values.
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] scale Quantization scaling factor.
+ * @param[in] offset Zero quantization offset.
+ *
+ * @return Dequantized values in an sve vector
+ */
+inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale, int32_t offset)
+{
+ const auto voffset = svdup_n_s32(offset);
+ const auto vscale = svdup_n_f32(scale);
+ const svfloat32x4_t vdequantized_input =
+ {
+ { {
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(qv)), voffset)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(qv)), voffset)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(qv)), voffset)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(qv)), voffset)), vscale),
+ }
+ }
+ };
+ return vdequantized_input;
+}
+
+/** Dequantize an sve vector.
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return Dequantized values in an sve vector
+ */
+inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const UniformQuantizationInfo &qi)
+{
+ return svdequantize_z(pg, qv, qi.scale, qi.offset);
+}
+
+/** Dequantize an sve vector following a symmetric quantization scheme.
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] vscale Vector containing quantization scaling factors.
+ *
+ * @return Dequantized values in an sve vector
+ */
+inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const svfloat32x4_t vscale)
+{
+ const svfloat32x4_t vdequantized_input =
+ {
+ { {
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), svget4_f32(vscale, 0)),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), svget4_f32(vscale, 1)),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), svget4_f32(vscale, 2)),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), svget4_f32(vscale, 3)),
+ }
+ }
+ };
+ return vdequantized_input;
+}
+
+/** Dequantize an sve vector following a symmetric quantization scheme.
+ *
+ * @param[in] pg    Predicate value.
+ * @param[in] qv    Input values to be dequantized.
+ * @param[in] scale Quantization scaling factor.
+ *
+ * @return Dequantized values in an sve vector
+ */
+inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale)
+{
+ const auto vscale = svdup_n_f32(scale);
+ const svfloat32x4_t vdequantized_input =
+ {
+ { {
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), vscale),
+ }
+ }
+ };
+ return vdequantized_input;
+}
+
+/** Quantize an sve vector holding floating point values.
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return An sve vector holding the quantized values
+ */
+inline svuint8_t svquantize_z(svbool_t pg, const svfloat32x4_t qv, const UniformQuantizationInfo &qi)
+{
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const auto voffset = svdup_n_f32(offset);
+ const auto vinvscale = svdup_n_f32(1.f / scale);
+
+ const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 0), vinvscale));
+ const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 1), vinvscale));
+ const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 2), vinvscale));
+ const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 3), vinvscale));
+
+ const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
+ const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
+
+ return svqxtnt_u16(svqxtnb_u16(pa), pb);
+}
+
+/** Signed quantize an sve vector holding floating point values.
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return An sve vector holding the quantized values
+ */
+inline svint8_t svquantize_signed_z(svbool_t pg, const svfloat32x4_t qv, const UniformQuantizationInfo &qi)
+{
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const auto voffset = svdup_n_f32(offset);
+ const auto vinvscale = svdup_n_f32(1.f / scale);
+ const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 0), vinvscale));
+ const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 1), vinvscale));
+ const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 2), vinvscale));
+ const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 3), vinvscale));
+
+ const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+ const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
+
+ return svqxtnt_s16(svqxtnb_s16(pa), pb);
+}
+
+/** Quantize four sve vectors of floating point values to QASYMM16.
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return An sve vector holding the quantized values
+ */
+inline svuint16x2_t svquantize_qasymm16_z(svbool_t pg, const svfloat32x4_t qv, const UniformQuantizationInfo &qi)
+{
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const auto voffset = svdup_n_f32(offset);
+ const auto vinvscale = svdup_n_f32(1.f / scale);
+
+ const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 0), vinvscale));
+ const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 1), vinvscale));
+ const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 2), vinvscale));
+ const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 3), vinvscale));
+
+ const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
+ const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
+
+ return svcreate2_u16(pa, pb);
+}
+} // namespace arm_compute
+#include "src/core/NEON/SVEAsymm.inl"
+#endif /* defined(__ARM_FEATURE_SVE2) */
+#endif // ARM_COMPUTE_SVEASYMM_H
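The helpers above are designed to be used in pairs inside a predicated loop: widen a QASYMM8 vector to four F32 vectors with svdequantize_z, do the arithmetic in float, then narrow back with svquantize_z. A minimal sketch of that round trip, assuming SVE2 is available; scale_qasymm8 and its buffer arguments are hypothetical names, not part of this patch:

    // Sketch: QASYMM8 -> F32x4 -> QASYMM8 round trip with tail predication.
    #include <arm_sve.h>
    #include "src/core/NEON/SVEAsymm.h"

    void scale_qasymm8(const uint8_t *in, uint8_t *out, int n,
                       const arm_compute::UniformQuantizationInfo &qi_in,
                       const arm_compute::UniformQuantizationInfo &qi_out)
    {
        int      x  = 0;
        svbool_t pg = svwhilelt_b8(x, n);
        do
        {
            const svuint8_t vin = svld1_u8(pg, in + x);
            // Widen to four F32 vectors using the input quantization info
            const svfloat32x4_t vf = arm_compute::svdequantize_z(pg, vin, qi_in);
            // ... float math on svget4_f32(vf, 0..3) would go here ...
            // Narrow back with saturation using the output quantization info
            svst1_u8(pg, out + x, arm_compute::svquantize_z(pg, vf, qi_out));
            x += svcntb();
            pg = svwhilelt_b8(x, n);
        } while(svptest_any(svptrue_b8(), pg));
    }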
diff --git a/src/core/NEON/SVEAsymm.inl b/src/core/NEON/SVEAsymm.inl
new file mode 100644
index 0000000000..edf5733c36
--- /dev/null
+++ b/src/core/NEON/SVEAsymm.inl
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+namespace arm_compute
+{
+#if defined(__ARM_FEATURE_SVE2)
+inline svuint8_t svmla_qasymm8_z(svbool_t pg, svuint8_t vd, svfloat32_t vs, svfloat32_t vo)
+{
+ // Convert uint8 vectors to uint16 vectors
+ auto vd_low_u16 = svmovlb_u16(vd);
+ auto vd_high_u16 = svmovlt_u16(vd);
+
+ // Convert uint16 vectors to uint32 vectors
+ auto A_u32 = svmovlb_u32(vd_low_u16);
+ auto B_u32 = svmovlt_u32(vd_low_u16);
+ auto C_u32 = svmovlb_u32(vd_high_u16);
+ auto D_u32 = svmovlt_u32(vd_high_u16);
+
+ // Convert uint32 vectors to float32 vectors
+ auto A_f32 = svcvt_f32_u32_z(pg, A_u32);
+ auto B_f32 = svcvt_f32_u32_z(pg, B_u32);
+ auto C_f32 = svcvt_f32_u32_z(pg, C_u32);
+ auto D_f32 = svcvt_f32_u32_z(pg, D_u32);
+
+ // vd = vd*vs + vo
+ A_f32 = svmla_f32_z(pg, vo, A_f32, vs);
+ B_f32 = svmla_f32_z(pg, vo, B_f32, vs);
+ C_f32 = svmla_f32_z(pg, vo, C_f32, vs);
+ D_f32 = svmla_f32_z(pg, vo, D_f32, vs);
+
+ // Convert float32 vectors to uint32 vectors
+ A_u32 = svcvt_u32_f32_z(pg, A_f32);
+ B_u32 = svcvt_u32_f32_z(pg, B_f32);
+ C_u32 = svcvt_u32_f32_z(pg, C_f32);
+ D_u32 = svcvt_u32_f32_z(pg, D_f32);
+
+ // Convert uint32 vectors to uint16 vectors (with saturation)
+ vd_low_u16 = svqxtnt_u32(svqxtnb_u32(A_u32), B_u32);
+ vd_high_u16 = svqxtnt_u32(svqxtnb_u32(C_u32), D_u32);
+
+ // Convert uint16 vectors to uint8 vectors (with saturation)
+ const auto res = svqxtnt_u16(svqxtnb_u16(vd_low_u16), vd_high_u16);
+ return res;
+}
+
+inline svint8_t svmla_qasymm8_signed_z(svbool_t pg, svint8_t vd, svfloat32_t vs, svfloat32_t vo)
+{
+ // Convert int8 vectors to int16 vectors
+ auto vd_low_s16 = svmovlb_s16(vd);
+ auto vd_high_s16 = svmovlt_s16(vd);
+
+ // Convert int16 vectors to int32 vectors
+ auto A_s32 = svmovlb_s32(vd_low_s16);
+ auto B_s32 = svmovlt_s32(vd_low_s16);
+ auto C_s32 = svmovlb_s32(vd_high_s16);
+ auto D_s32 = svmovlt_s32(vd_high_s16);
+
+ // Convert int32 vectors to float32 vectors
+ auto A_f32 = svcvt_f32_s32_z(pg, A_s32);
+ auto B_f32 = svcvt_f32_s32_z(pg, B_s32);
+ auto C_f32 = svcvt_f32_s32_z(pg, C_s32);
+ auto D_f32 = svcvt_f32_s32_z(pg, D_s32);
+
+ // vd = vd*vs + vo
+ A_f32 = svmla_f32_z(pg, vo, A_f32, vs);
+ B_f32 = svmla_f32_z(pg, vo, B_f32, vs);
+ C_f32 = svmla_f32_z(pg, vo, C_f32, vs);
+ D_f32 = svmla_f32_z(pg, vo, D_f32, vs);
+
+ // Convert float32 vectors to int32 vectors
+ A_s32 = svcvt_s32_f32_z(pg, A_f32);
+ B_s32 = svcvt_s32_f32_z(pg, B_f32);
+ C_s32 = svcvt_s32_f32_z(pg, C_f32);
+ D_s32 = svcvt_s32_f32_z(pg, D_f32);
+
+ // Convert int32 vectors to int16 vectors (with saturation)
+ vd_low_s16 = svqxtnt_s32(svqxtnb_s32(A_s32), B_s32);
+ vd_high_s16 = svqxtnt_s32(svqxtnb_s32(C_s32), D_s32);
+
+ // Convert int16 vectors to int8 vectors (with saturation)
+ const auto res = svqxtnt_s16(svqxtnb_s16(vd_low_s16), vd_high_s16);
+ return res;
+}
+#endif /* defined(__ARM_FEATURE_SVE2) */
+} // namespace arm_compute
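svmla_qasymm8_z is the requantization primitive used by the activation kernels later in this patch. For a value quantized as real = scale_in * (q_in - offset_in), re-expressing it in the output space as q_out = real / scale_out + offset_out collapses to q_out = q_in * s + o, with s = scale_in / scale_out and o = -offset_in * s + offset_out. A sketch of the call site, assuming a b8 predicate pg and a loaded svuint8_t vin are in scope:

    // Sketch: requantize a QASYMM8 vector from (scale_in, offset_in) to (scale_out, offset_out).
    const float       s  = qi_in.scale / qi_out.scale;        // combined scale
    const float       o  = -qi_in.offset * s + qi_out.offset; // combined offset
    const svfloat32_t vs = svdup_n_f32(s);
    const svfloat32_t vo = svdup_n_f32(o);
    const svuint8_t   requantized = arm_compute::svmla_qasymm8_z(pg, vin, vs, vo); // q*s + o, saturated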
diff --git a/src/core/NEON/SVEMath.h b/src/core/NEON/SVEMath.h
new file mode 100644
index 0000000000..bdf2e894e2
--- /dev/null
+++ b/src/core/NEON/SVEMath.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_SVEMATH_H
+#define ARM_COMPUTE_SVEMATH_H
+
+#if defined(__ARM_FEATURE_SVE)
+#include <arm_sve.h>
+#include <array>
+
+namespace arm_compute
+{
+/** Calculate exponential.
+ *
+ * @param[in] pg  Predicate value.
+ * @param[in] val Input vector value in F32 format.
+ *
+ * @return The calculated exponential.
+ */
+svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t val);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+svfloat32_t svinv_f32_z(svbool_t pg, svfloat32_t x);
+
+/** Calculate logarithm
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] x Input vector value in F32 format.
+ *
+ * @return The calculated logarithm.
+ */
+svfloat32_t svlog_f32_z(svbool_t pg, svfloat32_t x);
+
+/** Calculate hyperbolic tangent.
+ *
+ * tanh(x) = (e^2x - 1)/(e^2x + 1)
+ *
+ * @note We clamp x to [-10,10] to avoid overflow issues.
+ *
+ * @param[in] pg  Predicate value.
+ * @param[in] val Input vector value in F32 format.
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+svfloat32_t svtanh_f32_z(svbool_t pg, svfloat32_t val);
+
+/** Calculate hyperbolic tangent.
+ *
+ * tanh(x) = (e^2x - 1)/(e^2x + 1)
+ *
+ * @note We clamp x to [-10,10] to avoid overflow issues.
+ *
+ * @param[in] pg  Predicate value.
+ * @param[in] val Input vector value in F16 format.
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+svfloat16_t svtanh_f16_z(svbool_t pg, svfloat16_t val);
+
+/** Calculate exponential
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] x  Input vector value in F16 format.
+ *
+ * @return The calculated exponential.
+ */
+svfloat16_t svexp_f16_z(svbool_t pg, svfloat16_t x);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+svfloat16_t svinv_f16_z(svbool_t pg, svfloat16_t x);
+
+/** Calculate logarithm
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] x  Input vector value in F16 format.
+ *
+ * @return The calculated logarithm.
+ */
+svfloat16_t svlog_f16_z(svbool_t pg, svfloat16_t x);
+
+} // namespace arm_compute
+#include "src/core/NEON/SVEMath.inl"
+#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* ARM_COMPUTE_SVEMATH_H */
\ No newline at end of file
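These helpers compose directly into the activation bodies further down in this patch. For example, the LOGISTIC case computes 1 / (1 + e^-x) from svinv, svadd and svexp; a sketch for F32, assuming a predicate pg and an input vector vin are in scope:

    // Sketch: logistic(x) = 1 / (1 + exp(-x)) built from the SVEMath helpers.
    const svfloat32_t one = svdup_n_f32(1.f);
    const svfloat32_t y   = arm_compute::svinv_f32_z(
        pg, svadd_f32_z(pg, one, arm_compute::svexp_f32_z(pg, svneg_f32_z(pg, vin))));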
diff --git a/src/core/NEON/SVEMath.inl b/src/core/NEON/SVEMath.inl
new file mode 100644
index 0000000000..5ebfeaa5c5
--- /dev/null
+++ b/src/core/NEON/SVEMath.inl
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <cmath>
+#include <limits>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_compute
+{
+inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, svfloat32_t x, const std::array<svfloat32_t, 8> &coeffs)
+{
+ const auto A = svmla_f32_z(pg, coeffs[0], coeffs[4], x);
+ const auto B = svmla_f32_z(pg, coeffs[2], coeffs[6], x);
+ const auto C = svmla_f32_z(pg, coeffs[1], coeffs[5], x);
+ const auto D = svmla_f32_z(pg, coeffs[3], coeffs[7], x);
+ const auto x2 = svmul_f32_z(pg, x, x);
+ const auto x4 = svmul_f32_z(pg, x2, x2);
+ const auto res = svmla_f32_z(pg, svmla_f32_z(pg, A, B, x2), svmla_f32_z(pg, C, D, x2), x4);
+ return res;
+}
+
+inline svfloat16_t svtaylor_poly_f16_z(svbool_t pg, svfloat16_t x, const std::array<svfloat16_t, 8> &coeffs)
+{
+ const auto A = svmla_f16_z(pg, coeffs[0], coeffs[4], x);
+ const auto B = svmla_f16_z(pg, coeffs[2], coeffs[6], x);
+ const auto C = svmla_f16_z(pg, coeffs[1], coeffs[5], x);
+ const auto D = svmla_f16_z(pg, coeffs[3], coeffs[7], x);
+ const auto x2 = svmul_f16_z(pg, x, x);
+ const auto x4 = svmul_f16_z(pg, x2, x2);
+ const auto res = svmla_f16_z(pg, svmla_f16_z(pg, A, B, x2), svmla_f16_z(pg, C, D, x2), x4);
+ return res;
+}
+
+inline svfloat16_t svinv_f16_z(svbool_t pg, svfloat16_t x)
+{
+ auto recip = svrecpe_f16(x);
+ recip = svmul_f16_z(pg, svrecps_f16(x, recip), recip);
+ recip = svmul_f16_z(pg, svrecps_f16(x, recip), recip);
+ return recip;
+}
+
+inline svfloat32_t svinv_f32_z(svbool_t pg, svfloat32_t x)
+{
+ auto recip = svrecpe_f32(x);
+ recip = svmul_f32_z(pg, svrecps_f32(x, recip), recip);
+ recip = svmul_f32_z(pg, svrecps_f32(x, recip), recip);
+ return recip;
+}
+
+inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x)
+{
+ const auto CONST_LN2 = svdup_n_f32(0.6931471805f); // ln(2)
+ const auto CONST_INV_LN2 = svdup_n_f32(1.4426950408f); // 1/ln(2)
+ const auto CONST_INF = svdup_n_f32(std::numeric_limits<float>::infinity());
+ const auto CONST_MAX_INPUT = svdup_n_f32(88.7f);
+ const auto CONST_0 = svdup_n_f32(0.f);
+ const auto CONST_NEGATIVE_126 = svdup_n_s32(-126);
+
+ /** Exponent polynomial coefficients */
+ const std::array<svfloat32_t, 8> exp_tab =
+ {
+ {
+ svdup_n_f32(1.f),
+ svdup_n_f32(0.0416598916054f),
+ svdup_n_f32(0.500000596046f),
+ svdup_n_f32(0.0014122662833f),
+ svdup_n_f32(1.00000011921f),
+ svdup_n_f32(0.00833693705499f),
+ svdup_n_f32(0.166665703058f),
+ svdup_n_f32(0.000195780929062f),
+ }
+ };
+
+ // Perform range reduction [-log(2),log(2)]
+ auto m = svcvt_s32_f32_z(pg, svmul_f32_z(pg, x, CONST_INV_LN2));
+ auto val = svmls_f32_z(pg, x, svcvt_f32_s32_z(pg, m), CONST_LN2);
+
+ // Polynomial Approximation
+ auto poly = svtaylor_poly_f32_z(pg, val, exp_tab);
+
+ // Reconstruct
+ poly = svreinterpret_f32_s32(svqadd_s32(svreinterpret_s32_f32(poly), svlsl_n_s32_z(pg, m, 23)));
+
+ // Handle underflow
+ svbool_t ltpg = svcmplt_s32(pg, m, CONST_NEGATIVE_126);
+ poly = svsel_f32(ltpg, CONST_0, poly);
+
+ // Handle overflow
+ svbool_t gtpg = svcmpgt_f32(pg, x, CONST_MAX_INPUT);
+ poly = svsel_f32(gtpg, CONST_INF, poly);
+
+ return poly;
+}
+
+inline svfloat16_t svexp_f16_z(svbool_t pg, svfloat16_t x)
+{
+ const auto CONST_LN2 = svdup_n_f16(0.6931471805f); // ln(2)
+ const auto CONST_INV_LN2 = svdup_n_f16(1.4426950408f); // 1/ln(2)
+ const auto CONST_INF = svdup_n_f16(std::numeric_limits<float16_t>::infinity());
+ const auto CONST_MAX_INPUT = svdup_n_f16(88.7f);
+ const auto CONST_0 = svdup_n_f16(0.f);
+ const auto CONST_NEGATIVE_126 = svdup_n_s16(-126);
+
+ /** Exponent polynomial coefficients */
+ const std::array<svfloat16_t, 8> exp_tab =
+ {
+ {
+ svdup_n_f16(1.f),
+ svdup_n_f16(0.0416598916054f),
+ svdup_n_f16(0.500000596046f),
+ svdup_n_f16(0.0014122662833f),
+ svdup_n_f16(1.00000011921f),
+ svdup_n_f16(0.00833693705499f),
+ svdup_n_f16(0.166665703058f),
+ svdup_n_f16(0.000195780929062f),
+ }
+ };
+
+ // Perform range reduction [-log(2),log(2)]
+ auto m = svcvt_s16_f16_z(pg, svmul_f16_z(pg, x, CONST_INV_LN2));
+ auto val = svmls_f16_z(pg, x, svcvt_f16_s16_z(pg, m), CONST_LN2);
+
+ // Polynomial Approximation
+ auto poly = svtaylor_poly_f16_z(pg, val, exp_tab);
+
+ // Reconstruct
+ poly = svreinterpret_f16_s16(svqadd_s16(svreinterpret_s16_f16(poly), svlsl_n_s16_z(pg, m, 11)));
+
+ // Handle underflow
+ svbool_t ltpg = svcmplt_s16(pg, m, CONST_NEGATIVE_126);
+ poly = svsel_f16(ltpg, CONST_0, poly);
+
+ // Handle overflow
+ svbool_t gtpg = svcmpgt_f16(pg, x, CONST_MAX_INPUT);
+ poly = svsel_f16(gtpg, CONST_INF, poly);
+
+ return poly;
+}
+
+inline svfloat32_t svtanh_f32_z(svbool_t pg, svfloat32_t val)
+{
+ const svfloat32_t CONST_1 = svdup_n_f32(1.f);
+ const svfloat32_t CONST_2 = svdup_n_f32(2.f);
+ const svfloat32_t CONST_MIN_TANH = svdup_n_f32(-10.f);
+ const svfloat32_t CONST_MAX_TANH = svdup_n_f32(10.f);
+
+ svfloat32_t x = svmin_f32_z(pg, svmax_f32_z(pg, val, CONST_MIN_TANH), CONST_MAX_TANH);
+ svfloat32_t exp2x = svexp_f32_z(pg, svmul_f32_z(pg, CONST_2, x));
+ svfloat32_t num = svsub_f32_z(pg, exp2x, CONST_1);
+ svfloat32_t den = svadd_f32_z(pg, exp2x, CONST_1);
+ svfloat32_t tanh = svdiv_f32_z(pg, num, den);
+ return tanh;
+}
+
+inline svfloat16_t svtanh_f16_z(svbool_t pg, svfloat16_t val)
+{
+ const svfloat16_t CONST_1 = svdup_n_f16(1.f);
+ const svfloat16_t CONST_2 = svdup_n_f16(2.f);
+ const svfloat16_t CONST_MIN_TANH = svdup_n_f16(-10.f);
+ const svfloat16_t CONST_MAX_TANH = svdup_n_f16(10.f);
+
+ const svfloat16_t x = svmin_f16_z(pg, svmax_f16_z(pg, val, CONST_MIN_TANH), CONST_MAX_TANH);
+ const svfloat16_t exp2x = svexp_f16_z(pg, svmul_f16_z(pg, CONST_2, x));
+ const svfloat16_t num = svsub_f16_z(pg, exp2x, CONST_1);
+ const svfloat16_t den = svadd_f16_z(pg, exp2x, CONST_1);
+ const svfloat16_t tanh = svdiv_f16_z(pg, num, den);
+ return tanh;
+}
+
+inline svfloat32_t svlog_f32_z(svbool_t pg, svfloat32_t x)
+{
+#if defined(__ARM_FEATURE_SVE2)
+ return svcvt_f32_s32_z(pg, svlogb_f32_z(pg, x));
+#else /* !defined(__ARM_FEATURE_SVE2) */
+ /** Logarithm polynomial coefficients */
+ const std::array<svfloat32_t, 8> log_tab =
+ {
+ {
+ svdup_n_f32(-2.29561495781f),
+ svdup_n_f32(-2.47071170807f),
+ svdup_n_f32(-5.68692588806f),
+ svdup_n_f32(-0.165253549814f),
+ svdup_n_f32(5.17591238022f),
+ svdup_n_f32(0.844007015228f),
+ svdup_n_f32(4.58445882797f),
+ svdup_n_f32(0.0141278216615f),
+ }
+ };
+
+ const auto CONST_127 = svdup_n_s32(127); // 127
+ const auto CONST_LN2 = svdup_n_f32(0.6931471805f); // ln(2)
+
+ // Extract exponent
+ auto m = svsub_s32_z(pg, svasr_n_s32_z(pg, svreinterpret_s32_f32(x), 23), CONST_127);
+ auto val = svreinterpret_f32_s32(svsub_s32_z(pg, svreinterpret_s32_f32(x), svlsl_n_s32_z(pg, m, 23)));
+
+ // Polynomial Approximation
+ auto poly = svtaylor_poly_f32_z(pg, val, log_tab);
+
+ // Reconstruct
+ poly = svmla_f32_z(pg, poly, svcvt_f32_s32_z(pg, m), CONST_LN2);
+
+ return poly;
+#endif /* defined(__ARM_FEATURE_SVE2) */
+}
+
+inline svfloat16_t svlog_f16_z(svbool_t pg, svfloat16_t x)
+{
+#if defined(__ARM_FEATURE_SVE2)
+ return svcvt_f16_s16_z(pg, svlogb_f16_z(pg, x));
+#else /* !defined(__ARM_FEATURE_SVE2) */
+
+ /** Logarithm polynomial coefficients */
+ const std::array<svfloat16_t, 8> log_tab
+ {
+ {
+ svdup_n_f16(-2.29561495781f),
+ svdup_n_f16(-2.47071170807f),
+ svdup_n_f16(-5.68692588806f),
+ svdup_n_f16(-0.165253549814f),
+ svdup_n_f16(5.17591238022f),
+ svdup_n_f16(0.844007015228f),
+ svdup_n_f16(4.58445882797f),
+ svdup_n_f16(0.0141278216615f),
+ }
+ };
+
+ const auto CONST_7 = svdup_n_s16(7); // 7
+ const auto CONST_LN2 = svdup_n_f16(0.6931471805f); // ln(2)
+
+ // Extract exponent
+ auto m = svsub_s16_z(pg, svasr_n_s16_z(pg, svreinterpret_s16_f16(x), 11), CONST_7);
+ auto val = svreinterpret_f16_s16(svsub_s16_z(pg, svreinterpret_s16_f16(x), svlsl_n_s16_z(pg, m, 11)));
+
+ // Polynomial Approximation
+ auto poly = svtaylor_poly_f16_z(pg, val, log_tab);
+
+ // Reconstruct
+ poly = svmla_f16_z(pg, poly, svcvt_f16_s16_z(pg, m), CONST_LN2);
+
+ return poly;
+#endif /* defined(__ARM_FEATURE_SVE2) */
+}
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_SVE) */
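svexp_f32_z follows the classic range-reduction scheme: write x = m*ln(2) + r with m = round(x/ln(2)), evaluate e^r with the degree-7 polynomial, then multiply by 2^m by adding m directly into the IEEE-754 exponent field; that is what the svlsl_n_s32_z(pg, m, 23) step does. A scalar sketch of that reconstruction, for reference only:

    // Scalar sketch of the reconstruction step in svexp_f32_z (no under/overflow handling).
    #include <cstdint>
    #include <cstring>

    float exp_reconstruct(float poly /* ~= e^r */, int32_t m /* round(x / ln 2) */)
    {
        int32_t bits;
        std::memcpy(&bits, &poly, sizeof(bits));
        bits += m << 23; // add m to the exponent field: poly * 2^m
        float result;
        std::memcpy(&result, &bits, sizeof(result));
        return result;
    }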
diff --git a/src/core/NEON/SVESymm.h b/src/core/NEON/SVESymm.h
new file mode 100644
index 0000000000..30e1e172a3
--- /dev/null
+++ b/src/core/NEON/SVESymm.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_SVESYMM_H
+#define ARM_COMPUTE_SVESYMM_H
+
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+#if defined(__ARM_FEATURE_SVE2)
+#include "src/core/NEON/SVEMath.h"
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+/** Dequantize an sve vector holding 16-bit quantized values.
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] scale Quantization scale
+ *
+ * @return Dequantized values in an sve vector
+ */
+inline svfloat32x2_t svdequantize_qsymm16_z(svbool_t pg, const svint16_t &qv, float scale)
+{
+ const auto vscale = svdup_n_f32(scale);
+ const svfloat32x2_t vdequantized_input =
+ {
+ { {
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(qv)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(qv)), vscale)
+ }
+ }
+ };
+ return vdequantized_input;
+}
+
+/** Quantize a pair of sve vectors holding floating point values.
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] qv Input values to be quantized.
+ * @param[in] scale Quantization scale
+ *
+ * @return An sve vector holding the quantized values
+ */
+inline svint16_t svquantize_qsymm16_z(svbool_t pg, const svfloat32x2_t qv, float scale)
+{
+ const svfloat32_t vinvscale = svdup_n_f32(1.f / scale);
+
+ const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svget2_f32(qv, 0), vinvscale));
+ const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svget2_f32(qv, 1), vinvscale));
+ const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+
+ return pa;
+}
+
+/** Dequantize a pair of sve vectors holding 16-bit quantized values.
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return Dequantized values in an sve vector
+ */
+inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint16x2_t qv, const UniformQuantizationInfo &qi)
+{
+ const float scale = qi.scale;
+ const auto vscale = svdup_n_f32(scale);
+ const svfloat32x4_t vdequantized_input =
+ {
+ { {
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 0))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 0))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 1))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 1))), vscale),
+ }
+ }
+ };
+ return vdequantized_input;
+}
+
+/** Quantize four sve vectors holding floating point values.
+ *
+ * @param[in] pg Predicate value.
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return An sve vector holding the quantized values
+ */
+inline svint16x2_t svquantize_qsymm16_z(svbool_t pg, const svfloat32x4_t qv, const UniformQuantizationInfo &qi)
+{
+ const float scale = qi.scale;
+ ARM_COMPUTE_ERROR_ON(scale == 0.f);
+ const auto vinvscale = svdup_n_f32(1.f / scale);
+ const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svget4_f32(qv, 0), vinvscale));
+ const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svget4_f32(qv, 1), vinvscale));
+ const auto rf_2 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svget4_f32(qv, 2), vinvscale));
+ const auto rf_3 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svget4_f32(qv, 3), vinvscale));
+
+ const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+ const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
+
+ return svcreate2_s16(pa, pb);
+}
+
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_SVE2) */
+#endif // ARM_COMPUTE_SVESYMM_H
\ No newline at end of file
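As with the asymmetric helpers, these pair up inside a predicated loop: widen with svdequantize_qsymm16_z, compute in F32, then narrow back with svquantize_qsymm16_z. A sketch, assuming SVE2, a b16 predicate pg from svwhilelt_b16, a nonzero scale, and hypothetical in_ptr/out_ptr/x names:

    // Sketch: QSYMM16 round trip through F32 (identity compute stage).
    const svint16_t     vin = svld1_s16(pg, in_ptr + x);
    const svfloat32x2_t vf  = arm_compute::svdequantize_qsymm16_z(pg, vin, scale);
    // ... elementwise float math on svget2_f32(vf, 0) and svget2_f32(vf, 1) ...
    svst1_s16(pg, out_ptr + x, arm_compute::svquantize_qsymm16_z(pg, vf, scale));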
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index d969fd8e38..f215787bf6 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -56,6 +56,18 @@ struct ActivationKernel
static const ActivationKernel available_kernels[] =
{
+#if defined(__ARM_FEATURE_SVE)
+ {
+ "fp16_sve_activation",
+ [](const ActivationSelectorData & data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_activation)
+ },
+ {
+ "fp32_sve_activation",
+ [](const ActivationSelectorData & data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_activation)
+ },
+#else /* !defined(__ARM_FEATURE_SVE) */
{
"fp16_neon_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::F16; },
@@ -66,6 +78,25 @@ static const ActivationKernel available_kernels[] =
[](const ActivationSelectorData & data) { return data.dt == DataType::F32; },
REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation)
},
+#endif /* defined(__ARM_FEATURE_SVE) */
+
+#if defined(__ARM_FEATURE_SVE2)
+ {
+ "qasymm8_sve_activation",
+ [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_activation)
+ },
+ {
+ "qasymm8_signed_sve_activation",
+ [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_activation)
+ },
+ {
+ "qsymm16_sve_activation",
+ [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; },
+ REGISTER_QSYMM16_SVE(arm_compute::cpu::qsymm16_sve_activation)
+ },
+#else /* !defined(__ARM_FEATURE_SVE2) */
{
"qasymm8_neon_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; },
@@ -81,6 +112,7 @@ static const ActivationKernel available_kernels[] =
[](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; },
REGISTER_QSYMM16_NEON(arm_compute::cpu::qsymm16_neon_activation)
},
+#endif /* defined(__ARM_FEATURE_SVE2) */
};
const ActivationKernel *get_implementation(const ActivationSelectorData &data)
@@ -159,7 +191,6 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*output, *input->clone());
- // NEActivationLayerKernel doesn't need padding so update_window_and_padding() can be skipped
Coordinates coord;
coord.set_num_dimensions(output->num_dimensions());
output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
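The available_kernels table above is scanned at configure time: the first entry whose selector lambda matches the tensor's data type wins, so when the SVE/SVE2 entries are compiled in they take the place of the NEON ones for the same types. A simplified sketch of that lookup; the field names here are assumptions based on the entries shown, not the library's exact definitions:

    // Simplified sketch of selector-table dispatch.
    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/Types.h" // DataType, ActivationLayerInfo
    #include "arm_compute/core/Window.h"
    #include <cstddef>

    using namespace arm_compute;

    struct ActivationSelectorData
    {
        DataType dt;
    };

    struct ActivationKernel
    {
        const char *name;
        bool (*is_selected)(const ActivationSelectorData &);
        void (*ukernel)(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &);
    };

    const ActivationKernel *get_implementation(const ActivationKernel *table, size_t n,
                                               const ActivationSelectorData &data)
    {
        for(size_t i = 0; i < n; ++i)
        {
            if(table[i].is_selected(data))
            {
                return &table[i]; // first match wins
            }
        }
        return nullptr;
    }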
diff --git a/src/core/NEON/kernels/activation/impl/fp16_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/NEON/fp16.cpp
index 58e1cfcf23..58e1cfcf23 100644
--- a/src/core/NEON/kernels/activation/impl/fp16_neon_activation.cpp
+++ b/src/core/NEON/kernels/activation/impl/NEON/fp16.cpp
diff --git a/src/core/NEON/kernels/activation/impl/fp32_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/NEON/fp32.cpp
index 610db05224..610db05224 100644
--- a/src/core/NEON/kernels/activation/impl/fp32_neon_activation.cpp
+++ b/src/core/NEON/kernels/activation/impl/NEON/fp32.cpp
diff --git a/src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/NEON/qasymm8.cpp
index 7b26441824..7b26441824 100644
--- a/src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp
+++ b/src/core/NEON/kernels/activation/impl/NEON/qasymm8.cpp
diff --git a/src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/NEON/qasymm8_signed.cpp
index c616c5e27d..c616c5e27d 100644
--- a/src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp
+++ b/src/core/NEON/kernels/activation/impl/NEON/qasymm8_signed.cpp
diff --git a/src/core/NEON/kernels/activation/impl/qsymm16_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/NEON/qsymm16.cpp
index 0bef807db9..0bef807db9 100644
--- a/src/core/NEON/kernels/activation/impl/qsymm16_neon_activation.cpp
+++ b/src/core/NEON/kernels/activation/impl/NEON/qsymm16.cpp
diff --git a/src/core/NEON/kernels/activation/impl/SVE/fp16.cpp b/src/core/NEON/kernels/activation/impl/SVE/fp16.cpp
new file mode 100644
index 0000000000..8d6f4f2351
--- /dev/null
+++ b/src/core/NEON/kernels/activation/impl/SVE/fp16.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/common/StdTypes.h"
+#include "src/core/common/Validate.h"
+
+#include <cmath>
+#include <cstddef>
+
+#if defined(__ARM_FEATURE_SVE)
+#include "src/core/NEON/SVEMath.h"
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fp16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const auto const_1 = svdup_n_f16(1.f);
+ const auto const_0 = svdup_n_f16(0.f);
+ const auto const_6 = svdup_n_f16(6.f);
+ const auto const_3 = svdup_n_f16(3.f);
+ const auto const_inv_6 = svdup_n_f16(0.166666667f);
+
+ const auto va = svdup_n_f16(act_info.a());
+ const auto vb = svdup_n_f16(act_info.b());
+ execute_window_loop(win_collapsed, [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+
+ svfloat16_t tmp;
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+ const auto vin = svld1_f16(pg, input_ptr + x);
+ switch(act)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp = svabs_f16_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp = svmla_f16_z(pg, vb, va, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ tmp = svmax_f16_z(pg, const_0, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), svmax_f16_z(pg, vin, const_0));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ tmp = svsqrt_f16_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp = svmul_f16_z(pg, vin, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ tmp = vin;
+ break;
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp = svmul_f16_z(pg, vin, svmul_f16_z(pg, const_inv_6, svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3)))));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ svst1_f16(pg, output_ptr + x, tmp);
+
+ x += svcnth();
+ pg = svwhilelt_b16(x, window_end_x);
+
+ }
+ while(svptest_any(svptrue_b16(), pg));
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif // __ARM_FEATURE_SVE
\ No newline at end of file
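Unlike the NEON kernels, the SVE bodies need no scalar tail loop: svwhilelt builds a partial predicate for the final iteration and the loop runs while any lane is still active. The core of the pattern, reduced to its skeleton (F32 shown; the F16 kernel above swaps _b32/svcntw for _b16/svcnth):

    // Sketch: SVE predicated loop with tail handling, no scalar epilogue.
    int      x  = window_start_x;
    svbool_t pg = svwhilelt_b32(x, window_end_x);  // all-true until the tail
    do
    {
        const svfloat32_t vin = svld1_f32(pg, input_ptr + x);
        svst1_f32(pg, output_ptr + x, vin);        // inactive lanes are not written
        x += svcntw();                             // advance by the hardware vector length
        pg = svwhilelt_b32(x, window_end_x);       // partial predicate on the last pass
    } while(svptest_any(svptrue_b32(), pg));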
diff --git a/src/core/NEON/kernels/activation/impl/SVE/fp32.cpp b/src/core/NEON/kernels/activation/impl/SVE/fp32.cpp
new file mode 100644
index 0000000000..2c276028a0
--- /dev/null
+++ b/src/core/NEON/kernels/activation/impl/SVE/fp32.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/NEON/SVEMath.h"
+#include "src/core/common/StdTypes.h"
+#include "src/core/common/Validate.h"
+
+#include <cmath>
+#include <cstddef>
+
+#if defined(__ARM_FEATURE_SVE)
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fp32_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const auto const_1 = svdup_n_f32(1.f);
+ const auto const_0 = svdup_n_f32(0.f);
+ const auto const_6 = svdup_n_f32(6.f);
+ const auto const_3 = svdup_n_f32(3.f);
+ const auto const_inv_6 = svdup_n_f32(0.166666667f);
+
+ const auto va = svdup_n_f32(act_info.a());
+ const auto vb = svdup_n_f32(act_info.b());
+ execute_window_loop(win_collapsed, [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+
+ svfloat32_t tmp;
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b32(x, window_end_x);
+ do
+ {
+ const auto vin = svld1_f32(pg, input_ptr + x);
+ switch(act)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp = svabs_f32_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp = svmla_f32_z(pg, vb, va, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ tmp = svmax_f32_z(pg, const_0, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), svmax_f32_z(pg, vin, const_0));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp = svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ tmp = svsqrt_f32_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp = svmul_f32_z(pg, vin, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ tmp = vin;
+ break;
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp = svmul_f32_z(pg, vin, svmul_f32_z(pg, const_inv_6, svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3)))));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ svst1_f32(pg, output_ptr + x, tmp);
+
+ x += svcntw();
+ pg = svwhilelt_b32(x, window_end_x);
+
+ }
+ while(svptest_any(svptrue_b32(), pg));
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif // __ARM_FEATURE_SVE
\ No newline at end of file
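The HARD_SWISH branch above evaluates hard_swish(x) = x * relu6(x + 3) / 6: the svmin/svmax chain is relu6 and const_inv_6 carries the division. A scalar reference for checking a single lane by hand:

    // Scalar reference for the HARD_SWISH branch (sketch).
    #include <algorithm>

    float hard_swish(float x)
    {
        const float relu6 = std::min(6.f, std::max(0.f, x + 3.f));
        return x * relu6 * 0.166666667f; // x * relu6(x + 3) / 6
    }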
diff --git a/src/core/NEON/kernels/activation/impl/SVE/qasymm8.cpp b/src/core/NEON/kernels/activation/impl/SVE/qasymm8.cpp
new file mode 100644
index 0000000000..a49a562c84
--- /dev/null
+++ b/src/core/NEON/kernels/activation/impl/SVE/qasymm8.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/common/StdTypes.h"
+#include "src/core/common/Validate.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+#if defined(__ARM_FEATURE_SVE2)
+#include "src/core/NEON/SVEAsymm.h"
+#include "src/core/NEON/SVEMath.h"
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void qasymm8_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
+ const auto va = svdup_n_u8(quantize_qasymm8(act_info.a(), qi_in));
+ const auto vb = svdup_n_u8(quantize_qasymm8(act_info.b(), qi_in));
+ const auto const_0 = quantize_qasymm8(0.f, qi_in);
+ const auto vconst_0 = svdup_n_u8(const_0);
+ const auto vconst_1 = svdup_n_f32(1.f);
+ const auto va_f32 = svdup_n_f32(act_info.a());
+ const auto vb_f32 = svdup_n_f32(act_info.b());
+ const auto const_6_f32 = svdup_n_f32(6.f);
+ const auto const_0_f32 = svdup_n_f32(0.f);
+ const auto const_3_f32 = svdup_n_f32(3.f);
+ const auto const_inv_6_f32 = svdup_n_f32(0.166666667f);
+
+ // Initialise scale/offset for re-quantization
+ bool requant = true;
+ if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
+ {
+ requant = false;
+ }
+ float s = qi_in.scale / qi_out.scale;
+ float o = -qi_in.offset * s + qi_out.offset;
+ auto vs = svdup_n_f32(s);
+ auto vo = svdup_n_f32(o);
+
+ execute_window_loop(win_collapsed, [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ svuint8_t tmp;
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ const auto vin = svld1_u8(pg, input_ptr + x);
+ if(act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ // Perform activation
+ tmp = svmax_u8_z(pg, vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
+ }
+ else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
+ }
+ else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
+ }
+ else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep =
+ {
+ { {
+ svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
+ svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
+ svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
+ svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))),
+ }
+ }
+ };
+ // Re-quantize to new output space
+ tmp = svquantize_z(pg, tmp_dep, qi_out);
+ }
+ else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep =
+ {
+ { {
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))),
+ }
+ }
+ };
+ // Re-quantize to new output space
+ tmp = svquantize_z(pg, tmp_dep, qi_out);
+ }
+ else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep =
+ {
+ { {
+ svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))),
+ }
+ }
+ };
+ // Re-quantize to new output space
+ tmp = svquantize_z(pg, tmp_dep, qi_out);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+
+ svst1_u8(pg, output_ptr + x, tmp);
+
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
+
+ }
+ while(svptest_any(svptrue_b8(), pg));
+
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_SVE2) */
\ No newline at end of file
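The vs/vo constants above fold de-quantize and re-quantize into a single multiply-add: from real = (q - offset_in) * scale_in it follows that q_out = q * s + o, with s = scale_in / scale_out and o = -offset_in * s + offset_out, which is exactly how s and o are initialised in the kernel. A scalar sketch of the same mapping; the helper name and the round-to-nearest choice are assumptions, not the library's API:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Map a QASYMM8 value between two (scale, offset) spaces without a full
    // dequantize/quantize round trip. Rounding mode assumed round-to-nearest.
    uint8_t requantize(uint8_t q, float scale_in, int offset_in, float scale_out, int offset_out)
    {
        const float s = scale_in / scale_out;
        const float o = -offset_in * s + offset_out;
        const int   q_out = static_cast<int>(std::lround(q * s + o));
        return static_cast<uint8_t>(std::min(255, std::max(0, q_out))); // saturate to u8
    }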
diff --git a/src/core/NEON/kernels/activation/impl/SVE/qasymm8_signed.cpp b/src/core/NEON/kernels/activation/impl/SVE/qasymm8_signed.cpp
new file mode 100644
index 0000000000..f34bee88fc
--- /dev/null
+++ b/src/core/NEON/kernels/activation/impl/SVE/qasymm8_signed.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/StdTypes.h"
+#include "src/core/common/Validate.h"
+
+#include <cmath>
+#include <cstddef>
+
+#if defined(__ARM_FEATURE_SVE2)
+#include "src/core/NEON/SVEAsymm.h"
+#include "src/core/NEON/SVEMath.h"
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void qasymm8_signed_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
+ const auto va = svdup_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in));
+ const auto vb = svdup_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in));
+ const auto const_0 = quantize_qasymm8_signed(0.f, qi_in);
+ const auto vconst_0 = svdup_n_s8(const_0);
+ const auto vconst_1 = svdup_n_f32(1.f);
+ const auto va_f32 = svdup_n_f32(act_info.a());
+ const auto vb_f32 = svdup_n_f32(act_info.b());
+ const auto const_6_f32 = svdup_n_f32(6.f);
+ const auto const_0_f32 = svdup_n_f32(0.f);
+ const auto const_3_f32 = svdup_n_f32(3.f);
+ const auto const_inv_6_f32 = svdup_n_f32(0.166666667f);
+
+ // Initialise scale/offset for re-quantization
+ bool requant = true;
+ if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
+ {
+ requant = false;
+ }
+ float s = qi_in.scale / qi_out.scale;
+ float o = -qi_in.offset * s + qi_out.offset;
+ auto vs = svdup_n_f32(s);
+ auto vo = svdup_n_f32(o);
+
+ execute_window_loop(win_collapsed, [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ svint8_t tmp;
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ const auto vin = svld1_s8(pg, input_ptr + x);
+ if(act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ // Perform activation
+ tmp = svmax_s8_z(pg, vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
+ }
+ else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
+ }
+ else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
+ }
+ else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep =
+ {
+ { {
+ svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
+ svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
+ svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
+ svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))),
+ }
+ }
+ };
+ // Re-quantize to new output space
+ tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+ }
+ else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep =
+ {
+ { {
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))),
+ }
+ }
+ };
+ // Re-quantize to new output space
+ tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+ }
+ else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep =
+ {
+ { {
+ svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))),
+ }
+ }
+ };
+ // Re-quantize to new output space
+ tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+
+ svst1_s8(pg, output_ptr + x, tmp);
+
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
+
+ }
+ while(svptest_any(svptrue_b8(), pg));
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_SVE2) */
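For LOGISTIC, TANH and HARD_SWISH there is no integer shortcut, so the kernel above widens each 8-bit vector into four f32 vectors (hence svfloat32x4_t), applies the activation in float, and re-quantizes. A scalar reference for the HARD_SWISH branch, with an illustrative helper name:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar reference for the quantized hard-swish path: de-quantize, apply
    // x * relu6(x + 3) / 6, then re-quantize and saturate.
    int8_t hard_swish_qasymm8_signed(int8_t q, float scale_in, int offset_in, float scale_out, int offset_out)
    {
        const float x     = (q - offset_in) * scale_in;            // de-quantize
        const float relu6 = std::min(6.f, std::max(0.f, x + 3.f)); // clamp(x + 3, 0, 6)
        const float y     = x * relu6 * 0.166666667f;              // x * relu6 / 6
        const int   q_out = static_cast<int>(std::lround(y / scale_out)) + offset_out;
        return static_cast<int8_t>(std::min(127, std::max(-128, q_out)));
    }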
diff --git a/src/core/NEON/kernels/activation/impl/SVE/qsymm16.cpp b/src/core/NEON/kernels/activation/impl/SVE/qsymm16.cpp
new file mode 100644
index 0000000000..1432e3bbdf
--- /dev/null
+++ b/src/core/NEON/kernels/activation/impl/SVE/qsymm16.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "src/core/common/StdTypes.h"
+#include "src/core/common/Validate.h"
+
+#include <cmath>
+#include <cstddef>
+
+#if defined(__ARM_FEATURE_SVE2)
+#include "src/core/NEON/SVEMath.h"
+#include "src/core/NEON/SVESymm.h"
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void qsymm16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+
+ const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
+ const auto vconst_1 = svdup_n_f32(1.f);
+ const auto va_f32 = svdup_n_f32(act_info.a());
+ const auto vb_f32 = svdup_n_f32(act_info.b());
+
+ execute_window_loop(win_collapsed, [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ svint16_t tmp;
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+ const auto vin = svld1_s16(pg, input_ptr + x);
+ if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
+ // Perform activation
+ const svfloat32x2_t tmp_dep =
+ {
+ { {
+ svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))),
+ svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1))))),
+ }
+ }
+ };
+ // Re-quantize to new output space
+ tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+ }
+ else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
+ // Perform activation
+ const svfloat32x2_t tmp_dep =
+ {
+ { {
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32))),
+ }
+ }
+ };
+ // Re-quantize to new output space
+ tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+
+ svst1_s16(pg, output_ptr + x, tmp);
+
+ x += svcnth();
+ pg = svwhilelt_b16(x, window_end_x);
+
+ }
+ while(svptest_any(svptrue_b16(), pg));
+ },
+ input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_SVE2) */
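QSYMM16 is a symmetric format: there is no zero-point offset, so (de)quantization above is a plain scale, and one 16-bit vector widens to only two f32 vectors (svfloat32x2_t) rather than four. A scalar reference for the LOGISTIC branch; the function name is illustrative:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar reference for the symmetric 16-bit logistic path: the zero point
    // is implicitly 0, so only the scales take part in (de)quantization.
    int16_t logistic_qsymm16(int16_t q, float scale_in, float scale_out)
    {
        const float x     = q * scale_in;               // de-quantize
        const float y     = 1.f / (1.f + std::exp(-x)); // logistic
        const int   q_out = static_cast<int>(std::lround(y / scale_out));
        return static_cast<int16_t>(std::min(32767, std::max(-32768, q_out)));
    }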
diff --git a/src/core/NEON/kernels/activation/impl/list.h b/src/core/NEON/kernels/activation/impl/list.h
index 3b48ee3e22..db6c5b21b8 100644
--- a/src/core/NEON/kernels/activation/impl/list.h
+++ b/src/core/NEON/kernels/activation/impl/list.h
@@ -32,10 +32,15 @@ namespace cpu
void func_name(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
DECLARE_ACTIVATION_KERNEL(qasymm8_neon_activation);
+DECLARE_ACTIVATION_KERNEL(qasymm8_sve_activation);
DECLARE_ACTIVATION_KERNEL(qasymm8_signed_neon_activation);
+DECLARE_ACTIVATION_KERNEL(qasymm8_signed_sve_activation);
DECLARE_ACTIVATION_KERNEL(qsymm16_neon_activation);
+DECLARE_ACTIVATION_KERNEL(qsymm16_sve_activation);
DECLARE_ACTIVATION_KERNEL(fp16_neon_activation);
+DECLARE_ACTIVATION_KERNEL(fp16_sve_activation);
DECLARE_ACTIVATION_KERNEL(fp32_neon_activation);
+DECLARE_ACTIVATION_KERNEL(fp32_sve_activation);
#undef DECLARE_ACTIVATION_KERNEL
} // namespace cpu
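Each declaration above stamps out the common prototype given by the DECLARE_ACTIVATION_KERNEL macro at the top of this hunk; for example:

    // DECLARE_ACTIVATION_KERNEL(qasymm8_sve_activation) expands to:
    void qasymm8_sve_activation(const ITensor *src, ITensor *dst,
                                const ActivationLayerInfo &act_info, const Window &window);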
diff --git a/src/core/NEON/kernels/floor/impl/fp16_neon_floor.cpp b/src/core/NEON/kernels/floor/impl/NEON/fp16.cpp
index 4f56ca9daf..4f56ca9daf 100644
--- a/src/core/NEON/kernels/floor/impl/fp16_neon_floor.cpp
+++ b/src/core/NEON/kernels/floor/impl/NEON/fp16.cpp
diff --git a/src/core/NEON/kernels/floor/impl/fp32_neon_floor.cpp b/src/core/NEON/kernels/floor/impl/NEON/fp32.cpp
index 3f4b14b3e5..3f4b14b3e5 100644
--- a/src/core/NEON/kernels/floor/impl/fp32_neon_floor.cpp
+++ b/src/core/NEON/kernels/floor/impl/NEON/fp32.cpp
diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h
index dcea3e8d38..649fe468a3 100644
--- a/src/core/common/Registrars.h
+++ b/src/core/common/Registrars.h
@@ -24,34 +24,63 @@
#ifndef SRC_CORE_COMMON_REGISTRARS_H
#define SRC_CORE_COMMON_REGISTRARS_H
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#if defined(ENABLE_FP16_KERNELS)
+
+#if defined(__ARM_FEATURE_SVE)
+#define REGISTER_FP16_SVE(func_name) &(func_name)
+#else /* !defined(__ARM_FEATURE_SVE) */
+#define REGISTER_FP16_SVE(func_name) nullptr
+#endif /* defined(__ARM_FEATURE_SVE) */
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#define REGISTER_FP16_NEON(func_name) &(func_name)
-#else /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+#else /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
+#define REGISTER_FP16_NEON(func_name) nullptr
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
+
+#else /* !defined(ENABLE_FP16_KERNELS) */
#define REGISTER_FP16_NEON(func_name) nullptr
+#define REGISTER_FP16_SVE(func_name) nullptr
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+#endif /* defined(ENABLE_FP16_KERNELS) */
#if defined(ENABLE_FP32_KERNELS)
+#if defined(__ARM_FEATURE_SVE)
+#define REGISTER_FP32_SVE(func_name) &(func_name)
+#else /* !defined(__ARM_FEATURE_SVE) */
+#define REGISTER_FP32_SVE(func_name) nullptr
+#endif /* defined(__ARM_FEATURE_SVE) */
#define REGISTER_FP32_NEON(func_name) &(func_name)
#else /* defined(ENABLE_FP32_KERNELS) */
#define REGISTER_FP32_NEON(func_name) nullptr
+#define REGISTER_FP32_SVE(func_name) nullptr
#endif /* defined(ENABLE_FP32_KERNELS) */
#if defined(ENABLE_QASYMM8_SIGNED_KERNELS)
+#if defined(__ARM_FEATURE_SVE)
+#define REGISTER_QASYMM8_SIGNED_SVE(func_name) &(func_name)
+#else /* !defined(__ARM_FEATURE_SVE) */
+#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr
+#endif /* defined(__ARM_FEATURE_SVE) */
#define REGISTER_QASYMM8_SIGNED_NEON(func_name) &(func_name)
#else /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */
#define REGISTER_QASYMM8_SIGNED_NEON(func_name) nullptr
+#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr
#endif /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */
#if defined(ENABLE_QASYMM8_KERNELS)
+#if defined(__ARM_FEATURE_SVE)
+#define REGISTER_QASYMM8_SVE(func_name) &(func_name)
+#else /* !defined(__ARM_FEATURE_SVE) */
+#define REGISTER_QASYMM8_SVE(func_name) nullptr
+#endif /* defined(__ARM_FEATURE_SVE) */
#define REGISTER_QASYMM8_NEON(func_name) &(func_name)
#else /* defined(ENABLE_QASYMM8_KERNELS) */
#define REGISTER_QASYMM8_NEON(func_name) nullptr
+#define REGISTER_QASYMM8_SVE(func_name) nullptr
#endif /* defined(ENABLE_QASYMM8_KERNELS) */
#if defined(ENABLE_QSYMM16_KERNELS)
+#if defined(__ARM_FEATURE_SVE)
+#define REGISTER_QSYMM16_SVE(func_name) &(func_name)
+#else /* !defined(__ARM_FEATURE_SVE) */
+#define REGISTER_QSYMM16_SVE(func_name) nullptr
+#endif /* defined(__ARM_FEATURE_SVE) */
#define REGISTER_QSYMM16_NEON(func_name) &(func_name)
#else /* defined(ENABLE_QSYMM16_KERNELS) */
#define REGISTER_QSYMM16_NEON(func_name) nullptr
+#define REGISTER_QSYMM16_SVE(func_name) nullptr
#endif /* defined(ENABLE_QSYMM16_KERNELS) */
#endif /* SRC_CORE_COMMON_REGISTRARS_H */
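These registrars let a single kernel table name every implementation while entries for backends that were compiled out collapse to nullptr. A hedged sketch of how such a table is typically consumed; the real selection table lives in NEActivationLayerKernel.cpp and is not shown in this patch, so the structure and names below are illustrative:

    #include "src/core/NEON/kernels/activation/impl/list.h"
    #include "src/core/common/Registrars.h"

    namespace
    {
    using ActivationFn = void (*)(const arm_compute::ITensor *, arm_compute::ITensor *,
                                  const arm_compute::ActivationLayerInfo &, const arm_compute::Window &);

    struct ActivationKernelEntry
    {
        const char  *name;
        ActivationFn ukernel; // nullptr when the backend was compiled out
    };

    const ActivationKernelEntry available_kernels[] =
    {
        { "fp32_sve_activation", REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_activation) },
        { "fp32_neon_activation", REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation) },
    };
    } // namespace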
diff --git a/tests/validation/NEON/ActivationLayer.cpp b/tests/validation/NEON/ActivationLayer.cpp
index 84ff288b2f..f8f35f0a8e 100644
--- a/tests/validation/NEON/ActivationLayer.cpp
+++ b/tests/validation/NEON/ActivationLayer.cpp
@@ -61,7 +61,6 @@ RelativeTolerance<float> relative_tolerance(DataType data_type, ActivationLayerI
switch(activation)
{
case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
case ActivationLayerInfo::ActivationFunction::ELU:
case ActivationLayerInfo::ActivationFunction::SQRT:
case ActivationLayerInfo::ActivationFunction::TANH:
@@ -69,10 +68,26 @@ RelativeTolerance<float> relative_tolerance(DataType data_type, ActivationLayerI
switch(data_type)
{
case DataType::F16:
+#if defined(__ARM_FEATURE_SVE)
+ return RelativeTolerance<float>(0.25f);
+#else // !defined(__ARM_FEATURE_SVE)
return RelativeTolerance<float>(0.1f);
+#endif // defined(__ARM_FEATURE_SVE)
default:
return RelativeTolerance<float>(0.05f);
}
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ switch(data_type)
+ {
+ case DataType::F16:
+#if defined(__ARM_FEATURE_SVE)
+ return RelativeTolerance<float>(0.9f);
+#else // !defined(__ARM_FEATURE_SVE)
+ return RelativeTolerance<float>(0.01f);
+#endif // defined(__ARM_FEATURE_SVE)
+ default:
+ return RelativeTolerance<float>(0.00001f);
+ }
default:
return RelativeTolerance<float>(0.f);
}
@@ -90,14 +105,29 @@ AbsoluteTolerance<float> absolute_tolerance(DataType data_type, ActivationLayerI
switch(activation)
{
case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
case ActivationLayerInfo::ActivationFunction::SQRT:
case ActivationLayerInfo::ActivationFunction::TANH:
case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
switch(data_type)
{
case DataType::F16:
+#if defined(__ARM_FEATURE_SVE)
+ return AbsoluteTolerance<float>(0.25f);
+#else // !defined(__ARM_FEATURE_SVE)
+ return AbsoluteTolerance<float>(0.01f);
+#endif // defined(__ARM_FEATURE_SVE)
+ default:
+ return AbsoluteTolerance<float>(0.00001f);
+ }
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ switch(data_type)
+ {
+ case DataType::F16:
+#if defined(__ARM_FEATURE_SVE)
+ return AbsoluteTolerance<float>(0.9f);
+#else // !defined(__ARM_FEATURE_SVE)
return AbsoluteTolerance<float>(0.01f);
+#endif // defined(__ARM_FEATURE_SVE)
default:
return AbsoluteTolerance<float>(0.00001f);
}
@@ -107,10 +137,10 @@ AbsoluteTolerance<float> absolute_tolerance(DataType data_type, ActivationLayerI
}
/** Tolerance for quantized asymmetric operations */
-#if defined(__aarch64__)
-constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
-#else // defined(__aarch64__)
+#if !defined(__aarch64__) || defined(__ARM_FEATURE_SVE2)
constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
+#else // !(!defined(__aarch64__) || defined(__ARM_FEATURE_SVE2))
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
-#endif // defined(__aarch64__)
+#endif // !defined(__aarch64__) || defined(__ARM_FEATURE_SVE2)
constexpr AbsoluteTolerance<int16_t> tolerance_qsymm16(1);
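The SVE builds need looser F16 bounds here, presumably reflecting the separate exp/log/tanh implementations in the SVEMath.h routines included above, which round differently from the NEON path. For reference, a relative-tolerance check of the kind these values feed into is conventionally evaluated as below; the framework's exact implementation is not shown in this patch, and the zero guard is an assumption:

    #include <cmath>

    // Hedged sketch: |target - reference| must stay within tol * |reference|,
    // with an absolute fallback when the reference is (near) zero.
    bool within_relative_tolerance(float target, float reference, float tol)
    {
        const float epsilon = 1e-6f; // illustrative guard value, not the framework's
        if(std::fabs(reference) < epsilon)
        {
            return std::fabs(target - reference) < epsilon;
        }
        return std::fabs((target - reference) / reference) <= tol;
    }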