diff options
Diffstat (limited to 'src')
20 files changed, 1699 insertions, 3 deletions
diff --git a/src/core/NEON/SVEAsymm.h b/src/core/NEON/SVEAsymm.h new file mode 100644 index 0000000000..4b0ecd9eea --- /dev/null +++ b/src/core/NEON/SVEAsymm.h @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_SVEASYMM_H +#define ARM_COMPUTE_SVEASYMM_H + +#if defined(__ARM_FEATURE_SVE2) +#include "src/core/NEON/SVEMath.h" +#include <arm_sve.h> + +namespace arm_compute +{ +/** Perform a multiply-accumulate on all components of a QASYMM8 vector + * + * vd*vs + vo + * + * @param[in] pg Predicate value. + * @param[in] vd Input vector value in QASYMM8 format + * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes. + * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes. + * + * @return A vector in QASYMM8 format, saturated to fit + */ +svuint8_t svmla_qasymm8_z(svbool_t pg, svuint8_t vd, svfloat32_t vs, svfloat32_t vo); + +/** Perform a multiply-accumulate on all components of a QASYMM8_SIGNED vector + * + * vd*vs + vo + * + * @param[in] pg Predicate value. + * @param[in] vd Input vector value in QASYMM8_SIGNED format + * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes. + * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes. + * + * @return A vector in QASYMM8_SIGNED format, saturated to fit + */ +svint8_t svmla_qasymm8_signed_z(svbool_t pg, svint8_t vd, svfloat32_t vs, svfloat32_t vo); + +/** Dequantize following an asymmetric quantization scheme a sve vector. + * + * @param[in] pg Predicate value. + * @param[in] qv Input values to be dequantized. + * @param[in] scale Quantization scaling factor. + * @param[in] offset Zero quantization offset. + * + * @return Dequantized values in an sve vector + */ +inline svfloat32x4_t svdequantize_z(svbool_t pg, const svuint8_t &qv, float scale, int32_t offset) +{ + const auto voffset = svdup_n_s32(offset); + const auto vscale = svdup_n_f32(scale); + const svfloat32x4_t vdequantized_input = + { + { { + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(qv))), voffset)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(qv))), voffset)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(qv))), voffset)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(qv))), voffset)), vscale), + } + } + }; + return vdequantized_input; +} + +/** Dequantize an sve vector + * + * @param[in] pg Predicate value. + * @param[in] qv Input values to be dequantized. + * @param[in] qi Quantization information to be used in the computation. + * + * @return Dequantized values in an sve vector + */ +inline svfloat32x4_t svdequantize_z(svbool_t pg, const svuint8_t &qv, const UniformQuantizationInfo &qi) +{ + return svdequantize_z(pg, qv, qi.scale, qi.offset); +} + +/** Dequantize an sve vector stored as signed asymmetric. + * + * @param[in] pg Predicate value. + * @param[in] qv Input values to be dequantized. + * @param[in] scale Quantization scaling factor. + * @param[in] offset Zero quantization offset. + * + * @return Dequantized values in a sve vector + */ +inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale, int32_t offset) +{ + const auto voffset = svdup_n_s32(offset); + const auto vscale = svdup_n_f32(scale); + const svfloat32x4_t vdequantized_input = + { + { { + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(qv)), voffset)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(qv)), voffset)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(qv)), voffset)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(qv)), voffset)), vscale), + } + } + }; + return vdequantized_input; +} + +/** Dequantize an sve vector. + * + * @param[in] pg Predicate value. + * @param[in] qv Input values to be dequantized. + * @param[in] qi Quantization information to be used in the computation. + * + * @return Dequantized values in an sve vector + */ +inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const UniformQuantizationInfo &qi) +{ + return svdequantize_z(pg, qv, qi.scale, qi.offset); +} + +/** Dequantize following symmetric quantization scheme on an sve vector. + * + * @param[in] pg Predicate value. + * @param[in] qv Input values to be dequantized. + * @param[in] vscale Vector containing quantization scaling factors. + * + * @return Dequantized values in a sve vector + */ +inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const svfloat32x4_t vscale) +{ + const svfloat32x4_t vdequantized_input = + { + { { + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), svget4_f32(vscale, 0)), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), svget4_f32(vscale, 1)), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), svget4_f32(vscale, 2)), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), svget4_f32(vscale, 3)), + } + } + }; + return vdequantized_input; +} + +/** Dequantize following a symmetric quantization scheme an sve vector. + * + * @param[in] qv Input values to be dequantized. + * @param[in] scale Quantization scaling factor. + * + * @return Dequantized values in a sve vector + */ +inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale) +{ + const auto vscale = svdup_n_f32(scale); + const svfloat32x4_t vdequantized_input = + { + { { + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), vscale), + } + } + }; + return vdequantized_input; +} + +/** Quantize an sve vector holding floating point values. + * + * @param[in] pg Predicate value. + * @param[in] qv Input values to be quantized. + * @param[in] qi Quantization information to be used in the computation. + * + * @return An sve vector holding the quantized values + */ +inline svuint8_t svquantize_z(svbool_t pg, const svfloat32x4_t qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + const int offset = qi.offset; + const auto voffset = svdup_n_f32(offset); + const auto vinvscale = svdup_n_f32(1.f / scale); + + const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 0), vinvscale)); + const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 1), vinvscale)); + const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 2), vinvscale)); + const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 3), vinvscale)); + + const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); + const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); + + return svqxtnt_u16(svqxtnb_u16(pa), pb); +} + +/** Signed quantize an sve vector holding floating point values. + * + * @param[in] pg Predicate value. + * @param[in] qv Input values to be quantized. + * @param[in] qi Quantization information to be used in the computation. + * + * @return An sve vector holding the quantized values + */ +inline svint8_t svquantize_signed_z(svbool_t pg, const svfloat32x4_t qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + const int offset = qi.offset; + const auto voffset = svdup_n_f32(offset); + const auto vinvscale = svdup_n_f32(1.f / scale); + const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 0), vinvscale)); + const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 1), vinvscale)); + const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 2), vinvscale)); + const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 3), vinvscale)); + + const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); + + return svqxtnt_s16(svqxtnb_s16(pa), pb); +} + +/** Quantize to QASYMM16 an sve vector holding 16 floating point values. + * + * @param[in] pg Predicate value. + * @param[in] qv Input values to be quantized. + * @param[in] qi Quantization information to be used in the computation. + * + * @return An sve vector holding the quantized values + */ +inline svuint16x2_t svquantize_qasymm16_z(svbool_t pg, const svfloat32x4_t qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + const int offset = qi.offset; + const auto voffset = svdup_n_f32(offset); + const auto vinvscale = svdup_n_f32(1.f / scale); + + const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 0), vinvscale)); + const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 1), vinvscale)); + const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 2), vinvscale)); + const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 3), vinvscale)); + + const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); + const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); + + return svcreate2_u16(pa, pb); +} +} // namespace arm_compute +#include "src/core/NEON/SVEAsymm.inl" +#endif /* defined(__ARM_FEATURE_SVE2) */ +#endif // ARM_COMPUTE_NEASYMM_H diff --git a/src/core/NEON/SVEAsymm.inl b/src/core/NEON/SVEAsymm.inl new file mode 100644 index 0000000000..edf5733c36 --- /dev/null +++ b/src/core/NEON/SVEAsymm.inl @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +namespace arm_compute +{ +#if defined(__ARM_FEATURE_SVE2) +inline svuint8_t svmla_qasymm8_z(svbool_t pg, svuint8_t vd, svfloat32_t vs, svfloat32_t vo) +{ + // Convert uint8 vectors to uint16 vectors + auto vd_low_u16 = svmovlb_u16(vd); + auto vd_high_u16 = svmovlt_u16(vd); + + // Convert uint16 vectors to uint32 vectors + auto A_u32 = svmovlb_u32(vd_low_u16); + auto B_u32 = svmovlt_u32(vd_low_u16); + auto C_u32 = svmovlb_u32(vd_high_u16); + auto D_u32 = svmovlt_u32(vd_high_u16); + + // Convert uint32 vectors to float32 vectors + auto A_f32 = svcvt_f32_u32_z(pg, A_u32); + auto B_f32 = svcvt_f32_u32_z(pg, B_u32); + auto C_f32 = svcvt_f32_u32_z(pg, C_u32); + auto D_f32 = svcvt_f32_u32_z(pg, D_u32); + + // vd = vd*vs + vo + A_f32 = svmla_f32_z(pg, vo, A_f32, vs); + B_f32 = svmla_f32_z(pg, vo, B_f32, vs); + C_f32 = svmla_f32_z(pg, vo, C_f32, vs); + D_f32 = svmla_f32_z(pg, vo, D_f32, vs); + + // Convert float32 vectors to uint32 vectors + A_u32 = svcvt_u32_f32_z(pg, A_f32); + B_u32 = svcvt_u32_f32_z(pg, B_f32); + C_u32 = svcvt_u32_f32_z(pg, C_f32); + D_u32 = svcvt_u32_f32_z(pg, D_f32); + + // Convert uint32 vectors to uint16 vectors (with saturation) + vd_low_u16 = svqxtnt_u32(svqxtnb_u32(A_u32), B_u32); + vd_high_u16 = svqxtnt_u32(svqxtnb_u32(C_u32), D_u32); + + // convert uint16 vectors to uint8 vectors (with saturation) + const auto res = svqxtnt_u16(svqxtnb_u16(vd_low_u16), vd_high_u16); + return res; +} + +inline svint8_t svmla_qasymm8_signed_z(svbool_t pg, svint8_t vd, svfloat32_t vs, svfloat32_t vo) +{ + // Convert uint8 vectors to int16 vectors + auto vd_low_s16 = svmovlb_s16(vd); + auto vd_high_s16 = svmovlt_s16(vd); + + // Convert int16 vectors to int32 vectors + auto A_s32 = svmovlb_s32(vd_low_s16); + auto B_s32 = svmovlt_s32(vd_low_s16); + auto C_s32 = svmovlb_s32(vd_high_s16); + auto D_s32 = svmovlt_s32(vd_high_s16); + + // Convert int32 vectors to float32 vectors + auto A_f32 = svcvt_f32_s32_z(pg, A_s32); + auto B_f32 = svcvt_f32_s32_z(pg, B_s32); + auto C_f32 = svcvt_f32_s32_z(pg, C_s32); + auto D_f32 = svcvt_f32_s32_z(pg, D_s32); + + // vd = vd*vs + vo + A_f32 = svmla_f32_z(pg, vo, A_f32, vs); + B_f32 = svmla_f32_z(pg, vo, B_f32, vs); + C_f32 = svmla_f32_z(pg, vo, C_f32, vs); + D_f32 = svmla_f32_z(pg, vo, D_f32, vs); + + // Convert float32 vectors to int32 vectors + A_s32 = svcvt_s32_f32_z(pg, A_f32); + B_s32 = svcvt_s32_f32_z(pg, B_f32); + C_s32 = svcvt_s32_f32_z(pg, C_f32); + D_s32 = svcvt_s32_f32_z(pg, D_f32); + + // Convert uint32 vectors to uint16 vectors (with saturation) + vd_low_s16 = svqxtnt_s32(svqxtnb_s32(A_s32), B_s32); + vd_high_s16 = svqxtnt_s32(svqxtnb_s32(C_s32), D_s32); + + // convert uint16 vectors to uint8 vectors (with saturation) + const auto res = svqxtnt_s16(svqxtnb_s16(vd_low_s16), vd_high_s16); + return res; +} +#endif /* (__ARM_FEATURE_SVE2) */ +} // namespace arm_compute diff --git a/src/core/NEON/SVEMath.h b/src/core/NEON/SVEMath.h new file mode 100644 index 0000000000..bdf2e894e2 --- /dev/null +++ b/src/core/NEON/SVEMath.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_SVEMATH_H +#define ARM_COMPUTE_SVEMATH_H + +#if defined(__ARM_FEATURE_SVE) +#include <arm_sve.h> +#include <array> + +namespace arm_compute +{ +/** Calculate exponent. + * + * @param[in] pg Input reciprocal. + * @param[in] val Input vector value in F32 format. + * + * @return The calculated exponent. + */ +svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t val); + +/** Calculate reciprocal. + * + * @param[in] pg Input reciprocal. + * @param[in] x Input value. + * + * @return The calculated reciprocal. + */ +svfloat32_t svinv_f32_z(svbool_t pg, svfloat32_t x); + +/** Calculate logarithm + * + * @param[in] pg Input reciprocal. + * @param[in] x Input vector value in F32 format. + * + * @return The calculated logarithm. + */ +svfloat32_t svlog_f32_z(svbool_t pg, svfloat32_t x); + +/** Calculate hyperbolic tangent. + * + * tanh(x) = (e^2x - 1)/(e^2x + 1) + * + * @note We clamp x to [-5,5] to avoid overflowing issues. + * + * @param[in] pg Input reciprocal. + * @param[in] val Input vector value in F32 format. + * + * @return The calculated Hyperbolic Tangent. + */ +svfloat32_t svtanh_f32_z(svbool_t pg, svfloat32_t val); + +/** Calculate hyperbolic tangent. + * + * tanh(x) = (e^2x - 1)/(e^2x + 1) + * + * @note We clamp x to [-5,5] to avoid overflowing issues. + * + * @param[in] pg Input reciprocal. + * @param[in] val Input vector value in F16 format. + * + * @return The calculated Hyperbolic Tangent. + */ +svfloat16_t svtanh_f16_z(svbool_t pg, svfloat16_t val); + +/** Calculate exponential + * + * @param[in] pg Input reciprocal. + * @param[in] x Input vector value in F16 format. + * + * @return The calculated exponent. + */ +svfloat16_t svexp_f16_z(svbool_t pg, svfloat16_t x); + +/** Calculate reciprocal. + * + * @param[in] pg Input reciprocal. + * @param[in] x Input value. + * + * @return The calculated reciprocal. + */ +svfloat16_t svinv_f16_z(svbool_t pg, svfloat16_t x); + +/** Calculate logarithm + * + * @param[in] pg Input reciprocal. + * @param[in] x Input vector value in F32 format. + * + * @return The calculated logarithm. + */ +svfloat16_t svlog_f16_z(svbool_t pg, svfloat16_t x); + +} // namespace arm_compute +#include "src/core/NEON/SVEMath.inl" +#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* ARM_COMPUTE_SVEMATH_H */
\ No newline at end of file diff --git a/src/core/NEON/SVEMath.inl b/src/core/NEON/SVEMath.inl new file mode 100644 index 0000000000..5ebfeaa5c5 --- /dev/null +++ b/src/core/NEON/SVEMath.inl @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <cmath> +#include <limits> + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_compute +{ +inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, svfloat32_t x, const std::array<svfloat32_t, 8> &coeffs) +{ + const auto A = svmla_f32_z(pg, coeffs[0], coeffs[4], x); + const auto B = svmla_f32_z(pg, coeffs[2], coeffs[6], x); + const auto C = svmla_f32_z(pg, coeffs[1], coeffs[5], x); + const auto D = svmla_f32_z(pg, coeffs[3], coeffs[7], x); + const auto x2 = svmul_f32_z(pg, x, x); + const auto x4 = svmul_f32_z(pg, x2, x2); + const auto res = svmla_f32_z(pg, svmla_f32_z(pg, A, B, x2), svmla_f32_z(pg, C, D, x2), x4); + return res; +} + +inline svfloat16_t svtaylor_poly_f16_z(svbool_t pg, svfloat16_t x, const std::array<svfloat16_t, 8> &coeffs) +{ + const auto A = svmla_f16_z(pg, coeffs[0], coeffs[4], x); + const auto B = svmla_f16_z(pg, coeffs[2], coeffs[6], x); + const auto C = svmla_f16_z(pg, coeffs[1], coeffs[5], x); + const auto D = svmla_f16_z(pg, coeffs[3], coeffs[7], x); + const auto x2 = svmul_f16_z(pg, x, x); + const auto x4 = svmul_f16_z(pg, x2, x2); + const auto res = svmla_f16_z(pg, svmla_f16_z(pg, A, B, x2), svmla_f16_z(pg, C, D, x2), x4); + return res; +} + +inline svfloat16_t svinv_f16_z(svbool_t pg, svfloat16_t x) +{ + auto recip = svrecpe_f16(x); + recip = svmul_f16_z(pg, svrecps_f16(x, recip), recip); + recip = svmul_f16_z(pg, svrecps_f16(x, recip), recip); + return recip; +} + +inline svfloat32_t svinv_f32_z(svbool_t pg, svfloat32_t x) +{ + auto recip = svrecpe_f32(x); + recip = svmul_f32_z(pg, svrecps_f32(x, recip), recip); + recip = svmul_f32_z(pg, svrecps_f32(x, recip), recip); + return recip; +} + +inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x) +{ + const auto CONST_LN2 = svdup_n_f32(0.6931471805f); // ln(2) + const auto CONST_INV_LN2 = svdup_n_f32(1.4426950408f); // 1/ln(2) + const auto CONST_INF = svdup_n_f32(std::numeric_limits<float>::infinity()); + const auto CONST_MAX_INPUT = svdup_n_f32(88.7f); + const auto CONST_0 = svdup_n_f32(0.f); + const auto CONST_NEGATIVE_126 = svdup_n_s32(-126); + + /** Exponent polynomial coefficients */ + const std::array<svfloat32_t, 8> exp_tab = + { + { + svdup_n_f32(1.f), + svdup_n_f32(0.0416598916054f), + svdup_n_f32(0.500000596046f), + svdup_n_f32(0.0014122662833f), + svdup_n_f32(1.00000011921f), + svdup_n_f32(0.00833693705499f), + svdup_n_f32(0.166665703058f), + svdup_n_f32(0.000195780929062f), + } + }; + + // Perform range reduction [-log(2),log(2)] + auto m = svcvt_s32_f32_z(pg, svmul_f32_z(pg, x, CONST_INV_LN2)); + auto val = svmls_f32_z(pg, x, svcvt_f32_s32_z(pg, m), CONST_LN2); + + // Polynomial Approximation + auto poly = svtaylor_poly_f32_z(pg, val, exp_tab); + + // Reconstruct + poly = svreinterpret_f32_s32(svqadd_s32(svreinterpret_s32_f32(poly), svlsl_n_s32_z(pg, m, 23))); + + // Handle underflow + svbool_t ltpg = svcmplt_s32(pg, m, CONST_NEGATIVE_126); + poly = svsel_f32(ltpg, CONST_0, poly); + + // Handle overflow + svbool_t gtpg = svcmpgt_f32(pg, x, CONST_MAX_INPUT); + poly = svsel_f32(gtpg, CONST_INF, poly); + + return poly; +} + +inline svfloat16_t svexp_f16_z(svbool_t pg, svfloat16_t x) +{ + const auto CONST_LN2 = svdup_n_f16(0.6931471805f); // ln(2) + const auto CONST_INV_LN2 = svdup_n_f16(1.4426950408f); // 1/ln(2) + const auto CONST_INF = svdup_n_f16(std::numeric_limits<float16_t>::infinity()); + const auto CONST_MAX_INPUT = svdup_n_f16(88.7f); + const auto CONST_0 = svdup_n_f16(0.f); + const auto CONST_NEGATIVE_126 = svdup_n_s16(-126); + + /** Exponent polynomial coefficients */ + const std::array<svfloat16_t, 8> exp_tab = + { + { + svdup_n_f16(1.f), + svdup_n_f16(0.0416598916054f), + svdup_n_f16(0.500000596046f), + svdup_n_f16(0.0014122662833f), + svdup_n_f16(1.00000011921f), + svdup_n_f16(0.00833693705499f), + svdup_n_f16(0.166665703058f), + svdup_n_f16(0.000195780929062f), + } + }; + + // Perform range reduction [-log(2),log(2)] + auto m = svcvt_s16_f16_z(pg, svmul_f16_z(pg, x, CONST_INV_LN2)); + auto val = svmls_f16_z(pg, x, svcvt_f16_s16_z(pg, m), CONST_LN2); + + // Polynomial Approximation + auto poly = svtaylor_poly_f16_z(pg, val, exp_tab); + + // Reconstruct + poly = svreinterpret_f16_s16(svqadd_s16(svreinterpret_s16_f16(poly), svlsl_n_s16_z(pg, m, 11))); + + // Handle underflow + svbool_t ltpg = svcmplt_s16(pg, m, CONST_NEGATIVE_126); + poly = svsel_f16(ltpg, CONST_0, poly); + + // Handle overflow + svbool_t gtpg = svcmpgt_f16(pg, x, CONST_MAX_INPUT); + poly = svsel_f16(gtpg, CONST_INF, poly); + + return poly; +} + +inline svfloat32_t svtanh_f32_z(svbool_t pg, svfloat32_t val) +{ + const svfloat32_t CONST_1 = svdup_n_f32(1.f); + const svfloat32_t CONST_2 = svdup_n_f32(2.f); + const svfloat32_t CONST_MIN_TANH = svdup_n_f32(-10.f); + const svfloat32_t CONST_MAX_TANH = svdup_n_f32(10.f); + + svfloat32_t x = svmin_f32_z(pg, svmax_f32_z(pg, val, CONST_MIN_TANH), CONST_MAX_TANH); + svfloat32_t exp2x = svexp_f32_z(pg, svmul_f32_z(pg, CONST_2, x)); + svfloat32_t num = svsub_f32_z(pg, exp2x, CONST_1); + svfloat32_t den = svadd_f32_z(pg, exp2x, CONST_1); + svfloat32_t tanh = svdiv_f32_z(pg, num, den); + return tanh; +} + +inline svfloat16_t svtanh_f16_z(svbool_t pg, svfloat16_t val) +{ + const svfloat16_t CONST_1 = svdup_n_f16(1.f); + const svfloat16_t CONST_2 = svdup_n_f16(2.f); + const svfloat16_t CONST_MIN_TANH = svdup_n_f16(-10.f); + const svfloat16_t CONST_MAX_TANH = svdup_n_f16(10.f); + + const svfloat16_t x = svmin_f16_z(pg, svmax_f16_z(pg, val, CONST_MIN_TANH), CONST_MAX_TANH); + const svfloat16_t exp2x = svexp_f16_z(pg, svmul_f16_z(pg, CONST_2, x)); + const svfloat16_t num = svsub_f16_z(pg, exp2x, CONST_1); + const svfloat16_t den = svadd_f16_z(pg, exp2x, CONST_1); + const svfloat16_t tanh = svdiv_f16_z(pg, num, den); + return tanh; +} + +inline svfloat32_t svlog_f32_z(svbool_t pg, svfloat32_t x) +{ +#if defined(__ARM_FEATURE_SVE2) + return svcvt_f32_s32_z(pg, svlogb_f32_z(pg, x)); +#else /* !defined(__ARM_FEATURE_SVE2) */ + /** Logarithm polynomial coefficients */ + const std::array<svfloat32_t, 8> log_tab = + { + { + svdup_n_f32(-2.29561495781f), + svdup_n_f32(-2.47071170807f), + svdup_n_f32(-5.68692588806f), + svdup_n_f32(-0.165253549814f), + svdup_n_f32(5.17591238022f), + svdup_n_f32(0.844007015228f), + svdup_n_f32(4.58445882797f), + svdup_n_f32(0.0141278216615f), + } + }; + + const auto CONST_127 = svdup_n_s32(127); // 127 + const auto CONST_LN2 = svdup_n_f32(0.6931471805f); // ln(2) + + // Extract exponent + auto m = svsub_s32_z(pg, svasr_n_s32_z(pg, svreinterpret_s32_f32(x), 23), CONST_127); + auto val = svreinterpret_f32_s32(svsub_s32_z(pg, svreinterpret_s32_f32(x), svlsl_n_s32_z(pg, m, 23))); + + // Polynomial Approximation + auto poly = svtaylor_poly_f32_z(pg, val, log_tab); + + // Reconstruct + poly = svmla_f32_z(pg, poly, svcvt_f32_s32_z(pg, m), CONST_LN2); + + return poly; +#endif /* defined(__ARM_FEATURE_SVE2) */ +} + +inline svfloat16_t svlog_f16_z(svbool_t pg, svfloat16_t x) +{ +#if defined(__ARM_FEATURE_SVE2) + return svcvt_f16_s16_z(pg, svlogb_f16_z(pg, x)); +#else /* !defined(__ARM_FEATURE_SVE2) */ + + /** Logarithm polynomial coefficients */ + const std::array<svfloat16_t, 8> log_tab + { + { + svdup_n_f16(-2.29561495781f), + svdup_n_f16(-2.47071170807f), + svdup_n_f16(-5.68692588806f), + svdup_n_f16(-0.165253549814f), + svdup_n_f16(5.17591238022f), + svdup_n_f16(0.844007015228f), + svdup_n_f16(4.58445882797f), + svdup_n_f16(0.0141278216615f), + } + }; + + const auto CONST_7 = svdup_n_s16(7); // 7 + const auto CONST_LN2 = svdup_n_f16(0.6931471805f); // ln(2) + + // Extract exponent + auto m = svsub_s16_z(pg, svasr_n_s16_z(pg, svreinterpret_s16_f16(x), 11), CONST_7); + auto val = svreinterpret_f16_s16(svsub_s16_z(pg, svreinterpret_s16_f16(x), svlsl_n_s16_z(pg, m, 11))); + + // Polynomial Approximation + auto poly = svtaylor_poly_f16_z(pg, val, log_tab); + + // Reconstruct + poly = svmla_f16_z(pg, poly, svcvt_f16_s16_z(pg, m), CONST_LN2); + + return poly; +#endif /* defined(__ARM_FEATURE_SVE2) */ +} +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_SVE) */ diff --git a/src/core/NEON/SVESymm.h b/src/core/NEON/SVESymm.h new file mode 100644 index 0000000000..30e1e172a3 --- /dev/null +++ b/src/core/NEON/SVESymm.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_SVESYMM_H +#define ARM_COMPUTE_SVESYMM_H + +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + +#if defined(__ARM_FEATURE_SVE2) +#include "src/core/NEON/SVEMath.h" +#include <arm_sve.h> + +namespace arm_compute +{ +/** Dequantize an sve vector holding 16-bit quantized values. + * + * @param[in] pg Predicate value. + * @param[in] qv Input values to be dequantized. + * @param[in] scale Quantization scale + * + * @return Dequantized values in an sve vector + */ +inline svfloat32x2_t svdequantize_qsymm16_z(svbool_t pg, const svint16_t &qv, float scale) +{ + const auto vscale = svdup_n_f32(scale); + const svfloat32x2_t vdequantized_input = + { + { { + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(qv)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(qv)), vscale) + } + } + }; + return vdequantized_input; +} + +/** Quantize an sve vector holding 8 floating point values. + * + * @param[in] pg Predicate value. + * @param[in] qv Input values to be quantized. + * @param[in] scale Quantization scale + * + * @return An sve vector holding the quantized values + */ +inline svint16_t svquantize_qsymm16_z(svbool_t pg, const svfloat32x2_t qv, float scale) +{ + const svfloat32_t vinvscale = svdup_n_f32(1.f / scale); + + const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svget2_f32(qv, 0), vinvscale)); + const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svget2_f32(qv, 1), vinvscale)); + const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + + return pa; +} + +/** Dequantize an sve vector holding 16 16-bit quantized values. + * + * @param[in] pg Predicate value. + * @param[in] qv Input values to be dequantized. + * @param[in] qi Quantization information to be used in the computation. + * + * @return Dequantized values in an sve vector + */ +inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint16x2_t qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + const auto vscale = svdup_n_f32(scale); + const svfloat32x4_t vdequantized_input = + { + { { + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 0))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 0))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 1))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 1))), vscale), + } + } + }; + return vdequantized_input; +} + +/** Quantize an sve vector holding 16 floating point values. + * + * @param[in] pg Predicate value. + * @param[in] qv Input values to be quantized. + * @param[in] qi Quantization information to be used in the computation. + * + * @return An sve vector holding the quantized values + */ +inline svint16x2_t svquantize_qsymm16_z(svbool_t pg, const svfloat32x4_t qv, const UniformQuantizationInfo &qi) +{ + const float scale = qi.scale; + ARM_COMPUTE_ERROR_ON(scale == 0.f); + const auto vinvscale = svdup_n_f32(1.f / scale); + const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svget4_f32(qv, 0), vinvscale)); + const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svget4_f32(qv, 1), vinvscale)); + const auto rf_2 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svget4_f32(qv, 2), vinvscale)); + const auto rf_3 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svget4_f32(qv, 3), vinvscale)); + + const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); + + return svcreate2_s16(pa, pb); +} + +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_SVE2) */ +#endif // ARM_COMPUTE_NESYMM_H
\ No newline at end of file diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp index d969fd8e38..f215787bf6 100644 --- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp @@ -56,6 +56,18 @@ struct ActivationKernel static const ActivationKernel available_kernels[] = { +#if defined(__ARM_FEATURE_SVE) + { + "fp16_sve_activation", + [](const ActivationSelectorData & data) { return data.dt == DataType::F16; }, + REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_activation) + }, + { + "fp32_sve_activation", + [](const ActivationSelectorData & data) { return data.dt == DataType::F32; }, + REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_activation) + }, +#else /* !defined(__ARM_FEATURE_SVE) */ { "fp16_neon_activation", [](const ActivationSelectorData & data) { return data.dt == DataType::F16; }, @@ -66,6 +78,25 @@ static const ActivationKernel available_kernels[] = [](const ActivationSelectorData & data) { return data.dt == DataType::F32; }, REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation) }, +#endif /* defined(__ARM_FEATURE_SVE) */ + +#if defined(__ARM_FEATURE_SVE2) /* defined(__ARM_FEATURE_SVE2) */ + { + "qasymm8_sve_activation", + [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_activation) + }, + { + "qasymm8_signed_sve_activation", + [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_activation) + }, + { + "qsymm16_sve_activation", + [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; }, + REGISTER_QSYMM16_SVE(arm_compute::cpu::qsymm16_sve_activation) + }, +#else /* !defined(__ARM_FEATURE_SVE2) */ { "qasymm8_neon_activation", [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; }, @@ -81,6 +112,7 @@ static const ActivationKernel available_kernels[] = [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; }, REGISTER_QSYMM16_NEON(arm_compute::cpu::qsymm16_neon_activation) }, +#endif /* defined(__ARM_FEATURE_SVE2) */ }; const ActivationKernel *get_implementation(const ActivationSelectorData &data) @@ -159,7 +191,6 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input // Output auto inizialitation if not yet initialized auto_init_if_empty(*output, *input->clone()); - // NEActivationLayerKernel doesn't need padding so update_window_and_padding() can be skipped Coordinates coord; coord.set_num_dimensions(output->num_dimensions()); output->set_valid_region(ValidRegion(coord, output->tensor_shape())); diff --git a/src/core/NEON/kernels/activation/impl/fp16_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/NEON/fp16.cpp index 58e1cfcf23..58e1cfcf23 100644 --- a/src/core/NEON/kernels/activation/impl/fp16_neon_activation.cpp +++ b/src/core/NEON/kernels/activation/impl/NEON/fp16.cpp diff --git a/src/core/NEON/kernels/activation/impl/fp32_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/NEON/fp32.cpp index 610db05224..610db05224 100644 --- a/src/core/NEON/kernels/activation/impl/fp32_neon_activation.cpp +++ b/src/core/NEON/kernels/activation/impl/NEON/fp32.cpp diff --git a/src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/NEON/qasymm8.cpp index 7b26441824..7b26441824 100644 --- a/src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp +++ b/src/core/NEON/kernels/activation/impl/NEON/qasymm8.cpp diff --git a/src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/NEON/qasymm8_signed.cpp index c616c5e27d..c616c5e27d 100644 --- a/src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp +++ b/src/core/NEON/kernels/activation/impl/NEON/qasymm8_signed.cpp diff --git a/src/core/NEON/kernels/activation/impl/qsymm16_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/NEON/qsymm16.cpp index 0bef807db9..0bef807db9 100644 --- a/src/core/NEON/kernels/activation/impl/qsymm16_neon_activation.cpp +++ b/src/core/NEON/kernels/activation/impl/NEON/qsymm16.cpp diff --git a/src/core/NEON/kernels/activation/impl/SVE/fp16.cpp b/src/core/NEON/kernels/activation/impl/SVE/fp16.cpp new file mode 100644 index 0000000000..8d6f4f2351 --- /dev/null +++ b/src/core/NEON/kernels/activation/impl/SVE/fp16.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "src/core/common/StdTypes.h" +#include "src/core/common/Validate.h" + +#include <cmath> +#include <cstddef> + +#if defined(__ARM_FEATURE_SVE) +#include "src/core/NEON/SVEMath.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +void fp16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const auto const_1 = svdup_n_f16(1.f); + const auto const_0 = svdup_n_f16(0.f); + const auto const_6 = svdup_n_f16(6.f); + const auto const_3 = svdup_n_f16(3.f); + const auto const_inv_6 = svdup_n_f16(0.166666667f); + + const auto va = svdup_n_f16(act_info.a()); + const auto vb = svdup_n_f16(act_info.b()); + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); + + svfloat16_t tmp; + + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + const auto vin = svld1_f16(pg, input_ptr + x); + switch(act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = svabs_f16_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = svmla_f16_z(pg, vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = svmax_f16_z(pg, const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), svmax_f16_z(pg, vin, const_0)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = svsqrt_f16_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = svmul_f16_z(pg, vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = svmul_f16_z(pg, vin, svmul_f16_z(pg, const_inv_6, svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3))))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + svst1_f16(pg, output_ptr + x, tmp); + + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); + + } + while(svptest_any(svptrue_b16(), pg)); + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute +#endif // __ARM_FEATURE_SVE
\ No newline at end of file diff --git a/src/core/NEON/kernels/activation/impl/SVE/fp32.cpp b/src/core/NEON/kernels/activation/impl/SVE/fp32.cpp new file mode 100644 index 0000000000..2c276028a0 --- /dev/null +++ b/src/core/NEON/kernels/activation/impl/SVE/fp32.cpp @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/SVEMath.h" +#include "src/core/common/StdTypes.h" +#include "src/core/common/Validate.h" + +#include <cmath> +#include <cstddef> + +#if defined(__ARM_FEATURE_SVE) +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +void fp32_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const auto const_1 = svdup_n_f32(1.f); + const auto const_0 = svdup_n_f32(0.f); + const auto const_6 = svdup_n_f32(6.f); + const auto const_3 = svdup_n_f32(3.f); + const auto const_inv_6 = svdup_n_f32(0.166666667f); + + const auto va = svdup_n_f32(act_info.a()); + const auto vb = svdup_n_f32(act_info.b()); + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const float *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(output.ptr()); + + svfloat32_t tmp; + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b32(x, window_end_x); + do + { + const auto vin = svld1_f32(pg, input_ptr + x); + switch(act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = svabs_f32_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = svmla_f32_z(pg, vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = svmax_f32_z(pg, const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), svmax_f32_z(pg, vin, const_0)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = svsqrt_f32_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = svmul_f32_z(pg, vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = svmul_f32_z(pg, vin, svmul_f32_z(pg, const_inv_6, svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3))))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + svst1_f32(pg, output_ptr + x, tmp); + + x += svcntw(); + pg = svwhilelt_b32(x, window_end_x); + + } + while(svptest_any(svptrue_b32(), pg)); + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute +#endif // __ARM_FEATURE_SVE
\ No newline at end of file diff --git a/src/core/NEON/kernels/activation/impl/SVE/qasymm8.cpp b/src/core/NEON/kernels/activation/impl/SVE/qasymm8.cpp new file mode 100644 index 0000000000..a49a562c84 --- /dev/null +++ b/src/core/NEON/kernels/activation/impl/SVE/qasymm8.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" +#include "src/core/common/StdTypes.h" +#include "src/core/common/Validate.h" + +#include <arm_neon.h> +#include <cmath> +#include <cstddef> + +#if defined(__ARM_FEATURE_SVE2) +#include "src/core/NEON/SVEAsymm.h" +#include "src/core/NEON/SVEMath.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +void qasymm8_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const auto va = svdup_n_u8(quantize_qasymm8(act_info.a(), qi_in)); + const auto vb = svdup_n_u8(quantize_qasymm8(act_info.b(), qi_in)); + const auto const_0 = quantize_qasymm8(0.f, qi_in); + const auto vconst_0 = svdup_n_u8(const_0); + const auto vconst_1 = svdup_n_f32(1.f); + const auto va_f32 = svdup_n_f32(act_info.a()); + const auto vb_f32 = svdup_n_f32(act_info.b()); + const auto const_6_f32 = svdup_n_f32(6.f); + const auto const_0_f32 = svdup_n_f32(0.f); + const auto const_3_f32 = svdup_n_f32(3.f); + const auto const_inv_6_f32 = svdup_n_f32(0.166666667f); + + // Initialise scale/offset for re-quantization + bool requant = true; + if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) + { + requant = false; + } + float s = qi_in.scale / qi_out.scale; + float o = -qi_in.offset * s + qi_out.offset; + auto vs = svdup_n_f32(s); + auto vo = svdup_n_f32(o); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + svuint8_t tmp; + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + const auto vin = svld1_u8(pg, input_ptr + x); + if(act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = svmax_u8_z(pg, vconst_0, vin); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; + } + else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin)); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; + } + else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin)); + // Re-quantize to new output space + tmp = svmla_qasymm8_z(pg, tmp, vs, vo); + } + else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = + { + { { + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_z(pg, tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = + { + { { + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_z(pg, tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = + { + { { + svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_z(pg, tmp_dep, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + + svst1_u8(pg, output_ptr + x, tmp); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + + } + while(svptest_any(svptrue_b8(), pg)); + + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_SVE2) */
\ No newline at end of file diff --git a/src/core/NEON/kernels/activation/impl/SVE/qasymm8_signed.cpp b/src/core/NEON/kernels/activation/impl/SVE/qasymm8_signed.cpp new file mode 100644 index 0000000000..f34bee88fc --- /dev/null +++ b/src/core/NEON/kernels/activation/impl/SVE/qasymm8_signed.cpp @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/common/StdTypes.h" +#include "src/core/common/Validate.h" + +#include <cmath> +#include <cstddef> + +#if defined(__ARM_FEATURE_SVE2) +#include "src/core/NEON/SVEAsymm.h" +#include "src/core/NEON/SVEMath.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +void qasymm8_signed_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const auto va = svdup_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in)); + const auto vb = svdup_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in)); + const auto const_0 = quantize_qasymm8_signed(0.f, qi_in); + const auto vconst_0 = svdup_n_s8(const_0); + const auto vconst_1 = svdup_n_f32(1.f); + const auto va_f32 = svdup_n_f32(act_info.a()); + const auto vb_f32 = svdup_n_f32(act_info.b()); + const auto const_6_f32 = svdup_n_f32(6.f); + const auto const_0_f32 = svdup_n_f32(0.f); + const auto const_3_f32 = svdup_n_f32(3.f); + const auto const_inv_6_f32 = svdup_n_f32(0.166666667f); + + // Initialise scale/offset for re-quantization + bool requant = true; + if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) + { + requant = false; + } + float s = qi_in.scale / qi_out.scale; + float o = -qi_in.offset * s + qi_out.offset; + auto vs = svdup_n_f32(s); + auto vo = svdup_n_f32(o); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + svint8_t tmp; + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + const auto vin = svld1_s8(pg, input_ptr + x); + if(act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = svmax_s8_z(pg, vconst_0, vin); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; + } + else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin)); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; + } + else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin)); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; + } + else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = + { + { { + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = + { + { { + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = + { + { { + svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + + svst1_s8(pg, output_ptr + x, tmp); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + + } + while(svptest_any(svptrue_b8(), pg)); + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_SVE2) */ diff --git a/src/core/NEON/kernels/activation/impl/SVE/qsymm16.cpp b/src/core/NEON/kernels/activation/impl/SVE/qsymm16.cpp new file mode 100644 index 0000000000..1432e3bbdf --- /dev/null +++ b/src/core/NEON/kernels/activation/impl/SVE/qsymm16.cpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/experimental/Types.h" +#include "src/core/common/StdTypes.h" +#include "src/core/common/Validate.h" + +#include <cmath> +#include <cstddef> + +#if defined(__ARM_FEATURE_SVE2) +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/SVESymm.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +void qsymm16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const auto vconst_1 = svdup_n_f32(1.f); + const auto va_f32 = svdup_n_f32(act_info.a()); + const auto vb_f32 = svdup_n_f32(act_info.b()); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + svint16_t tmp; + + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + const auto vin = svld1_s16(pg, input_ptr + x); + if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); + // Perform activation + const svfloat32x2_t tmp_dep = + { + { { + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1))))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); + // Perform activation + const svfloat32x2_t tmp_dep = + { + { { + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + + svst1_s16(pg, output_ptr + x, tmp); + + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); + + } + while(svptest_any(svptrue_b16(), pg)); + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_SVE2) */ diff --git a/src/core/NEON/kernels/activation/impl/list.h b/src/core/NEON/kernels/activation/impl/list.h index 3b48ee3e22..db6c5b21b8 100644 --- a/src/core/NEON/kernels/activation/impl/list.h +++ b/src/core/NEON/kernels/activation/impl/list.h @@ -32,10 +32,15 @@ namespace cpu void func_name(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) DECLARE_ACTIVATION_KERNEL(qasymm8_neon_activation); +DECLARE_ACTIVATION_KERNEL(qasymm8_sve_activation); DECLARE_ACTIVATION_KERNEL(qasymm8_signed_neon_activation); +DECLARE_ACTIVATION_KERNEL(qasymm8_signed_sve_activation); DECLARE_ACTIVATION_KERNEL(qsymm16_neon_activation); +DECLARE_ACTIVATION_KERNEL(qsymm16_sve_activation); DECLARE_ACTIVATION_KERNEL(fp16_neon_activation); +DECLARE_ACTIVATION_KERNEL(fp16_sve_activation); DECLARE_ACTIVATION_KERNEL(fp32_neon_activation); +DECLARE_ACTIVATION_KERNEL(fp32_sve_activation); #undef DECLARE_ACTIVATION_KERNEL } // namespace cpu diff --git a/src/core/NEON/kernels/floor/impl/fp16_neon_floor.cpp b/src/core/NEON/kernels/floor/impl/NEON/fp16.cpp index 4f56ca9daf..4f56ca9daf 100644 --- a/src/core/NEON/kernels/floor/impl/fp16_neon_floor.cpp +++ b/src/core/NEON/kernels/floor/impl/NEON/fp16.cpp diff --git a/src/core/NEON/kernels/floor/impl/fp32_neon_floor.cpp b/src/core/NEON/kernels/floor/impl/NEON/fp32.cpp index 3f4b14b3e5..3f4b14b3e5 100644 --- a/src/core/NEON/kernels/floor/impl/fp32_neon_floor.cpp +++ b/src/core/NEON/kernels/floor/impl/NEON/fp32.cpp diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h index dcea3e8d38..649fe468a3 100644 --- a/src/core/common/Registrars.h +++ b/src/core/common/Registrars.h @@ -24,34 +24,63 @@ #ifndef SRC_CORE_COMMON_REGISTRARS_H #define SRC_CORE_COMMON_REGISTRARS_H -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#if defined(ENABLE_FP16_KERNELS) + +#if defined(__ARM_FEATURE_SVE) +#define REGISTER_FP16_SVE(func_name) &(func_name) +#else /* !defined(__ARM_FEATURE_SVE) */ +#define REGISTER_FP16_SVE(func_name) nullptr +#endif /* defined(__ARM_FEATURE_SVE) */ + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) #define REGISTER_FP16_NEON(func_name) &(func_name) -#else /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ +#else /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ +#define REGISTER_FP16_NEON(func_name) nullptr +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ + +#else /* !defined(ENABLE_FP16_KERNELS) */ #define REGISTER_FP16_NEON(func_name) nullptr +#define REGISTER_FP16_SVE(func_name) nullptr #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ #if defined(ENABLE_FP32_KERNELS) +#if defined(__ARM_FEATURE_SVE) +#define REGISTER_FP32_SVE(func_name) &(func_name) +#endif /* defined(__ARM_FEATURE_SVE) */ #define REGISTER_FP32_NEON(func_name) &(func_name) #else /* defined(ENABLE_FP32_KERNELS) */ #define REGISTER_FP32_NEON(func_name) nullptr +#define REGISTER_FP32_SVE(func_name) nullptr #endif /* defined(ENABLE_FP32_KERNELS) */ #if defined(ENABLE_QASYMM8_SIGNED_KERNELS) +#if defined(__ARM_FEATURE_SVE) +#define REGISTER_QASYMM8_SIGNED_SVE(func_name) &(func_name) +#endif /* defined(__ARM_FEATURE_SVE) */ #define REGISTER_QASYMM8_SIGNED_NEON(func_name) &(func_name) #else /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */ #define REGISTER_QASYMM8_SIGNED_NEON(func_name) nullptr +#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr #endif /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */ #if defined(ENABLE_QASYMM8_KERNELS) +#if defined(__ARM_FEATURE_SVE) +#define REGISTER_QASYMM8_SVE(func_name) &(func_name) +#endif /* defined(__ARM_FEATURE_SVE) */ #define REGISTER_QASYMM8_NEON(func_name) &(func_name) #else /* defined(ENABLE_QASYMM8_KERNELS) */ #define REGISTER_QASYMM8_NEON(func_name) nullptr +#define REGISTER_QASYMM8_SVE(func_name) nullptr #endif /* defined(ENABLE_QASYMM8_KERNELS) */ #if defined(ENABLE_QSYMM16_KERNELS) +#if defined(__ARM_FEATURE_SVE) +#define REGISTER_QSYMM16_SVE(func_name) &(func_name) +#endif /* defined(__ARM_FEATURE_SVE) */ #define REGISTER_QSYMM16_NEON(func_name) &(func_name) #else /* defined(ENABLE_QSYMM16_KERNELS) */ #define REGISTER_QSYMM16_NEON(func_name) nullptr +#define REGISTER_QSYMM16_SVE(func_name) nullptr #endif /* defined(ENABLE_QSYMM16_KERNELS) */ #endif /* SRC_CORE_COMMON_REGISTRARS_H */ |