Adding GELU activation

OpenCL implementation uses built in erf. NEON implementation requires new vectorized erf. Uses the following approximation: erf(x) = 1 - 1 / (1 + a1x + a2x^2 + a3x^3 + a4x^4)^4 a1 = 0.278393, a2 = 0.230389, a3 = 0.000972, a4 = 0.078108 From https://en.wikipedia.org/wiki/Error_function#Numerical_approximations Signed-off-by: Murray Kornelsen <murray.kornelsen@mail.mcgill.ca> Change-Id: I2d3964b2c26a4334166b17135f9104bc6324fad2 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7921 Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com> Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Comments-Addressed: Pablo Marquez Tello <pablo.tello@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
author: Murray Kornelsen <murray.kornelsen@mail.mcgill.ca> 2022-07-13 21:22:39 -0400
committer: Pablo Marquez Tello <pablo.tello@arm.com> 2022-09-14 09:15:03 +0000
commit: 926f502ca731fa49bcdf949408ce25728616e5f2 (patch)
tree: 7e221103a9c0c5c0e4c054abc07cbdf11c7c7b4e /src/cpu
parent: 6e09e1404c635d948cf20eb6b4b5747dfb6656f2 (diff)
download: ComputeLibrary-926f502ca731fa49bcdf949408ce25728616e5f2.tar.gz
3 files changed, 47 insertions, 10 deletions
diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp
index ee9db99080..61efcb2dd6 100644
--- a/src/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/cpu/kernels/CpuActivationKernel.cpp
@@ -46,7 +46,8 @@ namespace
 static const std::vector<CpuActivationKernel::ActivationKernel> available_kernels =
 {
 #ifdef __aarch64__
-    { // Neon LUT implementantion takes precedence
+    {
+        // Neon LUT implementantion takes precedence
         "neon_q8_activation_lut",
         [](const ActivationDataTypeISASelectorData & data) { return ActivationLayerInfo::is_lut_supported(data.f, data.dt); },
         REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)
@@ -54,27 +55,27 @@ static const std::vector<CpuActivationKernel::ActivationKernel> available_kernel
 #endif // __aarch64__
     {
         "sve2_qu8_activation",
-        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 && data.isa.sve2; },
+        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
         REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)
     },
     {
         "sve2_qs8_activation",
-        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2; },
+        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
         REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)
     },
     {
         "sve2_qs16_activation",
-        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QSYMM16 && data.isa.sve2; },
+        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QSYMM16 && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
         REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)
     },
     {
         "sve_fp16_activation",
-        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16; },
+        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
         REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)
     },
     {
         "sve_fp32_activation",
-        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F32 && data.isa.sve; },
+        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F32 && data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
         REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)
     },
     {
@@ -105,7 +106,7 @@ static const std::vector<CpuActivationKernel::ActivationKernel> available_kernel
 };
 
 /* Supported activation in the 8-bit integer domain */
-static const std::array<ActivationLayerInfo::ActivationFunction, 7> qasymm8_activations =
+static const std::array<ActivationLayerInfo::ActivationFunction, 8> qasymm8_activations =
 {
     ActivationLayerInfo::ActivationFunction::RELU,
     ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
@@ -114,6 +115,7 @@ static const std::array<ActivationLayerInfo::ActivationFunction, 7> qasymm8_acti
     ActivationLayerInfo::ActivationFunction::TANH,
     ActivationLayerInfo::ActivationFunction::HARD_SWISH,
     ActivationLayerInfo::ActivationFunction::LEAKY_RELU,
+    ActivationLayerInfo::ActivationFunction::GELU,
 };
 /* Supported activation in the 16-bit integer domain */
 static const std::array<ActivationLayerInfo::ActivationFunction, 4> qsymm16_activations =
@@ -193,7 +195,7 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac
 #ifdef __aarch64__
     if(ActivationLayerInfo::is_lut_supported(activation_info.activation(), src->data_type()))
     {
-        activation_info.init_lut(src->data_type(), src->quantization_info().uniform(), (dst)?dst->quantization_info().uniform():src->quantization_info().uniform());
+        activation_info.init_lut(src->data_type(), src->quantization_info().uniform(), (dst) ? dst->quantization_info().uniform() : src->quantization_info().uniform());
     }
 #endif // __aarch64__
     _act_info = activation_info;
diff --git a/src/cpu/kernels/activation/generic/neon/impl.h b/src/cpu/kernels/activation/generic/neon/impl.h
index 2dd239e3a1..35abcb5408 100644
--- a/src/cpu/kernels/activation/generic/neon/impl.h
+++ b/src/cpu/kernels/activation/generic/neon/impl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -77,7 +77,9 @@ void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationL
     const auto      const_0           = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
     const auto      const_6           = wrapper::vdup_n(static_cast<T>(6.f), ExactTagType{});
     const auto      const_3           = wrapper::vdup_n(static_cast<T>(3.f), ExactTagType{});
+    const auto      const_inv_2       = wrapper::vdup_n(static_cast<T>(0.5f), ExactTagType{});
     const auto      const_inv_6       = wrapper::vdup_n(static_cast<T>(0.166666667f), ExactTagType{});
+    const auto      const_inv_sqrt_2  = wrapper::vdup_n(static_cast<T>(0.70710678118f), ExactTagType{});
     constexpr float soft_relu_thresh  = 12.f;
     const auto      vsoft_relu_thresh = wrapper::vdup_n(static_cast<T>(soft_relu_thresh), ExactTagType{});
     const auto      va                = wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{});
@@ -146,6 +148,9 @@ void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationL
                 case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
                     tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
                     break;
+                case ActivationLayerInfo::ActivationFunction::GELU:
+                    tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_2, wrapper::vadd(const_1, wrapper::verf(wrapper::vmul(vin, const_inv_sqrt_2)))));
+                    break;
                 default:
                     ARM_COMPUTE_ERROR("Unsupported activation function");
             }
@@ -200,6 +205,9 @@ void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationL
                 case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
                     tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
                     break;
+                case ActivationLayerInfo::ActivationFunction::GELU:
+                    tmp = in * static_cast<T>(0.5f * (1.0f + erff(static_cast<float>(in) / 1.41421356237f)));
+                    break;
                 default:
                     ARM_COMPUTE_ERROR("Unsupported activation function");
             }
diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
index 67d9e0a8ca..05a0b505ca 100644
--- a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
@@ -58,9 +58,13 @@ void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL
     const qasymm8_t               b        = quantize_qasymm8(act_info.b(), qi_in);
     const qasymm8_t               const_0  = quantize_qasymm8(0.f, qi_in);
     const qasymm8x16_t            vconst_0 = vdupq_n_u8(const_0);
+    const auto                    vconst_1 = vdupq_n_f32(1.f);
+
 #ifndef __aarch64__
-    const auto vconst_1     = vdupq_n_f32(1.f);
     const auto vconst_0_f32 = vdupq_n_f32(0);
+#else  // #ifndef __aarch64__
+    const auto const_inv_2      = vdupq_n_f32(0.5f);
+    const auto const_inv_sqrt_2 = vdupq_n_f32(0.70710678118f);
 #endif // __aarch64__
     const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
     const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
@@ -193,6 +197,23 @@ void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL
 
                 tmp = vquantize(tmp_dep, qi_out);
             }
+#else  // #ifndef __aarch64__
+            else if (act == ActivationLayerInfo::ActivationFunction::GELU)
+            {
+                const auto vin_deq = vdequantize(vin, qi_in);
+                // Perform activation
+                const float32x4x4_t tmp_dep =
+                {
+                    {
+                        wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[0], const_inv_sqrt_2))))),
+                        wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[1], const_inv_sqrt_2))))),
+                        wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[2], const_inv_sqrt_2))))),
+                        wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[3], const_inv_sqrt_2))))),
+                    }
+                };
+                // Re-quantize to new output space
+                tmp = vquantize(tmp_dep, qi_out);
+            }
 #endif // __aarch64__
             else
             {
@@ -248,6 +269,12 @@ void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL
                 tmp_f       = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
                 tmp         = quantize_qasymm8(tmp_f, qi_out);
             }
+            else if(act == ActivationLayerInfo::ActivationFunction::GELU)
+            {
+                float tmp_f = dequantize_qasymm8(in, qi_in);
+                tmp         = tmp_f * 0.5f * (1.0f + std::erff(in / 1.41421356237f));
+                tmp         = quantize_qasymm8(tmp_f, qi_out);
+            }
 #endif // __aarch64__
             else
             {
author	Murray Kornelsen <murray.kornelsen@mail.mcgill.ca>	2022-07-13 21:22:39 -0400
committer	Pablo Marquez Tello <pablo.tello@arm.com>	2022-09-14 09:15:03 +0000
commit	926f502ca731fa49bcdf949408ce25728616e5f2 (patch)
tree	7e221103a9c0c5c0e4c054abc07cbdf11c7c7b4e /src/cpu
parent	6e09e1404c635d948cf20eb6b4b5747dfb6656f2 (diff)
download	ComputeLibrary-926f502ca731fa49bcdf949408ce25728616e5f2.tar.gz