diff options
author | Viet-Hoa Do <viet-hoa.do@arm.com> | 2022-06-21 15:56:15 +0100 |
---|---|---|
committer | Viet-Hoa Do <viet-hoa.do@arm.com> | 2022-06-29 13:30:06 +0000 |
commit | b042e39060901b44e615b923b5723c04d9b42a95 (patch) | |
tree | e23fd9b89c753f9731e1e8ec4a0d9ca468f9f683 | |
parent | 13f96d0a5efc140785a6de58bff9b24b80dd0cfd (diff) | |
download | ComputeLibrary-b042e39060901b44e615b923b5723c04d9b42a95.tar.gz |
Add LUT-based Leaky ReLU for QASYMM8 on CPU
* Add LUT generation function for Leaky ReLU.
* Some additional changes to the existing LUT implementation:
  + Bring back the NEON implementation of hard swish for the 32-bit
    build. The library size of the 64-bit build is not affected.
  + Add some extra #ifdef guards to remove unnecessary code in the
    32-bit build.
Resolves: COMPMID-5386
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: I1ea49611cc922765ee741e31138c888401d33e9b
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7845
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r-- | arm_compute/core/QuantizationInfo.h | 11 | ||||
-rw-r--r-- | arm_compute/core/Types.h | 38 | ||||
-rw-r--r-- | src/cpu/kernels/CpuActivationKernel.cpp | 16 | ||||
-rw-r--r-- | src/cpu/kernels/activation/generic/neon/qasymm8.cpp | 46 | ||||
-rw-r--r-- | src/cpu/kernels/activation/list.h | 2 |
5 files changed, 91 insertions, 22 deletions
diff --git a/arm_compute/core/QuantizationInfo.h b/arm_compute/core/QuantizationInfo.h index 0bd0f21bc1..21d962d08b 100644 --- a/arm_compute/core/QuantizationInfo.h +++ b/arm_compute/core/QuantizationInfo.h @@ -409,6 +409,17 @@ inline qasymm8_t qasymm8_hard_swish(qasymm8_t in, return tmp; } +inline qasymm8_t qasymm8_leaky_relu(qasymm8_t in, + const UniformQuantizationInfo &qi_in, + const UniformQuantizationInfo &qi_out, + float alpha) +{ + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = tmp_f > 0 ? tmp_f : tmp_f * alpha; + const qasymm8_t tmp = quantize_qasymm8(tmp_f, qi_out); + return tmp; +} + /** Dequantize a value given a 8-bit symmetric quantization scheme * * @param[in] value Value to dequantize diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index 4524976d6b..94fe1a07f4 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -1680,6 +1680,7 @@ public: return _enabled; } +#ifdef __aarch64__ const LookupTable256 &lut() const { return _lut; @@ -1687,7 +1688,27 @@ public: void init_lut(const UniformQuantizationInfo &qi_in, const UniformQuantizationInfo &qi_out) { - qasymm8_hard_swish_populate_table(_lut, qi_in, qi_out); + if(_act == ActivationFunction::HARD_SWISH) + { + qasymm8_hard_swish_populate_table(_lut, qi_in, qi_out); + } + else if(_act == ActivationFunction::LEAKY_RELU) + { + qasymm8_leaky_relu_populate_table(_lut, qi_in, qi_out, _a); + } + } +#endif // __aarch64__ + + static inline bool is_lut_supported(ActivationFunction act_func, DataType data_type) + { +#ifdef __aarch64__ + auto supported = (data_type == DataType::QASYMM8 && (act_func == ActivationFunction::HARD_SWISH || act_func == ActivationFunction::LEAKY_RELU)); + return supported; +#else // __aarch64__ + ARM_COMPUTE_UNUSED(act_func); + ARM_COMPUTE_UNUSED(data_type); + return false; +#endif // __aarch64__ } private: @@ -1695,15 +1716,26 @@ private: float _a = {}; float _b = {}; bool _enabled = { false }; - LookupTable256 _lut = {}; - inline void 
qasymm8_hard_swish_populate_table(LookupTable256 &lut, const UniformQuantizationInfo &qi_in, const UniformQuantizationInfo &qi_out) +#ifdef __aarch64__ + LookupTable256 _lut = {}; + + static inline void qasymm8_hard_swish_populate_table(LookupTable256 &lut, const UniformQuantizationInfo &qi_in, const UniformQuantizationInfo &qi_out) { for(size_t i = 0; i < lut.size(); ++i) { lut[i] = qasymm8_hard_swish(i, qi_in, qi_out); } } + + static inline void qasymm8_leaky_relu_populate_table(LookupTable256 &lut, const UniformQuantizationInfo &qi_in, const UniformQuantizationInfo &qi_out, float alpha) + { + for(size_t i = 0; i < lut.size(); ++i) + { + lut[i] = qasymm8_leaky_relu(i, qi_in, qi_out, alpha); + } + } +#endif // __aarch64__ }; /** Fully connected layer info */ diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp index 330de1ece2..9eaf44af51 100644 --- a/src/cpu/kernels/CpuActivationKernel.cpp +++ b/src/cpu/kernels/CpuActivationKernel.cpp @@ -45,11 +45,13 @@ namespace { static const std::vector<CpuActivationKernel::ActivationKernel> available_kernels = { - { // neon LUT implementantion of HARD_SWISH takes precedence - "neon_qu8_activation_hardswish_lut", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 && data.f == ActivationLayerInfo::ActivationFunction::HARD_SWISH; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_hardswish_lut) +#ifdef __aarch64__ + { // Neon LUT implementantion takes precedence + "neon_qu8_activation_lut", + [](const ActivationDataTypeISASelectorData & data) { return ActivationLayerInfo::is_lut_supported(data.f, data.dt); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation_lut) }, +#endif // __aarch64__ { "sve2_qu8_activation", [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 && data.isa.sve2; }, @@ -87,7 +89,7 @@ static const std::vector<CpuActivationKernel::ActivationKernel> available_kernel }, { 
"neon_qu8_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 && data.f != ActivationLayerInfo::ActivationFunction::HARD_SWISH; }, + [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; }, REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation) }, { @@ -188,10 +190,12 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac _run_method = uk->ukernel; _name = std::string("CpuActivationKernel").append("/").append(uk->name); - if(activation_info.activation() == ActivationLayerInfo::ActivationFunction::HARD_SWISH && src->data_type() == DataType::QASYMM8) +#ifdef __aarch64__ + if(ActivationLayerInfo::is_lut_supported(activation_info.activation(), src->data_type())) { activation_info.init_lut(src->quantization_info().uniform(),(dst)?dst->quantization_info().uniform():src->quantization_info().uniform()); } +#endif // __aarch64__ _act_info = activation_info; // Configure kernel window diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp index 29f5e6b376..5095ecf5bd 100644 --- a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp @@ -417,9 +417,9 @@ void substitute_bytes_neon( #endif // __aarch64__ } // namespace -void neon_qasymm8_hardswish_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void neon_qasymm8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) { - ARM_COMPUTE_ERROR_ON(act_info.activation() != ActivationLayerInfo::ActivationFunction::HARD_SWISH); + ARM_COMPUTE_ERROR_ON(!ActivationLayerInfo::is_lut_supported(act_info.activation(), src->info()->data_type())); #ifdef __aarch64__ const int window_step_x = src->info()->tensor_shape().x(); Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); @@ 
-472,6 +472,13 @@ void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL const float a_f32 = act_info.a(); const float b_f32 = act_info.b(); +#ifndef __aarch64__ + const auto const_6_f32 = vdupq_n_f32(6.f); + const auto const_0_f32 = vdupq_n_f32(0.f); + const auto const_3_f32 = vdupq_n_f32(3.f); + const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); +#endif // __aarch64__ + // Initialise scale/offset for re-quantization float s = qi_in.scale / qi_out.scale; float o = -qi_in.offset * s + qi_out.offset; @@ -545,21 +552,28 @@ void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL // Re-quantize to new output space tmp = vquantize(tmp_dep, qi_out); } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) +#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. + else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) { + // De-quantize const auto vin_deq = vdequantize(vin, qi_in); - -#ifdef __aarch64__ - const uint32x4x4_t pos_mask = + // Perform activation + const float32x4x4_t tmp_dep = { { - wrapper::vcgtz(vin_deq.val[0]), - wrapper::vcgtz(vin_deq.val[1]), - wrapper::vcgtz(vin_deq.val[2]), - wrapper::vcgtz(vin_deq.val[3]), + wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), + wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), + wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), + wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), } }; -#else // __aarch64__ + // Re-quantize to new output space + tmp = vquantize(tmp_dep, 
qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + const auto vin_deq = vdequantize(vin, qi_in); + const uint32x4x4_t pos_mask = { { @@ -569,7 +583,6 @@ void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL wrapper::vcgt(vin_deq.val[3], vconst_0_f32), } }; -#endif // __aarch64__ const float32x4x4_t tmp_dep = { @@ -583,6 +596,7 @@ void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL tmp = vquantize(tmp_dep, qi_out); } +#endif // __aarch64__ else { ARM_COMPUTE_ERROR("Unsupported activation function"); @@ -622,12 +636,20 @@ void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); tmp = quantize_qasymm8(tmp_f, qi_out); } +#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. + else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); + tmp = quantize_qasymm8(tmp_f, qi_out); + } else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) { float tmp_f = dequantize_qasymm8(in, qi_in); tmp_f = tmp_f > 0 ? 
tmp_f : tmp_f * a_f32; tmp = quantize_qasymm8(tmp_f, qi_out); } +#endif // __aarch64__ else { ARM_COMPUTE_ERROR("Unsupported activation function"); diff --git a/src/cpu/kernels/activation/list.h b/src/cpu/kernels/activation/list.h index 7220d6cce1..b2322a6477 100644 --- a/src/cpu/kernels/activation/list.h +++ b/src/cpu/kernels/activation/list.h @@ -32,7 +32,7 @@ namespace cpu void func_name(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) DECLARE_ACTIVATION_KERNEL(neon_qasymm8_activation); -DECLARE_ACTIVATION_KERNEL(neon_qasymm8_hardswish_lut); +DECLARE_ACTIVATION_KERNEL(neon_qasymm8_activation_lut); DECLARE_ACTIVATION_KERNEL(sve2_qasymm8_activation); DECLARE_ACTIVATION_KERNEL(neon_qasymm8_signed_activation); DECLARE_ACTIVATION_KERNEL(sve2_qasymm8_signed_activation); |