From 57c033bb5400ef19e5952f191da3e878e21bba91 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Thu, 15 Feb 2018 12:29:44 +0000 Subject: COMPMID-906: Use fused activation in NEON Batch normalization Change-Id: I5a6413548b2c9b8972c91ddba57395509dffd87e Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/120656 Tested-by: Jenkins Reviewed-by: Anthony Barbier --- .../kernels/NEBatchNormalizationLayerKernel.cpp | 277 +++++++++++++-------- 1 file changed, 177 insertions(+), 100 deletions(-) (limited to 'src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp') diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp index f5144c6bf3..1f730a2c3c 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -26,19 +26,34 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/NEON/NEFixedPoint.h" #include "arm_compute/core/NEON/NEMath.h" +#include "arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include + using namespace arm_compute; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon) +Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, + const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon, ActivationLayerInfo act_info) { ARM_COMPUTE_UNUSED(epsilon); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, + DataType::F32); + + if(act_info.enabled()) + { + ActivationLayerInfo::ActivationFunction act = act_info.activation(); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU + && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a()); + } if(nullptr != output) { @@ -67,28 +82,32 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } +} //namespace -void batch_normalization_q8(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window) +template +void NEBatchNormalizationLayerKernel::batch_normalization_qs8(const Window &window) { - Iterator input(in, window); - Iterator output(out, window); + static_assert(!fused_activation, "Activation is not supported for QS8"); + + Iterator input(_input, window); + Iterator output(_output, window); // Hold information about the current feature map we are iterating. // Only compute denominator and NEON vectors once per feature map. int slice = -1; - const int fixed_point_position = in->info()->fixed_point_position(); - const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))); - const auto input_beta = reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))); + const int fixed_point_position = _input->info()->fixed_point_position(); + const auto input_mean = reinterpret_cast(_mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(_var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = reinterpret_cast(_gamma->ptr_to_element(Coordinates(0, 0))); + const auto input_beta = reinterpret_cast(_beta->ptr_to_element(Coordinates(0, 0))); qint8x16_t mean_vec = vdupq_n_qs8(0); qint8x16_t var_vec = vdupq_n_qs8(0); qint8x16_t gamma_vec = vdupq_n_qs8(0); qint8x16_t beta_vec = vdupq_n_qs8(0); qint8x16_t denominator = vdupq_n_qs8(0); - const qint8x16_t epsilon_vec = vdupq_n_qs8(sqcvt_qs8_f32(epsilon, fixed_point_position)); + const qint8x16_t epsilon_vec = vdupq_n_qs8(sqcvt_qs8_f32(_epsilon, fixed_point_position)); execute_window_loop(window, [&](const Coordinates & id) { if(slice != id.z()) @@ -112,27 +131,30 @@ void batch_normalization_q8(ITensor *in, ITensor *out, const ITensor *mean, cons input, output); } -void batch_normalization_q16(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window) +template +void NEBatchNormalizationLayerKernel::batch_normalization_qs16(const Window &window) { - Iterator input(in, window); - Iterator output(out, window); + static_assert(!fused_activation, "Activation is not supported for QS16"); + + Iterator input(_input, window); + Iterator output(_output, window); // Hold information about the current feature map we are iterating. // Only compute denominator and NEON vectors once per feature map. int slice = -1; - const int fixed_point_position = in->info()->fixed_point_position(); - const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))); - const auto input_beta = reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))); + const int fixed_point_position = _input->info()->fixed_point_position(); + const auto input_mean = reinterpret_cast(_mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(_var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = reinterpret_cast(_gamma->ptr_to_element(Coordinates(0, 0))); + const auto input_beta = reinterpret_cast(_beta->ptr_to_element(Coordinates(0, 0))); qint16x8_t mean_vec = vdupq_n_qs16(0); qint16x8_t var_vec = vdupq_n_qs16(0); qint16x8_t gamma_vec = vdupq_n_qs16(0); qint16x8_t beta_vec = vdupq_n_qs16(0); qint16x8_t denominator = vdupq_n_qs16(0); - const qint16x8_t epsilon_vec = vdupq_n_qs16(sqcvt_qs16_f32(epsilon, fixed_point_position)); + const qint16x8_t epsilon_vec = vdupq_n_qs16(sqcvt_qs16_f32(_epsilon, fixed_point_position)); execute_window_loop(window, [&](const Coordinates & id) { if(slice != id.z()) @@ -156,101 +178,162 @@ void batch_normalization_q16(ITensor *in, ITensor *out, const ITensor *mean, con input, output); } -void batch_normalization_fp32(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window) +template +void NEBatchNormalizationLayerKernel::batch_normalization_fp16(const Window &window) { - Iterator input(in, window); - Iterator output(out, window); + static_assert(!fused_activation, "Activation is not supported for QS8"); + + ARM_COMPUTE_UNUSED(window); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + Iterator input(_input, window); + Iterator output(_output, window); // Hold information about the current feature map we are iterating. // Only compute denominator and NEON vectors once per feature map. int slice = -1; - const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))); - const auto input_beta = reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))); + const auto input_mean = reinterpret_cast(_mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(_var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = reinterpret_cast(_gamma->ptr_to_element(Coordinates(0, 0))); + const auto input_beta = reinterpret_cast(_beta->ptr_to_element(Coordinates(0, 0))); - float32x4_t mean_vec = vdupq_n_f32(0.0); - float32x4_t var_vec = vdupq_n_f32(0.0); - float32x4_t gamma_vec = vdupq_n_f32(0.0); - float32x4_t beta_vec = vdupq_n_f32(0.0); - float32x4_t denominator = vdupq_n_f32(0.0); - const float32x4_t epsilon_vec = vdupq_n_f32(epsilon); + float16x8_t mean_vec = vdupq_n_f16(0.0); + float16x8_t var_vec = vdupq_n_f16(0.0); + float16x8_t gamma_vec = vdupq_n_f16(0.0); + float16x8_t beta_vec = vdupq_n_f16(0.0); + float16x8_t denominator = vdupq_n_f16(0.0); + const float16x8_t epsilon_vec = vdupq_n_f16(_epsilon); execute_window_loop(window, [&](const Coordinates & id) { if(slice != id.z()) { // Conctruct vectors - mean_vec = vdupq_n_f32(*(input_mean + id.z())); - var_vec = vdupq_n_f32(*(input_var + id.z())); - gamma_vec = vdupq_n_f32(*(input_gamma + id.z())); - beta_vec = vdupq_n_f32(*(input_beta + id.z())); + mean_vec = vdupq_n_f16(*(input_mean + id.z())); + var_vec = vdupq_n_f16(*(input_var + id.z())); + gamma_vec = vdupq_n_f16(*(input_gamma + id.z())); + beta_vec = vdupq_n_f16(*(input_beta + id.z())); // Calculate denominator - denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec)); + denominator = vinvsqrtq_f16(vaddq_f16(var_vec, epsilon_vec)); slice = id.z(); } // Calculate x bar and store results - const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast(input.ptr())), mean_vec); - const float32x4_t x_bar = vmulq_f32(numerator, denominator); - vst1q_f32(reinterpret_cast(output.ptr()), vmlaq_f32(beta_vec, x_bar, gamma_vec)); + const float16x8_t numerator = vsubq_f16(vld1q_f16(reinterpret_cast(input.ptr())), mean_vec); + const float16x8_t x_bar = vmulq_f16(numerator, denominator); + vst1q_f16(reinterpret_cast(output.ptr()), vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec))); }, input, output); +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void batch_normalization_fp16(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window) +template +void NEBatchNormalizationLayerKernel::batch_normalization_fp32(const Window &window) { - Iterator input(in, window); - Iterator output(out, window); + Iterator input(_input, window); + Iterator output(_output, window); + + F activation_functor(_act_info); // Hold information about the current feature map we are iterating. // Only compute denominator and NEON vectors once per feature map. int slice = -1; - const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))); - const auto input_beta = reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))); + const auto input_mean = reinterpret_cast(_mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(_var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = reinterpret_cast(_gamma->ptr_to_element(Coordinates(0, 0))); + const auto input_beta = reinterpret_cast(_beta->ptr_to_element(Coordinates(0, 0))); - float16x8_t mean_vec = vdupq_n_f16(0.0); - float16x8_t var_vec = vdupq_n_f16(0.0); - float16x8_t gamma_vec = vdupq_n_f16(0.0); - float16x8_t beta_vec = vdupq_n_f16(0.0); - float16x8_t denominator = vdupq_n_f16(0.0); - const float16x8_t epsilon_vec = vdupq_n_f16(epsilon); + float32x4_t mean_vec = vdupq_n_f32(0.0); + float32x4_t var_vec = vdupq_n_f32(0.0); + float32x4_t gamma_vec = vdupq_n_f32(0.0); + float32x4_t beta_vec = vdupq_n_f32(0.0); + float32x4_t denominator = vdupq_n_f32(0.0); + const float32x4_t epsilon_vec = vdupq_n_f32(_epsilon); execute_window_loop(window, [&](const Coordinates & id) { if(slice != id.z()) { // Conctruct vectors - mean_vec = vdupq_n_f16(*(input_mean + id.z())); - var_vec = vdupq_n_f16(*(input_var + id.z())); - gamma_vec = vdupq_n_f16(*(input_gamma + id.z())); - beta_vec = vdupq_n_f16(*(input_beta + id.z())); + mean_vec = vdupq_n_f32(*(input_mean + id.z())); + var_vec = vdupq_n_f32(*(input_var + id.z())); + gamma_vec = vdupq_n_f32(*(input_gamma + id.z())); + beta_vec = vdupq_n_f32(*(input_beta + id.z())); // Calculate denominator - denominator = vinvsqrtq_f16(vaddq_f16(var_vec, epsilon_vec)); + denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec)); slice = id.z(); } - // Calculate x bar and store results - const float16x8_t numerator = vsubq_f16(vld1q_f16(reinterpret_cast(input.ptr())), mean_vec); - const float16x8_t x_bar = vmulq_f16(numerator, denominator); - vst1q_f16(reinterpret_cast(output.ptr()), vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec))); + // Calculate x bar + const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast(input.ptr())), mean_vec); + const float32x4_t x_bar = vmulq_f32(numerator, denominator); + float32x4_t res = vmlaq_f32(beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if(fused_activation) + { + activation_functor(res); + } + + // Store results + vst1q_f32(reinterpret_cast(output.ptr()), res); }, input, output); } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -} // namespace + +void NEBatchNormalizationLayerKernel::configure_non_fused() +{ + switch(_input->info()->data_type()) + { + case DataType::QS8: + _func = &NEBatchNormalizationLayerKernel::batch_normalization_qs8; + break; + case DataType::QS16: + _func = &NEBatchNormalizationLayerKernel::batch_normalization_qs16; + break; + case DataType::F16: + _func = &NEBatchNormalizationLayerKernel::batch_normalization_fp16; + break; + case DataType::F32: + _func = &NEBatchNormalizationLayerKernel::batch_normalization_fp32>; + break; + default: + ARM_COMPUTE_ERROR("Element size not supported"); + break; + } +} + +void NEBatchNormalizationLayerKernel::configure_fused() +{ + // Fused Batched Normalization with activation functions : FP32 + static std::map bn_fused_map_f32 = + { + { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32> }, + { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32> }, + { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32> } + }; + + switch(_input->info()->data_type()) + { + case DataType::F32: + _func = bn_fused_map_f32[_act_info.activation()]; + break; + default: + ARM_COMPUTE_ERROR("Element size not supported"); + break; + } +} NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel() - : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon() + : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(), _act_info() { } -void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon) +void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, + const ITensor *mean, const ITensor *var, + const ITensor *beta, const ITensor *gamma, + float epsilon, ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var, beta, gamma); @@ -264,40 +347,33 @@ void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, output_info = output->info(); } - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output_info, mean->info(), var->info(), beta->info(), gamma->info(), epsilon)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output_info, + mean->info(), var->info(), + beta->info(), gamma->info(), + epsilon, act_info)); - _input = input; - _output = input; - _mean = mean; - _var = var; - _gamma = gamma; - _beta = beta; - _epsilon = epsilon; + _input = input; + _output = input; + _mean = mean; + _var = var; + _gamma = gamma; + _beta = beta; + _epsilon = epsilon; + _act_info = act_info; if(output != nullptr) { _output = output; } - switch(input->info()->data_type()) + // Configure activation function to run + if(_act_info.enabled()) { - case DataType::QS8: - _func = &batch_normalization_q8; - break; - case DataType::QS16: - _func = &batch_normalization_q16; - break; - case DataType::F32: - _func = &batch_normalization_fp32; - break; - case DataType::F16: -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - _func = &batch_normalization_fp16; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("Element size not supported"); - break; + configure_fused(); + } + else + { + configure_non_fused(); } // Configure kernel window @@ -306,11 +382,12 @@ void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, INEKernel::configure(win_config.second); } -Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, - const ITensorInfo *gamma, - float epsilon) +Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *mean, const ITensorInfo *var, + const ITensorInfo *beta, const ITensorInfo *gamma, + float epsilon, ActivationLayerInfo act_info) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output ? output->clone().get() : nullptr).first); return Status{}; @@ -323,5 +400,5 @@ void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - (*_func)(_input, _output, _mean, _var, _beta, _gamma, _epsilon, window); + (this->*_func)(window); } -- cgit v1.2.1