From df24618b53cffed1c574e11e9fd4ba7740f8c009 Mon Sep 17 00:00:00 2001
From: Pablo Tello
Date: Mon, 3 Jul 2017 16:25:09 +0100
Subject: COMPMID-421: Added FP16 support to NENormalizationLayer and
 NEPixelWiseMultiplication.

Change-Id: If174f8071502fc5cc94b27cd44a9b1d5e451a9e2
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/79553
Tested-by: Kaizen
Reviewed-by: Georgios Pinitas
---
 .../NEON/kernels/NENormalizationLayerKernel.cpp    | 180 ++++++++++++++++-----
 .../kernels/NEPixelWiseMultiplicationKernel.cpp    |  45 +++++-
 2 files changed, 179 insertions(+), 46 deletions(-)

(limited to 'src/core/NEON')

diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 0183e549f6..76ace91c20 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -46,12 +46,10 @@ BorderSize NENormalizationLayerKernel::border_size() const
 
 void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QS8);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
 
-    // Output tensor auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, input_squared, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, input_squared, output);
@@ -68,27 +66,79 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
     _norm_info   = norm_info;
     _border_size = BorderSize(0, border_width);
 
-    const bool is_dt_f32 = _input->info()->data_type() == DataType::F32;
+    unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
 
-    switch(norm_info.type())
+    switch(_input->info()->data_type())
     {
-        case NormType::IN_MAP_1D:
-            _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<0, false> : &NENormalizationLayerKernel::normalize_fixed_point<0, false>;
+        case DataType::F32:
+        {
+            num_elems_processed_per_iteration = 4;
+            switch(norm_info.type())
+            {
+                case NormType::IN_MAP_1D:
+                    _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 0, false>;
+                    break;
+                case NormType::IN_MAP_2D:
+                    // Normalize over X and Y
+                    _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 0, true>;
+                    break;
+                case NormType::CROSS_MAP:
+                    _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 2, false>;
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+                    break;
+            }
             break;
-        case NormType::IN_MAP_2D:
-            // Normalize over X and Y
-            _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<0, true> : &NENormalizationLayerKernel::normalize_fixed_point<0, true>;
+        }
+        case DataType::F16:
+        {
+            num_elems_processed_per_iteration = 8;
+            switch(norm_info.type())
+            {
+                case NormType::IN_MAP_1D:
+                    _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 0, false>;
+                    break;
+                case NormType::IN_MAP_2D:
+                    // Normalize over X and Y
+                    _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 0, true>;
+                    break;
+                case NormType::CROSS_MAP:
+                    _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 2, false>;
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+                    break;
+            }
             break;
-        case NormType::CROSS_MAP:
-            _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<2, false> : &NENormalizationLayerKernel::normalize_fixed_point<2, false>;
+        }
+        case DataType::QS8:
+        {
+            num_elems_processed_per_iteration = 16;
+            switch(norm_info.type())
+            {
+                case NormType::IN_MAP_1D:
+                    _func = &NENormalizationLayerKernel::normalize_fixed_point<0, false>;
+                    break;
+                case NormType::IN_MAP_2D:
+                    // Normalize over X and Y
+                    _func = &NENormalizationLayerKernel::normalize_fixed_point<0, true>;
+                    break;
+                case NormType::CROSS_MAP:
+                    _func = &NENormalizationLayerKernel::normalize_fixed_point<2, false>;
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+                    break;
+            }
             break;
+        }
         default:
             ARM_COMPUTE_ERROR("NOT SUPPORTED!");
     }
 
-    const unsigned int num_elems_processed_per_iteration = (is_dt_f32) ? 4 : 16;
-    const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
-    const unsigned int num_rows = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
+    const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
+    const unsigned int num_rows                     = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
 
     // Configure window
     Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
@@ -104,8 +154,8 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
     INEKernel::configure(win);
 }
 
-template <unsigned int dim, bool do_2D_norm>
-void NENormalizationLayerKernel::normalize(const Window &window)
+template <DataType dt, unsigned int dim, bool do_2D_norm>
+void NENormalizationLayerKernel::normalize_float(const Window &window)
 {
     Iterator input(_input, window);
     Iterator input_squared(_input_squared, window);
@@ -121,39 +171,83 @@ void NENormalizationLayerKernel::normalize(const Window &window)
     const int min_top    = 0;
     const int max_bottom = _input->info()->dimension(dim_y) - 1;
 
-    const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff());
-    const float32x4_t beta_vec  = vdupq_n_f32(_norm_info.beta());
-    const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa());
+    if(dt == DataType::F32)
+    {
+        const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff());
+        const float32x4_t beta_vec  = vdupq_n_f32(_norm_info.beta());
+        const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa());
+
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            // Get range to normalize
+            const int current_row   = do_2D_norm ? id[dim_y] : 0;
+            const int current_slice = id[dim];
+            const int first_row     = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+            const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+            const int first_slice   = std::max(current_slice - radius, min_left);
+            const int last_slice    = std::min(current_slice + radius, max_right);
+
+            // Accumulate 2D In-Map values
+            float32x4_t accu = vdupq_n_f32(0.f);
+            for(int j = first_row; j <= last_row; j++)
+            {
+                // Compute row displacement
+                const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+                const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+                for(int i = first_slice; i <= last_slice; ++i)
+                {
+                    accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>(input_squared_ptr + i * input_squared_stride)));
+                }
+            }
 
-    execute_window_loop(window, [&](const Coordinates & id)
+            // Normalize
+            const float32x4_t normalized       = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec);
+            const float32x4_t normalized_pixel = vmulq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized));
+            vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel);
+        },
+        input, input_squared, output);
+    }
+#ifdef ARM_COMPUTE_ENABLE_FP16
+    else if(dt == DataType::F16)
     {
-        // Get range to normalize
-        const int current_row   = do_2D_norm ? id[dim_y] : 0;
-        const int current_slice = id[dim];
-        const int first_row     = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
-        const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
-        const int first_slice   = std::max(current_slice - radius, min_left);
-        const int last_slice    = std::min(current_slice + radius, max_right);
+        const float16x8_t coeff_vec    = vdupq_n_f16(_norm_info.scale_coeff());
+        const float16x8_t beta_vec_f16 = vdupq_n_f16(_norm_info.beta());
+        const float16x8_t kappa_vec    = vdupq_n_f16(_norm_info.kappa());
 
-        // Accumulate 2D In-Map values
-        float32x4_t accu = vdupq_n_f32(0.f);
-        for(int j = first_row; j <= last_row; j++)
+        execute_window_loop(window, [&](const Coordinates & id)
         {
-            // Compute row displacement
-            const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
-            const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
-            for(int i = first_slice; i <= last_slice; ++i)
+            // Get range to normalize
+            const int current_row   = do_2D_norm ? id[dim_y] : 0;
+            const int current_slice = id[dim];
+            const int first_row     = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+            const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+            const int first_slice   = std::max(current_slice - radius, min_left);
+            const int last_slice    = std::min(current_slice + radius, max_right);
+
+            // Accumulate 2D In-Map values
+            float16x8_t accu = vdupq_n_f16(0.f);
+            for(int j = first_row; j <= last_row; j++)
             {
-                accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>(input_squared_ptr + i * input_squared_stride)));
+                // Compute row displacement
+                const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+                const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+                for(int i = first_slice; i <= last_slice; ++i)
+                {
+                    accu = vaddq_f16(accu, vld1q_f16(reinterpret_cast<const float16_t *>(input_squared_ptr + i * input_squared_stride)));
+                }
             }
-        }
 
-        // Normalize
-        const float32x4_t normalized = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec);
-        const float32x4_t normalized_pixel = vmulq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized));
-        vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel);
-    },
-    input, input_squared, output);
+            const float16x8_t norm_f16         = vpowq_f16(vaddq_f16(kappa_vec, vmulq_f16(coeff_vec, accu)), beta_vec_f16);
+            const float16x8_t normalized_pixel = vmulq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), vinvq_f16(norm_f16));
+            vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), normalized_pixel);
+        },
+        input, input_squared, output);
+    }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+    else
+    {
+        ARM_COMPUTE_ERROR("Not supported");
+    }
 }
 
 template <unsigned int dim, bool do_2D_norm>
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index c3f61ac94a..83d6d8218e 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -38,6 +38,10 @@
 #include <cstdint>
 #include <cstdlib>
 
+#if ARM_COMPUTE_ENABLE_FP16
+#include <arm_fp16.h> // needed for float16_t
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
 using namespace arm_compute;
 namespace arm_compute
 {
@@ -248,6 +252,32 @@ void mul_F32_F32_F32_n(const void *__restrict input1_ptr, const void *__restrict
     vst4q_f32(output, result);
 }
 
+template <bool is_scale255, bool is_sat>
+void mul_F16_F16_F16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale)
+{
+    ARM_COMPUTE_UNUSED(input1_ptr);
+    ARM_COMPUTE_UNUSED(input2_ptr);
+    ARM_COMPUTE_UNUSED(output_ptr);
+#ifdef ARM_COMPUTE_ENABLE_FP16
+    const auto input1 = static_cast<const float16_t *__restrict>(input1_ptr);
+    const auto input2 = static_cast<const float16_t *__restrict>(input2_ptr);
+    const auto output = static_cast<float16_t *__restrict>(output_ptr);
+    const float16x8x2_t ta1       = vld2q_f16(input1);
+    const float16x8x2_t ta2       = vld2q_f16(input2);
+    const float16x8_t   scale_vec = vdupq_n_f16(scale);
+    const float16x8x2_t result    =
+    {
+        {
+            vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
+            vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
+        }
+    };
+    vst2q_f16(output, result);
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+    ARM_COMPUTE_ERROR("Not supported. Recompile the library with arch=arm64-v8.2-a.");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
+
 template <bool is_scale255, bool is_sat>
 void mul_U8_U8_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
 {
@@ -347,6 +377,10 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe
     {
         set_format_if_unknown(*output->info(), Format::F32);
     }
+    else if(input1->info()->data_type() == DataType::F16 || input2->info()->data_type() == DataType::F16)
+    {
+        set_format_if_unknown(*output->info(), Format::F16);
+    }
     else if(input1->info()->data_type() == DataType::QS8 && input2->info()->data_type() == DataType::QS8)
     {
         set_data_type_if_unknown(*output->info(), DataType::QS8);
@@ -355,9 +389,9 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe
     }
     ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8), "Output can only be U8 if both inputs are U8");
     if(input1->info()->data_type() == DataType::QS8)
     {
@@ -479,6 +513,11 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe
         {
             _func_q_int = is_sat ? &mul_QS8_QS8_QS8_n<false, true> : &mul_QS8_QS8_QS8_n<false, false>;
         }
     }
+    else if(DataType::F16 == dt_input1 && DataType::F16 == dt_input2 && DataType::F16 == dt_output)
+    {
+        _func_float = &mul_F16_F16_F16_n<false, false>;
+        _func_int   = nullptr;
+    }
     else if(DataType::F32 == dt_input1 && DataType::F32 == dt_input2 && DataType::F32 == dt_output)
     {
         _func_float = &mul_F32_F32_F32_n<false, false>;
-- 
cgit v1.2.1
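
Not part of the patch: below is a minimal usage sketch of the FP16 paths added above, driven through the public runtime wrappers named in the subject (NENormalizationLayer and NEPixelWiseMultiplication). The tensor shape, normalization parameters and scale are illustrative assumptions, not values taken from the commit, and the library is assumed to be built with FP16 support enabled (e.g. arch=arm64-v8.2-a, as the error message in mul_F16_F16_F16_n suggests); otherwise the F16 kernels report "Not supported".

// Usage sketch (assumptions as noted above): exercise the new F16 kernels
// through the public NEON runtime functions.
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // Illustrative 16x16 tensors with 8 channels, stored as half precision.
    const TensorInfo info(TensorShape(16U, 16U, 8U), 1, DataType::F16);

    Tensor src, norm_out, mul_out;
    src.allocator()->init(info);
    norm_out.allocator()->init(info);
    mul_out.allocator()->init(info);

    // Cross-map normalization over 5 maps; with an F16 input this should
    // select normalize_float<DataType::F16, 2, false> as configured above.
    NENormalizationLayer norm;
    norm.configure(&src, &norm_out, NormalizationLayerInfo(NormType::CROSS_MAP, 5));

    // Element-wise F16 * F16 -> F16 product, which dispatches to mul_F16_F16_F16_n.
    NEPixelWiseMultiplication mul;
    mul.configure(&src, &norm_out, &mul_out, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

    // Allocate backing memory and run. Filling src with real half-precision
    // data is omitted in this sketch.
    src.allocator()->allocate();
    norm_out.allocator()->allocate();
    mul_out.allocator()->allocate();

    norm.run();
    mul.run();

    return 0;
}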