From afd38f0c617d6f89b2b4532c6c44f116617e2b6f Mon Sep 17 00:00:00 2001 From: Felix Thomasmathibalan Date: Wed, 27 Sep 2023 17:46:17 +0100 Subject: Apply clang-format on repository Code is formatted as per a revised clang format configuration file(not part of this delivery). Version 14.0.6 is used. Exclusion List: - files with .cl extension - files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...) And the following directories - compute_kernel_writer/validation/ - tests/ - include/ - src/core/NEON/kernels/convolution/ - src/core/NEON/kernels/arm_gemm/ - src/core/NEON/kernels/arm_conv/ - data/ There will be a follow up for formatting of .cl files and the files under tests/ and compute_kernel_writer/validation/. Signed-off-by: Felix Thomasmathibalan Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir --- .../kernels/meanstddevnorm/generic/neon/fp16.cpp | 102 ++++++++--------- .../kernels/meanstddevnorm/generic/neon/impl.cpp | 97 ++++++++-------- .../meanstddevnorm/generic/neon/qasymm8.cpp | 124 ++++++++++----------- 3 files changed, 164 insertions(+), 159 deletions(-) (limited to 'src/cpu/kernels/meanstddevnorm') diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp index 96e4030268..6470f391e2 100644 --- a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp +++ b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp @@ -23,9 +23,9 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h" #include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h" namespace arm_compute { @@ -45,64 +45,66 @@ void mean_stddev_normalization(ITensor *input, ITensor *output, fl Iterator input_itr(input, win); Iterator output_itr(output, win); - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - auto in_ptr = reinterpret_cast(input_itr.ptr()); - auto out_ptr = reinterpret_cast(output_itr.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + auto in_ptr = reinterpret_cast(input_itr.ptr()); + auto out_ptr = reinterpret_cast(output_itr.ptr()); - float16x8_t sum_vec = vdupq_n_f16(static_cast(0.0f)); - float32x4_t sum_sq_vec = vdupq_n_f32(0.0f); + float16x8_t sum_vec = vdupq_n_f16(static_cast(0.0f)); + float32x4_t sum_sq_vec = vdupq_n_f32(0.0f); - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - float16x8_t data = vld1q_f16(in_ptr + x); - sum_vec = vaddq_f16(sum_vec, data); - float32x4_t dl = vcvt_f32_f16(vget_low_f16(data)); - float32x4_t dh = vcvt_f32_f16(vget_high_f16(data)); - sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dl, dl)); - sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dh, dh)); - } + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + float16x8_t data = vld1q_f16(in_ptr + x); + sum_vec = vaddq_f16(sum_vec, data); + float32x4_t dl = vcvt_f32_f16(vget_low_f16(data)); + float32x4_t dh = vcvt_f32_f16(vget_high_f16(data)); + sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dl, dl)); + sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dh, dh)); + } - float16x4_t sum_carry_res = vpadd_f16(vget_high_f16(sum_vec), vget_low_f16(sum_vec)); - sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res); - sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res); + float16x4_t sum_carry_res = vpadd_f16(vget_high_f16(sum_vec), vget_low_f16(sum_vec)); + sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res); + sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res); - float32x4_t sum_sq_carry_res = vpaddq_f32(sum_sq_vec, sum_sq_vec); - sum_sq_carry_res = vpaddq_f32(sum_sq_carry_res, sum_sq_carry_res); + float32x4_t sum_sq_carry_res = vpaddq_f32(sum_sq_vec, sum_sq_vec); + sum_sq_carry_res = vpaddq_f32(sum_sq_carry_res, sum_sq_carry_res); - float16_t sum = vget_lane_f16(sum_carry_res, 0); - float sum_sq = vgetq_lane_f32(sum_sq_carry_res, 0); + float16_t sum = vget_lane_f16(sum_carry_res, 0); + float sum_sq = vgetq_lane_f32(sum_sq_carry_res, 0); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - float16_t data = *(in_ptr + x); - sum += data; - float fdata = static_cast(data); - sum_sq += fdata * fdata; - } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + float16_t data = *(in_ptr + x); + sum += data; + float fdata = static_cast(data); + sum_sq += fdata * fdata; + } - float16_t mean = sum / input->info()->dimension(0); - float var = (sum_sq / input->info()->dimension(0)) - (mean * mean); - float16_t stddev_inv = static_cast(1.f / sqrt(var + epsilon)); + float16_t mean = sum / input->info()->dimension(0); + float var = (sum_sq / input->info()->dimension(0)) - (mean * mean); + float16_t stddev_inv = static_cast(1.f / sqrt(var + epsilon)); - float16x8_t mean_vec = vdupq_n_f16(mean); - float16x8_t stddev_inv_vec = vdupq_n_f16(stddev_inv); + float16x8_t mean_vec = vdupq_n_f16(mean); + float16x8_t stddev_inv_vec = vdupq_n_f16(stddev_inv); - for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) - { - float16x8_t data = vld1q_f16(in_ptr + x); - float16x8_t res = vmulq_f16(vsubq_f16(data, mean_vec), stddev_inv_vec); - // Store results - vst1q_f16(out_ptr + x, res); - } - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; - } - }, - input_itr, output_itr); + for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) + { + float16x8_t data = vld1q_f16(in_ptr + x); + float16x8_t res = vmulq_f16(vsubq_f16(data, mean_vec), stddev_inv_vec); + // Store results + vst1q_f16(out_ptr + x, res); + } + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; + } + }, + input_itr, output_itr); } void neon_fp16_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const Window &window) diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp index 0522d6e277..11f6294a35 100644 --- a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp +++ b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp @@ -23,6 +23,7 @@ */ #include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -45,60 +46,62 @@ void mean_stddev_normalization(ITensor *input, ITensor *output, float epsilon, c Iterator input_itr(input, win); Iterator output_itr(output, win); - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - auto in_ptr = reinterpret_cast(input_itr.ptr()); - auto out_ptr = reinterpret_cast(output_itr.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + auto in_ptr = reinterpret_cast(input_itr.ptr()); + auto out_ptr = reinterpret_cast(output_itr.ptr()); - auto sum_vec = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - auto sum_sq_vec = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + auto sum_vec = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + auto sum_sq_vec = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto data = wrapper::vloadq(in_ptr + x); - sum_vec = wrapper::vadd(sum_vec, data); - sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data)); - } + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto data = wrapper::vloadq(in_ptr + x); + sum_vec = wrapper::vadd(sum_vec, data); + sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data)); + } - auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec)); - auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec)); - for(int i = 0; i < size / 4; ++i) - { - sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res); - sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res); - } + auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec)); + auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec)); + for (int i = 0; i < size / 4; ++i) + { + sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res); + sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res); + } - auto sum = wrapper::vgetlane(sum_carry_res, 0); - auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0); + auto sum = wrapper::vgetlane(sum_carry_res, 0); + auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - ScalarType data = *(in_ptr + x); - sum += data; - sum_sq += data * data; - } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + ScalarType data = *(in_ptr + x); + sum += data; + sum_sq += data * data; + } - ScalarType mean = sum / input->info()->dimension(0); - ScalarType var = (sum_sq / input->info()->dimension(0)) - (mean * mean); - ScalarType stddev_inv = 1.f / sqrt(var + epsilon); + ScalarType mean = sum / input->info()->dimension(0); + ScalarType var = (sum_sq / input->info()->dimension(0)) - (mean * mean); + ScalarType stddev_inv = 1.f / sqrt(var + epsilon); - auto mean_vec = wrapper::vdup_n(mean, ExactTagType{}); - auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{}); - for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto data = wrapper::vloadq(in_ptr + x); - auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec); - // Store results - wrapper::vstore(out_ptr + x, res); - } - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; - } - }, - input_itr, output_itr); + auto mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{}); + for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto data = wrapper::vloadq(in_ptr + x); + auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec); + // Store results + wrapper::vstore(out_ptr + x, res); + } + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; + } + }, + input_itr, output_itr); } template void mean_stddev_normalization(ITensor *input, ITensor *output, float epsilon, const Window &window); } // namespace cpu diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp index 53af1e4b16..32654df5dc 100644 --- a/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" + #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" @@ -69,77 +70,76 @@ void neon_qasymm8_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const float32x4_t quant_min_vec = vdupq_n_f32(0.0f); execute_window_loop( - win, [&](const Coordinates &) - { - int x = window_start_x; - auto in_ptr = reinterpret_cast(input_itr.ptr()); - auto out_ptr = reinterpret_cast(output_itr.ptr()); + win, + [&](const Coordinates &) + { + int x = window_start_x; + auto in_ptr = reinterpret_cast(input_itr.ptr()); + auto out_ptr = reinterpret_cast(output_itr.ptr()); - uint32x4_t sum_vec = vdupq_n_u32(0); - uint32x4_t sum_sq_vec = vdupq_n_u32(0); + uint32x4_t sum_vec = vdupq_n_u32(0); + uint32x4_t sum_sq_vec = vdupq_n_u32(0); - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t data = vld1q_u8(in_ptr + x); - sum_vec = vaddq_u32(sum_vec, vpaddlq_u16(vpaddlq_u8(data))); - const uint16x8_t squares_low = vmull_u8(vget_low_u8(data), vget_low_u8(data)); - const uint16x8_t squares_high = vmull_u8(vget_high_u8(data), vget_high_u8(data)); - sum_sq_vec = vaddq_u32(sum_sq_vec, vaddq_u32(vpaddlq_u16(squares_low), vpaddlq_u16(squares_high))); - } + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t data = vld1q_u8(in_ptr + x); + sum_vec = vaddq_u32(sum_vec, vpaddlq_u16(vpaddlq_u8(data))); + const uint16x8_t squares_low = vmull_u8(vget_low_u8(data), vget_low_u8(data)); + const uint16x8_t squares_high = vmull_u8(vget_high_u8(data), vget_high_u8(data)); + sum_sq_vec = vaddq_u32(sum_sq_vec, vaddq_u32(vpaddlq_u16(squares_low), vpaddlq_u16(squares_high))); + } #ifdef __aarch64__ - sum_vec = vpaddq_u32(sum_vec, sum_vec); - sum_vec = vpaddq_u32(sum_vec, sum_vec); - uint32_t sum = vgetq_lane_u32(sum_vec, 0); - sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec); - sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec); - uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0); + sum_vec = vpaddq_u32(sum_vec, sum_vec); + sum_vec = vpaddq_u32(sum_vec, sum_vec); + uint32_t sum = vgetq_lane_u32(sum_vec, 0); + sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec); + sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec); + uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0); #elif __arm__ // #ifdef __aarch64__ - uint32_t sum = vgetq_lane_u32(sum_vec, 0) + - vgetq_lane_u32(sum_vec, 1) + - vgetq_lane_u32(sum_vec, 2) + - vgetq_lane_u32(sum_vec, 3); + uint32_t sum = vgetq_lane_u32(sum_vec, 0) + vgetq_lane_u32(sum_vec, 1) + vgetq_lane_u32(sum_vec, 2) + + vgetq_lane_u32(sum_vec, 3); - uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0) + - vgetq_lane_u32(sum_sq_vec, 1) + - vgetq_lane_u32(sum_sq_vec, 2) + - vgetq_lane_u32(sum_sq_vec, 3); + uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0) + vgetq_lane_u32(sum_sq_vec, 1) + + vgetq_lane_u32(sum_sq_vec, 2) + vgetq_lane_u32(sum_sq_vec, 3); #endif // #ifdef __aarch64__ - for(; x < window_end_x; ++x) - { - auto data = static_cast(*(in_ptr + x)); - sum += data; - sum_sq += (data * data); - } + for (; x < window_end_x; ++x) + { + auto data = static_cast(*(in_ptr + x)); + sum += data; + sum_sq += (data * data); + } - const float mean = (static_cast(sum) / static_cast(input->info()->dimension(0))); - const float var = (static_cast(sum_sq) / static_cast(input->info()->dimension(0))) - (mean * mean); - const float stdev_inv = 1.0f / sqrtf(var + epsilon); - const float32x4_t v_scale = vdupq_n_f32(stdev_inv * output_inv_scale); - const float32x4_t v_offset = vdupq_n_f32(-mean * stdev_inv * output_inv_scale + output_offset); - for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t data = vld1q_u8(in_ptr + x); - float32x4_t db1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(data))))); - float32x4_t db2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(data))))); - float32x4_t db3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(data))))); - float32x4_t db4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(data))))); - db1 = clamp_v4f32(vaddq_f32(vmulq_f32(db1, v_scale), v_offset), quant_min_vec, quant_max_vec); - db2 = clamp_v4f32(vaddq_f32(vmulq_f32(db2, v_scale), v_offset), quant_min_vec, quant_max_vec); - db3 = clamp_v4f32(vaddq_f32(vmulq_f32(db3, v_scale), v_offset), quant_min_vec, quant_max_vec); - db4 = clamp_v4f32(vaddq_f32(vmulq_f32(db4, v_scale), v_offset), quant_min_vec, quant_max_vec); - const uint8x16_t out = fuse_shorts_u16(fuse_words_f32(db1, db2), fuse_words_f32(db3, db4)); - vst1q_u8(out_ptr + x, out); - } + const float mean = (static_cast(sum) / static_cast(input->info()->dimension(0))); + const float var = + (static_cast(sum_sq) / static_cast(input->info()->dimension(0))) - (mean * mean); + const float stdev_inv = 1.0f / sqrtf(var + epsilon); + const float32x4_t v_scale = vdupq_n_f32(stdev_inv * output_inv_scale); + const float32x4_t v_offset = vdupq_n_f32(-mean * stdev_inv * output_inv_scale + output_offset); + for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t data = vld1q_u8(in_ptr + x); + float32x4_t db1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(data))))); + float32x4_t db2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(data))))); + float32x4_t db3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(data))))); + float32x4_t db4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(data))))); + db1 = clamp_v4f32(vaddq_f32(vmulq_f32(db1, v_scale), v_offset), quant_min_vec, quant_max_vec); + db2 = clamp_v4f32(vaddq_f32(vmulq_f32(db2, v_scale), v_offset), quant_min_vec, quant_max_vec); + db3 = clamp_v4f32(vaddq_f32(vmulq_f32(db3, v_scale), v_offset), quant_min_vec, quant_max_vec); + db4 = clamp_v4f32(vaddq_f32(vmulq_f32(db4, v_scale), v_offset), quant_min_vec, quant_max_vec); + const uint8x16_t out = fuse_shorts_u16(fuse_words_f32(db1, db2), fuse_words_f32(db3, db4)); + vst1q_u8(out_ptr + x, out); + } - for(; x < window_end_x; ++x) - { - auto data = static_cast(*(in_ptr + x)); - const uint8_t res = data * (stdev_inv * output_inv_scale) + (-mean * stdev_inv * output_inv_scale + output_offset); - *(out_ptr + x) = res; - } - }, - input_itr, output_itr); + for (; x < window_end_x; ++x) + { + auto data = static_cast(*(in_ptr + x)); + const uint8_t res = + data * (stdev_inv * output_inv_scale) + (-mean * stdev_inv * output_inv_scale + output_offset); + *(out_ptr + x) = res; + } + }, + input_itr, output_itr); } } // namespace cpu } // namespace arm_compute -- cgit v1.2.1