diff options
author | Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> | 2023-09-27 17:46:17 +0100 |
---|---|---|
committer | felixjohnny.thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> | 2023-09-28 12:08:05 +0000 |
commit | afd38f0c617d6f89b2b4532c6c44f116617e2b6f (patch) | |
tree | 03bc7d5a762099989b16a656fa8d397b490ed70e /src/cpu/kernels/cast/generic/neon | |
parent | bdcb4c148ee2fdeaaddf4cf1e57bbb0de02bb894 (diff) | |
download | ComputeLibrary-afd38f0c617d6f89b2b4532c6c44f116617e2b6f.tar.gz |
Apply clang-format on repository
Code is formatted as per a revised clang format configuration
file (not part of this delivery). Version 14.0.6 is used.
Exclusion List:
- files with .cl extension
- files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...)
And the following directories
- compute_kernel_writer/validation/
- tests/
- include/
- src/core/NEON/kernels/convolution/
- src/core/NEON/kernels/arm_gemm/
- src/core/NEON/kernels/arm_conv/
- data/
There will be a follow-up for formatting of .cl files and the
files under tests/ and compute_kernel_writer/validation/.
Signed-off-by: Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>
Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Diffstat (limited to 'src/cpu/kernels/cast/generic/neon')
-rw-r--r-- | src/cpu/kernels/cast/generic/neon/fp16.cpp | 393 |
1 file changed, 187 insertions, 206 deletions
diff --git a/src/cpu/kernels/cast/generic/neon/fp16.cpp b/src/cpu/kernels/cast/generic/neon/fp16.cpp index 6cd0c8500b..2897f4b242 100644 --- a/src/cpu/kernels/cast/generic/neon/fp16.cpp +++ b/src/cpu/kernels/cast/generic/neon/fp16.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/CPP/CPPTypes.h" #include "arm_compute/core/TensorInfo.h" -#include "src/cpu/kernels/CpuCastKernel.h" + #include "src/cpu/kernels/cast/list.h" +#include "src/cpu/kernels/CpuCastKernel.h" #include "support/SaturateCast.h" #include "arm_neon.h" @@ -35,7 +36,8 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_signed_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_qasymm8_signed_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -49,42 +51,39 @@ void neon_qasymm8_signed_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + int x = window_start_x; - const int16x8x2_t texels = + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; - vst1q_f16(dst_ptr + x, 
vcvtq_f16_s16(texels.val[0])); - vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); - } + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); - } - }, - src, dst); + const int16x8x2_t texels = {{vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; + vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); } -void neon_s32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_s32_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -98,44 +97,41 @@ void neon_s32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vcvtq_f32_s32(vld1q_s32(src_ptr + x)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)), - 
vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12)) - } - }; - - vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); - vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); - } + const float32x4x4_t texels = { + {vcvtq_f32_s32(vld1q_s32(src_ptr + x)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)), + vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); + vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); } -void neon_fp32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_fp32_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -149,44 +145,40 @@ void neon_fp32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t 
texels = + const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_f32(src_ptr + x), - vld1q_f32(src_ptr + x + 4), - vld1q_f32(src_ptr + x + 8), - vld1q_f32(src_ptr + x + 12) - } - }; - - vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); - vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); - } + const float32x4x4_t texels = {{vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4), + vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12)}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); + vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); } -void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_fp16_to_other_dt_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -200,142 +192,133 @@ void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const Thread ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::QASYMM8_SIGNED: { /* Down-conversion F16 -> 
QASYMM8_SIGNED (Always saturating) */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float16x8x2_t texels = {{ vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8), - } - }; + }}; - vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1])))); - } + vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), + vqmovn_s16(vcvtq_s16_f16(texels.val[1])))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::QASYMM8: case DataType::U8: { /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - 
window_step_x); x += window_step_x) { - { + const float16x8x2_t texels = {{ vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8), - } - }; + }}; - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); - } + vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), + vqmovun_s16(vcvtq_s16_f16(texels.val[1])))); + } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F32: { /* Up-conversion F16 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_f16(src_ptr + x), - vld1q_f16(src_ptr + x + 8) - } - }; - vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0]))); - vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0]))); - vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1]))); - vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); - } - }, - src, dst); + const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}}; + 
vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0]))); + vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0]))); + vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1]))); + vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::S32: { /* Up-conversion F16 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_f16(src_ptr + x), - vld1q_f16(src_ptr + x + 8) - } - }; - - vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])))); - vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])))); - vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])))); - vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); - } - }, - src, dst); + const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}}; + + vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])))); + vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])))); + vst1q_s32(dst_ptr + x + 8, 
vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])))); + vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); break; } default: @@ -343,7 +326,8 @@ void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const Thread } } -void neon_u8_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_u8_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -357,40 +341,37 @@ void neon_u8_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo & ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); /* Up-conversion U8 -> F16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; - vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); - vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); - } + const uint8x16_t 
texels_u8 = vld1q_u8(src_ptr + x); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); - } - }, - src, dst); + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; + vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); return; } |