aboutsummaryrefslogtreecommitdiff
path: root/src/cpu/kernels/cast/generic/neon/fp16.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/cpu/kernels/cast/generic/neon/fp16.cpp')
-rw-r--r--src/cpu/kernels/cast/generic/neon/fp16.cpp393
1 files changed, 187 insertions, 206 deletions
diff --git a/src/cpu/kernels/cast/generic/neon/fp16.cpp b/src/cpu/kernels/cast/generic/neon/fp16.cpp
index 6cd0c8500b..2897f4b242 100644
--- a/src/cpu/kernels/cast/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/cast/generic/neon/fp16.cpp
@@ -25,8 +25,9 @@
#include "arm_compute/core/CPP/CPPTypes.h"
#include "arm_compute/core/TensorInfo.h"
-#include "src/cpu/kernels/CpuCastKernel.h"
+
#include "src/cpu/kernels/cast/list.h"
+#include "src/cpu/kernels/CpuCastKernel.h"
#include "support/SaturateCast.h"
#include "arm_neon.h"
@@ -35,7 +36,8 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qasymm8_signed_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_qasymm8_signed_to_fp16_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_UNUSED(_policy);
@@ -49,42 +51,39 @@ void neon_qasymm8_signed_to_fp16_cast(const ITensor *_src, ITensor *_dst, const
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src(_src, win);
Iterator dst(_dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+ int x = window_start_x;
- const int16x8x2_t texels =
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vmovl_s8(vget_low_s8(texels_s8)),
- vmovl_s8(vget_high_s8(texels_s8))
- }
- };
- vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
- vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
- }
+ const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ const int16x8x2_t texels = {{vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
+ vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
+ vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
-void neon_s32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_s32_to_fp16_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_UNUSED(_policy);
@@ -98,44 +97,41 @@ void neon_s32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src(_src, win);
Iterator dst(_dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float32x4x4_t texels =
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vcvtq_f32_s32(vld1q_s32(src_ptr + x)),
- vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)),
- vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)),
- vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))
- }
- };
-
- vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
- vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
- }
+ const float32x4x4_t texels = {
+ {vcvtq_f32_s32(vld1q_s32(src_ptr + x)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)),
+ vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
+ vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
-void neon_fp32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_fp32_to_fp16_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_UNUSED(_policy);
@@ -149,44 +145,40 @@ void neon_fp32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src(_src, win);
Iterator dst(_dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float32x4x4_t texels =
+ const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_f32(src_ptr + x),
- vld1q_f32(src_ptr + x + 4),
- vld1q_f32(src_ptr + x + 8),
- vld1q_f32(src_ptr + x + 12)
- }
- };
-
- vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
- vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
- }
+ const float32x4x4_t texels = {{vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4),
+ vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12)}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
+ vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
-void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_fp16_to_other_dt_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_UNUSED(_policy);
@@ -200,142 +192,133 @@ void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const Thread
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src(_src, win);
Iterator dst(_dst, win);
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
case DataType::QASYMM8_SIGNED:
{
/* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const float16x8x2_t texels = {{
vld1q_f16(src_ptr + x),
vld1q_f16(src_ptr + x + 8),
- }
- };
+ }};
- vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
- }
+ vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])),
+ vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::QASYMM8:
case DataType::U8:
{
/* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const float16x8x2_t texels = {{
vld1q_f16(src_ptr + x),
vld1q_f16(src_ptr + x + 8),
- }
- };
+ }};
- vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
+ vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])),
+ vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
+ }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::F32:
{
/* Up-conversion F16 -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_f16(src_ptr + x),
- vld1q_f16(src_ptr + x + 8)
- }
- };
- vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0])));
- vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0])));
- vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1])));
- vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
- }
- },
- src, dst);
+ const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}};
+ vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0])));
+ vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0])));
+ vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1])));
+ vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::S32:
{
/* Up-conversion F16 -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_f16(src_ptr + x),
- vld1q_f16(src_ptr + x + 8)
- }
- };
-
- vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0]))));
- vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0]))));
- vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1]))));
- vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}};
+
+ vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0]))));
+ vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0]))));
+ vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1]))));
+ vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
default:
@@ -343,7 +326,8 @@ void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const Thread
}
}
-void neon_u8_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_u8_to_fp16_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_UNUSED(_policy);
@@ -357,40 +341,37 @@ void neon_u8_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src(_src, win);
Iterator dst(_dst, win);
/* Up-conversion U8 -> F16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
- const int16x8x2_t texels =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
- }
- };
- vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
- vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
- }
+ const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
+ vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
+ vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
return;
}