diff options
Diffstat (limited to 'src/cpu/kernels/CpuCastKernel.cpp')
-rw-r--r-- | src/cpu/kernels/CpuCastKernel.cpp | 1346 |
1 files changed, 626 insertions, 720 deletions
diff --git a/src/cpu/kernels/CpuCastKernel.cpp b/src/cpu/kernels/CpuCastKernel.cpp index 764a1ec71c..05c7742b03 100644 --- a/src/cpu/kernels/CpuCastKernel.cpp +++ b/src/cpu/kernels/CpuCastKernel.cpp @@ -28,16 +28,16 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEFixedPoint.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/common/Registrars.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/SaturateCast.h" - #include "src/cpu/kernels/cast/list.h" +#include "support/SaturateCast.h" namespace arm_compute { @@ -47,38 +47,30 @@ namespace kernels { namespace { -static const std::vector<CpuCastKernel::CastKernel> available_kernels = -{ - { - "neon_qs8_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast) - }, - { - "neon_qu8_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast) - }, - { - "neon_u8_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast) - }, - { - "neon_fp16_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast) - }, - { - "neon_fp32_to_fp16_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast) - }, - { - "neon_s32_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast) - }, +static const std::vector<CpuCastKernel::CastKernel> available_kernels = { + {"neon_qs8_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast)}, + {"neon_qu8_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)}, + {"neon_u8_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)}, + {"neon_fp16_cast", + [](const CastDataTypeISASelectorData &data) { return data.src_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast)}, + {"neon_fp32_to_fp16_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast)}, + {"neon_s32_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast)}, }; Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) @@ -88,57 +80,67 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON(src == dst); #ifdef __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::F16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32, DataType::S32, DataType::S64, DataType::U64); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::F16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::U32, DataType::S32, DataType::F32, DataType::S64); #else // __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::F16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::F16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::U32, DataType::S32, DataType::F32); #endif // __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32 - && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && + (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32 && + dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 - && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && + (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 && + dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::F32), "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 - && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && + (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 && + dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::F32), "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && + (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32), "Only data_types supported [in] U16 -> [out] U8, U32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32), "Only data_types supported [in] S16 -> [out] U8, S32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 - && dst->data_type() != DataType::U8 - && dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::U8 && + dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32), "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 - && dst->data_type() != DataType::F16 - && dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8), "Only data_types supported [in] F32 -> [out] QASYMM8, F16, S32, U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 - && dst->data_type() != DataType::F16 - && dst->data_type() != DataType::F32 - && dst->data_type() != DataType::U8 - && dst->data_type() != DataType::S64), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8 && + dst->data_type() != DataType::S64), "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8, S64"); #ifdef __aarch64__ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S64 && dst->data_type() != DataType::F32, @@ -149,7 +151,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver #endif // __aarch64__ // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } @@ -193,15 +195,8 @@ inline void internal_neon_convert(const T1 *src_ptr, T2 *dst_ptr) template <> inline void internal_neon_convert<int32_t, int64_t>(const int32_t *src_ptr, int64_t *dst_ptr) { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr), - vld1q_s32(src_ptr + 4), - vld1q_s32(src_ptr + 8), - vld1q_s32(src_ptr + 12) - } - }; + const int32x4x4_t texels = { + {vld1q_s32(src_ptr), vld1q_s32(src_ptr + 4), vld1q_s32(src_ptr + 8), vld1q_s32(src_ptr + 12)}}; vst1q_s64(dst_ptr, vmovl_s32(vget_low_s32(texels.val[0]))); vst1q_s64(dst_ptr + 2, vmovl_s32(vget_high_s32(texels.val[0]))); vst1q_s64(dst_ptr + 4, vmovl_s32(vget_low_s32(texels.val[1]))); @@ -215,33 +210,14 @@ inline void internal_neon_convert<int32_t, int64_t>(const int32_t *src_ptr, int6 template <> inline void internal_neon_convert<int64_t, float>(const int64_t *src_ptr, float *dst_ptr) { - const float64x2x4_t texels0 = - { - { - vcvtq_f64_s64(vld1q_s64(src_ptr)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 2)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 4)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 6)) - } - }; - const float64x2x4_t texels1 = - { - { - vcvtq_f64_s64(vld1q_s64(src_ptr + 8)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 10)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 12)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 14)) - } - }; - const float32x4x4_t texels = - { - { - vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), - vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), - vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), - vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3])) - } - }; + const float64x2x4_t texels0 = {{vcvtq_f64_s64(vld1q_s64(src_ptr)), vcvtq_f64_s64(vld1q_s64(src_ptr + 2)), + vcvtq_f64_s64(vld1q_s64(src_ptr + 4)), vcvtq_f64_s64(vld1q_s64(src_ptr + 6))}}; + const float64x2x4_t texels1 = {{vcvtq_f64_s64(vld1q_s64(src_ptr + 8)), vcvtq_f64_s64(vld1q_s64(src_ptr + 10)), + vcvtq_f64_s64(vld1q_s64(src_ptr + 12)), vcvtq_f64_s64(vld1q_s64(src_ptr + 14))}}; + const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), + vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), + vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), + vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}}; vst1q_f32(dst_ptr, texels.val[0]); vst1q_f32(dst_ptr + 4, texels.val[1]); vst1q_f32(dst_ptr + 8, texels.val[2]); @@ -251,34 +227,15 @@ inline void internal_neon_convert<int64_t, float>(const int64_t *src_ptr, float template <> inline void internal_neon_convert<uint64_t, float>(const uint64_t *src_ptr, float *dst_ptr) { - const float64x2x4_t texels0 = - { - { - vcvtq_f64_u64(vld1q_u64(src_ptr)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 2)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 4)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 6)) - } - }; - const float64x2x4_t texels1 = - { - { - vcvtq_f64_u64(vld1q_u64(src_ptr + 8)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 10)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 12)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 14)) - } - }; + const float64x2x4_t texels0 = {{vcvtq_f64_u64(vld1q_u64(src_ptr)), vcvtq_f64_u64(vld1q_u64(src_ptr + 2)), + vcvtq_f64_u64(vld1q_u64(src_ptr + 4)), vcvtq_f64_u64(vld1q_u64(src_ptr + 6))}}; + const float64x2x4_t texels1 = {{vcvtq_f64_u64(vld1q_u64(src_ptr + 8)), vcvtq_f64_u64(vld1q_u64(src_ptr + 10)), + vcvtq_f64_u64(vld1q_u64(src_ptr + 12)), vcvtq_f64_u64(vld1q_u64(src_ptr + 14))}}; - const float32x4x4_t texels = - { - { - vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), - vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), - vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), - vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3])) - } - }; + const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), + vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), + vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), + vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}}; vst1q_f32(dst_ptr, texels.val[0]); vst1q_f32(dst_ptr + 4, texels.val[1]); @@ -287,23 +244,26 @@ inline void internal_neon_convert<uint64_t, float>(const uint64_t *src_ptr, floa } template <typename T1, typename T2> -inline void convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x) +inline void +convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const T1 *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<T2 *>(dst.ptr()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - internal_neon_convert<T1, T2>(src_ptr + x, dst_ptr + x); - } - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(dst_ptr + x) = static_cast<T2>(*(src_ptr + x)); - } - }, - src, dst); + const auto src_ptr = reinterpret_cast<const T1 *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<T2 *>(dst.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + internal_neon_convert<T1, T2>(src_ptr + x, dst_ptr + x); + } + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<T2>(*(src_ptr + x)); + } + }, + src, dst); } } // namespace #endif // __aarch64__ @@ -325,21 +285,22 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); /*ukernel runs only when using fp16, so we validate it isn't a nullptr only before using it */ - const auto *uk = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ _src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = CpuCastKernel::get_implementation( + CastDataTypeISASelectorData{_src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa()}); - switch(_src->info()->data_type()) + switch (_src->info()->data_type()) { #ifdef __aarch64__ case DataType::U64: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::F32: { @@ -353,7 +314,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr } case DataType::S64: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::F32: { @@ -369,111 +330,102 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::QASYMM8_SIGNED: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::S16: { /* Up-conversion QASYMM8_SIGNED -> S16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr()); + int x = window_start_x; - const int16x8x2_t texels = + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); - vst1q_s16(dst_ptr + x, texels.val[0]); - vst1q_s16(dst_ptr + x + 8, texels.val[1]); - } + const int16x8x2_t texels = { + {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s16(dst_ptr + x, texels.val[0]); + vst1q_s16(dst_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::S32: { /* Up-conversion QASYMM8_SIGNED -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + int x = window_start_x; - const int16x8x2_t texels = + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); - vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); - vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); - } + const int16x8x2_t texels = { + {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F32: { /* Up-conversion QASYMM8_SIGNED -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; - vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); - vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); - } - }, - src, dst); + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + + const int16x8x2_t texels = { + {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; + vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F16: @@ -492,111 +444,102 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::QASYMM8: case DataType::U8: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::S16: { /* Up-conversion U8 -> S16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - vst1q_s16(dst_ptr + x, texels.val[0]); - vst1q_s16(dst_ptr + x + 8, texels.val[1]); - } + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s16(dst_ptr + x, texels.val[0]); + vst1q_s16(dst_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::S32: { /* Up-conversion U8 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); - vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); - } + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F32: { /* Up-conversion U8 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; - vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); - vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x)); - } - }, - src, dst); + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; + vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F16: @@ -609,35 +552,32 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::U16: { /* Up-conversion U8 -> U16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr()); - const uint16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_u8(vget_low_u8(texels_u8)), - vmovl_u8(vget_high_u8(texels_u8)) - } - }; + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - vst1q_u16(dst_ptr + x, texels.val[0]); - vst1q_u16(dst_ptr + x + 8, texels.val[1]); - } + const uint16x8x2_t texels = { + {vmovl_u8(vget_low_u8(texels_u8)), vmovl_u8(vget_high_u8(texels_u8))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_u16(dst_ptr + x, texels.val[0]); + vst1q_u16(dst_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x)); + } + }, + src, dst); break; } default: @@ -647,177 +587,154 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr } case DataType::S16: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::QASYMM8_SIGNED: { /* Down-conversion S16 -> QASYMM8_SIGNED */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1]))); - } + vst1q_s8(dst_ptr + x, + vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1]))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1]))); - } + vst1q_s8(dst_ptr + x, + vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1]))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); } break; } case DataType::U8: { /* Down-conversion S16 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1]))); - } + vst1q_u8(dst_ptr + x, + vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1]))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; - - vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])), - vmovn_u16(vreinterpretq_u16_s16(texels.val[1])))); - } + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])), + vmovn_u16(vreinterpretq_u16_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); } break; } case DataType::S32: { /* Up-conversion S16 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = - { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); - const int32x4x4_t texels_s32 = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s16(vget_low_s16(texels.val[0])), - vmovl_s16(vget_high_s16(texels.val[0])), - vmovl_s16(vget_low_s16(texels.val[1])), - vmovl_s16(vget_high_s16(texels.val[1])) - } - }; + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - vst1q_s32(dst_ptr + x, texels_s32.val[0]); - vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]); - vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]); - vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]); - } + const int32x4x4_t texels_s32 = { + {vmovl_s16(vget_low_s16(texels.val[0])), vmovl_s16(vget_high_s16(texels.val[0])), + vmovl_s16(vget_low_s16(texels.val[1])), vmovl_s16(vget_high_s16(texels.val[1]))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s32(dst_ptr + x, texels_s32.val[0]); + vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]); + vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]); + vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); break; } default: @@ -828,104 +745,92 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::U16: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::U8: { /* Down-conversion U16 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_u16(src_ptr + x), - vld1q_u16(src_ptr + x + 8) - } - }; + const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}}; - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1]))); - } + vst1q_u8(dst_ptr + x, + vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1]))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint16x8x2_t texels = - { - { - vld1q_u16(src_ptr + x), - vld1q_u16(src_ptr + x + 8) - } - }; + const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1]))); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x)); - } + vst1q_u8(dst_ptr + x, + vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1]))); + } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); } break; } case DataType::U32: { /* Up-conversion U16 -> U32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_u16(src_ptr + x), - vld1q_u16(src_ptr + x + 8) - } - }; - - vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0]))); - vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0]))); - vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1]))); - vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1]))); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x)); - } + const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}}; - }, - src, dst); + vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0]))); + vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0]))); + vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1]))); + vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1]))); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x)); + } + }, + src, dst); break; } default: @@ -941,7 +846,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr break; } case DataType::F32: - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::F16: { @@ -953,105 +858,110 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::S32: { /* Conversion F32 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float32x4x4_t texels = {{ vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4), vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12), - } - }; + }}; - vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0])); - vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1])); - vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2])); - vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3])); - } + vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0])); + vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1])); + vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2])); + vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3])); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::QASYMM8: case DataType::U8: { /* Down-conversion F32 -> U8 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float32x4x4_t texels = {{ vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4), vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12), - } - }; - - vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1]))))); - vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3]))))); - } + }}; + + vst1_u8(dst_ptr + x, + vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), + vqmovun_s32(vcvtq_s32_f32(texels.val[1]))))); + vst1_u8(dst_ptr + x + 8, + vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), + vqmovun_s32(vcvtq_s32_f32(texels.val[3]))))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::QASYMM8_SIGNED: { /* Down-conversion F32 -> QASYMM8_SIGNED */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float32x4x4_t texels = {{ vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4), vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12), - } - }; - - vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1]))))); - vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3]))))); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); - } - }, - src, dst); + }}; + + vst1_s8(dst_ptr + x, + vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), + vqmovn_s32(vcvtq_s32_f32(texels.val[1]))))); + vst1_s8(dst_ptr + x + 8, + vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), + vqmovn_s32(vcvtq_s32_f32(texels.val[3]))))); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); break; } @@ -1060,7 +970,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr } break; case DataType::S32: - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { #if __aarch64__ case DataType::S64: @@ -1079,104 +989,102 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::F32: { /* Conversion S32 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int32x4x4_t texels = {{ vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), vld1q_s32(src_ptr + x + 8), vld1q_s32(src_ptr + x + 12), - } - }; + }}; - vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0])); - vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1])); - vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2])); - vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3])); - } + vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0])); + vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1])); + vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2])); + vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3])); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::QASYMM8_SIGNED: { /* Down-conversion S32 -> QASYMM8_SIGNED */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int32x4x4_t texels = {{ vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), vld1q_s32(src_ptr + x + 8), vld1q_s32(src_ptr + x + 12), - } - }; - vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1])))); - vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3])))); - } + }}; + vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), + vqmovn_s32(texels.val[1])))); + vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), + vqmovn_s32(texels.val[3])))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12) - } - }; + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); - vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1])))); - vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3])))); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12)}}; + + vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), + vmovn_s32(texels.val[1])))); + vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), + vmovn_s32(texels.val[3])))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); } break; } @@ -1184,68 +1092,66 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::U8: { /* Down-conversion S32 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12) - } - }; - vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1])))); - vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3])))); - } + const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12)}}; + vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), + vqmovun_s32(texels.val[1])))); + vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), + vqmovun_s32(texels.val[3])))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12) - } - }; + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1]))))); - vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3]))))); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12)}}; + + vst1_u8(dst_ptr + x, + vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), + vmovn_u32(vreinterpretq_u32_s32(texels.val[1]))))); + vst1_u8(dst_ptr + x + 8, + vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), + vmovn_u32(vreinterpretq_u32_s32(texels.val[3]))))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); } break; } |