diff options
author | Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> | 2023-09-27 17:46:17 +0100 |
---|---|---|
committer | felixjohnny.thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> | 2023-09-28 12:08:05 +0000 |
commit | afd38f0c617d6f89b2b4532c6c44f116617e2b6f (patch) | |
tree | 03bc7d5a762099989b16a656fa8d397b490ed70e /src/cpu/kernels/cast/generic/neon | |
parent | bdcb4c148ee2fdeaaddf4cf1e57bbb0de02bb894 (diff) | |
download | ComputeLibrary-afd38f0c617d6f89b2b4532c6c44f116617e2b6f.tar.gz |
Apply clang-format on repository
Code is formatted as per a revised clang format configuration
file (not part of this delivery). Version 14.0.6 is used.
Exclusion List:
- files with .cl extension
- files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...)
And the following directories
- compute_kernel_writer/validation/
- tests/
- include/
- src/core/NEON/kernels/convolution/
- src/core/NEON/kernels/arm_gemm/
- src/core/NEON/kernels/arm_conv/
- data/
There will be a follow-up for formatting of .cl files and the
files under tests/ and compute_kernel_writer/validation/.
Signed-off-by: Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>
Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Diffstat (limited to 'src/cpu/kernels/cast/generic/neon')
-rw-r--r-- | src/cpu/kernels/cast/generic/neon/fp16.cpp | 393 |
1 file changed, 187 insertions, 206 deletions
diff --git a/src/cpu/kernels/cast/generic/neon/fp16.cpp b/src/cpu/kernels/cast/generic/neon/fp16.cpp index 6cd0c8500b..2897f4b242 100644 --- a/src/cpu/kernels/cast/generic/neon/fp16.cpp +++ b/src/cpu/kernels/cast/generic/neon/fp16.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/CPP/CPPTypes.h" #include "arm_compute/core/TensorInfo.h" -#include "src/cpu/kernels/CpuCastKernel.h" + #include "src/cpu/kernels/cast/list.h" +#include "src/cpu/kernels/CpuCastKernel.h" #include "support/SaturateCast.h" #include "arm_neon.h" @@ -35,7 +36,8 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_signed_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_qasymm8_signed_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -49,42 +51,39 @@ void neon_qasymm8_signed_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + int x = window_start_x; - const int16x8x2_t texels = + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; - vst1q_f16(dst_ptr + x, 
vcvtq_f16_s16(texels.val[0])); - vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); - } + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); - } - }, - src, dst); + const int16x8x2_t texels = {{vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; + vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); } -void neon_s32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_s32_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -98,44 +97,41 @@ void neon_s32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vcvtq_f32_s32(vld1q_s32(src_ptr + x)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)), - 
vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12)) - } - }; - - vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); - vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); - } + const float32x4x4_t texels = { + {vcvtq_f32_s32(vld1q_s32(src_ptr + x)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)), + vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); + vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); } -void neon_fp32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_fp32_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -149,44 +145,40 @@ void neon_fp32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t 
texels = + const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_f32(src_ptr + x), - vld1q_f32(src_ptr + x + 4), - vld1q_f32(src_ptr + x + 8), - vld1q_f32(src_ptr + x + 12) - } - }; - - vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); - vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); - } + const float32x4x4_t texels = {{vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4), + vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12)}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); + vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); } -void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_fp16_to_other_dt_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -200,142 +192,133 @@ void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const Thread ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::QASYMM8_SIGNED: { /* Down-conversion F16 -> 
QASYMM8_SIGNED (Always saturating) */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float16x8x2_t texels = {{ vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8), - } - }; + }}; - vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1])))); - } + vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), + vqmovn_s16(vcvtq_s16_f16(texels.val[1])))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::QASYMM8: case DataType::U8: { /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - 
window_step_x); x += window_step_x) { - { + const float16x8x2_t texels = {{ vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8), - } - }; + }}; - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); - } + vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), + vqmovun_s16(vcvtq_s16_f16(texels.val[1])))); + } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F32: { /* Up-conversion F16 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_f16(src_ptr + x), - vld1q_f16(src_ptr + x + 8) - } - }; - vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0]))); - vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0]))); - vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1]))); - vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); - } - }, - src, dst); + const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}}; + 
vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0]))); + vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0]))); + vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1]))); + vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::S32: { /* Up-conversion F16 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_f16(src_ptr + x), - vld1q_f16(src_ptr + x + 8) - } - }; - - vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])))); - vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])))); - vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])))); - vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); - } - }, - src, dst); + const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}}; + + vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])))); + vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])))); + vst1q_s32(dst_ptr + x + 8, 
vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])))); + vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); break; } default: @@ -343,7 +326,8 @@ void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const Thread } } -void neon_u8_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_u8_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -357,40 +341,37 @@ void neon_u8_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo & ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); /* Up-conversion U8 -> F16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; - vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); - vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); - } + const uint8x16_t 
texels_u8 = vld1q_u8(src_ptr + x); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); - } - }, - src, dst); + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; + vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); return; } |