author     Pablo Marquez Tello <pablo.tello@arm.com>    2021-05-04 17:23:09 +0100
committer  Michalis Spyrou <michalis.spyrou@arm.com>    2021-05-06 14:19:49 +0000
commit     c4c595a2072180454836e909d2fe1030eed4dca1
tree       56202d1ec0ce3119e308dc6a6dd1868a05869513
parent     5b1142525f97a1a249a4799dd05fd9fea6e4b1e6
download   ComputeLibrary-c4c595a2072180454836e909d2fe1030eed4dca1.tar.gz
Fix performance issues in NEReduction
* Revert "Workaround for compiler error in gcc-9.2 and 9.3"
  This reverts commit e81825bf68ebfce21f6839fa59ddb7e22884a206.
* Resolves COMPMID-4462
* Solves issue when using GCC9.2

Change-Id: Id17a5eadb10083ea10665f1f42f21a3af3cafd36
Signed-off-by: Pablo Marquez Tello <pablo.tello@arm.com>
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5587
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  src/core/NEON/kernels/NEReductionOperationKernel.cpp | 498
1 file changed, 229 insertions(+), 269 deletions(-)
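
The diff below removes the vprocess()/leftover_process() helpers that RedOpX_quantized had been split into as a GCC 9.2/9.3 workaround and folds their bodies back into operator(), and it hoists the vector accumulators of RedOpYZW_quantized out of the per-row loop. The following standalone C++ sketch is not Arm Compute Library code: accumulate_step, sum_with_helper and sum_inlined are hypothetical names, and plain scalar accumulators stand in for the NEON wrapper vectors. It only illustrates why this kind of change can matter: passing many accumulators by reference into a helper on every loop iteration can force the compiler to keep them in memory across the call, while inlining the body into the hot loop lets them stay in registers.

    // Minimal standalone sketch (not the kernel's actual NEON code) of the
    // pattern the patch applies: fold a per-iteration helper that takes many
    // accumulator references back into the hot loop.
    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <numeric>
    #include <vector>

    // "Before": a helper taking the accumulators by reference, mirroring the
    // vprocess() split that the patch reverts.
    static void accumulate_step(const uint8_t *ptr, uint32_t &acc0, uint32_t &acc1,
                                uint32_t &acc2, uint32_t &acc3)
    {
        acc0 += ptr[0];
        acc1 += ptr[1];
        acc2 += ptr[2];
        acc3 += ptr[3];
    }

    static uint32_t sum_with_helper(const std::vector<uint8_t> &in)
    {
        uint32_t acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
        size_t   x    = 0;
        for(; x + 4 <= in.size(); x += 4)
        {
            // Indirection the patch removes from the vectorized loop.
            accumulate_step(in.data() + x, acc0, acc1, acc2, acc3);
        }
        uint32_t res = acc0 + acc1 + acc2 + acc3;
        for(; x < in.size(); ++x) // left-over elements, as in the kernel
        {
            res += in[x];
        }
        return res;
    }

    // "After": the body lives directly in the loop (as the patch does for
    // RedOpX_quantized::operator()), and the accumulators are declared once
    // outside it (as the patch does for RedOpYZW_quantized).
    static uint32_t sum_inlined(const std::vector<uint8_t> &in)
    {
        uint32_t acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
        size_t   x    = 0;
        for(; x + 4 <= in.size(); x += 4)
        {
            acc0 += in[x + 0];
            acc1 += in[x + 1];
            acc2 += in[x + 2];
            acc3 += in[x + 3];
        }
        uint32_t res = acc0 + acc1 + acc2 + acc3;
        for(; x < in.size(); ++x)
        {
            res += in[x];
        }
        return res;
    }

    int main()
    {
        std::vector<uint8_t> data(1023);
        std::iota(data.begin(), data.end(), static_cast<uint8_t>(1));
        // Both variants compute the same sum; only the code shape differs.
        std::cout << sum_with_helper(data) << " == " << sum_inlined(data) << '\n';
        return 0;
    }

Both functions return the same result; the inlined form mirrors what the patch does inside the vectorized loop of the kernel below.
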
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 476391e1f7..2d6db764f4 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -605,252 +605,6 @@ struct RedOpX
template <typename T>
struct RedOpX_quantized
{
- using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
-
- using vtype = decltype(wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}));
- using stype = decltype(vdupq_n_f32(static_cast<float>(1.f)));
- using rtype = typename wrapper::traits::neon_vector<T, 16>::type;
-
- void vprocess(int x, const T *ptr, const ReductionOperation op, const UniformQuantizationInfo &iq_info,
- vtype &vec_res_value1, vtype &vec_res_value2, vtype &vec_res_value3, vtype &vec_res_value4,
-
- stype &vec_res_value1_f,
- stype &vec_res_value2_f,
- stype &vec_res_value3_f,
- stype &vec_res_value4_f,
- uint32x4x4_t &vec_res_idx,
-
- rtype &vec_res_value)
-
- {
- const auto vec_elements = wrapper::vloadq(ptr);
-
- switch(op)
- {
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- {
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
- vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
- vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
- vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
- break;
- }
- case ReductionOperation::PROD:
- {
- const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset);
- const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale);
-
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
- auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
- auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
- auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
-
- //de-quantize vec_elements
- temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
- temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
- temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
- temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
-
- vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
- vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
- vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
- vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
- break;
- }
- case ReductionOperation::ARG_IDX_MIN:
- {
- auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
- vec_res_value = temp_vec_res_value;
- break;
- }
- case ReductionOperation::ARG_IDX_MAX:
- {
- auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
- vec_res_value = temp_vec_res_value;
- break;
- }
- case ReductionOperation::MIN:
- {
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- break;
- }
- case ReductionOperation::MAX:
- {
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
- void leftover_process(int x, int window_end_x, const T *input_ptr, Iterator output, const ReductionOperation op, const TensorInfo in_info,
- const UniformQuantizationInfo &iq_info,
- vtype &vec_res_value1, vtype &vec_res_value2, vtype &vec_res_value3, vtype &vec_res_value4,
-
- stype &vec_res_value1_f,
- stype &vec_res_value2_f,
- stype &vec_res_value3_f,
- stype &vec_res_value4_f,
- uint32x4x4_t &vec_res_idx,
-
- rtype &vec_res_value)
-
- {
- switch(op)
- {
- case ReductionOperation::ARG_IDX_MIN:
- {
- auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
- auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- if(*(input_ptr + x) < res)
- {
- idx = x;
- res = *(input_ptr + x);
- }
- }
- *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
- break;
- }
- case ReductionOperation::ARG_IDX_MAX:
- {
- auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
- auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- if(*(input_ptr + x) > res)
- {
- idx = x;
- res = *(input_ptr + x);
- }
- }
- *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
- break;
- }
- case ReductionOperation::MIN:
- {
- auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
- }
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
- }
- case ReductionOperation::MAX:
- {
- auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
- }
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
- }
- case ReductionOperation::PROD:
- {
- auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f);
- carry_res = wrapper::vmul(carry_res, vec_res_value3_f);
- carry_res = wrapper::vmul(carry_res, vec_res_value4_f);
-
- float res = wrapper::vgetlane(carry_res, 0);
- res *= wrapper::vgetlane(carry_res, 1);
- res *= wrapper::vgetlane(carry_res, 2);
- res *= wrapper::vgetlane(carry_res, 3);
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- //de-quantize input
- if(std::is_same<T, uint8_t>::value)
- {
- res *= dequantize_qasymm8(*(input_ptr + x), iq_info);
- }
- else
- {
- res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info);
- }
- }
-
- //re-quantize result
- if(std::is_same<T, uint8_t>::value)
- {
- res = quantize_qasymm8(res, iq_info);
- }
- else
- {
- res = quantize_qasymm8_signed(res, iq_info);
- }
-
- *reinterpret_cast<T *>(output.ptr()) = static_cast<T>(res);
- break;
- }
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- {
- auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2);
- carry_res = wrapper::vadd(carry_res, vec_res_value3);
- carry_res = wrapper::vadd(carry_res, vec_res_value4);
-
-#ifdef ARM_COMPUTE_DEBUG_ENABLED
- const float res_f = wrapper::vgetlane(carry_res, 0) + wrapper::vgetlane(carry_res, 1) + wrapper::vgetlane(carry_res, 2) + wrapper::vgetlane(carry_res, 3);
- auto res = static_cast<int32_t>(res_f);
-#else // ARM_COMPUTE_DEBUG_ENABLED
- auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
- carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition);
- auto res = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0));
-#endif // ARM_COMPUTE_DEBUG_ENABLED
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res += *(input_ptr + x);
- }
-
- if(op == ReductionOperation::MEAN_SUM)
- {
- res /= static_cast<int32_t>(in_info.dimension(0));
- }
- else
- {
- // Subtract accumulated offsets
- res -= (in_info.dimension(0) - 1) * iq_info.offset;
- }
- *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
-
inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
{
using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
@@ -894,22 +648,215 @@ struct RedOpX_quantized
int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
- vprocess(x, input_ptr + x, op, iq_info,
- vec_res_value1, vec_res_value2, vec_res_value3, vec_res_value4,
- vec_res_value1_f,
- vec_res_value2_f,
- vec_res_value3_f,
- vec_res_value4_f,
- vec_res_idx, vec_res_value);
+ const auto vec_elements = wrapper::vloadq(input_ptr + x);
+ switch(op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ {
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+ vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+ vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+ vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+ vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset);
+ const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale);
+
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+ auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
+ auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
+ auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
+ auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
+
+ //de-quantize vec_elements
+ temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
+ temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
+ temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
+ temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
+
+ vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
+ vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
+ vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
+ vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
- leftover_process(x, window_end_x, input_ptr, output, op, in_info, iq_info,
- vec_res_value1, vec_res_value2, vec_res_value3, vec_res_value4,
- vec_res_value1_f,
- vec_res_value2_f,
- vec_res_value3_f,
- vec_res_value4_f,
- vec_res_idx, vec_res_value);
+ switch(op)
+ {
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ if(*(input_ptr + x) < res)
+ {
+ idx = x;
+ res = *(input_ptr + x);
+ }
+ }
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ if(*(input_ptr + x) > res)
+ {
+ idx = x;
+ res = *(input_ptr + x);
+ }
+ }
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
+ }
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
+ }
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f);
+ carry_res = wrapper::vmul(carry_res, vec_res_value3_f);
+ carry_res = wrapper::vmul(carry_res, vec_res_value4_f);
+
+ float res = wrapper::vgetlane(carry_res, 0);
+ res *= wrapper::vgetlane(carry_res, 1);
+ res *= wrapper::vgetlane(carry_res, 2);
+ res *= wrapper::vgetlane(carry_res, 3);
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ //de-quantize input
+ if(std::is_same<T, uint8_t>::value)
+ {
+ res *= dequantize_qasymm8(*(input_ptr + x), iq_info);
+ }
+ else
+ {
+ res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info);
+ }
+ }
+
+ //re-quantize result
+ if(std::is_same<T, uint8_t>::value)
+ {
+ res = quantize_qasymm8(res, iq_info);
+ }
+ else
+ {
+ res = quantize_qasymm8_signed(res, iq_info);
+ }
+
+ *reinterpret_cast<T *>(output.ptr()) = static_cast<T>(res);
+ break;
+ }
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ {
+ auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2);
+ carry_res = wrapper::vadd(carry_res, vec_res_value3);
+ carry_res = wrapper::vadd(carry_res, vec_res_value4);
+
+ auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
+ carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition);
+ auto res = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0));
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ res += *(input_ptr + x);
+ }
+
+ if(op == ReductionOperation::MEAN_SUM)
+ {
+ res /= static_cast<int32_t>(in_info.dimension(0));
+ }
+ else
+ {
+ // Subtract accumulated offsets
+ res -= (in_info.dimension(0) - 1) * iq_info.offset;
+ }
+ *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
},
input, output);
}
@@ -1237,6 +1184,19 @@ struct RedOpYZW_quantized
Iterator input(in, in_win_no_pad);
Iterator output(out, out_win_no_pad);
+ using vector_type = typename wrapper::traits::neon_bitvector<PromotedType, wrapper::traits::BitWidth::W128>::type;
+ using vector_type_f = typename wrapper::traits::neon_vector<float, 4>::type;
+
+ vector_type vec_res_value1{};
+ vector_type vec_res_value2{};
+ vector_type vec_res_value3{};
+ vector_type vec_res_value4{};
+
+ vector_type_f vec_res_value1_f{};
+ vector_type_f vec_res_value2_f{};
+ vector_type_f vec_res_value3_f{};
+ vector_type_f vec_res_value4_f{};
+
execute_window_loop(in_win_no_pad, [&](const Coordinates &)
{
const auto input_ptr = reinterpret_cast<T *>(input.ptr());
@@ -1246,15 +1206,15 @@ struct RedOpYZW_quantized
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
uint32x4x4_t vec_res_idx{ { 0 } };
- auto vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
- auto vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
- auto vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
- auto vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
-
- auto vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
- auto vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
- auto vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
- auto vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+ vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+ vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+ vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+
+ vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
auto vec_res_value = wrapper::vloadq(input_ptr + x);