From a26e166829f4d4c48864b1b7243e4e267373d0fd Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Wed, 4 Mar 2020 15:31:25 +0000 Subject: COMPMID-3203: Fix build failure with GCC 9.2 Rework convolve3x3 to avoid erroneous behavior by the register allocator. Signed-off-by: Georgios Pinitas Change-Id: Ifff2f4ae3a95b894462c7457ffba1f710cce0577 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2839 Tested-by: Arm Jenkins Reviewed-by: Gian Marco Iodice Comments-Addressed: Arm Jenkins --- .../NEDepthwiseConvolutionLayer3x3Kernel.cpp | 3 +- .../kernels/NEDirectConvolutionLayerKernel.cpp | 54 +--------------------- 2 files changed, 3 insertions(+), 54 deletions(-) (limited to 'src') diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp index 1dd05d2cf1..03b962291d 100644 --- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp @@ -116,8 +116,7 @@ public: { if(dilation == Size2D(1U, 1U)) { - auto vres = detail::convolve_3x3(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, stridex, input_offset); - detail::store_results<stridex>(p_out, vres); + detail::convolve_3x3<false>(in_top, in_mid, in_low, p_out, vw_r0, vw_r1, vw_r2, stridex, input_offset); } else { diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp index fe3eb92b1b..7f393d619c 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp @@ -321,32 +321,6 @@ public: } }; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -template <unsigned int stridex> -void accumulate_results(float16_t *buffer, const float16x8x2_t &values); - -template <> -void accumulate_results<1>(float16_t *buffer, const float16x8x2_t &values) -{ - vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0])); - vst1q_f16(buffer + 
8), values.val[1])); -} - -template <> -void accumulate_results<2>(float16_t *buffer, const float16x8x2_t &values) -{ - vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0])); -} - -template <> -void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values) -{ - vst1_f16(buffer, vadd_f16(vld1_f16(buffer), vget_low_f16(values.val[0]))); -} - -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - template <unsigned int stridex> float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, const float *m0, const float *m1, const float *m2, const float *m3, const float *m4); @@ -498,28 +472,6 @@ inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const return out; } -template <unsigned int stridex> -void accumulate_results(float *buffer, const float32x4x2_t &values); - -template <> -void accumulate_results<1>(float *buffer, const float32x4x2_t &values) -{ - vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0])); - vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1])); -} - -template <> -void accumulate_results<2>(float *buffer, const float32x4x2_t &values) -{ - vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0])); -} - -template <> -void accumulate_results<3>(float *buffer, const float32x4x2_t &values) -{ - vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0]))); -} - template class convolver_nhwc { @@ -718,8 +670,7 @@ public: for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) { - auto vres = convolve_3x3(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, stridex); - store_results<stridex>(p_out, vres); + convolve_3x3<false>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex); } } } @@ -743,8 +694,7 @@ public: for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_top += delta_input, in_mid += delta_input, in_low += delta_input, 
p_out += num_elems_written_per_iteration) { - auto vres = convolve_3x3(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, stridex); - accumulate_results<stridex>(p_out, vres); + convolve_3x3<true>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex); } } } -- cgit v1.2.1