diff options
author | Georgios Pinitas <georgios.pinitas@arm.com> | 2020-03-04 15:31:25 +0000 |
---|---|---|
committer | Georgios Pinitas <georgios.pinitas@arm.com> | 2020-03-10 17:04:36 +0000 |
commit | a26e166829f4d4c48864b1b7243e4e267373d0fd (patch) | |
tree | 66cc076a656025c1ccca65e07f86673551ae45ec /src/core/NEON | |
parent | 0cdbda5e51e6ef9e03017231e56ee85ede69bb9a (diff) | |
download | ComputeLibrary-a26e166829f4d4c48864b1b7243e4e267373d0fd.tar.gz |
COMPMID-3203: Fix build failure with GCC 9.2
Rework convolve3x3 to avoid erroneous behavior by the register allocator.
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: Ifff2f4ae3a95b894462c7457ffba1f710cce0577
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2839
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON')
-rw-r--r-- | src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp | 3 | ||||
-rw-r--r-- | src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp | 54 |
2 files changed, 3 insertions, 54 deletions
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index 1dd05d2cf1..03b962291d 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -116,8 +116,7 @@ public:
 {
     if(dilation == Size2D(1U, 1U))
     {
-        auto vres = detail::convolve_3x3(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, stridex, input_offset);
-        detail::store_results<stridex>(p_out, vres);
+        detail::convolve_3x3<false>(in_top, in_mid, in_low, p_out, vw_r0, vw_r1, vw_r2, stridex, input_offset);
     }
     else
     {
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index fe3eb92b1b..7f393d619c 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -321,32 +321,6 @@ public:
     }
 };

-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-template <unsigned int stridex>
-void accumulate_results(float16_t *buffer, const float16x8x2_t &values);
-
-template <>
-void accumulate_results<1>(float16_t *buffer, const float16x8x2_t &values)
-{
-    vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0]));
-    vst1q_f16(buffer + 8, vaddq_f16(vld1q_f16(buffer + 8), values.val[1]));
-}
-
-template <>
-void accumulate_results<2>(float16_t *buffer, const float16x8x2_t &values)
-{
-    vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0]));
-}
-
-template <>
-void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values)
-{
-    vst1_f16(buffer, vadd_f16(vld1_f16(buffer), vget_low_f16(values.val[0])));
-}
-
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
 template <unsigned int stridex>
 float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
                            const float *m0, const float *m1, const float *m2, const float *m3, const float *m4);
@@ -498,28 +472,6 @@ inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const
     return out;
 }

-template <unsigned int stridex>
-void accumulate_results(float *buffer, const float32x4x2_t &values);
-
-template <>
-void accumulate_results<1>(float *buffer, const float32x4x2_t &values)
-{
-    vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
-    vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1]));
-}
-
-template <>
-void accumulate_results<2>(float *buffer, const float32x4x2_t &values)
-{
-    vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
-}
-
-template <>
-void accumulate_results<3>(float *buffer, const float32x4x2_t &values)
-{
-    vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0])));
-}
-
 template <typename T1>
 class convolver_nhwc
 {
@@ -718,8 +670,7 @@ public:
         for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
             in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
         {
-            auto vres = convolve_3x3(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, stridex);
-            store_results<stridex>(p_out, vres);
+            convolve_3x3<false>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex);
         }
     }
 }
@@ -743,8 +694,7 @@ public:
         for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
             in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
         {
-            auto vres = convolve_3x3(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, stridex);
-            accumulate_results<stridex>(p_out, vres);
+            convolve_3x3<true>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex);
         }
     }
 }