diff options
Diffstat (limited to 'src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp')
-rw-r--r-- | src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp | 54 |
1 files changed, 2 insertions, 52 deletions
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp index fe3eb92b1b..7f393d619c 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp @@ -321,32 +321,6 @@ public: } }; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -template <unsigned int stridex> -void accumulate_results(float16_t *buffer, const float16x8x2_t &values); - -template <> -void accumulate_results<1>(float16_t *buffer, const float16x8x2_t &values) -{ - vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0])); - vst1q_f16(buffer + 8, vaddq_f16(vld1q_f16(buffer + 8), values.val[1])); -} - -template <> -void accumulate_results<2>(float16_t *buffer, const float16x8x2_t &values) -{ - vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0])); -} - -template <> -void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values) -{ - vst1_f16(buffer, vadd_f16(vld1_f16(buffer), vget_low_f16(values.val[0]))); -} - -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - template <unsigned int stridex> float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, const float *m0, const float *m1, const float *m2, const float *m3, const float *m4); @@ -498,28 +472,6 @@ inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const return out; } -template <unsigned int stridex> -void accumulate_results(float *buffer, const float32x4x2_t &values); - -template <> -void accumulate_results<1>(float *buffer, const float32x4x2_t &values) -{ - vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0])); - vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1])); -} - -template <> -void accumulate_results<2>(float *buffer, const float32x4x2_t &values) -{ - vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0])); -} - -template <> -void accumulate_results<3>(float *buffer, const float32x4x2_t &values) -{ - vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0]))); -} - template <typename T1> class convolver_nhwc { @@ -718,8 +670,7 @@ public: for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) { - auto vres = convolve_3x3(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, stridex); - store_results<stridex>(p_out, vres); + convolve_3x3<false>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex); } } } @@ -743,8 +694,7 @@ public: for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) { - auto vres = convolve_3x3(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, stridex); - accumulate_results<stridex>(p_out, vres); + convolve_3x3<true>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex); } } } |