From f72f9367d1eddee91f15a64952b99ee6b80b821d Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Fri, 12 Jan 2018 16:29:45 +0000 Subject: COMPMID-791: Adds support of QASYMM8 in NEDepthwiseConvolution3x3 Change-Id: I1a9ed6c3420ddf8978aeaad48d9915333b006b49 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/116374 Tested-by: Jenkins Reviewed-by: Anthony Barbier --- .../kernels/NEDepthwiseConvolutionLayer3x3Kernel.h | 5 +- .../NEDirectConvolutionLayerOutputStageKernel.h | 24 +- .../convolution/NEDirectConvolutionDetail.h | 252 +++++++++++++++++++-- 3 files changed, 254 insertions(+), 27 deletions(-) (limited to 'arm_compute/core/NEON/kernels') diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h index b8f01cb635..38e2a5ddfd 100644 --- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -47,7 +47,7 @@ public: NEDepthwiseConvolutionLayer3x3Kernel &operator=(NEDepthwiseConvolutionLayer3x3Kernel &&) = default; /** Initialize the function's source, destination, conv and border_size. * - * @param[in] input Source tensor. DataType supported: F32. + * @param[in] input Source tensor. DataType supported: QASYMM8, F32. * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [3, 3, IFM]. Data type supported: Same as @p input. * @param[out] output Destination tensor. Data type supported: Same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. @@ -64,6 +64,7 @@ private: ITensor *_output; const ITensor *_weights; PadStrideInfo _conv_info; + unsigned int _num_elems_written_per_iteration; }; } // namespace arm_compute #endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H__ */ \ No newline at end of file diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h index 46d52fc182..c42e5c43b5 100644 --- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h +++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -50,13 +50,17 @@ public: ~NEDirectConvolutionLayerOutputStageKernel() = default; /** Set the accumulate buffer and the biases of the kernel. * - * @param[in, out] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. - * Data type supported: QS16/QS32/F16/F32 - * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input - * @param[out] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) - * Data type supported: QS8/QS16/F16/F32 + * @param[in, out] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. + * Data type supported: QS16/QS32/F16/F32 + * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input + * @param[out] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) + * Data type supported: QS8/QS16/F16/F32 + * @param[in] result_fixedpoint_multiplier (Optional)Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add + * @param[in] result_shift (Optional)Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication + * @param[in] result_offset_after_shift (Optional)Offset to be applied to result before converting it back to QASYMM8 */ - void configure(ITensor *input, const ITensor *bias = nullptr, ITensor *output = nullptr); + void configure(ITensor *input, const ITensor *bias = nullptr, ITensor *output = nullptr, + int result_fixedpoint_multiplier = 0, int result_shift = 0, int result_offset_after_shift = 0); /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerOutputStageKernel * * @param[in] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. @@ -72,13 +76,17 @@ public: void run(const Window &window, const ThreadInfo &info) override; private: - using OutputStageKernel = void(ITensor *input, const ITensor *bias, const Window window, ITensor *output); + using OutputStageKernel = void(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift); private: OutputStageKernel *_func; ITensor *_input; const ITensor *_bias; ITensor *_output; + int _result_fixedpoint_multiplier; + int _result_shift; + int _result_offset_after_shift; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h b/arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h index c358558610..908fa13876 100644 --- a/arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h +++ b/arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -36,12 +36,14 @@ namespace detail { /** Loads a 3x3 matrix as a row (float). * - * @param[in] ptr Pointer to a float 3x3 matrix. + * @param[in] ptr Pointer to a float 3x3 matrix. + * @param[in] weights_offset (Optional) Weights quantization offset. * * @return The loaded matrix. */ -inline float32x4x3_t load_matrix_row(const float *ptr) +inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0) { + ARM_COMPUTE_UNUSED(weights_offset); const float32x4x3_t r = { { @@ -55,12 +57,14 @@ inline float32x4x3_t load_matrix_row(const float *ptr) /** Loads a 3x3 matrix as a row (qint8_t). * - * @param[in] ptr Pointer to a qint8 3x3 matrix. + * @param[in] ptr Pointer to a qint8 3x3 matrix. + * @param[in] weights_offset (Optional) Weights quantization offset. * * @return The loaded matrix. */ -inline qint8x8x3_t load_matrix_row(const qint8_t *ptr) +inline qint8x8x3_t load_matrix_row(const qint8_t *ptr, int weights_offset = 0) { + ARM_COMPUTE_UNUSED(weights_offset); /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ const qint8x8x3_t r = @@ -74,6 +78,30 @@ inline qint8x8x3_t load_matrix_row(const qint8_t *ptr) return r; } +/** Loads a 3x3 matrix as a row (uint8_t). + * + * @param[in] ptr Pointer to a uint8_t 3x3 matrix. + * @param[in] weights_offset (Optional) Weights quantization offset. + * + * @return The loaded matrix. + */ +inline int32x4x3_t load_matrix_row(const uint8_t *ptr, int weights_offset = 0) +{ + const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset); + + /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: + r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ + int32x4x3_t r = + { + { + vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)), + vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))), + vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2))) + } + }; + return r; +} + /** Perform a convolve3x3 on float32. * * @param[in] in_top Pointer to the first row of the input. @@ -83,15 +111,21 @@ inline qint8x8x3_t load_matrix_row(const qint8_t *ptr) * @param[in] m1 Second row of the filter. * @param[in] m2 Third row of the filter. * @param[in] fixed_point_position (Optional) Fixed point position. + * @param[in] input_offset (Optional) Input quantization offset. * */ template -float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position); +float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, + const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, + int fixed_point_position, int input_offset = 0); template <> -inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, + const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, + int fixed_point_position, int input_offset) { ARM_COMPUTE_UNUSED(fixed_point_position); + ARM_COMPUTE_UNUSED(input_offset); const float32x4x3_t vtop = { @@ -149,9 +183,13 @@ inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, c } template <> -inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, + const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, + int fixed_point_position, int input_offset) { - float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + ARM_COMPUTE_UNUSED(input_offset); + + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); @@ -159,9 +197,13 @@ inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, c } template <> -inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, + const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, + int fixed_point_position, int input_offset) { - float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + ARM_COMPUTE_UNUSED(input_offset); + + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); return out; } @@ -175,15 +217,21 @@ inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, c * @param[in] m1 Second row of the filter. * @param[in] m2 Third row of the filter. * @param[in] fixed_point_position (Optional) Fixed point position. + * @param[in] input_offset (Optional) Input quantization offset. * */ template -qint16x8x2_t convolve_3x3(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position); +qint16x8x2_t convolve_3x3(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, + const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, + int fixed_point_position, int input_offset = 0); template <> -inline qint16x8x2_t convolve_3x3<1>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position) +inline qint16x8x2_t convolve_3x3<1>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, + const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, + int fixed_point_position, int input_offset) { ARM_COMPUTE_UNUSED(fixed_point_position); + ARM_COMPUTE_UNUSED(input_offset); const qint8x8x3_t vtop = { @@ -236,9 +284,13 @@ inline qint16x8x2_t convolve_3x3<1>(const qint8_t *in_top, const qint8_t *in_mid } template <> -inline qint16x8x2_t convolve_3x3<2>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position) +inline qint16x8x2_t convolve_3x3<2>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, + const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, + int fixed_point_position, int input_offset) { - qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + ARM_COMPUTE_UNUSED(input_offset); + + qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 4), out.val[0], 2); out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 3); @@ -250,15 +302,153 @@ inline qint16x8x2_t convolve_3x3<2>(const qint8_t *in_top, const qint8_t *in_mid } template <> -inline qint16x8x2_t convolve_3x3<3>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position) +inline qint16x8x2_t convolve_3x3<3>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, + const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, + int fixed_point_position, int input_offset) { - qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + ARM_COMPUTE_UNUSED(input_offset); + + qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 3), out.val[0], 1); out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 2); out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 1), out.val[0], 3); return out; } +/** Perform a convolve3x3 on uint8_t + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] fixed_point_position (Optional) Fixed point position. + * @param[in] input_offset (Optional) Input quantization offset. + * + */ +template +int32x4x2_t convolve_3x3(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, + const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, + int fixed_point_position, int input_offset); + +template <> +inline int32x4x2_t convolve_3x3<1>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, + int fixed_point_position, int input_offset) +{ + ARM_COMPUTE_UNUSED(fixed_point_position); + + const int32x4_t v_input_offset = vdupq_n_s32(input_offset); + + const uint8x8x2_t vtop = + { + { + vld1_u8(in_top), + vld1_u8(in_top + 8) + } + }; + const uint8x8x2_t vmid = + { + { + vld1_u8(in_mid), + vld1_u8(in_mid + 8) + } + }; + const uint8x8x2_t vlow = + { + { + vld1_u8(in_low), + vld1_u8(in_low + 8) + } + }; + + const int32x4x3_t vtop_s32 = + { + { + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vtop.val[0])))), + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vtop.val[0])))), + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vtop.val[1])))), + } + }; + const int32x4x3_t vmid_s32 = + { + { + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vmid.val[0])))), + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vmid.val[0])))), + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vmid.val[1])))), + } + }; + const int32x4x3_t vlow_s32 = + { + { + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vlow.val[0])))), + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vlow.val[0])))), + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vlow.val[1])))), + } + }; + + int32x4x2_t out + { + { + vdupq_n_s32(0), + vdupq_n_s32(0), + } + }; + + // 0 + out.val[0] = vmlaq_s32(out.val[0], vtop_s32.val[0], m0.val[0]); + out.val[0] = vmlaq_s32(out.val[0], vextq_s32(vtop_s32.val[0], vtop_s32.val[1], 1), m0.val[1]); + out.val[0] = vmlaq_s32(out.val[0], vextq_s32(vtop_s32.val[0], vtop_s32.val[1], 2), m0.val[2]); + + out.val[0] = vmlaq_s32(out.val[0], vmid_s32.val[0], m1.val[0]); + out.val[0] = vmlaq_s32(out.val[0], vextq_s32(vmid_s32.val[0], vmid_s32.val[1], 1), m1.val[1]); + out.val[0] = vmlaq_s32(out.val[0], vextq_s32(vmid_s32.val[0], vmid_s32.val[1], 2), m1.val[2]); + + out.val[0] = vmlaq_s32(out.val[0], vlow_s32.val[0], m2.val[0]); + out.val[0] = vmlaq_s32(out.val[0], vextq_s32(vlow_s32.val[0], vlow_s32.val[1], 1), m2.val[1]); + out.val[0] = vmlaq_s32(out.val[0], vextq_s32(vlow_s32.val[0], vlow_s32.val[1], 2), m2.val[2]); + + // 1 + out.val[1] = vmlaq_s32(out.val[1], vtop_s32.val[1], m0.val[0]); + out.val[1] = vmlaq_s32(out.val[1], vextq_s32(vtop_s32.val[1], vtop_s32.val[2], 1), m0.val[1]); + out.val[1] = vmlaq_s32(out.val[1], vextq_s32(vtop_s32.val[1], vtop_s32.val[2], 2), m0.val[2]); + + out.val[1] = vmlaq_s32(out.val[1], vmid_s32.val[1], m1.val[0]); + out.val[1] = vmlaq_s32(out.val[1], vextq_s32(vmid_s32.val[1], vmid_s32.val[2], 1), m1.val[1]); + out.val[1] = vmlaq_s32(out.val[1], vextq_s32(vmid_s32.val[1], vmid_s32.val[2], 2), m1.val[2]); + + out.val[1] = vmlaq_s32(out.val[1], vlow_s32.val[1], m2.val[0]); + out.val[1] = vmlaq_s32(out.val[1], vextq_s32(vlow_s32.val[1], vlow_s32.val[2], 1), m2.val[1]); + out.val[1] = vmlaq_s32(out.val[1], vextq_s32(vlow_s32.val[1], vlow_s32.val[2], 2), m2.val[2]); + + return out; +} + +template <> +inline int32x4x2_t convolve_3x3<2>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, + const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, + int fixed_point_position, int input_offset) +{ + ARM_COMPUTE_UNUSED(fixed_point_position); + + int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); + out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[0], 2), out.val[0], 1); + out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[1], 0), out.val[0], 2); + out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[1], 2), out.val[0], 3); + return out; +} + +template <> +inline int32x4x2_t convolve_3x3<3>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, + const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, + int fixed_point_position, int input_offset) +{ + ARM_COMPUTE_UNUSED(fixed_point_position); + int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); + out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[0], 3), out.val[0], 1); + return out; +} + /** Stores a float32x4x2_t array into a memory location. * * @param[in] buffer Pointer to the memory location where the values will be stored. @@ -315,6 +505,34 @@ inline void store_results<3>(qint16_t *buffer, const qint16x8x2_t &values) vst1_qs16(buffer, vget_low_s16(values.val[0])); } +/** Stores a uint32_t array into a memory location. + * + * @param[in] buffer Pointer to the memory location where the values will be stored. + * @param[in] values Values that will be stored. + * + */ +template +void store_results(int32_t *buffer, const int32x4x2_t &values); + +template <> +inline void store_results<1>(int32_t *buffer, const int32x4x2_t &values) +{ + vst1q_s32(buffer, values.val[0]); + vst1q_s32(buffer + 4, values.val[1]); +} + +template <> +inline void store_results<2>(int32_t *buffer, const int32x4x2_t &values) +{ + vst1q_s32(buffer, values.val[0]); +} + +template <> +inline void store_results<3>(int32_t *buffer, const int32x4x2_t &values) +{ + vst1_s32(buffer, vget_low_s32(values.val[0])); +} + #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC /** Loads a 3x3 matrix as a row (float16_t). * -- cgit v1.2.1