author     Georgios Pinitas <georgios.pinitas@arm.com>  2019-04-02 15:27:52 +0100
committer  Georgios Pinitas <georgios.pinitas@arm.com>  2019-04-09 11:58:01 +0000
commit     a4bba9c594c4022c9f85192bb8fd3593ad1a8d3c (patch)
tree       0e79ebd7105411f6756e63d3ce23f16aaeb88566
parent     3418ba520dd6251738ba905df84a201121433ecd (diff)
download   ComputeLibrary-a4bba9c594c4022c9f85192bb8fd3593ad1a8d3c.tar.gz
COMPMID-1995: Fix 32-bit NEDepthwiseConvolution errors.
- Updates padding handling in the assembly depthwise kernels.
- Fixes issues with 32-bit runs of depthwise convolution.

Change-Id: I3fe6369397c1d13f5629dd34c068ce4af53c95cd
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/939
Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp                   37
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp          8
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp                  146
-rw-r--r--  src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp   1515
-rw-r--r--  src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp   2366
-rw-r--r--  src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp                     172
-rw-r--r--  src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp                     169
-rw-r--r--  src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp                       325
8 files changed, 4676 insertions(+), 62 deletions(-)
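
The core of the patch is in impl_base.hpp below: instead of copying each
input tile into a padded working buffer (and copying cropped results back
out), process_tile now hands the kernels two arrays of element pointers in
which every padded position aliases a small per-thread buffer pre-filled
with the padding value. A minimal sketch of the idea, assuming a 3x3 inner
tile of floats (the names make_input_pointers and pad_buffer are
illustrative, not part of the library):

    // Every padded position points at the same shared pad_buffer, so the
    // working space only needs n_channels elements instead of a full
    // padded tile.
    void make_input_pointers(const float *inptr, int row_stride, int col_stride,
                             int pad_top, int pad_left, int pad_bottom, int pad_right,
                             const float *pad_buffer,  // n_channels padding values
                             const float *inptrs[3][3])
    {
      for (int i = 0; i < 3; i++)
      {
        for (int j = 0; j < 3; j++)
        {
          const bool padded = i < pad_top || (3 - pad_bottom) <= i ||
                              j < pad_left || (3 - pad_right) <= j;
          inptrs[i][j] = padded
            ? pad_buffer  // all padded elements alias one shared buffer
            : inptr + (i - pad_top) * row_stride + (j - pad_left) * col_stride;
        }
      }
    }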
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
index 45e8da0272..e0cb616a3d 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
@@ -24,7 +24,7 @@
#pragma once
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include <arm_neon.h>
#include "arm_compute/core/NEON/kernels/convolution/common/activation.hpp"
#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"
@@ -275,6 +275,14 @@ class DepthwiseConvolutionBase : public IDepthwiseConvolution
unsigned int out_col_stride
);
+ template <nck::ActivationFunction Activation>
+ void execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const InputType* inptrs[inner_tile_rows][inner_tile_cols],
+ OutputType* outptrs[output_tile_rows][output_tile_cols]
+ );
+
int n_channels(void) const;
private:
@@ -290,9 +298,7 @@ class DepthwiseConvolutionBase : public IDepthwiseConvolution
// Stride information for a convolution instance
int _input_col_stride, _input_row_stride, _input_batch_stride;
- const int _input_ws_col_stride, _input_ws_row_stride;
int _output_col_stride, _output_row_stride, _output_batch_stride;
- const int _output_ws_col_stride, _output_ws_row_stride;
// Methods for getting access to working space
size_t _get_input_working_space_size(void) const;
@@ -352,6 +358,14 @@ class DepthwiseConvolution : public DepthwiseConvolutionBase<
unsigned int out_row_stride,
unsigned int out_col_stride
);
+
+ template <nck::ActivationFunction Activation>
+ void execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const InputType* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+ OutputType* outptrs[Base::output_tile_rows][Base::output_tile_cols]
+ );
};
@@ -415,6 +429,14 @@ class DepthwiseConvolution<
unsigned int out_row_stride,
unsigned int out_col_stride
);
+
+ template <nck::ActivationFunction Activation>
+ void execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const float* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+ float* outptrs[Base::output_tile_rows][Base::output_tile_cols]
+ );
};
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -478,6 +500,15 @@ class DepthwiseConvolution<
unsigned int out_row_stride,
unsigned int out_col_stride
);
+
+ template <nck::ActivationFunction Activation>
+ void execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const float16_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+ float16_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
+ );
};
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
} // namespace depthwise
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
index 4c1d883a70..47fa60139f 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
@@ -109,6 +109,14 @@ class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase<
unsigned int out_col_stride
);
+ template <nck::ActivationFunction Activation>
+ void execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+ uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
+ );
+
private:
// Quantization parameters
const qasymm8::QAsymm8Params _weights_quant, _inputs_quant, _output_quant;
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
index 674fc4d2df..493b2991dc 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
@@ -112,11 +112,7 @@ MEMBERFN()::DepthwiseConvolutionBase(
_padding_right(padding_right),
_activation(activation),
_input_col_stride(0), _input_row_stride(0), _input_batch_stride(0),
- _input_ws_col_stride(_n_channels),
- _input_ws_row_stride(_input_ws_col_stride * inner_tile_cols),
- _output_col_stride(0), _output_row_stride(0), _output_batch_stride(0),
- _output_ws_col_stride(_n_channels),
- _output_ws_row_stride(_output_ws_col_stride * OutputTileColumns)
+ _output_col_stride(0), _output_row_stride(0), _output_batch_stride(0)
{
}
@@ -231,12 +227,12 @@ MEMBERFN(void)::set_working_space(void *buffer)
MEMBERFN(size_t)::_get_input_working_space_size(void) const
{
- return sizeof(TIn) * inner_tile_rows * inner_tile_cols * _n_channels;
+ return sizeof(TIn) * _n_channels;
}
MEMBERFN(size_t)::_get_output_working_space_size(void) const
{
- return sizeof(TOut) * OutputTileRows * OutputTileColumns * _n_channels;
+ return sizeof(TOut) * _n_channels;
}
MEMBERFN(void *)::_get_input_working_space(const unsigned int threadid) const
@@ -263,6 +259,14 @@ MEMBERFN(void)::run(
const unsigned int threadid
)
{
+ // Clear the input padding buffer
+ TIn *buf = static_cast<TIn *>(_get_input_working_space(threadid));
+ const TIn pad_value = static_cast<Derived *>(this)->_input_padding_value();
+ for (int n = 0; n < _n_channels; n++)
+ {
+ buf[n] = pad_value;
+ }
+
// Parallelise over blocks of channels
const auto start_channel = CHANNEL_BLOCK * start;
const auto stop_channel = std::min<unsigned int>(_n_channels, CHANNEL_BLOCK * stop);
@@ -379,60 +383,94 @@ MEMBERFN(void)::process_tile(
const int pad_out_right
)
{
+ Derived * dthis = static_cast<Derived *>(this);
const bool pad_input = pad_in_top || pad_in_left || pad_in_bottom || pad_in_right;
const bool pad_output = pad_out_bottom || pad_out_right;
- if (pad_input)
+ if (!pad_input && !pad_output)
{
- // Copy the input into the temporary buffer, applying padding
- padding::copy_and_pad_tile<TIn>(
- inner_tile_rows, inner_tile_cols, n_channels,
- inptr, _input_row_stride, _input_col_stride,
- static_cast<TIn *>(_get_input_working_space(threadid)), _input_ws_row_stride, _input_ws_col_stride,
- pad_in_top, pad_in_left, pad_in_bottom, pad_in_right,
- static_cast<Derived *>(this)->_input_padding_value()
- );
+ switch(_activation)
+ {
+ case ActivationFunction::ReLU:
+ dthis->template execute_tile<ActivationFunction::ReLU>(
+ n_channels, packed_params,
+ inptr, _input_row_stride, _input_col_stride,
+ outptr, _output_row_stride, _output_col_stride
+ );
+ break;
+ case ActivationFunction::ReLU6:
+ dthis->template execute_tile<ActivationFunction::ReLU6>(
+ n_channels, packed_params,
+ inptr, _input_row_stride, _input_col_stride,
+ outptr, _output_row_stride, _output_col_stride
+ );
+ break;
+ default:
+ dthis->template execute_tile<ActivationFunction::None>(
+ n_channels, packed_params,
+ inptr, _input_row_stride, _input_col_stride,
+ outptr, _output_row_stride, _output_col_stride
+ );
+ break;
+ }
}
-
- // Execute the kernel
- const TIn * const tile_inptr = !pad_input ? inptr : static_cast<const TIn *>(_get_input_working_space(threadid));
- const int in_row_stride = !pad_input ? _input_row_stride : _input_ws_row_stride;
- const int in_col_stride = !pad_input ? _input_col_stride : _input_ws_col_stride;
-
- TOut * const tile_outptr = !pad_output ? outptr : static_cast<TOut *>(_get_output_working_space(threadid));
- const int out_row_stride = !pad_output ? _output_row_stride : _output_ws_row_stride;
- const int out_col_stride = !pad_output ? _output_col_stride : _output_ws_col_stride;
-
- Derived * dthis = static_cast<Derived *>(this);
-
- switch(_activation)
+ else
{
- case ActivationFunction::ReLU:
- dthis->template execute_tile<ActivationFunction::ReLU>(
- n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride
- );
- break;
- case ActivationFunction::ReLU6:
- dthis->template execute_tile<ActivationFunction::ReLU6>(
- n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride
- );
- break;
- default:
- dthis->template execute_tile<ActivationFunction::None>(
- n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride
- );
- break;
- }
+ // Create arrays of input and output pointers, pointing padded elements to
+ // the working space padding buffers provided.
+ const TIn *inptrs[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ if (i < pad_in_top || (inner_tile_rows - pad_in_bottom) <= i ||
+ j < pad_in_left || (inner_tile_cols - pad_in_right) <= j)
+ {
+ // Padded input
+ inptrs[i][j] = static_cast<const TIn *>(_get_input_working_space(threadid));
+ }
+ else
+ {
+ inptrs[i][j] = inptr + (i - pad_in_top)*_input_row_stride + (j - pad_in_left)*_input_col_stride;
+ }
+ }
+ }
- if (pad_output)
- {
- // Copy the output from the temporary buffer, removing unnecessary values
- padding::CopyCropped<OutputTileRows, OutputTileColumns>::execute(
- n_channels * sizeof(TOut),
- _get_output_working_space(threadid), _output_ws_row_stride * sizeof(TOut), _output_ws_col_stride * sizeof(TOut),
- outptr, _output_row_stride * sizeof(TOut), _output_col_stride * sizeof(TOut),
- 0, 0, pad_out_bottom, pad_out_right
- );
+ TOut *outptrs[output_tile_rows][output_tile_cols];
+ for (int i = 0; i < output_tile_rows; i++)
+ {
+ for (int j = 0; j < output_tile_cols; j++)
+ {
+ if (i < (output_tile_rows - pad_out_bottom) &&
+ j < (output_tile_cols - pad_out_right))
+ {
+ outptrs[i][j] = outptr + i*_output_row_stride + j*_output_col_stride;
+ }
+ else
+ {
+ outptrs[i][j] = static_cast<TOut *>(_get_output_working_space(threadid));
+ }
+ }
+ }
+
+ switch(_activation)
+ {
+ case ActivationFunction::ReLU:
+ dthis->template execute_tile<ActivationFunction::ReLU>(
+ n_channels, packed_params, inptrs, outptrs
+ );
+ break;
+ case ActivationFunction::ReLU6:
+ dthis->template execute_tile<ActivationFunction::ReLU6>(
+ n_channels, packed_params, inptrs, outptrs
+ );
+ break;
+ default:
+ dthis->template execute_tile<ActivationFunction::None>(
+ n_channels, packed_params, inptrs, outptrs
+ );
+ break;
+ }
}
}
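
In both branches above, the runtime _activation enum is translated into a
compile-time template argument, so each activation variant can be a fully
specialised (and, in the files below, hand-written assembly) kernel with
the clamping fused in. A stripped-down, self-contained sketch of that
dispatch pattern; the scalar execute_tile bodies here are purely
illustrative, not the library's kernels:

    enum class ActivationFunction { None, ReLU, ReLU6 };

    template <ActivationFunction A>
    float execute_tile(float x);  // one specialisation per activation

    template <> float execute_tile<ActivationFunction::None>(float x)  { return x; }
    template <> float execute_tile<ActivationFunction::ReLU>(float x)  { return x < 0.f ? 0.f : x; }
    template <> float execute_tile<ActivationFunction::ReLU6>(float x)
    {
      const float r = x < 0.f ? 0.f : x;
      return r > 6.f ? 6.f : r;
    }

    // Runtime enum -> compile-time parameter, as in process_tile above.
    float dispatch(ActivationFunction act, float x)
    {
      switch (act)
      {
        case ActivationFunction::ReLU:  return execute_tile<ActivationFunction::ReLU>(x);
        case ActivationFunction::ReLU6: return execute_tile<ActivationFunction::ReLU6>(x);
        default:                        return execute_tile<ActivationFunction::None>(x);
      }
    }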
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
index 4ac6276123..010dd81bce 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
@@ -431,6 +431,491 @@ void Conv::execute_tile<ActivationFunction::None>(
template <>
template <>
+void Conv::execute_tile<ActivationFunction::None>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+ float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+ __asm __volatile(
+ "mov x23, xzr\n"
+ "mov x24, xzr\n"
+ "and x25, %[n_channels], #3\n"
+ "lsr x26, %[n_channels], #2\n"
+ "cbz x26, 4f\n"
+ "1:\n"
+ "ldr q13, [%[wbptr]]\n"
+ "ldr x19, [%[inptrs], 0]\n"
+ "mov v10.16b, v13.16b\n"
+ "ldr q12, [%[wbptr], #16]\n"
+ "mov v8.16b, v13.16b\n"
+ "ldr q6, [%[wbptr], #32]\n"
+ "mov v9.16b, v13.16b\n"
+ "ldr q5, [%[wbptr], #48]\n"
+ "mov v7.16b, v13.16b\n"
+ "ldr q11, [%[wbptr], #64]\n"
+ "ldr q4, [%[wbptr], #80]\n"
+ "ldr x20, [%[inptrs], 40]\n"
+ "ldr q3, [%[wbptr], #96]\n"
+ "ldr x21, [%[inptrs], 80]\n"
+ "ldr q2, [%[wbptr], #112]\n"
+ "ldr x27, [%[inptrs], 120]\n"
+ "ldr q1, [%[wbptr], #128]\n"
+ "subs x26, x26, #1\n"
+ "ldr q0, [%[wbptr], #144]\n"
+ "ldr q14, [x19, x23]\n"
+ "fmla v10.4s, v14.4s, v12.4s\n"
+ "ldr q18, [x20, x23]\n"
+ "ldr q14, [x21, x23]\n"
+ "ldr x19, [%[inptrs], 8]\n"
+ "ldr q16, [x27, x23]\n"
+ "ldr x20, [%[inptrs], 48]\n"
+ "ldr q19, [x19, x23]\n"
+ "ldr x21, [%[inptrs], 88]\n"
+ "fmla v10.4s, v18.4s, v11.4s\n"
+ "ldr q15, [x20, x23]\n"
+ "ldr q18, [x21, x23]\n"
+ "ldr x19, [%[inptrs], 16]\n"
+ "ldr q13, [x19, x23]\n"
+ "fmla v10.4s, v19.4s, v6.4s\n"
+ "fmla v10.4s, v14.4s, v2.4s\n"
+ "beq 3f\n"
+ "2:\n"
+ "fmla v8.4s, v14.4s, v12.4s\n"
+ "ldr x20, [%[inptrs], 56]\n"
+ "fmla v10.4s, v15.4s, v4.4s\n"
+ "ldr x19, [%[inptrs], 24]\n"
+ "fmla v9.4s, v13.4s, v12.4s\n"
+ "ldr q14, [x20, x23]\n"
+ "ldr q17, [x19, x23]\n"
+ "ldr x22, [%[inptrs], 160]\n"
+ "fmla v8.4s, v16.4s, v11.4s\n"
+ "ldr x27, [%[inptrs], 128]\n"
+ "fmla v10.4s, v13.4s, v5.4s\n"
+ "ldr q15, [x22, x23]\n"
+ "fmla v9.4s, v14.4s, v11.4s\n"
+ "ldr q19, [x27, x23]\n"
+ "ldr x21, [%[inptrs], 96]\n"
+ "ldr x20, [%[inptrs], 64]\n"
+ "ldr x19, [%[inptrs], 32]\n"
+ "fmla v8.4s, v18.4s, v6.4s\n"
+ "ldr x22, [%[inptrs], 168]\n"
+ "fmla v10.4s, v18.4s, v1.4s\n"
+ "ldr q13, [x21, x23]\n"
+ "fmla v9.4s, v17.4s, v6.4s\n"
+ "ldr q18, [x20, x23]\n"
+ "fmla v7.4s, v13.4s, v12.4s\n"
+ "ldr q17, [x19, x23]\n"
+ "fmla v8.4s, v15.4s, v2.4s\n"
+ "ldr q15, [x22, x23]\n"
+ "fmla v10.4s, v14.4s, v3.4s\n"
+ "ldr x27, [%[inptrs], 136]\n"
+ "fmla v9.4s, v13.4s, v2.4s\n"
+ "ldr x21, [%[inptrs], 104]\n"
+ "ldr q16, [x27, x23]\n"
+ "ldr x20, [%[inptrs], 72]\n"
+ "fmla v8.4s, v19.4s, v4.4s\n"
+ "ldr q19, [x21, x23]\n"
+ "fmla v10.4s, v13.4s, v0.4s\n"
+ "ldr q12, [x20, x23]\n"
+ "fmla v9.4s, v18.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 176]\n"
+ "fmla v7.4s, v16.4s, v11.4s\n"
+ "ldr x27, [%[inptrs], 144]\n"
+ "fmla v8.4s, v13.4s, v5.4s\n"
+ "ldr q11, [x22, x23]\n"
+ "ldr q13, [x27, x23]\n"
+ "ldr x21, [%[inptrs], 112]\n"
+ "fmla v9.4s, v17.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 184]\n"
+ "fmla v7.4s, v19.4s, v6.4s\n"
+ "ldr q14, [x21, x23]\n"
+ "fmla v8.4s, v15.4s, v1.4s\n"
+ "ldr q17, [x22, x23]\n"
+ "ldr x27, [%[inptrs], 152]\n"
+ "ldr x22, [%[inptrs], 192]\n"
+ "ldr x21, [%[outptrs], 0]\n"
+ "fmla v9.4s, v19.4s, v1.4s\n"
+ "ldr x28, [%[outptrs], 16]\n"
+ "str q10, [x21, x24]\n"
+ "fmla v7.4s, v11.4s, v2.4s\n"
+ "fmla v8.4s, v16.4s, v3.4s\n"
+ "ldr q16, [x27, x23]\n"
+ "ldr q15, [x22, x23]\n"
+ "ldr x21, [%[outptrs], 8]\n"
+ "fmla v9.4s, v12.4s, v3.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v7.4s, v13.4s, v4.4s\n"
+ "ldr q13, [%[wbptr]]\n"
+ "fmla v8.4s, v11.4s, v0.4s\n"
+ "ldr q12, [%[wbptr], #16]\n"
+ "mov v10.16b, v13.16b\n"
+ "ldr q6, [%[wbptr], #32]\n"
+ "fmla v9.4s, v14.4s, v0.4s\n"
+ "ldr q11, [%[wbptr], #64]\n"
+ "fmla v7.4s, v14.4s, v5.4s\n"
+ "ldr q4, [%[wbptr], #80]\n"
+ "str q8, [x28, x24]\n"
+ "add x23, x23, #16\n"
+ "mov v8.16b, v13.16b\n"
+ "ldr q2, [%[wbptr], #112]\n"
+ "str q9, [x21, x24]\n"
+ "ldr x28, [%[outptrs], 24]\n"
+ "fmla v7.4s, v17.4s, v1.4s\n"
+ "ldr q5, [%[wbptr], #48]\n"
+ "mov v9.16b, v13.16b\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "ldr x19, [%[inptrs], 0]\n"
+ "ldr x20, [%[inptrs], 40]\n"
+ "ldr x21, [%[inptrs], 80]\n"
+ "ldr x27, [%[inptrs], 120]\n"
+ "subs x26, x26, #1\n"
+ "fmla v7.4s, v16.4s, v3.4s\n"
+ "ldr q1, [%[wbptr], #128]\n"
+ "ldr q14, [x19, x23]\n"
+ "fmla v10.4s, v14.4s, v12.4s\n"
+ "ldr q18, [x20, x23]\n"
+ "ldr q14, [x21, x23]\n"
+ "ldr x19, [%[inptrs], 8]\n"
+ "fmla v7.4s, v15.4s, v0.4s\n"
+ "ldr q3, [%[wbptr], #96]\n"
+ "ldr q19, [x19, x23]\n"
+ "ldr x20, [%[inptrs], 48]\n"
+ "fmla v10.4s, v18.4s, v11.4s\n"
+ "ldr q16, [x27, x23]\n"
+ "ldr q15, [x20, x23]\n"
+ "ldr x19, [%[inptrs], 16]\n"
+ "str q7, [x28, x24]\n"
+ "ldr x21, [%[inptrs], 88]\n"
+ "mov v7.16b, v13.16b\n"
+ "ldr q0, [%[wbptr], #144]\n"
+ "fmla v10.4s, v19.4s, v6.4s\n"
+ "ldr q13, [x19, x23]\n"
+ "ldr q18, [x21, x23]\n"
+ "add x24, x24, #16\n"
+ "fmla v10.4s, v14.4s, v2.4s\n"
+ "bne 2b\n"
+ "3:\n"
+ "fmla v8.4s, v14.4s, v12.4s\n"
+ "ldr x20, [%[inptrs], 56]\n"
+ "fmla v10.4s, v15.4s, v4.4s\n"
+ "ldr x19, [%[inptrs], 24]\n"
+ "fmla v9.4s, v13.4s, v12.4s\n"
+ "ldr q14, [x20, x23]\n"
+ "ldr q17, [x19, x23]\n"
+ "ldr x22, [%[inptrs], 160]\n"
+ "fmla v8.4s, v16.4s, v11.4s\n"
+ "ldr x27, [%[inptrs], 128]\n"
+ "fmla v10.4s, v13.4s, v5.4s\n"
+ "ldr q15, [x22, x23]\n"
+ "fmla v9.4s, v14.4s, v11.4s\n"
+ "ldr q19, [x27, x23]\n"
+ "ldr x21, [%[inptrs], 96]\n"
+ "ldr x20, [%[inptrs], 64]\n"
+ "ldr x19, [%[inptrs], 32]\n"
+ "fmla v8.4s, v18.4s, v6.4s\n"
+ "ldr x22, [%[inptrs], 168]\n"
+ "fmla v10.4s, v18.4s, v1.4s\n"
+ "ldr q13, [x21, x23]\n"
+ "fmla v9.4s, v17.4s, v6.4s\n"
+ "ldr q18, [x20, x23]\n"
+ "fmla v7.4s, v13.4s, v12.4s\n"
+ "ldr q17, [x19, x23]\n"
+ "fmla v8.4s, v15.4s, v2.4s\n"
+ "ldr q15, [x22, x23]\n"
+ "fmla v10.4s, v14.4s, v3.4s\n"
+ "ldr x27, [%[inptrs], 136]\n"
+ "fmla v9.4s, v13.4s, v2.4s\n"
+ "ldr x21, [%[inptrs], 104]\n"
+ "ldr q16, [x27, x23]\n"
+ "ldr x20, [%[inptrs], 72]\n"
+ "fmla v8.4s, v19.4s, v4.4s\n"
+ "ldr q19, [x21, x23]\n"
+ "fmla v10.4s, v13.4s, v0.4s\n"
+ "ldr q12, [x20, x23]\n"
+ "fmla v9.4s, v18.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 176]\n"
+ "fmla v7.4s, v16.4s, v11.4s\n"
+ "ldr x27, [%[inptrs], 144]\n"
+ "fmla v8.4s, v13.4s, v5.4s\n"
+ "ldr q11, [x22, x23]\n"
+ "ldr q13, [x27, x23]\n"
+ "ldr x21, [%[inptrs], 112]\n"
+ "fmla v9.4s, v17.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 184]\n"
+ "fmla v7.4s, v19.4s, v6.4s\n"
+ "ldr q14, [x21, x23]\n"
+ "fmla v8.4s, v15.4s, v1.4s\n"
+ "ldr q17, [x22, x23]\n"
+ "ldr x27, [%[inptrs], 152]\n"
+ "ldr x22, [%[inptrs], 192]\n"
+ "ldr x21, [%[outptrs], 0]\n"
+ "fmla v9.4s, v19.4s, v1.4s\n"
+ "ldr x28, [%[outptrs], 16]\n"
+ "str q10, [x21, x24]\n"
+ "fmla v7.4s, v11.4s, v2.4s\n"
+ "fmla v8.4s, v16.4s, v3.4s\n"
+ "ldr q16, [x27, x23]\n"
+ "ldr q15, [x22, x23]\n"
+ "ldr x21, [%[outptrs], 8]\n"
+ "fmla v9.4s, v12.4s, v3.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v7.4s, v13.4s, v4.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v8.4s, v11.4s, v0.4s\n"
+ "add x23, x23, #16\n"
+ "fmla v9.4s, v14.4s, v0.4s\n"
+ "fmla v7.4s, v14.4s, v5.4s\n"
+ "str q8, [x28, x24]\n"
+ "ldr x28, [%[outptrs], 24]\n"
+ "str q9, [x21, x24]\n"
+ "fmla v7.4s, v17.4s, v1.4s\n"
+ "fmla v7.4s, v16.4s, v3.4s\n"
+ "fmla v7.4s, v15.4s, v0.4s\n"
+ "str q7, [x28, x24]\n"
+ "add x24, x24, #16\n"
+ "4:\n"
+ "cbz x25, 7f\n"
+ "ldr s13, [%[wbptr]]\n"
+ "mov v10.16b, v13.16b\n"
+ "ldr s12, [%[wbptr], #4]\n"
+ "mov v8.16b, v13.16b\n"
+ "ldr s6, [%[wbptr], #8]\n"
+ "mov v9.16b, v13.16b\n"
+ "ldr s5, [%[wbptr], #12]\n"
+ "mov v7.16b, v13.16b\n"
+ "ldr s11, [%[wbptr], #16]\n"
+ "ldr s4, [%[wbptr], #20]\n"
+ "ldr x19, [%[inptrs], 0]\n"
+ "ldr s3, [%[wbptr], #24]\n"
+ "ldr x20, [%[inptrs], 40]\n"
+ "ldr s2, [%[wbptr], #28]\n"
+ "ldr x21, [%[inptrs], 80]\n"
+ "ldr s1, [%[wbptr], #32]\n"
+ "ldr x27, [%[inptrs], 120]\n"
+ "ldr s0, [%[wbptr], #36]\n"
+ "subs x25, x25, #1\n"
+ "ldr s14, [x19, x23]\n"
+ "ldr s18, [x20, x23]\n"
+ "fmla v10.4s, v14.4s, v12.4s\n"
+ "ldr s14, [x21, x23]\n"
+ "ldr s16, [x27, x23]\n"
+ "ldr x19, [%[inptrs], 8]\n"
+ "ldr x20, [%[inptrs], 48]\n"
+ "ldr x21, [%[inptrs], 88]\n"
+ "ldr s19, [x19, x23]\n"
+ "fmla v10.4s, v18.4s, v11.4s\n"
+ "ldr s15, [x20, x23]\n"
+ "ldr s18, [x21, x23]\n"
+ "ldr x19, [%[inptrs], 16]\n"
+ "ldr s13, [x19, x23]\n"
+ "fmla v10.4s, v19.4s, v6.4s\n"
+ "fmla v10.4s, v14.4s, v2.4s\n"
+ "beq 6f\n"
+ "5:\n"
+ "fmla v8.4s, v14.4s, v12.4s\n"
+ "ldr x20, [%[inptrs], 56]\n"
+ "fmla v10.4s, v15.4s, v4.4s\n"
+ "ldr x19, [%[inptrs], 24]\n"
+ "fmla v9.4s, v13.4s, v12.4s\n"
+ "ldr s14, [x20, x23]\n"
+ "ldr s17, [x19, x23]\n"
+ "ldr x22, [%[inptrs], 160]\n"
+ "fmla v8.4s, v16.4s, v11.4s\n"
+ "ldr x27, [%[inptrs], 128]\n"
+ "fmla v10.4s, v13.4s, v5.4s\n"
+ "ldr s15, [x22, x23]\n"
+ "fmla v9.4s, v14.4s, v11.4s\n"
+ "ldr s19, [x27, x23]\n"
+ "ldr x21, [%[inptrs], 96]\n"
+ "ldr x20, [%[inptrs], 64]\n"
+ "ldr x19, [%[inptrs], 32]\n"
+ "fmla v8.4s, v18.4s, v6.4s\n"
+ "ldr x22, [%[inptrs], 168]\n"
+ "fmla v10.4s, v18.4s, v1.4s\n"
+ "ldr s13, [x21, x23]\n"
+ "fmla v9.4s, v17.4s, v6.4s\n"
+ "ldr s18, [x20, x23]\n"
+ "fmla v7.4s, v13.4s, v12.4s\n"
+ "ldr s17, [x19, x23]\n"
+ "fmla v8.4s, v15.4s, v2.4s\n"
+ "ldr s15, [x22, x23]\n"
+ "fmla v10.4s, v14.4s, v3.4s\n"
+ "ldr x27, [%[inptrs], 136]\n"
+ "fmla v9.4s, v13.4s, v2.4s\n"
+ "ldr x21, [%[inptrs], 104]\n"
+ "ldr s16, [x27, x23]\n"
+ "ldr x20, [%[inptrs], 72]\n"
+ "fmla v8.4s, v19.4s, v4.4s\n"
+ "ldr s19, [x21, x23]\n"
+ "fmla v10.4s, v13.4s, v0.4s\n"
+ "ldr s12, [x20, x23]\n"
+ "fmla v9.4s, v18.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 176]\n"
+ "fmla v7.4s, v16.4s, v11.4s\n"
+ "ldr x27, [%[inptrs], 144]\n"
+ "fmla v8.4s, v13.4s, v5.4s\n"
+ "ldr s11, [x22, x23]\n"
+ "ldr s13, [x27, x23]\n"
+ "ldr x21, [%[inptrs], 112]\n"
+ "fmla v9.4s, v17.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 184]\n"
+ "fmla v7.4s, v19.4s, v6.4s\n"
+ "ldr s14, [x21, x23]\n"
+ "fmla v8.4s, v15.4s, v1.4s\n"
+ "ldr s17, [x22, x23]\n"
+ "ldr x27, [%[inptrs], 152]\n"
+ "ldr x22, [%[inptrs], 192]\n"
+ "ldr x21, [%[outptrs], 0]\n"
+ "fmla v9.4s, v19.4s, v1.4s\n"
+ "ldr x28, [%[outptrs], 16]\n"
+ "str s10, [x21, x24]\n"
+ "fmla v7.4s, v11.4s, v2.4s\n"
+ "fmla v8.4s, v16.4s, v3.4s\n"
+ "ldr s16, [x27, x23]\n"
+ "ldr s15, [x22, x23]\n"
+ "ldr x21, [%[outptrs], 8]\n"
+ "fmla v9.4s, v12.4s, v3.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v7.4s, v13.4s, v4.4s\n"
+ "ldr s13, [%[wbptr]]\n"
+ "fmla v8.4s, v11.4s, v0.4s\n"
+ "ldr s12, [%[wbptr], #4]\n"
+ "mov v10.16b, v13.16b\n"
+ "ldr s6, [%[wbptr], #8]\n"
+ "fmla v9.4s, v14.4s, v0.4s\n"
+ "ldr s11, [%[wbptr], #16]\n"
+ "fmla v7.4s, v14.4s, v5.4s\n"
+ "ldr s4, [%[wbptr], #20]\n"
+ "str s8, [x28, x24]\n"
+ "add x23, x23, #4\n"
+ "mov v8.16b, v13.16b\n"
+ "ldr s2, [%[wbptr], #28]\n"
+ "str s9, [x21, x24]\n"
+ "ldr x28, [%[outptrs], 24]\n"
+ "fmla v7.4s, v17.4s, v1.4s\n"
+ "ldr s5, [%[wbptr], #12]\n"
+ "mov v9.16b, v13.16b\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "ldr x19, [%[inptrs], 0]\n"
+ "ldr x20, [%[inptrs], 40]\n"
+ "ldr x21, [%[inptrs], 80]\n"
+ "ldr x27, [%[inptrs], 120]\n"
+ "subs x25, x25, #1\n"
+ "fmla v7.4s, v16.4s, v3.4s\n"
+ "ldr s1, [%[wbptr], #32]\n"
+ "ldr s14, [x19, x23]\n"
+ "fmla v10.4s, v14.4s, v12.4s\n"
+ "ldr s18, [x20, x23]\n"
+ "ldr s14, [x21, x23]\n"
+ "ldr x19, [%[inptrs], 8]\n"
+ "fmla v7.4s, v15.4s, v0.4s\n"
+ "ldr s3, [%[wbptr], #24]\n"
+ "ldr s19, [x19, x23]\n"
+ "ldr x20, [%[inptrs], 48]\n"
+ "fmla v10.4s, v18.4s, v11.4s\n"
+ "ldr s16, [x27, x23]\n"
+ "ldr s15, [x20, x23]\n"
+ "ldr x19, [%[inptrs], 16]\n"
+ "str s7, [x28, x24]\n"
+ "ldr x21, [%[inptrs], 88]\n"
+ "mov v7.16b, v13.16b\n"
+ "ldr s0, [%[wbptr], #36]\n"
+ "fmla v10.4s, v19.4s, v6.4s\n"
+ "ldr s13, [x19, x23]\n"
+ "ldr s18, [x21, x23]\n"
+ "add x24, x24, #4\n"
+ "fmla v10.4s, v14.4s, v2.4s\n"
+ "bne 5b\n"
+ "6:\n"
+ "fmla v8.4s, v14.4s, v12.4s\n"
+ "ldr x20, [%[inptrs], 56]\n"
+ "fmla v10.4s, v15.4s, v4.4s\n"
+ "ldr x19, [%[inptrs], 24]\n"
+ "fmla v9.4s, v13.4s, v12.4s\n"
+ "ldr s14, [x20, x23]\n"
+ "ldr s17, [x19, x23]\n"
+ "ldr x22, [%[inptrs], 160]\n"
+ "fmla v8.4s, v16.4s, v11.4s\n"
+ "ldr x27, [%[inptrs], 128]\n"
+ "fmla v10.4s, v13.4s, v5.4s\n"
+ "ldr s15, [x22, x23]\n"
+ "fmla v9.4s, v14.4s, v11.4s\n"
+ "ldr s19, [x27, x23]\n"
+ "ldr x21, [%[inptrs], 96]\n"
+ "ldr x20, [%[inptrs], 64]\n"
+ "ldr x19, [%[inptrs], 32]\n"
+ "fmla v8.4s, v18.4s, v6.4s\n"
+ "ldr x22, [%[inptrs], 168]\n"
+ "fmla v10.4s, v18.4s, v1.4s\n"
+ "ldr s13, [x21, x23]\n"
+ "fmla v9.4s, v17.4s, v6.4s\n"
+ "ldr s18, [x20, x23]\n"
+ "fmla v7.4s, v13.4s, v12.4s\n"
+ "ldr s17, [x19, x23]\n"
+ "fmla v8.4s, v15.4s, v2.4s\n"
+ "ldr s15, [x22, x23]\n"
+ "fmla v10.4s, v14.4s, v3.4s\n"
+ "ldr x27, [%[inptrs], 136]\n"
+ "fmla v9.4s, v13.4s, v2.4s\n"
+ "ldr x21, [%[inptrs], 104]\n"
+ "ldr s16, [x27, x23]\n"
+ "ldr x20, [%[inptrs], 72]\n"
+ "fmla v8.4s, v19.4s, v4.4s\n"
+ "ldr s19, [x21, x23]\n"
+ "fmla v10.4s, v13.4s, v0.4s\n"
+ "ldr s12, [x20, x23]\n"
+ "fmla v9.4s, v18.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 176]\n"
+ "fmla v7.4s, v16.4s, v11.4s\n"
+ "ldr x27, [%[inptrs], 144]\n"
+ "fmla v8.4s, v13.4s, v5.4s\n"
+ "ldr s11, [x22, x23]\n"
+ "ldr s13, [x27, x23]\n"
+ "ldr x21, [%[inptrs], 112]\n"
+ "fmla v9.4s, v17.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 184]\n"
+ "fmla v7.4s, v19.4s, v6.4s\n"
+ "ldr s14, [x21, x23]\n"
+ "fmla v8.4s, v15.4s, v1.4s\n"
+ "ldr s17, [x22, x23]\n"
+ "ldr x27, [%[inptrs], 152]\n"
+ "ldr x22, [%[inptrs], 192]\n"
+ "ldr x21, [%[outptrs], 0]\n"
+ "fmla v9.4s, v19.4s, v1.4s\n"
+ "ldr x28, [%[outptrs], 16]\n"
+ "str s10, [x21, x24]\n"
+ "fmla v7.4s, v11.4s, v2.4s\n"
+ "fmla v8.4s, v16.4s, v3.4s\n"
+ "ldr s16, [x27, x23]\n"
+ "ldr s15, [x22, x23]\n"
+ "ldr x21, [%[outptrs], 8]\n"
+ "fmla v9.4s, v12.4s, v3.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v7.4s, v13.4s, v4.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v8.4s, v11.4s, v0.4s\n"
+ "add x23, x23, #4\n"
+ "fmla v9.4s, v14.4s, v0.4s\n"
+ "fmla v7.4s, v14.4s, v5.4s\n"
+ "str s8, [x28, x24]\n"
+ "ldr x28, [%[outptrs], 24]\n"
+ "str s9, [x21, x24]\n"
+ "fmla v7.4s, v17.4s, v1.4s\n"
+ "fmla v7.4s, v16.4s, v3.4s\n"
+ "fmla v7.4s, v15.4s, v0.4s\n"
+ "str s7, [x28, x24]\n"
+ "add x24, x24, #4\n"
+ "7:\n"
+ : [wbptr] "+r" (weight_bias_ptr)
+ : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs)
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
+
+template <>
+template <>
void Conv::execute_tile<ActivationFunction::ReLU>(
int n_channels,
const void *weight_bias_ptr,
@@ -850,6 +1335,511 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
template <>
template <>
+void Conv::execute_tile<ActivationFunction::ReLU>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+ float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+ __asm __volatile(
+ "mov x22, xzr\n"
+ "mov x26, xzr\n"
+ "and x23, %[n_channels], #3\n"
+ "lsr x24, %[n_channels], #2\n"
+ "cbz x24, 4f\n"
+ "1:\n"
+ "ldr q14, [%[wbptr]]\n"
+ "ldr x19, [%[inptrs], 0]\n"
+ "mov v3.16b, v14.16b\n"
+ "ldr q13, [%[wbptr], #16]\n"
+ "mov v1.16b, v14.16b\n"
+ "ldr q11, [%[wbptr], #32]\n"
+ "mov v2.16b, v14.16b\n"
+ "ldr q4, [%[wbptr], #48]\n"
+ "mov v0.16b, v14.16b\n"
+ "ldr q12, [%[wbptr], #64]\n"
+ "ldr q9, [%[wbptr], #80]\n"
+ "ldr x20, [%[inptrs], 40]\n"
+ "ldr q8, [%[wbptr], #96]\n"
+ "ldr x21, [%[inptrs], 80]\n"
+ "ldr q7, [%[wbptr], #112]\n"
+ "ldr x25, [%[inptrs], 120]\n"
+ "ldr q6, [%[wbptr], #128]\n"
+ "subs x24, x24, #1\n"
+ "ldr q5, [%[wbptr], #144]\n"
+ "ldr q15, [x19, x22]\n"
+ "fmla v3.4s, v15.4s, v13.4s\n"
+ "ldr q17, [x20, x22]\n"
+ "ldr q16, [x21, x22]\n"
+ "ldr x19, [%[inptrs], 8]\n"
+ "ldr q15, [x25, x22]\n"
+ "ldr x20, [%[inptrs], 48]\n"
+ "ldr q10, [x19, x22]\n"
+ "ldr x21, [%[inptrs], 88]\n"
+ "fmla v3.4s, v17.4s, v12.4s\n"
+ "ldr q17, [x20, x22]\n"
+ "ldr q14, [x21, x22]\n"
+ "ldr x19, [%[inptrs], 16]\n"
+ "ldr q18, [x19, x22]\n"
+ "fmla v3.4s, v10.4s, v11.4s\n"
+ "fmla v3.4s, v16.4s, v7.4s\n"
+ "beq 3f\n"
+ "2:\n"
+ "fmla v1.4s, v16.4s, v13.4s\n"
+ "ldr x20, [%[inptrs], 56]\n"
+ "fmla v3.4s, v17.4s, v9.4s\n"
+ "ldr x19, [%[inptrs], 24]\n"
+ "fmla v2.4s, v18.4s, v13.4s\n"
+ "ldr q16, [x20, x22]\n"
+ "movi v10.16b, #0\n"
+ "ldr q17, [x19, x22]\n"
+ "fmla v1.4s, v15.4s, v12.4s\n"
+ "ldr x27, [%[inptrs], 160]\n"
+ "fmla v3.4s, v18.4s, v4.4s\n"
+ "ldr x25, [%[inptrs], 128]\n"
+ "fmla v2.4s, v16.4s, v12.4s\n"
+ "ldr q18, [x27, x22]\n"
+ "ldr q15, [x25, x22]\n"
+ "ldr x21, [%[inptrs], 96]\n"
+ "fmla v1.4s, v14.4s, v11.4s\n"
+ "ldr x20, [%[inptrs], 64]\n"
+ "fmla v3.4s, v14.4s, v6.4s\n"
+ "ldr q14, [x21, x22]\n"
+ "fmla v2.4s, v17.4s, v11.4s\n"
+ "ldr q17, [x20, x22]\n"
+ "fmla v0.4s, v14.4s, v13.4s\n"
+ "ldr x19, [%[inptrs], 32]\n"
+ "fmla v1.4s, v18.4s, v7.4s\n"
+ "ldr x27, [%[inptrs], 168]\n"
+ "fmla v3.4s, v16.4s, v8.4s\n"
+ "ldr q18, [x19, x22]\n"
+ "fmla v2.4s, v14.4s, v7.4s\n"
+ "ldr q13, [x27, x22]\n"
+ "ldr x25, [%[inptrs], 136]\n"
+ "ldr x21, [%[inptrs], 104]\n"
+ "ldr x20, [%[inptrs], 72]\n"
+ "fmla v1.4s, v15.4s, v9.4s\n"
+ "ldr x27, [%[inptrs], 176]\n"
+ "fmla v3.4s, v14.4s, v5.4s\n"
+ "ldr q16, [x25, x22]\n"
+ "fmla v2.4s, v17.4s, v9.4s\n"
+ "ldr q17, [x21, x22]\n"
+ "fmla v0.4s, v16.4s, v12.4s\n"
+ "ldr q12, [x20, x22]\n"
+ "fmla v1.4s, v14.4s, v4.4s\n"
+ "ldr q15, [x27, x22]\n"
+ "fmax v3.4s, v3.4s, v10.4s\n"
+ "ldr x25, [%[inptrs], 144]\n"
+ "fmla v2.4s, v18.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 112]\n"
+ "fmla v0.4s, v17.4s, v11.4s\n"
+ "ldr q14, [x25, x22]\n"
+ "fmla v1.4s, v13.4s, v6.4s\n"
+ "ldr q11, [x21, x22]\n"
+ "ldr x27, [%[inptrs], 184]\n"
+ "ldr x25, [%[inptrs], 152]\n"
+ "ldr x21, [%[outptrs], 0]\n"
+ "fmla v2.4s, v17.4s, v6.4s\n"
+ "ldr x28, [%[outptrs], 16]\n"
+ "str q3, [x21, x26]\n"
+ "fmla v0.4s, v15.4s, v7.4s\n"
+ "fmla v1.4s, v16.4s, v8.4s\n"
+ "ldr q18, [x27, x22]\n"
+ "ldr q17, [x25, x22]\n"
+ "ldr x27, [%[inptrs], 192]\n"
+ "fmla v2.4s, v12.4s, v8.4s\n"
+ "ldr x21, [%[outptrs], 8]\n"
+ "fmla v0.4s, v14.4s, v9.4s\n"
+ "ldr q16, [x27, x22]\n"
+ "fmla v1.4s, v15.4s, v5.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "ldr q14, [%[wbptr]]\n"
+ "add x22, x22, #16\n"
+ "fmla v2.4s, v11.4s, v5.4s\n"
+ "ldr q13, [%[wbptr], #16]\n"
+ "fmla v0.4s, v11.4s, v4.4s\n"
+ "ldr q11, [%[wbptr], #32]\n"
+ "fmax v1.4s, v1.4s, v10.4s\n"
+ "ldr q12, [%[wbptr], #64]\n"
+ "mov v3.16b, v14.16b\n"
+ "ldr q9, [%[wbptr], #80]\n"
+ "fmax v2.4s, v2.4s, v10.4s\n"
+ "ldr q7, [%[wbptr], #112]\n"
+ "str q1, [x28, x26]\n"
+ "fmla v0.4s, v18.4s, v6.4s\n"
+ "mov v1.16b, v14.16b\n"
+ "ldr q4, [%[wbptr], #48]\n"
+ "str q2, [x21, x26]\n"
+ "ldr x28, [%[outptrs], 24]\n"
+ "mov v2.16b, v14.16b\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v0.4s, v17.4s, v8.4s\n"
+ "ldr q6, [%[wbptr], #128]\n"
+ "ldr x19, [%[inptrs], 0]\n"
+ "ldr x20, [%[inptrs], 40]\n"
+ "ldr x21, [%[inptrs], 80]\n"
+ "ldr x25, [%[inptrs], 120]\n"
+ "subs x24, x24, #1\n"
+ "ldr q15, [x19, x22]\n"
+ "fmla v0.4s, v16.4s, v5.4s\n"
+ "ldr q8, [%[wbptr], #96]\n"
+ "fmla v3.4s, v15.4s, v13.4s\n"
+ "ldr q17, [x20, x22]\n"
+ "ldr q16, [x21, x22]\n"
+ "ldr x19, [%[inptrs], 8]\n"
+ "ldr q15, [x25, x22]\n"
+ "ldr x20, [%[inptrs], 48]\n"
+ "fmax v0.4s, v0.4s, v10.4s\n"
+ "ldr q5, [%[wbptr], #144]\n"
+ "fmla v3.4s, v17.4s, v12.4s\n"
+ "ldr q10, [x19, x22]\n"
+ "ldr q17, [x20, x22]\n"
+ "ldr x19, [%[inptrs], 16]\n"
+ "str q0, [x28, x26]\n"
+ "ldr x21, [%[inptrs], 88]\n"
+ "mov v0.16b, v14.16b\n"
+ "ldr q18, [x19, x22]\n"
+ "fmla v3.4s, v10.4s, v11.4s\n"
+ "ldr q14, [x21, x22]\n"
+ "add x26, x26, #16\n"
+ "fmla v3.4s, v16.4s, v7.4s\n"
+ "bne 2b\n"
+ "3:\n"
+ "fmla v1.4s, v16.4s, v13.4s\n"
+ "ldr x20, [%[inptrs], 56]\n"
+ "fmla v3.4s, v17.4s, v9.4s\n"
+ "ldr x19, [%[inptrs], 24]\n"
+ "fmla v2.4s, v18.4s, v13.4s\n"
+ "ldr q16, [x20, x22]\n"
+ "movi v10.16b, #0\n"
+ "ldr q17, [x19, x22]\n"
+ "fmla v1.4s, v15.4s, v12.4s\n"
+ "ldr x27, [%[inptrs], 160]\n"
+ "fmla v3.4s, v18.4s, v4.4s\n"
+ "ldr x25, [%[inptrs], 128]\n"
+ "fmla v2.4s, v16.4s, v12.4s\n"
+ "ldr q18, [x27, x22]\n"
+ "ldr q15, [x25, x22]\n"
+ "ldr x21, [%[inptrs], 96]\n"
+ "fmla v1.4s, v14.4s, v11.4s\n"
+ "ldr x20, [%[inptrs], 64]\n"
+ "fmla v3.4s, v14.4s, v6.4s\n"
+ "ldr q14, [x21, x22]\n"
+ "fmla v2.4s, v17.4s, v11.4s\n"
+ "ldr q17, [x20, x22]\n"
+ "fmla v0.4s, v14.4s, v13.4s\n"
+ "ldr x19, [%[inptrs], 32]\n"
+ "fmla v1.4s, v18.4s, v7.4s\n"
+ "ldr x27, [%[inptrs], 168]\n"
+ "fmla v3.4s, v16.4s, v8.4s\n"
+ "ldr q18, [x19, x22]\n"
+ "fmla v2.4s, v14.4s, v7.4s\n"
+ "ldr q13, [x27, x22]\n"
+ "ldr x25, [%[inptrs], 136]\n"
+ "ldr x21, [%[inptrs], 104]\n"
+ "ldr x20, [%[inptrs], 72]\n"
+ "fmla v1.4s, v15.4s, v9.4s\n"
+ "ldr x27, [%[inptrs], 176]\n"
+ "fmla v3.4s, v14.4s, v5.4s\n"
+ "ldr q16, [x25, x22]\n"
+ "fmla v2.4s, v17.4s, v9.4s\n"
+ "ldr q17, [x21, x22]\n"
+ "fmla v0.4s, v16.4s, v12.4s\n"
+ "ldr q12, [x20, x22]\n"
+ "fmla v1.4s, v14.4s, v4.4s\n"
+ "ldr q15, [x27, x22]\n"
+ "fmax v3.4s, v3.4s, v10.4s\n"
+ "ldr x25, [%[inptrs], 144]\n"
+ "fmla v2.4s, v18.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 112]\n"
+ "fmla v0.4s, v17.4s, v11.4s\n"
+ "ldr q14, [x25, x22]\n"
+ "fmla v1.4s, v13.4s, v6.4s\n"
+ "ldr q11, [x21, x22]\n"
+ "ldr x27, [%[inptrs], 184]\n"
+ "ldr x25, [%[inptrs], 152]\n"
+ "ldr x21, [%[outptrs], 0]\n"
+ "fmla v2.4s, v17.4s, v6.4s\n"
+ "ldr x28, [%[outptrs], 16]\n"
+ "str q3, [x21, x26]\n"
+ "fmla v0.4s, v15.4s, v7.4s\n"
+ "fmla v1.4s, v16.4s, v8.4s\n"
+ "ldr q18, [x27, x22]\n"
+ "ldr q17, [x25, x22]\n"
+ "ldr x27, [%[inptrs], 192]\n"
+ "fmla v2.4s, v12.4s, v8.4s\n"
+ "ldr x21, [%[outptrs], 8]\n"
+ "fmla v0.4s, v14.4s, v9.4s\n"
+ "ldr q16, [x27, x22]\n"
+ "fmla v1.4s, v15.4s, v5.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "add x22, x22, #16\n"
+ "fmla v2.4s, v11.4s, v5.4s\n"
+ "fmla v0.4s, v11.4s, v4.4s\n"
+ "fmax v1.4s, v1.4s, v10.4s\n"
+ "fmax v2.4s, v2.4s, v10.4s\n"
+ "str q1, [x28, x26]\n"
+ "fmla v0.4s, v18.4s, v6.4s\n"
+ "ldr x28, [%[outptrs], 24]\n"
+ "str q2, [x21, x26]\n"
+ "fmla v0.4s, v17.4s, v8.4s\n"
+ "fmla v0.4s, v16.4s, v5.4s\n"
+ "fmax v0.4s, v0.4s, v10.4s\n"
+ "str q0, [x28, x26]\n"
+ "add x26, x26, #16\n"
+ "4:\n"
+ "cbz x23, 7f\n"
+ "ldr s14, [%[wbptr]]\n"
+ "mov v3.16b, v14.16b\n"
+ "ldr s13, [%[wbptr], #4]\n"
+ "mov v1.16b, v14.16b\n"
+ "ldr s11, [%[wbptr], #8]\n"
+ "mov v2.16b, v14.16b\n"
+ "ldr s4, [%[wbptr], #12]\n"
+ "mov v0.16b, v14.16b\n"
+ "ldr s12, [%[wbptr], #16]\n"
+ "ldr s9, [%[wbptr], #20]\n"
+ "ldr x19, [%[inptrs], 0]\n"
+ "ldr s8, [%[wbptr], #24]\n"
+ "ldr x20, [%[inptrs], 40]\n"
+ "ldr s7, [%[wbptr], #28]\n"
+ "ldr x21, [%[inptrs], 80]\n"
+ "ldr s6, [%[wbptr], #32]\n"
+ "ldr x25, [%[inptrs], 120]\n"
+ "ldr s5, [%[wbptr], #36]\n"
+ "subs x23, x23, #1\n"
+ "ldr s15, [x19, x22]\n"
+ "ldr s17, [x20, x22]\n"
+ "fmla v3.4s, v15.4s, v13.4s\n"
+ "ldr s16, [x21, x22]\n"
+ "ldr s15, [x25, x22]\n"
+ "ldr x19, [%[inptrs], 8]\n"
+ "ldr x20, [%[inptrs], 48]\n"
+ "ldr x21, [%[inptrs], 88]\n"
+ "ldr s10, [x19, x22]\n"
+ "fmla v3.4s, v17.4s, v12.4s\n"
+ "ldr s17, [x20, x22]\n"
+ "ldr s14, [x21, x22]\n"
+ "ldr x19, [%[inptrs], 16]\n"
+ "ldr s18, [x19, x22]\n"
+ "fmla v3.4s, v10.4s, v11.4s\n"
+ "fmla v3.4s, v16.4s, v7.4s\n"
+ "beq 6f\n"
+ "5:\n"
+ "fmla v1.4s, v16.4s, v13.4s\n"
+ "ldr x20, [%[inptrs], 56]\n"
+ "fmla v3.4s, v17.4s, v9.4s\n"
+ "ldr x19, [%[inptrs], 24]\n"
+ "fmla v2.4s, v18.4s, v13.4s\n"
+ "ldr s16, [x20, x22]\n"
+ "movi v10.16b, #0\n"
+ "ldr s17, [x19, x22]\n"
+ "fmla v1.4s, v15.4s, v12.4s\n"
+ "ldr x27, [%[inptrs], 160]\n"
+ "fmla v3.4s, v18.4s, v4.4s\n"
+ "ldr x25, [%[inptrs], 128]\n"
+ "fmla v2.4s, v16.4s, v12.4s\n"
+ "ldr s18, [x27, x22]\n"
+ "ldr s15, [x25, x22]\n"
+ "ldr x21, [%[inptrs], 96]\n"
+ "fmla v1.4s, v14.4s, v11.4s\n"
+ "ldr x20, [%[inptrs], 64]\n"
+ "fmla v3.4s, v14.4s, v6.4s\n"
+ "ldr s14, [x21, x22]\n"
+ "fmla v2.4s, v17.4s, v11.4s\n"
+ "ldr s17, [x20, x22]\n"
+ "fmla v0.4s, v14.4s, v13.4s\n"
+ "ldr x19, [%[inptrs], 32]\n"
+ "fmla v1.4s, v18.4s, v7.4s\n"
+ "ldr x27, [%[inptrs], 168]\n"
+ "fmla v3.4s, v16.4s, v8.4s\n"
+ "ldr s18, [x19, x22]\n"
+ "fmla v2.4s, v14.4s, v7.4s\n"
+ "ldr s13, [x27, x22]\n"
+ "ldr x25, [%[inptrs], 136]\n"
+ "ldr x21, [%[inptrs], 104]\n"
+ "ldr x20, [%[inptrs], 72]\n"
+ "fmla v1.4s, v15.4s, v9.4s\n"
+ "ldr x27, [%[inptrs], 176]\n"
+ "fmla v3.4s, v14.4s, v5.4s\n"
+ "ldr s16, [x25, x22]\n"
+ "fmla v2.4s, v17.4s, v9.4s\n"
+ "ldr s17, [x21, x22]\n"
+ "fmla v0.4s, v16.4s, v12.4s\n"
+ "ldr s12, [x20, x22]\n"
+ "fmla v1.4s, v14.4s, v4.4s\n"
+ "ldr s15, [x27, x22]\n"
+ "fmax v3.4s, v3.4s, v10.4s\n"
+ "ldr x25, [%[inptrs], 144]\n"
+ "fmla v2.4s, v18.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 112]\n"
+ "fmla v0.4s, v17.4s, v11.4s\n"
+ "ldr s14, [x25, x22]\n"
+ "fmla v1.4s, v13.4s, v6.4s\n"
+ "ldr s11, [x21, x22]\n"
+ "ldr x27, [%[inptrs], 184]\n"
+ "ldr x25, [%[inptrs], 152]\n"
+ "ldr x21, [%[outptrs], 0]\n"
+ "fmla v2.4s, v17.4s, v6.4s\n"
+ "ldr x28, [%[outptrs], 16]\n"
+ "str s3, [x21, x26]\n"
+ "fmla v0.4s, v15.4s, v7.4s\n"
+ "fmla v1.4s, v16.4s, v8.4s\n"
+ "ldr s18, [x27, x22]\n"
+ "ldr s17, [x25, x22]\n"
+ "ldr x27, [%[inptrs], 192]\n"
+ "fmla v2.4s, v12.4s, v8.4s\n"
+ "ldr x21, [%[outptrs], 8]\n"
+ "fmla v0.4s, v14.4s, v9.4s\n"
+ "ldr s16, [x27, x22]\n"
+ "fmla v1.4s, v15.4s, v5.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "ldr s14, [%[wbptr]]\n"
+ "add x22, x22, #4\n"
+ "fmla v2.4s, v11.4s, v5.4s\n"
+ "ldr s13, [%[wbptr], #4]\n"
+ "fmla v0.4s, v11.4s, v4.4s\n"
+ "ldr s11, [%[wbptr], #8]\n"
+ "fmax v1.4s, v1.4s, v10.4s\n"
+ "ldr s12, [%[wbptr], #16]\n"
+ "mov v3.16b, v14.16b\n"
+ "ldr s9, [%[wbptr], #20]\n"
+ "fmax v2.4s, v2.4s, v10.4s\n"
+ "ldr s7, [%[wbptr], #28]\n"
+ "str s1, [x28, x26]\n"
+ "fmla v0.4s, v18.4s, v6.4s\n"
+ "mov v1.16b, v14.16b\n"
+ "ldr s4, [%[wbptr], #12]\n"
+ "str s2, [x21, x26]\n"
+ "ldr x28, [%[outptrs], 24]\n"
+ "mov v2.16b, v14.16b\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v0.4s, v17.4s, v8.4s\n"
+ "ldr s6, [%[wbptr], #32]\n"
+ "ldr x19, [%[inptrs], 0]\n"
+ "ldr x20, [%[inptrs], 40]\n"
+ "ldr x21, [%[inptrs], 80]\n"
+ "ldr x25, [%[inptrs], 120]\n"
+ "subs x23, x23, #1\n"
+ "ldr s15, [x19, x22]\n"
+ "fmla v0.4s, v16.4s, v5.4s\n"
+ "ldr s8, [%[wbptr], #24]\n"
+ "fmla v3.4s, v15.4s, v13.4s\n"
+ "ldr s17, [x20, x22]\n"
+ "ldr s16, [x21, x22]\n"
+ "ldr x19, [%[inptrs], 8]\n"
+ "ldr s15, [x25, x22]\n"
+ "ldr x20, [%[inptrs], 48]\n"
+ "fmax v0.4s, v0.4s, v10.4s\n"
+ "ldr s5, [%[wbptr], #36]\n"
+ "fmla v3.4s, v17.4s, v12.4s\n"
+ "ldr s10, [x19, x22]\n"
+ "ldr s17, [x20, x22]\n"
+ "ldr x19, [%[inptrs], 16]\n"
+ "str s0, [x28, x26]\n"
+ "ldr x21, [%[inptrs], 88]\n"
+ "mov v0.16b, v14.16b\n"
+ "ldr s18, [x19, x22]\n"
+ "fmla v3.4s, v10.4s, v11.4s\n"
+ "ldr s14, [x21, x22]\n"
+ "add x26, x26, #4\n"
+ "fmla v3.4s, v16.4s, v7.4s\n"
+ "bne 5b\n"
+ "6:\n"
+ "fmla v1.4s, v16.4s, v13.4s\n"
+ "ldr x20, [%[inptrs], 56]\n"
+ "fmla v3.4s, v17.4s, v9.4s\n"
+ "ldr x19, [%[inptrs], 24]\n"
+ "fmla v2.4s, v18.4s, v13.4s\n"
+ "ldr s16, [x20, x22]\n"
+ "movi v10.16b, #0\n"
+ "ldr s17, [x19, x22]\n"
+ "fmla v1.4s, v15.4s, v12.4s\n"
+ "ldr x27, [%[inptrs], 160]\n"
+ "fmla v3.4s, v18.4s, v4.4s\n"
+ "ldr x25, [%[inptrs], 128]\n"
+ "fmla v2.4s, v16.4s, v12.4s\n"
+ "ldr s18, [x27, x22]\n"
+ "ldr s15, [x25, x22]\n"
+ "ldr x21, [%[inptrs], 96]\n"
+ "fmla v1.4s, v14.4s, v11.4s\n"
+ "ldr x20, [%[inptrs], 64]\n"
+ "fmla v3.4s, v14.4s, v6.4s\n"
+ "ldr s14, [x21, x22]\n"
+ "fmla v2.4s, v17.4s, v11.4s\n"
+ "ldr s17, [x20, x22]\n"
+ "fmla v0.4s, v14.4s, v13.4s\n"
+ "ldr x19, [%[inptrs], 32]\n"
+ "fmla v1.4s, v18.4s, v7.4s\n"
+ "ldr x27, [%[inptrs], 168]\n"
+ "fmla v3.4s, v16.4s, v8.4s\n"
+ "ldr s18, [x19, x22]\n"
+ "fmla v2.4s, v14.4s, v7.4s\n"
+ "ldr s13, [x27, x22]\n"
+ "ldr x25, [%[inptrs], 136]\n"
+ "ldr x21, [%[inptrs], 104]\n"
+ "ldr x20, [%[inptrs], 72]\n"
+ "fmla v1.4s, v15.4s, v9.4s\n"
+ "ldr x27, [%[inptrs], 176]\n"
+ "fmla v3.4s, v14.4s, v5.4s\n"
+ "ldr s16, [x25, x22]\n"
+ "fmla v2.4s, v17.4s, v9.4s\n"
+ "ldr s17, [x21, x22]\n"
+ "fmla v0.4s, v16.4s, v12.4s\n"
+ "ldr s12, [x20, x22]\n"
+ "fmla v1.4s, v14.4s, v4.4s\n"
+ "ldr s15, [x27, x22]\n"
+ "fmax v3.4s, v3.4s, v10.4s\n"
+ "ldr x25, [%[inptrs], 144]\n"
+ "fmla v2.4s, v18.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 112]\n"
+ "fmla v0.4s, v17.4s, v11.4s\n"
+ "ldr s14, [x25, x22]\n"
+ "fmla v1.4s, v13.4s, v6.4s\n"
+ "ldr s11, [x21, x22]\n"
+ "ldr x27, [%[inptrs], 184]\n"
+ "ldr x25, [%[inptrs], 152]\n"
+ "ldr x21, [%[outptrs], 0]\n"
+ "fmla v2.4s, v17.4s, v6.4s\n"
+ "ldr x28, [%[outptrs], 16]\n"
+ "str s3, [x21, x26]\n"
+ "fmla v0.4s, v15.4s, v7.4s\n"
+ "fmla v1.4s, v16.4s, v8.4s\n"
+ "ldr s18, [x27, x22]\n"
+ "ldr s17, [x25, x22]\n"
+ "ldr x27, [%[inptrs], 192]\n"
+ "fmla v2.4s, v12.4s, v8.4s\n"
+ "ldr x21, [%[outptrs], 8]\n"
+ "fmla v0.4s, v14.4s, v9.4s\n"
+ "ldr s16, [x27, x22]\n"
+ "fmla v1.4s, v15.4s, v5.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "add x22, x22, #4\n"
+ "fmla v2.4s, v11.4s, v5.4s\n"
+ "fmla v0.4s, v11.4s, v4.4s\n"
+ "fmax v1.4s, v1.4s, v10.4s\n"
+ "fmax v2.4s, v2.4s, v10.4s\n"
+ "str s1, [x28, x26]\n"
+ "fmla v0.4s, v18.4s, v6.4s\n"
+ "ldr x28, [%[outptrs], 24]\n"
+ "str s2, [x21, x26]\n"
+ "fmla v0.4s, v17.4s, v8.4s\n"
+ "fmla v0.4s, v16.4s, v5.4s\n"
+ "fmax v0.4s, v0.4s, v10.4s\n"
+ "str s0, [x28, x26]\n"
+ "add x26, x26, #4\n"
+ "7:\n"
+ : [wbptr] "+r" (weight_bias_ptr)
+ : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs)
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
+
+template <>
+template <>
void Conv::execute_tile<ActivationFunction::ReLU6>(
int n_channels,
const void *weight_bias_ptr,
@@ -1287,6 +2277,531 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
);
}
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU6>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+ float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+ __asm __volatile(
+ "mov x27, xzr\n"
+ "mov x28, xzr\n"
+ "and x26, %[n_channels], #3\n"
+ "lsr x25, %[n_channels], #2\n"
+ "cbz x25, 4f\n"
+ "1:\n"
+ "ldr q15, [%[wbptr]]\n"
+ "ldr x21, [%[inptrs], 0]\n"
+ "mov v8.16b, v15.16b\n"
+ "ldr q14, [%[wbptr], #16]\n"
+ "mov v3.16b, v15.16b\n"
+ "ldr q10, [%[wbptr], #32]\n"
+ "mov v2.16b, v15.16b\n"
+ "ldr q7, [%[wbptr], #48]\n"
+ "mov v4.16b, v15.16b\n"
+ "ldr q13, [%[wbptr], #64]\n"
+ "ldr q5, [%[wbptr], #80]\n"
+ "ldr x22, [%[inptrs], 40]\n"
+ "ldr q0, [%[wbptr], #96]\n"
+ "ldr x20, [%[inptrs], 80]\n"
+ "ldr q9, [%[wbptr], #112]\n"
+ "ldr x23, [%[inptrs], 120]\n"
+ "ldr q6, [%[wbptr], #128]\n"
+ "subs x25, x25, #1\n"
+ "ldr q1, [%[wbptr], #144]\n"
+ "ldr q17, [x21, x27]\n"
+ "fmla v8.4s, v17.4s, v14.4s\n"
+ "ldr q18, [x22, x27]\n"
+ "ldr q16, [x20, x27]\n"
+ "ldr x21, [%[inptrs], 8]\n"
+ "ldr q17, [x23, x27]\n"
+ "ldr x22, [%[inptrs], 48]\n"
+ "ldr q11, [x21, x27]\n"
+ "ldr x20, [%[inptrs], 88]\n"
+ "fmla v8.4s, v18.4s, v13.4s\n"
+ "ldr q19, [x22, x27]\n"
+ "ldr q15, [x20, x27]\n"
+ "ldr x21, [%[inptrs], 16]\n"
+ "ldr q12, [x21, x27]\n"
+ "fmla v8.4s, v11.4s, v10.4s\n"
+ "fmla v8.4s, v16.4s, v9.4s\n"
+ "beq 3f\n"
+ "2:\n"
+ "fmla v3.4s, v16.4s, v14.4s\n"
+ "ldr x22, [%[inptrs], 56]\n"
+ "fmla v8.4s, v19.4s, v5.4s\n"
+ "ldr x21, [%[inptrs], 24]\n"
+ "fmla v2.4s, v12.4s, v14.4s\n"
+ "ldr q16, [x22, x27]\n"
+ "movi v11.16b, #0\n"
+ "ldr q18, [x21, x27]\n"
+ "fmla v3.4s, v17.4s, v13.4s\n"
+ "ldr x20, [%[inptrs], 160]\n"
+ "fmla v8.4s, v12.4s, v7.4s\n"
+ "ldr x23, [%[inptrs], 128]\n"
+ "fmla v2.4s, v16.4s, v13.4s\n"
+ "ldr q19, [x20, x27]\n"
+ "fmov v12.4s, #6.0\n"
+ "ldr q17, [x23, x27]\n"
+ "fmla v3.4s, v15.4s, v10.4s\n"
+ "ldr x20, [%[inptrs], 96]\n"
+ "fmla v8.4s, v15.4s, v6.4s\n"
+ "ldr x22, [%[inptrs], 64]\n"
+ "fmla v2.4s, v18.4s, v10.4s\n"
+ "ldr q15, [x20, x27]\n"
+ "fmla v4.4s, v15.4s, v14.4s\n"
+ "ldr q18, [x22, x27]\n"
+ "fmla v3.4s, v19.4s, v9.4s\n"
+ "ldr x21, [%[inptrs], 32]\n"
+ "fmla v8.4s, v16.4s, v0.4s\n"
+ "ldr x20, [%[inptrs], 168]\n"
+ "fmla v2.4s, v15.4s, v9.4s\n"
+ "ldr q19, [x21, x27]\n"
+ "ldr q16, [x20, x27]\n"
+ "ldr x23, [%[inptrs], 136]\n"
+ "fmla v3.4s, v17.4s, v5.4s\n"
+ "ldr x20, [%[inptrs], 104]\n"
+ "fmla v8.4s, v15.4s, v1.4s\n"
+ "ldr q14, [x23, x27]\n"
+ "fmla v2.4s, v18.4s, v5.4s\n"
+ "ldr q17, [x20, x27]\n"
+ "fmla v4.4s, v14.4s, v13.4s\n"
+ "ldr x22, [%[inptrs], 72]\n"
+ "fmla v3.4s, v15.4s, v7.4s\n"
+ "ldr x20, [%[inptrs], 176]\n"
+ "fmax v8.4s, v8.4s, v11.4s\n"
+ "ldr q18, [x22, x27]\n"
+ "fmla v2.4s, v19.4s, v7.4s\n"
+ "ldr q13, [x20, x27]\n"
+ "fmla v4.4s, v17.4s, v10.4s\n"
+ "ldr x23, [%[inptrs], 144]\n"
+ "fmla v3.4s, v16.4s, v6.4s\n"
+ "ldr x20, [%[inptrs], 112]\n"
+ "fmin v8.4s, v8.4s, v12.4s\n"
+ "ldr q10, [x23, x27]\n"
+ "fmla v2.4s, v17.4s, v6.4s\n"
+ "ldr q15, [x20, x27]\n"
+ "fmla v4.4s, v13.4s, v9.4s\n"
+ "ldr x20, [%[inptrs], 184]\n"
+ "fmla v3.4s, v14.4s, v0.4s\n"
+ "ldr x23, [%[inptrs], 152]\n"
+ "ldr q9, [x20, x27]\n"
+ "ldr x22, [%[outptrs], 0]\n"
+ "fmla v2.4s, v18.4s, v0.4s\n"
+ "ldr q19, [x23, x27]\n"
+ "str q8, [x22, x28]\n"
+ "fmla v4.4s, v10.4s, v5.4s\n"
+ "fmla v3.4s, v13.4s, v1.4s\n"
+ "ldr x20, [%[inptrs], 192]\n"
+ "ldr x22, [%[outptrs], 8]\n"
+ "ldr x24, [%[outptrs], 16]\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v2.4s, v15.4s, v1.4s\n"
+ "ldr q16, [x20, x27]\n"
+ "fmla v4.4s, v15.4s, v7.4s\n"
+ "ldr q15, [%[wbptr]]\n"
+ "fmax v3.4s, v3.4s, v11.4s\n"
+ "ldr q14, [%[wbptr], #16]\n"
+ "mov v8.16b, v15.16b\n"
+ "ldr q10, [%[wbptr], #32]\n"
+ "fmax v2.4s, v2.4s, v11.4s\n"
+ "ldr q13, [%[wbptr], #64]\n"
+ "fmla v4.4s, v9.4s, v6.4s\n"
+ "ldr q7, [%[wbptr], #48]\n"
+ "fmin v3.4s, v3.4s, v12.4s\n"
+ "ldr q5, [%[wbptr], #80]\n"
+ "fmin v2.4s, v2.4s, v12.4s\n"
+ "ldr q9, [%[wbptr], #112]\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "add x27, x27, #16\n"
+ "str q3, [x24, x28]\n"
+ "fmla v4.4s, v19.4s, v0.4s\n"
+ "str q2, [x22, x28]\n"
+ "mov v3.16b, v15.16b\n"
+ "mov v2.16b, v15.16b\n"
+ "ldr q6, [%[wbptr], #128]\n"
+ "ldr x24, [%[outptrs], 24]\n"
+ "ldr x21, [%[inptrs], 0]\n"
+ "ldr x22, [%[inptrs], 40]\n"
+ "fmla v4.4s, v16.4s, v1.4s\n"
+ "ldr q0, [%[wbptr], #96]\n"
+ "ldr q17, [x21, x27]\n"
+ "ldr x20, [%[inptrs], 80]\n"
+ "fmla v8.4s, v17.4s, v14.4s\n"
+ "ldr q18, [x22, x27]\n"
+ "ldr q16, [x20, x27]\n"
+ "ldr x21, [%[inptrs], 8]\n"
+ "fmax v4.4s, v4.4s, v11.4s\n"
+ "ldr q1, [%[wbptr], #144]\n"
+ "ldr q11, [x21, x27]\n"
+ "ldr x22, [%[inptrs], 48]\n"
+ "fmla v8.4s, v18.4s, v13.4s\n"
+ "ldr x21, [%[inptrs], 16]\n"
+ "fmin v4.4s, v4.4s, v12.4s\n"
+ "ldr q19, [x22, x27]\n"
+ "ldr q12, [x21, x27]\n"
+ "ldr x23, [%[inptrs], 120]\n"
+ "ldr x20, [%[inptrs], 88]\n"
+ "subs x25, x25, #1\n"
+ "str q4, [x24, x28]\n"
+ "mov v4.16b, v15.16b\n"
+ "ldr q17, [x23, x27]\n"
+ "fmla v8.4s, v11.4s, v10.4s\n"
+ "ldr q15, [x20, x27]\n"
+ "add x28, x28, #16\n"
+ "fmla v8.4s, v16.4s, v9.4s\n"
+ "bne 2b\n"
+ "3:\n"
+ "fmla v3.4s, v16.4s, v14.4s\n"
+ "ldr x22, [%[inptrs], 56]\n"
+ "fmla v8.4s, v19.4s, v5.4s\n"
+ "ldr x21, [%[inptrs], 24]\n"
+ "fmla v2.4s, v12.4s, v14.4s\n"
+ "ldr q16, [x22, x27]\n"
+ "movi v11.16b, #0\n"
+ "ldr q18, [x21, x27]\n"
+ "fmla v3.4s, v17.4s, v13.4s\n"
+ "ldr x20, [%[inptrs], 160]\n"
+ "fmla v8.4s, v12.4s, v7.4s\n"
+ "ldr x23, [%[inptrs], 128]\n"
+ "fmla v2.4s, v16.4s, v13.4s\n"
+ "ldr q19, [x20, x27]\n"
+ "fmov v12.4s, #6.0\n"
+ "ldr q17, [x23, x27]\n"
+ "fmla v3.4s, v15.4s, v10.4s\n"
+ "ldr x20, [%[inptrs], 96]\n"
+ "fmla v8.4s, v15.4s, v6.4s\n"
+ "ldr x22, [%[inptrs], 64]\n"
+ "fmla v2.4s, v18.4s, v10.4s\n"
+ "ldr q15, [x20, x27]\n"
+ "fmla v4.4s, v15.4s, v14.4s\n"
+ "ldr q18, [x22, x27]\n"
+ "fmla v3.4s, v19.4s, v9.4s\n"
+ "ldr x21, [%[inptrs], 32]\n"
+ "fmla v8.4s, v16.4s, v0.4s\n"
+ "ldr x20, [%[inptrs], 168]\n"
+ "fmla v2.4s, v15.4s, v9.4s\n"
+ "ldr q19, [x21, x27]\n"
+ "ldr q16, [x20, x27]\n"
+ "ldr x23, [%[inptrs], 136]\n"
+ "fmla v3.4s, v17.4s, v5.4s\n"
+ "ldr x20, [%[inptrs], 104]\n"
+ "fmla v8.4s, v15.4s, v1.4s\n"
+ "ldr q14, [x23, x27]\n"
+ "fmla v2.4s, v18.4s, v5.4s\n"
+ "ldr q17, [x20, x27]\n"
+ "fmla v4.4s, v14.4s, v13.4s\n"
+ "ldr x22, [%[inptrs], 72]\n"
+ "fmla v3.4s, v15.4s, v7.4s\n"
+ "ldr x20, [%[inptrs], 176]\n"
+ "fmax v8.4s, v8.4s, v11.4s\n"
+ "ldr q18, [x22, x27]\n"
+ "fmla v2.4s, v19.4s, v7.4s\n"
+ "ldr q13, [x20, x27]\n"
+ "fmla v4.4s, v17.4s, v10.4s\n"
+ "ldr x23, [%[inptrs], 144]\n"
+ "fmla v3.4s, v16.4s, v6.4s\n"
+ "ldr x20, [%[inptrs], 112]\n"
+ "fmin v8.4s, v8.4s, v12.4s\n"
+ "ldr q10, [x23, x27]\n"
+ "fmla v2.4s, v17.4s, v6.4s\n"
+ "ldr q15, [x20, x27]\n"
+ "fmla v4.4s, v13.4s, v9.4s\n"
+ "ldr x20, [%[inptrs], 184]\n"
+ "fmla v3.4s, v14.4s, v0.4s\n"
+ "ldr x23, [%[inptrs], 152]\n"
+ "ldr q9, [x20, x27]\n"
+ "ldr x22, [%[outptrs], 0]\n"
+ "fmla v2.4s, v18.4s, v0.4s\n"
+ "ldr q19, [x23, x27]\n"
+ "str q8, [x22, x28]\n"
+ "fmla v4.4s, v10.4s, v5.4s\n"
+ "fmla v3.4s, v13.4s, v1.4s\n"
+ "ldr x20, [%[inptrs], 192]\n"
+ "ldr x22, [%[outptrs], 8]\n"
+ "ldr x24, [%[outptrs], 16]\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v2.4s, v15.4s, v1.4s\n"
+ "ldr q16, [x20, x27]\n"
+ "fmla v4.4s, v15.4s, v7.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmax v3.4s, v3.4s, v11.4s\n"
+ "add x27, x27, #16\n"
+ "fmax v2.4s, v2.4s, v11.4s\n"
+ "fmla v4.4s, v9.4s, v6.4s\n"
+ "fmin v3.4s, v3.4s, v12.4s\n"
+ "fmin v2.4s, v2.4s, v12.4s\n"
+ "str q3, [x24, x28]\n"
+ "fmla v4.4s, v19.4s, v0.4s\n"
+ "str q2, [x22, x28]\n"
+ "ldr x24, [%[outptrs], 24]\n"
+ "fmla v4.4s, v16.4s, v1.4s\n"
+ "fmax v4.4s, v4.4s, v11.4s\n"
+ "fmin v4.4s, v4.4s, v12.4s\n"
+ "str q4, [x24, x28]\n"
+ "add x28, x28, #16\n"
+ "4:\n"
+ "cbz x26, 7f\n"
+ "ldr s15, [%[wbptr]]\n"
+ "mov v8.16b, v15.16b\n"
+ "ldr s14, [%[wbptr], #4]\n"
+ "mov v3.16b, v15.16b\n"
+ "ldr s10, [%[wbptr], #8]\n"
+ "mov v2.16b, v15.16b\n"
+ "ldr s7, [%[wbptr], #12]\n"
+ "mov v4.16b, v15.16b\n"
+ "ldr s13, [%[wbptr], #16]\n"
+ "ldr s5, [%[wbptr], #20]\n"
+ "ldr x21, [%[inptrs], 0]\n"
+ "ldr s0, [%[wbptr], #24]\n"
+ "ldr x22, [%[inptrs], 40]\n"
+ "ldr s9, [%[wbptr], #28]\n"
+ "ldr x20, [%[inptrs], 80]\n"
+ "ldr s6, [%[wbptr], #32]\n"
+ "ldr x23, [%[inptrs], 120]\n"
+ "ldr s1, [%[wbptr], #36]\n"
+ "subs x26, x26, #1\n"
+ "ldr s17, [x21, x27]\n"
+ "ldr s18, [x22, x27]\n"
+ "fmla v8.4s, v17.4s, v14.4s\n"
+ "ldr s16, [x20, x27]\n"
+ "ldr s17, [x23, x27]\n"
+ "ldr x21, [%[inptrs], 8]\n"
+ "ldr x22, [%[inptrs], 48]\n"
+ "ldr x20, [%[inptrs], 88]\n"
+ "ldr s11, [x21, x27]\n"
+ "fmla v8.4s, v18.4s, v13.4s\n"
+ "ldr s19, [x22, x27]\n"
+ "ldr s15, [x20, x27]\n"
+ "ldr x21, [%[inptrs], 16]\n"
+ "ldr s12, [x21, x27]\n"
+ "fmla v8.4s, v11.4s, v10.4s\n"
+ "fmla v8.4s, v16.4s, v9.4s\n"
+ "beq 6f\n"
+ "5:\n"
+ "fmla v3.4s, v16.4s, v14.4s\n"
+ "ldr x22, [%[inptrs], 56]\n"
+ "fmla v8.4s, v19.4s, v5.4s\n"
+ "ldr x21, [%[inptrs], 24]\n"
+ "fmla v2.4s, v12.4s, v14.4s\n"
+ "ldr s16, [x22, x27]\n"
+ "movi v11.16b, #0\n"
+ "ldr s18, [x21, x27]\n"
+ "fmla v3.4s, v17.4s, v13.4s\n"
+ "ldr x20, [%[inptrs], 160]\n"
+ "fmla v8.4s, v12.4s, v7.4s\n"
+ "ldr x23, [%[inptrs], 128]\n"
+ "fmla v2.4s, v16.4s, v13.4s\n"
+ "ldr s19, [x20, x27]\n"
+ "fmov v12.4s, #6.0\n"
+ "ldr s17, [x23, x27]\n"
+ "fmla v3.4s, v15.4s, v10.4s\n"
+ "ldr x20, [%[inptrs], 96]\n"
+ "fmla v8.4s, v15.4s, v6.4s\n"
+ "ldr x22, [%[inptrs], 64]\n"
+ "fmla v2.4s, v18.4s, v10.4s\n"
+ "ldr s15, [x20, x27]\n"
+ "fmla v4.4s, v15.4s, v14.4s\n"
+ "ldr s18, [x22, x27]\n"
+ "fmla v3.4s, v19.4s, v9.4s\n"
+ "ldr x21, [%[inptrs], 32]\n"
+ "fmla v8.4s, v16.4s, v0.4s\n"
+ "ldr x20, [%[inptrs], 168]\n"
+ "fmla v2.4s, v15.4s, v9.4s\n"
+ "ldr s19, [x21, x27]\n"
+ "ldr s16, [x20, x27]\n"
+ "ldr x23, [%[inptrs], 136]\n"
+ "fmla v3.4s, v17.4s, v5.4s\n"
+ "ldr x20, [%[inptrs], 104]\n"
+ "fmla v8.4s, v15.4s, v1.4s\n"
+ "ldr s14, [x23, x27]\n"
+ "fmla v2.4s, v18.4s, v5.4s\n"
+ "ldr s17, [x20, x27]\n"
+ "fmla v4.4s, v14.4s, v13.4s\n"
+ "ldr x22, [%[inptrs], 72]\n"
+ "fmla v3.4s, v15.4s, v7.4s\n"
+ "ldr x20, [%[inptrs], 176]\n"
+ "fmax v8.4s, v8.4s, v11.4s\n"
+ "ldr s18, [x22, x27]\n"
+ "fmla v2.4s, v19.4s, v7.4s\n"
+ "ldr s13, [x20, x27]\n"
+ "fmla v4.4s, v17.4s, v10.4s\n"
+ "ldr x23, [%[inptrs], 144]\n"
+ "fmla v3.4s, v16.4s, v6.4s\n"
+ "ldr x20, [%[inptrs], 112]\n"
+ "fmin v8.4s, v8.4s, v12.4s\n"
+ "ldr s10, [x23, x27]\n"
+ "fmla v2.4s, v17.4s, v6.4s\n"
+ "ldr s15, [x20, x27]\n"
+ "fmla v4.4s, v13.4s, v9.4s\n"
+ "ldr x20, [%[inptrs], 184]\n"
+ "fmla v3.4s, v14.4s, v0.4s\n"
+ "ldr x23, [%[inptrs], 152]\n"
+ "ldr s9, [x20, x27]\n"
+ "ldr x22, [%[outptrs], 0]\n"
+ "fmla v2.4s, v18.4s, v0.4s\n"
+ "ldr s19, [x23, x27]\n"
+ "str s8, [x22, x28]\n"
+ "fmla v4.4s, v10.4s, v5.4s\n"
+ "fmla v3.4s, v13.4s, v1.4s\n"
+ "ldr x20, [%[inptrs], 192]\n"
+ "ldr x22, [%[outptrs], 8]\n"
+ "ldr x24, [%[outptrs], 16]\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v2.4s, v15.4s, v1.4s\n"
+ "ldr s16, [x20, x27]\n"
+ "fmla v4.4s, v15.4s, v7.4s\n"
+ "ldr s15, [%[wbptr]]\n"
+ "fmax v3.4s, v3.4s, v11.4s\n"
+ "ldr s14, [%[wbptr], #4]\n"
+ "mov v8.16b, v15.16b\n"
+ "ldr s10, [%[wbptr], #8]\n"
+ "fmax v2.4s, v2.4s, v11.4s\n"
+ "ldr s13, [%[wbptr], #16]\n"
+ "fmla v4.4s, v9.4s, v6.4s\n"
+ "ldr s7, [%[wbptr], #12]\n"
+ "fmin v3.4s, v3.4s, v12.4s\n"
+ "ldr s5, [%[wbptr], #20]\n"
+ "fmin v2.4s, v2.4s, v12.4s\n"
+ "ldr s9, [%[wbptr], #28]\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "add x27, x27, #4\n"
+ "str s3, [x24, x28]\n"
+ "fmla v4.4s, v19.4s, v0.4s\n"
+ "str s2, [x22, x28]\n"
+ "mov v3.16b, v15.16b\n"
+ "mov v2.16b, v15.16b\n"
+ "ldr s6, [%[wbptr], #32]\n"
+ "ldr x24, [%[outptrs], 24]\n"
+ "ldr x21, [%[inptrs], 0]\n"
+ "ldr x22, [%[inptrs], 40]\n"
+ "fmla v4.4s, v16.4s, v1.4s\n"
+ "ldr s0, [%[wbptr], #24]\n"
+ "ldr s17, [x21, x27]\n"
+ "ldr x20, [%[inptrs], 80]\n"
+ "fmla v8.4s, v17.4s, v14.4s\n"
+ "ldr s18, [x22, x27]\n"
+ "ldr s16, [x20, x27]\n"
+ "ldr x21, [%[inptrs], 8]\n"
+ "fmax v4.4s, v4.4s, v11.4s\n"
+ "ldr s1, [%[wbptr], #36]\n"
+ "ldr s11, [x21, x27]\n"
+ "ldr x22, [%[inptrs], 48]\n"
+ "fmla v8.4s, v18.4s, v13.4s\n"
+ "ldr x21, [%[inptrs], 16]\n"
+ "fmin v4.4s, v4.4s, v12.4s\n"
+ "ldr s19, [x22, x27]\n"
+ "ldr s12, [x21, x27]\n"
+ "ldr x23, [%[inptrs], 120]\n"
+ "ldr x20, [%[inptrs], 88]\n"
+ "subs x26, x26, #1\n"
+ "str s4, [x24, x28]\n"
+ "mov v4.16b, v15.16b\n"
+ "ldr s17, [x23, x27]\n"
+ "fmla v8.4s, v11.4s, v10.4s\n"
+ "ldr s15, [x20, x27]\n"
+ "add x28, x28, #4\n"
+ "fmla v8.4s, v16.4s, v9.4s\n"
+ "bne 5b\n"
+ "6:\n"
+ "fmla v3.4s, v16.4s, v14.4s\n"
+ "ldr x22, [%[inptrs], 56]\n"
+ "fmla v8.4s, v19.4s, v5.4s\n"
+ "ldr x21, [%[inptrs], 24]\n"
+ "fmla v2.4s, v12.4s, v14.4s\n"
+ "ldr s16, [x22, x27]\n"
+ "movi v11.16b, #0\n"
+ "ldr s18, [x21, x27]\n"
+ "fmla v3.4s, v17.4s, v13.4s\n"
+ "ldr x20, [%[inptrs], 160]\n"
+ "fmla v8.4s, v12.4s, v7.4s\n"
+ "ldr x23, [%[inptrs], 128]\n"
+ "fmla v2.4s, v16.4s, v13.4s\n"
+ "ldr s19, [x20, x27]\n"
+ "fmov v12.4s, #6.0\n"
+ "ldr s17, [x23, x27]\n"
+ "fmla v3.4s, v15.4s, v10.4s\n"
+ "ldr x20, [%[inptrs], 96]\n"
+ "fmla v8.4s, v15.4s, v6.4s\n"
+ "ldr x22, [%[inptrs], 64]\n"
+ "fmla v2.4s, v18.4s, v10.4s\n"
+ "ldr s15, [x20, x27]\n"
+ "fmla v4.4s, v15.4s, v14.4s\n"
+ "ldr s18, [x22, x27]\n"
+ "fmla v3.4s, v19.4s, v9.4s\n"
+ "ldr x21, [%[inptrs], 32]\n"
+ "fmla v8.4s, v16.4s, v0.4s\n"
+ "ldr x20, [%[inptrs], 168]\n"
+ "fmla v2.4s, v15.4s, v9.4s\n"
+ "ldr s19, [x21, x27]\n"
+ "ldr s16, [x20, x27]\n"
+ "ldr x23, [%[inptrs], 136]\n"
+ "fmla v3.4s, v17.4s, v5.4s\n"
+ "ldr x20, [%[inptrs], 104]\n"
+ "fmla v8.4s, v15.4s, v1.4s\n"
+ "ldr s14, [x23, x27]\n"
+ "fmla v2.4s, v18.4s, v5.4s\n"
+ "ldr s17, [x20, x27]\n"
+ "fmla v4.4s, v14.4s, v13.4s\n"
+ "ldr x22, [%[inptrs], 72]\n"
+ "fmla v3.4s, v15.4s, v7.4s\n"
+ "ldr x20, [%[inptrs], 176]\n"
+ "fmax v8.4s, v8.4s, v11.4s\n"
+ "ldr s18, [x22, x27]\n"
+ "fmla v2.4s, v19.4s, v7.4s\n"
+ "ldr s13, [x20, x27]\n"
+ "fmla v4.4s, v17.4s, v10.4s\n"
+ "ldr x23, [%[inptrs], 144]\n"
+ "fmla v3.4s, v16.4s, v6.4s\n"
+ "ldr x20, [%[inptrs], 112]\n"
+ "fmin v8.4s, v8.4s, v12.4s\n"
+ "ldr s10, [x23, x27]\n"
+ "fmla v2.4s, v17.4s, v6.4s\n"
+ "ldr s15, [x20, x27]\n"
+ "fmla v4.4s, v13.4s, v9.4s\n"
+ "ldr x20, [%[inptrs], 184]\n"
+ "fmla v3.4s, v14.4s, v0.4s\n"
+ "ldr x23, [%[inptrs], 152]\n"
+ "ldr s9, [x20, x27]\n"
+ "ldr x22, [%[outptrs], 0]\n"
+ "fmla v2.4s, v18.4s, v0.4s\n"
+ "ldr s19, [x23, x27]\n"
+ "str s8, [x22, x28]\n"
+ "fmla v4.4s, v10.4s, v5.4s\n"
+ "fmla v3.4s, v13.4s, v1.4s\n"
+ "ldr x20, [%[inptrs], 192]\n"
+ "ldr x22, [%[outptrs], 8]\n"
+ "ldr x24, [%[outptrs], 16]\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v2.4s, v15.4s, v1.4s\n"
+ "ldr s16, [x20, x27]\n"
+ "fmla v4.4s, v15.4s, v7.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmax v3.4s, v3.4s, v11.4s\n"
+ "add x27, x27, #4\n"
+ "fmax v2.4s, v2.4s, v11.4s\n"
+ "fmla v4.4s, v9.4s, v6.4s\n"
+ "fmin v3.4s, v3.4s, v12.4s\n"
+ "fmin v2.4s, v2.4s, v12.4s\n"
+ "str s3, [x24, x28]\n"
+ "fmla v4.4s, v19.4s, v0.4s\n"
+ "str s2, [x22, x28]\n"
+ "ldr x24, [%[outptrs], 24]\n"
+ "fmla v4.4s, v16.4s, v1.4s\n"
+ "fmax v4.4s, v4.4s, v11.4s\n"
+ "fmin v4.4s, v4.4s, v12.4s\n"
+ "str s4, [x24, x28]\n"
+ "add x28, x28, #4\n"
+ "7:\n"
+ : [wbptr] "+r" (weight_bias_ptr)
+ : [inptrs] "r" (inptrs), [outptrs] "r" (outptrs), [n_channels] "r" ((long) n_channels)
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
+
#endif // __aarch64__
template class DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
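Note on the kernel structure (applies to both files in this patch): each execute_tile specialisation splits n_channels into a quotient/remainder pair (a "lsr ..., #2" count drives the four-wide main loop on q registers, an "and ..., #3" count drives the scalar tail on s registers), and the activation is fused into the store epilogue — for ReLU6 the bounds are materialised with "movi vN.16b, #0" and "fmov vN.4s, #6.0" and applied as an fmax followed by an fmin. A minimal intrinsics sketch of that fused clamp-and-split pattern follows; relu6_inplace is a hypothetical illustrative helper, not code from this patch or the library.

#include <arm_neon.h>
#include <algorithm>

// Illustrative only: the ReLU6 epilogue the assembly fuses into each tile,
// with the same 4-wide / scalar split the channel counts implement.
static void relu6_inplace(float *data, int n_channels)
{
    const float32x4_t vzero = vdupq_n_f32(0.0f); // assembly: movi vN.16b, #0
    const float32x4_t vsix  = vdupq_n_f32(6.0f); // assembly: fmov vN.4s, #6.0
    int i = 0;
    for (; i + 4 <= n_channels; i += 4)          // main loop, q registers
    {
        float32x4_t v = vld1q_f32(data + i);
        v = vminq_f32(vmaxq_f32(v, vzero), vsix); // fmax then fmin, as above
        vst1q_f32(data + i, v);
    }
    for (; i < n_channels; i++)                  // tail loop, s registers
    {
        data[i] = std::min(std::max(data[i], 0.0f), 6.0f);
    }
}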
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
index ff0e454c76..a583615c99 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
@@ -1169,6 +1169,1155 @@ void Conv::execute_tile<ActivationFunction::None>(
template <>
template <>
+void Conv::execute_tile<ActivationFunction::None>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *inptrs[6][6],
+ float *outptrs[4][4]
+)
+{
+ __asm __volatile(
+ "mov x27, xzr\n"
+ "mov x28, xzr\n"
+ "and x15, %[n_channels], #3\n"
+ "lsr x16, %[n_channels], #2\n"
+ "cbz x16, 4f\n"
+ "1:\n"
+ "ldr q13, [%[wbptr]]\n"
+ "ldr x17, [%[inptrs], 0]\n"
+ "mov v18.16b, v13.16b\n"
+ "ldr q12, [%[wbptr], #16]\n"
+ "mov v22.16b, v13.16b\n"
+ "ldr q11, [%[wbptr], #32]\n"
+ "mov v23.16b, v13.16b\n"
+ "ldr q10, [%[wbptr], #48]\n"
+ "mov v19.16b, v13.16b\n"
+ "ldr q9, [%[wbptr], #64]\n"
+ "mov v17.16b, v13.16b\n"
+ "ldr q8, [%[wbptr], #80]\n"
+ "mov v14.16b, v13.16b\n"
+ "ldr q7, [%[wbptr], #96]\n"
+ "mov v0.16b, v13.16b\n"
+ "ldr q6, [%[wbptr], #112]\n"
+ "mov v1.16b, v13.16b\n"
+ "ldr q5, [%[wbptr], #128]\n"
+ "mov v2.16b, v13.16b\n"
+ "ldr q4, [%[wbptr], #144]\n"
+ "ldr q29, [x17, x27]\n"
+ "ldr x18, [%[inptrs], 48]\n"
+ "fmla v18.4s, v29.4s, v12.4s\n"
+ "ldr x17, [%[inptrs], 8]\n"
+ "ldr q27, [x18, x27]\n"
+ "ldr x19, [%[inptrs], 96]\n"
+ "ldr q28, [x17, x27]\n"
+ "ldr x18, [%[inptrs], 56]\n"
+ "ldr q25, [x19, x27]\n"
+ "ldr x17, [%[inptrs], 16]\n"
+ "ldr q16, [x18, x27]\n"
+ "ldr x20, [%[inptrs], 144]\n"
+ "ldr q15, [x17, x27]\n"
+ "ldr x19, [%[inptrs], 104]\n"
+ "ldr q21, [x20, x27]\n"
+ "subs x16, x16, #1\n"
+ "ldr q29, [x19, x27]\n"
+ "beq 3f\n"
+ "2:\n"
+ "mov v3.16b, v13.16b\n"
+ "ldr x18, [%[inptrs], 64]\n"
+ "fmla v18.4s, v27.4s, v9.4s\n"
+ "ldr x17, [%[inptrs], 24]\n"
+ "fmla v22.4s, v27.4s, v12.4s\n"
+ "ldr q30, [x18, x27]\n"
+ "fmla v23.4s, v28.4s, v12.4s\n"
+ "ldr x21, [%[inptrs], 192]\n"
+ "fmla v19.4s, v25.4s, v12.4s\n"
+ "ldr x20, [%[inptrs], 152]\n"
+ "fmla v18.4s, v28.4s, v11.4s\n"
+ "ldr q24, [x17, x27]\n"
+ "fmla v22.4s, v25.4s, v9.4s\n"
+ "ldr x19, [%[inptrs], 112]\n"
+ "fmla v23.4s, v16.4s, v9.4s\n"
+ "ldr x18, [%[inptrs], 72]\n"
+ "fmla v17.4s, v16.4s, v12.4s\n"
+ "ldr x17, [%[inptrs], 32]\n"
+ "fmla v18.4s, v25.4s, v6.4s\n"
+ "ldr q31, [x21, x27]\n"
+ "fmla v22.4s, v16.4s, v11.4s\n"
+ "ldr x22, [%[inptrs], 240]\n"
+ "fmla v23.4s, v15.4s, v11.4s\n"
+ "ldr x21, [%[inptrs], 200]\n"
+ "fmla v14.4s, v15.4s, v12.4s\n"
+ "ldr x23, [%[outptrs], 0]\n"
+ "fmla v18.4s, v16.4s, v8.4s\n"
+ "ldr q25, [x20, x27]\n"
+ "fmla v22.4s, v21.4s, v6.4s\n"
+ "ldr x20, [%[inptrs], 160]\n"
+ "fmla v19.4s, v21.4s, v9.4s\n"
+ "ldr x24, [%[outptrs], 32]\n"
+ "fmla v0.4s, v21.4s, v12.4s\n"
+ "ldr q21, [x19, x27]\n"
+ "fmla v18.4s, v15.4s, v10.4s\n"
+ "ldr q20, [x18, x27]\n"
+ "fmla v22.4s, v29.4s, v8.4s\n"
+ "ldr x19, [%[inptrs], 120]\n"
+ "fmla v23.4s, v29.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 80]\n"
+ "fmla v19.4s, v29.4s, v11.4s\n"
+ "ldr x25, [%[outptrs], 64]\n"
+ "fmla v18.4s, v29.4s, v5.4s\n"
+ "ldr x26, [%[outptrs], 96]\n"
+ "fmla v17.4s, v29.4s, v9.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v1.4s, v29.4s, v12.4s\n"
+ "ldr q26, [x17, x27]\n"
+ "fmla v22.4s, v30.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v18.4s, v30.4s, v7.4s\n"
+ "ldr x17, [%[inptrs], 40]\n"
+ "fmla v23.4s, v30.4s, v8.4s\n"
+ "subs x16, x16, #1\n"
+ "fmla v17.4s, v30.4s, v11.4s\n"
+ "fmla v14.4s, v30.4s, v9.4s\n"
+ "fmla v2.4s, v30.4s, v12.4s\n"
+ "ldr q27, [x22, x27]\n"
+ "fmla v3.4s, v24.4s, v12.4s\n"
+ "ldr x22, [%[inptrs], 248]\n"
+ "fmla v23.4s, v24.4s, v10.4s\n"
+ "fmla v19.4s, v31.4s, v6.4s\n"
+ "fmla v14.4s, v24.4s, v11.4s\n"
+ "ldr q30, [x21, x27]\n"
+ "fmla v0.4s, v31.4s, v9.4s\n"
+ "ldr q24, [x20, x27]\n"
+ "fmla v22.4s, v25.4s, v5.4s\n"
+ "ldr x21, [%[inptrs], 208]\n"
+ "fmla v19.4s, v25.4s, v8.4s\n"
+ "ldr x20, [%[inptrs], 168]\n"
+ "fmla v17.4s, v25.4s, v6.4s\n"
+ "fmla v1.4s, v25.4s, v9.4s\n"
+ "fmla v0.4s, v25.4s, v11.4s\n"
+ "fmla v18.4s, v21.4s, v4.4s\n"
+ "fmla v22.4s, v21.4s, v7.4s\n"
+ "fmla v23.4s, v21.4s, v5.4s\n"
+ "fmla v19.4s, v21.4s, v10.4s\n"
+ "fmla v14.4s, v21.4s, v6.4s\n"
+ "fmla v17.4s, v21.4s, v8.4s\n"
+ "fmla v1.4s, v21.4s, v11.4s\n"
+ "str q18, [x23, x28]\n"
+ "mov v16.16b, v13.16b\n"
+ "fmla v2.4s, v21.4s, v9.4s\n"
+ "ldr x23, [%[outptrs], 8]\n"
+ "fmla v23.4s, v20.4s, v7.4s\n"
+ "fmla v14.4s, v20.4s, v8.4s\n"
+ "fmla v16.4s, v25.4s, v12.4s\n"
+ "ldr q25, [x19, x27]\n"
+ "fmla v17.4s, v20.4s, v10.4s\n"
+ "ldr x19, [%[inptrs], 128]\n"
+ "fmla v2.4s, v20.4s, v11.4s\n"
+ "fmla v3.4s, v20.4s, v9.4s\n"
+ "fmla v14.4s, v26.4s, v10.4s\n"
+ "fmla v0.4s, v27.4s, v6.4s\n"
+ "mov v15.16b, v13.16b\n"
+ "fmla v19.4s, v30.4s, v5.4s\n"
+ "fmla v1.4s, v30.4s, v6.4s\n"
+ "fmla v16.4s, v30.4s, v9.4s\n"
+ "fmla v3.4s, v26.4s, v11.4s\n"
+ "ldr q29, [x18, x27]\n"
+ "fmla v15.4s, v21.4s, v12.4s\n"
+ "ldr q27, [x17, x27]\n"
+ "fmla v0.4s, v30.4s, v8.4s\n"
+ "ldr q28, [x22, x27]\n"
+ "fmla v22.4s, v24.4s, v4.4s\n"
+ "ldr x18, [%[inptrs], 88]\n"
+ "fmla v19.4s, v24.4s, v7.4s\n"
+ "ldr x22, [%[inptrs], 256]\n"
+ "fmla v17.4s, v24.4s, v5.4s\n"
+ "ldr x17, [%[inptrs], 0]\n"
+ "fmla v0.4s, v24.4s, v10.4s\n"
+ "fmla v1.4s, v24.4s, v8.4s\n"
+ "str q22, [x24, x28]\n"
+ "mov v18.16b, v13.16b\n"
+ "fmla v2.4s, v24.4s, v6.4s\n"
+ "ldr x24, [%[outptrs], 40]\n"
+ "fmla v16.4s, v24.4s, v11.4s\n"
+ "fmla v15.4s, v24.4s, v9.4s\n"
+ "fmla v18.4s, v20.4s, v12.4s\n"
+ "ldr q22, [x21, x27]\n"
+ "fmla v23.4s, v25.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 216]\n"
+ "fmla v17.4s, v25.4s, v7.4s\n"
+ "fmla v14.4s, v25.4s, v5.4s\n"
+ "fmla v1.4s, v25.4s, v10.4s\n"
+ "fmla v2.4s, v25.4s, v8.4s\n"
+ "fmla v3.4s, v25.4s, v6.4s\n"
+ "fmla v15.4s, v25.4s, v11.4s\n"
+ "str q23, [x23, x28]\n"
+ "mov v21.16b, v13.16b\n"
+ "fmla v18.4s, v25.4s, v9.4s\n"
+ "ldr x23, [%[outptrs], 16]\n"
+ "fmla v14.4s, v29.4s, v7.4s\n"
+ "fmla v2.4s, v29.4s, v10.4s\n"
+ "fmla v21.4s, v24.4s, v12.4s\n"
+ "ldr q30, [x20, x27]\n"
+ "fmla v3.4s, v29.4s, v8.4s\n"
+ "ldr x20, [%[inptrs], 176]\n"
+ "fmla v18.4s, v29.4s, v11.4s\n"
+ "ldr q31, [x19, x27]\n"
+ "fmla v0.4s, v28.4s, v5.4s\n"
+ "ldr x19, [%[inptrs], 136]\n"
+ "fmla v16.4s, v28.4s, v6.4s\n"
+ "ldr q26, [x18, x27]\n"
+ "fmla v3.4s, v27.4s, v10.4s\n"
+ "ldr q23, [x22, x27]\n"
+ "fmla v19.4s, v22.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 264]\n"
+ "fmla v0.4s, v22.4s, v7.4s\n"
+ "ldr x18, [%[inptrs], 48]\n"
+ "fmla v1.4s, v22.4s, v5.4s\n"
+ "fmla v16.4s, v22.4s, v8.4s\n"
+ "fmla v15.4s, v22.4s, v6.4s\n"
+ "fmla v21.4s, v22.4s, v9.4s\n"
+ "str q19, [x25, x28]\n"
+ "mov v24.16b, v13.16b\n"
+ "mov v20.16b, v13.16b\n"
+ "ldr q27, [x21, x27]\n"
+ "fmla v17.4s, v30.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 224]\n"
+ "fmla v24.4s, v25.4s, v12.4s\n"
+ "ldr q28, [x20, x27]\n"
+ "fmla v1.4s, v30.4s, v7.4s\n"
+ "ldr x20, [%[inptrs], 184]\n"
+ "fmla v2.4s, v30.4s, v5.4s\n"
+ "ldr x25, [%[outptrs], 72]\n"
+ "str q17, [x24, x28]\n"
+ "fmla v16.4s, v30.4s, v10.4s\n"
+ "fmla v15.4s, v30.4s, v8.4s\n"
+ "ldr q22, [x19, x27]\n"
+ "fmla v18.4s, v30.4s, v6.4s\n"
+ "ldr x24, [%[outptrs], 48]\n"
+ "fmla v21.4s, v30.4s, v11.4s\n"
+ "ldr x19, [%[inptrs], 96]\n"
+ "fmla v24.4s, v30.4s, v9.4s\n"
+ "fmla v20.4s, v30.4s, v12.4s\n"
+ "fmla v14.4s, v31.4s, v4.4s\n"
+ "ldr q30, [x22, x27]\n"
+ "fmla v2.4s, v31.4s, v7.4s\n"
+ "ldr q19, [x21, x27]\n"
+ "fmla v3.4s, v31.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 272]\n"
+ "fmla v15.4s, v31.4s, v10.4s\n"
+ "ldr x21, [%[inptrs], 232]\n"
+ "str q14, [x23, x28]\n"
+ "fmla v18.4s, v31.4s, v8.4s\n"
+ "fmla v24.4s, v31.4s, v11.4s\n"
+ "ldr q31, [x20, x27]\n"
+ "fmla v3.4s, v26.4s, v7.4s\n"
+ "ldr q17, [x22, x27]\n"
+ "fmla v0.4s, v23.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 280]\n"
+ "fmla v18.4s, v26.4s, v10.4s\n"
+ "ldr q14, [x21, x27]\n"
+ "fmla v16.4s, v23.4s, v5.4s\n"
+ "ldr x23, [%[outptrs], 24]\n"
+ "fmla v21.4s, v23.4s, v6.4s\n"
+ "ldr q26, [x22, x27]\n"
+ "str q0, [x26, x28]\n"
+ "fmla v1.4s, v27.4s, v4.4s\n"
+ "fmla v15.4s, v27.4s, v5.4s\n"
+ "ldr q13, [%[wbptr]]\n"
+ "fmla v16.4s, v27.4s, v7.4s\n"
+ "ldr x26, [%[outptrs], 104]\n"
+ "fmla v21.4s, v27.4s, v8.4s\n"
+ "add x27, x27, #16\n"
+ "str q1, [x25, x28]\n"
+ "fmla v24.4s, v27.4s, v6.4s\n"
+ "fmla v20.4s, v27.4s, v9.4s\n"
+ "ldr q12, [%[wbptr], #16]\n"
+ "fmla v2.4s, v28.4s, v4.4s\n"
+ "ldr q29, [x17, x27]\n"
+ "fmla v15.4s, v28.4s, v7.4s\n"
+ "ldr q27, [x18, x27]\n"
+ "fmla v18.4s, v28.4s, v5.4s\n"
+ "ldr x25, [%[outptrs], 80]\n"
+ "fmla v21.4s, v28.4s, v10.4s\n"
+ "ldr x17, [%[inptrs], 8]\n"
+ "str q2, [x24, x28]\n"
+ "fmla v24.4s, v28.4s, v8.4s\n"
+ "fmla v20.4s, v28.4s, v11.4s\n"
+ "ldr q9, [%[wbptr], #64]\n"
+ "fmla v3.4s, v22.4s, v4.4s\n"
+ "ldr q28, [x17, x27]\n"
+ "fmla v18.4s, v22.4s, v7.4s\n"
+ "ldr q25, [x19, x27]\n"
+ "fmla v24.4s, v22.4s, v10.4s\n"
+ "ldr x24, [%[outptrs], 56]\n"
+ "fmla v16.4s, v30.4s, v4.4s\n"
+ "ldr q11, [%[wbptr], #32]\n"
+ "str q3, [x23, x28]\n"
+ "fmla v21.4s, v30.4s, v5.4s\n"
+ "fmla v20.4s, v30.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 56]\n"
+ "fmla v15.4s, v19.4s, v4.4s\n"
+ "ldr x17, [%[inptrs], 16]\n"
+ "str q16, [x26, x28]\n"
+ "fmla v24.4s, v19.4s, v5.4s\n"
+ "fmla v21.4s, v19.4s, v7.4s\n"
+ "ldr q16, [x18, x27]\n"
+ "fmla v20.4s, v19.4s, v8.4s\n"
+ "ldr q6, [%[wbptr], #112]\n"
+ "str q15, [x25, x28]\n"
+ "fmla v18.4s, v31.4s, v4.4s\n"
+ "fmla v24.4s, v31.4s, v7.4s\n"
+ "ldr q15, [x17, x27]\n"
+ "fmla v21.4s, v17.4s, v4.4s\n"
+ "ldr x25, [%[outptrs], 88]\n"
+ "fmla v20.4s, v31.4s, v10.4s\n"
+ "ldr q8, [%[wbptr], #80]\n"
+ "str q18, [x24, x28]\n"
+ "mov v18.16b, v13.16b\n"
+ "fmla v24.4s, v14.4s, v4.4s\n"
+ "ldr x26, [%[outptrs], 112]\n"
+ "mov v22.16b, v13.16b\n"
+ "ldr x20, [%[inptrs], 144]\n"
+ "str q21, [x26, x28]\n"
+ "fmla v20.4s, v17.4s, v5.4s\n"
+ "mov v23.16b, v13.16b\n"
+ "ldr q10, [%[wbptr], #48]\n"
+ "str q24, [x25, x28]\n"
+ "mov v19.16b, v13.16b\n"
+ "mov v17.16b, v13.16b\n"
+ "ldr q21, [x20, x27]\n"
+ "fmla v20.4s, v14.4s, v7.4s\n"
+ "ldr q5, [%[wbptr], #128]\n"
+ "mov v14.16b, v13.16b\n"
+ "ldr x26, [%[outptrs], 120]\n"
+ "mov v0.16b, v13.16b\n"
+ "ldr x19, [%[inptrs], 104]\n"
+ "mov v1.16b, v13.16b\n"
+ "mov v2.16b, v13.16b\n"
+ "fmla v20.4s, v26.4s, v4.4s\n"
+ "ldr q7, [%[wbptr], #96]\n"
+ "fmla v18.4s, v29.4s, v12.4s\n"
+ "ldr q29, [x19, x27]\n"
+ "str q20, [x26, x28]\n"
+ "ldr q4, [%[wbptr], #144]\n"
+ "add x28, x28, #16\n"
+ "bne 2b\n"
+ "3:\n"
+ "mov v3.16b, v13.16b\n"
+ "ldr x18, [%[inptrs], 64]\n"
+ "fmla v18.4s, v27.4s, v9.4s\n"
+ "ldr x17, [%[inptrs], 24]\n"
+ "fmla v22.4s, v27.4s, v12.4s\n"
+ "ldr q30, [x18, x27]\n"
+ "fmla v23.4s, v28.4s, v12.4s\n"
+ "ldr x21, [%[inptrs], 192]\n"
+ "fmla v19.4s, v25.4s, v12.4s\n"
+ "ldr x20, [%[inptrs], 152]\n"
+ "fmla v18.4s, v28.4s, v11.4s\n"
+ "ldr q24, [x17, x27]\n"
+ "fmla v22.4s, v25.4s, v9.4s\n"
+ "ldr x19, [%[inptrs], 112]\n"
+ "fmla v23.4s, v16.4s, v9.4s\n"
+ "ldr x18, [%[inptrs], 72]\n"
+ "fmla v17.4s, v16.4s, v12.4s\n"
+ "ldr x17, [%[inptrs], 32]\n"
+ "fmla v18.4s, v25.4s, v6.4s\n"
+ "ldr q31, [x21, x27]\n"
+ "fmla v22.4s, v16.4s, v11.4s\n"
+ "ldr x22, [%[inptrs], 240]\n"
+ "fmla v23.4s, v15.4s, v11.4s\n"
+ "ldr x21, [%[inptrs], 200]\n"
+ "fmla v14.4s, v15.4s, v12.4s\n"
+ "ldr x23, [%[outptrs], 0]\n"
+ "fmla v18.4s, v16.4s, v8.4s\n"
+ "ldr q25, [x20, x27]\n"
+ "fmla v22.4s, v21.4s, v6.4s\n"
+ "ldr x20, [%[inptrs], 160]\n"
+ "fmla v19.4s, v21.4s, v9.4s\n"
+ "ldr x24, [%[outptrs], 32]\n"
+ "fmla v0.4s, v21.4s, v12.4s\n"
+ "ldr q21, [x19, x27]\n"
+ "fmla v18.4s, v15.4s, v10.4s\n"
+ "ldr q20, [x18, x27]\n"
+ "fmla v22.4s, v29.4s, v8.4s\n"
+ "ldr x19, [%[inptrs], 120]\n"
+ "fmla v23.4s, v29.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 80]\n"
+ "fmla v19.4s, v29.4s, v11.4s\n"
+ "ldr x25, [%[outptrs], 64]\n"
+ "fmla v18.4s, v29.4s, v5.4s\n"
+ "ldr x26, [%[outptrs], 96]\n"
+ "fmla v17.4s, v29.4s, v9.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v1.4s, v29.4s, v12.4s\n"
+ "ldr q26, [x17, x27]\n"
+ "fmla v22.4s, v30.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v18.4s, v30.4s, v7.4s\n"
+ "ldr x17, [%[inptrs], 40]\n"
+ "fmla v23.4s, v30.4s, v8.4s\n"
+ "fmla v17.4s, v30.4s, v11.4s\n"
+ "fmla v14.4s, v30.4s, v9.4s\n"
+ "fmla v2.4s, v30.4s, v12.4s\n"
+ "mov v16.16b, v13.16b\n"
+ "fmla v3.4s, v24.4s, v12.4s\n"
+ "fmla v19.4s, v31.4s, v6.4s\n"
+ "fmla v0.4s, v31.4s, v9.4s\n"
+ "mov v15.16b, v13.16b\n"
+ "fmla v23.4s, v24.4s, v10.4s\n"
+ "fmla v14.4s, v24.4s, v11.4s\n"
+ "ldr q27, [x22, x27]\n"
+ "fmla v22.4s, v25.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 248]\n"
+ "fmla v19.4s, v25.4s, v8.4s\n"
+ "fmla v17.4s, v25.4s, v6.4s\n"
+ "fmla v0.4s, v25.4s, v11.4s\n"
+ "fmla v1.4s, v25.4s, v9.4s\n"
+ "fmla v16.4s, v25.4s, v12.4s\n"
+ "ldr q30, [x21, x27]\n"
+ "fmla v18.4s, v21.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 208]\n"
+ "fmla v22.4s, v21.4s, v7.4s\n"
+ "fmla v23.4s, v21.4s, v5.4s\n"
+ "fmla v19.4s, v21.4s, v10.4s\n"
+ "fmla v17.4s, v21.4s, v8.4s\n"
+ "fmla v14.4s, v21.4s, v6.4s\n"
+ "fmla v1.4s, v21.4s, v11.4s\n"
+ "str q18, [x23, x28]\n"
+ "mov v18.16b, v13.16b\n"
+ "fmla v2.4s, v21.4s, v9.4s\n"
+ "ldr x23, [%[outptrs], 8]\n"
+ "fmla v15.4s, v21.4s, v12.4s\n"
+ "ldr q24, [x20, x27]\n"
+ "fmla v23.4s, v20.4s, v7.4s\n"
+ "ldr x20, [%[inptrs], 168]\n"
+ "fmla v17.4s, v20.4s, v10.4s\n"
+ "fmla v14.4s, v20.4s, v8.4s\n"
+ "fmla v2.4s, v20.4s, v11.4s\n"
+ "fmla v3.4s, v20.4s, v9.4s\n"
+ "fmla v18.4s, v20.4s, v12.4s\n"
+ "ldr q25, [x19, x27]\n"
+ "fmla v0.4s, v27.4s, v6.4s\n"
+ "ldr q29, [x18, x27]\n"
+ "fmla v14.4s, v26.4s, v10.4s\n"
+ "ldr x19, [%[inptrs], 128]\n"
+ "fmla v3.4s, v26.4s, v11.4s\n"
+ "ldr q27, [x17, x27]\n"
+ "fmla v19.4s, v30.4s, v5.4s\n"
+ "ldr x18, [%[inptrs], 88]\n"
+ "fmla v0.4s, v30.4s, v8.4s\n"
+ "fmla v1.4s, v30.4s, v6.4s\n"
+ "fmla v16.4s, v30.4s, v9.4s\n"
+ "ldr q28, [x22, x27]\n"
+ "fmla v22.4s, v24.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 256]\n"
+ "fmla v19.4s, v24.4s, v7.4s\n"
+ "fmla v17.4s, v24.4s, v5.4s\n"
+ "fmla v0.4s, v24.4s, v10.4s\n"
+ "fmla v1.4s, v24.4s, v8.4s\n"
+ "fmla v2.4s, v24.4s, v6.4s\n"
+ "fmla v16.4s, v24.4s, v11.4s\n"
+ "str q22, [x24, x28]\n"
+ "mov v21.16b, v13.16b\n"
+ "fmla v15.4s, v24.4s, v9.4s\n"
+ "ldr x24, [%[outptrs], 40]\n"
+ "fmla v23.4s, v25.4s, v4.4s\n"
+ "fmla v17.4s, v25.4s, v7.4s\n"
+ "fmla v21.4s, v24.4s, v12.4s\n"
+ "ldr q22, [x21, x27]\n"
+ "fmla v14.4s, v25.4s, v5.4s\n"
+ "ldr x21, [%[inptrs], 216]\n"
+ "fmla v1.4s, v25.4s, v10.4s\n"
+ "fmla v2.4s, v25.4s, v8.4s\n"
+ "str q23, [x23, x28]\n"
+ "mov v24.16b, v13.16b\n"
+ "mov v20.16b, v13.16b\n"
+ "ldr x23, [%[outptrs], 16]\n"
+ "fmla v3.4s, v25.4s, v6.4s\n"
+ "fmla v15.4s, v25.4s, v11.4s\n"
+ "fmla v18.4s, v25.4s, v9.4s\n"
+ "fmla v24.4s, v25.4s, v12.4s\n"
+ "fmla v14.4s, v29.4s, v7.4s\n"
+ "ldr q30, [x20, x27]\n"
+ "fmla v2.4s, v29.4s, v10.4s\n"
+ "ldr x20, [%[inptrs], 176]\n"
+ "fmla v3.4s, v29.4s, v8.4s\n"
+ "fmla v0.4s, v28.4s, v5.4s\n"
+ "fmla v18.4s, v29.4s, v11.4s\n"
+ "ldr q31, [x19, x27]\n"
+ "fmla v16.4s, v28.4s, v6.4s\n"
+ "ldr q26, [x18, x27]\n"
+ "fmla v19.4s, v22.4s, v4.4s\n"
+ "ldr x19, [%[inptrs], 136]\n"
+ "fmla v3.4s, v27.4s, v10.4s\n"
+ "ldr q23, [x22, x27]\n"
+ "fmla v0.4s, v22.4s, v7.4s\n"
+ "ldr x22, [%[inptrs], 264]\n"
+ "fmla v1.4s, v22.4s, v5.4s\n"
+ "fmla v16.4s, v22.4s, v8.4s\n"
+ "str q19, [x25, x28]\n"
+ "fmla v15.4s, v22.4s, v6.4s\n"
+ "fmla v21.4s, v22.4s, v9.4s\n"
+ "ldr q27, [x21, x27]\n"
+ "fmla v17.4s, v30.4s, v4.4s\n"
+ "ldr q28, [x20, x27]\n"
+ "fmla v1.4s, v30.4s, v7.4s\n"
+ "ldr x21, [%[inptrs], 224]\n"
+ "fmla v2.4s, v30.4s, v5.4s\n"
+ "ldr x20, [%[inptrs], 184]\n"
+ "fmla v16.4s, v30.4s, v10.4s\n"
+ "ldr x25, [%[outptrs], 72]\n"
+ "str q17, [x24, x28]\n"
+ "fmla v15.4s, v30.4s, v8.4s\n"
+ "fmla v18.4s, v30.4s, v6.4s\n"
+ "ldr q22, [x19, x27]\n"
+ "fmla v21.4s, v30.4s, v11.4s\n"
+ "ldr x24, [%[outptrs], 48]\n"
+ "fmla v24.4s, v30.4s, v9.4s\n"
+ "fmla v20.4s, v30.4s, v12.4s\n"
+ "fmla v14.4s, v31.4s, v4.4s\n"
+ "ldr q30, [x22, x27]\n"
+ "fmla v2.4s, v31.4s, v7.4s\n"
+ "ldr q19, [x21, x27]\n"
+ "fmla v3.4s, v31.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 272]\n"
+ "fmla v15.4s, v31.4s, v10.4s\n"
+ "ldr x21, [%[inptrs], 232]\n"
+ "str q14, [x23, x28]\n"
+ "fmla v18.4s, v31.4s, v8.4s\n"
+ "fmla v24.4s, v31.4s, v11.4s\n"
+ "ldr q31, [x20, x27]\n"
+ "fmla v3.4s, v26.4s, v7.4s\n"
+ "ldr q17, [x22, x27]\n"
+ "fmla v0.4s, v23.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 280]\n"
+ "fmla v18.4s, v26.4s, v10.4s\n"
+ "ldr q14, [x21, x27]\n"
+ "fmla v16.4s, v23.4s, v5.4s\n"
+ "ldr x23, [%[outptrs], 24]\n"
+ "fmla v21.4s, v23.4s, v6.4s\n"
+ "ldr q26, [x22, x27]\n"
+ "str q0, [x26, x28]\n"
+ "fmla v1.4s, v27.4s, v4.4s\n"
+ "fmla v15.4s, v27.4s, v5.4s\n"
+ "ldr x26, [%[outptrs], 104]\n"
+ "fmla v16.4s, v27.4s, v7.4s\n"
+ "add x27, x27, #16\n"
+ "fmla v21.4s, v27.4s, v8.4s\n"
+ "fmla v24.4s, v27.4s, v6.4s\n"
+ "str q1, [x25, x28]\n"
+ "fmla v20.4s, v27.4s, v9.4s\n"
+ "fmla v2.4s, v28.4s, v4.4s\n"
+ "ldr x25, [%[outptrs], 80]\n"
+ "fmla v15.4s, v28.4s, v7.4s\n"
+ "fmla v18.4s, v28.4s, v5.4s\n"
+ "fmla v21.4s, v28.4s, v10.4s\n"
+ "fmla v24.4s, v28.4s, v8.4s\n"
+ "fmla v20.4s, v28.4s, v11.4s\n"
+ "fmla v3.4s, v22.4s, v4.4s\n"
+ "str q2, [x24, x28]\n"
+ "fmla v16.4s, v30.4s, v4.4s\n"
+ "fmla v18.4s, v22.4s, v7.4s\n"
+ "ldr x24, [%[outptrs], 56]\n"
+ "fmla v24.4s, v22.4s, v10.4s\n"
+ "fmla v21.4s, v30.4s, v5.4s\n"
+ "str q3, [x23, x28]\n"
+ "fmla v20.4s, v30.4s, v6.4s\n"
+ "str q16, [x26, x28]\n"
+ "fmla v15.4s, v19.4s, v4.4s\n"
+ "fmla v18.4s, v31.4s, v4.4s\n"
+ "ldr x26, [%[outptrs], 112]\n"
+ "fmla v21.4s, v19.4s, v7.4s\n"
+ "fmla v24.4s, v19.4s, v5.4s\n"
+ "fmla v20.4s, v19.4s, v8.4s\n"
+ "str q15, [x25, x28]\n"
+ "str q18, [x24, x28]\n"
+ "ldr x25, [%[outptrs], 88]\n"
+ "fmla v24.4s, v31.4s, v7.4s\n"
+ "fmla v21.4s, v17.4s, v4.4s\n"
+ "fmla v20.4s, v31.4s, v10.4s\n"
+ "str q21, [x26, x28]\n"
+ "fmla v20.4s, v17.4s, v5.4s\n"
+ "ldr x26, [%[outptrs], 120]\n"
+ "fmla v24.4s, v14.4s, v4.4s\n"
+ "fmla v20.4s, v14.4s, v7.4s\n"
+ "str q24, [x25, x28]\n"
+ "fmla v20.4s, v26.4s, v4.4s\n"
+ "str q20, [x26, x28]\n"
+ "add x28, x28, #16\n"
+ "4:\n"
+ "cbz x15, 7f\n"
+ "ldr s13, [%[wbptr]]\n"
+ "mov v18.16b, v13.16b\n"
+ "ldr s12, [%[wbptr], #4]\n"
+ "mov v22.16b, v13.16b\n"
+ "ldr s11, [%[wbptr], #8]\n"
+ "mov v23.16b, v13.16b\n"
+ "ldr s10, [%[wbptr], #12]\n"
+ "mov v19.16b, v13.16b\n"
+ "ldr s9, [%[wbptr], #16]\n"
+ "mov v17.16b, v13.16b\n"
+ "ldr s8, [%[wbptr], #20]\n"
+ "mov v14.16b, v13.16b\n"
+ "ldr s7, [%[wbptr], #24]\n"
+ "mov v0.16b, v13.16b\n"
+ "ldr s6, [%[wbptr], #28]\n"
+ "mov v1.16b, v13.16b\n"
+ "ldr s5, [%[wbptr], #32]\n"
+ "mov v2.16b, v13.16b\n"
+ "ldr s4, [%[wbptr], #36]\n"
+ "ldr x17, [%[inptrs], 0]\n"
+ "ldr x18, [%[inptrs], 48]\n"
+ "ldr x19, [%[inptrs], 96]\n"
+ "ldr x20, [%[inptrs], 144]\n"
+ "subs x15, x15, #1\n"
+ "ldr s29, [x17, x27]\n"
+ "fmla v18.4s, v29.4s, v12.4s\n"
+ "ldr s27, [x18, x27]\n"
+ "ldr s25, [x19, x27]\n"
+ "ldr x17, [%[inptrs], 8]\n"
+ "ldr s21, [x20, x27]\n"
+ "ldr x18, [%[inptrs], 56]\n"
+ "ldr s28, [x17, x27]\n"
+ "ldr x19, [%[inptrs], 104]\n"
+ "ldr s16, [x18, x27]\n"
+ "ldr x17, [%[inptrs], 16]\n"
+ "ldr s29, [x19, x27]\n"
+ "ldr s15, [x17, x27]\n"
+ "beq 6f\n"
+ "5:\n"
+ "mov v3.16b, v13.16b\n"
+ "ldr x18, [%[inptrs], 64]\n"
+ "fmla v18.4s, v27.4s, v9.4s\n"
+ "ldr x17, [%[inptrs], 24]\n"
+ "fmla v22.4s, v27.4s, v12.4s\n"
+ "ldr s30, [x18, x27]\n"
+ "fmla v23.4s, v28.4s, v12.4s\n"
+ "ldr x21, [%[inptrs], 192]\n"
+ "fmla v19.4s, v25.4s, v12.4s\n"
+ "ldr x20, [%[inptrs], 152]\n"
+ "fmla v18.4s, v28.4s, v11.4s\n"
+ "ldr s24, [x17, x27]\n"
+ "fmla v22.4s, v25.4s, v9.4s\n"
+ "ldr x19, [%[inptrs], 112]\n"
+ "fmla v23.4s, v16.4s, v9.4s\n"
+ "ldr x18, [%[inptrs], 72]\n"
+ "fmla v17.4s, v16.4s, v12.4s\n"
+ "ldr x17, [%[inptrs], 32]\n"
+ "fmla v18.4s, v25.4s, v6.4s\n"
+ "ldr s31, [x21, x27]\n"
+ "fmla v22.4s, v16.4s, v11.4s\n"
+ "ldr x22, [%[inptrs], 240]\n"
+ "fmla v23.4s, v15.4s, v11.4s\n"
+ "ldr x21, [%[inptrs], 200]\n"
+ "fmla v14.4s, v15.4s, v12.4s\n"
+ "ldr x23, [%[outptrs], 0]\n"
+ "fmla v18.4s, v16.4s, v8.4s\n"
+ "ldr s25, [x20, x27]\n"
+ "fmla v22.4s, v21.4s, v6.4s\n"
+ "ldr x20, [%[inptrs], 160]\n"
+ "fmla v19.4s, v21.4s, v9.4s\n"
+ "ldr x24, [%[outptrs], 32]\n"
+ "fmla v0.4s, v21.4s, v12.4s\n"
+ "ldr s21, [x19, x27]\n"
+ "fmla v18.4s, v15.4s, v10.4s\n"
+ "ldr s20, [x18, x27]\n"
+ "fmla v22.4s, v29.4s, v8.4s\n"
+ "ldr x19, [%[inptrs], 120]\n"
+ "fmla v23.4s, v29.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 80]\n"
+ "fmla v19.4s, v29.4s, v11.4s\n"
+ "ldr x25, [%[outptrs], 64]\n"
+ "fmla v18.4s, v29.4s, v5.4s\n"
+ "ldr x26, [%[outptrs], 96]\n"
+ "fmla v17.4s, v29.4s, v9.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v1.4s, v29.4s, v12.4s\n"
+ "ldr s26, [x17, x27]\n"
+ "fmla v22.4s, v30.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v18.4s, v30.4s, v7.4s\n"
+ "ldr x17, [%[inptrs], 40]\n"
+ "fmla v23.4s, v30.4s, v8.4s\n"
+ "subs x15, x15, #1\n"
+ "fmla v17.4s, v30.4s, v11.4s\n"
+ "fmla v14.4s, v30.4s, v9.4s\n"
+ "fmla v2.4s, v30.4s, v12.4s\n"
+ "ldr s27, [x22, x27]\n"
+ "fmla v3.4s, v24.4s, v12.4s\n"
+ "ldr x22, [%[inptrs], 248]\n"
+ "fmla v23.4s, v24.4s, v10.4s\n"
+ "fmla v19.4s, v31.4s, v6.4s\n"
+ "fmla v14.4s, v24.4s, v11.4s\n"
+ "ldr s30, [x21, x27]\n"
+ "fmla v0.4s, v31.4s, v9.4s\n"
+ "ldr s24, [x20, x27]\n"
+ "fmla v22.4s, v25.4s, v5.4s\n"
+ "ldr x21, [%[inptrs], 208]\n"
+ "fmla v19.4s, v25.4s, v8.4s\n"
+ "ldr x20, [%[inptrs], 168]\n"
+ "fmla v17.4s, v25.4s, v6.4s\n"
+ "fmla v1.4s, v25.4s, v9.4s\n"
+ "fmla v0.4s, v25.4s, v11.4s\n"
+ "fmla v18.4s, v21.4s, v4.4s\n"
+ "fmla v22.4s, v21.4s, v7.4s\n"
+ "fmla v23.4s, v21.4s, v5.4s\n"
+ "fmla v19.4s, v21.4s, v10.4s\n"
+ "fmla v14.4s, v21.4s, v6.4s\n"
+ "fmla v17.4s, v21.4s, v8.4s\n"
+ "fmla v1.4s, v21.4s, v11.4s\n"
+ "str s18, [x23, x28]\n"
+ "mov v16.16b, v13.16b\n"
+ "fmla v2.4s, v21.4s, v9.4s\n"
+ "ldr x23, [%[outptrs], 8]\n"
+ "fmla v23.4s, v20.4s, v7.4s\n"
+ "fmla v14.4s, v20.4s, v8.4s\n"
+ "fmla v16.4s, v25.4s, v12.4s\n"
+ "ldr s25, [x19, x27]\n"
+ "fmla v17.4s, v20.4s, v10.4s\n"
+ "ldr x19, [%[inptrs], 128]\n"
+ "fmla v2.4s, v20.4s, v11.4s\n"
+ "fmla v3.4s, v20.4s, v9.4s\n"
+ "fmla v14.4s, v26.4s, v10.4s\n"
+ "fmla v0.4s, v27.4s, v6.4s\n"
+ "mov v15.16b, v13.16b\n"
+ "fmla v19.4s, v30.4s, v5.4s\n"
+ "fmla v1.4s, v30.4s, v6.4s\n"
+ "fmla v16.4s, v30.4s, v9.4s\n"
+ "fmla v3.4s, v26.4s, v11.4s\n"
+ "ldr s29, [x18, x27]\n"
+ "fmla v15.4s, v21.4s, v12.4s\n"
+ "ldr s27, [x17, x27]\n"
+ "fmla v0.4s, v30.4s, v8.4s\n"
+ "ldr s28, [x22, x27]\n"
+ "fmla v22.4s, v24.4s, v4.4s\n"
+ "ldr x18, [%[inptrs], 88]\n"
+ "fmla v19.4s, v24.4s, v7.4s\n"
+ "ldr x22, [%[inptrs], 256]\n"
+ "fmla v17.4s, v24.4s, v5.4s\n"
+ "ldr x17, [%[inptrs], 0]\n"
+ "fmla v0.4s, v24.4s, v10.4s\n"
+ "fmla v1.4s, v24.4s, v8.4s\n"
+ "str s22, [x24, x28]\n"
+ "mov v18.16b, v13.16b\n"
+ "fmla v2.4s, v24.4s, v6.4s\n"
+ "ldr x24, [%[outptrs], 40]\n"
+ "fmla v16.4s, v24.4s, v11.4s\n"
+ "fmla v15.4s, v24.4s, v9.4s\n"
+ "fmla v18.4s, v20.4s, v12.4s\n"
+ "ldr s22, [x21, x27]\n"
+ "fmla v23.4s, v25.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 216]\n"
+ "fmla v17.4s, v25.4s, v7.4s\n"
+ "fmla v14.4s, v25.4s, v5.4s\n"
+ "fmla v1.4s, v25.4s, v10.4s\n"
+ "fmla v2.4s, v25.4s, v8.4s\n"
+ "fmla v3.4s, v25.4s, v6.4s\n"
+ "fmla v15.4s, v25.4s, v11.4s\n"
+ "str s23, [x23, x28]\n"
+ "mov v21.16b, v13.16b\n"
+ "fmla v18.4s, v25.4s, v9.4s\n"
+ "ldr x23, [%[outptrs], 16]\n"
+ "fmla v14.4s, v29.4s, v7.4s\n"
+ "fmla v2.4s, v29.4s, v10.4s\n"
+ "fmla v21.4s, v24.4s, v12.4s\n"
+ "ldr s30, [x20, x27]\n"
+ "fmla v3.4s, v29.4s, v8.4s\n"
+ "ldr x20, [%[inptrs], 176]\n"
+ "fmla v18.4s, v29.4s, v11.4s\n"
+ "ldr s31, [x19, x27]\n"
+ "fmla v0.4s, v28.4s, v5.4s\n"
+ "ldr x19, [%[inptrs], 136]\n"
+ "fmla v16.4s, v28.4s, v6.4s\n"
+ "ldr s26, [x18, x27]\n"
+ "fmla v3.4s, v27.4s, v10.4s\n"
+ "ldr s23, [x22, x27]\n"
+ "fmla v19.4s, v22.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 264]\n"
+ "fmla v0.4s, v22.4s, v7.4s\n"
+ "ldr x18, [%[inptrs], 48]\n"
+ "fmla v1.4s, v22.4s, v5.4s\n"
+ "fmla v16.4s, v22.4s, v8.4s\n"
+ "fmla v15.4s, v22.4s, v6.4s\n"
+ "fmla v21.4s, v22.4s, v9.4s\n"
+ "str s19, [x25, x28]\n"
+ "mov v24.16b, v13.16b\n"
+ "mov v20.16b, v13.16b\n"
+ "ldr s27, [x21, x27]\n"
+ "fmla v17.4s, v30.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 224]\n"
+ "fmla v24.4s, v25.4s, v12.4s\n"
+ "ldr s28, [x20, x27]\n"
+ "fmla v1.4s, v30.4s, v7.4s\n"
+ "ldr x20, [%[inptrs], 184]\n"
+ "fmla v2.4s, v30.4s, v5.4s\n"
+ "ldr x25, [%[outptrs], 72]\n"
+ "str s17, [x24, x28]\n"
+ "fmla v16.4s, v30.4s, v10.4s\n"
+ "fmla v15.4s, v30.4s, v8.4s\n"
+ "ldr s22, [x19, x27]\n"
+ "fmla v18.4s, v30.4s, v6.4s\n"
+ "ldr x24, [%[outptrs], 48]\n"
+ "fmla v21.4s, v30.4s, v11.4s\n"
+ "ldr x19, [%[inptrs], 96]\n"
+ "fmla v24.4s, v30.4s, v9.4s\n"
+ "fmla v20.4s, v30.4s, v12.4s\n"
+ "fmla v14.4s, v31.4s, v4.4s\n"
+ "ldr s30, [x22, x27]\n"
+ "fmla v2.4s, v31.4s, v7.4s\n"
+ "ldr s19, [x21, x27]\n"
+ "fmla v3.4s, v31.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 272]\n"
+ "fmla v15.4s, v31.4s, v10.4s\n"
+ "ldr x21, [%[inptrs], 232]\n"
+ "str s14, [x23, x28]\n"
+ "fmla v18.4s, v31.4s, v8.4s\n"
+ "fmla v24.4s, v31.4s, v11.4s\n"
+ "ldr s31, [x20, x27]\n"
+ "fmla v3.4s, v26.4s, v7.4s\n"
+ "ldr s17, [x22, x27]\n"
+ "fmla v0.4s, v23.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 280]\n"
+ "fmla v18.4s, v26.4s, v10.4s\n"
+ "ldr s14, [x21, x27]\n"
+ "fmla v16.4s, v23.4s, v5.4s\n"
+ "ldr x23, [%[outptrs], 24]\n"
+ "fmla v21.4s, v23.4s, v6.4s\n"
+ "ldr s26, [x22, x27]\n"
+ "str s0, [x26, x28]\n"
+ "fmla v1.4s, v27.4s, v4.4s\n"
+ "fmla v15.4s, v27.4s, v5.4s\n"
+ "ldr s13, [%[wbptr]]\n"
+ "fmla v16.4s, v27.4s, v7.4s\n"
+ "ldr x26, [%[outptrs], 104]\n"
+ "fmla v21.4s, v27.4s, v8.4s\n"
+ "add x27, x27, #4\n"
+ "str s1, [x25, x28]\n"
+ "fmla v24.4s, v27.4s, v6.4s\n"
+ "fmla v20.4s, v27.4s, v9.4s\n"
+ "ldr s12, [%[wbptr], #4]\n"
+ "fmla v2.4s, v28.4s, v4.4s\n"
+ "ldr s29, [x17, x27]\n"
+ "fmla v15.4s, v28.4s, v7.4s\n"
+ "ldr s27, [x18, x27]\n"
+ "fmla v18.4s, v28.4s, v5.4s\n"
+ "ldr x25, [%[outptrs], 80]\n"
+ "fmla v21.4s, v28.4s, v10.4s\n"
+ "ldr x17, [%[inptrs], 8]\n"
+ "str s2, [x24, x28]\n"
+ "fmla v24.4s, v28.4s, v8.4s\n"
+ "fmla v20.4s, v28.4s, v11.4s\n"
+ "ldr s9, [%[wbptr], #16]\n"
+ "fmla v3.4s, v22.4s, v4.4s\n"
+ "ldr s28, [x17, x27]\n"
+ "fmla v18.4s, v22.4s, v7.4s\n"
+ "ldr s25, [x19, x27]\n"
+ "fmla v24.4s, v22.4s, v10.4s\n"
+ "ldr x24, [%[outptrs], 56]\n"
+ "fmla v16.4s, v30.4s, v4.4s\n"
+ "ldr s11, [%[wbptr], #8]\n"
+ "str s3, [x23, x28]\n"
+ "fmla v21.4s, v30.4s, v5.4s\n"
+ "fmla v20.4s, v30.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 56]\n"
+ "fmla v15.4s, v19.4s, v4.4s\n"
+ "ldr x17, [%[inptrs], 16]\n"
+ "str s16, [x26, x28]\n"
+ "fmla v24.4s, v19.4s, v5.4s\n"
+ "fmla v21.4s, v19.4s, v7.4s\n"
+ "ldr s16, [x18, x27]\n"
+ "fmla v20.4s, v19.4s, v8.4s\n"
+ "ldr s6, [%[wbptr], #28]\n"
+ "str s15, [x25, x28]\n"
+ "fmla v18.4s, v31.4s, v4.4s\n"
+ "fmla v24.4s, v31.4s, v7.4s\n"
+ "ldr s15, [x17, x27]\n"
+ "fmla v21.4s, v17.4s, v4.4s\n"
+ "ldr x25, [%[outptrs], 88]\n"
+ "fmla v20.4s, v31.4s, v10.4s\n"
+ "ldr s8, [%[wbptr], #20]\n"
+ "str s18, [x24, x28]\n"
+ "mov v18.16b, v13.16b\n"
+ "fmla v24.4s, v14.4s, v4.4s\n"
+ "ldr x26, [%[outptrs], 112]\n"
+ "mov v22.16b, v13.16b\n"
+ "ldr x20, [%[inptrs], 144]\n"
+ "str s21, [x26, x28]\n"
+ "fmla v20.4s, v17.4s, v5.4s\n"
+ "mov v23.16b, v13.16b\n"
+ "ldr s10, [%[wbptr], #12]\n"
+ "str s24, [x25, x28]\n"
+ "mov v19.16b, v13.16b\n"
+ "mov v17.16b, v13.16b\n"
+ "ldr s21, [x20, x27]\n"
+ "fmla v20.4s, v14.4s, v7.4s\n"
+ "ldr s5, [%[wbptr], #32]\n"
+ "mov v14.16b, v13.16b\n"
+ "ldr x26, [%[outptrs], 120]\n"
+ "mov v0.16b, v13.16b\n"
+ "ldr x19, [%[inptrs], 104]\n"
+ "mov v1.16b, v13.16b\n"
+ "mov v2.16b, v13.16b\n"
+ "fmla v20.4s, v26.4s, v4.4s\n"
+ "ldr s7, [%[wbptr], #24]\n"
+ "fmla v18.4s, v29.4s, v12.4s\n"
+ "ldr s29, [x19, x27]\n"
+ "str s20, [x26, x28]\n"
+ "ldr s4, [%[wbptr], #36]\n"
+ "add x28, x28, #4\n"
+ "bne 5b\n"
+ "6:\n"
+ "mov v3.16b, v13.16b\n"
+ "ldr x18, [%[inptrs], 64]\n"
+ "fmla v18.4s, v27.4s, v9.4s\n"
+ "ldr x17, [%[inptrs], 24]\n"
+ "fmla v22.4s, v27.4s, v12.4s\n"
+ "ldr s30, [x18, x27]\n"
+ "fmla v23.4s, v28.4s, v12.4s\n"
+ "ldr x21, [%[inptrs], 192]\n"
+ "fmla v19.4s, v25.4s, v12.4s\n"
+ "ldr x20, [%[inptrs], 152]\n"
+ "fmla v18.4s, v28.4s, v11.4s\n"
+ "ldr s24, [x17, x27]\n"
+ "fmla v22.4s, v25.4s, v9.4s\n"
+ "ldr x19, [%[inptrs], 112]\n"
+ "fmla v23.4s, v16.4s, v9.4s\n"
+ "ldr x18, [%[inptrs], 72]\n"
+ "fmla v17.4s, v16.4s, v12.4s\n"
+ "ldr x17, [%[inptrs], 32]\n"
+ "fmla v18.4s, v25.4s, v6.4s\n"
+ "ldr s31, [x21, x27]\n"
+ "fmla v22.4s, v16.4s, v11.4s\n"
+ "ldr x22, [%[inptrs], 240]\n"
+ "fmla v23.4s, v15.4s, v11.4s\n"
+ "ldr x21, [%[inptrs], 200]\n"
+ "fmla v14.4s, v15.4s, v12.4s\n"
+ "ldr x23, [%[outptrs], 0]\n"
+ "fmla v18.4s, v16.4s, v8.4s\n"
+ "ldr s25, [x20, x27]\n"
+ "fmla v22.4s, v21.4s, v6.4s\n"
+ "ldr x20, [%[inptrs], 160]\n"
+ "fmla v19.4s, v21.4s, v9.4s\n"
+ "ldr x24, [%[outptrs], 32]\n"
+ "fmla v0.4s, v21.4s, v12.4s\n"
+ "ldr s21, [x19, x27]\n"
+ "fmla v18.4s, v15.4s, v10.4s\n"
+ "ldr s20, [x18, x27]\n"
+ "fmla v22.4s, v29.4s, v8.4s\n"
+ "ldr x19, [%[inptrs], 120]\n"
+ "fmla v23.4s, v29.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 80]\n"
+ "fmla v19.4s, v29.4s, v11.4s\n"
+ "ldr x25, [%[outptrs], 64]\n"
+ "fmla v18.4s, v29.4s, v5.4s\n"
+ "ldr x26, [%[outptrs], 96]\n"
+ "fmla v17.4s, v29.4s, v9.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v1.4s, v29.4s, v12.4s\n"
+ "ldr s26, [x17, x27]\n"
+ "fmla v22.4s, v30.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v18.4s, v30.4s, v7.4s\n"
+ "ldr x17, [%[inptrs], 40]\n"
+ "fmla v23.4s, v30.4s, v8.4s\n"
+ "fmla v17.4s, v30.4s, v11.4s\n"
+ "fmla v14.4s, v30.4s, v9.4s\n"
+ "fmla v2.4s, v30.4s, v12.4s\n"
+ "mov v16.16b, v13.16b\n"
+ "fmla v3.4s, v24.4s, v12.4s\n"
+ "fmla v19.4s, v31.4s, v6.4s\n"
+ "fmla v0.4s, v31.4s, v9.4s\n"
+ "mov v15.16b, v13.16b\n"
+ "fmla v23.4s, v24.4s, v10.4s\n"
+ "fmla v14.4s, v24.4s, v11.4s\n"
+ "ldr s27, [x22, x27]\n"
+ "fmla v22.4s, v25.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 248]\n"
+ "fmla v19.4s, v25.4s, v8.4s\n"
+ "fmla v17.4s, v25.4s, v6.4s\n"
+ "fmla v0.4s, v25.4s, v11.4s\n"
+ "fmla v1.4s, v25.4s, v9.4s\n"
+ "fmla v16.4s, v25.4s, v12.4s\n"
+ "ldr s30, [x21, x27]\n"
+ "fmla v18.4s, v21.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 208]\n"
+ "fmla v22.4s, v21.4s, v7.4s\n"
+ "fmla v23.4s, v21.4s, v5.4s\n"
+ "fmla v19.4s, v21.4s, v10.4s\n"
+ "fmla v17.4s, v21.4s, v8.4s\n"
+ "fmla v14.4s, v21.4s, v6.4s\n"
+ "fmla v1.4s, v21.4s, v11.4s\n"
+ "str s18, [x23, x28]\n"
+ "mov v18.16b, v13.16b\n"
+ "fmla v2.4s, v21.4s, v9.4s\n"
+ "ldr x23, [%[outptrs], 8]\n"
+ "fmla v15.4s, v21.4s, v12.4s\n"
+ "ldr s24, [x20, x27]\n"
+ "fmla v23.4s, v20.4s, v7.4s\n"
+ "ldr x20, [%[inptrs], 168]\n"
+ "fmla v17.4s, v20.4s, v10.4s\n"
+ "fmla v14.4s, v20.4s, v8.4s\n"
+ "fmla v2.4s, v20.4s, v11.4s\n"
+ "fmla v3.4s, v20.4s, v9.4s\n"
+ "fmla v18.4s, v20.4s, v12.4s\n"
+ "ldr s25, [x19, x27]\n"
+ "fmla v0.4s, v27.4s, v6.4s\n"
+ "ldr s29, [x18, x27]\n"
+ "fmla v14.4s, v26.4s, v10.4s\n"
+ "ldr x19, [%[inptrs], 128]\n"
+ "fmla v3.4s, v26.4s, v11.4s\n"
+ "ldr s27, [x17, x27]\n"
+ "fmla v19.4s, v30.4s, v5.4s\n"
+ "ldr x18, [%[inptrs], 88]\n"
+ "fmla v0.4s, v30.4s, v8.4s\n"
+ "fmla v1.4s, v30.4s, v6.4s\n"
+ "fmla v16.4s, v30.4s, v9.4s\n"
+ "ldr s28, [x22, x27]\n"
+ "fmla v22.4s, v24.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 256]\n"
+ "fmla v19.4s, v24.4s, v7.4s\n"
+ "fmla v17.4s, v24.4s, v5.4s\n"
+ "fmla v0.4s, v24.4s, v10.4s\n"
+ "fmla v1.4s, v24.4s, v8.4s\n"
+ "fmla v2.4s, v24.4s, v6.4s\n"
+ "fmla v16.4s, v24.4s, v11.4s\n"
+ "str s22, [x24, x28]\n"
+ "mov v21.16b, v13.16b\n"
+ "fmla v15.4s, v24.4s, v9.4s\n"
+ "ldr x24, [%[outptrs], 40]\n"
+ "fmla v23.4s, v25.4s, v4.4s\n"
+ "fmla v17.4s, v25.4s, v7.4s\n"
+ "fmla v21.4s, v24.4s, v12.4s\n"
+ "ldr s22, [x21, x27]\n"
+ "fmla v14.4s, v25.4s, v5.4s\n"
+ "ldr x21, [%[inptrs], 216]\n"
+ "fmla v1.4s, v25.4s, v10.4s\n"
+ "fmla v2.4s, v25.4s, v8.4s\n"
+ "str s23, [x23, x28]\n"
+ "mov v24.16b, v13.16b\n"
+ "mov v20.16b, v13.16b\n"
+ "ldr x23, [%[outptrs], 16]\n"
+ "fmla v3.4s, v25.4s, v6.4s\n"
+ "fmla v15.4s, v25.4s, v11.4s\n"
+ "fmla v18.4s, v25.4s, v9.4s\n"
+ "fmla v24.4s, v25.4s, v12.4s\n"
+ "fmla v14.4s, v29.4s, v7.4s\n"
+ "ldr s30, [x20, x27]\n"
+ "fmla v2.4s, v29.4s, v10.4s\n"
+ "ldr x20, [%[inptrs], 176]\n"
+ "fmla v3.4s, v29.4s, v8.4s\n"
+ "fmla v0.4s, v28.4s, v5.4s\n"
+ "fmla v18.4s, v29.4s, v11.4s\n"
+ "ldr s31, [x19, x27]\n"
+ "fmla v16.4s, v28.4s, v6.4s\n"
+ "ldr s26, [x18, x27]\n"
+ "fmla v19.4s, v22.4s, v4.4s\n"
+ "ldr x19, [%[inptrs], 136]\n"
+ "fmla v3.4s, v27.4s, v10.4s\n"
+ "ldr s23, [x22, x27]\n"
+ "fmla v0.4s, v22.4s, v7.4s\n"
+ "ldr x22, [%[inptrs], 264]\n"
+ "fmla v1.4s, v22.4s, v5.4s\n"
+ "fmla v16.4s, v22.4s, v8.4s\n"
+ "str s19, [x25, x28]\n"
+ "fmla v15.4s, v22.4s, v6.4s\n"
+ "fmla v21.4s, v22.4s, v9.4s\n"
+ "ldr s27, [x21, x27]\n"
+ "fmla v17.4s, v30.4s, v4.4s\n"
+ "ldr s28, [x20, x27]\n"
+ "fmla v1.4s, v30.4s, v7.4s\n"
+ "ldr x21, [%[inptrs], 224]\n"
+ "fmla v2.4s, v30.4s, v5.4s\n"
+ "ldr x20, [%[inptrs], 184]\n"
+ "fmla v16.4s, v30.4s, v10.4s\n"
+ "ldr x25, [%[outptrs], 72]\n"
+ "str s17, [x24, x28]\n"
+ "fmla v15.4s, v30.4s, v8.4s\n"
+ "fmla v18.4s, v30.4s, v6.4s\n"
+ "ldr s22, [x19, x27]\n"
+ "fmla v21.4s, v30.4s, v11.4s\n"
+ "ldr x24, [%[outptrs], 48]\n"
+ "fmla v24.4s, v30.4s, v9.4s\n"
+ "fmla v20.4s, v30.4s, v12.4s\n"
+ "fmla v14.4s, v31.4s, v4.4s\n"
+ "ldr s30, [x22, x27]\n"
+ "fmla v2.4s, v31.4s, v7.4s\n"
+ "ldr s19, [x21, x27]\n"
+ "fmla v3.4s, v31.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 272]\n"
+ "fmla v15.4s, v31.4s, v10.4s\n"
+ "ldr x21, [%[inptrs], 232]\n"
+ "str s14, [x23, x28]\n"
+ "fmla v18.4s, v31.4s, v8.4s\n"
+ "fmla v24.4s, v31.4s, v11.4s\n"
+ "ldr s31, [x20, x27]\n"
+ "fmla v3.4s, v26.4s, v7.4s\n"
+ "ldr s17, [x22, x27]\n"
+ "fmla v0.4s, v23.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 280]\n"
+ "fmla v18.4s, v26.4s, v10.4s\n"
+ "ldr s14, [x21, x27]\n"
+ "fmla v16.4s, v23.4s, v5.4s\n"
+ "ldr x23, [%[outptrs], 24]\n"
+ "fmla v21.4s, v23.4s, v6.4s\n"
+ "ldr s26, [x22, x27]\n"
+ "str s0, [x26, x28]\n"
+ "fmla v1.4s, v27.4s, v4.4s\n"
+ "fmla v15.4s, v27.4s, v5.4s\n"
+ "ldr x26, [%[outptrs], 104]\n"
+ "fmla v16.4s, v27.4s, v7.4s\n"
+ "add x27, x27, #4\n"
+ "fmla v21.4s, v27.4s, v8.4s\n"
+ "fmla v24.4s, v27.4s, v6.4s\n"
+ "str s1, [x25, x28]\n"
+ "fmla v20.4s, v27.4s, v9.4s\n"
+ "fmla v2.4s, v28.4s, v4.4s\n"
+ "ldr x25, [%[outptrs], 80]\n"
+ "fmla v15.4s, v28.4s, v7.4s\n"
+ "fmla v18.4s, v28.4s, v5.4s\n"
+ "fmla v21.4s, v28.4s, v10.4s\n"
+ "fmla v24.4s, v28.4s, v8.4s\n"
+ "fmla v20.4s, v28.4s, v11.4s\n"
+ "fmla v3.4s, v22.4s, v4.4s\n"
+ "str s2, [x24, x28]\n"
+ "fmla v16.4s, v30.4s, v4.4s\n"
+ "fmla v18.4s, v22.4s, v7.4s\n"
+ "ldr x24, [%[outptrs], 56]\n"
+ "fmla v24.4s, v22.4s, v10.4s\n"
+ "fmla v21.4s, v30.4s, v5.4s\n"
+ "str s3, [x23, x28]\n"
+ "fmla v20.4s, v30.4s, v6.4s\n"
+ "str s16, [x26, x28]\n"
+ "fmla v15.4s, v19.4s, v4.4s\n"
+ "fmla v18.4s, v31.4s, v4.4s\n"
+ "ldr x26, [%[outptrs], 112]\n"
+ "fmla v21.4s, v19.4s, v7.4s\n"
+ "fmla v24.4s, v19.4s, v5.4s\n"
+ "fmla v20.4s, v19.4s, v8.4s\n"
+ "str s15, [x25, x28]\n"
+ "str s18, [x24, x28]\n"
+ "ldr x25, [%[outptrs], 88]\n"
+ "fmla v24.4s, v31.4s, v7.4s\n"
+ "fmla v21.4s, v17.4s, v4.4s\n"
+ "fmla v20.4s, v31.4s, v10.4s\n"
+ "str s21, [x26, x28]\n"
+ "fmla v20.4s, v17.4s, v5.4s\n"
+ "ldr x26, [%[outptrs], 120]\n"
+ "fmla v24.4s, v14.4s, v4.4s\n"
+ "fmla v20.4s, v14.4s, v7.4s\n"
+ "str s24, [x25, x28]\n"
+ "fmla v20.4s, v26.4s, v4.4s\n"
+ "str s20, [x26, x28]\n"
+ "add x28, x28, #4\n"
+ "7:\n"
+ : [wbptr] "+r" (weight_bias_ptr)
+ : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs)
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
+
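Note on the tile shape: inptrs is a 6x6 grid of per-pixel channel pointers and outptrs a 4x4 grid, so with a 3x3 kernel at stride 1 each output (i, j) reads inputs (i..i+2, j..j+2); every accumulator starts from the broadcast bias vector (the "mov vN.16b, v13.16b" copies above). A scalar reference sketch for one channel follows; reference_tile, weights and bias are hypothetical stand-ins for the packed weight_bias_ptr contents, not code from this patch.

// Illustrative only: what one channel of the 4x4 / 3x3 / stride-1 tile
// kernel above computes, ignoring the vectorisation and scheduling.
static void reference_tile(
    const float *inptrs[6][6], float *outptrs[4][4],
    const float weights[3][3], float bias, int channel)
{
    for (int oi = 0; oi < 4; oi++)
    {
        for (int oj = 0; oj < 4; oj++)
        {
            float acc = bias; // assembly: mov vN.16b, v13.16b
            for (int ki = 0; ki < 3; ki++)
            {
                for (int kj = 0; kj < 3; kj++)
                {
                    acc += inptrs[oi + ki][oj + kj][channel] * weights[ki][kj];
                }
            }
            // The ReLU specialisation below additionally clamps acc at zero.
            outptrs[oi][oj][channel] = acc;
        }
    }
}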
+template <>
+template <>
void Conv::execute_tile<ActivationFunction::ReLU>(
int n_channels,
const void *weight_bias_ptr,
@@ -2374,6 +3523,1223 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
template <>
template <>
+void Conv::execute_tile<ActivationFunction::ReLU>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *inptrs[6][6],
+ float *outptrs[4][4]
+)
+{
+ __asm __volatile(
+ "mov x27, xzr\n"
+ "mov x28, xzr\n"
+ "and x19, %[n_channels], #3\n"
+ "lsr x26, %[n_channels], #2\n"
+ "cbz x26, 4f\n"
+ "1:\n"
+ "ldr q25, [%[wbptr]]\n"
+ "ldr x25, [%[inptrs], 0]\n"
+ "mov v2.16b, v25.16b\n"
+ "ldr q22, [%[wbptr], #16]\n"
+ "mov v16.16b, v25.16b\n"
+ "ldr q9, [%[wbptr], #32]\n"
+ "mov v18.16b, v25.16b\n"
+ "ldr q8, [%[wbptr], #48]\n"
+ "mov v13.16b, v25.16b\n"
+ "ldr q19, [%[wbptr], #64]\n"
+ "mov v0.16b, v25.16b\n"
+ "ldr q7, [%[wbptr], #80]\n"
+ "mov v17.16b, v25.16b\n"
+ "ldr q6, [%[wbptr], #96]\n"
+ "mov v14.16b, v25.16b\n"
+ "ldr q5, [%[wbptr], #112]\n"
+ "mov v12.16b, v25.16b\n"
+ "ldr q4, [%[wbptr], #128]\n"
+ "mov v15.16b, v25.16b\n"
+ "ldr q3, [%[wbptr], #144]\n"
+ "ldr q27, [x25, x27]\n"
+ "ldr x17, [%[inptrs], 48]\n"
+ "fmla v2.4s, v27.4s, v22.4s\n"
+ "ldr x25, [%[inptrs], 8]\n"
+ "ldr q26, [x17, x27]\n"
+ "ldr x24, [%[inptrs], 96]\n"
+ "fmla v16.4s, v26.4s, v22.4s\n"
+ "ldr q31, [x25, x27]\n"
+ "ldr q28, [x24, x27]\n"
+ "ldr x17, [%[inptrs], 56]\n"
+ "fmla v2.4s, v26.4s, v19.4s\n"
+ "ldr x25, [%[inptrs], 16]\n"
+ "ldr q29, [x17, x27]\n"
+ "ldr x18, [%[inptrs], 144]\n"
+ "ldr x24, [%[inptrs], 104]\n"
+ "subs x26, x26, #1\n"
+ "ldr q30, [x25, x27]\n"
+ "ldr q27, [x18, x27]\n"
+ "ldr q21, [x24, x27]\n"
+ "fmla v2.4s, v31.4s, v9.4s\n"
+ "beq 3f\n"
+ "2:\n"
+ "mov v1.16b, v25.16b\n"
+ "ldr x17, [%[inptrs], 64]\n"
+ "mov v10.16b, v25.16b\n"
+ "ldr x25, [%[inptrs], 24]\n"
+ "fmla v18.4s, v31.4s, v22.4s\n"
+ "ldr q23, [x17, x27]\n"
+ "fmla v2.4s, v28.4s, v5.4s\n"
+ "ldr x15, [%[inptrs], 192]\n"
+ "fmla v16.4s, v28.4s, v19.4s\n"
+ "ldr x18, [%[inptrs], 152]\n"
+ "fmla v13.4s, v28.4s, v22.4s\n"
+ "ldr q26, [x25, x27]\n"
+ "fmla v18.4s, v29.4s, v19.4s\n"
+ "ldr x24, [%[inptrs], 112]\n"
+ "fmla v2.4s, v29.4s, v7.4s\n"
+ "ldr x17, [%[inptrs], 72]\n"
+ "fmla v16.4s, v29.4s, v9.4s\n"
+ "ldr x25, [%[inptrs], 32]\n"
+ "fmla v0.4s, v29.4s, v22.4s\n"
+ "ldr q28, [x15, x27]\n"
+ "fmla v18.4s, v30.4s, v9.4s\n"
+ "ldr x16, [%[inptrs], 240]\n"
+ "fmla v2.4s, v30.4s, v8.4s\n"
+ "ldr x15, [%[inptrs], 200]\n"
+ "fmla v17.4s, v30.4s, v22.4s\n"
+ "ldr q29, [x18, x27]\n"
+ "fmla v16.4s, v27.4s, v5.4s\n"
+ "ldr x18, [%[inptrs], 160]\n"
+ "fmla v13.4s, v27.4s, v19.4s\n"
+ "ldr x20, [%[outptrs], 0]\n"
+ "fmla v14.4s, v27.4s, v22.4s\n"
+ "ldr q20, [x24, x27]\n"
+ "fmla v2.4s, v21.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 120]\n"
+ "fmla v16.4s, v21.4s, v7.4s\n"
+ "ldr x21, [%[outptrs], 32]\n"
+ "fmla v18.4s, v21.4s, v5.4s\n"
+ "ldr x22, [%[outptrs], 64]\n"
+ "fmla v13.4s, v21.4s, v9.4s\n"
+ "ldr x23, [%[outptrs], 96]\n"
+ "fmla v0.4s, v21.4s, v19.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v12.4s, v21.4s, v22.4s\n"
+ "ldr q24, [x17, x27]\n"
+ "fmla v2.4s, v23.4s, v6.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v16.4s, v23.4s, v8.4s\n"
+ "ldr x17, [%[inptrs], 80]\n"
+ "fmla v18.4s, v23.4s, v7.4s\n"
+ "subs x26, x26, #1\n"
+ "fmla v0.4s, v23.4s, v9.4s\n"
+ "fmla v17.4s, v23.4s, v19.4s\n"
+ "fmla v15.4s, v23.4s, v22.4s\n"
+ "ldr q23, [x25, x27]\n"
+ "fmla v1.4s, v26.4s, v22.4s\n"
+ "ldr x25, [%[inptrs], 40]\n"
+ "fmla v18.4s, v26.4s, v8.4s\n"
+ "fmla v13.4s, v28.4s, v5.4s\n"
+ "fmla v17.4s, v26.4s, v9.4s\n"
+ "ldr q30, [x16, x27]\n"
+ "fmla v14.4s, v28.4s, v19.4s\n"
+ "ldr q26, [x15, x27]\n"
+ "fmla v16.4s, v29.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 248]\n"
+ "fmla v13.4s, v29.4s, v7.4s\n"
+ "ldr x15, [%[inptrs], 208]\n"
+ "fmla v0.4s, v29.4s, v5.4s\n"
+ "fmla v12.4s, v29.4s, v19.4s\n"
+ "fmla v14.4s, v29.4s, v9.4s\n"
+ "fmla v10.4s, v29.4s, v22.4s\n"
+ "mov v11.16b, v25.16b\n"
+ "fmla v2.4s, v20.4s, v3.4s\n"
+ "fmla v16.4s, v20.4s, v6.4s\n"
+ "fmla v18.4s, v20.4s, v4.4s\n"
+ "fmla v13.4s, v20.4s, v8.4s\n"
+ "fmla v0.4s, v20.4s, v7.4s\n"
+ "fmla v17.4s, v20.4s, v5.4s\n"
+ "fmla v12.4s, v20.4s, v9.4s\n"
+ "fmla v15.4s, v20.4s, v19.4s\n"
+ "fmla v11.4s, v20.4s, v22.4s\n"
+ "mov v21.16b, v25.16b\n"
+ "fmla v18.4s, v24.4s, v6.4s\n"
+ "fmla v0.4s, v24.4s, v8.4s\n"
+ "fmla v1.4s, v24.4s, v19.4s\n"
+ "fmla v17.4s, v24.4s, v7.4s\n"
+ "fmla v14.4s, v30.4s, v5.4s\n"
+ "mov v20.16b, v25.16b\n"
+ "fmla v15.4s, v24.4s, v9.4s\n"
+ "fmla v21.4s, v24.4s, v22.4s\n"
+ "ldr q27, [x18, x27]\n"
+ "fmla v1.4s, v23.4s, v9.4s\n"
+ "ldr x18, [%[inptrs], 168]\n"
+ "fmla v17.4s, v23.4s, v8.4s\n"
+ "ldr q30, [x24, x27]\n"
+ "fmla v13.4s, v26.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 128]\n"
+ "fmla v14.4s, v26.4s, v7.4s\n"
+ "fmla v12.4s, v26.4s, v5.4s\n"
+ "fmla v10.4s, v26.4s, v19.4s\n"
+ "ldr q31, [x17, x27]\n"
+ "fmla v16.4s, v27.4s, v3.4s\n"
+ "ldr x17, [%[inptrs], 88]\n"
+ "fmla v13.4s, v27.4s, v6.4s\n"
+ "fmla v0.4s, v27.4s, v4.4s\n"
+ "fmla v14.4s, v27.4s, v8.4s\n"
+ "fmla v12.4s, v27.4s, v7.4s\n"
+ "fmla v15.4s, v27.4s, v5.4s\n"
+ "fmla v10.4s, v27.4s, v9.4s\n"
+ "fmla v11.4s, v27.4s, v19.4s\n"
+ "fmla v20.4s, v27.4s, v22.4s\n"
+ "mov v24.16b, v25.16b\n"
+ "mov v23.16b, v25.16b\n"
+ "fmla v18.4s, v30.4s, v3.4s\n"
+ "fmla v0.4s, v30.4s, v6.4s\n"
+ "fmla v17.4s, v30.4s, v4.4s\n"
+ "fmla v12.4s, v30.4s, v8.4s\n"
+ "fmla v15.4s, v30.4s, v7.4s\n"
+ "fmla v1.4s, v30.4s, v5.4s\n"
+ "fmla v11.4s, v30.4s, v9.4s\n"
+ "fmla v21.4s, v30.4s, v19.4s\n"
+ "fmla v24.4s, v30.4s, v22.4s\n"
+ "ldr q25, [x25, x27]\n"
+ "fmla v17.4s, v31.4s, v6.4s\n"
+ "ldr x25, [%[inptrs], 0]\n"
+ "fmla v15.4s, v31.4s, v8.4s\n"
+ "fmla v1.4s, v31.4s, v7.4s\n"
+ "fmla v21.4s, v31.4s, v9.4s\n"
+ "ldr q26, [x16, x27]\n"
+ "fmla v14.4s, v26.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 256]\n"
+ "fmla v10.4s, v26.4s, v5.4s\n"
+ "ldr q31, [x15, x27]\n"
+ "fmla v1.4s, v25.4s, v8.4s\n"
+ "ldr q29, [x18, x27]\n"
+ "fmla v13.4s, v31.4s, v3.4s\n"
+ "ldr x15, [%[inptrs], 216]\n"
+ "fmla v14.4s, v31.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 176]\n"
+ "fmla v12.4s, v31.4s, v4.4s\n"
+ "fmla v10.4s, v31.4s, v7.4s\n"
+ "fmla v11.4s, v31.4s, v5.4s\n"
+ "fmla v20.4s, v31.4s, v19.4s\n"
+ "fmla v0.4s, v29.4s, v3.4s\n"
+ "ldr q28, [x24, x27]\n"
+ "fmla v15.4s, v29.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 136]\n"
+ "fmla v12.4s, v29.4s, v6.4s\n"
+ "fmla v10.4s, v29.4s, v8.4s\n"
+ "fmla v11.4s, v29.4s, v7.4s\n"
+ "fmla v21.4s, v29.4s, v5.4s\n"
+ "fmla v20.4s, v29.4s, v9.4s\n"
+ "fmla v24.4s, v29.4s, v19.4s\n"
+ "fmla v23.4s, v29.4s, v22.4s\n"
+ "ldr q25, [x17, x27]\n"
+ "fmla v17.4s, v28.4s, v3.4s\n"
+ "ldr q29, [x16, x27]\n"
+ "fmla v15.4s, v28.4s, v6.4s\n"
+ "ldr x16, [%[inptrs], 264]\n"
+ "fmla v1.4s, v28.4s, v4.4s\n"
+ "ldr x17, [%[inptrs], 48]\n"
+ "fmla v11.4s, v28.4s, v8.4s\n"
+ "fmla v21.4s, v28.4s, v7.4s\n"
+ "fmla v24.4s, v28.4s, v9.4s\n"
+ "ldr q22, [x15, x27]\n"
+ "fmla v14.4s, v29.4s, v3.4s\n"
+ "ldr x15, [%[inptrs], 224]\n"
+ "fmla v1.4s, v25.4s, v6.4s\n"
+ "fmla v10.4s, v29.4s, v4.4s\n"
+ "fmla v21.4s, v25.4s, v8.4s\n"
+ "ldr q27, [x18, x27]\n"
+ "fmla v20.4s, v29.4s, v5.4s\n"
+ "ldr q26, [x24, x27]\n"
+ "fmla v12.4s, v22.4s, v3.4s\n"
+ "ldr x18, [%[inptrs], 184]\n"
+ "fmla v10.4s, v22.4s, v6.4s\n"
+ "ldr x24, [%[inptrs], 96]\n"
+ "fmla v11.4s, v22.4s, v4.4s\n"
+ "fmla v24.4s, v22.4s, v5.4s\n"
+ "fmla v20.4s, v22.4s, v7.4s\n"
+ "fmla v23.4s, v22.4s, v19.4s\n"
+ "fmla v15.4s, v27.4s, v3.4s\n"
+ "ldr q25, [x16, x27]\n"
+ "fmla v21.4s, v27.4s, v4.4s\n"
+ "ldr q31, [x15, x27]\n"
+ "fmla v11.4s, v27.4s, v6.4s\n"
+ "ldr x16, [%[inptrs], 272]\n"
+ "fmla v20.4s, v27.4s, v8.4s\n"
+ "ldr x15, [%[inptrs], 232]\n"
+ "fmla v24.4s, v27.4s, v7.4s\n"
+ "fmla v23.4s, v27.4s, v9.4s\n"
+ "fmla v1.4s, v26.4s, v3.4s\n"
+ "ldr q22, [x18, x27]\n"
+ "fmla v21.4s, v26.4s, v6.4s\n"
+ "ldr q19, [x16, x27]\n"
+ "fmla v10.4s, v25.4s, v3.4s\n"
+ "ldr x16, [%[inptrs], 280]\n"
+ "fmla v24.4s, v26.4s, v8.4s\n"
+ "ldr q28, [x15, x27]\n"
+ "fmla v20.4s, v25.4s, v4.4s\n"
+ "ldr x18, [%[inptrs], 144]\n"
+ "fmla v23.4s, v25.4s, v5.4s\n"
+ "ldr q30, [x16, x27]\n"
+ "fmla v11.4s, v31.4s, v3.4s\n"
+ "add x27, x27, #16\n"
+ "fmla v24.4s, v31.4s, v4.4s\n"
+ "ldr q27, [x25, x27]\n"
+ "fmla v20.4s, v31.4s, v6.4s\n"
+ "ldr x25, [%[inptrs], 8]\n"
+ "fmla v23.4s, v31.4s, v7.4s\n"
+ "movi v29.16b, #0\n"
+ "fmla v21.4s, v22.4s, v3.4s\n"
+ "ldr q26, [x17, x27]\n"
+ "fmla v24.4s, v22.4s, v6.4s\n"
+ "ldr x17, [%[inptrs], 56]\n"
+ "fmla v20.4s, v19.4s, v3.4s\n"
+ "fmax v2.4s, v2.4s, v29.4s\n"
+ "fmla v23.4s, v22.4s, v8.4s\n"
+ "ldr q25, [%[wbptr]]\n"
+ "fmax v18.4s, v18.4s, v29.4s\n"
+ "ldr q22, [%[wbptr], #16]\n"
+ "str q2, [x20, x28]\n"
+ "fmla v24.4s, v28.4s, v3.4s\n"
+ "fmax v17.4s, v17.4s, v29.4s\n"
+ "ldr q9, [%[wbptr], #32]\n"
+ "fmla v23.4s, v19.4s, v4.4s\n"
+ "ldr q8, [%[wbptr], #48]\n"
+ "fmax v1.4s, v1.4s, v29.4s\n"
+ "ldr q19, [%[wbptr], #64]\n"
+ "fmax v16.4s, v16.4s, v29.4s\n"
+ "ldr x20, [%[outptrs], 8]\n"
+ "fmax v0.4s, v0.4s, v29.4s\n"
+ "fmax v15.4s, v15.4s, v29.4s\n"
+ "str q18, [x20, x28]\n"
+ "fmla v23.4s, v28.4s, v6.4s\n"
+ "str q16, [x21, x28]\n"
+ "fmax v21.4s, v21.4s, v29.4s\n"
+ "fmax v13.4s, v13.4s, v29.4s\n"
+ "ldr q7, [%[wbptr], #80]\n"
+ "fmax v12.4s, v12.4s, v29.4s\n"
+ "ldr q5, [%[wbptr], #112]\n"
+ "fmla v23.4s, v30.4s, v3.4s\n"
+ "ldr q6, [%[wbptr], #96]\n"
+ "str q13, [x22, x28]\n"
+ "fmax v11.4s, v11.4s, v29.4s\n"
+ "fmax v24.4s, v24.4s, v29.4s\n"
+ "ldr q4, [%[wbptr], #128]\n"
+ "fmax v14.4s, v14.4s, v29.4s\n"
+ "ldr q31, [x25, x27]\n"
+ "fmax v10.4s, v10.4s, v29.4s\n"
+ "ldr q3, [%[wbptr], #144]\n"
+ "fmax v20.4s, v20.4s, v29.4s\n"
+ "ldr q28, [x24, x27]\n"
+ "str q14, [x23, x28]\n"
+ "fmax v23.4s, v23.4s, v29.4s\n"
+ "mov v2.16b, v25.16b\n"
+ "ldr q29, [x17, x27]\n"
+ "ldr x20, [%[outptrs], 16]\n"
+ "ldr x21, [%[outptrs], 40]\n"
+ "ldr x22, [%[outptrs], 72]\n"
+ "ldr x23, [%[outptrs], 104]\n"
+ "ldr x25, [%[inptrs], 16]\n"
+ "ldr x24, [%[inptrs], 104]\n"
+ "str q17, [x20, x28]\n"
+ "mov v16.16b, v25.16b\n"
+ "str q0, [x21, x28]\n"
+ "mov v18.16b, v25.16b\n"
+ "str q12, [x22, x28]\n"
+ "mov v13.16b, v25.16b\n"
+ "str q10, [x23, x28]\n"
+ "mov v0.16b, v25.16b\n"
+ "fmla v2.4s, v27.4s, v22.4s\n"
+ "ldr q30, [x25, x27]\n"
+ "fmla v16.4s, v26.4s, v22.4s\n"
+ "ldr x20, [%[outptrs], 24]\n"
+ "mov v17.16b, v25.16b\n"
+ "ldr x21, [%[outptrs], 48]\n"
+ "str q1, [x20, x28]\n"
+ "mov v14.16b, v25.16b\n"
+ "str q15, [x21, x28]\n"
+ "mov v12.16b, v25.16b\n"
+ "mov v15.16b, v25.16b\n"
+ "ldr x21, [%[outptrs], 56]\n"
+ "fmla v2.4s, v26.4s, v19.4s\n"
+ "ldr q27, [x18, x27]\n"
+ "str q21, [x21, x28]\n"
+ "ldr x22, [%[outptrs], 80]\n"
+ "ldr q21, [x24, x27]\n"
+ "ldr x23, [%[outptrs], 112]\n"
+ "str q11, [x22, x28]\n"
+ "fmla v2.4s, v31.4s, v9.4s\n"
+ "str q20, [x23, x28]\n"
+ "ldr x22, [%[outptrs], 88]\n"
+ "ldr x23, [%[outptrs], 120]\n"
+ "str q24, [x22, x28]\n"
+ "str q23, [x23, x28]\n"
+ "add x28, x28, #16\n"
+ "bne 2b\n"
+ "3:\n"
+ "mov v1.16b, v25.16b\n"
+ "ldr x17, [%[inptrs], 64]\n"
+ "mov v10.16b, v25.16b\n"
+ "ldr x25, [%[inptrs], 24]\n"
+ "mov v11.16b, v25.16b\n"
+ "ldr x15, [%[inptrs], 192]\n"
+ "fmla v18.4s, v31.4s, v22.4s\n"
+ "ldr q23, [x17, x27]\n"
+ "fmla v2.4s, v28.4s, v5.4s\n"
+ "ldr x18, [%[inptrs], 152]\n"
+ "fmla v16.4s, v28.4s, v19.4s\n"
+ "ldr x24, [%[inptrs], 112]\n"
+ "fmla v13.4s, v28.4s, v22.4s\n"
+ "ldr q26, [x25, x27]\n"
+ "fmla v18.4s, v29.4s, v19.4s\n"
+ "ldr x17, [%[inptrs], 72]\n"
+ "fmla v2.4s, v29.4s, v7.4s\n"
+ "ldr x25, [%[inptrs], 32]\n"
+ "fmla v16.4s, v29.4s, v9.4s\n"
+ "ldr x16, [%[inptrs], 240]\n"
+ "fmla v0.4s, v29.4s, v22.4s\n"
+ "ldr q28, [x15, x27]\n"
+ "fmla v18.4s, v30.4s, v9.4s\n"
+ "ldr x15, [%[inptrs], 200]\n"
+ "fmla v2.4s, v30.4s, v8.4s\n"
+ "ldr x20, [%[outptrs], 0]\n"
+ "fmla v17.4s, v30.4s, v22.4s\n"
+ "ldr q29, [x18, x27]\n"
+ "fmla v16.4s, v27.4s, v5.4s\n"
+ "ldr x18, [%[inptrs], 160]\n"
+ "fmla v13.4s, v27.4s, v19.4s\n"
+ "ldr x21, [%[outptrs], 32]\n"
+ "fmla v14.4s, v27.4s, v22.4s\n"
+ "ldr q20, [x24, x27]\n"
+ "fmla v2.4s, v21.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 120]\n"
+ "fmla v16.4s, v21.4s, v7.4s\n"
+ "ldr x22, [%[outptrs], 64]\n"
+ "fmla v18.4s, v21.4s, v5.4s\n"
+ "ldr x23, [%[outptrs], 96]\n"
+ "fmla v13.4s, v21.4s, v9.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v0.4s, v21.4s, v19.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v12.4s, v21.4s, v22.4s\n"
+ "ldr q24, [x17, x27]\n"
+ "fmla v2.4s, v23.4s, v6.4s\n"
+ "ldr x17, [%[inptrs], 80]\n"
+ "fmla v16.4s, v23.4s, v8.4s\n"
+ "fmla v18.4s, v23.4s, v7.4s\n"
+ "fmla v0.4s, v23.4s, v9.4s\n"
+ "fmla v17.4s, v23.4s, v19.4s\n"
+ "fmla v15.4s, v23.4s, v22.4s\n"
+ "ldr q23, [x25, x27]\n"
+ "fmla v1.4s, v26.4s, v22.4s\n"
+ "ldr x25, [%[inptrs], 40]\n"
+ "fmla v18.4s, v26.4s, v8.4s\n"
+ "fmla v13.4s, v28.4s, v5.4s\n"
+ "fmla v17.4s, v26.4s, v9.4s\n"
+ "ldr q30, [x16, x27]\n"
+ "fmla v14.4s, v28.4s, v19.4s\n"
+ "ldr q26, [x15, x27]\n"
+ "fmla v16.4s, v29.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 248]\n"
+ "fmla v13.4s, v29.4s, v7.4s\n"
+ "ldr x15, [%[inptrs], 208]\n"
+ "fmla v0.4s, v29.4s, v5.4s\n"
+ "fmla v12.4s, v29.4s, v19.4s\n"
+ "fmla v14.4s, v29.4s, v9.4s\n"
+ "fmla v10.4s, v29.4s, v22.4s\n"
+ "mov v21.16b, v25.16b\n"
+ "fmla v2.4s, v20.4s, v3.4s\n"
+ "fmla v16.4s, v20.4s, v6.4s\n"
+ "fmla v18.4s, v20.4s, v4.4s\n"
+ "fmla v13.4s, v20.4s, v8.4s\n"
+ "fmla v0.4s, v20.4s, v7.4s\n"
+ "fmla v17.4s, v20.4s, v5.4s\n"
+ "fmla v12.4s, v20.4s, v9.4s\n"
+ "fmla v15.4s, v20.4s, v19.4s\n"
+ "fmla v11.4s, v20.4s, v22.4s\n"
+ "mov v20.16b, v25.16b\n"
+ "fmla v18.4s, v24.4s, v6.4s\n"
+ "fmla v0.4s, v24.4s, v8.4s\n"
+ "fmla v1.4s, v24.4s, v19.4s\n"
+ "fmla v17.4s, v24.4s, v7.4s\n"
+ "fmla v21.4s, v24.4s, v22.4s\n"
+ "fmla v15.4s, v24.4s, v9.4s\n"
+ "ldr q27, [x18, x27]\n"
+ "fmla v14.4s, v30.4s, v5.4s\n"
+ "ldr q30, [x24, x27]\n"
+ "fmla v1.4s, v23.4s, v9.4s\n"
+ "ldr x18, [%[inptrs], 168]\n"
+ "fmla v17.4s, v23.4s, v8.4s\n"
+ "ldr q31, [x17, x27]\n"
+ "fmla v13.4s, v26.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 128]\n"
+ "fmla v14.4s, v26.4s, v7.4s\n"
+ "ldr x17, [%[inptrs], 88]\n"
+ "fmla v12.4s, v26.4s, v5.4s\n"
+ "fmla v10.4s, v26.4s, v19.4s\n"
+ "mov v24.16b, v25.16b\n"
+ "mov v23.16b, v25.16b\n"
+ "fmla v16.4s, v27.4s, v3.4s\n"
+ "fmla v13.4s, v27.4s, v6.4s\n"
+ "fmla v0.4s, v27.4s, v4.4s\n"
+ "fmla v14.4s, v27.4s, v8.4s\n"
+ "fmla v12.4s, v27.4s, v7.4s\n"
+ "fmla v15.4s, v27.4s, v5.4s\n"
+ "fmla v10.4s, v27.4s, v9.4s\n"
+ "fmla v11.4s, v27.4s, v19.4s\n"
+ "fmla v20.4s, v27.4s, v22.4s\n"
+ "ldr q25, [x25, x27]\n"
+ "fmla v18.4s, v30.4s, v3.4s\n"
+ "fmla v0.4s, v30.4s, v6.4s\n"
+ "fmla v17.4s, v30.4s, v4.4s\n"
+ "fmla v12.4s, v30.4s, v8.4s\n"
+ "fmla v15.4s, v30.4s, v7.4s\n"
+ "fmla v1.4s, v30.4s, v5.4s\n"
+ "fmla v11.4s, v30.4s, v9.4s\n"
+ "fmla v21.4s, v30.4s, v19.4s\n"
+ "fmla v24.4s, v30.4s, v22.4s\n"
+ "ldr q26, [x16, x27]\n"
+ "fmla v17.4s, v31.4s, v6.4s\n"
+ "ldr x16, [%[inptrs], 256]\n"
+ "fmla v15.4s, v31.4s, v8.4s\n"
+ "fmla v1.4s, v31.4s, v7.4s\n"
+ "fmla v21.4s, v31.4s, v9.4s\n"
+ "ldr q31, [x15, x27]\n"
+ "fmla v14.4s, v26.4s, v4.4s\n"
+ "ldr x15, [%[inptrs], 216]\n"
+ "fmla v10.4s, v26.4s, v5.4s\n"
+ "ldr q29, [x18, x27]\n"
+ "fmla v1.4s, v25.4s, v8.4s\n"
+ "ldr q28, [x24, x27]\n"
+ "fmla v13.4s, v31.4s, v3.4s\n"
+ "ldr x18, [%[inptrs], 176]\n"
+ "fmla v14.4s, v31.4s, v6.4s\n"
+ "ldr x24, [%[inptrs], 136]\n"
+ "fmla v12.4s, v31.4s, v4.4s\n"
+ "fmla v10.4s, v31.4s, v7.4s\n"
+ "fmla v11.4s, v31.4s, v5.4s\n"
+ "fmla v20.4s, v31.4s, v19.4s\n"
+ "fmla v0.4s, v29.4s, v3.4s\n"
+ "ldr q25, [x17, x27]\n"
+ "fmla v15.4s, v29.4s, v4.4s\n"
+ "fmla v21.4s, v29.4s, v5.4s\n"
+ "fmla v12.4s, v29.4s, v6.4s\n"
+ "fmla v10.4s, v29.4s, v8.4s\n"
+ "fmla v11.4s, v29.4s, v7.4s\n"
+ "fmla v20.4s, v29.4s, v9.4s\n"
+ "fmla v24.4s, v29.4s, v19.4s\n"
+ "fmla v23.4s, v29.4s, v22.4s\n"
+ "fmla v17.4s, v28.4s, v3.4s\n"
+ "ldr q29, [x16, x27]\n"
+ "fmla v15.4s, v28.4s, v6.4s\n"
+ "ldr q22, [x15, x27]\n"
+ "fmla v1.4s, v28.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 264]\n"
+ "fmla v11.4s, v28.4s, v8.4s\n"
+ "ldr x15, [%[inptrs], 224]\n"
+ "fmla v21.4s, v28.4s, v7.4s\n"
+ "fmla v24.4s, v28.4s, v9.4s\n"
+ "fmla v14.4s, v29.4s, v3.4s\n"
+ "ldr q27, [x18, x27]\n"
+ "fmla v1.4s, v25.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 184]\n"
+ "fmla v10.4s, v29.4s, v4.4s\n"
+ "fmla v20.4s, v29.4s, v5.4s\n"
+ "fmla v21.4s, v25.4s, v8.4s\n"
+ "ldr q26, [x24, x27]\n"
+ "fmla v12.4s, v22.4s, v3.4s\n"
+ "ldr q25, [x16, x27]\n"
+ "fmla v11.4s, v22.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 272]\n"
+ "fmla v10.4s, v22.4s, v6.4s\n"
+ "fmla v20.4s, v22.4s, v7.4s\n"
+ "fmla v24.4s, v22.4s, v5.4s\n"
+ "fmla v23.4s, v22.4s, v19.4s\n"
+ "fmla v15.4s, v27.4s, v3.4s\n"
+ "ldr q31, [x15, x27]\n"
+ "fmla v11.4s, v27.4s, v6.4s\n"
+ "ldr q22, [x18, x27]\n"
+ "fmla v21.4s, v27.4s, v4.4s\n"
+ "ldr x15, [%[inptrs], 232]\n"
+ "fmla v20.4s, v27.4s, v8.4s\n"
+ "fmla v24.4s, v27.4s, v7.4s\n"
+ "fmla v23.4s, v27.4s, v9.4s\n"
+ "ldr q19, [x16, x27]\n"
+ "fmla v1.4s, v26.4s, v3.4s\n"
+ "ldr q28, [x15, x27]\n"
+ "fmla v21.4s, v26.4s, v6.4s\n"
+ "ldr x16, [%[inptrs], 280]\n"
+ "fmla v24.4s, v26.4s, v8.4s\n"
+ "fmla v10.4s, v25.4s, v3.4s\n"
+ "fmla v20.4s, v25.4s, v4.4s\n"
+ "ldr q30, [x16, x27]\n"
+ "fmla v23.4s, v25.4s, v5.4s\n"
+ "add x27, x27, #16\n"
+ "fmla v11.4s, v31.4s, v3.4s\n"
+ "fmla v21.4s, v22.4s, v3.4s\n"
+ "fmla v24.4s, v31.4s, v4.4s\n"
+ "movi v29.16b, #0\n"
+ "fmla v20.4s, v31.4s, v6.4s\n"
+ "fmla v23.4s, v31.4s, v7.4s\n"
+ "fmax v2.4s, v2.4s, v29.4s\n"
+ "fmax v18.4s, v18.4s, v29.4s\n"
+ "fmla v24.4s, v22.4s, v6.4s\n"
+ "fmax v17.4s, v17.4s, v29.4s\n"
+ "fmla v20.4s, v19.4s, v3.4s\n"
+ "fmax v1.4s, v1.4s, v29.4s\n"
+ "str q2, [x20, x28]\n"
+ "fmla v23.4s, v22.4s, v8.4s\n"
+ "fmax v16.4s, v16.4s, v29.4s\n"
+ "ldr x20, [%[outptrs], 8]\n"
+ "fmla v24.4s, v28.4s, v3.4s\n"
+ "fmax v0.4s, v0.4s, v29.4s\n"
+ "str q18, [x20, x28]\n"
+ "fmax v15.4s, v15.4s, v29.4s\n"
+ "str q16, [x21, x28]\n"
+ "fmla v23.4s, v19.4s, v4.4s\n"
+ "fmax v21.4s, v21.4s, v29.4s\n"
+ "ldr x20, [%[outptrs], 16]\n"
+ "fmax v13.4s, v13.4s, v29.4s\n"
+ "ldr x21, [%[outptrs], 40]\n"
+ "str q17, [x20, x28]\n"
+ "fmax v12.4s, v12.4s, v29.4s\n"
+ "str q0, [x21, x28]\n"
+ "fmla v23.4s, v28.4s, v6.4s\n"
+ "str q13, [x22, x28]\n"
+ "fmax v11.4s, v11.4s, v29.4s\n"
+ "fmax v24.4s, v24.4s, v29.4s\n"
+ "ldr x20, [%[outptrs], 24]\n"
+ "fmax v14.4s, v14.4s, v29.4s\n"
+ "ldr x21, [%[outptrs], 48]\n"
+ "str q1, [x20, x28]\n"
+ "fmla v23.4s, v30.4s, v3.4s\n"
+ "str q15, [x21, x28]\n"
+ "fmax v10.4s, v10.4s, v29.4s\n"
+ "str q14, [x23, x28]\n"
+ "fmax v20.4s, v20.4s, v29.4s\n"
+ "ldr x21, [%[outptrs], 56]\n"
+ "ldr x22, [%[outptrs], 72]\n"
+ "ldr x23, [%[outptrs], 104]\n"
+ "fmax v23.4s, v23.4s, v29.4s\n"
+ "str q21, [x21, x28]\n"
+ "str q12, [x22, x28]\n"
+ "str q10, [x23, x28]\n"
+ "ldr x22, [%[outptrs], 80]\n"
+ "ldr x23, [%[outptrs], 112]\n"
+ "str q11, [x22, x28]\n"
+ "str q20, [x23, x28]\n"
+ "ldr x22, [%[outptrs], 88]\n"
+ "ldr x23, [%[outptrs], 120]\n"
+ "str q24, [x22, x28]\n"
+ "str q23, [x23, x28]\n"
+ "add x28, x28, #16\n"
+ "4:\n"
+ "cbz x19, 7f\n"
+ "ldr s25, [%[wbptr]]\n"
+ "mov v2.16b, v25.16b\n"
+ "ldr s22, [%[wbptr], #4]\n"
+ "mov v16.16b, v25.16b\n"
+ "ldr s9, [%[wbptr], #8]\n"
+ "mov v18.16b, v25.16b\n"
+ "ldr s8, [%[wbptr], #12]\n"
+ "mov v13.16b, v25.16b\n"
+ "ldr s19, [%[wbptr], #16]\n"
+ "mov v0.16b, v25.16b\n"
+ "ldr s7, [%[wbptr], #20]\n"
+ "mov v17.16b, v25.16b\n"
+ "ldr s6, [%[wbptr], #24]\n"
+ "mov v14.16b, v25.16b\n"
+ "ldr s5, [%[wbptr], #28]\n"
+ "mov v12.16b, v25.16b\n"
+ "ldr s4, [%[wbptr], #32]\n"
+ "mov v15.16b, v25.16b\n"
+ "ldr s3, [%[wbptr], #36]\n"
+ "ldr x25, [%[inptrs], 0]\n"
+ "ldr x17, [%[inptrs], 48]\n"
+ "ldr x24, [%[inptrs], 96]\n"
+ "ldr x18, [%[inptrs], 144]\n"
+ "subs x19, x19, #1\n"
+ "ldr s27, [x25, x27]\n"
+ "fmla v2.4s, v27.4s, v22.4s\n"
+ "ldr s26, [x17, x27]\n"
+ "fmla v16.4s, v26.4s, v22.4s\n"
+ "ldr s28, [x24, x27]\n"
+ "ldr s27, [x18, x27]\n"
+ "ldr x25, [%[inptrs], 8]\n"
+ "ldr x17, [%[inptrs], 56]\n"
+ "ldr x24, [%[inptrs], 104]\n"
+ "ldr s31, [x25, x27]\n"
+ "fmla v2.4s, v26.4s, v19.4s\n"
+ "ldr s29, [x17, x27]\n"
+ "ldr s21, [x24, x27]\n"
+ "ldr x25, [%[inptrs], 16]\n"
+ "ldr s30, [x25, x27]\n"
+ "fmla v2.4s, v31.4s, v9.4s\n"
+ "beq 6f\n"
+ "5:\n"
+ "mov v1.16b, v25.16b\n"
+ "ldr x17, [%[inptrs], 64]\n"
+ "mov v10.16b, v25.16b\n"
+ "ldr x25, [%[inptrs], 24]\n"
+ "fmla v18.4s, v31.4s, v22.4s\n"
+ "ldr s23, [x17, x27]\n"
+ "fmla v2.4s, v28.4s, v5.4s\n"
+ "ldr x15, [%[inptrs], 192]\n"
+ "fmla v16.4s, v28.4s, v19.4s\n"
+ "ldr x18, [%[inptrs], 152]\n"
+ "fmla v13.4s, v28.4s, v22.4s\n"
+ "ldr s26, [x25, x27]\n"
+ "fmla v18.4s, v29.4s, v19.4s\n"
+ "ldr x24, [%[inptrs], 112]\n"
+ "fmla v2.4s, v29.4s, v7.4s\n"
+ "ldr x17, [%[inptrs], 72]\n"
+ "fmla v16.4s, v29.4s, v9.4s\n"
+ "ldr x25, [%[inptrs], 32]\n"
+ "fmla v0.4s, v29.4s, v22.4s\n"
+ "ldr s28, [x15, x27]\n"
+ "fmla v18.4s, v30.4s, v9.4s\n"
+ "ldr x16, [%[inptrs], 240]\n"
+ "fmla v2.4s, v30.4s, v8.4s\n"
+ "ldr x15, [%[inptrs], 200]\n"
+ "fmla v17.4s, v30.4s, v22.4s\n"
+ "ldr s29, [x18, x27]\n"
+ "fmla v16.4s, v27.4s, v5.4s\n"
+ "ldr x18, [%[inptrs], 160]\n"
+ "fmla v13.4s, v27.4s, v19.4s\n"
+ "ldr x20, [%[outptrs], 0]\n"
+ "fmla v14.4s, v27.4s, v22.4s\n"
+ "ldr s20, [x24, x27]\n"
+ "fmla v2.4s, v21.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 120]\n"
+ "fmla v16.4s, v21.4s, v7.4s\n"
+ "ldr x21, [%[outptrs], 32]\n"
+ "fmla v18.4s, v21.4s, v5.4s\n"
+ "ldr x22, [%[outptrs], 64]\n"
+ "fmla v13.4s, v21.4s, v9.4s\n"
+ "ldr x23, [%[outptrs], 96]\n"
+ "fmla v0.4s, v21.4s, v19.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v12.4s, v21.4s, v22.4s\n"
+ "ldr s24, [x17, x27]\n"
+ "fmla v2.4s, v23.4s, v6.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v16.4s, v23.4s, v8.4s\n"
+ "ldr x17, [%[inptrs], 80]\n"
+ "fmla v18.4s, v23.4s, v7.4s\n"
+ "subs x19, x19, #1\n"
+ "fmla v0.4s, v23.4s, v9.4s\n"
+ "fmla v17.4s, v23.4s, v19.4s\n"
+ "fmla v15.4s, v23.4s, v22.4s\n"
+ "ldr s23, [x25, x27]\n"
+ "fmla v1.4s, v26.4s, v22.4s\n"
+ "ldr x25, [%[inptrs], 40]\n"
+ "fmla v18.4s, v26.4s, v8.4s\n"
+ "fmla v13.4s, v28.4s, v5.4s\n"
+ "fmla v17.4s, v26.4s, v9.4s\n"
+ "ldr s30, [x16, x27]\n"
+ "fmla v14.4s, v28.4s, v19.4s\n"
+ "ldr s26, [x15, x27]\n"
+ "fmla v16.4s, v29.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 248]\n"
+ "fmla v13.4s, v29.4s, v7.4s\n"
+ "ldr x15, [%[inptrs], 208]\n"
+ "fmla v0.4s, v29.4s, v5.4s\n"
+ "fmla v12.4s, v29.4s, v19.4s\n"
+ "fmla v14.4s, v29.4s, v9.4s\n"
+ "fmla v10.4s, v29.4s, v22.4s\n"
+ "mov v11.16b, v25.16b\n"
+ "fmla v2.4s, v20.4s, v3.4s\n"
+ "fmla v16.4s, v20.4s, v6.4s\n"
+ "fmla v18.4s, v20.4s, v4.4s\n"
+ "fmla v13.4s, v20.4s, v8.4s\n"
+ "fmla v0.4s, v20.4s, v7.4s\n"
+ "fmla v17.4s, v20.4s, v5.4s\n"
+ "fmla v12.4s, v20.4s, v9.4s\n"
+ "fmla v15.4s, v20.4s, v19.4s\n"
+ "fmla v11.4s, v20.4s, v22.4s\n"
+ "mov v21.16b, v25.16b\n"
+ "fmla v18.4s, v24.4s, v6.4s\n"
+ "fmla v0.4s, v24.4s, v8.4s\n"
+ "fmla v1.4s, v24.4s, v19.4s\n"
+ "fmla v17.4s, v24.4s, v7.4s\n"
+ "fmla v14.4s, v30.4s, v5.4s\n"
+ "mov v20.16b, v25.16b\n"
+ "fmla v15.4s, v24.4s, v9.4s\n"
+ "fmla v21.4s, v24.4s, v22.4s\n"
+ "ldr s27, [x18, x27]\n"
+ "fmla v1.4s, v23.4s, v9.4s\n"
+ "ldr x18, [%[inptrs], 168]\n"
+ "fmla v17.4s, v23.4s, v8.4s\n"
+ "ldr s30, [x24, x27]\n"
+ "fmla v13.4s, v26.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 128]\n"
+ "fmla v14.4s, v26.4s, v7.4s\n"
+ "fmla v12.4s, v26.4s, v5.4s\n"
+ "fmla v10.4s, v26.4s, v19.4s\n"
+ "ldr s31, [x17, x27]\n"
+ "fmla v16.4s, v27.4s, v3.4s\n"
+ "ldr x17, [%[inptrs], 88]\n"
+ "fmla v13.4s, v27.4s, v6.4s\n"
+ "fmla v0.4s, v27.4s, v4.4s\n"
+ "fmla v14.4s, v27.4s, v8.4s\n"
+ "fmla v12.4s, v27.4s, v7.4s\n"
+ "fmla v15.4s, v27.4s, v5.4s\n"
+ "fmla v10.4s, v27.4s, v9.4s\n"
+ "fmla v11.4s, v27.4s, v19.4s\n"
+ "fmla v20.4s, v27.4s, v22.4s\n"
+ "mov v24.16b, v25.16b\n"
+ "mov v23.16b, v25.16b\n"
+ "fmla v18.4s, v30.4s, v3.4s\n"
+ "fmla v0.4s, v30.4s, v6.4s\n"
+ "fmla v17.4s, v30.4s, v4.4s\n"
+ "fmla v12.4s, v30.4s, v8.4s\n"
+ "fmla v15.4s, v30.4s, v7.4s\n"
+ "fmla v1.4s, v30.4s, v5.4s\n"
+ "fmla v11.4s, v30.4s, v9.4s\n"
+ "fmla v21.4s, v30.4s, v19.4s\n"
+ "fmla v24.4s, v30.4s, v22.4s\n"
+ "ldr s25, [x25, x27]\n"
+ "fmla v17.4s, v31.4s, v6.4s\n"
+ "ldr x25, [%[inptrs], 0]\n"
+ "fmla v15.4s, v31.4s, v8.4s\n"
+ "fmla v1.4s, v31.4s, v7.4s\n"
+ "fmla v21.4s, v31.4s, v9.4s\n"
+ "ldr s26, [x16, x27]\n"
+ "fmla v14.4s, v26.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 256]\n"
+ "fmla v10.4s, v26.4s, v5.4s\n"
+ "ldr s31, [x15, x27]\n"
+ "fmla v1.4s, v25.4s, v8.4s\n"
+ "ldr s29, [x18, x27]\n"
+ "fmla v13.4s, v31.4s, v3.4s\n"
+ "ldr x15, [%[inptrs], 216]\n"
+ "fmla v14.4s, v31.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 176]\n"
+ "fmla v12.4s, v31.4s, v4.4s\n"
+ "fmla v10.4s, v31.4s, v7.4s\n"
+ "fmla v11.4s, v31.4s, v5.4s\n"
+ "fmla v20.4s, v31.4s, v19.4s\n"
+ "fmla v0.4s, v29.4s, v3.4s\n"
+ "ldr s28, [x24, x27]\n"
+ "fmla v15.4s, v29.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 136]\n"
+ "fmla v12.4s, v29.4s, v6.4s\n"
+ "fmla v10.4s, v29.4s, v8.4s\n"
+ "fmla v11.4s, v29.4s, v7.4s\n"
+ "fmla v21.4s, v29.4s, v5.4s\n"
+ "fmla v20.4s, v29.4s, v9.4s\n"
+ "fmla v24.4s, v29.4s, v19.4s\n"
+ "fmla v23.4s, v29.4s, v22.4s\n"
+ "ldr s25, [x17, x27]\n"
+ "fmla v17.4s, v28.4s, v3.4s\n"
+ "ldr s29, [x16, x27]\n"
+ "fmla v15.4s, v28.4s, v6.4s\n"
+ "ldr x16, [%[inptrs], 264]\n"
+ "fmla v1.4s, v28.4s, v4.4s\n"
+ "ldr x17, [%[inptrs], 48]\n"
+ "fmla v11.4s, v28.4s, v8.4s\n"
+ "fmla v21.4s, v28.4s, v7.4s\n"
+ "fmla v24.4s, v28.4s, v9.4s\n"
+ "ldr s22, [x15, x27]\n"
+ "fmla v14.4s, v29.4s, v3.4s\n"
+ "ldr x15, [%[inptrs], 224]\n"
+ "fmla v1.4s, v25.4s, v6.4s\n"
+ "fmla v10.4s, v29.4s, v4.4s\n"
+ "fmla v21.4s, v25.4s, v8.4s\n"
+ "ldr s27, [x18, x27]\n"
+ "fmla v20.4s, v29.4s, v5.4s\n"
+ "ldr s26, [x24, x27]\n"
+ "fmla v12.4s, v22.4s, v3.4s\n"
+ "ldr x18, [%[inptrs], 184]\n"
+ "fmla v10.4s, v22.4s, v6.4s\n"
+ "ldr x24, [%[inptrs], 96]\n"
+ "fmla v11.4s, v22.4s, v4.4s\n"
+ "fmla v24.4s, v22.4s, v5.4s\n"
+ "fmla v20.4s, v22.4s, v7.4s\n"
+ "fmla v23.4s, v22.4s, v19.4s\n"
+ "fmla v15.4s, v27.4s, v3.4s\n"
+ "ldr s25, [x16, x27]\n"
+ "fmla v21.4s, v27.4s, v4.4s\n"
+ "ldr s31, [x15, x27]\n"
+ "fmla v11.4s, v27.4s, v6.4s\n"
+ "ldr x16, [%[inptrs], 272]\n"
+ "fmla v20.4s, v27.4s, v8.4s\n"
+ "ldr x15, [%[inptrs], 232]\n"
+ "fmla v24.4s, v27.4s, v7.4s\n"
+ "fmla v23.4s, v27.4s, v9.4s\n"
+ "fmla v1.4s, v26.4s, v3.4s\n"
+ "ldr s22, [x18, x27]\n"
+ "fmla v21.4s, v26.4s, v6.4s\n"
+ "ldr s19, [x16, x27]\n"
+ "fmla v10.4s, v25.4s, v3.4s\n"
+ "ldr x16, [%[inptrs], 280]\n"
+ "fmla v24.4s, v26.4s, v8.4s\n"
+ "ldr s28, [x15, x27]\n"
+ "fmla v20.4s, v25.4s, v4.4s\n"
+ "ldr x18, [%[inptrs], 144]\n"
+ "fmla v23.4s, v25.4s, v5.4s\n"
+ "ldr s30, [x16, x27]\n"
+ "fmla v11.4s, v31.4s, v3.4s\n"
+ "add x27, x27, #4\n"
+ "fmla v24.4s, v31.4s, v4.4s\n"
+ "ldr s27, [x25, x27]\n"
+ "fmla v20.4s, v31.4s, v6.4s\n"
+ "ldr x25, [%[inptrs], 8]\n"
+ "fmla v23.4s, v31.4s, v7.4s\n"
+ "movi v29.16b, #0\n"
+ "fmla v21.4s, v22.4s, v3.4s\n"
+ "ldr s26, [x17, x27]\n"
+ "fmla v24.4s, v22.4s, v6.4s\n"
+ "ldr x17, [%[inptrs], 56]\n"
+ "fmla v20.4s, v19.4s, v3.4s\n"
+ "fmax v2.4s, v2.4s, v29.4s\n"
+ "fmla v23.4s, v22.4s, v8.4s\n"
+ "ldr s25, [%[wbptr]]\n"
+ "fmax v18.4s, v18.4s, v29.4s\n"
+ "ldr s22, [%[wbptr], #4]\n"
+ "str s2, [x20, x28]\n"
+ "fmla v24.4s, v28.4s, v3.4s\n"
+ "fmax v17.4s, v17.4s, v29.4s\n"
+ "ldr s9, [%[wbptr], #8]\n"
+ "fmla v23.4s, v19.4s, v4.4s\n"
+ "ldr s8, [%[wbptr], #12]\n"
+ "fmax v1.4s, v1.4s, v29.4s\n"
+ "ldr s19, [%[wbptr], #16]\n"
+ "fmax v16.4s, v16.4s, v29.4s\n"
+ "ldr x20, [%[outptrs], 8]\n"
+ "fmax v0.4s, v0.4s, v29.4s\n"
+ "fmax v15.4s, v15.4s, v29.4s\n"
+ "str s18, [x20, x28]\n"
+ "fmla v23.4s, v28.4s, v6.4s\n"
+ "str s16, [x21, x28]\n"
+ "fmax v21.4s, v21.4s, v29.4s\n"
+ "fmax v13.4s, v13.4s, v29.4s\n"
+ "ldr s7, [%[wbptr], #20]\n"
+ "fmax v12.4s, v12.4s, v29.4s\n"
+ "ldr s5, [%[wbptr], #28]\n"
+ "fmla v23.4s, v30.4s, v3.4s\n"
+ "ldr s6, [%[wbptr], #24]\n"
+ "str s13, [x22, x28]\n"
+ "fmax v11.4s, v11.4s, v29.4s\n"
+ "fmax v24.4s, v24.4s, v29.4s\n"
+ "ldr s4, [%[wbptr], #32]\n"
+ "fmax v14.4s, v14.4s, v29.4s\n"
+ "ldr s31, [x25, x27]\n"
+ "fmax v10.4s, v10.4s, v29.4s\n"
+ "ldr s3, [%[wbptr], #36]\n"
+ "fmax v20.4s, v20.4s, v29.4s\n"
+ "ldr s28, [x24, x27]\n"
+ "str s14, [x23, x28]\n"
+ "fmax v23.4s, v23.4s, v29.4s\n"
+ "mov v2.16b, v25.16b\n"
+ "ldr s29, [x17, x27]\n"
+ "ldr x20, [%[outptrs], 16]\n"
+ "ldr x21, [%[outptrs], 40]\n"
+ "ldr x22, [%[outptrs], 72]\n"
+ "ldr x23, [%[outptrs], 104]\n"
+ "ldr x25, [%[inptrs], 16]\n"
+ "ldr x24, [%[inptrs], 104]\n"
+ "str s17, [x20, x28]\n"
+ "mov v16.16b, v25.16b\n"
+ "str s0, [x21, x28]\n"
+ "mov v18.16b, v25.16b\n"
+ "str s12, [x22, x28]\n"
+ "mov v13.16b, v25.16b\n"
+ "str s10, [x23, x28]\n"
+ "mov v0.16b, v25.16b\n"
+ "fmla v2.4s, v27.4s, v22.4s\n"
+ "ldr s30, [x25, x27]\n"
+ "fmla v16.4s, v26.4s, v22.4s\n"
+ "ldr x20, [%[outptrs], 24]\n"
+ "mov v17.16b, v25.16b\n"
+ "ldr x21, [%[outptrs], 48]\n"
+ "str s1, [x20, x28]\n"
+ "mov v14.16b, v25.16b\n"
+ "str s15, [x21, x28]\n"
+ "mov v12.16b, v25.16b\n"
+ "mov v15.16b, v25.16b\n"
+ "ldr x21, [%[outptrs], 56]\n"
+ "fmla v2.4s, v26.4s, v19.4s\n"
+ "ldr s27, [x18, x27]\n"
+ "str s21, [x21, x28]\n"
+ "ldr x22, [%[outptrs], 80]\n"
+ "ldr s21, [x24, x27]\n"
+ "ldr x23, [%[outptrs], 112]\n"
+ "str s11, [x22, x28]\n"
+ "fmla v2.4s, v31.4s, v9.4s\n"
+ "str s20, [x23, x28]\n"
+ "ldr x22, [%[outptrs], 88]\n"
+ "ldr x23, [%[outptrs], 120]\n"
+ "str s24, [x22, x28]\n"
+ "str s23, [x23, x28]\n"
+ "add x28, x28, #4\n"
+ "bne 5b\n"
+ "6:\n"
+ "mov v1.16b, v25.16b\n"
+ "ldr x17, [%[inptrs], 64]\n"
+ "mov v10.16b, v25.16b\n"
+ "ldr x25, [%[inptrs], 24]\n"
+ "mov v11.16b, v25.16b\n"
+ "ldr x15, [%[inptrs], 192]\n"
+ "fmla v18.4s, v31.4s, v22.4s\n"
+ "ldr s23, [x17, x27]\n"
+ "fmla v2.4s, v28.4s, v5.4s\n"
+ "ldr x18, [%[inptrs], 152]\n"
+ "fmla v16.4s, v28.4s, v19.4s\n"
+ "ldr x24, [%[inptrs], 112]\n"
+ "fmla v13.4s, v28.4s, v22.4s\n"
+ "ldr s26, [x25, x27]\n"
+ "fmla v18.4s, v29.4s, v19.4s\n"
+ "ldr x17, [%[inptrs], 72]\n"
+ "fmla v2.4s, v29.4s, v7.4s\n"
+ "ldr x25, [%[inptrs], 32]\n"
+ "fmla v16.4s, v29.4s, v9.4s\n"
+ "ldr x16, [%[inptrs], 240]\n"
+ "fmla v0.4s, v29.4s, v22.4s\n"
+ "ldr s28, [x15, x27]\n"
+ "fmla v18.4s, v30.4s, v9.4s\n"
+ "ldr x15, [%[inptrs], 200]\n"
+ "fmla v2.4s, v30.4s, v8.4s\n"
+ "ldr x20, [%[outptrs], 0]\n"
+ "fmla v17.4s, v30.4s, v22.4s\n"
+ "ldr s29, [x18, x27]\n"
+ "fmla v16.4s, v27.4s, v5.4s\n"
+ "ldr x18, [%[inptrs], 160]\n"
+ "fmla v13.4s, v27.4s, v19.4s\n"
+ "ldr x21, [%[outptrs], 32]\n"
+ "fmla v14.4s, v27.4s, v22.4s\n"
+ "ldr s20, [x24, x27]\n"
+ "fmla v2.4s, v21.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 120]\n"
+ "fmla v16.4s, v21.4s, v7.4s\n"
+ "ldr x22, [%[outptrs], 64]\n"
+ "fmla v18.4s, v21.4s, v5.4s\n"
+ "ldr x23, [%[outptrs], 96]\n"
+ "fmla v13.4s, v21.4s, v9.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v0.4s, v21.4s, v19.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v12.4s, v21.4s, v22.4s\n"
+ "ldr s24, [x17, x27]\n"
+ "fmla v2.4s, v23.4s, v6.4s\n"
+ "ldr x17, [%[inptrs], 80]\n"
+ "fmla v16.4s, v23.4s, v8.4s\n"
+ "fmla v18.4s, v23.4s, v7.4s\n"
+ "fmla v0.4s, v23.4s, v9.4s\n"
+ "fmla v17.4s, v23.4s, v19.4s\n"
+ "fmla v15.4s, v23.4s, v22.4s\n"
+ "ldr s23, [x25, x27]\n"
+ "fmla v1.4s, v26.4s, v22.4s\n"
+ "ldr x25, [%[inptrs], 40]\n"
+ "fmla v18.4s, v26.4s, v8.4s\n"
+ "fmla v13.4s, v28.4s, v5.4s\n"
+ "fmla v17.4s, v26.4s, v9.4s\n"
+ "ldr s30, [x16, x27]\n"
+ "fmla v14.4s, v28.4s, v19.4s\n"
+ "ldr s26, [x15, x27]\n"
+ "fmla v16.4s, v29.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 248]\n"
+ "fmla v13.4s, v29.4s, v7.4s\n"
+ "ldr x15, [%[inptrs], 208]\n"
+ "fmla v0.4s, v29.4s, v5.4s\n"
+ "fmla v12.4s, v29.4s, v19.4s\n"
+ "fmla v14.4s, v29.4s, v9.4s\n"
+ "fmla v10.4s, v29.4s, v22.4s\n"
+ "mov v21.16b, v25.16b\n"
+ "fmla v2.4s, v20.4s, v3.4s\n"
+ "fmla v16.4s, v20.4s, v6.4s\n"
+ "fmla v18.4s, v20.4s, v4.4s\n"
+ "fmla v13.4s, v20.4s, v8.4s\n"
+ "fmla v0.4s, v20.4s, v7.4s\n"
+ "fmla v17.4s, v20.4s, v5.4s\n"
+ "fmla v12.4s, v20.4s, v9.4s\n"
+ "fmla v15.4s, v20.4s, v19.4s\n"
+ "fmla v11.4s, v20.4s, v22.4s\n"
+ "mov v20.16b, v25.16b\n"
+ "fmla v18.4s, v24.4s, v6.4s\n"
+ "fmla v0.4s, v24.4s, v8.4s\n"
+ "fmla v1.4s, v24.4s, v19.4s\n"
+ "fmla v17.4s, v24.4s, v7.4s\n"
+ "fmla v21.4s, v24.4s, v22.4s\n"
+ "fmla v15.4s, v24.4s, v9.4s\n"
+ "ldr s27, [x18, x27]\n"
+ "fmla v14.4s, v30.4s, v5.4s\n"
+ "ldr s30, [x24, x27]\n"
+ "fmla v1.4s, v23.4s, v9.4s\n"
+ "ldr x18, [%[inptrs], 168]\n"
+ "fmla v17.4s, v23.4s, v8.4s\n"
+ "ldr s31, [x17, x27]\n"
+ "fmla v13.4s, v26.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 128]\n"
+ "fmla v14.4s, v26.4s, v7.4s\n"
+ "ldr x17, [%[inptrs], 88]\n"
+ "fmla v12.4s, v26.4s, v5.4s\n"
+ "fmla v10.4s, v26.4s, v19.4s\n"
+ "mov v24.16b, v25.16b\n"
+ "mov v23.16b, v25.16b\n"
+ "fmla v16.4s, v27.4s, v3.4s\n"
+ "fmla v13.4s, v27.4s, v6.4s\n"
+ "fmla v0.4s, v27.4s, v4.4s\n"
+ "fmla v14.4s, v27.4s, v8.4s\n"
+ "fmla v12.4s, v27.4s, v7.4s\n"
+ "fmla v15.4s, v27.4s, v5.4s\n"
+ "fmla v10.4s, v27.4s, v9.4s\n"
+ "fmla v11.4s, v27.4s, v19.4s\n"
+ "fmla v20.4s, v27.4s, v22.4s\n"
+ "ldr s25, [x25, x27]\n"
+ "fmla v18.4s, v30.4s, v3.4s\n"
+ "fmla v0.4s, v30.4s, v6.4s\n"
+ "fmla v17.4s, v30.4s, v4.4s\n"
+ "fmla v12.4s, v30.4s, v8.4s\n"
+ "fmla v15.4s, v30.4s, v7.4s\n"
+ "fmla v1.4s, v30.4s, v5.4s\n"
+ "fmla v11.4s, v30.4s, v9.4s\n"
+ "fmla v21.4s, v30.4s, v19.4s\n"
+ "fmla v24.4s, v30.4s, v22.4s\n"
+ "ldr s26, [x16, x27]\n"
+ "fmla v17.4s, v31.4s, v6.4s\n"
+ "ldr x16, [%[inptrs], 256]\n"
+ "fmla v15.4s, v31.4s, v8.4s\n"
+ "fmla v1.4s, v31.4s, v7.4s\n"
+ "fmla v21.4s, v31.4s, v9.4s\n"
+ "ldr s31, [x15, x27]\n"
+ "fmla v14.4s, v26.4s, v4.4s\n"
+ "ldr x15, [%[inptrs], 216]\n"
+ "fmla v10.4s, v26.4s, v5.4s\n"
+ "ldr s29, [x18, x27]\n"
+ "fmla v1.4s, v25.4s, v8.4s\n"
+ "ldr s28, [x24, x27]\n"
+ "fmla v13.4s, v31.4s, v3.4s\n"
+ "ldr x18, [%[inptrs], 176]\n"
+ "fmla v14.4s, v31.4s, v6.4s\n"
+ "ldr x24, [%[inptrs], 136]\n"
+ "fmla v12.4s, v31.4s, v4.4s\n"
+ "fmla v10.4s, v31.4s, v7.4s\n"
+ "fmla v11.4s, v31.4s, v5.4s\n"
+ "fmla v20.4s, v31.4s, v19.4s\n"
+ "fmla v0.4s, v29.4s, v3.4s\n"
+ "ldr s25, [x17, x27]\n"
+ "fmla v15.4s, v29.4s, v4.4s\n"
+ "fmla v21.4s, v29.4s, v5.4s\n"
+ "fmla v12.4s, v29.4s, v6.4s\n"
+ "fmla v10.4s, v29.4s, v8.4s\n"
+ "fmla v11.4s, v29.4s, v7.4s\n"
+ "fmla v20.4s, v29.4s, v9.4s\n"
+ "fmla v24.4s, v29.4s, v19.4s\n"
+ "fmla v23.4s, v29.4s, v22.4s\n"
+ "fmla v17.4s, v28.4s, v3.4s\n"
+ "ldr s29, [x16, x27]\n"
+ "fmla v15.4s, v28.4s, v6.4s\n"
+ "ldr s22, [x15, x27]\n"
+ "fmla v1.4s, v28.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 264]\n"
+ "fmla v11.4s, v28.4s, v8.4s\n"
+ "ldr x15, [%[inptrs], 224]\n"
+ "fmla v21.4s, v28.4s, v7.4s\n"
+ "fmla v24.4s, v28.4s, v9.4s\n"
+ "fmla v14.4s, v29.4s, v3.4s\n"
+ "ldr s27, [x18, x27]\n"
+ "fmla v1.4s, v25.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 184]\n"
+ "fmla v10.4s, v29.4s, v4.4s\n"
+ "fmla v20.4s, v29.4s, v5.4s\n"
+ "fmla v21.4s, v25.4s, v8.4s\n"
+ "ldr s26, [x24, x27]\n"
+ "fmla v12.4s, v22.4s, v3.4s\n"
+ "ldr s25, [x16, x27]\n"
+ "fmla v11.4s, v22.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 272]\n"
+ "fmla v10.4s, v22.4s, v6.4s\n"
+ "fmla v20.4s, v22.4s, v7.4s\n"
+ "fmla v24.4s, v22.4s, v5.4s\n"
+ "fmla v23.4s, v22.4s, v19.4s\n"
+ "fmla v15.4s, v27.4s, v3.4s\n"
+ "ldr s31, [x15, x27]\n"
+ "fmla v11.4s, v27.4s, v6.4s\n"
+ "ldr s22, [x18, x27]\n"
+ "fmla v21.4s, v27.4s, v4.4s\n"
+ "ldr x15, [%[inptrs], 232]\n"
+ "fmla v20.4s, v27.4s, v8.4s\n"
+ "fmla v24.4s, v27.4s, v7.4s\n"
+ "fmla v23.4s, v27.4s, v9.4s\n"
+ "ldr s19, [x16, x27]\n"
+ "fmla v1.4s, v26.4s, v3.4s\n"
+ "ldr s28, [x15, x27]\n"
+ "fmla v21.4s, v26.4s, v6.4s\n"
+ "ldr x16, [%[inptrs], 280]\n"
+ "fmla v24.4s, v26.4s, v8.4s\n"
+ "fmla v10.4s, v25.4s, v3.4s\n"
+ "fmla v20.4s, v25.4s, v4.4s\n"
+ "ldr s30, [x16, x27]\n"
+ "fmla v23.4s, v25.4s, v5.4s\n"
+ "add x27, x27, #4\n"
+ "fmla v11.4s, v31.4s, v3.4s\n"
+ "fmla v21.4s, v22.4s, v3.4s\n"
+ "fmla v24.4s, v31.4s, v4.4s\n"
+ "movi v29.16b, #0\n"
+ "fmla v20.4s, v31.4s, v6.4s\n"
+ "fmla v23.4s, v31.4s, v7.4s\n"
+ "fmax v2.4s, v2.4s, v29.4s\n"
+ "fmax v18.4s, v18.4s, v29.4s\n"
+ "fmla v24.4s, v22.4s, v6.4s\n"
+ "fmax v17.4s, v17.4s, v29.4s\n"
+ "fmla v20.4s, v19.4s, v3.4s\n"
+ "fmax v1.4s, v1.4s, v29.4s\n"
+ "str s2, [x20, x28]\n"
+ "fmla v23.4s, v22.4s, v8.4s\n"
+ "fmax v16.4s, v16.4s, v29.4s\n"
+ "ldr x20, [%[outptrs], 8]\n"
+ "fmla v24.4s, v28.4s, v3.4s\n"
+ "fmax v0.4s, v0.4s, v29.4s\n"
+ "str s18, [x20, x28]\n"
+ "fmax v15.4s, v15.4s, v29.4s\n"
+ "str s16, [x21, x28]\n"
+ "fmla v23.4s, v19.4s, v4.4s\n"
+ "fmax v21.4s, v21.4s, v29.4s\n"
+ "ldr x20, [%[outptrs], 16]\n"
+ "fmax v13.4s, v13.4s, v29.4s\n"
+ "ldr x21, [%[outptrs], 40]\n"
+ "str s17, [x20, x28]\n"
+ "fmax v12.4s, v12.4s, v29.4s\n"
+ "str s0, [x21, x28]\n"
+ "fmla v23.4s, v28.4s, v6.4s\n"
+ "str s13, [x22, x28]\n"
+ "fmax v11.4s, v11.4s, v29.4s\n"
+ "fmax v24.4s, v24.4s, v29.4s\n"
+ "ldr x20, [%[outptrs], 24]\n"
+ "fmax v14.4s, v14.4s, v29.4s\n"
+ "ldr x21, [%[outptrs], 48]\n"
+ "str s1, [x20, x28]\n"
+ "fmla v23.4s, v30.4s, v3.4s\n"
+ "str s15, [x21, x28]\n"
+ "fmax v10.4s, v10.4s, v29.4s\n"
+ "str s14, [x23, x28]\n"
+ "fmax v20.4s, v20.4s, v29.4s\n"
+ "ldr x21, [%[outptrs], 56]\n"
+ "ldr x22, [%[outptrs], 72]\n"
+ "ldr x23, [%[outptrs], 104]\n"
+ "fmax v23.4s, v23.4s, v29.4s\n"
+ "str s21, [x21, x28]\n"
+ "str s12, [x22, x28]\n"
+ "str s10, [x23, x28]\n"
+ "ldr x22, [%[outptrs], 80]\n"
+ "ldr x23, [%[outptrs], 112]\n"
+ "str s11, [x22, x28]\n"
+ "str s20, [x23, x28]\n"
+ "ldr x22, [%[outptrs], 88]\n"
+ "ldr x23, [%[outptrs], 120]\n"
+ "str s24, [x22, x28]\n"
+ "str s23, [x23, x28]\n"
+ "add x28, x28, #4\n"
+ "7:\n"
+ : [wbptr] "+r" (weight_bias_ptr)
+ : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs)
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
+
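For readers tracing the labels in the block above: the quad-register body that precedes this point advances the input/output byte offsets (x27/x28) by 16 per step, label 4 enters the leftover-channel section guarded by the x19 counter, label 5 is the single-channel loop body, label 6 the peeled final iteration, and label 7 the exit. A minimal C++ sketch of that vector/scalar split, substituting one per-channel multiply for the kernel's full 3x3 tile accumulation (register roles are inferred from the code, not from any published mapping):

#include <arm_neon.h>
#include <algorithm>

// Hedged sketch only: the real kernel accumulates a whole 3x3 window per
// output element; this shows just the 4-wide/1-wide channel loop structure.
void channel_loop_sketch(int n_channels, const float *in, const float *w, float *out)
{
  int n = 0;
  for (; n_channels >= 4; n_channels -= 4, n += 4)  // quad-register body ("ldr q"/"str q")
  {
    float32x4_t v = vmulq_f32(vld1q_f32(in + n), vld1q_f32(w + n));
    v = vmaxq_f32(v, vdupq_n_f32(0.0f));            // the "fmax ..., v29" ReLU clamp
    vst1q_f32(out + n, v);
  }
  for (; n_channels; n_channels--, n++)             // labels 4-6 ("ldr s"/"str s")
  {
    out[n] = std::max(0.0f, in[n] * w[n]);
  }
}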
+template <>
+template <>
void Conv::execute_tile<ActivationFunction::ReLU6>(
int n_channels,
const void *weight_bias_ptr,
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
index 1ae8128d55..cbdb19a067 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
@@ -63,7 +63,6 @@ DepthwiseConvolution<
{
}
-
template <
unsigned int OutputTileRows, unsigned int OutputTileCols,
unsigned int KernelRows, unsigned int KernelCols,
@@ -92,7 +91,6 @@ void DepthwiseConvolution<
// Perform the depthwise convolution
int channels_remaining = n_channels;
-#ifdef __aarch64__
for (; channels_remaining >= 8; channels_remaining -= 8)
{
// Load input tile
@@ -140,6 +138,8 @@ void DepthwiseConvolution<
for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
{
const unsigned int j = base_j + in_j;
+
+ // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
}
}
@@ -168,7 +168,6 @@ void DepthwiseConvolution<
}
outptr_base += 8;
}
-#endif // __aarch64__
for (; channels_remaining; channels_remaining--)
{
// Load input tile
@@ -244,5 +243,172 @@ void DepthwiseConvolution<
}
}
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+template <ActivationFunction Activation>
+void DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols, StrideRows, StrideCols,
+ float16_t, float16_t, float16_t
+>::execute_tile(
+ int n_channels,
+ const void *weights_biases_ptr,
+ const float16_t * inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+ float16_t *outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+ // Instantiate pointers
+ const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr);
+ int n = 0;
+
+ // Perform the depthwise convolution
+ int channels_remaining = n_channels;
+ for (; channels_remaining >= 8; channels_remaining -= 8, n += 8)
+ {
+ // Load input tile
+ float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (int i = 0; i < Base::inner_tile_rows; i++)
+ {
+ for (int j = 0; j < Base::inner_tile_cols; j++)
+ {
+ u[i][j] = vld1q_f16(inptrs[i][j] + n);
+ }
+ }
+
+ // Load weights tile
+ float16x8_t vbias = vld1q_f16(params);
+ params += 8;
+
+ float16x8_t w[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelCols; j++)
+ {
+ w[i][j] = vld1q_f16(params);
+ params += 8;
+ }
+ }
+
+ // Perform the convolution
+ float16x8_t v[OutputTileRows][OutputTileCols];
+ for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
+ {
+ for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
+ {
+ v[out_i][out_j] = vbias;
+
+ // Base co-ordinate
+ const int base_i = out_i * StrideRows;
+ const int base_j = out_j * StrideCols;
+
+ // Fill the accumulator
+ for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
+ {
+ const unsigned int i = base_i + in_i;
+ for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
+ {
+ const unsigned int j = base_j + in_j;
+
+ // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
+ }
+ }
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f));
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f));
+ }
+ }
+ }
+
+ // Store the output tile
+ for (unsigned int i = 0; i < OutputTileRows; i++)
+ {
+ for (unsigned int j = 0; j < OutputTileCols; j++)
+ {
+ vst1q_f16(outptrs[i][j] + n, v[i][j]);
+ }
+ }
+ }
+ for (; channels_remaining; channels_remaining--, n++)
+ {
+ // Load input tile
+ float16_t u[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (int i = 0; i < Base::inner_tile_rows; i++)
+ {
+ for (int j = 0; j < Base::inner_tile_cols; j++)
+ {
+ u[i][j] = *(inptrs[i][j] + n);
+ }
+ }
+
+ // Load weights tile
+ float16_t bias = *(params++);
+ float16_t w[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelCols; j++)
+ {
+ w[i][j] = *(params++);
+ }
+ }
+
+ // Perform the convolution
+ float16_t v[OutputTileRows][OutputTileCols];
+ for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
+ {
+ for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
+ {
+        // Initialise the accumulator with the bias
+ v[out_i][out_j] = bias;
+
+ // Base co-ordinate
+ const int base_i = out_i * StrideRows;
+ const int base_j = out_j * StrideCols;
+
+ // Fill the accumulator
+ for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
+ {
+ const unsigned int i = base_i + in_i;
+ for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
+ {
+          const unsigned int j = base_j + in_j;
+ v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ }
+ }
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]);
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]);
+ }
+ }
+ }
+
+ // Store the output tile
+ for (unsigned int i = 0; i < OutputTileRows; i++)
+ {
+ for (unsigned int j = 0; j < OutputTileCols; j++)
+ {
+ *(outptrs[i][j] + n) = v[i][j];
+ }
+ }
+ }
+}
+
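One property shared by all of these intrinsic fallbacks: Activation is a non-type template parameter, so the activation tests above are resolved when the template is instantiated and the untaken clamps disappear from the generated code; only the ReLU/ReLU6 instantiations retain any max/min work. A standalone sketch of the pattern, using a local stand-in for the library's ActivationFunction enum:

// Minimal sketch; this enum is a local stand-in, not the library's type.
enum class ActivationFunction { None, ReLU, ReLU6 };

template <ActivationFunction Activation>
float activate_sketch(float v)
{
  if (Activation == ActivationFunction::ReLU || Activation == ActivationFunction::ReLU6)
  {
    v = (v > 0.0f) ? v : 0.0f;  // kept only in ReLU/ReLU6 instantiations
  }
  if (Activation == ActivationFunction::ReLU6)
  {
    v = (v < 6.0f) ? v : 6.0f;  // kept only in the ReLU6 instantiation
  }
  return v;
}

// e.g. activate_sketch<ActivationFunction::ReLU6>(7.5f) returns 6.0f.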
} // namespace depthwise
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
index 10d110feb8..264576137c 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
@@ -92,7 +92,6 @@ void DepthwiseConvolution<
// Perform the depthwise convolution
int channels_remaining = n_channels;
-#ifdef __aarch64__
for (; channels_remaining >= 4; channels_remaining -= 4)
{
// Load input tile
@@ -170,7 +169,6 @@ void DepthwiseConvolution<
}
outptr_base += 4;
}
-#endif // __aarch64__
for (; channels_remaining; channels_remaining--)
{
// Load input tile
@@ -246,4 +244,171 @@ void DepthwiseConvolution<
}
}
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+template <ActivationFunction Activation>
+void DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols, StrideRows, StrideCols,
+ float, float, float
+>::execute_tile(
+ int n_channels,
+ const void *weights_biases_ptr,
+ const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+ float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+ const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr);
+
+ // Perform the depthwise convolution
+ int channels_remaining = n_channels;
+ int n = 0;
+ for (; channels_remaining >= 4; channels_remaining -= 4, n += 4)
+ {
+ // Load input tile
+ float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (int i = 0; i < Base::inner_tile_rows; i++)
+ {
+ for (int j = 0; j < Base::inner_tile_cols; j++)
+ {
+ u[i][j] = vld1q_f32(inptrs[i][j] + n);
+ }
+ }
+
+ // Load weights tile
+ float32x4_t vbias = vld1q_f32(params);
+ params += 4;
+
+ float32x4_t w[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelCols; j++)
+ {
+ w[i][j] = vld1q_f32(params);
+ params += 4;
+ }
+ }
+
+ // Perform the convolution
+ float32x4_t v[OutputTileRows][OutputTileCols];
+ for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
+ {
+ for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
+ {
+ v[out_i][out_j] = vbias;
+
+ // Base co-ordinate
+ const int base_i = out_i * StrideRows;
+ const int base_j = out_j * StrideCols;
+
+ // Fill the accumulator
+ for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
+ {
+ const unsigned int i = base_i + in_i;
+ for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
+ {
+ const unsigned int j = base_j + in_j;
+
+ // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
+ }
+ }
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f));
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f));
+ }
+ }
+ }
+
+ // Store the output tile
+ for (unsigned int i = 0; i < OutputTileRows; i++)
+ {
+ for (unsigned int j = 0; j < OutputTileCols; j++)
+ {
+ vst1q_f32(outptrs[i][j] + n, v[i][j]);
+ }
+ }
+ }
+ for (; channels_remaining; channels_remaining--, n++)
+ {
+ // Load input tile
+ float u[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (int i = 0; i < Base::inner_tile_rows; i++)
+ {
+ for (int j = 0; j < Base::inner_tile_cols; j++)
+ {
+ u[i][j] = *(inptrs[i][j] + n);
+ }
+ }
+
+ // Load weights tile
+ float bias = *(params++);
+ float w[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelCols; j++)
+ {
+ w[i][j] = *(params++);
+ }
+ }
+
+ // Perform the convolution
+ float v[OutputTileRows][OutputTileCols];
+ for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
+ {
+ for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
+ {
+        // Initialise the accumulator with the bias
+ v[out_i][out_j] = bias;
+
+ // Base co-ordinate
+ const int base_i = out_i * StrideRows;
+ const int base_j = out_j * StrideCols;
+
+ // Fill the accumulator
+ for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
+ {
+ const unsigned int i = base_i + in_i;
+ for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
+ {
+            const unsigned int j = base_j + in_j;
+ v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ }
+ }
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]);
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]);
+ }
+ }
+ }
+
+ // Store the output tile
+ for (unsigned int i = 0; i < OutputTileRows; i++)
+ {
+ for (unsigned int j = 0; j < OutputTileCols; j++)
+ {
+ *(outptrs[i][j] + n) = v[i][j];
+ }
+ }
+ }
+}
+
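The pointer-array signature of these new execute_tile overloads replaces the base-pointer-plus-strides interface of the existing ones, and a plausible reading, consistent with this patch's padding fixes, is that the caller can alias any tile position falling in the padded border to a shared zero buffer so the kernel itself never reads out of bounds. A hypothetical caller-side sketch; the 4x4 inner-tile geometry and every name here are assumptions, and the library's actual dispatch logic lives in impl_base.hpp:

// Hypothetical helper, not library code: point padded positions at zero_buf
// (which must hold at least n_channels zeroes) and valid ones into the input.
void fill_inptrs_sketch(
  const float *input, int row_stride, int col_stride,
  int first_row, int first_col, int input_rows, int input_cols,
  const float *zero_buf, const float *inptrs[4][4])
{
  for (int i = 0; i < 4; i++)
  {
    for (int j = 0; j < 4; j++)
    {
      const int r = first_row + i, c = first_col + j;
      const bool padded = (r < 0 || c < 0 || r >= input_rows || c >= input_cols);
      inptrs[i][j] = padded ? zero_buf : input + r * row_stride + c * col_stride;
    }
  }
}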
} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
index 72f7c6b511..be73065b00 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
@@ -631,4 +631,329 @@ void QAsymm8DepthwiseConvolution<
}
}
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+template<ActivationFunction Activation>
+void QAsymm8DepthwiseConvolution<
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+ uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+ // Activation parameters (unused if Activation is None)
+ const uint8_t aqmin = _output_quant.offset;
+ const uint8_t aqmax = (Activation == ActivationFunction::ReLU6) ?
+ std::min<uint8_t>(255u, _output_quant.quantize(6.0f)) : 255u;
+
+ // Byte type pointer to weights and biases
+ const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);
+
+ // Offset into input/output tensors
+ int n = 0;
+
+#if defined(__aarch64__) // Use quad (128-bit) registers only under AArch64
+ for (; n_channels >= 16; n_channels -= 16, n += 16)
+ {
+ // Load biases
+ const int32x4_t biases[4] = {
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12)
+ };
+ wbptr += 16*sizeof(int32_t);
+
+ // Load weights
+ uint8x16_t weights[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelCols; j++)
+ {
+ weights[i][j] = vld1q_u8(wbptr);
+ wbptr += 16;
+ }
+ }
+
+ // Load the input activations
+ uint8x16_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
+ {
+ for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
+ {
+ inputs[i][j] = vld1q_u8(inptrs[i][j] + n);
+ }
+ }
+
+ // Perform the convolution
+ for (unsigned int oi = 0; oi < OutputTileRows; oi++)
+ {
+ for (unsigned int oj = 0; oj < OutputTileCols; oj++)
+ {
+        // Two sets of operations are required: we perform the
+        // multiply-accumulates for the convolution proper, but must also sum
+        // the tile elements to account for the _weight_ offset (see the
+        // scalar sketch after this function).
+ uint32x4_t accs[4];
+ for (unsigned int i = 0; i < 4; i++)
+ {
+ accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
+ }
+
+ for (unsigned int wi = 0; wi < KernelRows; wi++)
+ {
+ for (unsigned int wj = 0; wj < KernelCols; wj++)
+ {
+ // Get relevant weight and activation pixel
+ const uint8x16_t w = weights[wi][wj];
+ const uint8x16_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
+
+ // Perform multiplication and accumulation
+ const uint16x8_t muls[2] = {
+ vmull_u8(vget_low_u8(w), vget_low_u8(x)),
+ vmull_u8(vget_high_u8(w), vget_high_u8(x))
+ };
+
+ const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
+ const uint16x8_t sum_elems[2] = {
+ vmull_u8(vget_low_u8(x), woffset),
+ vmull_u8(vget_high_u8(x), woffset)
+ };
+
+ const uint32x4_t tmps[4] = {
+ vsubl_u16(vget_low_u16(muls[0]), vget_low_u16(sum_elems[0])),
+ vsubl_u16(vget_high_u16(muls[0]), vget_high_u16(sum_elems[0])),
+ vsubl_u16(vget_low_u16(muls[1]), vget_low_u16(sum_elems[1])),
+ vsubl_u16(vget_high_u16(muls[1]), vget_high_u16(sum_elems[1])),
+ };
+ for (unsigned int i = 0; i < 4; i++)
+ {
+ accs[i] = vaddq_u32(accs[i], tmps[i]);
+ }
+ }
+ }
+
+ // Rescale the accumulator and add in the new offset.
+ uint32x4_t final_accs[4];
+ for (unsigned int i = 0; i < 4; i++)
+ {
+#ifdef FIXED_POINT_REQUANTISATION
+ const int32x4_t y = rounding_divide_by_exp2(
+ saturating_doubling_high_mul(
+ reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
+ ),
+ rescale_parameters.shift
+ );
+ const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
+ final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
+#else // floating point requantisation
+ float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
+ fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
+ fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
+ fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
+ final_accs[i] = vcvtq_u32_f32(fp_acc);
+#endif
+ }
+
+ uint8x16_t output = vcombine_u8(
+ vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))),
+ vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[2]), vqmovn_u32(final_accs[3])))
+ );
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ output = vmaxq_u8(output, vdupq_n_u8(aqmin));
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ output = vminq_u8(output, vdupq_n_u8(aqmax));
+ }
+
+ vst1q_u8(outptrs[oi][oj] + n, output);
+ }
+ }
+ }
+#endif // defined(__aarch64__)
+ for (; n_channels >= 8; n_channels -= 8, n += 8)
+ {
+ const int32x4_t biases[2] = {
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
+ };
+ wbptr += 8*sizeof(int32_t);
+
+ uint8x8_t weights[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelCols; j++)
+ {
+ weights[i][j] = vld1_u8(wbptr);
+ wbptr += 8;
+ }
+ }
+
+ uint8x8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
+ {
+ for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
+ {
+ inputs[i][j] = vld1_u8(inptrs[i][j] + n);
+ }
+ }
+
+ for (unsigned int oi = 0; oi < OutputTileRows; oi++)
+ {
+ for (unsigned int oj = 0; oj < OutputTileCols; oj++)
+ {
+ uint32x4_t accs[2];
+ for (unsigned int i = 0; i < 2; i++)
+ {
+ accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
+ }
+
+ for (unsigned int wi = 0; wi < KernelRows; wi++)
+ {
+ for (unsigned int wj = 0; wj < KernelCols; wj++)
+ {
+ const uint8x8_t w = weights[wi][wj];
+ const uint8x8_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
+
+ const uint16x8_t muls = vmull_u8(w, x);
+ const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
+ const uint16x8_t sum_elems = vmull_u8(x, woffset);
+
+ const uint32x4_t tmps[2] = {
+ vsubl_u16(vget_low_u16(muls), vget_low_u16(sum_elems)),
+ vsubl_u16(vget_high_u16(muls), vget_high_u16(sum_elems)),
+ };
+ for (unsigned int i = 0; i < 2; i++)
+ {
+ accs[i] = vaddq_u32(accs[i], tmps[i]);
+ }
+ }
+ }
+
+ uint32x4_t final_accs[2];
+ for (unsigned int i = 0; i < 2; i++)
+ {
+#ifdef FIXED_POINT_REQUANTISATION
+ const int32x4_t y = rounding_divide_by_exp2(
+ saturating_doubling_high_mul(
+ reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
+ ),
+ rescale_parameters.shift
+ );
+ const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
+ final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
+#else // floating point requantisation
+ float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
+ fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
+ fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
+ fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
+ final_accs[i] = vcvtq_u32_f32(fp_acc);
+#endif
+ }
+
+ uint8x8_t output = vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1])));
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ output = vmax_u8(output, vdup_n_u8(aqmin));
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ output = vmin_u8(output, vdup_n_u8(aqmax));
+ }
+
+ vst1_u8(outptrs[oi][oj] + n, output);
+ }
+ }
+ }
+ for (; n_channels; n_channels--, n++)
+ {
+ // Load bias
+ const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
+ wbptr += sizeof(int32_t);
+
+ // Load weights
+ uint8_t weights[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelCols; j++)
+ {
+ weights[i][j] = *(wbptr++);
+ }
+ }
+
+ // Load the input activations
+ uint8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
+ {
+ for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
+ {
+ inputs[i][j] = *(inptrs[i][j] + n);
+ }
+ }
+
+ // Perform the convolution
+ for (unsigned int oi = 0; oi < OutputTileRows; oi++)
+ {
+ for (unsigned int oj = 0; oj < OutputTileCols; oj++)
+ {
+ int32_t acc = bias;
+ uint32_t element_sum = 0;
+
+ for (unsigned int wi = 0; wi < KernelRows; wi++)
+ {
+ for (unsigned int wj = 0; wj < KernelCols; wj++)
+ {
+ const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
+ acc += static_cast<int32_t>(static_cast<uint32_t>(w) * static_cast<uint32_t>(x));
+ element_sum += static_cast<uint32_t>(x);
+ }
+ }
+
+ acc -= static_cast<int32_t>(element_sum) * static_cast<int32_t>(_weights_quant.offset);
+
+ // Requantize
+#ifdef FIXED_POINT_REQUANTISATION
+ acc = rounding_divide_by_exp2(
+ saturating_doubling_high_mul(acc, rescale_parameters.multiplier),
+ rescale_parameters.shift
+ );
+ acc += _output_quant.offset;
+ uint8_t output = clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(acc);
+#else // floating point requantisation
+ float fp_acc = static_cast<float>(acc);
+ fp_acc *= rescale_parameters.rescale;
+ fp_acc += static_cast<float>(_output_quant.offset);
+ fp_acc = std::max<float>(fp_acc, 0.0f);
+ uint8_t output = static_cast<uint8_t>(std::min<int32_t>(static_cast<int32_t>(fp_acc), 255));
+#endif
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ output = std::max(output, aqmin);
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ output = std::min(output, aqmax);
+ }
+
+ *(outptrs[oi][oj] + n) = output;
+ }
+ }
+ }
+}
+
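The scalar tail above spells out the quantised arithmetic that the vector paths implement with vmull_u8/vsubl_u16: the kernel accumulates sum(w * x) and subtracts _weights_quant.offset * sum(x), which equals sum((w - w_off) * x). The remaining terms of the full product sum((w - w_off) * (x - x_off)), namely -x_off * sum(w) + N * w_off * x_off, depend only on the weights and the fixed input offset, so they are presumably folded into the packed bias when the parameters are prepared. A standalone scalar sketch under that assumption:

#include <cstdint>

// Sketch of the accumulation above; assumes the input-offset terms were
// already folded into `bias` at weight-packing time (an inference from the
// code, not a documented contract of the library).
int32_t quantised_acc_sketch(
    int32_t bias, const uint8_t *w, const uint8_t *x, int n, uint8_t w_off)
{
  int32_t acc = bias;
  uint32_t x_sum = 0;
  for (int k = 0; k < n; k++)
  {
    acc += static_cast<int32_t>(static_cast<uint32_t>(w[k]) * x[k]);
    x_sum += x[k];
  }
  // Subtracting w_off * sum(x) completes sum((w - w_off) * x).
  return acc - static_cast<int32_t>(x_sum) * static_cast<int32_t>(w_off);
}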
} // namespace depthwise