aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/convolution/depthwise
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/convolution/depthwise')
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise.hpp551
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp1168
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp2809
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp2341
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp769
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp6018
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp42
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp156
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp144
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp34
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp31
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp102
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp32
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp31
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp291
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp88
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/impl_base.hpp505
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp295
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp439
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp438
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp511
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp457
22 files changed, 0 insertions, 17252 deletions
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp
deleted file mode 100644
index 70d6689731..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp
+++ /dev/null
@@ -1,551 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <arm_neon.h>
-#include "activation.hpp"
-#include "padding.hpp"
-
-namespace depthwise
-{
-
-namespace nck = neon_convolution_kernels;
-
-class IDepthwiseConvolution  // Abstract interface for depthwise convolution engines; every operation below is pure virtual.
-{
- public:
- virtual ~IDepthwiseConvolution() = default;  // virtual, so an engine can be destroyed through a pointer to this interface
-
- virtual int output_size(  // size of one output dimension (rows or columns) for the given input size and padding
- int dim_size,  // number of elements in the input dimension
- unsigned int padding_before,  // padding applied before the first element of the dimension
- unsigned int padding_after  // padding applied after the last element of the dimension
- ) const = 0;
-
- /* Set input tensor and stride. */
- virtual void set_input(const void *inptr) = 0;  // NOTE(review): strides not passed are presumably derived by the implementation - confirm
- virtual void set_input(const void *inptr, int column_stride) = 0;
- virtual void set_input(const void *inptr, int row_stride, int column_stride) = 0;
- virtual void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) = 0;
-
- /* Set output tensor and stride. */
- virtual void set_output(void *outptr) = 0;  // NOTE(review): strides not passed are presumably derived by the implementation - confirm
- virtual void set_output(void *outptr, int column_stride) = 0;
- virtual void set_output(void *outptr, int row_stride, int column_stride) = 0;
- virtual void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) = 0;
-
- /* Weights and biases are re-ordered to improve memory access patterns. Use
- * these methods to determine the size of the re-pack buffer and to set the
- * address (and implicitly reorder the weights and biases into) the buffer.
- */
- virtual size_t get_packed_params_size(void) const = 0;  // size of the re-pack buffer the caller must provide
- virtual void set_packed_params_buffer(void *) = 0;  // address of the caller-provided re-pack buffer
-
- virtual void pack_params(const void *weights, const void *biases=nullptr) const = 0;  // biases are optional (default nullptr)
- virtual void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const = 0;  // as above, packing into an explicit buffer
- virtual void pack_params(  // as above, with explicit strides for the weight tensor
- void *buffer,
- const void* weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases=nullptr
- ) const = 0;
-
- /* Working space is used to pad tensors on the fly. Before running any
- * inference check the amount of space required, allocate and provide a
- * pointer to the convolution engine.
- */
- virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0;  // space required when up to nthreads threads call run()
- virtual void set_working_space(void *) = 0;  // caller-allocated working space
-
- virtual unsigned int get_window(void) const = 0;  // total amount of work; run() is given sub-ranges of it
- virtual void run(  // execute the portion of the window between start and stop
- unsigned int start,
- unsigned int stop,
- unsigned int threadid=0  // identifies the calling thread
- ) = 0;
-};
-
-/** Common base of the tiled depthwise convolution engines.
- *
- * Implements the IDepthwiseConvolution interface; the per-tile kernels are
- * declared here as execute_tile() and re-declared by the class passed as
- * Derived.
- *
- * @tparam OutputTileRows, OutputTileCols Size of the output tile produced per kernel call.
- * @tparam KernelRows, KernelCols Size of the convolution kernel.
- * @tparam StrideRows, StrideCols Convolution stride.
- * @tparam TIn, TBias, TOut Element types of input, bias and output.
- * @tparam Derived Concrete engine type deriving from this class.
- */
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename TIn, typename TBias, typename TOut,
- typename Derived
->
-class DepthwiseConvolutionBase : public IDepthwiseConvolution
-{
- public:
- // Information about the specific convolution instance
- using InputType = TIn;
- using BiasType = TBias;
- using OutputType = TOut;
- static constexpr int output_tile_rows = OutputTileRows;
- static constexpr int output_tile_cols = OutputTileCols;
- static constexpr int kernel_rows = KernelRows;
- static constexpr int kernel_cols = KernelCols;
- static constexpr int stride_rows = StrideRows;
- static constexpr int stride_cols = StrideCols;
- // Extent of the input patch covered by one output tile:
- // stride * (output tile size - 1) + kernel size.
- static constexpr int inner_tile_rows = stride_rows * (output_tile_rows - 1) + kernel_rows;
- static constexpr int inner_tile_cols = stride_cols * (output_tile_cols - 1) + kernel_cols;
-
- /** Create a new depthwise convolution engine.
- *
- * The output dimensions are not passed to this overload.
- *
- * @param[in] n_batches Number of batches in the tensors.
- * @param[in] n_input_rows Number of rows in input tensor.
- * @param[in] n_input_cols Number of columns in input tensor.
- * @param[in] n_channels Number of channels in input and output tensors.
- * @param[in] activation Activation function selected for this engine.
- * @param[in] padding_top Padding above the first input row.
- * @param[in] padding_left Padding before the first input column.
- * @param[in] padding_bottom Padding below the last input row.
- * @param[in] padding_right Padding after the last input column.
- */
- DepthwiseConvolutionBase(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- /** Create a new depthwise convolution engine with explicit output size.
- *
- * @param[in] n_batches Number of batches in the tensors.
- * @param[in] n_input_rows Number of rows in input tensor.
- * @param[in] n_input_cols Number of columns in input tensor.
- * @param[in] n_channels Number of channels in input and output tensors.
- * @param[in] n_output_rows Number of rows in output tensor.
- * @param[in] n_output_cols Number of columns in output tensor.
- * @param[in] activation Activation function selected for this engine.
- * @param[in] padding_top Padding above the first input row.
- * @param[in] padding_left Padding before the first input column.
- * @param[in] padding_bottom Padding below the last input row.
- * @param[in] padding_right Padding after the last input column.
- */
- DepthwiseConvolutionBase(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- // Cannot copy or move a DepthwiseConvolution. Declaring the copy operations
- // (here in their canonical const-reference form) also suppresses the
- // implicitly generated move operations.
- DepthwiseConvolutionBase(const DepthwiseConvolutionBase&) = delete;
- DepthwiseConvolutionBase& operator=(const DepthwiseConvolutionBase&) = delete;
-
- /* Set input tensor and stride. */
- void set_input(const void *inptr) override;
- void set_input(const void *inptr, int column_stride) override;
- void set_input(const void *inptr, int row_stride, int column_stride) override;
- void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
-
- /* Set output tensor and stride. */
- void set_output(void *outptr) override;
- void set_output(void *outptr, int column_stride) override;
- void set_output(void *outptr, int row_stride, int column_stride) override;
- void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
-
- /** Get the number of output rows/columns.
- *
- * @param[in] dim_size Number of elements in the dimension (rows/columns)
- * @param[in] padding_before Padding applied before the first element of the dimension.
- * @param[in] padding_after Padding applied after the last element of the dimension.
- */
- static int get_output_size(
- int dim_size, unsigned int padding_before, unsigned int padding_after
- );
-
- /** Virtual counterpart of get_output_size(), taking the same arguments.
- */
- int output_size(
- int dim_size, unsigned int padding_before, unsigned int padding_after
- ) const override;
-
- /* Determine how much memory is required to store the packed weights and
- * biases.
- */
- size_t get_packed_params_size(void) const override;
-
- /* Set the buffer for the packed weights and biases, and perform the
- * packing.
- */
- void set_packed_params_buffer(void *buffer) override;
-
- void pack_params(const void *weights, const void *biases=nullptr) const override;
-
- void pack_params(
- void *buffer,
- const void *weights,
- const void *biases=nullptr
- ) const override;
-
- void pack_params(
- void *buffer,
- const void *weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases=nullptr
- ) const override;
-
- /** Query the amount of working space required.
- * @param[in] n_threads The largest number of threads which will be used to
- * execute the kernel.
- */
- size_t get_working_space_size(unsigned int n_threads=1) const override;
-
- /** Set the working space buffer.
- */
- void set_working_space(void *buffer) override;
-
- /** Get the window of work to be performed by an instance of the operator.
- */
- unsigned int get_window(void) const override;
-
- /** Perform a portion of the work associated with the operator.
- *
- * Will perform the window of work described by $[start, stop)$.
- *
- * @param[in] start Start of the window of work to perform.
- * @param[in] stop End of the work to perform.
- * @param[in] threadid ID of the thread performing the work.
- */
- void run(
- unsigned int start,
- unsigned int stop,
- unsigned int threadid=0
- ) override;
-
- protected:
- /** Get the value to use to pad the tensor.
- */
- TIn _input_padding_value(void) const;
-
- /** Implementation of the parameter packing.
- */
- void _pack_params(
- void *buffer,
- const void *weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases=nullptr
- ) const;
-
- /** Process a tile-row of the tensors.
- */
- void process_tile_row(
- unsigned int threadid,
- int n_channels,
- const void* packed_params,
- const InputType* inptr,
- OutputType* outptr,
- int row_pad_in_top,
- int row_pad_in_left,
- int row_pad_in_bottom,
- int row_pad_out_bottom,
- int n_tiles,
- int n_input_cols,
- int n_output_cols
- );
-
- /** Process a single tile of the tensor.
- *
- * This method will apply input/output padding (if required) and call the
- * depthwise tile implementation.
- */
- void process_tile(
- unsigned int threadid,
- int n_channels,
- const void* packed_params,
- const InputType* inptr,
- OutputType* outptr,
- int pad_in_top,
- int pad_in_left,
- int pad_in_bottom,
- int pad_in_right,
- int pad_out_bottom,
- int pad_out_right
- );
-
- /** Perform depthwise convolution on a single tile.
- *
- * This overload addresses the tile through a base pointer plus row and
- * column strides.
- */
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const InputType* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- OutputType* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- );
-
- /** Perform depthwise convolution on a single tile.
- *
- * This overload takes one pointer per input element of the inner tile and
- * one pointer per output element of the output tile.
- */
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const InputType* inptrs[inner_tile_rows][inner_tile_cols],
- OutputType* outptrs[output_tile_rows][output_tile_cols]
- );
-
- int n_channels(void) const;
-
- private:
- // Member variables of instances of a convolution engine.
- const InputType* _input;
- OutputType* _output;
- void* _packed_parameters;
- void* _working_space; // Per-thread working space
- const int _n_batches, _n_input_rows, _n_input_cols, _n_channels,
- _n_output_rows, _n_output_cols, _n_tile_rows, _n_tile_cols;
- const unsigned int _padding_top, _padding_left, _padding_bottom, _padding_right;
- const nck::ActivationFunction _activation;
-
- // Stride information for a convolution instance
- int _input_col_stride, _input_row_stride, _input_batch_stride;
- int _output_col_stride, _output_row_stride, _output_batch_stride;
-
- // Methods for getting access to working space
- size_t _get_input_working_space_size(void) const;
- size_t _get_output_working_space_size(void) const;
-
- void *_get_input_working_space(unsigned int threadid) const;
- void *_get_output_working_space(unsigned int threadid) const;
-};
-
-
-template <  // Primary template of the concrete engine, generic over the input, bias and output element types.
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename TIn, typename TBias, typename TOut
->
-class DepthwiseConvolution : public DepthwiseConvolutionBase<  // passes itself to the base as the Derived argument (CRTP)
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- TIn, TBias, TOut,
- DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- TIn, TBias, TOut
- >
->
-{
- using Base = DepthwiseConvolutionBase<  // shorthand for the exact base-class instantiation named above
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- TIn, TBias, TOut,
- DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- TIn, TBias, TOut
- > >;
- friend Base;  // gives the base access to the protected execute_tile overloads below
- using InputType = typename Base::InputType;
- using OutputType = typename Base::OutputType;
-
- public:
- using Base::DepthwiseConvolutionBase;  // inherit the base-class constructors unchanged
-
- protected:
- template <nck::ActivationFunction Activation>
- void execute_tile(  // tile kernel addressed by base pointers and row/column strides
- int n_channels,
- const void* packed_params,
- const TIn* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- TOut* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- );
-
- template <nck::ActivationFunction Activation>
- void execute_tile(  // tile kernel addressed by one pointer per input and per output element
- int n_channels,
- const void* packed_params,
- const InputType* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- OutputType* outptrs[Base::output_tile_rows][Base::output_tile_cols]
- );
-};
-
-
-template <  // Partial specialisation of the engine for float input, bias and output.
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-class DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float, float, float
-> : public DepthwiseConvolutionBase<  // passes itself to the base as the Derived argument (CRTP)
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float, float, float,
- DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float, float, float
- >
->
-{
- using Base = DepthwiseConvolutionBase<  // shorthand for the exact base-class instantiation named above
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float, float, float,
- DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float, float, float
- > >;
- friend Base;  // gives the base access to the protected execute_tile overloads below
- using InputType = typename Base::InputType;
- using OutputType = typename Base::OutputType;
-
- public:
- DepthwiseConvolution(  // same parameter list as the base-class constructor without output dimensions
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- DepthwiseConvolution(  // same parameter list as the base-class constructor with explicit output dimensions
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- protected:
- template <nck::ActivationFunction Activation>
- void execute_tile(  // tile kernel addressed by base pointers and row/column strides
- int n_channels,
- const void* packed_params,
- const float* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- float* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- );
-
- template <nck::ActivationFunction Activation>
- void execute_tile(  // tile kernel addressed by one pointer per input and per output element
- int n_channels,
- const void* packed_params,
- const float* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- float* outptrs[Base::output_tile_rows][Base::output_tile_cols]
- );
-};
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <  // Partial specialisation for float16_t input, bias and output; compiled only when __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is defined.
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-class DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float16_t, float16_t, float16_t
-> : public DepthwiseConvolutionBase<  // passes itself to the base as the Derived argument (CRTP)
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float16_t, float16_t, float16_t,
- DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float16_t, float16_t, float16_t
- >
->
-{
- using Base = DepthwiseConvolutionBase<  // shorthand for the exact base-class instantiation named above
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float16_t, float16_t, float16_t,
- DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float16_t, float16_t, float16_t
- > >;
- friend Base;  // gives the base access to the protected execute_tile overloads below
- using InputType = typename Base::InputType;
- using OutputType = typename Base::OutputType;
-
- public:
- DepthwiseConvolution(  // same parameter list as the base-class constructor without output dimensions
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- DepthwiseConvolution(  // same parameter list as the base-class constructor with explicit output dimensions
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- protected:
- template <nck::ActivationFunction Activation>
- void execute_tile(  // tile kernel addressed by base pointers and row/column strides
- int n_channels,
- const void* packed_params,
- const float16_t* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- float16_t* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- );
-
- template <nck::ActivationFunction Activation>
- void execute_tile(  // tile kernel addressed by one pointer per input and per output element
- int n_channels,
- const void* packed_params,
- const float16_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- float16_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
- );
-};
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
deleted file mode 100644
index 864c6e24a0..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
+++ /dev/null
@@ -1,1168 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-namespace depthwise
-{
-
-using namespace neon_convolution_kernels;
-using Conv = DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;
-
-#ifdef __aarch64__
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(  // AArch64 kernel for Conv (2x2 output tile, 3x3 kernel, stride 1, fp32) with no activation applied
- int n_channels,  // channels to process: handled four at a time, then one at a time for the remainder
- const void *weight_bias_ptr,  // packed parameters, read as 10 floats per channel (presumably bias followed by the 9 weights - confirm against the packing code)
- const float *input,  // top-left element of the 4x4 input patch
- const unsigned int input_row_stride,  // in elements; scaled to bytes in the operand list below
- const unsigned int input_col_stride,  // in elements; scaled to bytes in the operand list below
- float *output,  // top-left element of the 2x2 output tile
- const unsigned int output_row_stride,  // in elements; scaled to bytes in the operand list below
- const unsigned int output_col_stride  // in elements; scaled to bytes in the operand list below
-)
-{
- __asm __volatile(
- "add x26, %[inptr0], %[input_row_stride]\n"  // x26 = pointer to input row 1
- "add x21, %[input_col_stride1], %[input_col_stride1]\n"  // x21 = byte offset of input column 2
- "add x23, %[outptr0], %[output_row_stride]\n"  // x23 = pointer to output row 1
- "add x27, x26, %[input_row_stride]\n"  // x27 = pointer to input row 2
- "add x22, x21, %[input_col_stride1]\n"  // x22 = byte offset of input column 3
- "and x24, %[n_channels], #3\n"  // x24 = n_channels % 4, channels left for the scalar tail
- "add x28, x27, %[input_row_stride]\n"  // x28 = pointer to input row 3
- "lsr x25, %[n_channels], #2\n"  // x25 = n_channels / 4, number of 4-channel vector iterations
- "cbz x25, 4f\n"  // no complete group of four channels: go straight to the scalar tail
- "1:\n"  // vector prologue: load parameters and the first inputs for the first group of four channels
- "ldr q15, [%[wbptr]]\n"  // first parameter vector; copied below into all four accumulators
- "subs x25, x25, #1\n"  // sets the flags tested by "beq 3f" below (nothing in between alters them)
- "mov v3.16b, v15.16b\n"  // v3 accumulates output (0, 0)
- "ldr q14, [%[wbptr], #16]\n"
- "mov v1.16b, v15.16b\n"  // v1 accumulates output (1, 0)
- "ldr q13, [%[wbptr], #32]\n"
- "mov v2.16b, v15.16b\n"  // v2 accumulates output (0, 1)
- "ldr q12, [%[wbptr], #48]\n"
- "mov v0.16b, v15.16b\n"  // v0 accumulates output (1, 1)
- "ldr q11, [%[wbptr], #64]\n"
- "ldr q10, [%[wbptr], #80]\n"
- "ldr q9, [%[wbptr], #96]\n"
- "ldr q8, [%[wbptr], #112]\n"
- "ldr q7, [%[wbptr], #128]\n"
- "ldr q6, [%[wbptr], #144]\n"
- "ldr q24, [%[inptr0]]\n"
- "fmla v3.4s, v24.4s, v14.4s\n"
- "ldr q22, [x26]\n"
- "fmla v1.4s, v22.4s, v14.4s\n"
- "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v19.4s, v14.4s\n"
- "ldr q18, [x27]\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "ldr q21, [x26, %[input_col_stride1]]\n"
- "fmla v1.4s, v18.4s, v11.4s\n"
- "ldr q17, [%[inptr0], x21]\n"
- "ldr q20, [x28]\n"
- "ldr q5, [x27, %[input_col_stride1]]\n"
- "fmla v3.4s, v19.4s, v13.4s\n"
- "fmla v3.4s, v18.4s, v8.4s\n"
- "beq 3f\n"  // only one vector iteration: skip the pipelined loop body
- "2:\n"  // vector loop body: finishes the current group while loading parameters and inputs for the next one
- "fmla v3.4s, v21.4s, v10.4s\n"
- "ldr q19, [x26, x21]\n"
- "fmla v1.4s, v21.4s, v13.4s\n"
- "ldr q23, [%[inptr0], x22]\n"
- "fmla v2.4s, v21.4s, v11.4s\n"
- "ldr q22, [x28, %[input_col_stride1]]\n"
- "fmla v0.4s, v21.4s, v14.4s\n"
- "ldr q21, [x27, x21]\n"
- "fmla v3.4s, v17.4s, v12.4s\n"
- "ldr q18, [x26, x22]\n"
- "fmla v2.4s, v17.4s, v13.4s\n"
- "ldr q16, [x28, x21]\n"
- "fmla v1.4s, v20.4s, v8.4s\n"
- "ldr q20, [x27, x22]\n"
- "fmla v3.4s, v5.4s, v7.4s\n"
- "ldr q4, [x28, x22]\n"
- "fmla v2.4s, v5.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #160\n"  // advance past the 10 x 16 bytes of parameters just used
- "fmla v1.4s, v5.4s, v10.4s\n"
- "ldr q15, [%[wbptr]]\n"
- "fmla v0.4s, v5.4s, v11.4s\n"
- "ldr q14, [%[wbptr], #16]\n"
- "fmla v3.4s, v19.4s, v9.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"  // prefetch hint for upcoming parameters
- "fmla v1.4s, v19.4s, v12.4s\n"
- "add %[inptr0], %[inptr0], #16\n"  // step input row 0 to the next four channels
- "fmla v2.4s, v19.4s, v10.4s\n"
- "ldr q11, [%[wbptr], #64]\n"
- "fmla v0.4s, v19.4s, v13.4s\n"
- "ldr q24, [%[inptr0]]\n"
- "fmla v1.4s, v22.4s, v7.4s\n"
- "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v23.4s, v12.4s\n"
- "ldr q17, [%[inptr0], x21]\n"
- "fmla v0.4s, v22.4s, v8.4s\n"
- "ldr q13, [%[wbptr], #32]\n"
- "fmla v3.4s, v21.4s, v6.4s\n"
- "add x26, x26, #16\n"
- "fmla v1.4s, v21.4s, v9.4s\n"
- "ldr q22, [x26]\n"
- "fmla v2.4s, v21.4s, v7.4s\n"
- "ldr q8, [%[wbptr], #112]\n"
- "str q3, [%[outptr0]]\n"  // store output (0, 0)
- "fmla v0.4s, v21.4s, v10.4s\n"
- "fmla v1.4s, v16.4s, v6.4s\n"
- "ldr q21, [x26, %[input_col_stride1]]\n"
- "fmla v2.4s, v18.4s, v9.4s\n"
- "add x27, x27, #16\n"
- "fmla v0.4s, v18.4s, v12.4s\n"
- "ldr q10, [%[wbptr], #80]\n"
- "str q1, [x23]\n"  // store output (1, 0)
- "mov v3.16b, v15.16b\n"  // re-initialise the accumulator for the next group
- "fmla v2.4s, v20.4s, v6.4s\n"
- "ldr q18, [x27]\n"
- "fmla v0.4s, v16.4s, v7.4s\n"
- "ldr q12, [%[wbptr], #48]\n"
- "mov v1.16b, v15.16b\n"
- "ldr q5, [x27, %[input_col_stride1]]\n"
- "str q2, [%[outptr0], %[output_col_stride1]]\n"  // store output (0, 1)
- "fmla v3.4s, v24.4s, v14.4s\n"
- "fmla v0.4s, v20.4s, v9.4s\n"
- "ldr q7, [%[wbptr], #128]\n"
- "mov v2.16b, v15.16b\n"
- "add x28, x28, #16\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "ldr q20, [x28]\n"
- "fmla v0.4s, v4.4s, v6.4s\n"
- "ldr q9, [%[wbptr], #96]\n"
- "fmla v1.4s, v22.4s, v14.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v3.4s, v19.4s, v13.4s\n"
- "subs x25, x25, #1\n"  // sets the flags tested by "bne 2b" below
- "str q0, [x23, %[output_col_stride1]]\n"  // store output (1, 1)
- "fmla v2.4s, v19.4s, v14.4s\n"
- "ldr q6, [%[wbptr], #144]\n"
- "add x23, x23, #16\n"
- "fmla v3.4s, v18.4s, v8.4s\n"
- "fmla v1.4s, v18.4s, v11.4s\n"
- "mov v0.16b, v15.16b\n"
- "bne 2b\n"
- "3:\n"  // last vector iteration: same arithmetic as the loop body, without loading a following group
- "fmla v3.4s, v21.4s, v10.4s\n"
- "ldr q19, [x26, x21]\n"
- "fmla v1.4s, v21.4s, v13.4s\n"
- "ldr q23, [%[inptr0], x22]\n"
- "fmla v2.4s, v21.4s, v11.4s\n"
- "ldr q22, [x28, %[input_col_stride1]]\n"
- "fmla v0.4s, v21.4s, v14.4s\n"
- "ldr q21, [x27, x21]\n"
- "fmla v3.4s, v17.4s, v12.4s\n"
- "ldr q18, [x26, x22]\n"
- "fmla v2.4s, v17.4s, v13.4s\n"
- "ldr q16, [x28, x21]\n"
- "fmla v1.4s, v20.4s, v8.4s\n"
- "ldr q20, [x27, x22]\n"
- "fmla v3.4s, v5.4s, v7.4s\n"
- "ldr q4, [x28, x22]\n"
- "fmla v2.4s, v5.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v1.4s, v5.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v5.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v3.4s, v19.4s, v9.4s\n"
- "add x26, x26, #16\n"
- "fmla v1.4s, v19.4s, v12.4s\n"
- "add x27, x27, #16\n"
- "fmla v2.4s, v19.4s, v10.4s\n"
- "add x28, x28, #16\n"
- "fmla v0.4s, v19.4s, v13.4s\n"
- "fmla v3.4s, v21.4s, v6.4s\n"
- "fmla v1.4s, v22.4s, v7.4s\n"
- "fmla v2.4s, v23.4s, v12.4s\n"
- "str q3, [%[outptr0]]\n"
- "fmla v0.4s, v22.4s, v8.4s\n"
- "fmla v1.4s, v21.4s, v9.4s\n"
- "fmla v2.4s, v21.4s, v7.4s\n"
- "fmla v0.4s, v21.4s, v10.4s\n"
- "fmla v1.4s, v16.4s, v6.4s\n"
- "fmla v2.4s, v18.4s, v9.4s\n"
- "fmla v0.4s, v18.4s, v12.4s\n"
- "str q1, [x23]\n"
- "fmla v2.4s, v20.4s, v6.4s\n"
- "fmla v0.4s, v16.4s, v7.4s\n"
- "str q2, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v0.4s, v20.4s, v9.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v0.4s, v4.4s, v6.4s\n"
- "str q0, [x23, %[output_col_stride1]]\n"
- "add x23, x23, #16\n"
- "4:\n"  // scalar tail: the remaining n_channels % 4 channels, one float at a time
- "cbz x24, 7f\n"  // no remainder: done
- "ldr s15, [%[wbptr]]\n"  // same structure as the vector prologue, using 4-byte scalar loads
- "mov v3.16b, v15.16b\n"
- "ldr s14, [%[wbptr], #4]\n"
- "mov v1.16b, v15.16b\n"
- "ldr s13, [%[wbptr], #8]\n"
- "mov v2.16b, v15.16b\n"
- "ldr s12, [%[wbptr], #12]\n"
- "mov v0.16b, v15.16b\n"
- "ldr s11, [%[wbptr], #16]\n"
- "ldr s10, [%[wbptr], #20]\n"
- "subs x24, x24, #1\n"  // sets the flags tested by "beq 6f" below
- "ldr s9, [%[wbptr], #24]\n"
- "ldr s8, [%[wbptr], #28]\n"
- "ldr s7, [%[wbptr], #32]\n"
- "ldr s6, [%[wbptr], #36]\n"
- "ldr s24, [%[inptr0]]\n"
- "ldr s22, [x26]\n"
- "fmla v3.4s, v24.4s, v14.4s\n"  // still a 4-lane operation; only lane 0 is stored by the "str s" instructions
- "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v1.4s, v22.4s, v14.4s\n"
- "ldr s18, [x27]\n"
- "fmla v2.4s, v19.4s, v14.4s\n"
- "ldr s21, [x26, %[input_col_stride1]]\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "ldr s17, [%[inptr0], x21]\n"
- "fmla v1.4s, v18.4s, v11.4s\n"
- "ldr s20, [x28]\n"
- "ldr s5, [x27, %[input_col_stride1]]\n"
- "fmla v3.4s, v19.4s, v13.4s\n"
- "fmla v3.4s, v18.4s, v8.4s\n"
- "beq 6f\n"  // only one remainder channel: skip the pipelined loop body
- "5:\n"  // scalar loop body: finishes the current channel while loading the next one
- "fmla v3.4s, v21.4s, v10.4s\n"
- "ldr s19, [x26, x21]\n"
- "fmla v1.4s, v21.4s, v13.4s\n"
- "ldr s23, [%[inptr0], x22]\n"
- "fmla v2.4s, v21.4s, v11.4s\n"
- "ldr s22, [x28, %[input_col_stride1]]\n"
- "fmla v0.4s, v21.4s, v14.4s\n"
- "ldr s21, [x27, x21]\n"
- "fmla v3.4s, v17.4s, v12.4s\n"
- "ldr s18, [x26, x22]\n"
- "fmla v2.4s, v17.4s, v13.4s\n"
- "ldr s16, [x28, x21]\n"
- "fmla v1.4s, v20.4s, v8.4s\n"
- "ldr s20, [x27, x22]\n"
- "fmla v3.4s, v5.4s, v7.4s\n"
- "ldr s4, [x28, x22]\n"
- "fmla v2.4s, v5.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #40\n"  // advance past the 10 x 4 bytes of parameters just used
- "fmla v1.4s, v5.4s, v10.4s\n"
- "ldr s15, [%[wbptr]]\n"
- "fmla v0.4s, v5.4s, v11.4s\n"
- "ldr s14, [%[wbptr], #4]\n"
- "fmla v3.4s, v19.4s, v9.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v1.4s, v19.4s, v12.4s\n"
- "add %[inptr0], %[inptr0], #4\n"  // step input row 0 to the next channel
- "fmla v2.4s, v19.4s, v10.4s\n"
- "ldr s11, [%[wbptr], #16]\n"
- "fmla v0.4s, v19.4s, v13.4s\n"
- "ldr s24, [%[inptr0]]\n"
- "fmla v1.4s, v22.4s, v7.4s\n"
- "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v23.4s, v12.4s\n"
- "ldr s17, [%[inptr0], x21]\n"
- "fmla v0.4s, v22.4s, v8.4s\n"
- "ldr s13, [%[wbptr], #8]\n"
- "fmla v3.4s, v21.4s, v6.4s\n"
- "add x26, x26, #4\n"
- "fmla v1.4s, v21.4s, v9.4s\n"
- "ldr s22, [x26]\n"
- "fmla v2.4s, v21.4s, v7.4s\n"
- "ldr s8, [%[wbptr], #28]\n"
- "str s3, [%[outptr0]]\n"  // store output (0, 0)
- "fmla v0.4s, v21.4s, v10.4s\n"
- "fmla v1.4s, v16.4s, v6.4s\n"
- "ldr s21, [x26, %[input_col_stride1]]\n"
- "fmla v2.4s, v18.4s, v9.4s\n"
- "add x27, x27, #4\n"
- "fmla v0.4s, v18.4s, v12.4s\n"
- "ldr s10, [%[wbptr], #20]\n"
- "str s1, [x23]\n"  // store output (1, 0)
- "mov v3.16b, v15.16b\n"
- "fmla v2.4s, v20.4s, v6.4s\n"
- "ldr s18, [x27]\n"
- "fmla v0.4s, v16.4s, v7.4s\n"
- "ldr s12, [%[wbptr], #12]\n"
- "mov v1.16b, v15.16b\n"
- "ldr s5, [x27, %[input_col_stride1]]\n"
- "str s2, [%[outptr0], %[output_col_stride1]]\n"  // store output (0, 1)
- "fmla v3.4s, v24.4s, v14.4s\n"
- "fmla v0.4s, v20.4s, v9.4s\n"
- "ldr s7, [%[wbptr], #32]\n"
- "mov v2.16b, v15.16b\n"
- "add x28, x28, #4\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "ldr s20, [x28]\n"
- "fmla v0.4s, v4.4s, v6.4s\n"
- "ldr s9, [%[wbptr], #24]\n"
- "fmla v1.4s, v22.4s, v14.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v3.4s, v19.4s, v13.4s\n"
- "subs x24, x24, #1\n"  // sets the flags tested by "bne 5b" below
- "str s0, [x23, %[output_col_stride1]]\n"  // store output (1, 1)
- "fmla v2.4s, v19.4s, v14.4s\n"
- "ldr s6, [%[wbptr], #36]\n"
- "add x23, x23, #4\n"
- "fmla v3.4s, v18.4s, v8.4s\n"
- "fmla v1.4s, v18.4s, v11.4s\n"
- "mov v0.16b, v15.16b\n"
- "bne 5b\n"
- "6:\n"  // last scalar iteration: same arithmetic as the loop body, without loading a following channel
- "fmla v3.4s, v21.4s, v10.4s\n"
- "ldr s19, [x26, x21]\n"
- "fmla v1.4s, v21.4s, v13.4s\n"
- "ldr s23, [%[inptr0], x22]\n"
- "fmla v2.4s, v21.4s, v11.4s\n"
- "ldr s22, [x28, %[input_col_stride1]]\n"
- "fmla v0.4s, v21.4s, v14.4s\n"
- "ldr s21, [x27, x21]\n"
- "fmla v3.4s, v17.4s, v12.4s\n"
- "ldr s18, [x26, x22]\n"
- "fmla v2.4s, v17.4s, v13.4s\n"
- "ldr s16, [x28, x21]\n"
- "fmla v1.4s, v20.4s, v8.4s\n"
- "ldr s20, [x27, x22]\n"
- "fmla v3.4s, v5.4s, v7.4s\n"
- "ldr s4, [x28, x22]\n"
- "fmla v2.4s, v5.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v1.4s, v5.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v5.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v3.4s, v19.4s, v9.4s\n"
- "add x26, x26, #4\n"
- "fmla v1.4s, v19.4s, v12.4s\n"
- "add x27, x27, #4\n"
- "fmla v2.4s, v19.4s, v10.4s\n"
- "add x28, x28, #4\n"
- "fmla v0.4s, v19.4s, v13.4s\n"
- "fmla v3.4s, v21.4s, v6.4s\n"
- "fmla v1.4s, v22.4s, v7.4s\n"
- "fmla v2.4s, v23.4s, v12.4s\n"
- "str s3, [%[outptr0]]\n"
- "fmla v0.4s, v22.4s, v8.4s\n"
- "fmla v1.4s, v21.4s, v9.4s\n"
- "fmla v2.4s, v21.4s, v7.4s\n"
- "fmla v0.4s, v21.4s, v10.4s\n"
- "fmla v1.4s, v16.4s, v6.4s\n"
- "fmla v2.4s, v18.4s, v9.4s\n"
- "fmla v0.4s, v18.4s, v12.4s\n"
- "str s1, [x23]\n"
- "fmla v2.4s, v20.4s, v6.4s\n"
- "fmla v0.4s, v16.4s, v7.4s\n"
- "str s2, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v0.4s, v20.4s, v9.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v0.4s, v4.4s, v6.4s\n"
- "str s0, [x23, %[output_col_stride1]]\n"
- "add x23, x23, #4\n"
- "7:\n"  // end of kernel
- : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)  // read-write operands: these pointers are advanced by the assembly
- : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float))  // strides converted from elements to bytes
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"  // clobbers: flags, every vector and scratch register used above, and memory
- );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x21, %[inptr0], %[input_row_stride]\n"
- "add x24, %[input_col_stride1], %[input_col_stride1]\n"
- "add x22, %[outptr0], %[output_row_stride]\n"
- "add x23, x21, %[input_row_stride]\n"
- "add x27, x24, %[input_col_stride1]\n"
- "and x25, %[n_channels], #3\n"
- "add x28, x23, %[input_row_stride]\n"
- "lsr x26, %[n_channels], #2\n"
- "cbz x26, 4f\n"
- "1:\n"
- "ldr q11, [%[wbptr]]\n"
- "subs x26, x26, #1\n"
- "mov v17.16b, v11.16b\n"
- "ldr q13, [%[wbptr], #16]\n"
- "mov v15.16b, v11.16b\n"
- "ldr q4, [%[wbptr], #32]\n"
- "mov v16.16b, v11.16b\n"
- "ldr q2, [%[wbptr], #48]\n"
- "mov v14.16b, v11.16b\n"
- "ldr q5, [%[wbptr], #64]\n"
- "ldr q10, [%[wbptr], #80]\n"
- "ldr q1, [%[wbptr], #96]\n"
- "ldr q12, [%[wbptr], #112]\n"
- "ldr q0, [%[wbptr], #128]\n"
- "ldr q3, [%[wbptr], #144]\n"
- "ldr q6, [%[inptr0]]\n"
- "fmla v17.4s, v6.4s, v13.4s\n"
- "ldr q27, [x21]\n"
- "fmla v15.4s, v27.4s, v13.4s\n"
- "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v16.4s, v23.4s, v13.4s\n"
- "ldr q24, [x23]\n"
- "fmla v17.4s, v27.4s, v5.4s\n"
- "ldr q22, [x21, %[input_col_stride1]]\n"
- "ldr q9, [%[inptr0], x24]\n"
- "ldr q8, [x28]\n"
- "ldr q20, [x23, %[input_col_stride1]]\n"
- "fmla v17.4s, v23.4s, v4.4s\n"
- "beq 3f\n"
- "2:\n"
- "fmla v17.4s, v24.4s, v12.4s\n"
- "ldr q26, [x21, x24]\n"
- "fmla v15.4s, v24.4s, v5.4s\n"
- "ldr q27, [%[inptr0], x27]\n"
- "fmla v16.4s, v22.4s, v5.4s\n"
- "ldr q25, [x28, %[input_col_stride1]]\n"
- "fmla v17.4s, v22.4s, v10.4s\n"
- "ldr q24, [x23, x24]\n"
- "fmla v15.4s, v22.4s, v4.4s\n"
- "ldr q21, [x21, x27]\n"
- "fmla v14.4s, v22.4s, v13.4s\n"
- "ldr q7, [x28, x24]\n"
- "fmla v17.4s, v9.4s, v2.4s\n"
- "ldr q19, [x23, x27]\n"
- "fmla v16.4s, v9.4s, v4.4s\n"
- "ldr q18, [x28, x27]\n"
- "fmla v15.4s, v8.4s, v12.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v17.4s, v20.4s, v0.4s\n"
- "ldr q11, [%[wbptr]]\n"
- "fmla v16.4s, v20.4s, v12.4s\n"
- "ldr q13, [%[wbptr], #16]\n"
- "fmla v15.4s, v20.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v14.4s, v20.4s, v5.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v17.4s, v26.4s, v1.4s\n"
- "ldr q6, [%[inptr0]]\n"
- "fmla v15.4s, v26.4s, v2.4s\n"
- "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v16.4s, v26.4s, v10.4s\n"
- "ldr q5, [%[wbptr], #64]\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "ldr q9, [%[inptr0], x24]\n"
- "fmla v15.4s, v25.4s, v0.4s\n"
- "add x21, x21, #16\n"
- "fmla v16.4s, v27.4s, v2.4s\n"
- "ldr q27, [x21]\n"
- "fmla v14.4s, v25.4s, v12.4s\n"
- "ldr q4, [%[wbptr], #32]\n"
- "fmla v17.4s, v24.4s, v3.4s\n"
- "ldr q22, [x21, %[input_col_stride1]]\n"
- "fmla v15.4s, v24.4s, v1.4s\n"
- "add x23, x23, #16\n"
- "fmla v16.4s, v24.4s, v0.4s\n"
- "ldr q12, [%[wbptr], #112]\n"
- "fmla v14.4s, v24.4s, v10.4s\n"
- "ldr q24, [x23]\n"
- "fmla v15.4s, v7.4s, v3.4s\n"
- "ldr q20, [x23, %[input_col_stride1]]\n"
- "fmla v16.4s, v21.4s, v1.4s\n"
- "add x28, x28, #16\n"
- "fmla v14.4s, v21.4s, v2.4s\n"
- "ldr q10, [%[wbptr], #80]\n"
- "movi v26.16b, #0\n"
- "ldr q8, [x28]\n"
- "fmla v16.4s, v19.4s, v3.4s\n"
- "subs x26, x26, #1\n"
- "fmla v14.4s, v7.4s, v0.4s\n"
- "ldr q2, [%[wbptr], #48]\n"
- "fmax v17.4s, v17.4s, v26.4s\n"
- "fmax v15.4s, v15.4s, v26.4s\n"
- "fmax v16.4s, v16.4s, v26.4s\n"
- "str q17, [%[outptr0]]\n"
- "str q16, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v14.4s, v19.4s, v1.4s\n"
- "str q15, [x22]\n"
- "mov v17.16b, v11.16b\n"
- "mov v15.16b, v11.16b\n"
- "ldr q0, [%[wbptr], #128]\n"
- "fmla v14.4s, v18.4s, v3.4s\n"
- "ldr q1, [%[wbptr], #96]\n"
- "mov v16.16b, v11.16b\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v17.4s, v6.4s, v13.4s\n"
- "fmla v15.4s, v27.4s, v13.4s\n"
- "fmax v14.4s, v14.4s, v26.4s\n"
- "ldr q3, [%[wbptr], #144]\n"
- "fmla v16.4s, v23.4s, v13.4s\n"
- "str q14, [x22, %[output_col_stride1]]\n"
- "mov v14.16b, v11.16b\n"
- "add x22, x22, #16\n"
- "fmla v17.4s, v27.4s, v5.4s\n"
- "fmla v17.4s, v23.4s, v4.4s\n"
- "bne 2b\n"
- "3:\n"
- "fmla v17.4s, v24.4s, v12.4s\n"
- "ldr q26, [x21, x24]\n"
- "fmla v15.4s, v24.4s, v5.4s\n"
- "ldr q27, [%[inptr0], x27]\n"
- "fmla v16.4s, v22.4s, v5.4s\n"
- "ldr q25, [x28, %[input_col_stride1]]\n"
- "fmla v17.4s, v22.4s, v10.4s\n"
- "ldr q24, [x23, x24]\n"
- "fmla v15.4s, v22.4s, v4.4s\n"
- "ldr q21, [x21, x27]\n"
- "fmla v14.4s, v22.4s, v13.4s\n"
- "ldr q7, [x28, x24]\n"
- "fmla v17.4s, v9.4s, v2.4s\n"
- "ldr q19, [x23, x27]\n"
- "fmla v16.4s, v9.4s, v4.4s\n"
- "ldr q18, [x28, x27]\n"
- "fmla v15.4s, v8.4s, v12.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v17.4s, v20.4s, v0.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v16.4s, v20.4s, v12.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v15.4s, v20.4s, v10.4s\n"
- "add x21, x21, #16\n"
- "fmla v14.4s, v20.4s, v5.4s\n"
- "add x23, x23, #16\n"
- "fmla v17.4s, v26.4s, v1.4s\n"
- "add x28, x28, #16\n"
- "fmla v15.4s, v26.4s, v2.4s\n"
- "fmla v16.4s, v26.4s, v10.4s\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "movi v26.16b, #0\n"
- "fmla v17.4s, v24.4s, v3.4s\n"
- "fmla v16.4s, v27.4s, v2.4s\n"
- "fmla v15.4s, v25.4s, v0.4s\n"
- "fmla v14.4s, v25.4s, v12.4s\n"
- "fmax v17.4s, v17.4s, v26.4s\n"
- "fmla v16.4s, v24.4s, v0.4s\n"
- "str q17, [%[outptr0]]\n"
- "fmla v15.4s, v24.4s, v1.4s\n"
- "fmla v14.4s, v24.4s, v10.4s\n"
- "fmla v16.4s, v21.4s, v1.4s\n"
- "fmla v15.4s, v7.4s, v3.4s\n"
- "fmla v14.4s, v21.4s, v2.4s\n"
- "fmla v16.4s, v19.4s, v3.4s\n"
- "fmax v15.4s, v15.4s, v26.4s\n"
- "fmla v14.4s, v7.4s, v0.4s\n"
- "str q15, [x22]\n"
- "fmax v16.4s, v16.4s, v26.4s\n"
- "fmla v14.4s, v19.4s, v1.4s\n"
- "str q16, [%[outptr0], %[output_col_stride1]]\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v14.4s, v18.4s, v3.4s\n"
- "fmax v14.4s, v14.4s, v26.4s\n"
- "str q14, [x22, %[output_col_stride1]]\n"
- "add x22, x22, #16\n"
- "4:\n"
- "cbz x25, 7f\n"
- "ldr s11, [%[wbptr]]\n"
- "mov v17.16b, v11.16b\n"
- "ldr s13, [%[wbptr], #4]\n"
- "mov v15.16b, v11.16b\n"
- "ldr s4, [%[wbptr], #8]\n"
- "mov v16.16b, v11.16b\n"
- "ldr s2, [%[wbptr], #12]\n"
- "mov v14.16b, v11.16b\n"
- "ldr s5, [%[wbptr], #16]\n"
- "ldr s10, [%[wbptr], #20]\n"
- "subs x25, x25, #1\n"
- "ldr s1, [%[wbptr], #24]\n"
- "ldr s12, [%[wbptr], #28]\n"
- "ldr s0, [%[wbptr], #32]\n"
- "ldr s3, [%[wbptr], #36]\n"
- "ldr s6, [%[inptr0]]\n"
- "ldr s27, [x21]\n"
- "fmla v17.4s, v6.4s, v13.4s\n"
- "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v15.4s, v27.4s, v13.4s\n"
- "ldr s24, [x23]\n"
- "fmla v16.4s, v23.4s, v13.4s\n"
- "ldr s22, [x21, %[input_col_stride1]]\n"
- "fmla v17.4s, v27.4s, v5.4s\n"
- "ldr s9, [%[inptr0], x24]\n"
- "ldr s8, [x28]\n"
- "ldr s20, [x23, %[input_col_stride1]]\n"
- "fmla v17.4s, v23.4s, v4.4s\n"
- "beq 6f\n"
- "5:\n"
- "fmla v17.4s, v24.4s, v12.4s\n"
- "ldr s26, [x21, x24]\n"
- "fmla v15.4s, v24.4s, v5.4s\n"
- "ldr s27, [%[inptr0], x27]\n"
- "fmla v16.4s, v22.4s, v5.4s\n"
- "ldr s25, [x28, %[input_col_stride1]]\n"
- "fmla v17.4s, v22.4s, v10.4s\n"
- "ldr s24, [x23, x24]\n"
- "fmla v15.4s, v22.4s, v4.4s\n"
- "ldr s21, [x21, x27]\n"
- "fmla v14.4s, v22.4s, v13.4s\n"
- "ldr s7, [x28, x24]\n"
- "fmla v17.4s, v9.4s, v2.4s\n"
- "ldr s19, [x23, x27]\n"
- "fmla v16.4s, v9.4s, v4.4s\n"
- "ldr s18, [x28, x27]\n"
- "fmla v15.4s, v8.4s, v12.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v17.4s, v20.4s, v0.4s\n"
- "ldr s11, [%[wbptr]]\n"
- "fmla v16.4s, v20.4s, v12.4s\n"
- "ldr s13, [%[wbptr], #4]\n"
- "fmla v15.4s, v20.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v14.4s, v20.4s, v5.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v17.4s, v26.4s, v1.4s\n"
- "ldr s6, [%[inptr0]]\n"
- "fmla v15.4s, v26.4s, v2.4s\n"
- "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v16.4s, v26.4s, v10.4s\n"
- "ldr s5, [%[wbptr], #16]\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "ldr s9, [%[inptr0], x24]\n"
- "fmla v15.4s, v25.4s, v0.4s\n"
- "add x21, x21, #4\n"
- "fmla v16.4s, v27.4s, v2.4s\n"
- "ldr s27, [x21]\n"
- "fmla v14.4s, v25.4s, v12.4s\n"
- "ldr s4, [%[wbptr], #8]\n"
- "fmla v17.4s, v24.4s, v3.4s\n"
- "ldr s22, [x21, %[input_col_stride1]]\n"
- "fmla v15.4s, v24.4s, v1.4s\n"
- "add x23, x23, #4\n"
- "fmla v16.4s, v24.4s, v0.4s\n"
- "ldr s12, [%[wbptr], #28]\n"
- "fmla v14.4s, v24.4s, v10.4s\n"
- "ldr s24, [x23]\n"
- "fmla v15.4s, v7.4s, v3.4s\n"
- "ldr s20, [x23, %[input_col_stride1]]\n"
- "fmla v16.4s, v21.4s, v1.4s\n"
- "add x28, x28, #4\n"
- "fmla v14.4s, v21.4s, v2.4s\n"
- "ldr s10, [%[wbptr], #20]\n"
- "movi v26.16b, #0\n"
- "ldr s8, [x28]\n"
- "fmla v16.4s, v19.4s, v3.4s\n"
- "subs x25, x25, #1\n"
- "fmla v14.4s, v7.4s, v0.4s\n"
- "ldr s2, [%[wbptr], #12]\n"
- "fmax v17.4s, v17.4s, v26.4s\n"
- "fmax v15.4s, v15.4s, v26.4s\n"
- "fmax v16.4s, v16.4s, v26.4s\n"
- "str s17, [%[outptr0]]\n"
- "str s16, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v14.4s, v19.4s, v1.4s\n"
- "str s15, [x22]\n"
- "mov v17.16b, v11.16b\n"
- "mov v15.16b, v11.16b\n"
- "ldr s0, [%[wbptr], #32]\n"
- "fmla v14.4s, v18.4s, v3.4s\n"
- "ldr s1, [%[wbptr], #24]\n"
- "mov v16.16b, v11.16b\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v17.4s, v6.4s, v13.4s\n"
- "fmla v15.4s, v27.4s, v13.4s\n"
- "fmax v14.4s, v14.4s, v26.4s\n"
- "ldr s3, [%[wbptr], #36]\n"
- "fmla v16.4s, v23.4s, v13.4s\n"
- "str s14, [x22, %[output_col_stride1]]\n"
- "mov v14.16b, v11.16b\n"
- "add x22, x22, #4\n"
- "fmla v17.4s, v27.4s, v5.4s\n"
- "fmla v17.4s, v23.4s, v4.4s\n"
- "bne 5b\n"
- "6:\n"
- "fmla v17.4s, v24.4s, v12.4s\n"
- "ldr s26, [x21, x24]\n"
- "fmla v15.4s, v24.4s, v5.4s\n"
- "ldr s27, [%[inptr0], x27]\n"
- "fmla v16.4s, v22.4s, v5.4s\n"
- "ldr s25, [x28, %[input_col_stride1]]\n"
- "fmla v17.4s, v22.4s, v10.4s\n"
- "ldr s24, [x23, x24]\n"
- "fmla v15.4s, v22.4s, v4.4s\n"
- "ldr s21, [x21, x27]\n"
- "fmla v14.4s, v22.4s, v13.4s\n"
- "ldr s7, [x28, x24]\n"
- "fmla v17.4s, v9.4s, v2.4s\n"
- "ldr s19, [x23, x27]\n"
- "fmla v16.4s, v9.4s, v4.4s\n"
- "ldr s18, [x28, x27]\n"
- "fmla v15.4s, v8.4s, v12.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v17.4s, v20.4s, v0.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v16.4s, v20.4s, v12.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v15.4s, v20.4s, v10.4s\n"
- "add x21, x21, #4\n"
- "fmla v14.4s, v20.4s, v5.4s\n"
- "add x23, x23, #4\n"
- "fmla v17.4s, v26.4s, v1.4s\n"
- "add x28, x28, #4\n"
- "fmla v15.4s, v26.4s, v2.4s\n"
- "fmla v16.4s, v26.4s, v10.4s\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "movi v26.16b, #0\n"
- "fmla v17.4s, v24.4s, v3.4s\n"
- "fmla v16.4s, v27.4s, v2.4s\n"
- "fmla v15.4s, v25.4s, v0.4s\n"
- "fmla v14.4s, v25.4s, v12.4s\n"
- "fmax v17.4s, v17.4s, v26.4s\n"
- "fmla v16.4s, v24.4s, v0.4s\n"
- "str s17, [%[outptr0]]\n"
- "fmla v15.4s, v24.4s, v1.4s\n"
- "fmla v14.4s, v24.4s, v10.4s\n"
- "fmla v16.4s, v21.4s, v1.4s\n"
- "fmla v15.4s, v7.4s, v3.4s\n"
- "fmla v14.4s, v21.4s, v2.4s\n"
- "fmla v16.4s, v19.4s, v3.4s\n"
- "fmax v15.4s, v15.4s, v26.4s\n"
- "fmla v14.4s, v7.4s, v0.4s\n"
- "str s15, [x22]\n"
- "fmax v16.4s, v16.4s, v26.4s\n"
- "fmla v14.4s, v19.4s, v1.4s\n"
- "str s16, [%[outptr0], %[output_col_stride1]]\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v14.4s, v18.4s, v3.4s\n"
- "fmax v14.4s, v14.4s, v26.4s\n"
- "str s14, [x22, %[output_col_stride1]]\n"
- "add x22, x22, #4\n"
- "7:\n"
- : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
- : [n_channels] "r" ((long) n_channels), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU6>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x21, %[inptr0], %[input_row_stride]\n"
- "add x23, %[input_col_stride1], %[input_col_stride1]\n"
- "add x24, %[outptr0], %[output_row_stride]\n"
- "add x27, x21, %[input_row_stride]\n"
- "add x22, x23, %[input_col_stride1]\n"
- "and x25, %[n_channels], #3\n"
- "add x28, x27, %[input_row_stride]\n"
- "lsr x26, %[n_channels], #2\n"
- "cbz x26, 4f\n"
- "1:\n"
- "ldr q19, [%[wbptr]]\n"
- "subs x26, x26, #1\n"
- "mov v3.16b, v19.16b\n"
- "ldr q12, [%[wbptr], #16]\n"
- "mov v1.16b, v19.16b\n"
- "ldr q11, [%[wbptr], #32]\n"
- "mov v2.16b, v19.16b\n"
- "ldr q10, [%[wbptr], #48]\n"
- "mov v0.16b, v19.16b\n"
- "ldr q13, [%[wbptr], #64]\n"
- "ldr q23, [%[wbptr], #80]\n"
- "ldr q15, [%[wbptr], #96]\n"
- "ldr q20, [%[wbptr], #112]\n"
- "ldr q21, [%[wbptr], #128]\n"
- "ldr q14, [%[wbptr], #144]\n"
- "ldr q16, [%[inptr0]]\n"
- "fmla v3.4s, v16.4s, v12.4s\n"
- "ldr q28, [x21]\n"
- "fmla v1.4s, v28.4s, v12.4s\n"
- "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "ldr q24, [x27]\n"
- "fmla v3.4s, v28.4s, v13.4s\n"
- "ldr q8, [x21, %[input_col_stride1]]\n"
- "ldr q9, [%[inptr0], x23]\n"
- "ldr q18, [x28]\n"
- "ldr q6, [x27, %[input_col_stride1]]\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "beq 3f\n"
- "2:\n"
- "fmla v3.4s, v24.4s, v20.4s\n"
- "ldr q25, [x21, x23]\n"
- "fmla v1.4s, v24.4s, v13.4s\n"
- "ldr q28, [%[inptr0], x22]\n"
- "fmla v2.4s, v8.4s, v13.4s\n"
- "ldr q24, [x28, %[input_col_stride1]]\n"
- "fmla v3.4s, v8.4s, v23.4s\n"
- "ldr q27, [x27, x23]\n"
- "fmla v1.4s, v8.4s, v11.4s\n"
- "ldr q7, [x21, x22]\n"
- "fmla v0.4s, v8.4s, v12.4s\n"
- "ldr q17, [x28, x23]\n"
- "fmla v3.4s, v9.4s, v10.4s\n"
- "ldr q5, [x27, x22]\n"
- "fmla v2.4s, v9.4s, v11.4s\n"
- "ldr q4, [x28, x22]\n"
- "fmla v1.4s, v18.4s, v20.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v3.4s, v6.4s, v21.4s\n"
- "ldr q19, [%[wbptr]]\n"
- "fmla v2.4s, v6.4s, v20.4s\n"
- "ldr q12, [%[wbptr], #16]\n"
- "fmla v1.4s, v6.4s, v23.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v6.4s, v13.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v3.4s, v25.4s, v15.4s\n"
- "ldr q16, [%[inptr0]]\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v25.4s, v23.4s\n"
- "ldr q13, [%[wbptr], #64]\n"
- "fmla v0.4s, v25.4s, v11.4s\n"
- "ldr q9, [%[inptr0], x23]\n"
- "fmla v1.4s, v24.4s, v21.4s\n"
- "add x21, x21, #16\n"
- "fmla v2.4s, v28.4s, v10.4s\n"
- "ldr q28, [x21]\n"
- "fmla v0.4s, v24.4s, v20.4s\n"
- "ldr q11, [%[wbptr], #32]\n"
- "fmla v3.4s, v27.4s, v14.4s\n"
- "ldr q8, [x21, %[input_col_stride1]]\n"
- "fmla v1.4s, v27.4s, v15.4s\n"
- "add x27, x27, #16\n"
- "fmla v2.4s, v27.4s, v21.4s\n"
- "ldr q20, [%[wbptr], #112]\n"
- "fmla v0.4s, v27.4s, v23.4s\n"
- "ldr q24, [x27]\n"
- "fmla v1.4s, v17.4s, v14.4s\n"
- "ldr q6, [x27, %[input_col_stride1]]\n"
- "fmla v2.4s, v7.4s, v15.4s\n"
- "add x28, x28, #16\n"
- "fmla v0.4s, v7.4s, v10.4s\n"
- "ldr q23, [%[wbptr], #80]\n"
- "movi v25.16b, #0\n"
- "ldr q18, [x28]\n"
- "fmla v2.4s, v5.4s, v14.4s\n"
- "subs x26, x26, #1\n"
- "fmla v0.4s, v17.4s, v21.4s\n"
- "ldr q10, [%[wbptr], #48]\n"
- "fmov v26.4s, #6.0\n"
- "fmax v3.4s, v3.4s, v25.4s\n"
- "fmax v2.4s, v2.4s, v25.4s\n"
- "fmax v1.4s, v1.4s, v25.4s\n"
- "fmla v0.4s, v5.4s, v15.4s\n"
- "ldr q21, [%[wbptr], #128]\n"
- "fmin v3.4s, v3.4s, v26.4s\n"
- "fmin v2.4s, v2.4s, v26.4s\n"
- "fmin v1.4s, v1.4s, v26.4s\n"
- "str q3, [%[outptr0]]\n"
- "str q2, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v0.4s, v4.4s, v14.4s\n"
- "str q1, [x24]\n"
- "mov v3.16b, v19.16b\n"
- "mov v1.16b, v19.16b\n"
- "ldr q15, [%[wbptr], #96]\n"
- "fmax v0.4s, v0.4s, v25.4s\n"
- "ldr q14, [%[wbptr], #144]\n"
- "mov v2.16b, v19.16b\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmin v0.4s, v0.4s, v26.4s\n"
- "fmla v3.4s, v16.4s, v12.4s\n"
- "fmla v1.4s, v28.4s, v12.4s\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "str q0, [x24, %[output_col_stride1]]\n"
- "mov v0.16b, v19.16b\n"
- "fmla v3.4s, v28.4s, v13.4s\n"
- "add x24, x24, #16\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "bne 2b\n"
- "3:\n"
- "fmla v3.4s, v24.4s, v20.4s\n"
- "ldr q25, [x21, x23]\n"
- "fmla v1.4s, v24.4s, v13.4s\n"
- "ldr q28, [%[inptr0], x22]\n"
- "fmla v2.4s, v8.4s, v13.4s\n"
- "ldr q24, [x28, %[input_col_stride1]]\n"
- "fmla v3.4s, v8.4s, v23.4s\n"
- "ldr q27, [x27, x23]\n"
- "fmla v1.4s, v8.4s, v11.4s\n"
- "ldr q7, [x21, x22]\n"
- "fmla v0.4s, v8.4s, v12.4s\n"
- "ldr q17, [x28, x23]\n"
- "fmla v3.4s, v9.4s, v10.4s\n"
- "ldr q5, [x27, x22]\n"
- "fmla v2.4s, v9.4s, v11.4s\n"
- "ldr q4, [x28, x22]\n"
- "fmla v1.4s, v18.4s, v20.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v3.4s, v6.4s, v21.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v2.4s, v6.4s, v20.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v1.4s, v6.4s, v23.4s\n"
- "add x21, x21, #16\n"
- "fmla v0.4s, v6.4s, v13.4s\n"
- "add x27, x27, #16\n"
- "fmla v3.4s, v25.4s, v15.4s\n"
- "add x28, x28, #16\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "fmla v2.4s, v25.4s, v23.4s\n"
- "fmla v0.4s, v25.4s, v11.4s\n"
- "movi v25.16b, #0\n"
- "fmla v3.4s, v27.4s, v14.4s\n"
- "fmov v26.4s, #6.0\n"
- "fmla v2.4s, v28.4s, v10.4s\n"
- "fmla v1.4s, v24.4s, v21.4s\n"
- "fmla v0.4s, v24.4s, v20.4s\n"
- "fmax v3.4s, v3.4s, v25.4s\n"
- "fmla v1.4s, v27.4s, v15.4s\n"
- "fmla v2.4s, v27.4s, v21.4s\n"
- "fmla v0.4s, v27.4s, v23.4s\n"
- "fmin v3.4s, v3.4s, v26.4s\n"
- "str q3, [%[outptr0]]\n"
- "fmla v2.4s, v7.4s, v15.4s\n"
- "fmla v0.4s, v7.4s, v10.4s\n"
- "fmla v1.4s, v17.4s, v14.4s\n"
- "fmla v2.4s, v5.4s, v14.4s\n"
- "fmla v0.4s, v17.4s, v21.4s\n"
- "fmax v1.4s, v1.4s, v25.4s\n"
- "fmax v2.4s, v2.4s, v25.4s\n"
- "fmla v0.4s, v5.4s, v15.4s\n"
- "fmin v1.4s, v1.4s, v26.4s\n"
- "fmin v2.4s, v2.4s, v26.4s\n"
- "str q1, [x24]\n"
- "str q2, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v0.4s, v4.4s, v14.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmax v0.4s, v0.4s, v25.4s\n"
- "fmin v0.4s, v0.4s, v26.4s\n"
- "str q0, [x24, %[output_col_stride1]]\n"
- "add x24, x24, #16\n"
- "4:\n"
- "cbz x25, 7f\n"
- "ldr s19, [%[wbptr]]\n"
- "mov v3.16b, v19.16b\n"
- "ldr s12, [%[wbptr], #4]\n"
- "mov v1.16b, v19.16b\n"
- "ldr s11, [%[wbptr], #8]\n"
- "mov v2.16b, v19.16b\n"
- "ldr s10, [%[wbptr], #12]\n"
- "mov v0.16b, v19.16b\n"
- "ldr s13, [%[wbptr], #16]\n"
- "ldr s23, [%[wbptr], #20]\n"
- "subs x25, x25, #1\n"
- "ldr s15, [%[wbptr], #24]\n"
- "ldr s20, [%[wbptr], #28]\n"
- "ldr s21, [%[wbptr], #32]\n"
- "ldr s14, [%[wbptr], #36]\n"
- "ldr s16, [%[inptr0]]\n"
- "ldr s28, [x21]\n"
- "fmla v3.4s, v16.4s, v12.4s\n"
- "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v1.4s, v28.4s, v12.4s\n"
- "ldr s24, [x27]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "ldr s8, [x21, %[input_col_stride1]]\n"
- "fmla v3.4s, v28.4s, v13.4s\n"
- "ldr s9, [%[inptr0], x23]\n"
- "ldr s18, [x28]\n"
- "ldr s6, [x27, %[input_col_stride1]]\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "beq 6f\n"
- "5:\n"
- "fmla v3.4s, v24.4s, v20.4s\n"
- "ldr s25, [x21, x23]\n"
- "fmla v1.4s, v24.4s, v13.4s\n"
- "ldr s28, [%[inptr0], x22]\n"
- "fmla v2.4s, v8.4s, v13.4s\n"
- "ldr s24, [x28, %[input_col_stride1]]\n"
- "fmla v3.4s, v8.4s, v23.4s\n"
- "ldr s27, [x27, x23]\n"
- "fmla v1.4s, v8.4s, v11.4s\n"
- "ldr s7, [x21, x22]\n"
- "fmla v0.4s, v8.4s, v12.4s\n"
- "ldr s17, [x28, x23]\n"
- "fmla v3.4s, v9.4s, v10.4s\n"
- "ldr s5, [x27, x22]\n"
- "fmla v2.4s, v9.4s, v11.4s\n"
- "ldr s4, [x28, x22]\n"
- "fmla v1.4s, v18.4s, v20.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v3.4s, v6.4s, v21.4s\n"
- "ldr s19, [%[wbptr]]\n"
- "fmla v2.4s, v6.4s, v20.4s\n"
- "ldr s12, [%[wbptr], #4]\n"
- "fmla v1.4s, v6.4s, v23.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v6.4s, v13.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v3.4s, v25.4s, v15.4s\n"
- "ldr s16, [%[inptr0]]\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v25.4s, v23.4s\n"
- "ldr s13, [%[wbptr], #16]\n"
- "fmla v0.4s, v25.4s, v11.4s\n"
- "ldr s9, [%[inptr0], x23]\n"
- "fmla v1.4s, v24.4s, v21.4s\n"
- "add x21, x21, #4\n"
- "fmla v2.4s, v28.4s, v10.4s\n"
- "ldr s28, [x21]\n"
- "fmla v0.4s, v24.4s, v20.4s\n"
- "ldr s11, [%[wbptr], #8]\n"
- "fmla v3.4s, v27.4s, v14.4s\n"
- "ldr s8, [x21, %[input_col_stride1]]\n"
- "fmla v1.4s, v27.4s, v15.4s\n"
- "add x27, x27, #4\n"
- "fmla v2.4s, v27.4s, v21.4s\n"
- "ldr s20, [%[wbptr], #28]\n"
- "fmla v0.4s, v27.4s, v23.4s\n"
- "ldr s24, [x27]\n"
- "fmla v1.4s, v17.4s, v14.4s\n"
- "ldr s6, [x27, %[input_col_stride1]]\n"
- "fmla v2.4s, v7.4s, v15.4s\n"
- "add x28, x28, #4\n"
- "fmla v0.4s, v7.4s, v10.4s\n"
- "ldr s23, [%[wbptr], #20]\n"
- "movi v25.16b, #0\n"
- "ldr s18, [x28]\n"
- "fmla v2.4s, v5.4s, v14.4s\n"
- "subs x25, x25, #1\n"
- "fmla v0.4s, v17.4s, v21.4s\n"
- "ldr s10, [%[wbptr], #12]\n"
- "fmov v26.4s, #6.0\n"
- "fmax v3.4s, v3.4s, v25.4s\n"
- "fmax v2.4s, v2.4s, v25.4s\n"
- "fmax v1.4s, v1.4s, v25.4s\n"
- "fmla v0.4s, v5.4s, v15.4s\n"
- "ldr s21, [%[wbptr], #32]\n"
- "fmin v3.4s, v3.4s, v26.4s\n"
- "fmin v2.4s, v2.4s, v26.4s\n"
- "fmin v1.4s, v1.4s, v26.4s\n"
- "str s3, [%[outptr0]]\n"
- "str s2, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v0.4s, v4.4s, v14.4s\n"
- "str s1, [x24]\n"
- "mov v3.16b, v19.16b\n"
- "mov v1.16b, v19.16b\n"
- "ldr s15, [%[wbptr], #24]\n"
- "fmax v0.4s, v0.4s, v25.4s\n"
- "ldr s14, [%[wbptr], #36]\n"
- "mov v2.16b, v19.16b\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmin v0.4s, v0.4s, v26.4s\n"
- "fmla v3.4s, v16.4s, v12.4s\n"
- "fmla v1.4s, v28.4s, v12.4s\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "str s0, [x24, %[output_col_stride1]]\n"
- "mov v0.16b, v19.16b\n"
- "fmla v3.4s, v28.4s, v13.4s\n"
- "add x24, x24, #4\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "bne 5b\n"
- "6:\n"
- "fmla v3.4s, v24.4s, v20.4s\n"
- "ldr s25, [x21, x23]\n"
- "fmla v1.4s, v24.4s, v13.4s\n"
- "ldr s28, [%[inptr0], x22]\n"
- "fmla v2.4s, v8.4s, v13.4s\n"
- "ldr s24, [x28, %[input_col_stride1]]\n"
- "fmla v3.4s, v8.4s, v23.4s\n"
- "ldr s27, [x27, x23]\n"
- "fmla v1.4s, v8.4s, v11.4s\n"
- "ldr s7, [x21, x22]\n"
- "fmla v0.4s, v8.4s, v12.4s\n"
- "ldr s17, [x28, x23]\n"
- "fmla v3.4s, v9.4s, v10.4s\n"
- "ldr s5, [x27, x22]\n"
- "fmla v2.4s, v9.4s, v11.4s\n"
- "ldr s4, [x28, x22]\n"
- "fmla v1.4s, v18.4s, v20.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v3.4s, v6.4s, v21.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v2.4s, v6.4s, v20.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v1.4s, v6.4s, v23.4s\n"
- "add x21, x21, #4\n"
- "fmla v0.4s, v6.4s, v13.4s\n"
- "add x27, x27, #4\n"
- "fmla v3.4s, v25.4s, v15.4s\n"
- "add x28, x28, #4\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "fmla v2.4s, v25.4s, v23.4s\n"
- "fmla v0.4s, v25.4s, v11.4s\n"
- "movi v25.16b, #0\n"
- "fmla v3.4s, v27.4s, v14.4s\n"
- "fmov v26.4s, #6.0\n"
- "fmla v2.4s, v28.4s, v10.4s\n"
- "fmla v1.4s, v24.4s, v21.4s\n"
- "fmla v0.4s, v24.4s, v20.4s\n"
- "fmax v3.4s, v3.4s, v25.4s\n"
- "fmla v1.4s, v27.4s, v15.4s\n"
- "fmla v2.4s, v27.4s, v21.4s\n"
- "fmla v0.4s, v27.4s, v23.4s\n"
- "fmin v3.4s, v3.4s, v26.4s\n"
- "str s3, [%[outptr0]]\n"
- "fmla v2.4s, v7.4s, v15.4s\n"
- "fmla v0.4s, v7.4s, v10.4s\n"
- "fmla v1.4s, v17.4s, v14.4s\n"
- "fmla v2.4s, v5.4s, v14.4s\n"
- "fmla v0.4s, v17.4s, v21.4s\n"
- "fmax v1.4s, v1.4s, v25.4s\n"
- "fmax v2.4s, v2.4s, v25.4s\n"
- "fmla v0.4s, v5.4s, v15.4s\n"
- "fmin v1.4s, v1.4s, v26.4s\n"
- "fmin v2.4s, v2.4s, v26.4s\n"
- "str s1, [x24]\n"
- "str s2, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v0.4s, v4.4s, v14.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmax v0.4s, v0.4s, v25.4s\n"
- "fmin v0.4s, v0.4s, v26.4s\n"
- "str s0, [x24, %[output_col_stride1]]\n"
- "add x24, x24, #4\n"
- "7:\n"
- : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
- : [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
-#endif // __aarch64__
-
-template class DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
deleted file mode 100644
index 2554436172..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
+++ /dev/null
@@ -1,2809 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-namespace depthwise
-{
-
-using namespace neon_convolution_kernels;
-using Conv = DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
-
-#ifdef __aarch64__
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x23, %[inptr0], %[input_row_stride]\n"
- "add x19, %[input_col_stride1], %[input_col_stride1]\n"
- "add x22, %[outptr0], %[output_row_stride]\n"
- "add x24, x23, %[input_row_stride]\n"
- "add x20, x19, %[input_col_stride1]\n"
- "and x27, %[n_channels], #3\n"
- "add x25, x24, %[input_row_stride]\n"
- "add x21, x20, %[input_col_stride1]\n"
- "lsr x28, %[n_channels], #2\n"
- "add x26, x25, %[input_row_stride]\n"
- "cbz x28, 4f\n"
- "1:\n"
- "ldr q14, [%[wbptr]]\n"
- "subs x28, x28, #1\n"
- "mov v12.16b, v14.16b\n"
- "ldr q8, [%[wbptr], #16]\n"
- "mov v10.16b, v14.16b\n"
- "ldr q7, [%[wbptr], #32]\n"
- "mov v11.16b, v14.16b\n"
- "ldr q6, [%[wbptr], #48]\n"
- "mov v9.16b, v14.16b\n"
- "ldr q5, [%[wbptr], #64]\n"
- "ldr q4, [%[wbptr], #80]\n"
- "ldr q3, [%[wbptr], #96]\n"
- "ldr q2, [%[wbptr], #112]\n"
- "ldr q1, [%[wbptr], #128]\n"
- "ldr q0, [%[wbptr], #144]\n"
- "ldr q15, [%[inptr0]]\n"
- "fmla v12.4s, v15.4s, v8.4s\n"
- "ldr q20, [x23]\n"
- "ldr q13, [%[inptr0], %[input_col_stride1]]\n"
- "ldr q17, [x24]\n"
- "fmla v10.4s, v17.4s, v8.4s\n"
- "ldr q16, [x23, %[input_col_stride1]]\n"
- "fmla v12.4s, v20.4s, v5.4s\n"
- "ldr q18, [%[inptr0], x19]\n"
- "ldr q14, [x25]\n"
- "ldr q15, [x24, %[input_col_stride1]]\n"
- "fmla v12.4s, v13.4s, v7.4s\n"
- "fmla v12.4s, v17.4s, v2.4s\n"
- "fmla v12.4s, v16.4s, v4.4s\n"
- "fmla v12.4s, v18.4s, v6.4s\n"
- "beq 3f\n"
- "2:\n"
- "fmla v11.4s, v18.4s, v8.4s\n"
- "ldr q19, [x23, x19]\n"
- "fmla v10.4s, v14.4s, v5.4s\n"
- "ldr q20, [%[inptr0], x20]\n"
- "fmla v12.4s, v15.4s, v1.4s\n"
- "ldr q14, [x26]\n"
- "fmla v11.4s, v19.4s, v5.4s\n"
- "ldr q13, [x25, %[input_col_stride1]]\n"
- "fmla v10.4s, v15.4s, v7.4s\n"
- "ldr q17, [x24, x19]\n"
- "fmla v12.4s, v19.4s, v3.4s\n"
- "ldr q19, [x23, x20]\n"
- "fmla v11.4s, v20.4s, v7.4s\n"
- "ldr q18, [%[inptr0], x21]\n"
- "fmla v10.4s, v14.4s, v2.4s\n"
- "ldr q16, [x26, %[input_col_stride1]]\n"
- "fmla v12.4s, v17.4s, v0.4s\n"
- "ldr q14, [x25, x19]\n"
- "fmla v11.4s, v17.4s, v2.4s\n"
- "ldr q15, [x24, x20]\n"
- "fmla v10.4s, v13.4s, v4.4s\n"
- "ldr q13, [x23, x21]\n"
- "str q12, [%[outptr0]]\n"
- "fmla v9.4s, v17.4s, v8.4s\n"
- "fmla v11.4s, v19.4s, v4.4s\n"
- "ldr q12, [x26, x19]\n"
- "fmla v10.4s, v17.4s, v6.4s\n"
- "ldr q20, [x25, x20]\n"
- "fmla v9.4s, v14.4s, v5.4s\n"
- "ldr q17, [x24, x21]\n"
- "fmla v11.4s, v18.4s, v6.4s\n"
- "ldr q19, [x26, x20]\n"
- "fmla v10.4s, v16.4s, v1.4s\n"
- "ldr q18, [x25, x21]\n"
- "fmla v9.4s, v15.4s, v7.4s\n"
- "ldr q16, [x26, x21]\n"
- "fmla v11.4s, v15.4s, v1.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v10.4s, v14.4s, v3.4s\n"
- "ldr q14, [%[wbptr]]\n"
- "fmla v9.4s, v12.4s, v2.4s\n"
- "ldr q8, [%[wbptr], #16]\n"
- "fmla v11.4s, v13.4s, v3.4s\n"
- "ldr q7, [%[wbptr], #32]\n"
- "fmla v10.4s, v12.4s, v0.4s\n"
- "ldr q5, [%[wbptr], #64]\n"
- "fmla v9.4s, v20.4s, v4.4s\n"
- "ldr q2, [%[wbptr], #112]\n"
- "fmla v11.4s, v17.4s, v0.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "str q10, [x22]\n"
- "mov v12.16b, v14.16b\n"
- "fmla v9.4s, v17.4s, v6.4s\n"
- "ldr q4, [%[wbptr], #80]\n"
- "str q11, [%[outptr0], %[output_col_stride1]]\n"
- "mov v10.16b, v14.16b\n"
- "mov v11.16b, v14.16b\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "ldr q6, [%[wbptr], #48]\n"
- "ldr q15, [%[inptr0]]\n"
- "add x23, x23, #16\n"
- "fmla v12.4s, v15.4s, v8.4s\n"
- "ldr q20, [x23]\n"
- "fmla v9.4s, v18.4s, v3.4s\n"
- "ldr q1, [%[wbptr], #128]\n"
- "ldr q13, [%[inptr0], %[input_col_stride1]]\n"
- "add x24, x24, #16\n"
- "fmla v12.4s, v20.4s, v5.4s\n"
- "ldr q17, [x24]\n"
- "fmla v9.4s, v16.4s, v0.4s\n"
- "ldr q3, [%[wbptr], #96]\n"
- "fmla v10.4s, v17.4s, v8.4s\n"
- "ldr q16, [x23, %[input_col_stride1]]\n"
- "fmla v12.4s, v13.4s, v7.4s\n"
- "ldr q18, [%[inptr0], x19]\n"
- "str q9, [x22, %[output_col_stride1]]\n"
- "add x25, x25, #16\n"
- "mov v9.16b, v14.16b\n"
- "ldr q0, [%[wbptr], #144]\n"
- "fmla v12.4s, v17.4s, v2.4s\n"
- "ldr q14, [x25]\n"
- "ldr q15, [x24, %[input_col_stride1]]\n"
- "add x26, x26, #16\n"
- "add %[outptr0], %[outptr0], #16\n"
- "add x22, x22, #16\n"
- "subs x28, x28, #1\n"
- "fmla v12.4s, v16.4s, v4.4s\n"
- "fmla v12.4s, v18.4s, v6.4s\n"
- "bne 2b\n"
- "3:\n"
- "fmla v11.4s, v18.4s, v8.4s\n"
- "ldr q19, [x23, x19]\n"
- "fmla v10.4s, v14.4s, v5.4s\n"
- "ldr q20, [%[inptr0], x20]\n"
- "fmla v12.4s, v15.4s, v1.4s\n"
- "ldr q14, [x26]\n"
- "fmla v11.4s, v19.4s, v5.4s\n"
- "ldr q13, [x25, %[input_col_stride1]]\n"
- "fmla v10.4s, v15.4s, v7.4s\n"
- "ldr q17, [x24, x19]\n"
- "fmla v12.4s, v19.4s, v3.4s\n"
- "ldr q19, [x23, x20]\n"
- "fmla v11.4s, v20.4s, v7.4s\n"
- "ldr q18, [%[inptr0], x21]\n"
- "fmla v10.4s, v14.4s, v2.4s\n"
- "ldr q16, [x26, %[input_col_stride1]]\n"
- "fmla v12.4s, v17.4s, v0.4s\n"
- "ldr q14, [x25, x19]\n"
- "fmla v11.4s, v17.4s, v2.4s\n"
- "ldr q15, [x24, x20]\n"
- "fmla v10.4s, v13.4s, v4.4s\n"
- "ldr q13, [x23, x21]\n"
- "str q12, [%[outptr0]]\n"
- "fmla v9.4s, v17.4s, v8.4s\n"
- "fmla v11.4s, v19.4s, v4.4s\n"
- "ldr q12, [x26, x19]\n"
- "fmla v10.4s, v17.4s, v6.4s\n"
- "ldr q20, [x25, x20]\n"
- "fmla v9.4s, v14.4s, v5.4s\n"
- "ldr q17, [x24, x21]\n"
- "fmla v11.4s, v18.4s, v6.4s\n"
- "ldr q19, [x26, x20]\n"
- "fmla v10.4s, v16.4s, v1.4s\n"
- "ldr q18, [x25, x21]\n"
- "fmla v9.4s, v15.4s, v7.4s\n"
- "ldr q16, [x26, x21]\n"
- "fmla v11.4s, v15.4s, v1.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v10.4s, v14.4s, v3.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v9.4s, v12.4s, v2.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v11.4s, v13.4s, v3.4s\n"
- "add x23, x23, #16\n"
- "fmla v10.4s, v12.4s, v0.4s\n"
- "add x24, x24, #16\n"
- "fmla v9.4s, v20.4s, v4.4s\n"
- "add x25, x25, #16\n"
- "fmla v11.4s, v17.4s, v0.4s\n"
- "add x26, x26, #16\n"
- "str q10, [x22]\n"
- "fmla v9.4s, v17.4s, v6.4s\n"
- "str q11, [%[outptr0], %[output_col_stride1]]\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "fmla v9.4s, v18.4s, v3.4s\n"
- "fmla v9.4s, v16.4s, v0.4s\n"
- "str q9, [x22, %[output_col_stride1]]\n"
- "add x22, x22, #16\n"
- "4:\n"
- "cbz x27, 7f\n"
- "ldr s14, [%[wbptr]]\n"
- "mov v12.16b, v14.16b\n"
- "ldr s8, [%[wbptr], #4]\n"
- "mov v10.16b, v14.16b\n"
- "ldr s7, [%[wbptr], #8]\n"
- "mov v11.16b, v14.16b\n"
- "ldr s6, [%[wbptr], #12]\n"
- "mov v9.16b, v14.16b\n"
- "ldr s5, [%[wbptr], #16]\n"
- "ldr s4, [%[wbptr], #20]\n"
- "subs x27, x27, #1\n"
- "ldr s3, [%[wbptr], #24]\n"
- "ldr s2, [%[wbptr], #28]\n"
- "ldr s1, [%[wbptr], #32]\n"
- "ldr s0, [%[wbptr], #36]\n"
- "ldr s15, [%[inptr0]]\n"
- "ldr s20, [x23]\n"
- "fmla v12.4s, v15.4s, v8.4s\n"
- "ldr s13, [%[inptr0], %[input_col_stride1]]\n"
- "ldr s17, [x24]\n"
- "ldr s16, [x23, %[input_col_stride1]]\n"
- "fmla v10.4s, v17.4s, v8.4s\n"
- "ldr s18, [%[inptr0], x19]\n"
- "fmla v12.4s, v20.4s, v5.4s\n"
- "ldr s14, [x25]\n"
- "ldr s15, [x24, %[input_col_stride1]]\n"
- "fmla v12.4s, v13.4s, v7.4s\n"
- "fmla v12.4s, v17.4s, v2.4s\n"
- "fmla v12.4s, v16.4s, v4.4s\n"
- "fmla v12.4s, v18.4s, v6.4s\n"
- "beq 6f\n"
- "5:\n"
- "fmla v11.4s, v18.4s, v8.4s\n"
- "ldr s19, [x23, x19]\n"
- "fmla v10.4s, v14.4s, v5.4s\n"
- "ldr s20, [%[inptr0], x20]\n"
- "fmla v12.4s, v15.4s, v1.4s\n"
- "ldr s14, [x26]\n"
- "fmla v11.4s, v19.4s, v5.4s\n"
- "ldr s13, [x25, %[input_col_stride1]]\n"
- "fmla v10.4s, v15.4s, v7.4s\n"
- "ldr s17, [x24, x19]\n"
- "fmla v12.4s, v19.4s, v3.4s\n"
- "ldr s19, [x23, x20]\n"
- "fmla v11.4s, v20.4s, v7.4s\n"
- "ldr s18, [%[inptr0], x21]\n"
- "fmla v10.4s, v14.4s, v2.4s\n"
- "ldr s16, [x26, %[input_col_stride1]]\n"
- "fmla v12.4s, v17.4s, v0.4s\n"
- "ldr s14, [x25, x19]\n"
- "fmla v11.4s, v17.4s, v2.4s\n"
- "ldr s15, [x24, x20]\n"
- "fmla v10.4s, v13.4s, v4.4s\n"
- "ldr s13, [x23, x21]\n"
- "str s12, [%[outptr0]]\n"
- "fmla v9.4s, v17.4s, v8.4s\n"
- "fmla v11.4s, v19.4s, v4.4s\n"
- "ldr s12, [x26, x19]\n"
- "fmla v10.4s, v17.4s, v6.4s\n"
- "ldr s20, [x25, x20]\n"
- "fmla v9.4s, v14.4s, v5.4s\n"
- "ldr s17, [x24, x21]\n"
- "fmla v11.4s, v18.4s, v6.4s\n"
- "ldr s19, [x26, x20]\n"
- "fmla v10.4s, v16.4s, v1.4s\n"
- "ldr s18, [x25, x21]\n"
- "fmla v9.4s, v15.4s, v7.4s\n"
- "ldr s16, [x26, x21]\n"
- "fmla v11.4s, v15.4s, v1.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v10.4s, v14.4s, v3.4s\n"
- "ldr s14, [%[wbptr]]\n"
- "fmla v9.4s, v12.4s, v2.4s\n"
- "ldr s8, [%[wbptr], #4]\n"
- "fmla v11.4s, v13.4s, v3.4s\n"
- "ldr s7, [%[wbptr], #8]\n"
- "fmla v10.4s, v12.4s, v0.4s\n"
- "ldr s5, [%[wbptr], #16]\n"
- "fmla v9.4s, v20.4s, v4.4s\n"
- "ldr s2, [%[wbptr], #28]\n"
- "fmla v11.4s, v17.4s, v0.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "str s10, [x22]\n"
- "mov v12.16b, v14.16b\n"
- "fmla v9.4s, v17.4s, v6.4s\n"
- "ldr s4, [%[wbptr], #20]\n"
- "str s11, [%[outptr0], %[output_col_stride1]]\n"
- "mov v10.16b, v14.16b\n"
- "mov v11.16b, v14.16b\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "ldr s6, [%[wbptr], #12]\n"
- "ldr s15, [%[inptr0]]\n"
- "add x23, x23, #4\n"
- "fmla v12.4s, v15.4s, v8.4s\n"
- "ldr s20, [x23]\n"
- "fmla v9.4s, v18.4s, v3.4s\n"
- "ldr s1, [%[wbptr], #32]\n"
- "ldr s13, [%[inptr0], %[input_col_stride1]]\n"
- "add x24, x24, #4\n"
- "fmla v12.4s, v20.4s, v5.4s\n"
- "ldr s17, [x24]\n"
- "fmla v9.4s, v16.4s, v0.4s\n"
- "ldr s3, [%[wbptr], #24]\n"
- "fmla v10.4s, v17.4s, v8.4s\n"
- "ldr s16, [x23, %[input_col_stride1]]\n"
- "fmla v12.4s, v13.4s, v7.4s\n"
- "ldr s18, [%[inptr0], x19]\n"
- "str s9, [x22, %[output_col_stride1]]\n"
- "add x25, x25, #4\n"
- "mov v9.16b, v14.16b\n"
- "ldr s0, [%[wbptr], #36]\n"
- "fmla v12.4s, v17.4s, v2.4s\n"
- "ldr s14, [x25]\n"
- "ldr s15, [x24, %[input_col_stride1]]\n"
- "add x26, x26, #4\n"
- "add %[outptr0], %[outptr0], #4\n"
- "add x22, x22, #4\n"
- "subs x27, x27, #1\n"
- "fmla v12.4s, v16.4s, v4.4s\n"
- "fmla v12.4s, v18.4s, v6.4s\n"
- "bne 5b\n"
- "6:\n"
- "fmla v11.4s, v18.4s, v8.4s\n"
- "ldr s19, [x23, x19]\n"
- "fmla v10.4s, v14.4s, v5.4s\n"
- "ldr s20, [%[inptr0], x20]\n"
- "fmla v12.4s, v15.4s, v1.4s\n"
- "ldr s14, [x26]\n"
- "fmla v11.4s, v19.4s, v5.4s\n"
- "ldr s13, [x25, %[input_col_stride1]]\n"
- "fmla v10.4s, v15.4s, v7.4s\n"
- "ldr s17, [x24, x19]\n"
- "fmla v12.4s, v19.4s, v3.4s\n"
- "ldr s19, [x23, x20]\n"
- "fmla v11.4s, v20.4s, v7.4s\n"
- "ldr s18, [%[inptr0], x21]\n"
- "fmla v10.4s, v14.4s, v2.4s\n"
- "ldr s16, [x26, %[input_col_stride1]]\n"
- "fmla v12.4s, v17.4s, v0.4s\n"
- "ldr s14, [x25, x19]\n"
- "fmla v11.4s, v17.4s, v2.4s\n"
- "ldr s15, [x24, x20]\n"
- "fmla v10.4s, v13.4s, v4.4s\n"
- "ldr s13, [x23, x21]\n"
- "str s12, [%[outptr0]]\n"
- "fmla v9.4s, v17.4s, v8.4s\n"
- "fmla v11.4s, v19.4s, v4.4s\n"
- "ldr s12, [x26, x19]\n"
- "fmla v10.4s, v17.4s, v6.4s\n"
- "ldr s20, [x25, x20]\n"
- "fmla v9.4s, v14.4s, v5.4s\n"
- "ldr s17, [x24, x21]\n"
- "fmla v11.4s, v18.4s, v6.4s\n"
- "ldr s19, [x26, x20]\n"
- "fmla v10.4s, v16.4s, v1.4s\n"
- "ldr s18, [x25, x21]\n"
- "fmla v9.4s, v15.4s, v7.4s\n"
- "ldr s16, [x26, x21]\n"
- "fmla v11.4s, v15.4s, v1.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v10.4s, v14.4s, v3.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v9.4s, v12.4s, v2.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v11.4s, v13.4s, v3.4s\n"
- "add x23, x23, #4\n"
- "fmla v10.4s, v12.4s, v0.4s\n"
- "add x24, x24, #4\n"
- "fmla v9.4s, v20.4s, v4.4s\n"
- "add x25, x25, #4\n"
- "fmla v11.4s, v17.4s, v0.4s\n"
- "add x26, x26, #4\n"
- "str s10, [x22]\n"
- "fmla v9.4s, v17.4s, v6.4s\n"
- "str s11, [%[outptr0], %[output_col_stride1]]\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "fmla v9.4s, v18.4s, v3.4s\n"
- "fmla v9.4s, v16.4s, v0.4s\n"
- "str s9, [x22, %[output_col_stride1]]\n"
- "add x22, x22, #4\n"
- "7:\n"
- : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
- : [n_channels] "r" ((long) n_channels), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
-template <>  // Specialisation of Conv::execute_tile for ActivationFunction::None, taking tables of input and output pointers.
-template <>  // Per channel it seeds four accumulators from the parameter block, adds nine input*weight products to each, and stores the four results with no activation step.
-void Conv::execute_tile<ActivationFunction::None>(  // NOTE(review): offsets used below (inptrs 0..192, outptrs 0..24) are consistent with a 5x5 input tile and a 2x2 output tile, i.e. a 3x3 kernel at stride 2 - confirm against Base.
- int n_channels,  // Channel count; split below into (n_channels >> 2) four-channel vector iterations plus (n_channels & 3) single-channel iterations.
- const void *weight_bias_ptr,  // Packed parameters: one accumulator seed (presumably the bias - confirm) followed by nine weights; read as 160-byte blocks in the vector path and 40-byte blocks in the scalar tail.
- const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],  // Table of input pointers, read as 8-byte entries at byte offsets 0..192; each pointer is indexed with a running byte offset (x23).
- float *outptrs[Base::output_tile_rows][Base::output_tile_cols]  // Table of output pointers, read as 8-byte entries at byte offsets 0, 8, 16 and 24; each is indexed with a running byte offset (x24).
-)
-{
- __asm __volatile(
- "mov x23, xzr\n"  // x23: byte offset added to every input pointer, starts at 0.
- "mov x24, xzr\n"  // x24: byte offset added to every output pointer, starts at 0.
- "and x25, %[n_channels], #3\n"  // x25: number of leftover channels processed one at a time from label 4.
- "lsr x26, %[n_channels], #2\n"  // x26: number of four-channel vector iterations.
- "cbz x26, 4f\n"  // Fewer than four channels: skip the vector path entirely.
- "1:\n"  // Vector prologue: load the first parameter block and start accumulating the first output.
- "ldr q13, [%[wbptr]]\n"  // v13 = accumulator seed, the first 16 bytes of the block.
- "ldr x19, [%[inptrs], 0]\n"
- "mov v10.16b, v13.16b\n"  // v10 accumulates the result later stored through the pointer at outptrs+0.
- "ldr q12, [%[wbptr], #16]\n"  // v12, v6, v5, v11, v4, v3, v2, v1, v0 receive the nine weight vectors at block offsets 16..144.
- "mov v8.16b, v13.16b\n"  // v8 accumulates the result later stored through the pointer at outptrs+16.
- "ldr q6, [%[wbptr], #32]\n"
- "mov v9.16b, v13.16b\n"  // v9 accumulates the result later stored through the pointer at outptrs+8.
- "ldr q5, [%[wbptr], #48]\n"
- "mov v7.16b, v13.16b\n"  // v7 accumulates the result later stored through the pointer at outptrs+24.
- "ldr q11, [%[wbptr], #64]\n"
- "ldr q4, [%[wbptr], #80]\n"
- "ldr x20, [%[inptrs], 40]\n"
- "ldr q3, [%[wbptr], #96]\n"
- "ldr x21, [%[inptrs], 80]\n"
- "ldr q2, [%[wbptr], #112]\n"
- "ldr x27, [%[inptrs], 120]\n"
- "ldr q1, [%[wbptr], #128]\n"
- "subs x26, x26, #1\n"  // Consume one vector iteration; the flags are tested by the beq below (only loads and fmla sit in between).
- "ldr q0, [%[wbptr], #144]\n"
- "ldr q14, [x19, x23]\n"  // Four floats from the input pointer at inptrs+0.
- "fmla v10.4s, v14.4s, v12.4s\n"
- "ldr q18, [x20, x23]\n"
- "ldr q14, [x21, x23]\n"
- "ldr x19, [%[inptrs], 8]\n"
- "ldr q16, [x27, x23]\n"
- "ldr x20, [%[inptrs], 48]\n"
- "ldr q19, [x19, x23]\n"
- "ldr x21, [%[inptrs], 88]\n"
- "fmla v10.4s, v18.4s, v11.4s\n"
- "ldr q15, [x20, x23]\n"
- "ldr q18, [x21, x23]\n"
- "ldr x19, [%[inptrs], 16]\n"
- "ldr q13, [x19, x23]\n"  // v13 is reused for an input value now that all four accumulators have been seeded.
- "fmla v10.4s, v19.4s, v6.4s\n"
- "fmla v10.4s, v14.4s, v2.4s\n"
- "beq 3f\n"  // That was the last vector iteration: finish it in the epilogue at label 3.
- "2:\n"  // Steady-state loop: finish and store the four outputs of the current channel group while loading the next block and starting the next group.
- "fmla v8.4s, v14.4s, v12.4s\n"
- "ldr x20, [%[inptrs], 56]\n"
- "fmla v10.4s, v15.4s, v4.4s\n"
- "ldr x19, [%[inptrs], 24]\n"
- "fmla v9.4s, v13.4s, v12.4s\n"
- "ldr q14, [x20, x23]\n"
- "ldr q17, [x19, x23]\n"
- "ldr x22, [%[inptrs], 160]\n"
- "fmla v8.4s, v16.4s, v11.4s\n"
- "ldr x27, [%[inptrs], 128]\n"
- "fmla v10.4s, v13.4s, v5.4s\n"
- "ldr q15, [x22, x23]\n"
- "fmla v9.4s, v14.4s, v11.4s\n"
- "ldr q19, [x27, x23]\n"
- "ldr x21, [%[inptrs], 96]\n"
- "ldr x20, [%[inptrs], 64]\n"
- "ldr x19, [%[inptrs], 32]\n"
- "fmla v8.4s, v18.4s, v6.4s\n"
- "ldr x22, [%[inptrs], 168]\n"
- "fmla v10.4s, v18.4s, v1.4s\n"
- "ldr q13, [x21, x23]\n"
- "fmla v9.4s, v17.4s, v6.4s\n"
- "ldr q18, [x20, x23]\n"
- "fmla v7.4s, v13.4s, v12.4s\n"
- "ldr q17, [x19, x23]\n"
- "fmla v8.4s, v15.4s, v2.4s\n"
- "ldr q15, [x22, x23]\n"
- "fmla v10.4s, v14.4s, v3.4s\n"
- "ldr x27, [%[inptrs], 136]\n"
- "fmla v9.4s, v13.4s, v2.4s\n"
- "ldr x21, [%[inptrs], 104]\n"
- "ldr q16, [x27, x23]\n"
- "ldr x20, [%[inptrs], 72]\n"
- "fmla v8.4s, v19.4s, v4.4s\n"
- "ldr q19, [x21, x23]\n"
- "fmla v10.4s, v13.4s, v0.4s\n"  // Ninth and final product for v10.
- "ldr q12, [x20, x23]\n"  // v12 is reused for an input value; the weight it held is reloaded from the next block further down.
- "fmla v9.4s, v18.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 176]\n"
- "fmla v7.4s, v16.4s, v11.4s\n"
- "ldr x27, [%[inptrs], 144]\n"
- "fmla v8.4s, v13.4s, v5.4s\n"
- "ldr q11, [x22, x23]\n"  // v11 is likewise reused for an input value and reloaded as a weight further down.
- "ldr q13, [x27, x23]\n"
- "ldr x21, [%[inptrs], 112]\n"
- "fmla v9.4s, v17.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 184]\n"
- "fmla v7.4s, v19.4s, v6.4s\n"
- "ldr q14, [x21, x23]\n"
- "fmla v8.4s, v15.4s, v1.4s\n"
- "ldr q17, [x22, x23]\n"
- "ldr x27, [%[inptrs], 152]\n"
- "ldr x22, [%[inptrs], 192]\n"
- "ldr x21, [%[outptrs], 0]\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "ldr x28, [%[outptrs], 16]\n"
- "str q10, [x21, x24]\n"  // Store the first finished result (pointer from outptrs+0).
- "fmla v7.4s, v11.4s, v2.4s\n"
- "fmla v8.4s, v16.4s, v3.4s\n"
- "ldr q16, [x27, x23]\n"
- "ldr q15, [x22, x23]\n"
- "ldr x21, [%[outptrs], 8]\n"
- "fmla v9.4s, v12.4s, v3.4s\n"
- "add %[wbptr], %[wbptr], #160\n"  // Advance to the next 160-byte parameter block.
- "fmla v7.4s, v13.4s, v4.4s\n"
- "ldr q13, [%[wbptr]]\n"  // Seed for the next channel group.
- "fmla v8.4s, v11.4s, v0.4s\n"
- "ldr q12, [%[wbptr], #16]\n"
- "mov v10.16b, v13.16b\n"
- "ldr q6, [%[wbptr], #32]\n"
- "fmla v9.4s, v14.4s, v0.4s\n"
- "ldr q11, [%[wbptr], #64]\n"
- "fmla v7.4s, v14.4s, v5.4s\n"
- "ldr q4, [%[wbptr], #80]\n"
- "str q8, [x28, x24]\n"  // Store the result for the pointer from outptrs+16.
- "add x23, x23, #16\n"  // Advance the input offset by four floats; the loads below already belong to the next group.
- "mov v8.16b, v13.16b\n"
- "ldr q2, [%[wbptr], #112]\n"
- "str q9, [x21, x24]\n"  // Store the result for the pointer from outptrs+8.
- "ldr x28, [%[outptrs], 24]\n"
- "fmla v7.4s, v17.4s, v1.4s\n"
- "ldr q5, [%[wbptr], #48]\n"
- "mov v9.16b, v13.16b\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "ldr x19, [%[inptrs], 0]\n"
- "ldr x20, [%[inptrs], 40]\n"
- "ldr x21, [%[inptrs], 80]\n"
- "ldr x27, [%[inptrs], 120]\n"
- "subs x26, x26, #1\n"  // Loop counter; the flags are tested by the bne at the bottom of the loop.
- "fmla v7.4s, v16.4s, v3.4s\n"
- "ldr q1, [%[wbptr], #128]\n"
- "ldr q14, [x19, x23]\n"
- "fmla v10.4s, v14.4s, v12.4s\n"
- "ldr q18, [x20, x23]\n"
- "ldr q14, [x21, x23]\n"
- "ldr x19, [%[inptrs], 8]\n"
- "fmla v7.4s, v15.4s, v0.4s\n"
- "ldr q3, [%[wbptr], #96]\n"
- "ldr q19, [x19, x23]\n"
- "ldr x20, [%[inptrs], 48]\n"
- "fmla v10.4s, v18.4s, v11.4s\n"
- "ldr q16, [x27, x23]\n"
- "ldr q15, [x20, x23]\n"
- "ldr x19, [%[inptrs], 16]\n"
- "str q7, [x28, x24]\n"  // Store the result for the pointer from outptrs+24.
- "ldr x21, [%[inptrs], 88]\n"
- "mov v7.16b, v13.16b\n"  // Last use of the seed in v13 before it is overwritten with an input value.
- "ldr q0, [%[wbptr], #144]\n"
- "fmla v10.4s, v19.4s, v6.4s\n"
- "ldr q13, [x19, x23]\n"
- "ldr q18, [x21, x23]\n"
- "add x24, x24, #16\n"  // Advance the output offset once all four stores of this group are done.
- "fmla v10.4s, v14.4s, v2.4s\n"
- "bne 2b\n"
- "3:\n"  // Vector epilogue: finish and store the last channel group; no further parameter block or inputs are loaded.
- "fmla v8.4s, v14.4s, v12.4s\n"
- "ldr x20, [%[inptrs], 56]\n"
- "fmla v10.4s, v15.4s, v4.4s\n"
- "ldr x19, [%[inptrs], 24]\n"
- "fmla v9.4s, v13.4s, v12.4s\n"
- "ldr q14, [x20, x23]\n"
- "ldr q17, [x19, x23]\n"
- "ldr x22, [%[inptrs], 160]\n"
- "fmla v8.4s, v16.4s, v11.4s\n"
- "ldr x27, [%[inptrs], 128]\n"
- "fmla v10.4s, v13.4s, v5.4s\n"
- "ldr q15, [x22, x23]\n"
- "fmla v9.4s, v14.4s, v11.4s\n"
- "ldr q19, [x27, x23]\n"
- "ldr x21, [%[inptrs], 96]\n"
- "ldr x20, [%[inptrs], 64]\n"
- "ldr x19, [%[inptrs], 32]\n"
- "fmla v8.4s, v18.4s, v6.4s\n"
- "ldr x22, [%[inptrs], 168]\n"
- "fmla v10.4s, v18.4s, v1.4s\n"
- "ldr q13, [x21, x23]\n"
- "fmla v9.4s, v17.4s, v6.4s\n"
- "ldr q18, [x20, x23]\n"
- "fmla v7.4s, v13.4s, v12.4s\n"
- "ldr q17, [x19, x23]\n"
- "fmla v8.4s, v15.4s, v2.4s\n"
- "ldr q15, [x22, x23]\n"
- "fmla v10.4s, v14.4s, v3.4s\n"
- "ldr x27, [%[inptrs], 136]\n"
- "fmla v9.4s, v13.4s, v2.4s\n"
- "ldr x21, [%[inptrs], 104]\n"
- "ldr q16, [x27, x23]\n"
- "ldr x20, [%[inptrs], 72]\n"
- "fmla v8.4s, v19.4s, v4.4s\n"
- "ldr q19, [x21, x23]\n"
- "fmla v10.4s, v13.4s, v0.4s\n"
- "ldr q12, [x20, x23]\n"
- "fmla v9.4s, v18.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 176]\n"
- "fmla v7.4s, v16.4s, v11.4s\n"
- "ldr x27, [%[inptrs], 144]\n"
- "fmla v8.4s, v13.4s, v5.4s\n"
- "ldr q11, [x22, x23]\n"
- "ldr q13, [x27, x23]\n"
- "ldr x21, [%[inptrs], 112]\n"
- "fmla v9.4s, v17.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 184]\n"
- "fmla v7.4s, v19.4s, v6.4s\n"
- "ldr q14, [x21, x23]\n"
- "fmla v8.4s, v15.4s, v1.4s\n"
- "ldr q17, [x22, x23]\n"
- "ldr x27, [%[inptrs], 152]\n"
- "ldr x22, [%[inptrs], 192]\n"
- "ldr x21, [%[outptrs], 0]\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "ldr x28, [%[outptrs], 16]\n"
- "str q10, [x21, x24]\n"
- "fmla v7.4s, v11.4s, v2.4s\n"
- "fmla v8.4s, v16.4s, v3.4s\n"
- "ldr q16, [x27, x23]\n"
- "ldr q15, [x22, x23]\n"
- "ldr x21, [%[outptrs], 8]\n"
- "fmla v9.4s, v12.4s, v3.4s\n"
- "add %[wbptr], %[wbptr], #160\n"  // Step past the final vector block so the scalar tail reads the data that follows it.
- "fmla v7.4s, v13.4s, v4.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v8.4s, v11.4s, v0.4s\n"
- "add x23, x23, #16\n"  // Keep the input offset in step for the scalar tail.
- "fmla v9.4s, v14.4s, v0.4s\n"
- "fmla v7.4s, v14.4s, v5.4s\n"
- "str q8, [x28, x24]\n"
- "ldr x28, [%[outptrs], 24]\n"
- "str q9, [x21, x24]\n"
- "fmla v7.4s, v17.4s, v1.4s\n"
- "fmla v7.4s, v16.4s, v3.4s\n"
- "fmla v7.4s, v15.4s, v0.4s\n"
- "str q7, [x28, x24]\n"
- "add x24, x24, #16\n"  // Keep the output offset in step for the scalar tail.
- "4:\n"  // Scalar tail: the same schedule as above, one channel per iteration, using 4-byte s-register loads and stores.
- "cbz x25, 7f\n"  // No leftover channels: done.
- "ldr s13, [%[wbptr]]\n"  // Tail parameter layout is ten consecutive floats per channel: seed at +0, weights at +4..+36.
- "mov v10.16b, v13.16b\n"
- "ldr s12, [%[wbptr], #4]\n"
- "mov v8.16b, v13.16b\n"
- "ldr s6, [%[wbptr], #8]\n"
- "mov v9.16b, v13.16b\n"
- "ldr s5, [%[wbptr], #12]\n"
- "mov v7.16b, v13.16b\n"
- "ldr s11, [%[wbptr], #16]\n"
- "ldr s4, [%[wbptr], #20]\n"
- "ldr x19, [%[inptrs], 0]\n"
- "ldr s3, [%[wbptr], #24]\n"
- "ldr x20, [%[inptrs], 40]\n"
- "ldr s2, [%[wbptr], #28]\n"
- "ldr x21, [%[inptrs], 80]\n"
- "ldr s1, [%[wbptr], #32]\n"
- "ldr x27, [%[inptrs], 120]\n"
- "ldr s0, [%[wbptr], #36]\n"
- "subs x25, x25, #1\n"  // Consume one tail channel; the flags are tested by the beq below.
- "ldr s14, [x19, x23]\n"
- "ldr s18, [x20, x23]\n"
- "fmla v10.4s, v14.4s, v12.4s\n"  // The fmla still operates on full .4s vectors; only the lowest lane is ever stored in the tail.
- "ldr s14, [x21, x23]\n"
- "ldr s16, [x27, x23]\n"
- "ldr x19, [%[inptrs], 8]\n"
- "ldr x20, [%[inptrs], 48]\n"
- "ldr x21, [%[inptrs], 88]\n"
- "ldr s19, [x19, x23]\n"
- "fmla v10.4s, v18.4s, v11.4s\n"
- "ldr s15, [x20, x23]\n"
- "ldr s18, [x21, x23]\n"
- "ldr x19, [%[inptrs], 16]\n"
- "ldr s13, [x19, x23]\n"
- "fmla v10.4s, v19.4s, v6.4s\n"
- "fmla v10.4s, v14.4s, v2.4s\n"
- "beq 6f\n"  // That was the last tail channel: finish it in the epilogue at label 6.
- "5:\n"  // Scalar steady-state loop, mirroring label 2 with 4-byte steps.
- "fmla v8.4s, v14.4s, v12.4s\n"
- "ldr x20, [%[inptrs], 56]\n"
- "fmla v10.4s, v15.4s, v4.4s\n"
- "ldr x19, [%[inptrs], 24]\n"
- "fmla v9.4s, v13.4s, v12.4s\n"
- "ldr s14, [x20, x23]\n"
- "ldr s17, [x19, x23]\n"
- "ldr x22, [%[inptrs], 160]\n"
- "fmla v8.4s, v16.4s, v11.4s\n"
- "ldr x27, [%[inptrs], 128]\n"
- "fmla v10.4s, v13.4s, v5.4s\n"
- "ldr s15, [x22, x23]\n"
- "fmla v9.4s, v14.4s, v11.4s\n"
- "ldr s19, [x27, x23]\n"
- "ldr x21, [%[inptrs], 96]\n"
- "ldr x20, [%[inptrs], 64]\n"
- "ldr x19, [%[inptrs], 32]\n"
- "fmla v8.4s, v18.4s, v6.4s\n"
- "ldr x22, [%[inptrs], 168]\n"
- "fmla v10.4s, v18.4s, v1.4s\n"
- "ldr s13, [x21, x23]\n"
- "fmla v9.4s, v17.4s, v6.4s\n"
- "ldr s18, [x20, x23]\n"
- "fmla v7.4s, v13.4s, v12.4s\n"
- "ldr s17, [x19, x23]\n"
- "fmla v8.4s, v15.4s, v2.4s\n"
- "ldr s15, [x22, x23]\n"
- "fmla v10.4s, v14.4s, v3.4s\n"
- "ldr x27, [%[inptrs], 136]\n"
- "fmla v9.4s, v13.4s, v2.4s\n"
- "ldr x21, [%[inptrs], 104]\n"
- "ldr s16, [x27, x23]\n"
- "ldr x20, [%[inptrs], 72]\n"
- "fmla v8.4s, v19.4s, v4.4s\n"
- "ldr s19, [x21, x23]\n"
- "fmla v10.4s, v13.4s, v0.4s\n"
- "ldr s12, [x20, x23]\n"
- "fmla v9.4s, v18.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 176]\n"
- "fmla v7.4s, v16.4s, v11.4s\n"
- "ldr x27, [%[inptrs], 144]\n"
- "fmla v8.4s, v13.4s, v5.4s\n"
- "ldr s11, [x22, x23]\n"
- "ldr s13, [x27, x23]\n"
- "ldr x21, [%[inptrs], 112]\n"
- "fmla v9.4s, v17.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 184]\n"
- "fmla v7.4s, v19.4s, v6.4s\n"
- "ldr s14, [x21, x23]\n"
- "fmla v8.4s, v15.4s, v1.4s\n"
- "ldr s17, [x22, x23]\n"
- "ldr x27, [%[inptrs], 152]\n"
- "ldr x22, [%[inptrs], 192]\n"
- "ldr x21, [%[outptrs], 0]\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "ldr x28, [%[outptrs], 16]\n"
- "str s10, [x21, x24]\n"  // Single-float store of the first result.
- "fmla v7.4s, v11.4s, v2.4s\n"
- "fmla v8.4s, v16.4s, v3.4s\n"
- "ldr s16, [x27, x23]\n"
- "ldr s15, [x22, x23]\n"
- "ldr x21, [%[outptrs], 8]\n"
- "fmla v9.4s, v12.4s, v3.4s\n"
- "add %[wbptr], %[wbptr], #40\n"  // Advance to the next 40-byte per-channel parameter block.
- "fmla v7.4s, v13.4s, v4.4s\n"
- "ldr s13, [%[wbptr]]\n"
- "fmla v8.4s, v11.4s, v0.4s\n"
- "ldr s12, [%[wbptr], #4]\n"
- "mov v10.16b, v13.16b\n"
- "ldr s6, [%[wbptr], #8]\n"
- "fmla v9.4s, v14.4s, v0.4s\n"
- "ldr s11, [%[wbptr], #16]\n"
- "fmla v7.4s, v14.4s, v5.4s\n"
- "ldr s4, [%[wbptr], #20]\n"
- "str s8, [x28, x24]\n"
- "add x23, x23, #4\n"  // Advance the input offset by one float.
- "mov v8.16b, v13.16b\n"
- "ldr s2, [%[wbptr], #28]\n"
- "str s9, [x21, x24]\n"
- "ldr x28, [%[outptrs], 24]\n"
- "fmla v7.4s, v17.4s, v1.4s\n"
- "ldr s5, [%[wbptr], #12]\n"
- "mov v9.16b, v13.16b\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "ldr x19, [%[inptrs], 0]\n"
- "ldr x20, [%[inptrs], 40]\n"
- "ldr x21, [%[inptrs], 80]\n"
- "ldr x27, [%[inptrs], 120]\n"
- "subs x25, x25, #1\n"  // Tail loop counter; the flags are tested by the bne at the bottom of the loop.
- "fmla v7.4s, v16.4s, v3.4s\n"
- "ldr s1, [%[wbptr], #32]\n"
- "ldr s14, [x19, x23]\n"
- "fmla v10.4s, v14.4s, v12.4s\n"
- "ldr s18, [x20, x23]\n"
- "ldr s14, [x21, x23]\n"
- "ldr x19, [%[inptrs], 8]\n"
- "fmla v7.4s, v15.4s, v0.4s\n"
- "ldr s3, [%[wbptr], #24]\n"
- "ldr s19, [x19, x23]\n"
- "ldr x20, [%[inptrs], 48]\n"
- "fmla v10.4s, v18.4s, v11.4s\n"
- "ldr s16, [x27, x23]\n"
- "ldr s15, [x20, x23]\n"
- "ldr x19, [%[inptrs], 16]\n"
- "str s7, [x28, x24]\n"
- "ldr x21, [%[inptrs], 88]\n"
- "mov v7.16b, v13.16b\n"
- "ldr s0, [%[wbptr], #36]\n"
- "fmla v10.4s, v19.4s, v6.4s\n"
- "ldr s13, [x19, x23]\n"
- "ldr s18, [x21, x23]\n"
- "add x24, x24, #4\n"  // Advance the output offset by one float.
- "fmla v10.4s, v14.4s, v2.4s\n"
- "bne 5b\n"
- "6:\n"  // Scalar epilogue: finish and store the last tail channel; no further parameters or inputs are loaded.
- "fmla v8.4s, v14.4s, v12.4s\n"
- "ldr x20, [%[inptrs], 56]\n"
- "fmla v10.4s, v15.4s, v4.4s\n"
- "ldr x19, [%[inptrs], 24]\n"
- "fmla v9.4s, v13.4s, v12.4s\n"
- "ldr s14, [x20, x23]\n"
- "ldr s17, [x19, x23]\n"
- "ldr x22, [%[inptrs], 160]\n"
- "fmla v8.4s, v16.4s, v11.4s\n"
- "ldr x27, [%[inptrs], 128]\n"
- "fmla v10.4s, v13.4s, v5.4s\n"
- "ldr s15, [x22, x23]\n"
- "fmla v9.4s, v14.4s, v11.4s\n"
- "ldr s19, [x27, x23]\n"
- "ldr x21, [%[inptrs], 96]\n"
- "ldr x20, [%[inptrs], 64]\n"
- "ldr x19, [%[inptrs], 32]\n"
- "fmla v8.4s, v18.4s, v6.4s\n"
- "ldr x22, [%[inptrs], 168]\n"
- "fmla v10.4s, v18.4s, v1.4s\n"
- "ldr s13, [x21, x23]\n"
- "fmla v9.4s, v17.4s, v6.4s\n"
- "ldr s18, [x20, x23]\n"
- "fmla v7.4s, v13.4s, v12.4s\n"
- "ldr s17, [x19, x23]\n"
- "fmla v8.4s, v15.4s, v2.4s\n"
- "ldr s15, [x22, x23]\n"
- "fmla v10.4s, v14.4s, v3.4s\n"
- "ldr x27, [%[inptrs], 136]\n"
- "fmla v9.4s, v13.4s, v2.4s\n"
- "ldr x21, [%[inptrs], 104]\n"
- "ldr s16, [x27, x23]\n"
- "ldr x20, [%[inptrs], 72]\n"
- "fmla v8.4s, v19.4s, v4.4s\n"
- "ldr s19, [x21, x23]\n"
- "fmla v10.4s, v13.4s, v0.4s\n"
- "ldr s12, [x20, x23]\n"
- "fmla v9.4s, v18.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 176]\n"
- "fmla v7.4s, v16.4s, v11.4s\n"
- "ldr x27, [%[inptrs], 144]\n"
- "fmla v8.4s, v13.4s, v5.4s\n"
- "ldr s11, [x22, x23]\n"
- "ldr s13, [x27, x23]\n"
- "ldr x21, [%[inptrs], 112]\n"
- "fmla v9.4s, v17.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 184]\n"
- "fmla v7.4s, v19.4s, v6.4s\n"
- "ldr s14, [x21, x23]\n"
- "fmla v8.4s, v15.4s, v1.4s\n"
- "ldr s17, [x22, x23]\n"
- "ldr x27, [%[inptrs], 152]\n"
- "ldr x22, [%[inptrs], 192]\n"
- "ldr x21, [%[outptrs], 0]\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "ldr x28, [%[outptrs], 16]\n"
- "str s10, [x21, x24]\n"
- "fmla v7.4s, v11.4s, v2.4s\n"
- "fmla v8.4s, v16.4s, v3.4s\n"
- "ldr s16, [x27, x23]\n"
- "ldr s15, [x22, x23]\n"
- "ldr x21, [%[outptrs], 8]\n"
- "fmla v9.4s, v12.4s, v3.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v7.4s, v13.4s, v4.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v8.4s, v11.4s, v0.4s\n"
- "add x23, x23, #4\n"
- "fmla v9.4s, v14.4s, v0.4s\n"
- "fmla v7.4s, v14.4s, v5.4s\n"
- "str s8, [x28, x24]\n"
- "ldr x28, [%[outptrs], 24]\n"
- "str s9, [x21, x24]\n"
- "fmla v7.4s, v17.4s, v1.4s\n"
- "fmla v7.4s, v16.4s, v3.4s\n"
- "fmla v7.4s, v15.4s, v0.4s\n"
- "str s7, [x28, x24]\n"
- "add x24, x24, #4\n"
- "7:\n"  // Common exit for both the vector and scalar paths.
- : [wbptr] "+r" (weight_bias_ptr)  // Read-write operand: the asm advances wbptr past every parameter block it consumes.
- : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs)  // The pointer tables are only read through these operands; n_channels is cast to long before being bound to a register.
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"  // Covers every vector and general register named explicitly in the asm text, plus the flags and memory.
- );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x24, %[inptr0], %[input_row_stride]\n"
- "add x27, %[input_col_stride1], %[input_col_stride1]\n"
- "add x19, %[outptr0], %[output_row_stride]\n"
- "add x25, x24, %[input_row_stride]\n"
- "add x23, x27, %[input_col_stride1]\n"
- "and x20, %[n_channels], #3\n"
- "add x28, x25, %[input_row_stride]\n"
- "add x22, x23, %[input_col_stride1]\n"
- "lsr x21, %[n_channels], #2\n"
- "add x26, x28, %[input_row_stride]\n"
- "cbz x21, 4f\n"
- "1:\n"
- "ldr q16, [%[wbptr]]\n"
- "subs x21, x21, #1\n"
- "mov v3.16b, v16.16b\n"
- "ldr q4, [%[wbptr], #16]\n"
- "mov v1.16b, v16.16b\n"
- "ldr q5, [%[wbptr], #32]\n"
- "mov v2.16b, v16.16b\n"
- "ldr q12, [%[wbptr], #48]\n"
- "mov v0.16b, v16.16b\n"
- "ldr q11, [%[wbptr], #64]\n"
- "ldr q10, [%[wbptr], #80]\n"
- "ldr q6, [%[wbptr], #96]\n"
- "ldr q9, [%[wbptr], #112]\n"
- "ldr q8, [%[wbptr], #128]\n"
- "ldr q7, [%[wbptr], #144]\n"
- "ldr q21, [%[inptr0]]\n"
- "fmla v3.4s, v21.4s, v4.4s\n"
- "ldr q23, [x24]\n"
- "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
- "ldr q14, [x25]\n"
- "fmla v1.4s, v14.4s, v4.4s\n"
- "ldr q13, [x24, %[input_col_stride1]]\n"
- "fmla v3.4s, v23.4s, v11.4s\n"
- "ldr q18, [%[inptr0], x27]\n"
- "ldr q15, [x28]\n"
- "ldr q22, [x25, %[input_col_stride1]]\n"
- "fmla v3.4s, v19.4s, v5.4s\n"
- "fmla v3.4s, v14.4s, v9.4s\n"
- "beq 3f\n"
- "2:\n"
- "fmla v3.4s, v13.4s, v10.4s\n"
- "ldr q17, [x24, x27]\n"
- "fmla v2.4s, v18.4s, v4.4s\n"
- "ldr q20, [%[inptr0], x23]\n"
- "fmla v1.4s, v15.4s, v11.4s\n"
- "ldr q19, [x26]\n"
- "fmla v3.4s, v18.4s, v12.4s\n"
- "ldr q13, [x28, %[input_col_stride1]]\n"
- "fmla v2.4s, v17.4s, v11.4s\n"
- "ldr q14, [x25, x27]\n"
- "fmla v1.4s, v22.4s, v5.4s\n"
- "ldr q15, [x24, x23]\n"
- "fmla v3.4s, v22.4s, v8.4s\n"
- "ldr q16, [%[inptr0], x22]\n"
- "fmla v2.4s, v20.4s, v5.4s\n"
- "ldr q20, [x26, %[input_col_stride1]]\n"
- "fmla v1.4s, v19.4s, v9.4s\n"
- "ldr q19, [x28, x27]\n"
- "fmla v3.4s, v17.4s, v6.4s\n"
- "ldr q21, [x25, x23]\n"
- "fmla v2.4s, v14.4s, v9.4s\n"
- "ldr q22, [x24, x22]\n"
- "fmla v1.4s, v13.4s, v10.4s\n"
- "ldr q23, [x26, x27]\n"
- "fmla v3.4s, v14.4s, v7.4s\n"
- "ldr q18, [x28, x23]\n"
- "fmla v0.4s, v14.4s, v4.4s\n"
- "ldr q13, [x25, x22]\n"
- "fmla v1.4s, v14.4s, v12.4s\n"
- "ldr q14, [x26, x23]\n"
- "fmla v2.4s, v15.4s, v10.4s\n"
- "ldr q17, [x28, x22]\n"
- "fmla v0.4s, v19.4s, v11.4s\n"
- "ldr q15, [x26, x22]\n"
- "fmla v1.4s, v20.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v2.4s, v16.4s, v12.4s\n"
- "ldr q16, [%[wbptr]]\n"
- "fmla v0.4s, v21.4s, v5.4s\n"
- "ldr q4, [%[wbptr], #16]\n"
- "fmla v1.4s, v19.4s, v6.4s\n"
- "ldr q11, [%[wbptr], #64]\n"
- "fmla v2.4s, v21.4s, v8.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v23.4s, v9.4s\n"
- "ldr q5, [%[wbptr], #32]\n"
- "fmla v1.4s, v23.4s, v7.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v2.4s, v22.4s, v6.4s\n"
- "ldr q21, [%[inptr0]]\n"
- "fmla v0.4s, v18.4s, v10.4s\n"
- "ldr q9, [%[wbptr], #112]\n"
- "movi v20.16b, #0\n"
- "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v13.4s, v7.4s\n"
- "ldr q18, [%[inptr0], x27]\n"
- "fmla v0.4s, v13.4s, v12.4s\n"
- "ldr q10, [%[wbptr], #80]\n"
- "fmax v3.4s, v3.4s, v20.4s\n"
- "add x24, x24, #16\n"
- "fmax v2.4s, v2.4s, v20.4s\n"
- "ldr q23, [x24]\n"
- "str q3, [%[outptr0]]\n"
- "fmla v0.4s, v14.4s, v8.4s\n"
- "str q2, [%[outptr0], %[output_col_stride1]]\n"
- "fmax v1.4s, v1.4s, v20.4s\n"
- "mov v3.16b, v16.16b\n"
- "ldr q12, [%[wbptr], #48]\n"
- "str q1, [x19]\n"
- "fmla v0.4s, v17.4s, v6.4s\n"
- "mov v1.16b, v16.16b\n"
- "ldr q8, [%[wbptr], #128]\n"
- "mov v2.16b, v16.16b\n"
- "ldr q13, [x24, %[input_col_stride1]]\n"
- "fmla v0.4s, v15.4s, v7.4s\n"
- "ldr q6, [%[wbptr], #96]\n"
- "fmla v3.4s, v21.4s, v4.4s\n"
- "add x25, x25, #16\n"
- "ldr q14, [x25]\n"
- "add x28, x28, #16\n"
- "fmax v0.4s, v0.4s, v20.4s\n"
- "ldr q7, [%[wbptr], #144]\n"
- "fmla v3.4s, v23.4s, v11.4s\n"
- "ldr q15, [x28]\n"
- "str q0, [x19, %[output_col_stride1]]\n"
- "fmla v1.4s, v14.4s, v4.4s\n"
- "mov v0.16b, v16.16b\n"
- "ldr q22, [x25, %[input_col_stride1]]\n"
- "fmla v3.4s, v19.4s, v5.4s\n"
- "add x26, x26, #16\n"
- "add %[outptr0], %[outptr0], #16\n"
- "add x19, x19, #16\n"
- "subs x21, x21, #1\n"
- "fmla v3.4s, v14.4s, v9.4s\n"
- "bne 2b\n"
- "3:\n"
- "fmla v3.4s, v13.4s, v10.4s\n"
- "ldr q17, [x24, x27]\n"
- "fmla v2.4s, v18.4s, v4.4s\n"
- "ldr q20, [%[inptr0], x23]\n"
- "fmla v1.4s, v15.4s, v11.4s\n"
- "ldr q19, [x26]\n"
- "fmla v3.4s, v18.4s, v12.4s\n"
- "ldr q13, [x28, %[input_col_stride1]]\n"
- "fmla v2.4s, v17.4s, v11.4s\n"
- "ldr q14, [x25, x27]\n"
- "fmla v1.4s, v22.4s, v5.4s\n"
- "ldr q15, [x24, x23]\n"
- "fmla v3.4s, v22.4s, v8.4s\n"
- "ldr q16, [%[inptr0], x22]\n"
- "fmla v2.4s, v20.4s, v5.4s\n"
- "ldr q20, [x26, %[input_col_stride1]]\n"
- "fmla v1.4s, v19.4s, v9.4s\n"
- "ldr q19, [x28, x27]\n"
- "fmla v3.4s, v17.4s, v6.4s\n"
- "ldr q21, [x25, x23]\n"
- "fmla v2.4s, v14.4s, v9.4s\n"
- "ldr q22, [x24, x22]\n"
- "fmla v1.4s, v13.4s, v10.4s\n"
- "ldr q23, [x26, x27]\n"
- "fmla v3.4s, v14.4s, v7.4s\n"
- "ldr q18, [x28, x23]\n"
- "fmla v0.4s, v14.4s, v4.4s\n"
- "ldr q13, [x25, x22]\n"
- "fmla v1.4s, v14.4s, v12.4s\n"
- "ldr q14, [x26, x23]\n"
- "fmla v2.4s, v15.4s, v10.4s\n"
- "ldr q17, [x28, x22]\n"
- "fmla v0.4s, v19.4s, v11.4s\n"
- "ldr q15, [x26, x22]\n"
- "fmla v1.4s, v20.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v2.4s, v16.4s, v12.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v21.4s, v5.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v1.4s, v19.4s, v6.4s\n"
- "add x24, x24, #16\n"
- "fmla v2.4s, v21.4s, v8.4s\n"
- "add x25, x25, #16\n"
- "fmla v0.4s, v23.4s, v9.4s\n"
- "add x28, x28, #16\n"
- "fmla v1.4s, v23.4s, v7.4s\n"
- "add x26, x26, #16\n"
- "fmla v2.4s, v22.4s, v6.4s\n"
- "movi v20.16b, #0\n"
- "fmla v0.4s, v18.4s, v10.4s\n"
- "fmax v3.4s, v3.4s, v20.4s\n"
- "fmla v2.4s, v13.4s, v7.4s\n"
- "fmax v1.4s, v1.4s, v20.4s\n"
- "str q3, [%[outptr0]]\n"
- "fmla v0.4s, v13.4s, v12.4s\n"
- "str q1, [x19]\n"
- "fmax v2.4s, v2.4s, v20.4s\n"
- "fmla v0.4s, v14.4s, v8.4s\n"
- "str q2, [%[outptr0], %[output_col_stride1]]\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v0.4s, v17.4s, v6.4s\n"
- "fmla v0.4s, v15.4s, v7.4s\n"
- "fmax v0.4s, v0.4s, v20.4s\n"
- "str q0, [x19, %[output_col_stride1]]\n"
- "add x19, x19, #16\n"
- "4:\n"
- "cbz x20, 7f\n"
- "ldr s16, [%[wbptr]]\n"
- "mov v3.16b, v16.16b\n"
- "ldr s4, [%[wbptr], #4]\n"
- "mov v1.16b, v16.16b\n"
- "ldr s5, [%[wbptr], #8]\n"
- "mov v2.16b, v16.16b\n"
- "ldr s12, [%[wbptr], #12]\n"
- "mov v0.16b, v16.16b\n"
- "ldr s11, [%[wbptr], #16]\n"
- "ldr s10, [%[wbptr], #20]\n"
- "subs x20, x20, #1\n"
- "ldr s6, [%[wbptr], #24]\n"
- "ldr s9, [%[wbptr], #28]\n"
- "ldr s8, [%[wbptr], #32]\n"
- "ldr s7, [%[wbptr], #36]\n"
- "ldr s21, [%[inptr0]]\n"
- "ldr s23, [x24]\n"
- "fmla v3.4s, v21.4s, v4.4s\n"
- "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
- "ldr s14, [x25]\n"
- "ldr s13, [x24, %[input_col_stride1]]\n"
- "fmla v1.4s, v14.4s, v4.4s\n"
- "ldr s18, [%[inptr0], x27]\n"
- "fmla v3.4s, v23.4s, v11.4s\n"
- "ldr s15, [x28]\n"
- "ldr s22, [x25, %[input_col_stride1]]\n"
- "fmla v3.4s, v19.4s, v5.4s\n"
- "fmla v3.4s, v14.4s, v9.4s\n"
- "beq 6f\n"
- "5:\n"
- "fmla v3.4s, v13.4s, v10.4s\n"
- "ldr s17, [x24, x27]\n"
- "fmla v2.4s, v18.4s, v4.4s\n"
- "ldr s20, [%[inptr0], x23]\n"
- "fmla v1.4s, v15.4s, v11.4s\n"
- "ldr s19, [x26]\n"
- "fmla v3.4s, v18.4s, v12.4s\n"
- "ldr s13, [x28, %[input_col_stride1]]\n"
- "fmla v2.4s, v17.4s, v11.4s\n"
- "ldr s14, [x25, x27]\n"
- "fmla v1.4s, v22.4s, v5.4s\n"
- "ldr s15, [x24, x23]\n"
- "fmla v3.4s, v22.4s, v8.4s\n"
- "ldr s16, [%[inptr0], x22]\n"
- "fmla v2.4s, v20.4s, v5.4s\n"
- "ldr s20, [x26, %[input_col_stride1]]\n"
- "fmla v1.4s, v19.4s, v9.4s\n"
- "ldr s19, [x28, x27]\n"
- "fmla v3.4s, v17.4s, v6.4s\n"
- "ldr s21, [x25, x23]\n"
- "fmla v2.4s, v14.4s, v9.4s\n"
- "ldr s22, [x24, x22]\n"
- "fmla v1.4s, v13.4s, v10.4s\n"
- "ldr s23, [x26, x27]\n"
- "fmla v3.4s, v14.4s, v7.4s\n"
- "ldr s18, [x28, x23]\n"
- "fmla v0.4s, v14.4s, v4.4s\n"
- "ldr s13, [x25, x22]\n"
- "fmla v1.4s, v14.4s, v12.4s\n"
- "ldr s14, [x26, x23]\n"
- "fmla v2.4s, v15.4s, v10.4s\n"
- "ldr s17, [x28, x22]\n"
- "fmla v0.4s, v19.4s, v11.4s\n"
- "ldr s15, [x26, x22]\n"
- "fmla v1.4s, v20.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v2.4s, v16.4s, v12.4s\n"
- "ldr s16, [%[wbptr]]\n"
- "fmla v0.4s, v21.4s, v5.4s\n"
- "ldr s4, [%[wbptr], #4]\n"
- "fmla v1.4s, v19.4s, v6.4s\n"
- "ldr s11, [%[wbptr], #16]\n"
- "fmla v2.4s, v21.4s, v8.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v23.4s, v9.4s\n"
- "ldr s5, [%[wbptr], #8]\n"
- "fmla v1.4s, v23.4s, v7.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v2.4s, v22.4s, v6.4s\n"
- "ldr s21, [%[inptr0]]\n"
- "fmla v0.4s, v18.4s, v10.4s\n"
- "ldr s9, [%[wbptr], #28]\n"
- "movi v20.16b, #0\n"
- "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v13.4s, v7.4s\n"
- "ldr s18, [%[inptr0], x27]\n"
- "fmla v0.4s, v13.4s, v12.4s\n"
- "ldr s10, [%[wbptr], #20]\n"
- "fmax v3.4s, v3.4s, v20.4s\n"
- "add x24, x24, #4\n"
- "fmax v2.4s, v2.4s, v20.4s\n"
- "ldr s23, [x24]\n"
- "str s3, [%[outptr0]]\n"
- "fmla v0.4s, v14.4s, v8.4s\n"
- "str s2, [%[outptr0], %[output_col_stride1]]\n"
- "fmax v1.4s, v1.4s, v20.4s\n"
- "mov v3.16b, v16.16b\n"
- "ldr s12, [%[wbptr], #12]\n"
- "str s1, [x19]\n"
- "fmla v0.4s, v17.4s, v6.4s\n"
- "mov v1.16b, v16.16b\n"
- "ldr s8, [%[wbptr], #32]\n"
- "mov v2.16b, v16.16b\n"
- "ldr s13, [x24, %[input_col_stride1]]\n"
- "fmla v0.4s, v15.4s, v7.4s\n"
- "ldr s6, [%[wbptr], #24]\n"
- "fmla v3.4s, v21.4s, v4.4s\n"
- "add x25, x25, #4\n"
- "ldr s14, [x25]\n"
- "add x28, x28, #4\n"
- "fmax v0.4s, v0.4s, v20.4s\n"
- "ldr s7, [%[wbptr], #36]\n"
- "fmla v3.4s, v23.4s, v11.4s\n"
- "ldr s15, [x28]\n"
- "str s0, [x19, %[output_col_stride1]]\n"
- "fmla v1.4s, v14.4s, v4.4s\n"
- "mov v0.16b, v16.16b\n"
- "ldr s22, [x25, %[input_col_stride1]]\n"
- "fmla v3.4s, v19.4s, v5.4s\n"
- "add x26, x26, #4\n"
- "add %[outptr0], %[outptr0], #4\n"
- "add x19, x19, #4\n"
- "subs x20, x20, #1\n"
- "fmla v3.4s, v14.4s, v9.4s\n"
- "bne 5b\n"
- "6:\n"
- "fmla v3.4s, v13.4s, v10.4s\n"
- "ldr s17, [x24, x27]\n"
- "fmla v2.4s, v18.4s, v4.4s\n"
- "ldr s20, [%[inptr0], x23]\n"
- "fmla v1.4s, v15.4s, v11.4s\n"
- "ldr s19, [x26]\n"
- "fmla v3.4s, v18.4s, v12.4s\n"
- "ldr s13, [x28, %[input_col_stride1]]\n"
- "fmla v2.4s, v17.4s, v11.4s\n"
- "ldr s14, [x25, x27]\n"
- "fmla v1.4s, v22.4s, v5.4s\n"
- "ldr s15, [x24, x23]\n"
- "fmla v3.4s, v22.4s, v8.4s\n"
- "ldr s16, [%[inptr0], x22]\n"
- "fmla v2.4s, v20.4s, v5.4s\n"
- "ldr s20, [x26, %[input_col_stride1]]\n"
- "fmla v1.4s, v19.4s, v9.4s\n"
- "ldr s19, [x28, x27]\n"
- "fmla v3.4s, v17.4s, v6.4s\n"
- "ldr s21, [x25, x23]\n"
- "fmla v2.4s, v14.4s, v9.4s\n"
- "ldr s22, [x24, x22]\n"
- "fmla v1.4s, v13.4s, v10.4s\n"
- "ldr s23, [x26, x27]\n"
- "fmla v3.4s, v14.4s, v7.4s\n"
- "ldr s18, [x28, x23]\n"
- "fmla v0.4s, v14.4s, v4.4s\n"
- "ldr s13, [x25, x22]\n"
- "fmla v1.4s, v14.4s, v12.4s\n"
- "ldr s14, [x26, x23]\n"
- "fmla v2.4s, v15.4s, v10.4s\n"
- "ldr s17, [x28, x22]\n"
- "fmla v0.4s, v19.4s, v11.4s\n"
- "ldr s15, [x26, x22]\n"
- "fmla v1.4s, v20.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v2.4s, v16.4s, v12.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v21.4s, v5.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v1.4s, v19.4s, v6.4s\n"
- "add x24, x24, #4\n"
- "fmla v2.4s, v21.4s, v8.4s\n"
- "add x25, x25, #4\n"
- "fmla v0.4s, v23.4s, v9.4s\n"
- "add x28, x28, #4\n"
- "fmla v1.4s, v23.4s, v7.4s\n"
- "add x26, x26, #4\n"
- "fmla v2.4s, v22.4s, v6.4s\n"
- "movi v20.16b, #0\n"
- "fmla v0.4s, v18.4s, v10.4s\n"
- "fmax v3.4s, v3.4s, v20.4s\n"
- "fmla v2.4s, v13.4s, v7.4s\n"
- "fmax v1.4s, v1.4s, v20.4s\n"
- "str s3, [%[outptr0]]\n"
- "fmla v0.4s, v13.4s, v12.4s\n"
- "str s1, [x19]\n"
- "fmax v2.4s, v2.4s, v20.4s\n"
- "fmla v0.4s, v14.4s, v8.4s\n"
- "str s2, [%[outptr0], %[output_col_stride1]]\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v0.4s, v17.4s, v6.4s\n"
- "fmla v0.4s, v15.4s, v7.4s\n"
- "fmax v0.4s, v0.4s, v20.4s\n"
- "str s0, [x19, %[output_col_stride1]]\n"
- "add x19, x19, #4\n"
- "7:\n"
- : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
- : [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU>( // Pointer-array variant: for each channel, accumulates four outputs from 25 input pointers, clamps them at zero (fmax against 0) and stores them through four output pointers.
- int n_channels, // Channel count; handled as (n_channels >> 2) four-channel vector steps followed by (n_channels & 3) single-channel steps.
- const void *weight_bias_ptr, // Packed parameters: each step reads one value that seeds all four accumulators, then nine multipliers (presumably bias followed by 3x3 weights -- confirm against the packing code).
- const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols], // The asm reads pointers at byte offsets 0..192 in steps of 8, 40 bytes per row (assumes a 5x5 array of 8-byte pointers -- TODO confirm against Base).
- float *outptrs[Base::output_tile_rows][Base::output_tile_cols] // The asm reads pointers at byte offsets 0, 8, 16 and 24 (assumes a 2x2 array of 8-byte pointers -- TODO confirm against Base).
-)
-{
- __asm __volatile(
- "mov x22, xzr\n" // x22: running byte offset added to every input pointer
- "mov x26, xzr\n" // x26: running byte offset added to every output pointer
- "and x23, %[n_channels], #3\n" // x23: channels left over after the groups of four
- "lsr x24, %[n_channels], #2\n" // x24: number of four-channel (128-bit) iterations
- "cbz x24, 4f\n" // fewer than four channels: go straight to the single-channel path
- "1:\n" // Vector prologue: load the parameters and the first inputs
- "ldr q14, [%[wbptr]]\n" // seed value, copied below into accumulators v3, v1, v2 and v0
- "ldr x19, [%[inptrs], 0]\n"
- "mov v3.16b, v14.16b\n"
- "ldr q13, [%[wbptr], #16]\n"
- "mov v1.16b, v14.16b\n"
- "ldr q11, [%[wbptr], #32]\n"
- "mov v2.16b, v14.16b\n"
- "ldr q4, [%[wbptr], #48]\n"
- "mov v0.16b, v14.16b\n"
- "ldr q12, [%[wbptr], #64]\n"
- "ldr q9, [%[wbptr], #80]\n"
- "ldr x20, [%[inptrs], 40]\n"
- "ldr q8, [%[wbptr], #96]\n"
- "ldr x21, [%[inptrs], 80]\n"
- "ldr q7, [%[wbptr], #112]\n"
- "ldr x25, [%[inptrs], 120]\n"
- "ldr q6, [%[wbptr], #128]\n"
- "subs x24, x24, #1\n" // count this iteration; the flags are consumed by "beq 3f" below
- "ldr q5, [%[wbptr], #144]\n"
- "ldr q15, [x19, x22]\n"
- "fmla v3.4s, v15.4s, v13.4s\n"
- "ldr q17, [x20, x22]\n"
- "ldr q16, [x21, x22]\n"
- "ldr x19, [%[inptrs], 8]\n"
- "ldr q15, [x25, x22]\n"
- "ldr x20, [%[inptrs], 48]\n"
- "ldr q10, [x19, x22]\n"
- "ldr x21, [%[inptrs], 88]\n"
- "fmla v3.4s, v17.4s, v12.4s\n"
- "ldr q17, [x20, x22]\n"
- "ldr q14, [x21, x22]\n" // v14 now holds input data; the seed has already been copied out
- "ldr x19, [%[inptrs], 16]\n"
- "ldr q18, [x19, x22]\n"
- "fmla v3.4s, v10.4s, v11.4s\n"
- "fmla v3.4s, v16.4s, v7.4s\n"
- "beq 3f\n" // that was the last vector iteration: skip the loop body and finish in the tail
- "2:\n" // Vector loop: finish the current four channels while preloading the next four
- "fmla v1.4s, v16.4s, v13.4s\n"
- "ldr x20, [%[inptrs], 56]\n"
- "fmla v3.4s, v17.4s, v9.4s\n"
- "ldr x19, [%[inptrs], 24]\n"
- "fmla v2.4s, v18.4s, v13.4s\n"
- "ldr q16, [x20, x22]\n"
- "movi v10.16b, #0\n" // v10 = 0.0, the lower bound used by the fmax instructions below
- "ldr q17, [x19, x22]\n"
- "fmla v1.4s, v15.4s, v12.4s\n"
- "ldr x27, [%[inptrs], 160]\n"
- "fmla v3.4s, v18.4s, v4.4s\n"
- "ldr x25, [%[inptrs], 128]\n"
- "fmla v2.4s, v16.4s, v12.4s\n"
- "ldr q18, [x27, x22]\n"
- "ldr q15, [x25, x22]\n"
- "ldr x21, [%[inptrs], 96]\n"
- "fmla v1.4s, v14.4s, v11.4s\n"
- "ldr x20, [%[inptrs], 64]\n"
- "fmla v3.4s, v14.4s, v6.4s\n"
- "ldr q14, [x21, x22]\n"
- "fmla v2.4s, v17.4s, v11.4s\n"
- "ldr q17, [x20, x22]\n"
- "fmla v0.4s, v14.4s, v13.4s\n"
- "ldr x19, [%[inptrs], 32]\n"
- "fmla v1.4s, v18.4s, v7.4s\n"
- "ldr x27, [%[inptrs], 168]\n"
- "fmla v3.4s, v16.4s, v8.4s\n"
- "ldr q18, [x19, x22]\n"
- "fmla v2.4s, v14.4s, v7.4s\n"
- "ldr q13, [x27, x22]\n" // v13 is reused for input data from here; it is reloaded from wbptr before the next iteration
- "ldr x25, [%[inptrs], 136]\n"
- "ldr x21, [%[inptrs], 104]\n"
- "ldr x20, [%[inptrs], 72]\n"
- "fmla v1.4s, v15.4s, v9.4s\n"
- "ldr x27, [%[inptrs], 176]\n"
- "fmla v3.4s, v14.4s, v5.4s\n"
- "ldr q16, [x25, x22]\n"
- "fmla v2.4s, v17.4s, v9.4s\n"
- "ldr q17, [x21, x22]\n"
- "fmla v0.4s, v16.4s, v12.4s\n"
- "ldr q12, [x20, x22]\n" // v12 is reused for input data from here
- "fmla v1.4s, v14.4s, v4.4s\n"
- "ldr q15, [x27, x22]\n"
- "fmax v3.4s, v3.4s, v10.4s\n" // clamp accumulator v3 at zero
- "ldr x25, [%[inptrs], 144]\n"
- "fmla v2.4s, v18.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 112]\n"
- "fmla v0.4s, v17.4s, v11.4s\n"
- "ldr q14, [x25, x22]\n"
- "fmla v1.4s, v13.4s, v6.4s\n"
- "ldr q11, [x21, x22]\n" // v11 is reused for input data from here
- "ldr x27, [%[inptrs], 184]\n"
- "ldr x25, [%[inptrs], 152]\n"
- "ldr x21, [%[outptrs], 0]\n"
- "fmla v2.4s, v17.4s, v6.4s\n"
- "ldr x28, [%[outptrs], 16]\n"
- "str q3, [x21, x26]\n" // store v3 through the output pointer at byte offset 0
- "fmla v0.4s, v15.4s, v7.4s\n"
- "fmla v1.4s, v16.4s, v8.4s\n"
- "ldr q18, [x27, x22]\n"
- "ldr q17, [x25, x22]\n"
- "ldr x27, [%[inptrs], 192]\n"
- "fmla v2.4s, v12.4s, v8.4s\n"
- "ldr x21, [%[outptrs], 8]\n"
- "fmla v0.4s, v14.4s, v9.4s\n"
- "ldr q16, [x27, x22]\n"
- "fmla v1.4s, v15.4s, v5.4s\n"
- "add %[wbptr], %[wbptr], #160\n" // next parameter group: 10 vectors of 16 bytes
- "ldr q14, [%[wbptr]]\n" // seed value for the next four channels
- "add x22, x22, #16\n" // advance the input offset by four floats
- "fmla v2.4s, v11.4s, v5.4s\n"
- "ldr q13, [%[wbptr], #16]\n"
- "fmla v0.4s, v11.4s, v4.4s\n"
- "ldr q11, [%[wbptr], #32]\n"
- "fmax v1.4s, v1.4s, v10.4s\n" // clamp accumulator v1 at zero
- "ldr q12, [%[wbptr], #64]\n"
- "mov v3.16b, v14.16b\n"
- "ldr q9, [%[wbptr], #80]\n"
- "fmax v2.4s, v2.4s, v10.4s\n" // clamp accumulator v2 at zero
- "ldr q7, [%[wbptr], #112]\n"
- "str q1, [x28, x26]\n" // store v1 through the output pointer at byte offset 16
- "fmla v0.4s, v18.4s, v6.4s\n"
- "mov v1.16b, v14.16b\n"
- "ldr q4, [%[wbptr], #48]\n"
- "str q2, [x21, x26]\n" // store v2 through the output pointer at byte offset 8
- "ldr x28, [%[outptrs], 24]\n"
- "mov v2.16b, v14.16b\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v17.4s, v8.4s\n"
- "ldr q6, [%[wbptr], #128]\n"
- "ldr x19, [%[inptrs], 0]\n"
- "ldr x20, [%[inptrs], 40]\n"
- "ldr x21, [%[inptrs], 80]\n"
- "ldr x25, [%[inptrs], 120]\n"
- "subs x24, x24, #1\n" // loop counter; the flags are consumed by "bne 2b" below
- "ldr q15, [x19, x22]\n"
- "fmla v0.4s, v16.4s, v5.4s\n"
- "ldr q8, [%[wbptr], #96]\n"
- "fmla v3.4s, v15.4s, v13.4s\n"
- "ldr q17, [x20, x22]\n"
- "ldr q16, [x21, x22]\n"
- "ldr x19, [%[inptrs], 8]\n"
- "ldr q15, [x25, x22]\n"
- "ldr x20, [%[inptrs], 48]\n"
- "fmax v0.4s, v0.4s, v10.4s\n" // clamp accumulator v0 at zero (last use of v10 as zero before it is reloaded)
- "ldr q5, [%[wbptr], #144]\n"
- "fmla v3.4s, v17.4s, v12.4s\n"
- "ldr q10, [x19, x22]\n"
- "ldr q17, [x20, x22]\n"
- "ldr x19, [%[inptrs], 16]\n"
- "str q0, [x28, x26]\n" // store v0 through the output pointer at byte offset 24
- "ldr x21, [%[inptrs], 88]\n"
- "mov v0.16b, v14.16b\n"
- "ldr q18, [x19, x22]\n"
- "fmla v3.4s, v10.4s, v11.4s\n"
- "ldr q14, [x21, x22]\n"
- "add x26, x26, #16\n" // advance the output offset by four floats
- "fmla v3.4s, v16.4s, v7.4s\n"
- "bne 2b\n"
- "3:\n" // Vector tail: finish the final four channels without preloading another group
- "fmla v1.4s, v16.4s, v13.4s\n"
- "ldr x20, [%[inptrs], 56]\n"
- "fmla v3.4s, v17.4s, v9.4s\n"
- "ldr x19, [%[inptrs], 24]\n"
- "fmla v2.4s, v18.4s, v13.4s\n"
- "ldr q16, [x20, x22]\n"
- "movi v10.16b, #0\n" // v10 = 0.0 for the fmax clamps below
- "ldr q17, [x19, x22]\n"
- "fmla v1.4s, v15.4s, v12.4s\n"
- "ldr x27, [%[inptrs], 160]\n"
- "fmla v3.4s, v18.4s, v4.4s\n"
- "ldr x25, [%[inptrs], 128]\n"
- "fmla v2.4s, v16.4s, v12.4s\n"
- "ldr q18, [x27, x22]\n"
- "ldr q15, [x25, x22]\n"
- "ldr x21, [%[inptrs], 96]\n"
- "fmla v1.4s, v14.4s, v11.4s\n"
- "ldr x20, [%[inptrs], 64]\n"
- "fmla v3.4s, v14.4s, v6.4s\n"
- "ldr q14, [x21, x22]\n"
- "fmla v2.4s, v17.4s, v11.4s\n"
- "ldr q17, [x20, x22]\n"
- "fmla v0.4s, v14.4s, v13.4s\n"
- "ldr x19, [%[inptrs], 32]\n"
- "fmla v1.4s, v18.4s, v7.4s\n"
- "ldr x27, [%[inptrs], 168]\n"
- "fmla v3.4s, v16.4s, v8.4s\n"
- "ldr q18, [x19, x22]\n"
- "fmla v2.4s, v14.4s, v7.4s\n"
- "ldr q13, [x27, x22]\n"
- "ldr x25, [%[inptrs], 136]\n"
- "ldr x21, [%[inptrs], 104]\n"
- "ldr x20, [%[inptrs], 72]\n"
- "fmla v1.4s, v15.4s, v9.4s\n"
- "ldr x27, [%[inptrs], 176]\n"
- "fmla v3.4s, v14.4s, v5.4s\n"
- "ldr q16, [x25, x22]\n"
- "fmla v2.4s, v17.4s, v9.4s\n"
- "ldr q17, [x21, x22]\n"
- "fmla v0.4s, v16.4s, v12.4s\n"
- "ldr q12, [x20, x22]\n"
- "fmla v1.4s, v14.4s, v4.4s\n"
- "ldr q15, [x27, x22]\n"
- "fmax v3.4s, v3.4s, v10.4s\n"
- "ldr x25, [%[inptrs], 144]\n"
- "fmla v2.4s, v18.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 112]\n"
- "fmla v0.4s, v17.4s, v11.4s\n"
- "ldr q14, [x25, x22]\n"
- "fmla v1.4s, v13.4s, v6.4s\n"
- "ldr q11, [x21, x22]\n"
- "ldr x27, [%[inptrs], 184]\n"
- "ldr x25, [%[inptrs], 152]\n"
- "ldr x21, [%[outptrs], 0]\n"
- "fmla v2.4s, v17.4s, v6.4s\n"
- "ldr x28, [%[outptrs], 16]\n"
- "str q3, [x21, x26]\n"
- "fmla v0.4s, v15.4s, v7.4s\n"
- "fmla v1.4s, v16.4s, v8.4s\n"
- "ldr q18, [x27, x22]\n"
- "ldr q17, [x25, x22]\n"
- "ldr x27, [%[inptrs], 192]\n"
- "fmla v2.4s, v12.4s, v8.4s\n"
- "ldr x21, [%[outptrs], 8]\n"
- "fmla v0.4s, v14.4s, v9.4s\n"
- "ldr q16, [x27, x22]\n"
- "fmla v1.4s, v15.4s, v5.4s\n"
- "add %[wbptr], %[wbptr], #160\n" // leave wbptr just past the last vector group, where the single-channel parameters are read from
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "add x22, x22, #16\n"
- "fmla v2.4s, v11.4s, v5.4s\n"
- "fmla v0.4s, v11.4s, v4.4s\n"
- "fmax v1.4s, v1.4s, v10.4s\n"
- "fmax v2.4s, v2.4s, v10.4s\n"
- "str q1, [x28, x26]\n"
- "fmla v0.4s, v18.4s, v6.4s\n"
- "ldr x28, [%[outptrs], 24]\n"
- "str q2, [x21, x26]\n"
- "fmla v0.4s, v17.4s, v8.4s\n"
- "fmla v0.4s, v16.4s, v5.4s\n"
- "fmax v0.4s, v0.4s, v10.4s\n"
- "str q0, [x28, x26]\n"
- "add x26, x26, #16\n"
- "4:\n" // Single-channel path: same schedule as above, using 32-bit "s" loads/stores and 4-byte steps
- "cbz x23, 7f\n" // no leftover channels: done
- "ldr s14, [%[wbptr]]\n" // seed value for one channel
- "mov v3.16b, v14.16b\n"
- "ldr s13, [%[wbptr], #4]\n"
- "mov v1.16b, v14.16b\n"
- "ldr s11, [%[wbptr], #8]\n"
- "mov v2.16b, v14.16b\n"
- "ldr s4, [%[wbptr], #12]\n"
- "mov v0.16b, v14.16b\n"
- "ldr s12, [%[wbptr], #16]\n"
- "ldr s9, [%[wbptr], #20]\n"
- "ldr x19, [%[inptrs], 0]\n"
- "ldr s8, [%[wbptr], #24]\n"
- "ldr x20, [%[inptrs], 40]\n"
- "ldr s7, [%[wbptr], #28]\n"
- "ldr x21, [%[inptrs], 80]\n"
- "ldr s6, [%[wbptr], #32]\n"
- "ldr x25, [%[inptrs], 120]\n"
- "ldr s5, [%[wbptr], #36]\n"
- "subs x23, x23, #1\n" // count this channel; the flags are consumed by "beq 6f" below
- "ldr s15, [x19, x22]\n"
- "ldr s17, [x20, x22]\n"
- "fmla v3.4s, v15.4s, v13.4s\n"
- "ldr s16, [x21, x22]\n"
- "ldr s15, [x25, x22]\n"
- "ldr x19, [%[inptrs], 8]\n"
- "ldr x20, [%[inptrs], 48]\n"
- "ldr x21, [%[inptrs], 88]\n"
- "ldr s10, [x19, x22]\n"
- "fmla v3.4s, v17.4s, v12.4s\n"
- "ldr s17, [x20, x22]\n"
- "ldr s14, [x21, x22]\n"
- "ldr x19, [%[inptrs], 16]\n"
- "ldr s18, [x19, x22]\n"
- "fmla v3.4s, v10.4s, v11.4s\n"
- "fmla v3.4s, v16.4s, v7.4s\n"
- "beq 6f\n" // that was the last leftover channel: finish in the tail
- "5:\n" // Single-channel loop: finish the current channel while preloading the next
- "fmla v1.4s, v16.4s, v13.4s\n"
- "ldr x20, [%[inptrs], 56]\n"
- "fmla v3.4s, v17.4s, v9.4s\n"
- "ldr x19, [%[inptrs], 24]\n"
- "fmla v2.4s, v18.4s, v13.4s\n"
- "ldr s16, [x20, x22]\n"
- "movi v10.16b, #0\n" // v10 = 0.0 for the fmax clamps below
- "ldr s17, [x19, x22]\n"
- "fmla v1.4s, v15.4s, v12.4s\n"
- "ldr x27, [%[inptrs], 160]\n"
- "fmla v3.4s, v18.4s, v4.4s\n"
- "ldr x25, [%[inptrs], 128]\n"
- "fmla v2.4s, v16.4s, v12.4s\n"
- "ldr s18, [x27, x22]\n"
- "ldr s15, [x25, x22]\n"
- "ldr x21, [%[inptrs], 96]\n"
- "fmla v1.4s, v14.4s, v11.4s\n"
- "ldr x20, [%[inptrs], 64]\n"
- "fmla v3.4s, v14.4s, v6.4s\n"
- "ldr s14, [x21, x22]\n"
- "fmla v2.4s, v17.4s, v11.4s\n"
- "ldr s17, [x20, x22]\n"
- "fmla v0.4s, v14.4s, v13.4s\n"
- "ldr x19, [%[inptrs], 32]\n"
- "fmla v1.4s, v18.4s, v7.4s\n"
- "ldr x27, [%[inptrs], 168]\n"
- "fmla v3.4s, v16.4s, v8.4s\n"
- "ldr s18, [x19, x22]\n"
- "fmla v2.4s, v14.4s, v7.4s\n"
- "ldr s13, [x27, x22]\n"
- "ldr x25, [%[inptrs], 136]\n"
- "ldr x21, [%[inptrs], 104]\n"
- "ldr x20, [%[inptrs], 72]\n"
- "fmla v1.4s, v15.4s, v9.4s\n"
- "ldr x27, [%[inptrs], 176]\n"
- "fmla v3.4s, v14.4s, v5.4s\n"
- "ldr s16, [x25, x22]\n"
- "fmla v2.4s, v17.4s, v9.4s\n"
- "ldr s17, [x21, x22]\n"
- "fmla v0.4s, v16.4s, v12.4s\n"
- "ldr s12, [x20, x22]\n"
- "fmla v1.4s, v14.4s, v4.4s\n"
- "ldr s15, [x27, x22]\n"
- "fmax v3.4s, v3.4s, v10.4s\n"
- "ldr x25, [%[inptrs], 144]\n"
- "fmla v2.4s, v18.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 112]\n"
- "fmla v0.4s, v17.4s, v11.4s\n"
- "ldr s14, [x25, x22]\n"
- "fmla v1.4s, v13.4s, v6.4s\n"
- "ldr s11, [x21, x22]\n"
- "ldr x27, [%[inptrs], 184]\n"
- "ldr x25, [%[inptrs], 152]\n"
- "ldr x21, [%[outptrs], 0]\n"
- "fmla v2.4s, v17.4s, v6.4s\n"
- "ldr x28, [%[outptrs], 16]\n"
- "str s3, [x21, x26]\n" // store one float through the output pointer at byte offset 0
- "fmla v0.4s, v15.4s, v7.4s\n"
- "fmla v1.4s, v16.4s, v8.4s\n"
- "ldr s18, [x27, x22]\n"
- "ldr s17, [x25, x22]\n"
- "ldr x27, [%[inptrs], 192]\n"
- "fmla v2.4s, v12.4s, v8.4s\n"
- "ldr x21, [%[outptrs], 8]\n"
- "fmla v0.4s, v14.4s, v9.4s\n"
- "ldr s16, [x27, x22]\n"
- "fmla v1.4s, v15.4s, v5.4s\n"
- "add %[wbptr], %[wbptr], #40\n" // next single-channel parameter group: 10 floats of 4 bytes
- "ldr s14, [%[wbptr]]\n"
- "add x22, x22, #4\n" // advance the input offset by one float
- "fmla v2.4s, v11.4s, v5.4s\n"
- "ldr s13, [%[wbptr], #4]\n"
- "fmla v0.4s, v11.4s, v4.4s\n"
- "ldr s11, [%[wbptr], #8]\n"
- "fmax v1.4s, v1.4s, v10.4s\n"
- "ldr s12, [%[wbptr], #16]\n"
- "mov v3.16b, v14.16b\n"
- "ldr s9, [%[wbptr], #20]\n"
- "fmax v2.4s, v2.4s, v10.4s\n"
- "ldr s7, [%[wbptr], #28]\n"
- "str s1, [x28, x26]\n" // store one float through the output pointer at byte offset 16
- "fmla v0.4s, v18.4s, v6.4s\n"
- "mov v1.16b, v14.16b\n"
- "ldr s4, [%[wbptr], #12]\n"
- "str s2, [x21, x26]\n" // store one float through the output pointer at byte offset 8
- "ldr x28, [%[outptrs], 24]\n"
- "mov v2.16b, v14.16b\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v0.4s, v17.4s, v8.4s\n"
- "ldr s6, [%[wbptr], #32]\n"
- "ldr x19, [%[inptrs], 0]\n"
- "ldr x20, [%[inptrs], 40]\n"
- "ldr x21, [%[inptrs], 80]\n"
- "ldr x25, [%[inptrs], 120]\n"
- "subs x23, x23, #1\n" // loop counter; the flags are consumed by "bne 5b" below
- "ldr s15, [x19, x22]\n"
- "fmla v0.4s, v16.4s, v5.4s\n"
- "ldr s8, [%[wbptr], #24]\n"
- "fmla v3.4s, v15.4s, v13.4s\n"
- "ldr s17, [x20, x22]\n"
- "ldr s16, [x21, x22]\n"
- "ldr x19, [%[inptrs], 8]\n"
- "ldr s15, [x25, x22]\n"
- "ldr x20, [%[inptrs], 48]\n"
- "fmax v0.4s, v0.4s, v10.4s\n"
- "ldr s5, [%[wbptr], #36]\n"
- "fmla v3.4s, v17.4s, v12.4s\n"
- "ldr s10, [x19, x22]\n"
- "ldr s17, [x20, x22]\n"
- "ldr x19, [%[inptrs], 16]\n"
- "str s0, [x28, x26]\n" // store one float through the output pointer at byte offset 24
- "ldr x21, [%[inptrs], 88]\n"
- "mov v0.16b, v14.16b\n"
- "ldr s18, [x19, x22]\n"
- "fmla v3.4s, v10.4s, v11.4s\n"
- "ldr s14, [x21, x22]\n"
- "add x26, x26, #4\n" // advance the output offset by one float
- "fmla v3.4s, v16.4s, v7.4s\n"
- "bne 5b\n"
- "6:\n" // Single-channel tail: finish the final channel without preloading another
- "fmla v1.4s, v16.4s, v13.4s\n"
- "ldr x20, [%[inptrs], 56]\n"
- "fmla v3.4s, v17.4s, v9.4s\n"
- "ldr x19, [%[inptrs], 24]\n"
- "fmla v2.4s, v18.4s, v13.4s\n"
- "ldr s16, [x20, x22]\n"
- "movi v10.16b, #0\n"
- "ldr s17, [x19, x22]\n"
- "fmla v1.4s, v15.4s, v12.4s\n"
- "ldr x27, [%[inptrs], 160]\n"
- "fmla v3.4s, v18.4s, v4.4s\n"
- "ldr x25, [%[inptrs], 128]\n"
- "fmla v2.4s, v16.4s, v12.4s\n"
- "ldr s18, [x27, x22]\n"
- "ldr s15, [x25, x22]\n"
- "ldr x21, [%[inptrs], 96]\n"
- "fmla v1.4s, v14.4s, v11.4s\n"
- "ldr x20, [%[inptrs], 64]\n"
- "fmla v3.4s, v14.4s, v6.4s\n"
- "ldr s14, [x21, x22]\n"
- "fmla v2.4s, v17.4s, v11.4s\n"
- "ldr s17, [x20, x22]\n"
- "fmla v0.4s, v14.4s, v13.4s\n"
- "ldr x19, [%[inptrs], 32]\n"
- "fmla v1.4s, v18.4s, v7.4s\n"
- "ldr x27, [%[inptrs], 168]\n"
- "fmla v3.4s, v16.4s, v8.4s\n"
- "ldr s18, [x19, x22]\n"
- "fmla v2.4s, v14.4s, v7.4s\n"
- "ldr s13, [x27, x22]\n"
- "ldr x25, [%[inptrs], 136]\n"
- "ldr x21, [%[inptrs], 104]\n"
- "ldr x20, [%[inptrs], 72]\n"
- "fmla v1.4s, v15.4s, v9.4s\n"
- "ldr x27, [%[inptrs], 176]\n"
- "fmla v3.4s, v14.4s, v5.4s\n"
- "ldr s16, [x25, x22]\n"
- "fmla v2.4s, v17.4s, v9.4s\n"
- "ldr s17, [x21, x22]\n"
- "fmla v0.4s, v16.4s, v12.4s\n"
- "ldr s12, [x20, x22]\n"
- "fmla v1.4s, v14.4s, v4.4s\n"
- "ldr s15, [x27, x22]\n"
- "fmax v3.4s, v3.4s, v10.4s\n"
- "ldr x25, [%[inptrs], 144]\n"
- "fmla v2.4s, v18.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 112]\n"
- "fmla v0.4s, v17.4s, v11.4s\n"
- "ldr s14, [x25, x22]\n"
- "fmla v1.4s, v13.4s, v6.4s\n"
- "ldr s11, [x21, x22]\n"
- "ldr x27, [%[inptrs], 184]\n"
- "ldr x25, [%[inptrs], 152]\n"
- "ldr x21, [%[outptrs], 0]\n"
- "fmla v2.4s, v17.4s, v6.4s\n"
- "ldr x28, [%[outptrs], 16]\n"
- "str s3, [x21, x26]\n"
- "fmla v0.4s, v15.4s, v7.4s\n"
- "fmla v1.4s, v16.4s, v8.4s\n"
- "ldr s18, [x27, x22]\n"
- "ldr s17, [x25, x22]\n"
- "ldr x27, [%[inptrs], 192]\n"
- "fmla v2.4s, v12.4s, v8.4s\n"
- "ldr x21, [%[outptrs], 8]\n"
- "fmla v0.4s, v14.4s, v9.4s\n"
- "ldr s16, [x27, x22]\n"
- "fmla v1.4s, v15.4s, v5.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "add x22, x22, #4\n"
- "fmla v2.4s, v11.4s, v5.4s\n"
- "fmla v0.4s, v11.4s, v4.4s\n"
- "fmax v1.4s, v1.4s, v10.4s\n"
- "fmax v2.4s, v2.4s, v10.4s\n"
- "str s1, [x28, x26]\n"
- "fmla v0.4s, v18.4s, v6.4s\n"
- "ldr x28, [%[outptrs], 24]\n"
- "str s2, [x21, x26]\n"
- "fmla v0.4s, v17.4s, v8.4s\n"
- "fmla v0.4s, v16.4s, v5.4s\n"
- "fmax v0.4s, v0.4s, v10.4s\n"
- "str s0, [x28, x26]\n"
- "add x26, x26, #4\n"
- "7:\n" // End of the kernel
- : [wbptr] "+r" (weight_bias_ptr) // read-write operand: the asm advances the parameter pointer
- : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs) // read-only operands; n_channels is widened to long so it can be used with the 64-bit x registers
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" // clobbers: flags, every vector and general register named in the asm text, and memory (stores go through loaded pointers)
- );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU6>( // Strided variant: for each channel, reads a 5-row by 5-column patch of input addressed by row/column strides, accumulates four outputs, clamps them to [0, 6] and writes a 2x2 patch of output.
- int n_channels, // Channel count; handled as (n_channels >> 2) four-channel vector steps followed by (n_channels & 3) single-channel steps.
- const void *weight_bias_ptr, // Packed parameters: each step reads one value that seeds all four accumulators, then nine multipliers (presumably bias followed by 3x3 weights -- confirm against the packing code).
- const float *input, // Address of input row 0, column 0; the asm advances it by 16 or 4 bytes per step.
- const unsigned int input_row_stride, // Multiplied by sizeof(float) in the operand list before being added to addresses.
- const unsigned int input_col_stride, // Multiplied by sizeof(float) in the operand list before being added to addresses.
- float *output, // Address of output row 0, column 0; the asm advances it by 16 or 4 bytes per step.
- const unsigned int output_row_stride, // Multiplied by sizeof(float) in the operand list before being added to addresses.
- const unsigned int output_col_stride // Multiplied by sizeof(float) in the operand list before being added to addresses.
-)
-{
- __asm __volatile(
- "add x21, %[inptr0], %[input_row_stride]\n" // x21 = address of input row 1
- "add x23, %[input_col_stride1], %[input_col_stride1]\n" // x23 = 2 * input column stride (bytes)
- "add x24, %[outptr0], %[output_row_stride]\n" // x24 = address of output row 1
- "add x28, x21, %[input_row_stride]\n" // x28 = address of input row 2
- "add x26, x23, %[input_col_stride1]\n" // x26 = 3 * input column stride (bytes)
- "and x19, %[n_channels], #3\n" // x19: channels left over after the groups of four
- "add x27, x28, %[input_row_stride]\n" // x27 = address of input row 3
- "add x25, x26, %[input_col_stride1]\n" // x25 = 4 * input column stride (bytes)
- "lsr x20, %[n_channels], #2\n" // x20: number of four-channel (128-bit) iterations
- "add x22, x27, %[input_row_stride]\n" // x22 = address of input row 4
- "cbz x20, 4f\n" // fewer than four channels: go straight to the single-channel path
- "1:\n" // Vector prologue: load the parameters and the first inputs
- "ldr q14, [%[wbptr]]\n" // seed value, copied below into accumulators v5, v11, v12 and v10
- "subs x20, x20, #1\n" // count this iteration; the flags are consumed by "beq 3f" below
- "mov v5.16b, v14.16b\n"
- "ldr q0, [%[wbptr], #16]\n"
- "mov v11.16b, v14.16b\n"
- "ldr q1, [%[wbptr], #32]\n"
- "mov v12.16b, v14.16b\n"
- "ldr q2, [%[wbptr], #48]\n"
- "mov v10.16b, v14.16b\n"
- "ldr q6, [%[wbptr], #64]\n"
- "ldr q3, [%[wbptr], #80]\n"
- "ldr q7, [%[wbptr], #96]\n"
- "ldr q4, [%[wbptr], #112]\n"
- "ldr q8, [%[wbptr], #128]\n"
- "ldr q9, [%[wbptr], #144]\n"
- "ldr q19, [%[inptr0]]\n"
- "fmla v5.4s, v19.4s, v0.4s\n"
- "ldr q15, [x21]\n"
- "ldr q21, [%[inptr0], %[input_col_stride1]]\n"
- "ldr q16, [x28]\n"
- "fmla v11.4s, v16.4s, v0.4s\n"
- "ldr q23, [x21, %[input_col_stride1]]\n"
- "fmla v5.4s, v15.4s, v6.4s\n"
- "ldr q18, [%[inptr0], x23]\n"
- "ldr q17, [x27]\n"
- "ldr q13, [x28, %[input_col_stride1]]\n"
- "fmla v5.4s, v21.4s, v1.4s\n"
- "fmla v5.4s, v16.4s, v4.4s\n"
- "beq 3f\n" // that was the last vector iteration: skip the loop body and finish in the tail
- "2:\n" // Vector loop: finish the current four channels while preloading the next four
- "fmla v5.4s, v23.4s, v3.4s\n"
- "ldr q21, [x21, x23]\n"
- "fmla v12.4s, v18.4s, v0.4s\n"
- "ldr q20, [%[inptr0], x26]\n"
- "fmla v11.4s, v17.4s, v6.4s\n"
- "ldr q19, [x22]\n"
- "fmla v5.4s, v18.4s, v2.4s\n"
- "ldr q15, [x27, %[input_col_stride1]]\n"
- "fmla v12.4s, v21.4s, v6.4s\n"
- "ldr q16, [x28, x23]\n"
- "fmla v11.4s, v13.4s, v1.4s\n"
- "ldr q17, [x21, x26]\n"
- "fmla v5.4s, v13.4s, v8.4s\n"
- "ldr q14, [%[inptr0], x25]\n" // v14 is reused for input data; it is reloaded from wbptr below
- "fmla v12.4s, v20.4s, v1.4s\n"
- "ldr q20, [x22, %[input_col_stride1]]\n"
- "fmla v11.4s, v19.4s, v4.4s\n"
- "ldr q19, [x27, x23]\n"
- "fmla v5.4s, v21.4s, v7.4s\n"
- "ldr q22, [x28, x26]\n"
- "fmla v12.4s, v16.4s, v4.4s\n"
- "ldr q21, [x21, x25]\n"
- "fmla v11.4s, v15.4s, v3.4s\n"
- "ldr q23, [x22, x23]\n"
- "fmla v5.4s, v16.4s, v9.4s\n"
- "ldr q18, [x27, x26]\n"
- "fmla v10.4s, v16.4s, v0.4s\n"
- "ldr q15, [x28, x25]\n"
- "fmla v11.4s, v16.4s, v2.4s\n"
- "ldr q16, [x22, x26]\n"
- "fmla v12.4s, v17.4s, v3.4s\n"
- "ldr q17, [x27, x25]\n"
- "fmla v10.4s, v19.4s, v6.4s\n"
- "ldr q13, [x22, x25]\n"
- "fmla v11.4s, v20.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #160\n" // next parameter group: 10 vectors of 16 bytes
- "fmla v12.4s, v14.4s, v2.4s\n"
- "ldr q14, [%[wbptr]]\n" // seed value for the next four channels
- "fmla v10.4s, v22.4s, v1.4s\n"
- "ldr q0, [%[wbptr], #16]\n"
- "fmla v11.4s, v19.4s, v7.4s\n"
- "ldr q6, [%[wbptr], #64]\n"
- "fmla v12.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v10.4s, v23.4s, v4.4s\n"
- "ldr q1, [%[wbptr], #32]\n"
- "fmla v11.4s, v23.4s, v9.4s\n"
- "add %[inptr0], %[inptr0], #16\n" // advance input row 0 by four floats
- "fmla v12.4s, v21.4s, v7.4s\n"
- "ldr q19, [%[inptr0]]\n"
- "fmla v10.4s, v18.4s, v3.4s\n"
- "ldr q4, [%[wbptr], #112]\n"
- "movi v20.16b, #0\n" // v20 = 0.0, lower clamp bound
- "ldr q21, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v12.4s, v15.4s, v9.4s\n"
- "ldr q18, [%[inptr0], x23]\n"
- "fmla v10.4s, v15.4s, v2.4s\n"
- "ldr q3, [%[wbptr], #80]\n"
- "fmov v22.4s, #6.0\n" // v22 = 6.0, upper clamp bound
- "add x21, x21, #16\n"
- "fmax v5.4s, v5.4s, v20.4s\n" // clamp v5 to [0, 6] (fmax here, fmin below)
- "ldr q15, [x21]\n"
- "fmla v10.4s, v16.4s, v8.4s\n"
- "ldr q2, [%[wbptr], #48]\n"
- "fmin v5.4s, v5.4s, v22.4s\n"
- "ldr q23, [x21, %[input_col_stride1]]\n"
- "fmax v12.4s, v12.4s, v20.4s\n"
- "add x28, x28, #16\n"
- "str q5, [%[outptr0]]\n" // output row 0, column 0
- "fmla v10.4s, v17.4s, v7.4s\n"
- "fmin v12.4s, v12.4s, v22.4s\n"
- "ldr q8, [%[wbptr], #128]\n"
- "fmax v11.4s, v11.4s, v20.4s\n"
- "ldr q16, [x28]\n"
- "str q12, [%[outptr0], %[output_col_stride1]]\n" // output row 0, column 1
- "fmla v10.4s, v13.4s, v9.4s\n"
- "fmin v11.4s, v11.4s, v22.4s\n"
- "ldr q7, [%[wbptr], #96]\n"
- "mov v5.16b, v14.16b\n"
- "ldr q13, [x28, %[input_col_stride1]]\n"
- "str q11, [x24]\n" // output row 1, column 0
- "fmax v10.4s, v10.4s, v20.4s\n"
- "mov v11.16b, v14.16b\n"
- "ldr q9, [%[wbptr], #144]\n"
- "fmin v10.4s, v10.4s, v22.4s\n"
- "add x27, x27, #16\n"
- "mov v12.16b, v14.16b\n"
- "ldr q17, [x27]\n"
- "str q10, [x24, %[output_col_stride1]]\n" // output row 1, column 1
- "fmla v5.4s, v19.4s, v0.4s\n"
- "mov v10.16b, v14.16b\n"
- "add x22, x22, #16\n"
- "fmla v11.4s, v16.4s, v0.4s\n"
- "add %[outptr0], %[outptr0], #16\n" // advance output row 0 by four floats
- "fmla v5.4s, v15.4s, v6.4s\n"
- "add x24, x24, #16\n" // advance output row 1 by four floats
- "subs x20, x20, #1\n" // loop counter; the flags are consumed by "bne 2b" below
- "fmla v5.4s, v21.4s, v1.4s\n"
- "fmla v5.4s, v16.4s, v4.4s\n"
- "bne 2b\n"
- "3:\n" // Vector tail: finish the final four channels without preloading another group
- "fmla v5.4s, v23.4s, v3.4s\n"
- "ldr q21, [x21, x23]\n"
- "fmla v12.4s, v18.4s, v0.4s\n"
- "ldr q20, [%[inptr0], x26]\n"
- "fmla v11.4s, v17.4s, v6.4s\n"
- "ldr q19, [x22]\n"
- "fmla v5.4s, v18.4s, v2.4s\n"
- "ldr q15, [x27, %[input_col_stride1]]\n"
- "fmla v12.4s, v21.4s, v6.4s\n"
- "ldr q16, [x28, x23]\n"
- "fmla v11.4s, v13.4s, v1.4s\n"
- "ldr q17, [x21, x26]\n"
- "fmla v5.4s, v13.4s, v8.4s\n"
- "ldr q14, [%[inptr0], x25]\n"
- "fmla v12.4s, v20.4s, v1.4s\n"
- "ldr q20, [x22, %[input_col_stride1]]\n"
- "fmla v11.4s, v19.4s, v4.4s\n"
- "ldr q19, [x27, x23]\n"
- "fmla v5.4s, v21.4s, v7.4s\n"
- "ldr q22, [x28, x26]\n"
- "fmla v12.4s, v16.4s, v4.4s\n"
- "ldr q21, [x21, x25]\n"
- "fmla v11.4s, v15.4s, v3.4s\n"
- "ldr q23, [x22, x23]\n"
- "fmla v5.4s, v16.4s, v9.4s\n"
- "ldr q18, [x27, x26]\n"
- "fmla v10.4s, v16.4s, v0.4s\n"
- "ldr q15, [x28, x25]\n"
- "fmla v11.4s, v16.4s, v2.4s\n"
- "ldr q16, [x22, x26]\n"
- "fmla v12.4s, v17.4s, v3.4s\n"
- "ldr q17, [x27, x25]\n"
- "fmla v10.4s, v19.4s, v6.4s\n"
- "ldr q13, [x22, x25]\n"
- "fmla v11.4s, v20.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #160\n" // leave wbptr just past the last vector group, where the single-channel parameters are read from
- "fmla v12.4s, v14.4s, v2.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v10.4s, v22.4s, v1.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v11.4s, v19.4s, v7.4s\n"
- "add x21, x21, #16\n"
- "fmla v12.4s, v22.4s, v8.4s\n"
- "add x28, x28, #16\n"
- "fmla v10.4s, v23.4s, v4.4s\n"
- "add x27, x27, #16\n"
- "fmla v11.4s, v23.4s, v9.4s\n"
- "add x22, x22, #16\n"
- "fmla v12.4s, v21.4s, v7.4s\n"
- "movi v20.16b, #0\n" // v20 = 0.0, lower clamp bound
- "fmla v10.4s, v18.4s, v3.4s\n"
- "fmov v22.4s, #6.0\n" // v22 = 6.0, upper clamp bound
- "fmax v5.4s, v5.4s, v20.4s\n"
- "fmax v11.4s, v11.4s, v20.4s\n"
- "fmla v12.4s, v15.4s, v9.4s\n"
- "fmla v10.4s, v15.4s, v2.4s\n"
- "fmin v5.4s, v5.4s, v22.4s\n"
- "fmin v11.4s, v11.4s, v22.4s\n"
- "fmax v12.4s, v12.4s, v20.4s\n"
- "str q5, [%[outptr0]]\n"
- "str q11, [x24]\n"
- "fmla v10.4s, v16.4s, v8.4s\n"
- "fmin v12.4s, v12.4s, v22.4s\n"
- "str q12, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v10.4s, v17.4s, v7.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v10.4s, v13.4s, v9.4s\n"
- "fmax v10.4s, v10.4s, v20.4s\n"
- "fmin v10.4s, v10.4s, v22.4s\n"
- "str q10, [x24, %[output_col_stride1]]\n"
- "add x24, x24, #16\n"
- "4:\n" // Single-channel path: same schedule as above, using 32-bit "s" loads/stores and 4-byte steps
- "cbz x19, 7f\n" // no leftover channels: done
- "ldr s14, [%[wbptr]]\n" // seed value for one channel
- "mov v5.16b, v14.16b\n"
- "ldr s0, [%[wbptr], #4]\n"
- "mov v11.16b, v14.16b\n"
- "ldr s1, [%[wbptr], #8]\n"
- "mov v12.16b, v14.16b\n"
- "ldr s2, [%[wbptr], #12]\n"
- "mov v10.16b, v14.16b\n"
- "ldr s6, [%[wbptr], #16]\n"
- "ldr s3, [%[wbptr], #20]\n"
- "subs x19, x19, #1\n" // count this channel; the flags are consumed by "beq 6f" below
- "ldr s7, [%[wbptr], #24]\n"
- "ldr s4, [%[wbptr], #28]\n"
- "ldr s8, [%[wbptr], #32]\n"
- "ldr s9, [%[wbptr], #36]\n"
- "ldr s19, [%[inptr0]]\n"
- "ldr s15, [x21]\n"
- "fmla v5.4s, v19.4s, v0.4s\n"
- "ldr s21, [%[inptr0], %[input_col_stride1]]\n"
- "ldr s16, [x28]\n"
- "ldr s23, [x21, %[input_col_stride1]]\n"
- "fmla v11.4s, v16.4s, v0.4s\n"
- "ldr s18, [%[inptr0], x23]\n"
- "fmla v5.4s, v15.4s, v6.4s\n"
- "ldr s17, [x27]\n"
- "ldr s13, [x28, %[input_col_stride1]]\n"
- "fmla v5.4s, v21.4s, v1.4s\n"
- "fmla v5.4s, v16.4s, v4.4s\n"
- "beq 6f\n" // that was the last leftover channel: finish in the tail
- "5:\n" // Single-channel loop: finish the current channel while preloading the next
- "fmla v5.4s, v23.4s, v3.4s\n"
- "ldr s21, [x21, x23]\n"
- "fmla v12.4s, v18.4s, v0.4s\n"
- "ldr s20, [%[inptr0], x26]\n"
- "fmla v11.4s, v17.4s, v6.4s\n"
- "ldr s19, [x22]\n"
- "fmla v5.4s, v18.4s, v2.4s\n"
- "ldr s15, [x27, %[input_col_stride1]]\n"
- "fmla v12.4s, v21.4s, v6.4s\n"
- "ldr s16, [x28, x23]\n"
- "fmla v11.4s, v13.4s, v1.4s\n"
- "ldr s17, [x21, x26]\n"
- "fmla v5.4s, v13.4s, v8.4s\n"
- "ldr s14, [%[inptr0], x25]\n"
- "fmla v12.4s, v20.4s, v1.4s\n"
- "ldr s20, [x22, %[input_col_stride1]]\n"
- "fmla v11.4s, v19.4s, v4.4s\n"
- "ldr s19, [x27, x23]\n"
- "fmla v5.4s, v21.4s, v7.4s\n"
- "ldr s22, [x28, x26]\n"
- "fmla v12.4s, v16.4s, v4.4s\n"
- "ldr s21, [x21, x25]\n"
- "fmla v11.4s, v15.4s, v3.4s\n"
- "ldr s23, [x22, x23]\n"
- "fmla v5.4s, v16.4s, v9.4s\n"
- "ldr s18, [x27, x26]\n"
- "fmla v10.4s, v16.4s, v0.4s\n"
- "ldr s15, [x28, x25]\n"
- "fmla v11.4s, v16.4s, v2.4s\n"
- "ldr s16, [x22, x26]\n"
- "fmla v12.4s, v17.4s, v3.4s\n"
- "ldr s17, [x27, x25]\n"
- "fmla v10.4s, v19.4s, v6.4s\n"
- "ldr s13, [x22, x25]\n"
- "fmla v11.4s, v20.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #40\n" // next single-channel parameter group: 10 floats of 4 bytes
- "fmla v12.4s, v14.4s, v2.4s\n"
- "ldr s14, [%[wbptr]]\n"
- "fmla v10.4s, v22.4s, v1.4s\n"
- "ldr s0, [%[wbptr], #4]\n"
- "fmla v11.4s, v19.4s, v7.4s\n"
- "ldr s6, [%[wbptr], #16]\n"
- "fmla v12.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v10.4s, v23.4s, v4.4s\n"
- "ldr s1, [%[wbptr], #8]\n"
- "fmla v11.4s, v23.4s, v9.4s\n"
- "add %[inptr0], %[inptr0], #4\n" // advance input row 0 by one float
- "fmla v12.4s, v21.4s, v7.4s\n"
- "ldr s19, [%[inptr0]]\n"
- "fmla v10.4s, v18.4s, v3.4s\n"
- "ldr s4, [%[wbptr], #28]\n"
- "movi v20.16b, #0\n" // v20 = 0.0, lower clamp bound
- "ldr s21, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v12.4s, v15.4s, v9.4s\n"
- "ldr s18, [%[inptr0], x23]\n"
- "fmla v10.4s, v15.4s, v2.4s\n"
- "ldr s3, [%[wbptr], #20]\n"
- "fmov v22.4s, #6.0\n" // v22 = 6.0, upper clamp bound
- "add x21, x21, #4\n"
- "fmax v5.4s, v5.4s, v20.4s\n"
- "ldr s15, [x21]\n"
- "fmla v10.4s, v16.4s, v8.4s\n"
- "ldr s2, [%[wbptr], #12]\n"
- "fmin v5.4s, v5.4s, v22.4s\n"
- "ldr s23, [x21, %[input_col_stride1]]\n"
- "fmax v12.4s, v12.4s, v20.4s\n"
- "add x28, x28, #4\n"
- "str s5, [%[outptr0]]\n" // output row 0, column 0
- "fmla v10.4s, v17.4s, v7.4s\n"
- "fmin v12.4s, v12.4s, v22.4s\n"
- "ldr s8, [%[wbptr], #32]\n"
- "fmax v11.4s, v11.4s, v20.4s\n"
- "ldr s16, [x28]\n"
- "str s12, [%[outptr0], %[output_col_stride1]]\n" // output row 0, column 1
- "fmla v10.4s, v13.4s, v9.4s\n"
- "fmin v11.4s, v11.4s, v22.4s\n"
- "ldr s7, [%[wbptr], #24]\n"
- "mov v5.16b, v14.16b\n"
- "ldr s13, [x28, %[input_col_stride1]]\n"
- "str s11, [x24]\n" // output row 1, column 0
- "fmax v10.4s, v10.4s, v20.4s\n"
- "mov v11.16b, v14.16b\n"
- "ldr s9, [%[wbptr], #36]\n"
- "fmin v10.4s, v10.4s, v22.4s\n"
- "add x27, x27, #4\n"
- "mov v12.16b, v14.16b\n"
- "ldr s17, [x27]\n"
- "str s10, [x24, %[output_col_stride1]]\n" // output row 1, column 1
- "fmla v5.4s, v19.4s, v0.4s\n"
- "mov v10.16b, v14.16b\n"
- "add x22, x22, #4\n"
- "fmla v11.4s, v16.4s, v0.4s\n"
- "add %[outptr0], %[outptr0], #4\n" // advance output row 0 by one float
- "fmla v5.4s, v15.4s, v6.4s\n"
- "add x24, x24, #4\n" // advance output row 1 by one float
- "subs x19, x19, #1\n" // loop counter; the flags are consumed by "bne 5b" below
- "fmla v5.4s, v21.4s, v1.4s\n"
- "fmla v5.4s, v16.4s, v4.4s\n"
- "bne 5b\n"
- "6:\n" // Single-channel tail: finish the final channel without preloading another
- "fmla v5.4s, v23.4s, v3.4s\n"
- "ldr s21, [x21, x23]\n"
- "fmla v12.4s, v18.4s, v0.4s\n"
- "ldr s20, [%[inptr0], x26]\n"
- "fmla v11.4s, v17.4s, v6.4s\n"
- "ldr s19, [x22]\n"
- "fmla v5.4s, v18.4s, v2.4s\n"
- "ldr s15, [x27, %[input_col_stride1]]\n"
- "fmla v12.4s, v21.4s, v6.4s\n"
- "ldr s16, [x28, x23]\n"
- "fmla v11.4s, v13.4s, v1.4s\n"
- "ldr s17, [x21, x26]\n"
- "fmla v5.4s, v13.4s, v8.4s\n"
- "ldr s14, [%[inptr0], x25]\n"
- "fmla v12.4s, v20.4s, v1.4s\n"
- "ldr s20, [x22, %[input_col_stride1]]\n"
- "fmla v11.4s, v19.4s, v4.4s\n"
- "ldr s19, [x27, x23]\n"
- "fmla v5.4s, v21.4s, v7.4s\n"
- "ldr s22, [x28, x26]\n"
- "fmla v12.4s, v16.4s, v4.4s\n"
- "ldr s21, [x21, x25]\n"
- "fmla v11.4s, v15.4s, v3.4s\n"
- "ldr s23, [x22, x23]\n"
- "fmla v5.4s, v16.4s, v9.4s\n"
- "ldr s18, [x27, x26]\n"
- "fmla v10.4s, v16.4s, v0.4s\n"
- "ldr s15, [x28, x25]\n"
- "fmla v11.4s, v16.4s, v2.4s\n"
- "ldr s16, [x22, x26]\n"
- "fmla v12.4s, v17.4s, v3.4s\n"
- "ldr s17, [x27, x25]\n"
- "fmla v10.4s, v19.4s, v6.4s\n"
- "ldr s13, [x22, x25]\n"
- "fmla v11.4s, v20.4s, v8.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v12.4s, v14.4s, v2.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v10.4s, v22.4s, v1.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v11.4s, v19.4s, v7.4s\n"
- "add x21, x21, #4\n"
- "fmla v12.4s, v22.4s, v8.4s\n"
- "add x28, x28, #4\n"
- "fmla v10.4s, v23.4s, v4.4s\n"
- "add x27, x27, #4\n"
- "fmla v11.4s, v23.4s, v9.4s\n"
- "add x22, x22, #4\n"
- "fmla v12.4s, v21.4s, v7.4s\n"
- "movi v20.16b, #0\n"
- "fmla v10.4s, v18.4s, v3.4s\n"
- "fmov v22.4s, #6.0\n"
- "fmax v5.4s, v5.4s, v20.4s\n"
- "fmax v11.4s, v11.4s, v20.4s\n"
- "fmla v12.4s, v15.4s, v9.4s\n"
- "fmla v10.4s, v15.4s, v2.4s\n"
- "fmin v5.4s, v5.4s, v22.4s\n"
- "fmin v11.4s, v11.4s, v22.4s\n"
- "fmax v12.4s, v12.4s, v20.4s\n"
- "str s5, [%[outptr0]]\n"
- "str s11, [x24]\n"
- "fmla v10.4s, v16.4s, v8.4s\n"
- "fmin v12.4s, v12.4s, v22.4s\n"
- "str s12, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v10.4s, v17.4s, v7.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v10.4s, v13.4s, v9.4s\n"
- "fmax v10.4s, v10.4s, v20.4s\n"
- "fmin v10.4s, v10.4s, v22.4s\n"
- "str s10, [x24, %[output_col_stride1]]\n"
- "add x24, x24, #4\n"
- "7:\n" // End of the kernel
- : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr) // read-write operands: the asm advances all three pointers
- : [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float)) // read-only operands: strides are converted to byte counts here; n_channels is widened to long for use with x registers
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" // clobbers: flags, every vector and general register named in the asm text, and memory
- );
-}
-
-template <> // Explicit specialisation of Conv::execute_tile for ActivationFunction::ReLU6 (pointer-array variant).
-template <> // Produces four outputs per channel; each is a seed value plus nine input*weight products, clamped to [0, 6].
-void Conv::execute_tile<ActivationFunction::ReLU6>(
- int n_channels, // Channel count; handled as (n_channels >> 2) four-lane passes followed by (n_channels & 3) single-lane passes.
- const void *weight_bias_ptr, // Packed parameters, 10 values per channel: 160 bytes per four-channel group, 40 bytes per leftover channel.
- const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols], // Input pointer table; the asm reads 25 slots at byte offsets 0..192 (presumably a 5x5 tile - confirm against Base).
- float *outptrs[Base::output_tile_rows][Base::output_tile_cols] // Output pointer table; the asm reads 4 slots at byte offsets 0, 8, 16 and 24.
-)
-{
- __asm __volatile(
- "mov x27, xzr\n" // x27: byte offset added to every input pointer, starts at 0
- "mov x28, xzr\n" // x28: byte offset added to every output pointer, starts at 0
- "and x26, %[n_channels], #3\n" // x26: leftover channels for the single-lane path
- "lsr x25, %[n_channels], #2\n" // x25: number of four-channel passes
- "cbz x25, 4f\n" // no complete four-channel group: go straight to the single-lane path
- "1:\n" // Prologue of the four-lane path: load parameters and the first inputs
- "ldr q15, [%[wbptr]]\n" // v15: first parameter vector, used to seed all four accumulators (presumably the bias)
- "ldr x21, [%[inptrs], 0]\n"
- "mov v8.16b, v15.16b\n" // v8: accumulator stored through the pointer at outptrs byte offset 0
- "ldr q14, [%[wbptr], #16]\n" // nine weight vectors follow at parameter offsets 16..144
- "mov v3.16b, v15.16b\n" // v3: accumulator stored through the pointer at outptrs byte offset 16
- "ldr q10, [%[wbptr], #32]\n"
- "mov v2.16b, v15.16b\n" // v2: accumulator stored through the pointer at outptrs byte offset 8
- "ldr q7, [%[wbptr], #48]\n"
- "mov v4.16b, v15.16b\n" // v4: accumulator stored through the pointer at outptrs byte offset 24
- "ldr q13, [%[wbptr], #64]\n"
- "ldr q5, [%[wbptr], #80]\n"
- "ldr x22, [%[inptrs], 40]\n"
- "ldr q0, [%[wbptr], #96]\n"
- "ldr x20, [%[inptrs], 80]\n"
- "ldr q9, [%[wbptr], #112]\n"
- "ldr x23, [%[inptrs], 120]\n"
- "ldr q6, [%[wbptr], #128]\n"
- "subs x25, x25, #1\n" // sets the flags tested by "beq 3f"; only loads and fmla sit in between
- "ldr q1, [%[wbptr], #144]\n"
- "ldr q17, [x21, x27]\n"
- "fmla v8.4s, v17.4s, v14.4s\n"
- "ldr q18, [x22, x27]\n"
- "ldr q16, [x20, x27]\n"
- "ldr x21, [%[inptrs], 8]\n"
- "ldr q17, [x23, x27]\n"
- "ldr x22, [%[inptrs], 48]\n"
- "ldr q11, [x21, x27]\n"
- "ldr x20, [%[inptrs], 88]\n"
- "fmla v8.4s, v18.4s, v13.4s\n"
- "ldr q19, [x22, x27]\n"
- "ldr q15, [x20, x27]\n" // v15 has been copied into the accumulators, so it is reused for input data
- "ldr x21, [%[inptrs], 16]\n"
- "ldr q12, [x21, x27]\n"
- "fmla v8.4s, v11.4s, v10.4s\n"
- "fmla v8.4s, v16.4s, v9.4s\n"
- "beq 3f\n" // exactly one four-channel pass: skip the steady-state loop
- "2:\n" // Steady-state loop: finish the current pass while loading parameters and first inputs of the next
- "fmla v3.4s, v16.4s, v14.4s\n"
- "ldr x22, [%[inptrs], 56]\n"
- "fmla v8.4s, v19.4s, v5.4s\n"
- "ldr x21, [%[inptrs], 24]\n"
- "fmla v2.4s, v12.4s, v14.4s\n"
- "ldr q16, [x22, x27]\n"
- "movi v11.16b, #0\n" // v11's input value was consumed above; it now holds the lower clamp bound 0
- "ldr q18, [x21, x27]\n"
- "fmla v3.4s, v17.4s, v13.4s\n"
- "ldr x20, [%[inptrs], 160]\n"
- "fmla v8.4s, v12.4s, v7.4s\n"
- "ldr x23, [%[inptrs], 128]\n"
- "fmla v2.4s, v16.4s, v13.4s\n"
- "ldr q19, [x20, x27]\n"
- "fmov v12.4s, #6.0\n" // v12's input value was consumed above; it now holds the upper clamp bound 6.0
- "ldr q17, [x23, x27]\n"
- "fmla v3.4s, v15.4s, v10.4s\n"
- "ldr x20, [%[inptrs], 96]\n"
- "fmla v8.4s, v15.4s, v6.4s\n"
- "ldr x22, [%[inptrs], 64]\n"
- "fmla v2.4s, v18.4s, v10.4s\n"
- "ldr q15, [x20, x27]\n"
- "fmla v4.4s, v15.4s, v14.4s\n"
- "ldr q18, [x22, x27]\n"
- "fmla v3.4s, v19.4s, v9.4s\n"
- "ldr x21, [%[inptrs], 32]\n"
- "fmla v8.4s, v16.4s, v0.4s\n"
- "ldr x20, [%[inptrs], 168]\n"
- "fmla v2.4s, v15.4s, v9.4s\n"
- "ldr q19, [x21, x27]\n"
- "ldr q16, [x20, x27]\n"
- "ldr x23, [%[inptrs], 136]\n"
- "fmla v3.4s, v17.4s, v5.4s\n"
- "ldr x20, [%[inptrs], 104]\n"
- "fmla v8.4s, v15.4s, v1.4s\n" // ninth and last product for v8
- "ldr q14, [x23, x27]\n" // weight register v14 is recycled for input data from here (v13, v10 and v9 follow the same pattern below)
- "fmla v2.4s, v18.4s, v5.4s\n"
- "ldr q17, [x20, x27]\n"
- "fmla v4.4s, v14.4s, v13.4s\n"
- "ldr x22, [%[inptrs], 72]\n"
- "fmla v3.4s, v15.4s, v7.4s\n"
- "ldr x20, [%[inptrs], 176]\n"
- "fmax v8.4s, v8.4s, v11.4s\n" // clamp below at 0
- "ldr q18, [x22, x27]\n"
- "fmla v2.4s, v19.4s, v7.4s\n"
- "ldr q13, [x20, x27]\n"
- "fmla v4.4s, v17.4s, v10.4s\n"
- "ldr x23, [%[inptrs], 144]\n"
- "fmla v3.4s, v16.4s, v6.4s\n"
- "ldr x20, [%[inptrs], 112]\n"
- "fmin v8.4s, v8.4s, v12.4s\n" // clamp above at 6.0
- "ldr q10, [x23, x27]\n"
- "fmla v2.4s, v17.4s, v6.4s\n"
- "ldr q15, [x20, x27]\n"
- "fmla v4.4s, v13.4s, v9.4s\n"
- "ldr x20, [%[inptrs], 184]\n"
- "fmla v3.4s, v14.4s, v0.4s\n"
- "ldr x23, [%[inptrs], 152]\n"
- "ldr q9, [x20, x27]\n"
- "ldr x22, [%[outptrs], 0]\n"
- "fmla v2.4s, v18.4s, v0.4s\n"
- "ldr q19, [x23, x27]\n"
- "str q8, [x22, x28]\n" // first output of this pass
- "fmla v4.4s, v10.4s, v5.4s\n"
- "fmla v3.4s, v13.4s, v1.4s\n"
- "ldr x20, [%[inptrs], 192]\n"
- "ldr x22, [%[outptrs], 8]\n"
- "ldr x24, [%[outptrs], 16]\n"
- "add %[wbptr], %[wbptr], #160\n" // advance to the next four-channel parameter group (10 x 16 bytes)
- "fmla v2.4s, v15.4s, v1.4s\n"
- "ldr q16, [x20, x27]\n"
- "fmla v4.4s, v15.4s, v7.4s\n"
- "ldr q15, [%[wbptr]]\n" // start reloading parameters for the next pass
- "fmax v3.4s, v3.4s, v11.4s\n"
- "ldr q14, [%[wbptr], #16]\n"
- "mov v8.16b, v15.16b\n"
- "ldr q10, [%[wbptr], #32]\n"
- "fmax v2.4s, v2.4s, v11.4s\n"
- "ldr q13, [%[wbptr], #64]\n"
- "fmla v4.4s, v9.4s, v6.4s\n"
- "ldr q7, [%[wbptr], #48]\n"
- "fmin v3.4s, v3.4s, v12.4s\n"
- "ldr q5, [%[wbptr], #80]\n"
- "fmin v2.4s, v2.4s, v12.4s\n"
- "ldr q9, [%[wbptr], #112]\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "add x27, x27, #16\n" // inputs of the next pass are 16 bytes further on; outputs still use the old x28
- "str q3, [x24, x28]\n"
- "fmla v4.4s, v19.4s, v0.4s\n"
- "str q2, [x22, x28]\n"
- "mov v3.16b, v15.16b\n"
- "mov v2.16b, v15.16b\n"
- "ldr q6, [%[wbptr], #128]\n"
- "ldr x24, [%[outptrs], 24]\n"
- "ldr x21, [%[inptrs], 0]\n"
- "ldr x22, [%[inptrs], 40]\n"
- "fmla v4.4s, v16.4s, v1.4s\n"
- "ldr q0, [%[wbptr], #96]\n"
- "ldr q17, [x21, x27]\n"
- "ldr x20, [%[inptrs], 80]\n"
- "fmla v8.4s, v17.4s, v14.4s\n"
- "ldr q18, [x22, x27]\n"
- "ldr q16, [x20, x27]\n"
- "ldr x21, [%[inptrs], 8]\n"
- "fmax v4.4s, v4.4s, v11.4s\n" // last use of v11 as the zero bound before it is reloaded with input data
- "ldr q1, [%[wbptr], #144]\n"
- "ldr q11, [x21, x27]\n"
- "ldr x22, [%[inptrs], 48]\n"
- "fmla v8.4s, v18.4s, v13.4s\n"
- "ldr x21, [%[inptrs], 16]\n"
- "fmin v4.4s, v4.4s, v12.4s\n" // last use of v12 as the 6.0 bound before it is reloaded with input data
- "ldr q19, [x22, x27]\n"
- "ldr q12, [x21, x27]\n"
- "ldr x23, [%[inptrs], 120]\n"
- "ldr x20, [%[inptrs], 88]\n"
- "subs x25, x25, #1\n" // flags consumed by "bne 2b"
- "str q4, [x24, x28]\n"
- "mov v4.16b, v15.16b\n" // v4 is re-seeded only after its store; v15 is then reused for input data
- "ldr q17, [x23, x27]\n"
- "fmla v8.4s, v11.4s, v10.4s\n"
- "ldr q15, [x20, x27]\n"
- "add x28, x28, #16\n" // all four stores done: advance the output offset
- "fmla v8.4s, v16.4s, v9.4s\n"
- "bne 2b\n"
- "3:\n" // Final four-lane pass: same arithmetic as the loop body, without preloading a following pass
- "fmla v3.4s, v16.4s, v14.4s\n"
- "ldr x22, [%[inptrs], 56]\n"
- "fmla v8.4s, v19.4s, v5.4s\n"
- "ldr x21, [%[inptrs], 24]\n"
- "fmla v2.4s, v12.4s, v14.4s\n"
- "ldr q16, [x22, x27]\n"
- "movi v11.16b, #0\n"
- "ldr q18, [x21, x27]\n"
- "fmla v3.4s, v17.4s, v13.4s\n"
- "ldr x20, [%[inptrs], 160]\n"
- "fmla v8.4s, v12.4s, v7.4s\n"
- "ldr x23, [%[inptrs], 128]\n"
- "fmla v2.4s, v16.4s, v13.4s\n"
- "ldr q19, [x20, x27]\n"
- "fmov v12.4s, #6.0\n"
- "ldr q17, [x23, x27]\n"
- "fmla v3.4s, v15.4s, v10.4s\n"
- "ldr x20, [%[inptrs], 96]\n"
- "fmla v8.4s, v15.4s, v6.4s\n"
- "ldr x22, [%[inptrs], 64]\n"
- "fmla v2.4s, v18.4s, v10.4s\n"
- "ldr q15, [x20, x27]\n"
- "fmla v4.4s, v15.4s, v14.4s\n"
- "ldr q18, [x22, x27]\n"
- "fmla v3.4s, v19.4s, v9.4s\n"
- "ldr x21, [%[inptrs], 32]\n"
- "fmla v8.4s, v16.4s, v0.4s\n"
- "ldr x20, [%[inptrs], 168]\n"
- "fmla v2.4s, v15.4s, v9.4s\n"
- "ldr q19, [x21, x27]\n"
- "ldr q16, [x20, x27]\n"
- "ldr x23, [%[inptrs], 136]\n"
- "fmla v3.4s, v17.4s, v5.4s\n"
- "ldr x20, [%[inptrs], 104]\n"
- "fmla v8.4s, v15.4s, v1.4s\n"
- "ldr q14, [x23, x27]\n"
- "fmla v2.4s, v18.4s, v5.4s\n"
- "ldr q17, [x20, x27]\n"
- "fmla v4.4s, v14.4s, v13.4s\n"
- "ldr x22, [%[inptrs], 72]\n"
- "fmla v3.4s, v15.4s, v7.4s\n"
- "ldr x20, [%[inptrs], 176]\n"
- "fmax v8.4s, v8.4s, v11.4s\n"
- "ldr q18, [x22, x27]\n"
- "fmla v2.4s, v19.4s, v7.4s\n"
- "ldr q13, [x20, x27]\n"
- "fmla v4.4s, v17.4s, v10.4s\n"
- "ldr x23, [%[inptrs], 144]\n"
- "fmla v3.4s, v16.4s, v6.4s\n"
- "ldr x20, [%[inptrs], 112]\n"
- "fmin v8.4s, v8.4s, v12.4s\n"
- "ldr q10, [x23, x27]\n"
- "fmla v2.4s, v17.4s, v6.4s\n"
- "ldr q15, [x20, x27]\n"
- "fmla v4.4s, v13.4s, v9.4s\n"
- "ldr x20, [%[inptrs], 184]\n"
- "fmla v3.4s, v14.4s, v0.4s\n"
- "ldr x23, [%[inptrs], 152]\n"
- "ldr q9, [x20, x27]\n"
- "ldr x22, [%[outptrs], 0]\n"
- "fmla v2.4s, v18.4s, v0.4s\n"
- "ldr q19, [x23, x27]\n"
- "str q8, [x22, x28]\n"
- "fmla v4.4s, v10.4s, v5.4s\n"
- "fmla v3.4s, v13.4s, v1.4s\n"
- "ldr x20, [%[inptrs], 192]\n"
- "ldr x22, [%[outptrs], 8]\n"
- "ldr x24, [%[outptrs], 16]\n"
- "add %[wbptr], %[wbptr], #160\n" // leaves wbptr at the first parameter of the leftover channels
- "fmla v2.4s, v15.4s, v1.4s\n"
- "ldr q16, [x20, x27]\n"
- "fmla v4.4s, v15.4s, v7.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmax v3.4s, v3.4s, v11.4s\n"
- "add x27, x27, #16\n"
- "fmax v2.4s, v2.4s, v11.4s\n"
- "fmla v4.4s, v9.4s, v6.4s\n"
- "fmin v3.4s, v3.4s, v12.4s\n"
- "fmin v2.4s, v2.4s, v12.4s\n"
- "str q3, [x24, x28]\n"
- "fmla v4.4s, v19.4s, v0.4s\n"
- "str q2, [x22, x28]\n"
- "ldr x24, [%[outptrs], 24]\n"
- "fmla v4.4s, v16.4s, v1.4s\n"
- "fmax v4.4s, v4.4s, v11.4s\n"
- "fmin v4.4s, v4.4s, v12.4s\n"
- "str q4, [x24, x28]\n"
- "add x28, x28, #16\n"
- "4:\n" // Single-lane path for the (n_channels & 3) leftover channels; x27/x28 carry on from the four-lane path
- "cbz x26, 7f\n" // no leftovers: done
- "ldr s15, [%[wbptr]]\n" // same schedule as above with s-register loads/stores; arithmetic keeps the .4s form but only lane 0 is stored
- "mov v8.16b, v15.16b\n"
- "ldr s14, [%[wbptr], #4]\n" // parameters are now 4 bytes apart (offsets 4..36)
- "mov v3.16b, v15.16b\n"
- "ldr s10, [%[wbptr], #8]\n"
- "mov v2.16b, v15.16b\n"
- "ldr s7, [%[wbptr], #12]\n"
- "mov v4.16b, v15.16b\n"
- "ldr s13, [%[wbptr], #16]\n"
- "ldr s5, [%[wbptr], #20]\n"
- "ldr x21, [%[inptrs], 0]\n"
- "ldr s0, [%[wbptr], #24]\n"
- "ldr x22, [%[inptrs], 40]\n"
- "ldr s9, [%[wbptr], #28]\n"
- "ldr x20, [%[inptrs], 80]\n"
- "ldr s6, [%[wbptr], #32]\n"
- "ldr x23, [%[inptrs], 120]\n"
- "ldr s1, [%[wbptr], #36]\n"
- "subs x26, x26, #1\n" // sets the flags tested by "beq 6f"
- "ldr s17, [x21, x27]\n"
- "ldr s18, [x22, x27]\n"
- "fmla v8.4s, v17.4s, v14.4s\n"
- "ldr s16, [x20, x27]\n"
- "ldr s17, [x23, x27]\n"
- "ldr x21, [%[inptrs], 8]\n"
- "ldr x22, [%[inptrs], 48]\n"
- "ldr x20, [%[inptrs], 88]\n"
- "ldr s11, [x21, x27]\n"
- "fmla v8.4s, v18.4s, v13.4s\n"
- "ldr s19, [x22, x27]\n"
- "ldr s15, [x20, x27]\n"
- "ldr x21, [%[inptrs], 16]\n"
- "ldr s12, [x21, x27]\n"
- "fmla v8.4s, v11.4s, v10.4s\n"
- "fmla v8.4s, v16.4s, v9.4s\n"
- "beq 6f\n" // exactly one leftover channel: skip the single-lane loop
- "5:\n" // Single-lane steady-state loop, one channel per iteration
- "fmla v3.4s, v16.4s, v14.4s\n"
- "ldr x22, [%[inptrs], 56]\n"
- "fmla v8.4s, v19.4s, v5.4s\n"
- "ldr x21, [%[inptrs], 24]\n"
- "fmla v2.4s, v12.4s, v14.4s\n"
- "ldr s16, [x22, x27]\n"
- "movi v11.16b, #0\n"
- "ldr s18, [x21, x27]\n"
- "fmla v3.4s, v17.4s, v13.4s\n"
- "ldr x20, [%[inptrs], 160]\n"
- "fmla v8.4s, v12.4s, v7.4s\n"
- "ldr x23, [%[inptrs], 128]\n"
- "fmla v2.4s, v16.4s, v13.4s\n"
- "ldr s19, [x20, x27]\n"
- "fmov v12.4s, #6.0\n"
- "ldr s17, [x23, x27]\n"
- "fmla v3.4s, v15.4s, v10.4s\n"
- "ldr x20, [%[inptrs], 96]\n"
- "fmla v8.4s, v15.4s, v6.4s\n"
- "ldr x22, [%[inptrs], 64]\n"
- "fmla v2.4s, v18.4s, v10.4s\n"
- "ldr s15, [x20, x27]\n"
- "fmla v4.4s, v15.4s, v14.4s\n"
- "ldr s18, [x22, x27]\n"
- "fmla v3.4s, v19.4s, v9.4s\n"
- "ldr x21, [%[inptrs], 32]\n"
- "fmla v8.4s, v16.4s, v0.4s\n"
- "ldr x20, [%[inptrs], 168]\n"
- "fmla v2.4s, v15.4s, v9.4s\n"
- "ldr s19, [x21, x27]\n"
- "ldr s16, [x20, x27]\n"
- "ldr x23, [%[inptrs], 136]\n"
- "fmla v3.4s, v17.4s, v5.4s\n"
- "ldr x20, [%[inptrs], 104]\n"
- "fmla v8.4s, v15.4s, v1.4s\n"
- "ldr s14, [x23, x27]\n"
- "fmla v2.4s, v18.4s, v5.4s\n"
- "ldr s17, [x20, x27]\n"
- "fmla v4.4s, v14.4s, v13.4s\n"
- "ldr x22, [%[inptrs], 72]\n"
- "fmla v3.4s, v15.4s, v7.4s\n"
- "ldr x20, [%[inptrs], 176]\n"
- "fmax v8.4s, v8.4s, v11.4s\n"
- "ldr s18, [x22, x27]\n"
- "fmla v2.4s, v19.4s, v7.4s\n"
- "ldr s13, [x20, x27]\n"
- "fmla v4.4s, v17.4s, v10.4s\n"
- "ldr x23, [%[inptrs], 144]\n"
- "fmla v3.4s, v16.4s, v6.4s\n"
- "ldr x20, [%[inptrs], 112]\n"
- "fmin v8.4s, v8.4s, v12.4s\n"
- "ldr s10, [x23, x27]\n"
- "fmla v2.4s, v17.4s, v6.4s\n"
- "ldr s15, [x20, x27]\n"
- "fmla v4.4s, v13.4s, v9.4s\n"
- "ldr x20, [%[inptrs], 184]\n"
- "fmla v3.4s, v14.4s, v0.4s\n"
- "ldr x23, [%[inptrs], 152]\n"
- "ldr s9, [x20, x27]\n"
- "ldr x22, [%[outptrs], 0]\n"
- "fmla v2.4s, v18.4s, v0.4s\n"
- "ldr s19, [x23, x27]\n"
- "str s8, [x22, x28]\n"
- "fmla v4.4s, v10.4s, v5.4s\n"
- "fmla v3.4s, v13.4s, v1.4s\n"
- "ldr x20, [%[inptrs], 192]\n"
- "ldr x22, [%[outptrs], 8]\n"
- "ldr x24, [%[outptrs], 16]\n"
- "add %[wbptr], %[wbptr], #40\n" // advance to the next channel's parameters (10 x 4 bytes)
- "fmla v2.4s, v15.4s, v1.4s\n"
- "ldr s16, [x20, x27]\n"
- "fmla v4.4s, v15.4s, v7.4s\n"
- "ldr s15, [%[wbptr]]\n"
- "fmax v3.4s, v3.4s, v11.4s\n"
- "ldr s14, [%[wbptr], #4]\n"
- "mov v8.16b, v15.16b\n"
- "ldr s10, [%[wbptr], #8]\n"
- "fmax v2.4s, v2.4s, v11.4s\n"
- "ldr s13, [%[wbptr], #16]\n"
- "fmla v4.4s, v9.4s, v6.4s\n"
- "ldr s7, [%[wbptr], #12]\n"
- "fmin v3.4s, v3.4s, v12.4s\n"
- "ldr s5, [%[wbptr], #20]\n"
- "fmin v2.4s, v2.4s, v12.4s\n"
- "ldr s9, [%[wbptr], #28]\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "add x27, x27, #4\n" // next channel of input
- "str s3, [x24, x28]\n"
- "fmla v4.4s, v19.4s, v0.4s\n"
- "str s2, [x22, x28]\n"
- "mov v3.16b, v15.16b\n"
- "mov v2.16b, v15.16b\n"
- "ldr s6, [%[wbptr], #32]\n"
- "ldr x24, [%[outptrs], 24]\n"
- "ldr x21, [%[inptrs], 0]\n"
- "ldr x22, [%[inptrs], 40]\n"
- "fmla v4.4s, v16.4s, v1.4s\n"
- "ldr s0, [%[wbptr], #24]\n"
- "ldr s17, [x21, x27]\n"
- "ldr x20, [%[inptrs], 80]\n"
- "fmla v8.4s, v17.4s, v14.4s\n"
- "ldr s18, [x22, x27]\n"
- "ldr s16, [x20, x27]\n"
- "ldr x21, [%[inptrs], 8]\n"
- "fmax v4.4s, v4.4s, v11.4s\n"
- "ldr s1, [%[wbptr], #36]\n"
- "ldr s11, [x21, x27]\n"
- "ldr x22, [%[inptrs], 48]\n"
- "fmla v8.4s, v18.4s, v13.4s\n"
- "ldr x21, [%[inptrs], 16]\n"
- "fmin v4.4s, v4.4s, v12.4s\n"
- "ldr s19, [x22, x27]\n"
- "ldr s12, [x21, x27]\n"
- "ldr x23, [%[inptrs], 120]\n"
- "ldr x20, [%[inptrs], 88]\n"
- "subs x26, x26, #1\n" // flags consumed by "bne 5b"
- "str s4, [x24, x28]\n"
- "mov v4.16b, v15.16b\n"
- "ldr s17, [x23, x27]\n"
- "fmla v8.4s, v11.4s, v10.4s\n"
- "ldr s15, [x20, x27]\n"
- "add x28, x28, #4\n" // next channel of output
- "fmla v8.4s, v16.4s, v9.4s\n"
- "bne 5b\n"
- "6:\n" // Final single-lane pass, without preloading a following channel
- "fmla v3.4s, v16.4s, v14.4s\n"
- "ldr x22, [%[inptrs], 56]\n"
- "fmla v8.4s, v19.4s, v5.4s\n"
- "ldr x21, [%[inptrs], 24]\n"
- "fmla v2.4s, v12.4s, v14.4s\n"
- "ldr s16, [x22, x27]\n"
- "movi v11.16b, #0\n"
- "ldr s18, [x21, x27]\n"
- "fmla v3.4s, v17.4s, v13.4s\n"
- "ldr x20, [%[inptrs], 160]\n"
- "fmla v8.4s, v12.4s, v7.4s\n"
- "ldr x23, [%[inptrs], 128]\n"
- "fmla v2.4s, v16.4s, v13.4s\n"
- "ldr s19, [x20, x27]\n"
- "fmov v12.4s, #6.0\n"
- "ldr s17, [x23, x27]\n"
- "fmla v3.4s, v15.4s, v10.4s\n"
- "ldr x20, [%[inptrs], 96]\n"
- "fmla v8.4s, v15.4s, v6.4s\n"
- "ldr x22, [%[inptrs], 64]\n"
- "fmla v2.4s, v18.4s, v10.4s\n"
- "ldr s15, [x20, x27]\n"
- "fmla v4.4s, v15.4s, v14.4s\n"
- "ldr s18, [x22, x27]\n"
- "fmla v3.4s, v19.4s, v9.4s\n"
- "ldr x21, [%[inptrs], 32]\n"
- "fmla v8.4s, v16.4s, v0.4s\n"
- "ldr x20, [%[inptrs], 168]\n"
- "fmla v2.4s, v15.4s, v9.4s\n"
- "ldr s19, [x21, x27]\n"
- "ldr s16, [x20, x27]\n"
- "ldr x23, [%[inptrs], 136]\n"
- "fmla v3.4s, v17.4s, v5.4s\n"
- "ldr x20, [%[inptrs], 104]\n"
- "fmla v8.4s, v15.4s, v1.4s\n"
- "ldr s14, [x23, x27]\n"
- "fmla v2.4s, v18.4s, v5.4s\n"
- "ldr s17, [x20, x27]\n"
- "fmla v4.4s, v14.4s, v13.4s\n"
- "ldr x22, [%[inptrs], 72]\n"
- "fmla v3.4s, v15.4s, v7.4s\n"
- "ldr x20, [%[inptrs], 176]\n"
- "fmax v8.4s, v8.4s, v11.4s\n"
- "ldr s18, [x22, x27]\n"
- "fmla v2.4s, v19.4s, v7.4s\n"
- "ldr s13, [x20, x27]\n"
- "fmla v4.4s, v17.4s, v10.4s\n"
- "ldr x23, [%[inptrs], 144]\n"
- "fmla v3.4s, v16.4s, v6.4s\n"
- "ldr x20, [%[inptrs], 112]\n"
- "fmin v8.4s, v8.4s, v12.4s\n"
- "ldr s10, [x23, x27]\n"
- "fmla v2.4s, v17.4s, v6.4s\n"
- "ldr s15, [x20, x27]\n"
- "fmla v4.4s, v13.4s, v9.4s\n"
- "ldr x20, [%[inptrs], 184]\n"
- "fmla v3.4s, v14.4s, v0.4s\n"
- "ldr x23, [%[inptrs], 152]\n"
- "ldr s9, [x20, x27]\n"
- "ldr x22, [%[outptrs], 0]\n"
- "fmla v2.4s, v18.4s, v0.4s\n"
- "ldr s19, [x23, x27]\n"
- "str s8, [x22, x28]\n"
- "fmla v4.4s, v10.4s, v5.4s\n"
- "fmla v3.4s, v13.4s, v1.4s\n"
- "ldr x20, [%[inptrs], 192]\n"
- "ldr x22, [%[outptrs], 8]\n"
- "ldr x24, [%[outptrs], 16]\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v2.4s, v15.4s, v1.4s\n"
- "ldr s16, [x20, x27]\n"
- "fmla v4.4s, v15.4s, v7.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmax v3.4s, v3.4s, v11.4s\n"
- "add x27, x27, #4\n"
- "fmax v2.4s, v2.4s, v11.4s\n"
- "fmla v4.4s, v9.4s, v6.4s\n"
- "fmin v3.4s, v3.4s, v12.4s\n"
- "fmin v2.4s, v2.4s, v12.4s\n"
- "str s3, [x24, x28]\n"
- "fmla v4.4s, v19.4s, v0.4s\n"
- "str s2, [x22, x28]\n"
- "ldr x24, [%[outptrs], 24]\n"
- "fmla v4.4s, v16.4s, v1.4s\n"
- "fmax v4.4s, v4.4s, v11.4s\n"
- "fmin v4.4s, v4.4s, v12.4s\n"
- "str s4, [x24, x28]\n"
- "add x28, x28, #4\n"
- "7:\n" // Exit
- : [wbptr] "+r" (weight_bias_ptr) // read-write operand: the asm advances the parameter pointer
- : [inptrs] "r" (inptrs), [outptrs] "r" (outptrs), [n_channels] "r" ((long) n_channels) // n_channels is widened to long before being bound to the operand used by the and/lsr on x registers
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" // every scratch register named above (v0-v19, x20-x28), plus flags and memory
- );
-}
-
-#endif // __aarch64__
-
-template class DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>; // Explicit instantiation of the class template for this parameter set.
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
deleted file mode 100644
index 2142c431ac..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
+++ /dev/null
@@ -1,2341 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-namespace depthwise
-{
-
-using namespace neon_convolution_kernels; // Brings the neon_convolution_kernels names into scope; the code below uses ActivationFunction unqualified.
-using Conv = DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>; // Shorthand for the instantiation whose execute_tile members are specialised below.
-
-#ifdef __aarch64__
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x20, %[inptr0], %[input_row_stride]\n"
- "add x13, %[input_col_stride1], %[input_col_stride1]\n"
- "add x24, %[outptr0], %[output_row_stride]\n"
- "add x21, x20, %[input_row_stride]\n"
- "add x14, x13, #64\n"
- "add x15, x13, %[input_col_stride1]\n"
- "add x22, x21, %[input_row_stride]\n"
- "add x16, x15, #64\n"
- "add x17, x15, %[input_col_stride1]\n"
- "add x23, x22, %[input_row_stride]\n"
- "add x9, x17, #64\n"
- "add x25, x24, %[output_row_stride]\n"
- "add x26, %[output_col_stride1], %[output_col_stride1]\n"
- "and x27, %[n_channels], #3\n"
- "lsr x28, %[n_channels], #2\n"
- "cbz x28, 4f\n"
- "1:\n"
- "ldr q25, [%[wbptr]]\n"
- "subs x28, x28, #1\n"
- "mov v17.16b, v25.16b\n"
- "ldr q16, [%[wbptr], #16]\n"
- "mov v13.16b, v25.16b\n"
- "ldr q7, [%[wbptr], #32]\n"
- "mov v15.16b, v25.16b\n"
- "ldr q6, [%[wbptr], #48]\n"
- "mov v10.16b, v25.16b\n"
- "ldr q5, [%[wbptr], #64]\n"
- "mov v12.16b, v25.16b\n"
- "ldr q4, [%[wbptr], #80]\n"
- "mov v14.16b, v25.16b\n"
- "ldr q3, [%[wbptr], #96]\n"
- "mov v9.16b, v25.16b\n"
- "ldr q2, [%[wbptr], #112]\n"
- "mov v11.16b, v25.16b\n"
- "ldr q1, [%[wbptr], #128]\n"
- "mov v8.16b, v25.16b\n"
- "ldr q0, [%[wbptr], #144]\n"
- "ldr q26, [%[inptr0]]\n"
- "ldr q28, [x20]\n"
- "fmla v17.4s, v26.4s, v16.4s\n"
- "ldr q29, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v13.4s, v28.4s, v16.4s\n"
- "ldr q27, [x21]\n"
- "fmla v15.4s, v29.4s, v16.4s\n"
- "ldr q21, [x20, %[input_col_stride1]]\n"
- "fmla v17.4s, v28.4s, v5.4s\n"
- "ldr q20, [%[inptr0], x13]\n"
- "ldr q23, [x22]\n"
- "ldr q19, [x21, %[input_col_stride1]]\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x20, #64]\n"
- "fmla v17.4s, v29.4s, v7.4s\n"
- "prfm pldl1keep, [%[inptr0], x19]\n"
- "prfm pldl1keep, [x21, #64]\n"
- "prfm pldl1keep, [x20, x19]\n"
- "prfm pldl1keep, [%[inptr0], x14]\n"
- "prfm pldl1keep, [x22, #64]\n"
- "prfm pldl1keep, [x21, x19]\n"
- "beq 3f\n"
- "2:\n"
- "fmla v17.4s, v27.4s, v2.4s\n"
- "ldr q30, [x20, x13]\n"
- "fmla v13.4s, v27.4s, v5.4s\n"
- "ldr q29, [%[inptr0], x15]\n"
- "fmla v10.4s, v27.4s, v16.4s\n"
- "ldr q28, [x23]\n"
- "fmla v17.4s, v21.4s, v4.4s\n"
- "ldr q24, [x22, %[input_col_stride1]]\n"
- "fmla v13.4s, v21.4s, v7.4s\n"
- "ldr q18, [x21, x13]\n"
- "fmla v15.4s, v21.4s, v5.4s\n"
- "prfm pldl1keep, [x20, x14]\n"
- "fmla v12.4s, v21.4s, v16.4s\n"
- "ldr q22, [x20, x15]\n"
- "fmla v17.4s, v20.4s, v6.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v15.4s, v20.4s, v7.4s\n"
- "prfm pldl1keep, [x23, #64]\n"
- "fmla v14.4s, v20.4s, v16.4s\n"
- "ldr q25, [%[inptr0], x17]\n"
- "fmla v13.4s, v23.4s, v2.4s\n"
- "prfm pldl1keep, [x22, x19]\n"
- "fmla v10.4s, v23.4s, v5.4s\n"
- "ldr q26, [x23, %[input_col_stride1]]\n"
- "fmla v17.4s, v19.4s, v1.4s\n"
- "prfm pldl1keep, [x21, x14]\n"
- "fmla v13.4s, v19.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x16]\n"
- "fmla v15.4s, v19.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x9]\n"
- "fmla v10.4s, v19.4s, v7.4s\n"
- "prfm pldl1keep, [x23, x19]\n"
- "fmla v12.4s, v19.4s, v5.4s\n"
- "prfm pldl1keep, [x22, x14]\n"
- "fmla v9.4s, v19.4s, v16.4s\n"
- "ldr q27, [x22, x13]\n"
- "fmla v17.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x21, x16]\n"
- "fmla v13.4s, v30.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x9]\n"
- "fmla v15.4s, v30.4s, v4.4s\n"
- "prfm pldl1keep, [x23, x14]\n"
- "fmla v12.4s, v30.4s, v7.4s\n"
- "prfm pldl1keep, [x22, x16]\n"
- "fmla v14.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x21, x9]\n"
- "fmla v11.4s, v30.4s, v16.4s\n"
- "ldr q21, [x21, x15]\n"
- "fmla v15.4s, v29.4s, v6.4s\n"
- "prfm pldl1keep, [x23, x16]\n"
- "fmla v14.4s, v29.4s, v7.4s\n"
- "ldr q20, [x20, x17]\n"
- "fmla v10.4s, v28.4s, v2.4s\n"
- "ldr q19, [x23, x13]\n"
- "fmla v13.4s, v24.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x9]\n"
- "fmla v12.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x9]\n"
- "fmla v10.4s, v24.4s, v4.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v9.4s, v24.4s, v5.4s\n"
- "ldr q23, [x22, x15]\n"
- "fmla v17.4s, v18.4s, v0.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v13.4s, v18.4s, v3.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v15.4s, v18.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "str q17, [%[outptr0]]\n"
- "fmla v10.4s, v18.4s, v6.4s\n"
- "fmla v12.4s, v18.4s, v4.4s\n"
- "ldr q17, [x21, x17]\n"
- "fmla v14.4s, v18.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x19]\n"
- "fmla v9.4s, v18.4s, v7.4s\n"
- "prfm pldl1keep, [%[inptr0], x14]\n"
- "fmla v11.4s, v18.4s, v5.4s\n"
- "add x20, x20, #16\n"
- "fmla v8.4s, v18.4s, v16.4s\n"
- "ldr q24, [x23, x15]\n"
- "fmla v15.4s, v22.4s, v3.4s\n"
- "ldr q18, [x22, x17]\n"
- "fmla v12.4s, v22.4s, v6.4s\n"
- "prfm pldl1keep, [x20, #64]\n"
- "fmla v14.4s, v22.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x19]\n"
- "fmla v11.4s, v22.4s, v7.4s\n"
- "ldr q22, [x23, x17]\n"
- "fmla v10.4s, v26.4s, v1.4s\n"
- "add x21, x21, #16\n"
- "fmla v14.4s, v25.4s, v6.4s\n"
- "ldr q25, [%[wbptr]]\n"
- "fmla v9.4s, v26.4s, v2.4s\n"
- "ldr q16, [%[wbptr], #16]\n"
- "fmla v13.4s, v27.4s, v0.4s\n"
- "prfm pldl1keep, [x21, #64]\n"
- "fmla v10.4s, v27.4s, v3.4s\n"
- "prfm pldl1keep, [x21, x19]\n"
- "fmla v12.4s, v27.4s, v1.4s\n"
- "add x22, x22, #16\n"
- "str q13, [x24]\n"
- "fmla v9.4s, v27.4s, v4.4s\n"
- "fmla v11.4s, v27.4s, v2.4s\n"
- "ldr q26, [%[inptr0]]\n"
- "fmla v8.4s, v27.4s, v5.4s\n"
- "ldr q28, [x20]\n"
- "fmla v15.4s, v21.4s, v0.4s\n"
- "ldr q29, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v12.4s, v21.4s, v3.4s\n"
- "prfm pldl1keep, [x22, #64]\n"
- "fmla v14.4s, v21.4s, v1.4s\n"
- "add x23, x23, #16\n"
- "str q15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v9.4s, v21.4s, v6.4s\n"
- "fmla v11.4s, v21.4s, v4.4s\n"
- "ldr q5, [%[wbptr], #64]\n"
- "fmla v8.4s, v21.4s, v7.4s\n"
- "ldr q27, [x21]\n"
- "fmla v14.4s, v20.4s, v3.4s\n"
- "ldr q21, [x20, %[input_col_stride1]]\n"
- "fmla v11.4s, v20.4s, v6.4s\n"
- "ldr q20, [%[inptr0], x13]\n"
- "fmla v10.4s, v19.4s, v0.4s\n"
- "subs x28, x28, #1\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "fmla v8.4s, v19.4s, v2.4s\n"
- "fmla v12.4s, v23.4s, v0.4s\n"
- "ldr q7, [%[wbptr], #32]\n"
- "str q10, [x25]\n"
- "fmla v11.4s, v23.4s, v1.4s\n"
- "fmla v9.4s, v23.4s, v3.4s\n"
- "ldr q2, [%[wbptr], #112]\n"
- "str q12, [x24, %[output_col_stride1]]\n"
- "fmla v8.4s, v23.4s, v4.4s\n"
- "fmla v14.4s, v17.4s, v0.4s\n"
- "ldr q23, [x22]\n"
- "fmla v11.4s, v17.4s, v3.4s\n"
- "ldr q19, [x21, %[input_col_stride1]]\n"
- "fmla v8.4s, v17.4s, v6.4s\n"
- "ldr q4, [%[wbptr], #80]\n"
- "str q14, [%[outptr0], x26]\n"
- "fmla v9.4s, v24.4s, v0.4s\n"
- "fmla v11.4s, v18.4s, v0.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v8.4s, v24.4s, v1.4s\n"
- "ldr q6, [%[wbptr], #48]\n"
- "str q9, [x25, %[output_col_stride1]]\n"
- "mov v17.16b, v25.16b\n"
- "str q11, [x24, x26]\n"
- "mov v13.16b, v25.16b\n"
- "fmla v8.4s, v18.4s, v3.4s\n"
- "ldr q1, [%[wbptr], #128]\n"
- "mov v15.16b, v25.16b\n"
- "add x24, x24, #16\n"
- "mov v10.16b, v25.16b\n"
- "mov v12.16b, v25.16b\n"
- "fmla v8.4s, v22.4s, v0.4s\n"
- "ldr q3, [%[wbptr], #96]\n"
- "mov v14.16b, v25.16b\n"
- "mov v9.16b, v25.16b\n"
- "mov v11.16b, v25.16b\n"
- "fmla v17.4s, v26.4s, v16.4s\n"
- "str q8, [x25, x26]\n"
- "fmla v13.4s, v28.4s, v16.4s\n"
- "mov v8.16b, v25.16b\n"
- "ldr q0, [%[wbptr], #144]\n"
- "fmla v17.4s, v28.4s, v5.4s\n"
- "fmla v15.4s, v29.4s, v16.4s\n"
- "add x25, x25, #16\n"
- "fmla v17.4s, v29.4s, v7.4s\n"
- "bne 2b\n"
- "3:\n"
- "fmla v17.4s, v27.4s, v2.4s\n"
- "ldr q30, [x20, x13]\n"
- "fmla v13.4s, v27.4s, v5.4s\n"
- "ldr q29, [%[inptr0], x15]\n"
- "fmla v10.4s, v27.4s, v16.4s\n"
- "ldr q28, [x23]\n"
- "fmla v17.4s, v21.4s, v4.4s\n"
- "ldr q24, [x22, %[input_col_stride1]]\n"
- "fmla v13.4s, v21.4s, v7.4s\n"
- "ldr q18, [x21, x13]\n"
- "fmla v15.4s, v21.4s, v5.4s\n"
- "prfm pldl1keep, [x20, x14]\n"
- "fmla v12.4s, v21.4s, v16.4s\n"
- "ldr q22, [x20, x15]\n"
- "fmla v17.4s, v20.4s, v6.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v15.4s, v20.4s, v7.4s\n"
- "prfm pldl1keep, [x23, #64]\n"
- "fmla v14.4s, v20.4s, v16.4s\n"
- "ldr q25, [%[inptr0], x17]\n"
- "fmla v13.4s, v23.4s, v2.4s\n"
- "prfm pldl1keep, [x22, x19]\n"
- "fmla v10.4s, v23.4s, v5.4s\n"
- "ldr q26, [x23, %[input_col_stride1]]\n"
- "fmla v17.4s, v19.4s, v1.4s\n"
- "prfm pldl1keep, [x21, x14]\n"
- "fmla v13.4s, v19.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x16]\n"
- "fmla v15.4s, v19.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x9]\n"
- "fmla v10.4s, v19.4s, v7.4s\n"
- "prfm pldl1keep, [x23, x19]\n"
- "fmla v12.4s, v19.4s, v5.4s\n"
- "prfm pldl1keep, [x22, x14]\n"
- "fmla v9.4s, v19.4s, v16.4s\n"
- "ldr q27, [x22, x13]\n"
- "fmla v17.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x21, x16]\n"
- "fmla v13.4s, v30.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x9]\n"
- "fmla v15.4s, v30.4s, v4.4s\n"
- "prfm pldl1keep, [x23, x14]\n"
- "fmla v12.4s, v30.4s, v7.4s\n"
- "prfm pldl1keep, [x22, x16]\n"
- "fmla v14.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x21, x9]\n"
- "fmla v11.4s, v30.4s, v16.4s\n"
- "ldr q21, [x21, x15]\n"
- "fmla v15.4s, v29.4s, v6.4s\n"
- "prfm pldl1keep, [x23, x16]\n"
- "fmla v14.4s, v29.4s, v7.4s\n"
- "ldr q20, [x20, x17]\n"
- "fmla v10.4s, v28.4s, v2.4s\n"
- "ldr q19, [x23, x13]\n"
- "fmla v13.4s, v24.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x9]\n"
- "fmla v12.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x9]\n"
- "fmla v10.4s, v24.4s, v4.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v9.4s, v24.4s, v5.4s\n"
- "ldr q23, [x22, x15]\n"
- "fmla v17.4s, v18.4s, v0.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v13.4s, v18.4s, v3.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v15.4s, v18.4s, v1.4s\n"
- "add x20, x20, #16\n"
- "str q17, [%[outptr0]]\n"
- "fmla v10.4s, v18.4s, v6.4s\n"
- "fmla v12.4s, v18.4s, v4.4s\n"
- "ldr q17, [x21, x17]\n"
- "fmla v14.4s, v18.4s, v2.4s\n"
- "add x21, x21, #16\n"
- "fmla v9.4s, v18.4s, v7.4s\n"
- "fmla v11.4s, v18.4s, v5.4s\n"
- "fmla v8.4s, v18.4s, v16.4s\n"
- "ldr q24, [x23, x15]\n"
- "fmla v15.4s, v22.4s, v3.4s\n"
- "ldr q18, [x22, x17]\n"
- "fmla v12.4s, v22.4s, v6.4s\n"
- "add x22, x22, #16\n"
- "fmla v14.4s, v22.4s, v4.4s\n"
- "fmla v11.4s, v22.4s, v7.4s\n"
- "fmla v10.4s, v26.4s, v1.4s\n"
- "ldr q22, [x23, x17]\n"
- "fmla v9.4s, v26.4s, v2.4s\n"
- "add x23, x23, #16\n"
- "fmla v14.4s, v25.4s, v6.4s\n"
- "fmla v13.4s, v27.4s, v0.4s\n"
- "fmla v10.4s, v27.4s, v3.4s\n"
- "fmla v12.4s, v27.4s, v1.4s\n"
- "fmla v9.4s, v27.4s, v4.4s\n"
- "fmla v11.4s, v27.4s, v2.4s\n"
- "str q13, [x24]\n"
- "fmla v8.4s, v27.4s, v5.4s\n"
- "fmla v15.4s, v21.4s, v0.4s\n"
- "fmla v12.4s, v21.4s, v3.4s\n"
- "fmla v14.4s, v21.4s, v1.4s\n"
- "fmla v9.4s, v21.4s, v6.4s\n"
- "fmla v11.4s, v21.4s, v4.4s\n"
- "fmla v8.4s, v21.4s, v7.4s\n"
- "str q15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v10.4s, v19.4s, v0.4s\n"
- "fmla v14.4s, v20.4s, v3.4s\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "fmla v11.4s, v20.4s, v6.4s\n"
- "fmla v8.4s, v19.4s, v2.4s\n"
- "str q10, [x25]\n"
- "fmla v12.4s, v23.4s, v0.4s\n"
- "fmla v9.4s, v23.4s, v3.4s\n"
- "fmla v14.4s, v17.4s, v0.4s\n"
- "fmla v11.4s, v23.4s, v1.4s\n"
- "fmla v8.4s, v23.4s, v4.4s\n"
- "str q12, [x24, %[output_col_stride1]]\n"
- "fmla v9.4s, v24.4s, v0.4s\n"
- "str q14, [%[outptr0], x26]\n"
- "fmla v11.4s, v17.4s, v3.4s\n"
- "fmla v8.4s, v17.4s, v6.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "str q9, [x25, %[output_col_stride1]]\n"
- "fmla v11.4s, v18.4s, v0.4s\n"
- "fmla v8.4s, v24.4s, v1.4s\n"
- "str q11, [x24, x26]\n"
- "fmla v8.4s, v18.4s, v3.4s\n"
- "add x24, x24, #16\n"
- "fmla v8.4s, v22.4s, v0.4s\n"
- "str q8, [x25, x26]\n"
- "add x25, x25, #16\n"
- "4:\n"
- "cbz x27, 7f\n"
- "ldr s25, [%[wbptr]]\n"
- "mov v17.16b, v25.16b\n"
- "ldr s16, [%[wbptr], #4]\n"
- "mov v13.16b, v25.16b\n"
- "ldr s7, [%[wbptr], #8]\n"
- "mov v15.16b, v25.16b\n"
- "ldr s6, [%[wbptr], #12]\n"
- "mov v10.16b, v25.16b\n"
- "ldr s5, [%[wbptr], #16]\n"
- "mov v12.16b, v25.16b\n"
- "ldr s4, [%[wbptr], #20]\n"
- "mov v14.16b, v25.16b\n"
- "ldr s3, [%[wbptr], #24]\n"
- "mov v9.16b, v25.16b\n"
- "ldr s2, [%[wbptr], #28]\n"
- "mov v11.16b, v25.16b\n"
- "ldr s1, [%[wbptr], #32]\n"
- "mov v8.16b, v25.16b\n"
- "ldr s0, [%[wbptr], #36]\n"
- "ldr s26, [%[inptr0]]\n"
- "subs x27, x27, #1\n"
- "fmla v17.4s, v26.4s, v16.4s\n"
- "ldr s28, [x20]\n"
- "fmla v13.4s, v28.4s, v16.4s\n"
- "ldr s29, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v15.4s, v29.4s, v16.4s\n"
- "ldr s27, [x21]\n"
- "fmla v17.4s, v28.4s, v5.4s\n"
- "ldr s21, [x20, %[input_col_stride1]]\n"
- "ldr s20, [%[inptr0], x13]\n"
- "ldr s23, [x22]\n"
- "ldr s19, [x21, %[input_col_stride1]]\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v17.4s, v29.4s, v7.4s\n"
- "prfm pldl1keep, [x20, #64]\n"
- "prfm pldl1keep, [%[inptr0], x19]\n"
- "prfm pldl1keep, [x21, #64]\n"
- "prfm pldl1keep, [x20, x19]\n"
- "prfm pldl1keep, [%[inptr0], x14]\n"
- "prfm pldl1keep, [x22, #64]\n"
- "prfm pldl1keep, [x21, x19]\n"
- "beq 6f\n"
- "5:\n"
- "fmla v17.4s, v27.4s, v2.4s\n"
- "ldr s30, [x20, x13]\n"
- "fmla v13.4s, v27.4s, v5.4s\n"
- "ldr s29, [%[inptr0], x15]\n"
- "fmla v10.4s, v27.4s, v16.4s\n"
- "ldr s28, [x23]\n"
- "fmla v17.4s, v21.4s, v4.4s\n"
- "ldr s24, [x22, %[input_col_stride1]]\n"
- "fmla v13.4s, v21.4s, v7.4s\n"
- "ldr s18, [x21, x13]\n"
- "fmla v15.4s, v21.4s, v5.4s\n"
- "prfm pldl1keep, [x20, x14]\n"
- "fmla v12.4s, v21.4s, v16.4s\n"
- "ldr s22, [x20, x15]\n"
- "fmla v17.4s, v20.4s, v6.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v15.4s, v20.4s, v7.4s\n"
- "prfm pldl1keep, [x23, #64]\n"
- "fmla v14.4s, v20.4s, v16.4s\n"
- "ldr s25, [%[inptr0], x17]\n"
- "fmla v13.4s, v23.4s, v2.4s\n"
- "prfm pldl1keep, [x22, x19]\n"
- "fmla v10.4s, v23.4s, v5.4s\n"
- "ldr s26, [x23, %[input_col_stride1]]\n"
- "fmla v17.4s, v19.4s, v1.4s\n"
- "prfm pldl1keep, [x21, x14]\n"
- "fmla v13.4s, v19.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x16]\n"
- "fmla v15.4s, v19.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x9]\n"
- "fmla v10.4s, v19.4s, v7.4s\n"
- "prfm pldl1keep, [x23, x19]\n"
- "fmla v12.4s, v19.4s, v5.4s\n"
- "prfm pldl1keep, [x22, x14]\n"
- "fmla v9.4s, v19.4s, v16.4s\n"
- "ldr s27, [x22, x13]\n"
- "fmla v17.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x21, x16]\n"
- "fmla v13.4s, v30.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x9]\n"
- "fmla v15.4s, v30.4s, v4.4s\n"
- "prfm pldl1keep, [x23, x14]\n"
- "fmla v12.4s, v30.4s, v7.4s\n"
- "prfm pldl1keep, [x22, x16]\n"
- "fmla v14.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x21, x9]\n"
- "fmla v11.4s, v30.4s, v16.4s\n"
- "ldr s21, [x21, x15]\n"
- "fmla v15.4s, v29.4s, v6.4s\n"
- "prfm pldl1keep, [x23, x16]\n"
- "fmla v14.4s, v29.4s, v7.4s\n"
- "ldr s20, [x20, x17]\n"
- "fmla v10.4s, v28.4s, v2.4s\n"
- "ldr s19, [x23, x13]\n"
- "fmla v13.4s, v24.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x9]\n"
- "fmla v12.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x9]\n"
- "fmla v10.4s, v24.4s, v4.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v9.4s, v24.4s, v5.4s\n"
- "ldr s23, [x22, x15]\n"
- "fmla v17.4s, v18.4s, v0.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v13.4s, v18.4s, v3.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v15.4s, v18.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "str s17, [%[outptr0]]\n"
- "fmla v10.4s, v18.4s, v6.4s\n"
- "fmla v12.4s, v18.4s, v4.4s\n"
- "ldr s17, [x21, x17]\n"
- "fmla v14.4s, v18.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x19]\n"
- "fmla v9.4s, v18.4s, v7.4s\n"
- "prfm pldl1keep, [%[inptr0], x14]\n"
- "fmla v11.4s, v18.4s, v5.4s\n"
- "add x20, x20, #4\n"
- "fmla v8.4s, v18.4s, v16.4s\n"
- "ldr s24, [x23, x15]\n"
- "fmla v15.4s, v22.4s, v3.4s\n"
- "ldr s18, [x22, x17]\n"
- "fmla v12.4s, v22.4s, v6.4s\n"
- "prfm pldl1keep, [x20, #64]\n"
- "fmla v14.4s, v22.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x19]\n"
- "fmla v11.4s, v22.4s, v7.4s\n"
- "ldr s22, [x23, x17]\n"
- "fmla v10.4s, v26.4s, v1.4s\n"
- "add x21, x21, #4\n"
- "fmla v14.4s, v25.4s, v6.4s\n"
- "ldr s25, [%[wbptr]]\n"
- "fmla v9.4s, v26.4s, v2.4s\n"
- "ldr s16, [%[wbptr], #4]\n"
- "fmla v13.4s, v27.4s, v0.4s\n"
- "prfm pldl1keep, [x21, #64]\n"
- "fmla v10.4s, v27.4s, v3.4s\n"
- "prfm pldl1keep, [x21, x19]\n"
- "fmla v12.4s, v27.4s, v1.4s\n"
- "add x22, x22, #4\n"
- "str s13, [x24]\n"
- "fmla v9.4s, v27.4s, v4.4s\n"
- "fmla v11.4s, v27.4s, v2.4s\n"
- "ldr s26, [%[inptr0]]\n"
- "fmla v8.4s, v27.4s, v5.4s\n"
- "ldr s28, [x20]\n"
- "fmla v15.4s, v21.4s, v0.4s\n"
- "ldr s29, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v12.4s, v21.4s, v3.4s\n"
- "prfm pldl1keep, [x22, #64]\n"
- "fmla v14.4s, v21.4s, v1.4s\n"
- "add x23, x23, #4\n"
- "str s15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v9.4s, v21.4s, v6.4s\n"
- "fmla v11.4s, v21.4s, v4.4s\n"
- "ldr s5, [%[wbptr], #16]\n"
- "fmla v8.4s, v21.4s, v7.4s\n"
- "ldr s27, [x21]\n"
- "fmla v14.4s, v20.4s, v3.4s\n"
- "ldr s21, [x20, %[input_col_stride1]]\n"
- "fmla v11.4s, v20.4s, v6.4s\n"
- "ldr s20, [%[inptr0], x13]\n"
- "fmla v10.4s, v19.4s, v0.4s\n"
- "subs x27, x27, #1\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "fmla v8.4s, v19.4s, v2.4s\n"
- "fmla v12.4s, v23.4s, v0.4s\n"
- "ldr s7, [%[wbptr], #8]\n"
- "str s10, [x25]\n"
- "fmla v11.4s, v23.4s, v1.4s\n"
- "fmla v9.4s, v23.4s, v3.4s\n"
- "ldr s2, [%[wbptr], #28]\n"
- "str s12, [x24, %[output_col_stride1]]\n"
- "fmla v8.4s, v23.4s, v4.4s\n"
- "fmla v14.4s, v17.4s, v0.4s\n"
- "ldr s23, [x22]\n"
- "fmla v11.4s, v17.4s, v3.4s\n"
- "ldr s19, [x21, %[input_col_stride1]]\n"
- "fmla v8.4s, v17.4s, v6.4s\n"
- "ldr s4, [%[wbptr], #20]\n"
- "str s14, [%[outptr0], x26]\n"
- "fmla v9.4s, v24.4s, v0.4s\n"
- "fmla v11.4s, v18.4s, v0.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v8.4s, v24.4s, v1.4s\n"
- "ldr s6, [%[wbptr], #12]\n"
- "str s9, [x25, %[output_col_stride1]]\n"
- "mov v17.16b, v25.16b\n"
- "str s11, [x24, x26]\n"
- "mov v13.16b, v25.16b\n"
- "fmla v8.4s, v18.4s, v3.4s\n"
- "ldr s1, [%[wbptr], #32]\n"
- "mov v15.16b, v25.16b\n"
- "add x24, x24, #4\n"
- "mov v10.16b, v25.16b\n"
- "mov v12.16b, v25.16b\n"
- "fmla v8.4s, v22.4s, v0.4s\n"
- "ldr s3, [%[wbptr], #24]\n"
- "mov v14.16b, v25.16b\n"
- "mov v9.16b, v25.16b\n"
- "mov v11.16b, v25.16b\n"
- "fmla v17.4s, v26.4s, v16.4s\n"
- "str s8, [x25, x26]\n"
- "fmla v13.4s, v28.4s, v16.4s\n"
- "mov v8.16b, v25.16b\n"
- "ldr s0, [%[wbptr], #36]\n"
- "fmla v17.4s, v28.4s, v5.4s\n"
- "fmla v15.4s, v29.4s, v16.4s\n"
- "add x25, x25, #4\n"
- "fmla v17.4s, v29.4s, v7.4s\n"
- "bne 5b\n"
- "6:\n"
- "fmla v17.4s, v27.4s, v2.4s\n"
- "ldr s30, [x20, x13]\n"
- "fmla v13.4s, v27.4s, v5.4s\n"
- "ldr s29, [%[inptr0], x15]\n"
- "fmla v10.4s, v27.4s, v16.4s\n"
- "ldr s28, [x23]\n"
- "fmla v17.4s, v21.4s, v4.4s\n"
- "ldr s24, [x22, %[input_col_stride1]]\n"
- "fmla v13.4s, v21.4s, v7.4s\n"
- "ldr s18, [x21, x13]\n"
- "fmla v15.4s, v21.4s, v5.4s\n"
- "prfm pldl1keep, [x20, x14]\n"
- "fmla v12.4s, v21.4s, v16.4s\n"
- "ldr s22, [x20, x15]\n"
- "fmla v17.4s, v20.4s, v6.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v15.4s, v20.4s, v7.4s\n"
- "prfm pldl1keep, [x23, #64]\n"
- "fmla v14.4s, v20.4s, v16.4s\n"
- "ldr s25, [%[inptr0], x17]\n"
- "fmla v13.4s, v23.4s, v2.4s\n"
- "prfm pldl1keep, [x22, x19]\n"
- "fmla v10.4s, v23.4s, v5.4s\n"
- "ldr s26, [x23, %[input_col_stride1]]\n"
- "fmla v17.4s, v19.4s, v1.4s\n"
- "prfm pldl1keep, [x21, x14]\n"
- "fmla v13.4s, v19.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x16]\n"
- "fmla v15.4s, v19.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x9]\n"
- "fmla v10.4s, v19.4s, v7.4s\n"
- "prfm pldl1keep, [x23, x19]\n"
- "fmla v12.4s, v19.4s, v5.4s\n"
- "prfm pldl1keep, [x22, x14]\n"
- "fmla v9.4s, v19.4s, v16.4s\n"
- "ldr s27, [x22, x13]\n"
- "fmla v17.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x21, x16]\n"
- "fmla v13.4s, v30.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x9]\n"
- "fmla v15.4s, v30.4s, v4.4s\n"
- "prfm pldl1keep, [x23, x14]\n"
- "fmla v12.4s, v30.4s, v7.4s\n"
- "prfm pldl1keep, [x22, x16]\n"
- "fmla v14.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x21, x9]\n"
- "fmla v11.4s, v30.4s, v16.4s\n"
- "ldr s21, [x21, x15]\n"
- "fmla v15.4s, v29.4s, v6.4s\n"
- "prfm pldl1keep, [x23, x16]\n"
- "fmla v14.4s, v29.4s, v7.4s\n"
- "ldr s20, [x20, x17]\n"
- "fmla v10.4s, v28.4s, v2.4s\n"
- "ldr s19, [x23, x13]\n"
- "fmla v13.4s, v24.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x9]\n"
- "fmla v12.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x9]\n"
- "fmla v10.4s, v24.4s, v4.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v9.4s, v24.4s, v5.4s\n"
- "ldr s23, [x22, x15]\n"
- "fmla v17.4s, v18.4s, v0.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v13.4s, v18.4s, v3.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v15.4s, v18.4s, v1.4s\n"
- "add x20, x20, #4\n"
- "str s17, [%[outptr0]]\n"
- "fmla v10.4s, v18.4s, v6.4s\n"
- "fmla v12.4s, v18.4s, v4.4s\n"
- "ldr s17, [x21, x17]\n"
- "fmla v14.4s, v18.4s, v2.4s\n"
- "add x21, x21, #4\n"
- "fmla v9.4s, v18.4s, v7.4s\n"
- "fmla v11.4s, v18.4s, v5.4s\n"
- "fmla v8.4s, v18.4s, v16.4s\n"
- "ldr s24, [x23, x15]\n"
- "fmla v15.4s, v22.4s, v3.4s\n"
- "ldr s18, [x22, x17]\n"
- "fmla v12.4s, v22.4s, v6.4s\n"
- "add x22, x22, #4\n"
- "fmla v14.4s, v22.4s, v4.4s\n"
- "fmla v11.4s, v22.4s, v7.4s\n"
- "fmla v10.4s, v26.4s, v1.4s\n"
- "ldr s22, [x23, x17]\n"
- "fmla v9.4s, v26.4s, v2.4s\n"
- "add x23, x23, #4\n"
- "fmla v14.4s, v25.4s, v6.4s\n"
- "fmla v13.4s, v27.4s, v0.4s\n"
- "fmla v10.4s, v27.4s, v3.4s\n"
- "fmla v12.4s, v27.4s, v1.4s\n"
- "fmla v9.4s, v27.4s, v4.4s\n"
- "fmla v11.4s, v27.4s, v2.4s\n"
- "str s13, [x24]\n"
- "fmla v8.4s, v27.4s, v5.4s\n"
- "fmla v15.4s, v21.4s, v0.4s\n"
- "fmla v12.4s, v21.4s, v3.4s\n"
- "fmla v14.4s, v21.4s, v1.4s\n"
- "fmla v9.4s, v21.4s, v6.4s\n"
- "fmla v11.4s, v21.4s, v4.4s\n"
- "fmla v8.4s, v21.4s, v7.4s\n"
- "str s15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v10.4s, v19.4s, v0.4s\n"
- "fmla v14.4s, v20.4s, v3.4s\n"
- "fmla v9.4s, v19.4s, v1.4s\n"
- "fmla v11.4s, v20.4s, v6.4s\n"
- "fmla v8.4s, v19.4s, v2.4s\n"
- "str s10, [x25]\n"
- "fmla v12.4s, v23.4s, v0.4s\n"
- "fmla v9.4s, v23.4s, v3.4s\n"
- "fmla v14.4s, v17.4s, v0.4s\n"
- "fmla v11.4s, v23.4s, v1.4s\n"
- "fmla v8.4s, v23.4s, v4.4s\n"
- "str s12, [x24, %[output_col_stride1]]\n"
- "fmla v9.4s, v24.4s, v0.4s\n"
- "str s14, [%[outptr0], x26]\n"
- "fmla v11.4s, v17.4s, v3.4s\n"
- "fmla v8.4s, v17.4s, v6.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "str s9, [x25, %[output_col_stride1]]\n"
- "fmla v11.4s, v18.4s, v0.4s\n"
- "fmla v8.4s, v24.4s, v1.4s\n"
- "str s11, [x24, x26]\n"
- "fmla v8.4s, v18.4s, v3.4s\n"
- "add x24, x24, #4\n"
- "fmla v8.4s, v22.4s, v0.4s\n"
- "str s8, [x25, x26]\n"
- "add x25, x25, #4\n"
- "7:\n"
- : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
- : [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
-/**
- * ReLU specialisation of the tile kernel (hand-scheduled AArch64 NEON assembly).
- *
- * For every channel the assembly reads a 5x5 patch of input values (five row
- * pointers, five column offsets), accumulates nine 3x3 dot products and writes
- * a 3x3 patch of outputs, each clamped from below with `fmax` against zero.
- *
- * Consecutive channels are read from consecutive floats: the main loop handles
- * four channels per iteration with q-register loads/stores, the tail loop
- * handles the remaining `n_channels % 4` channels one float at a time.
- *
- * @param n_channels        Number of channels to process.
- * @param weight_bias_ptr   Packed parameters. Each group holds ten values per
- *                          channel: the first initialises all nine accumulators
- *                          (presumably the bias), the next nine are the 3x3
- *                          weights in row-major order. Groups of four channels
- *                          occupy 160 bytes, single tail channels 40 bytes.
- * @param input             Pointer to the top-left element of the input patch.
- * @param input_row_stride  Distance between input rows, in floats.
- * @param input_col_stride  Distance between input columns, in floats.
- * @param output            Pointer to the top-left element of the output patch.
- * @param output_row_stride Distance between output rows, in floats.
- * @param output_col_stride Distance between output columns, in floats.
- *
- * All strides are converted to bytes in the operand list of the asm statement.
- */
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- // Prologue. Input row pointers: inptr0, x25, x22, x13, x24 (rows 0..4).
- // Input column byte offsets: input_col_stride1, x16, x26, x9 (columns 1..4).
- // Prefetch offsets (column offset + 64): x17, x23, x20, x15 (columns 1..4).
- // Output row pointers: outptr0, x21, x14; x19 = 2 * output column stride.
- // x28 = n_channels / 4 (vector iterations), x27 = n_channels % 4 (tail).
- "add x25, %[inptr0], %[input_row_stride]\n"
- "add x16, %[input_col_stride1], %[input_col_stride1]\n"
- // x17 is used as a prefetch offset throughout but was never initialised,
- // so every column-1 prefetch targeted an arbitrary address. Set it to
- // column stride + 64, matching x23/x20/x15 for columns 2..4.
- "add x17, %[input_col_stride1], #64\n"
- "add x21, %[outptr0], %[output_row_stride]\n"
- "add x22, x25, %[input_row_stride]\n"
- "add x23, x16, #64\n"
- "add x26, x16, %[input_col_stride1]\n"
- "add x13, x22, %[input_row_stride]\n"
- "add x20, x26, #64\n"
- "add x9, x26, %[input_col_stride1]\n"
- "add x24, x13, %[input_row_stride]\n"
- "add x15, x9, #64\n"
- "add x14, x21, %[output_row_stride]\n"
- "add x19, %[output_col_stride1], %[output_col_stride1]\n"
- "and x27, %[n_channels], #3\n"
- "lsr x28, %[n_channels], #2\n"
- "cbz x28, 4f\n"
- // Vector path: four channels per iteration. Load the parameters of the
- // first group, seed the nine accumulators (v1..v9) from v20 and load the
- // first few inputs.
- "1:\n"
- "ldr q20, [%[wbptr]]\n"
- "subs x28, x28, #1\n"
- "mov v4.16b, v20.16b\n"
- "ldr q15, [%[wbptr], #16]\n"
- "mov v1.16b, v20.16b\n"
- "ldr q0, [%[wbptr], #32]\n"
- "mov v3.16b, v20.16b\n"
- "ldr q13, [%[wbptr], #48]\n"
- "mov v7.16b, v20.16b\n"
- "ldr q16, [%[wbptr], #64]\n"
- "mov v9.16b, v20.16b\n"
- "ldr q12, [%[wbptr], #80]\n"
- "mov v2.16b, v20.16b\n"
- "ldr q17, [%[wbptr], #96]\n"
- "mov v6.16b, v20.16b\n"
- "ldr q11, [%[wbptr], #112]\n"
- "mov v8.16b, v20.16b\n"
- "ldr q10, [%[wbptr], #128]\n"
- "mov v5.16b, v20.16b\n"
- "ldr q14, [%[wbptr], #144]\n"
- "ldr q27, [%[inptr0]]\n"
- "ldr q24, [x25]\n"
- "fmla v4.4s, v27.4s, v15.4s\n"
- "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
- "ldr q21, [x22]\n"
- "ldr q19, [x25, %[input_col_stride1]]\n"
- "ldr q31, [%[inptr0], x16]\n"
- "ldr q28, [x13]\n"
- "fmla v4.4s, v24.4s, v16.4s\n"
- "ldr q18, [x22, %[input_col_stride1]]\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x25, #64]\n"
- "prfm pldl1keep, [%[inptr0], x17]\n"
- "prfm pldl1keep, [x22, #64]\n"
- "prfm pldl1keep, [x25, x17]\n"
- "prfm pldl1keep, [%[inptr0], x23]\n"
- "prfm pldl1keep, [x13, #64]\n"
- "prfm pldl1keep, [x22, x17]\n"
- "beq 3f\n"
- // Steady-state vector loop: finishes the current group of four channels
- // while already loading the parameters and first inputs of the next group.
- "2:\n"
- "fmla v1.4s, v24.4s, v15.4s\n"
- "ldr q24, [x25, x16]\n"
- "fmla v4.4s, v22.4s, v0.4s\n"
- "ldr q29, [%[inptr0], x26]\n"
- "fmla v3.4s, v22.4s, v15.4s\n"
- "ldr q30, [x24]\n"
- "fmla v1.4s, v21.4s, v16.4s\n"
- "ldr q25, [x13, %[input_col_stride1]]\n"
- "fmla v4.4s, v21.4s, v11.4s\n"
- "prfm pldl1keep, [x25, x23]\n"
- "fmla v7.4s, v21.4s, v15.4s\n"
- "ldr q26, [x22, x16]\n"
- "fmla v1.4s, v19.4s, v0.4s\n"
- "prfm pldl1keep, [%[inptr0], x20]\n"
- "fmla v4.4s, v19.4s, v12.4s\n"
- "prfm pldl1keep, [x24, #64]\n"
- "fmla v3.4s, v19.4s, v16.4s\n"
- "prfm pldl1keep, [x13, x17]\n"
- "fmla v9.4s, v19.4s, v15.4s\n"
- "ldr q23, [x25, x26]\n"
- "fmla v4.4s, v31.4s, v13.4s\n"
- "prfm pldl1keep, [x22, x23]\n"
- "fmla v3.4s, v31.4s, v0.4s\n"
- "prfm pldl1keep, [x25, x20]\n"
- "fmla v2.4s, v31.4s, v15.4s\n"
- "ldr q20, [%[inptr0], x9]\n"
- "fmla v1.4s, v28.4s, v11.4s\n"
- "prfm pldl1keep, [%[inptr0], x15]\n"
- "fmla v7.4s, v28.4s, v16.4s\n"
- "ldr q28, [x24, %[input_col_stride1]]\n"
- "fmla v4.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x17]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "prfm pldl1keep, [x13, x23]\n"
- "fmla v3.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x22, x20]\n"
- "fmla v7.4s, v18.4s, v0.4s\n"
- "prfm pldl1keep, [x25, x15]\n"
- "fmla v9.4s, v18.4s, v16.4s\n"
- "prfm pldl1keep, [x24, x23]\n"
- "fmla v6.4s, v18.4s, v15.4s\n"
- "ldr q27, [x13, x16]\n"
- "fmla v4.4s, v24.4s, v17.4s\n"
- "prfm pldl1keep, [x13, x20]\n"
- "fmla v1.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [x22, x15]\n"
- "fmla v3.4s, v24.4s, v12.4s\n"
- "prfm pldl1keep, [x24, x20]\n"
- "fmla v9.4s, v24.4s, v0.4s\n"
- "prfm pldl1keep, [x13, x15]\n"
- "fmla v2.4s, v24.4s, v16.4s\n"
- "prfm pldl1keep, [x24, x15]\n"
- "fmla v8.4s, v24.4s, v15.4s\n"
- "ldr q24, [x22, x26]\n"
- "fmla v3.4s, v29.4s, v13.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v2.4s, v29.4s, v0.4s\n"
- "ldr q22, [x25, x9]\n"
- "fmla v7.4s, v30.4s, v11.4s\n"
- "ldr q21, [x24, x16]\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v9.4s, v25.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v7.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v6.4s, v25.4s, v16.4s\n"
- "ldr q19, [x13, x26]\n"
- "fmla v4.4s, v26.4s, v14.4s\n"
- "prfm pldl1keep, [%[inptr0], x17]\n"
- "fmla v1.4s, v26.4s, v17.4s\n"
- "prfm pldl1keep, [%[inptr0], x23]\n"
- "fmla v3.4s, v26.4s, v10.4s\n"
- "add x25, x25, #16\n"
- "fmla v7.4s, v26.4s, v13.4s\n"
- "prfm pldl1keep, [x25, #64]\n"
- "fmla v9.4s, v26.4s, v12.4s\n"
- "prfm pldl1keep, [x25, x17]\n"
- "fmla v2.4s, v26.4s, v11.4s\n"
- "subs x28, x28, #1\n"
- "fmla v6.4s, v26.4s, v0.4s\n"
- "fmla v8.4s, v26.4s, v16.4s\n"
- "fmla v5.4s, v26.4s, v15.4s\n"
- "ldr q26, [x22, x9]\n"
- "fmla v3.4s, v23.4s, v17.4s\n"
- "ldr q18, [x24, x26]\n"
- "fmla v9.4s, v23.4s, v13.4s\n"
- "add x22, x22, #16\n"
- "fmla v2.4s, v23.4s, v12.4s\n"
- "prfm pldl1keep, [x22, #64]\n"
- "fmla v8.4s, v23.4s, v0.4s\n"
- "ldr q23, [x13, x9]\n"
- "fmla v7.4s, v28.4s, v10.4s\n"
- "prfm pldl1keep, [x22, x17]\n"
- "fmla v2.4s, v20.4s, v13.4s\n"
- "ldr q25, [x24, x9]\n"
- "fmla v6.4s, v28.4s, v11.4s\n"
- "ldr q20, [%[wbptr]]\n"
- "fmla v1.4s, v27.4s, v14.4s\n"
- "add x13, x13, #16\n"
- "fmla v7.4s, v27.4s, v17.4s\n"
- "prfm pldl1keep, [x13, #64]\n"
- "fmla v9.4s, v27.4s, v10.4s\n"
- "add x24, x24, #16\n"
- "fmla v6.4s, v27.4s, v12.4s\n"
- "fmla v8.4s, v27.4s, v11.4s\n"
- "fmla v5.4s, v27.4s, v16.4s\n"
- "ldr q15, [%[wbptr], #16]\n"
- "fmla v3.4s, v24.4s, v14.4s\n"
- "ldr q27, [%[inptr0]]\n"
- "fmla v9.4s, v24.4s, v17.4s\n"
- "fmla v2.4s, v24.4s, v10.4s\n"
- "fmla v6.4s, v24.4s, v13.4s\n"
- "fmla v8.4s, v24.4s, v12.4s\n"
- "fmla v5.4s, v24.4s, v0.4s\n"
- "ldr q16, [%[wbptr], #64]\n"
- "fmla v2.4s, v22.4s, v17.4s\n"
- "ldr q24, [x25]\n"
- "fmla v8.4s, v22.4s, v13.4s\n"
- "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v7.4s, v21.4s, v14.4s\n"
- "fmla v6.4s, v21.4s, v10.4s\n"
- "fmla v5.4s, v21.4s, v11.4s\n"
- "ldr q0, [%[wbptr], #32]\n"
- "fmla v9.4s, v19.4s, v14.4s\n"
- "ldr q21, [x22]\n"
- "fmla v6.4s, v19.4s, v17.4s\n"
- "fmla v8.4s, v19.4s, v10.4s\n"
- "fmla v5.4s, v19.4s, v12.4s\n"
- "ldr q11, [%[wbptr], #112]\n"
- "fmla v2.4s, v26.4s, v14.4s\n"
- // v29 becomes the zero vector used by the ReLU clamps (fmax) below.
- "movi v29.16b, #0\n"
- "fmla v8.4s, v26.4s, v17.4s\n"
- "fmla v6.4s, v18.4s, v14.4s\n"
- "fmla v5.4s, v26.4s, v13.4s\n"
- "ldr q12, [%[wbptr], #80]\n"
- "fmax v4.4s, v4.4s, v29.4s\n"
- "ldr q19, [x25, %[input_col_stride1]]\n"
- "fmla v8.4s, v23.4s, v14.4s\n"
- "fmax v3.4s, v3.4s, v29.4s\n"
- "str q4, [%[outptr0]]\n"
- "fmla v5.4s, v18.4s, v10.4s\n"
- "str q3, [%[outptr0], %[output_col_stride1]]\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "ldr q13, [%[wbptr], #48]\n"
- "str q2, [%[outptr0], x19]\n"
- "fmla v5.4s, v23.4s, v17.4s\n"
- "str q1, [x21]\n"
- "fmax v9.4s, v9.4s, v29.4s\n"
- "fmax v8.4s, v8.4s, v29.4s\n"
- "ldr q10, [%[wbptr], #128]\n"
- "str q9, [x21, %[output_col_stride1]]\n"
- "fmla v5.4s, v25.4s, v14.4s\n"
- "str q8, [x21, x19]\n"
- "fmax v7.4s, v7.4s, v29.4s\n"
- "fmax v6.4s, v6.4s, v29.4s\n"
- "ldr q17, [%[wbptr], #96]\n"
- "str q7, [x14]\n"
- "fmax v5.4s, v5.4s, v29.4s\n"
- "str q6, [x14, %[output_col_stride1]]\n"
- "mov v4.16b, v20.16b\n"
- "str q5, [x14, x19]\n"
- "mov v1.16b, v20.16b\n"
- "mov v3.16b, v20.16b\n"
- "ldr q14, [%[wbptr], #144]\n"
- "mov v7.16b, v20.16b\n"
- "ldr q31, [%[inptr0], x16]\n"
- "mov v9.16b, v20.16b\n"
- "ldr q28, [x13]\n"
- "mov v2.16b, v20.16b\n"
- "ldr q18, [x22, %[input_col_stride1]]\n"
- "mov v6.16b, v20.16b\n"
- "add %[outptr0], %[outptr0], #16\n"
- "mov v8.16b, v20.16b\n"
- "add x21, x21, #16\n"
- "mov v5.16b, v20.16b\n"
- "add x14, x14, #16\n"
- "fmla v4.4s, v27.4s, v15.4s\n"
- "fmla v4.4s, v24.4s, v16.4s\n"
- "bne 2b\n"
- // Last vector group: same arithmetic as the loop body, without loading
- // anything for a following group.
- "3:\n"
- "fmla v1.4s, v24.4s, v15.4s\n"
- "ldr q24, [x25, x16]\n"
- "fmla v4.4s, v22.4s, v0.4s\n"
- "ldr q29, [%[inptr0], x26]\n"
- "fmla v3.4s, v22.4s, v15.4s\n"
- "ldr q30, [x24]\n"
- "fmla v1.4s, v21.4s, v16.4s\n"
- "ldr q25, [x13, %[input_col_stride1]]\n"
- "fmla v4.4s, v21.4s, v11.4s\n"
- "prfm pldl1keep, [x25, x23]\n"
- "fmla v7.4s, v21.4s, v15.4s\n"
- "ldr q26, [x22, x16]\n"
- "fmla v1.4s, v19.4s, v0.4s\n"
- "prfm pldl1keep, [%[inptr0], x20]\n"
- "fmla v4.4s, v19.4s, v12.4s\n"
- "prfm pldl1keep, [x24, #64]\n"
- "fmla v3.4s, v19.4s, v16.4s\n"
- "prfm pldl1keep, [x13, x17]\n"
- "fmla v9.4s, v19.4s, v15.4s\n"
- "ldr q23, [x25, x26]\n"
- "fmla v4.4s, v31.4s, v13.4s\n"
- "prfm pldl1keep, [x22, x23]\n"
- "fmla v3.4s, v31.4s, v0.4s\n"
- "prfm pldl1keep, [x25, x20]\n"
- "fmla v2.4s, v31.4s, v15.4s\n"
- "ldr q20, [%[inptr0], x9]\n"
- "fmla v1.4s, v28.4s, v11.4s\n"
- "prfm pldl1keep, [%[inptr0], x15]\n"
- "fmla v7.4s, v28.4s, v16.4s\n"
- "ldr q28, [x24, %[input_col_stride1]]\n"
- "fmla v4.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x17]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "prfm pldl1keep, [x13, x23]\n"
- "fmla v3.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x22, x20]\n"
- "fmla v7.4s, v18.4s, v0.4s\n"
- "prfm pldl1keep, [x25, x15]\n"
- "fmla v9.4s, v18.4s, v16.4s\n"
- "prfm pldl1keep, [x24, x23]\n"
- "fmla v6.4s, v18.4s, v15.4s\n"
- "ldr q27, [x13, x16]\n"
- "fmla v4.4s, v24.4s, v17.4s\n"
- "prfm pldl1keep, [x13, x20]\n"
- "fmla v1.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [x22, x15]\n"
- "fmla v3.4s, v24.4s, v12.4s\n"
- "prfm pldl1keep, [x24, x20]\n"
- "fmla v9.4s, v24.4s, v0.4s\n"
- "prfm pldl1keep, [x13, x15]\n"
- "fmla v2.4s, v24.4s, v16.4s\n"
- "prfm pldl1keep, [x24, x15]\n"
- "fmla v8.4s, v24.4s, v15.4s\n"
- "ldr q24, [x22, x26]\n"
- "fmla v3.4s, v29.4s, v13.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v2.4s, v29.4s, v0.4s\n"
- "ldr q22, [x25, x9]\n"
- "fmla v7.4s, v30.4s, v11.4s\n"
- "ldr q21, [x24, x16]\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v9.4s, v25.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v7.4s, v25.4s, v12.4s\n"
- "add x25, x25, #16\n"
- "fmla v6.4s, v25.4s, v16.4s\n"
- "ldr q19, [x13, x26]\n"
- "fmla v4.4s, v26.4s, v14.4s\n"
- "fmla v1.4s, v26.4s, v17.4s\n"
- "fmla v3.4s, v26.4s, v10.4s\n"
- "fmla v7.4s, v26.4s, v13.4s\n"
- "fmla v9.4s, v26.4s, v12.4s\n"
- "fmla v2.4s, v26.4s, v11.4s\n"
- "fmla v6.4s, v26.4s, v0.4s\n"
- "fmla v8.4s, v26.4s, v16.4s\n"
- "fmla v5.4s, v26.4s, v15.4s\n"
- "ldr q26, [x22, x9]\n"
- "fmla v3.4s, v23.4s, v17.4s\n"
- "ldr q18, [x24, x26]\n"
- "fmla v9.4s, v23.4s, v13.4s\n"
- "add x22, x22, #16\n"
- "fmla v2.4s, v23.4s, v12.4s\n"
- "fmla v8.4s, v23.4s, v0.4s\n"
- "fmla v7.4s, v28.4s, v10.4s\n"
- "ldr q23, [x13, x9]\n"
- "fmla v6.4s, v28.4s, v11.4s\n"
- "ldr q25, [x24, x9]\n"
- "fmla v2.4s, v20.4s, v13.4s\n"
- "add x13, x13, #16\n"
- "fmla v1.4s, v27.4s, v14.4s\n"
- "add x24, x24, #16\n"
- "fmla v7.4s, v27.4s, v17.4s\n"
- "fmla v9.4s, v27.4s, v10.4s\n"
- "fmla v6.4s, v27.4s, v12.4s\n"
- "fmla v8.4s, v27.4s, v11.4s\n"
- "fmla v5.4s, v27.4s, v16.4s\n"
- "fmla v3.4s, v24.4s, v14.4s\n"
- "fmla v9.4s, v24.4s, v17.4s\n"
- "fmla v2.4s, v24.4s, v10.4s\n"
- "fmla v6.4s, v24.4s, v13.4s\n"
- "fmla v8.4s, v24.4s, v12.4s\n"
- "fmla v5.4s, v24.4s, v0.4s\n"
- "fmla v7.4s, v21.4s, v14.4s\n"
- "fmla v2.4s, v22.4s, v17.4s\n"
- "fmla v9.4s, v19.4s, v14.4s\n"
- "fmla v8.4s, v22.4s, v13.4s\n"
- "fmla v6.4s, v21.4s, v10.4s\n"
- "fmla v5.4s, v21.4s, v11.4s\n"
- "movi v29.16b, #0\n"
- "fmla v2.4s, v26.4s, v14.4s\n"
- "fmla v6.4s, v19.4s, v17.4s\n"
- "fmla v8.4s, v19.4s, v10.4s\n"
- "fmla v5.4s, v19.4s, v12.4s\n"
- "fmax v4.4s, v4.4s, v29.4s\n"
- "fmax v3.4s, v3.4s, v29.4s\n"
- "fmla v6.4s, v18.4s, v14.4s\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "str q4, [%[outptr0]]\n"
- "fmla v8.4s, v26.4s, v17.4s\n"
- "str q3, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v5.4s, v26.4s, v13.4s\n"
- "str q2, [%[outptr0], x19]\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "fmla v8.4s, v23.4s, v14.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "str q1, [x21]\n"
- "fmla v5.4s, v18.4s, v10.4s\n"
- "fmax v9.4s, v9.4s, v29.4s\n"
- "fmax v7.4s, v7.4s, v29.4s\n"
- "fmax v8.4s, v8.4s, v29.4s\n"
- "fmax v6.4s, v6.4s, v29.4s\n"
- "str q9, [x21, %[output_col_stride1]]\n"
- "fmla v5.4s, v23.4s, v17.4s\n"
- "str q8, [x21, x19]\n"
- "str q7, [x14]\n"
- "str q6, [x14, %[output_col_stride1]]\n"
- "add x21, x21, #16\n"
- "fmla v5.4s, v25.4s, v14.4s\n"
- "fmax v5.4s, v5.4s, v29.4s\n"
- "str q5, [x14, x19]\n"
- "add x14, x14, #16\n"
- // Tail path: the remaining n_channels % 4 channels, one float at a time
- // (s-register loads/stores, pointers advance by 4 bytes, parameters by 40).
- "4:\n"
- "cbz x27, 7f\n"
- "ldr s20, [%[wbptr]]\n"
- "mov v4.16b, v20.16b\n"
- "ldr s15, [%[wbptr], #4]\n"
- "mov v1.16b, v20.16b\n"
- "ldr s0, [%[wbptr], #8]\n"
- "mov v3.16b, v20.16b\n"
- "ldr s13, [%[wbptr], #12]\n"
- "mov v7.16b, v20.16b\n"
- "ldr s16, [%[wbptr], #16]\n"
- "mov v9.16b, v20.16b\n"
- "ldr s12, [%[wbptr], #20]\n"
- "mov v2.16b, v20.16b\n"
- "ldr s17, [%[wbptr], #24]\n"
- "mov v6.16b, v20.16b\n"
- "ldr s11, [%[wbptr], #28]\n"
- "mov v8.16b, v20.16b\n"
- "ldr s10, [%[wbptr], #32]\n"
- "mov v5.16b, v20.16b\n"
- "ldr s14, [%[wbptr], #36]\n"
- "ldr s27, [%[inptr0]]\n"
- "subs x27, x27, #1\n"
- "fmla v4.4s, v27.4s, v15.4s\n"
- "ldr s24, [x25]\n"
- "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
- "ldr s21, [x22]\n"
- "ldr s19, [x25, %[input_col_stride1]]\n"
- "ldr s31, [%[inptr0], x16]\n"
- "fmla v4.4s, v24.4s, v16.4s\n"
- "ldr s28, [x13]\n"
- "ldr s18, [x22, %[input_col_stride1]]\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x25, #64]\n"
- "prfm pldl1keep, [%[inptr0], x17]\n"
- "prfm pldl1keep, [x22, #64]\n"
- "prfm pldl1keep, [x25, x17]\n"
- "prfm pldl1keep, [%[inptr0], x23]\n"
- "prfm pldl1keep, [x13, #64]\n"
- "prfm pldl1keep, [x22, x17]\n"
- "beq 6f\n"
- // Steady-state tail loop: one channel per iteration, preloading the next.
- "5:\n"
- "fmla v1.4s, v24.4s, v15.4s\n"
- "ldr s24, [x25, x16]\n"
- "fmla v4.4s, v22.4s, v0.4s\n"
- "ldr s29, [%[inptr0], x26]\n"
- "fmla v3.4s, v22.4s, v15.4s\n"
- "ldr s30, [x24]\n"
- "fmla v1.4s, v21.4s, v16.4s\n"
- "ldr s25, [x13, %[input_col_stride1]]\n"
- "fmla v4.4s, v21.4s, v11.4s\n"
- "prfm pldl1keep, [x25, x23]\n"
- "fmla v7.4s, v21.4s, v15.4s\n"
- "ldr s26, [x22, x16]\n"
- "fmla v1.4s, v19.4s, v0.4s\n"
- "prfm pldl1keep, [%[inptr0], x20]\n"
- "fmla v4.4s, v19.4s, v12.4s\n"
- "prfm pldl1keep, [x24, #64]\n"
- "fmla v3.4s, v19.4s, v16.4s\n"
- "prfm pldl1keep, [x13, x17]\n"
- "fmla v9.4s, v19.4s, v15.4s\n"
- "ldr s23, [x25, x26]\n"
- "fmla v4.4s, v31.4s, v13.4s\n"
- "prfm pldl1keep, [x22, x23]\n"
- "fmla v3.4s, v31.4s, v0.4s\n"
- "prfm pldl1keep, [x25, x20]\n"
- "fmla v2.4s, v31.4s, v15.4s\n"
- "ldr s20, [%[inptr0], x9]\n"
- "fmla v1.4s, v28.4s, v11.4s\n"
- "prfm pldl1keep, [%[inptr0], x15]\n"
- "fmla v7.4s, v28.4s, v16.4s\n"
- "ldr s28, [x24, %[input_col_stride1]]\n"
- "fmla v4.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x17]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "prfm pldl1keep, [x13, x23]\n"
- "fmla v3.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x22, x20]\n"
- "fmla v7.4s, v18.4s, v0.4s\n"
- "prfm pldl1keep, [x25, x15]\n"
- "fmla v9.4s, v18.4s, v16.4s\n"
- "prfm pldl1keep, [x24, x23]\n"
- "fmla v6.4s, v18.4s, v15.4s\n"
- "ldr s27, [x13, x16]\n"
- "fmla v4.4s, v24.4s, v17.4s\n"
- "prfm pldl1keep, [x13, x20]\n"
- "fmla v1.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [x22, x15]\n"
- "fmla v3.4s, v24.4s, v12.4s\n"
- "prfm pldl1keep, [x24, x20]\n"
- "fmla v9.4s, v24.4s, v0.4s\n"
- "prfm pldl1keep, [x13, x15]\n"
- "fmla v2.4s, v24.4s, v16.4s\n"
- "prfm pldl1keep, [x24, x15]\n"
- "fmla v8.4s, v24.4s, v15.4s\n"
- "ldr s24, [x22, x26]\n"
- "fmla v3.4s, v29.4s, v13.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v2.4s, v29.4s, v0.4s\n"
- "ldr s22, [x25, x9]\n"
- "fmla v7.4s, v30.4s, v11.4s\n"
- "ldr s21, [x24, x16]\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v9.4s, v25.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v7.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v6.4s, v25.4s, v16.4s\n"
- "ldr s19, [x13, x26]\n"
- "fmla v4.4s, v26.4s, v14.4s\n"
- "prfm pldl1keep, [%[inptr0], x17]\n"
- "fmla v1.4s, v26.4s, v17.4s\n"
- "prfm pldl1keep, [%[inptr0], x23]\n"
- "fmla v3.4s, v26.4s, v10.4s\n"
- "add x25, x25, #4\n"
- "fmla v7.4s, v26.4s, v13.4s\n"
- "prfm pldl1keep, [x25, #64]\n"
- "fmla v9.4s, v26.4s, v12.4s\n"
- "prfm pldl1keep, [x25, x17]\n"
- "fmla v2.4s, v26.4s, v11.4s\n"
- "subs x27, x27, #1\n"
- "fmla v6.4s, v26.4s, v0.4s\n"
- "fmla v8.4s, v26.4s, v16.4s\n"
- "fmla v5.4s, v26.4s, v15.4s\n"
- "ldr s26, [x22, x9]\n"
- "fmla v3.4s, v23.4s, v17.4s\n"
- "ldr s18, [x24, x26]\n"
- "fmla v9.4s, v23.4s, v13.4s\n"
- "add x22, x22, #4\n"
- "fmla v2.4s, v23.4s, v12.4s\n"
- "prfm pldl1keep, [x22, #64]\n"
- "fmla v8.4s, v23.4s, v0.4s\n"
- "ldr s23, [x13, x9]\n"
- "fmla v7.4s, v28.4s, v10.4s\n"
- "prfm pldl1keep, [x22, x17]\n"
- "fmla v2.4s, v20.4s, v13.4s\n"
- "ldr s25, [x24, x9]\n"
- "fmla v6.4s, v28.4s, v11.4s\n"
- "ldr s20, [%[wbptr]]\n"
- "fmla v1.4s, v27.4s, v14.4s\n"
- "add x13, x13, #4\n"
- "fmla v7.4s, v27.4s, v17.4s\n"
- "prfm pldl1keep, [x13, #64]\n"
- "fmla v9.4s, v27.4s, v10.4s\n"
- "add x24, x24, #4\n"
- "fmla v6.4s, v27.4s, v12.4s\n"
- "fmla v8.4s, v27.4s, v11.4s\n"
- "fmla v5.4s, v27.4s, v16.4s\n"
- "ldr s15, [%[wbptr], #4]\n"
- "fmla v3.4s, v24.4s, v14.4s\n"
- "ldr s27, [%[inptr0]]\n"
- "fmla v9.4s, v24.4s, v17.4s\n"
- "fmla v2.4s, v24.4s, v10.4s\n"
- "fmla v6.4s, v24.4s, v13.4s\n"
- "fmla v8.4s, v24.4s, v12.4s\n"
- "fmla v5.4s, v24.4s, v0.4s\n"
- "ldr s16, [%[wbptr], #16]\n"
- "fmla v2.4s, v22.4s, v17.4s\n"
- "ldr s24, [x25]\n"
- "fmla v8.4s, v22.4s, v13.4s\n"
- "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v7.4s, v21.4s, v14.4s\n"
- "fmla v6.4s, v21.4s, v10.4s\n"
- "fmla v5.4s, v21.4s, v11.4s\n"
- "ldr s0, [%[wbptr], #8]\n"
- "fmla v9.4s, v19.4s, v14.4s\n"
- "ldr s21, [x22]\n"
- "fmla v6.4s, v19.4s, v17.4s\n"
- "fmla v8.4s, v19.4s, v10.4s\n"
- "fmla v5.4s, v19.4s, v12.4s\n"
- "ldr s11, [%[wbptr], #28]\n"
- "fmla v2.4s, v26.4s, v14.4s\n"
- "movi v29.16b, #0\n"
- "fmla v8.4s, v26.4s, v17.4s\n"
- "fmla v6.4s, v18.4s, v14.4s\n"
- "fmla v5.4s, v26.4s, v13.4s\n"
- "ldr s12, [%[wbptr], #20]\n"
- "fmax v4.4s, v4.4s, v29.4s\n"
- "ldr s19, [x25, %[input_col_stride1]]\n"
- "fmla v8.4s, v23.4s, v14.4s\n"
- "fmax v3.4s, v3.4s, v29.4s\n"
- "str s4, [%[outptr0]]\n"
- "fmla v5.4s, v18.4s, v10.4s\n"
- "str s3, [%[outptr0], %[output_col_stride1]]\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "ldr s13, [%[wbptr], #12]\n"
- "str s2, [%[outptr0], x19]\n"
- "fmla v5.4s, v23.4s, v17.4s\n"
- "str s1, [x21]\n"
- "fmax v9.4s, v9.4s, v29.4s\n"
- "fmax v8.4s, v8.4s, v29.4s\n"
- "ldr s10, [%[wbptr], #32]\n"
- "str s9, [x21, %[output_col_stride1]]\n"
- "fmla v5.4s, v25.4s, v14.4s\n"
- "str s8, [x21, x19]\n"
- "fmax v7.4s, v7.4s, v29.4s\n"
- "fmax v6.4s, v6.4s, v29.4s\n"
- "ldr s17, [%[wbptr], #24]\n"
- "str s7, [x14]\n"
- "fmax v5.4s, v5.4s, v29.4s\n"
- "str s6, [x14, %[output_col_stride1]]\n"
- "mov v4.16b, v20.16b\n"
- "str s5, [x14, x19]\n"
- "mov v1.16b, v20.16b\n"
- "mov v3.16b, v20.16b\n"
- "ldr s14, [%[wbptr], #36]\n"
- "mov v7.16b, v20.16b\n"
- "ldr s31, [%[inptr0], x16]\n"
- "mov v9.16b, v20.16b\n"
- "ldr s28, [x13]\n"
- "mov v2.16b, v20.16b\n"
- "ldr s18, [x22, %[input_col_stride1]]\n"
- "mov v6.16b, v20.16b\n"
- "add %[outptr0], %[outptr0], #4\n"
- "mov v8.16b, v20.16b\n"
- "add x21, x21, #4\n"
- "mov v5.16b, v20.16b\n"
- "add x14, x14, #4\n"
- "fmla v4.4s, v27.4s, v15.4s\n"
- "fmla v4.4s, v24.4s, v16.4s\n"
- "bne 5b\n"
- // Last tail channel: no preloading for a following iteration.
- "6:\n"
- "fmla v1.4s, v24.4s, v15.4s\n"
- "ldr s24, [x25, x16]\n"
- "fmla v4.4s, v22.4s, v0.4s\n"
- "ldr s29, [%[inptr0], x26]\n"
- "fmla v3.4s, v22.4s, v15.4s\n"
- "ldr s30, [x24]\n"
- "fmla v1.4s, v21.4s, v16.4s\n"
- "ldr s25, [x13, %[input_col_stride1]]\n"
- "fmla v4.4s, v21.4s, v11.4s\n"
- "prfm pldl1keep, [x25, x23]\n"
- "fmla v7.4s, v21.4s, v15.4s\n"
- "ldr s26, [x22, x16]\n"
- "fmla v1.4s, v19.4s, v0.4s\n"
- "prfm pldl1keep, [%[inptr0], x20]\n"
- "fmla v4.4s, v19.4s, v12.4s\n"
- "prfm pldl1keep, [x24, #64]\n"
- "fmla v3.4s, v19.4s, v16.4s\n"
- "prfm pldl1keep, [x13, x17]\n"
- "fmla v9.4s, v19.4s, v15.4s\n"
- "ldr s23, [x25, x26]\n"
- "fmla v4.4s, v31.4s, v13.4s\n"
- "prfm pldl1keep, [x22, x23]\n"
- "fmla v3.4s, v31.4s, v0.4s\n"
- "prfm pldl1keep, [x25, x20]\n"
- "fmla v2.4s, v31.4s, v15.4s\n"
- "ldr s20, [%[inptr0], x9]\n"
- "fmla v1.4s, v28.4s, v11.4s\n"
- "prfm pldl1keep, [%[inptr0], x15]\n"
- "fmla v7.4s, v28.4s, v16.4s\n"
- "ldr s28, [x24, %[input_col_stride1]]\n"
- "fmla v4.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x17]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "prfm pldl1keep, [x13, x23]\n"
- "fmla v3.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x22, x20]\n"
- "fmla v7.4s, v18.4s, v0.4s\n"
- "prfm pldl1keep, [x25, x15]\n"
- "fmla v9.4s, v18.4s, v16.4s\n"
- "prfm pldl1keep, [x24, x23]\n"
- "fmla v6.4s, v18.4s, v15.4s\n"
- "ldr s27, [x13, x16]\n"
- "fmla v4.4s, v24.4s, v17.4s\n"
- "prfm pldl1keep, [x13, x20]\n"
- "fmla v1.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [x22, x15]\n"
- "fmla v3.4s, v24.4s, v12.4s\n"
- "prfm pldl1keep, [x24, x20]\n"
- "fmla v9.4s, v24.4s, v0.4s\n"
- "prfm pldl1keep, [x13, x15]\n"
- "fmla v2.4s, v24.4s, v16.4s\n"
- "prfm pldl1keep, [x24, x15]\n"
- "fmla v8.4s, v24.4s, v15.4s\n"
- "ldr s24, [x22, x26]\n"
- "fmla v3.4s, v29.4s, v13.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v2.4s, v29.4s, v0.4s\n"
- "ldr s22, [x25, x9]\n"
- "fmla v7.4s, v30.4s, v11.4s\n"
- "ldr s21, [x24, x16]\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v9.4s, v25.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v7.4s, v25.4s, v12.4s\n"
- "add x25, x25, #4\n"
- "fmla v6.4s, v25.4s, v16.4s\n"
- "ldr s19, [x13, x26]\n"
- "fmla v4.4s, v26.4s, v14.4s\n"
- "fmla v1.4s, v26.4s, v17.4s\n"
- "fmla v3.4s, v26.4s, v10.4s\n"
- "fmla v7.4s, v26.4s, v13.4s\n"
- "fmla v9.4s, v26.4s, v12.4s\n"
- "fmla v2.4s, v26.4s, v11.4s\n"
- "fmla v6.4s, v26.4s, v0.4s\n"
- "fmla v8.4s, v26.4s, v16.4s\n"
- "fmla v5.4s, v26.4s, v15.4s\n"
- "ldr s26, [x22, x9]\n"
- "fmla v3.4s, v23.4s, v17.4s\n"
- "ldr s18, [x24, x26]\n"
- "fmla v9.4s, v23.4s, v13.4s\n"
- "add x22, x22, #4\n"
- "fmla v2.4s, v23.4s, v12.4s\n"
- "fmla v8.4s, v23.4s, v0.4s\n"
- "fmla v7.4s, v28.4s, v10.4s\n"
- "ldr s23, [x13, x9]\n"
- "fmla v6.4s, v28.4s, v11.4s\n"
- "ldr s25, [x24, x9]\n"
- "fmla v2.4s, v20.4s, v13.4s\n"
- "add x13, x13, #4\n"
- "fmla v1.4s, v27.4s, v14.4s\n"
- "add x24, x24, #4\n"
- "fmla v7.4s, v27.4s, v17.4s\n"
- "fmla v9.4s, v27.4s, v10.4s\n"
- "fmla v6.4s, v27.4s, v12.4s\n"
- "fmla v8.4s, v27.4s, v11.4s\n"
- "fmla v5.4s, v27.4s, v16.4s\n"
- "fmla v3.4s, v24.4s, v14.4s\n"
- "fmla v9.4s, v24.4s, v17.4s\n"
- "fmla v2.4s, v24.4s, v10.4s\n"
- "fmla v6.4s, v24.4s, v13.4s\n"
- "fmla v8.4s, v24.4s, v12.4s\n"
- "fmla v5.4s, v24.4s, v0.4s\n"
- "fmla v7.4s, v21.4s, v14.4s\n"
- "fmla v2.4s, v22.4s, v17.4s\n"
- "fmla v9.4s, v19.4s, v14.4s\n"
- "fmla v8.4s, v22.4s, v13.4s\n"
- "fmla v6.4s, v21.4s, v10.4s\n"
- "fmla v5.4s, v21.4s, v11.4s\n"
- "movi v29.16b, #0\n"
- "fmla v2.4s, v26.4s, v14.4s\n"
- "fmla v6.4s, v19.4s, v17.4s\n"
- "fmla v8.4s, v19.4s, v10.4s\n"
- "fmla v5.4s, v19.4s, v12.4s\n"
- "fmax v4.4s, v4.4s, v29.4s\n"
- "fmax v3.4s, v3.4s, v29.4s\n"
- "fmla v6.4s, v18.4s, v14.4s\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "str s4, [%[outptr0]]\n"
- "fmla v8.4s, v26.4s, v17.4s\n"
- "str s3, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v5.4s, v26.4s, v13.4s\n"
- "str s2, [%[outptr0], x19]\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "fmla v8.4s, v23.4s, v14.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "str s1, [x21]\n"
- "fmla v5.4s, v18.4s, v10.4s\n"
- "fmax v9.4s, v9.4s, v29.4s\n"
- "fmax v7.4s, v7.4s, v29.4s\n"
- "fmax v8.4s, v8.4s, v29.4s\n"
- "fmax v6.4s, v6.4s, v29.4s\n"
- "str s9, [x21, %[output_col_stride1]]\n"
- "fmla v5.4s, v23.4s, v17.4s\n"
- "str s8, [x21, x19]\n"
- "str s7, [x14]\n"
- "str s6, [x14, %[output_col_stride1]]\n"
- "add x21, x21, #4\n"
- "fmla v5.4s, v25.4s, v14.4s\n"
- "fmax v5.4s, v5.4s, v29.4s\n"
- "str s5, [x14, x19]\n"
- "add x14, x14, #4\n"
- "7:\n"
- : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
- : [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
-/**
- * ReLU6 specialisation of Conv::execute_tile, implemented as a single AArch64
- * inline-assembly block.
- *
- * As the code reads: for every channel it consumes a 5x5 window of input
- * positions (five row pointers, five column offsets), accumulates a 3x3 window
- * of outputs in v0-v8 from one bias value plus nine weights, and clamps each
- * output to [0, 6] (fmax against zero, fmin against 6.0) before storing it.
- *
- * Channels are walked in two phases:
- *  - n_channels / 4 iterations on 4-lane vectors (q-register loads/stores; data
- *    pointers advance by 16 bytes and the parameter pointer by 160 bytes);
- *  - n_channels % 4 iterations on single floats (s-register loads/stores; data
- *    pointers advance by 4 bytes and the parameter pointer by 40 bytes).
- *
- * The local copies of `weight_bias_ptr`, `input` and `output` are advanced by
- * the assembly ("+r" operands).
- *
- * @param n_channels        Number of channels to process.
- * @param weight_bias_ptr   Packed parameters; for each channel group the bias
- *                          is read first, followed by nine weights.
- * @param input             First input element of the window.
- * @param input_row_stride  Input row stride in floats (scaled by sizeof(float)).
- * @param input_col_stride  Input column stride in floats (scaled by sizeof(float)).
- * @param output            First output element of the window.
- * @param output_row_stride Output row stride in floats (scaled by sizeof(float)).
- * @param output_col_stride Output column stride in floats (scaled by sizeof(float)).
- */
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU6>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- // Set-up: input rows 1-4 live in x17/x14/x21/x26, input column byte offsets
- // 2-4 in x9/x15/x24, output rows 1-2 in x25/x13 and output column offset 2
- // in x27. x28/x22/x16/x23 are column offsets 1-4 plus 64 bytes and are only
- // ever used as prfm (prefetch) indices.
- "add x17, %[inptr0], %[input_row_stride]\n"
- "add x9, %[input_col_stride1], %[input_col_stride1]\n"
- // x28 is used as a prefetch index throughout this block but was never
- // written inside it; initialise it following the x22/x16/x23 pattern so the
- // column-1 prefetches target real input data instead of an arbitrary offset.
- "add x28, %[input_col_stride1], #64\n"
- "add x25, %[outptr0], %[output_row_stride]\n"
- "add x14, x17, %[input_row_stride]\n"
- "add x22, x9, #64\n"
- "add x15, x9, %[input_col_stride1]\n"
- "add x21, x14, %[input_row_stride]\n"
- "add x16, x15, #64\n"
- "add x24, x15, %[input_col_stride1]\n"
- "add x26, x21, %[input_row_stride]\n"
- "add x23, x24, #64\n"
- "add x13, x25, %[output_row_stride]\n"
- "add x27, %[output_col_stride1], %[output_col_stride1]\n"
- // x19 = n_channels % 4 (scalar iterations), x20 = n_channels / 4 (vector
- // iterations); skip the vector path entirely when x20 is zero.
- "and x19, %[n_channels], #3\n"
- "lsr x20, %[n_channels], #2\n"
- "cbz x20, 4f\n"
- // Vector path entry: load the bias (v19) and weights (v17..v9), seed all
- // nine accumulators with the bias and issue the first input loads.
- "1:\n"
- "ldr q19, [%[wbptr]]\n"
- "subs x20, x20, #1\n"
- "mov v8.16b, v19.16b\n"
- "ldr q17, [%[wbptr], #16]\n"
- "mov v5.16b, v19.16b\n"
- "ldr q16, [%[wbptr], #32]\n"
- "mov v7.16b, v19.16b\n"
- "ldr q15, [%[wbptr], #48]\n"
- "mov v2.16b, v19.16b\n"
- "ldr q14, [%[wbptr], #64]\n"
- "mov v4.16b, v19.16b\n"
- "ldr q13, [%[wbptr], #80]\n"
- "mov v6.16b, v19.16b\n"
- "ldr q12, [%[wbptr], #96]\n"
- "mov v1.16b, v19.16b\n"
- "ldr q11, [%[wbptr], #112]\n"
- "mov v3.16b, v19.16b\n"
- "ldr q10, [%[wbptr], #128]\n"
- "mov v0.16b, v19.16b\n"
- "ldr q9, [%[wbptr], #144]\n"
- "ldr q25, [%[inptr0]]\n"
- "ldr q27, [x17]\n"
- "fmla v8.4s, v25.4s, v17.4s\n"
- "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
- "ldr q20, [x14]\n"
- "ldr q22, [x17, %[input_col_stride1]]\n"
- "ldr q28, [%[inptr0], x9]\n"
- "ldr q23, [x21]\n"
- "fmla v8.4s, v27.4s, v14.4s\n"
- "ldr q18, [x14, %[input_col_stride1]]\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x17, #64]\n"
- "prfm pldl1keep, [%[inptr0], x28]\n"
- "prfm pldl1keep, [x14, #64]\n"
- "prfm pldl1keep, [x17, x28]\n"
- "prfm pldl1keep, [%[inptr0], x22]\n"
- "prfm pldl1keep, [x21, #64]\n"
- "prfm pldl1keep, [x14, x28]\n"
- // Flags still come from the subs above: go straight to the final iteration
- // when no further vector group follows.
- "beq 3f\n"
- // Vector loop body: completes the current group of four channels while
- // loading the parameters and first inputs of the next group.
- "2:\n"
- "fmla v5.4s, v27.4s, v17.4s\n"
- "ldr q27, [x17, x9]\n"
- "fmla v8.4s, v26.4s, v16.4s\n"
- "ldr q30, [%[inptr0], x15]\n"
- "fmla v7.4s, v26.4s, v17.4s\n"
- "ldr q31, [x26]\n"
- "fmla v5.4s, v20.4s, v14.4s\n"
- "ldr q24, [x21, %[input_col_stride1]]\n"
- "fmla v8.4s, v20.4s, v11.4s\n"
- "prfm pldl1keep, [x17, x22]\n"
- "fmla v2.4s, v20.4s, v17.4s\n"
- "ldr q29, [x14, x9]\n"
- "fmla v5.4s, v22.4s, v16.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v8.4s, v22.4s, v13.4s\n"
- "prfm pldl1keep, [x26, #64]\n"
- "fmla v7.4s, v22.4s, v14.4s\n"
- "prfm pldl1keep, [x21, x28]\n"
- "fmla v4.4s, v22.4s, v17.4s\n"
- "ldr q21, [x17, x15]\n"
- "fmla v8.4s, v28.4s, v15.4s\n"
- "prfm pldl1keep, [x14, x22]\n"
- "fmla v7.4s, v28.4s, v16.4s\n"
- "prfm pldl1keep, [x17, x16]\n"
- "fmla v6.4s, v28.4s, v17.4s\n"
- "ldr q19, [%[inptr0], x24]\n"
- "fmla v5.4s, v23.4s, v11.4s\n"
- "prfm pldl1keep, [%[inptr0], x23]\n"
- "fmla v2.4s, v23.4s, v14.4s\n"
- "ldr q28, [x26, %[input_col_stride1]]\n"
- "fmla v8.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x26, x28]\n"
- "fmla v5.4s, v18.4s, v13.4s\n"
- "prfm pldl1keep, [x21, x22]\n"
- "fmla v7.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x14, x16]\n"
- "fmla v2.4s, v18.4s, v16.4s\n"
- "prfm pldl1keep, [x17, x23]\n"
- "fmla v4.4s, v18.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x22]\n"
- "fmla v1.4s, v18.4s, v17.4s\n"
- "ldr q25, [x21, x9]\n"
- "fmla v8.4s, v27.4s, v12.4s\n"
- "prfm pldl1keep, [x21, x16]\n"
- "fmla v5.4s, v27.4s, v15.4s\n"
- "prfm pldl1keep, [x14, x23]\n"
- "fmla v7.4s, v27.4s, v13.4s\n"
- "prfm pldl1keep, [x26, x16]\n"
- "fmla v4.4s, v27.4s, v16.4s\n"
- "prfm pldl1keep, [x21, x23]\n"
- "fmla v6.4s, v27.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x23]\n"
- "fmla v3.4s, v27.4s, v17.4s\n"
- "ldr q27, [x14, x15]\n"
- "fmla v7.4s, v30.4s, v15.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v6.4s, v30.4s, v16.4s\n"
- "ldr q26, [x17, x24]\n"
- "fmla v2.4s, v31.4s, v11.4s\n"
- "ldr q20, [x26, x9]\n"
- "fmla v5.4s, v24.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v4.4s, v24.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v2.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v1.4s, v24.4s, v14.4s\n"
- "ldr q18, [x21, x15]\n"
- "fmla v8.4s, v29.4s, v9.4s\n"
- "prfm pldl1keep, [%[inptr0], x28]\n"
- "fmla v5.4s, v29.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], x22]\n"
- "fmla v7.4s, v29.4s, v10.4s\n"
- "add x17, x17, #16\n"
- "fmla v2.4s, v29.4s, v15.4s\n"
- "prfm pldl1keep, [x17, #64]\n"
- "fmla v4.4s, v29.4s, v13.4s\n"
- "prfm pldl1keep, [x17, x28]\n"
- "fmla v6.4s, v29.4s, v11.4s\n"
- "subs x20, x20, #1\n"
- "fmla v1.4s, v29.4s, v16.4s\n"
- "fmla v3.4s, v29.4s, v14.4s\n"
- "fmla v0.4s, v29.4s, v17.4s\n"
- "ldr q22, [x14, x24]\n"
- "fmla v7.4s, v21.4s, v12.4s\n"
- "ldr q23, [x26, x15]\n"
- "fmla v4.4s, v21.4s, v15.4s\n"
- "add x14, x14, #16\n"
- "fmla v6.4s, v21.4s, v13.4s\n"
- "prfm pldl1keep, [x14, #64]\n"
- "fmla v3.4s, v21.4s, v16.4s\n"
- "ldr q24, [x21, x24]\n"
- "fmla v2.4s, v28.4s, v10.4s\n"
- "prfm pldl1keep, [x14, x28]\n"
- "fmla v6.4s, v19.4s, v15.4s\n"
- "ldr q21, [x26, x24]\n"
- "fmla v1.4s, v28.4s, v11.4s\n"
- "ldr q19, [%[wbptr]]\n"
- "fmla v5.4s, v25.4s, v9.4s\n"
- "add x21, x21, #16\n"
- "fmla v2.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x21, #64]\n"
- "fmla v4.4s, v25.4s, v10.4s\n"
- "add x26, x26, #16\n"
- "fmla v1.4s, v25.4s, v13.4s\n"
- "fmla v3.4s, v25.4s, v11.4s\n"
- "fmla v0.4s, v25.4s, v14.4s\n"
- "ldr q17, [%[wbptr], #16]\n"
- "fmla v7.4s, v27.4s, v9.4s\n"
- "ldr q25, [%[inptr0]]\n"
- "fmla v4.4s, v27.4s, v12.4s\n"
- "fmla v6.4s, v27.4s, v10.4s\n"
- "fmla v1.4s, v27.4s, v15.4s\n"
- "fmla v3.4s, v27.4s, v13.4s\n"
- "fmla v0.4s, v27.4s, v16.4s\n"
- "ldr q14, [%[wbptr], #64]\n"
- "fmla v6.4s, v26.4s, v12.4s\n"
- "ldr q27, [x17]\n"
- "fmla v3.4s, v26.4s, v15.4s\n"
- "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v20.4s, v9.4s\n"
- "fmla v1.4s, v20.4s, v10.4s\n"
- "fmla v0.4s, v20.4s, v11.4s\n"
- "ldr q16, [%[wbptr], #32]\n"
- "fmla v4.4s, v18.4s, v9.4s\n"
- "ldr q20, [x14]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "fmla v3.4s, v18.4s, v10.4s\n"
- "fmla v0.4s, v18.4s, v13.4s\n"
- "ldr q11, [%[wbptr], #112]\n"
- "fmla v6.4s, v22.4s, v9.4s\n"
- "movi v30.16b, #0\n"
- "fmla v3.4s, v22.4s, v12.4s\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "fmla v0.4s, v22.4s, v15.4s\n"
- "ldr q13, [%[wbptr], #80]\n"
- "fmov v29.4s, #6.0\n"
- "fmax v8.4s, v8.4s, v30.4s\n"
- "fmla v3.4s, v24.4s, v9.4s\n"
- "fmax v7.4s, v7.4s, v30.4s\n"
- "fmla v0.4s, v23.4s, v10.4s\n"
- "ldr q15, [%[wbptr], #48]\n"
- "fmin v8.4s, v8.4s, v29.4s\n"
- "ldr q22, [x17, %[input_col_stride1]]\n"
- "fmin v7.4s, v7.4s, v29.4s\n"
- "fmax v6.4s, v6.4s, v30.4s\n"
- "str q8, [%[outptr0]]\n"
- "fmla v0.4s, v24.4s, v12.4s\n"
- "str q7, [%[outptr0], %[output_col_stride1]]\n"
- "fmin v6.4s, v6.4s, v29.4s\n"
- "fmax v5.4s, v5.4s, v30.4s\n"
- "ldr q10, [%[wbptr], #128]\n"
- "str q6, [%[outptr0], x27]\n"
- "fmla v0.4s, v21.4s, v9.4s\n"
- "fmin v5.4s, v5.4s, v29.4s\n"
- "ldr q12, [%[wbptr], #96]\n"
- "fmax v4.4s, v4.4s, v30.4s\n"
- "ldr q28, [%[inptr0], x9]\n"
- "str q5, [x25]\n"
- "fmax v3.4s, v3.4s, v30.4s\n"
- "fmin v4.4s, v4.4s, v29.4s\n"
- "ldr q9, [%[wbptr], #144]\n"
- "fmin v3.4s, v3.4s, v29.4s\n"
- "ldr q23, [x21]\n"
- "str q4, [x25, %[output_col_stride1]]\n"
- "fmax v2.4s, v2.4s, v30.4s\n"
- "str q3, [x25, x27]\n"
- "fmax v1.4s, v1.4s, v30.4s\n"
- "fmin v2.4s, v2.4s, v29.4s\n"
- "ldr q18, [x14, %[input_col_stride1]]\n"
- "fmin v1.4s, v1.4s, v29.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "str q2, [x13]\n"
- "fmax v0.4s, v0.4s, v30.4s\n"
- "str q1, [x13, %[output_col_stride1]]\n"
- "mov v8.16b, v19.16b\n"
- "fmin v0.4s, v0.4s, v29.4s\n"
- "add x25, x25, #16\n"
- "mov v5.16b, v19.16b\n"
- "mov v7.16b, v19.16b\n"
- "str q0, [x13, x27]\n"
- "mov v2.16b, v19.16b\n"
- "mov v4.16b, v19.16b\n"
- "add x13, x13, #16\n"
- "mov v6.16b, v19.16b\n"
- "mov v1.16b, v19.16b\n"
- "mov v3.16b, v19.16b\n"
- "mov v0.16b, v19.16b\n"
- "fmla v8.4s, v25.4s, v17.4s\n"
- "fmla v8.4s, v27.4s, v14.4s\n"
- "bne 2b\n"
- // Final vector iteration: same arithmetic as the loop body, but without
- // loading parameters or inputs for a following group.
- "3:\n"
- "fmla v5.4s, v27.4s, v17.4s\n"
- "ldr q27, [x17, x9]\n"
- "fmla v8.4s, v26.4s, v16.4s\n"
- "ldr q30, [%[inptr0], x15]\n"
- "fmla v7.4s, v26.4s, v17.4s\n"
- "ldr q31, [x26]\n"
- "fmla v5.4s, v20.4s, v14.4s\n"
- "ldr q24, [x21, %[input_col_stride1]]\n"
- "fmla v8.4s, v20.4s, v11.4s\n"
- "prfm pldl1keep, [x17, x22]\n"
- "fmla v2.4s, v20.4s, v17.4s\n"
- "ldr q29, [x14, x9]\n"
- "fmla v5.4s, v22.4s, v16.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v8.4s, v22.4s, v13.4s\n"
- "prfm pldl1keep, [x26, #64]\n"
- "fmla v7.4s, v22.4s, v14.4s\n"
- "prfm pldl1keep, [x21, x28]\n"
- "fmla v4.4s, v22.4s, v17.4s\n"
- "ldr q21, [x17, x15]\n"
- "fmla v8.4s, v28.4s, v15.4s\n"
- "prfm pldl1keep, [x14, x22]\n"
- "fmla v7.4s, v28.4s, v16.4s\n"
- "prfm pldl1keep, [x17, x16]\n"
- "fmla v6.4s, v28.4s, v17.4s\n"
- "ldr q19, [%[inptr0], x24]\n"
- "fmla v5.4s, v23.4s, v11.4s\n"
- "prfm pldl1keep, [%[inptr0], x23]\n"
- "fmla v2.4s, v23.4s, v14.4s\n"
- "ldr q28, [x26, %[input_col_stride1]]\n"
- "fmla v8.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x26, x28]\n"
- "fmla v5.4s, v18.4s, v13.4s\n"
- "prfm pldl1keep, [x21, x22]\n"
- "fmla v7.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x14, x16]\n"
- "fmla v2.4s, v18.4s, v16.4s\n"
- "prfm pldl1keep, [x17, x23]\n"
- "fmla v4.4s, v18.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x22]\n"
- "fmla v1.4s, v18.4s, v17.4s\n"
- "ldr q25, [x21, x9]\n"
- "fmla v8.4s, v27.4s, v12.4s\n"
- "prfm pldl1keep, [x21, x16]\n"
- "fmla v5.4s, v27.4s, v15.4s\n"
- "prfm pldl1keep, [x14, x23]\n"
- "fmla v7.4s, v27.4s, v13.4s\n"
- "prfm pldl1keep, [x26, x16]\n"
- "fmla v4.4s, v27.4s, v16.4s\n"
- "prfm pldl1keep, [x21, x23]\n"
- "fmla v6.4s, v27.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x23]\n"
- "fmla v3.4s, v27.4s, v17.4s\n"
- "ldr q27, [x14, x15]\n"
- "fmla v7.4s, v30.4s, v15.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v6.4s, v30.4s, v16.4s\n"
- "ldr q26, [x17, x24]\n"
- "fmla v2.4s, v31.4s, v11.4s\n"
- "ldr q20, [x26, x9]\n"
- "fmla v5.4s, v24.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v4.4s, v24.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v2.4s, v24.4s, v13.4s\n"
- "add x17, x17, #16\n"
- "fmla v1.4s, v24.4s, v14.4s\n"
- "ldr q18, [x21, x15]\n"
- "fmla v8.4s, v29.4s, v9.4s\n"
- "fmla v5.4s, v29.4s, v12.4s\n"
- "fmla v7.4s, v29.4s, v10.4s\n"
- "fmla v2.4s, v29.4s, v15.4s\n"
- "fmla v4.4s, v29.4s, v13.4s\n"
- "fmla v6.4s, v29.4s, v11.4s\n"
- "fmla v1.4s, v29.4s, v16.4s\n"
- "fmla v3.4s, v29.4s, v14.4s\n"
- "fmla v0.4s, v29.4s, v17.4s\n"
- "ldr q22, [x14, x24]\n"
- "fmla v7.4s, v21.4s, v12.4s\n"
- "ldr q23, [x26, x15]\n"
- "fmla v4.4s, v21.4s, v15.4s\n"
- "add x14, x14, #16\n"
- "fmla v6.4s, v21.4s, v13.4s\n"
- "fmla v3.4s, v21.4s, v16.4s\n"
- "fmla v2.4s, v28.4s, v10.4s\n"
- "ldr q24, [x21, x24]\n"
- "fmla v1.4s, v28.4s, v11.4s\n"
- "ldr q21, [x26, x24]\n"
- "fmla v6.4s, v19.4s, v15.4s\n"
- "add x21, x21, #16\n"
- "fmla v5.4s, v25.4s, v9.4s\n"
- "add x26, x26, #16\n"
- "fmla v2.4s, v25.4s, v12.4s\n"
- "fmla v4.4s, v25.4s, v10.4s\n"
- "fmla v1.4s, v25.4s, v13.4s\n"
- "fmla v3.4s, v25.4s, v11.4s\n"
- "fmla v0.4s, v25.4s, v14.4s\n"
- "fmla v7.4s, v27.4s, v9.4s\n"
- "fmla v4.4s, v27.4s, v12.4s\n"
- "fmla v6.4s, v27.4s, v10.4s\n"
- "fmla v1.4s, v27.4s, v15.4s\n"
- "fmla v3.4s, v27.4s, v13.4s\n"
- "fmla v0.4s, v27.4s, v16.4s\n"
- "fmla v2.4s, v20.4s, v9.4s\n"
- "fmla v6.4s, v26.4s, v12.4s\n"
- "fmla v4.4s, v18.4s, v9.4s\n"
- "fmla v3.4s, v26.4s, v15.4s\n"
- "fmla v1.4s, v20.4s, v10.4s\n"
- "fmla v0.4s, v20.4s, v11.4s\n"
- "movi v30.16b, #0\n"
- "fmla v6.4s, v22.4s, v9.4s\n"
- "fmov v29.4s, #6.0\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "fmla v3.4s, v18.4s, v10.4s\n"
- "fmla v0.4s, v18.4s, v13.4s\n"
- "fmax v8.4s, v8.4s, v30.4s\n"
- "fmax v7.4s, v7.4s, v30.4s\n"
- "fmax v6.4s, v6.4s, v30.4s\n"
- "fmla v3.4s, v22.4s, v12.4s\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "fmla v0.4s, v22.4s, v15.4s\n"
- "fmin v8.4s, v8.4s, v29.4s\n"
- "fmin v7.4s, v7.4s, v29.4s\n"
- "fmin v6.4s, v6.4s, v29.4s\n"
- "str q8, [%[outptr0]]\n"
- "fmla v3.4s, v24.4s, v9.4s\n"
- "str q7, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v0.4s, v23.4s, v10.4s\n"
- "str q6, [%[outptr0], x27]\n"
- "fmax v5.4s, v5.4s, v30.4s\n"
- "fmax v4.4s, v4.4s, v30.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v0.4s, v24.4s, v12.4s\n"
- "fmin v5.4s, v5.4s, v29.4s\n"
- "fmin v4.4s, v4.4s, v29.4s\n"
- "fmax v3.4s, v3.4s, v30.4s\n"
- "str q5, [x25]\n"
- "fmax v2.4s, v2.4s, v30.4s\n"
- "str q4, [x25, %[output_col_stride1]]\n"
- "fmla v0.4s, v21.4s, v9.4s\n"
- "fmin v3.4s, v3.4s, v29.4s\n"
- "fmin v2.4s, v2.4s, v29.4s\n"
- "fmax v1.4s, v1.4s, v30.4s\n"
- "str q3, [x25, x27]\n"
- "str q2, [x13]\n"
- "fmin v1.4s, v1.4s, v29.4s\n"
- "fmax v0.4s, v0.4s, v30.4s\n"
- "add x25, x25, #16\n"
- "str q1, [x13, %[output_col_stride1]]\n"
- "fmin v0.4s, v0.4s, v29.4s\n"
- "str q0, [x13, x27]\n"
- "add x13, x13, #16\n"
- // Scalar path for the remaining n_channels % 4 channels: the same schedule
- // as above using single-float (s-register) loads and stores.
- "4:\n"
- "cbz x19, 7f\n"
- "ldr s19, [%[wbptr]]\n"
- "mov v8.16b, v19.16b\n"
- "ldr s17, [%[wbptr], #4]\n"
- "mov v5.16b, v19.16b\n"
- "ldr s16, [%[wbptr], #8]\n"
- "mov v7.16b, v19.16b\n"
- "ldr s15, [%[wbptr], #12]\n"
- "mov v2.16b, v19.16b\n"
- "ldr s14, [%[wbptr], #16]\n"
- "mov v4.16b, v19.16b\n"
- "ldr s13, [%[wbptr], #20]\n"
- "mov v6.16b, v19.16b\n"
- "ldr s12, [%[wbptr], #24]\n"
- "mov v1.16b, v19.16b\n"
- "ldr s11, [%[wbptr], #28]\n"
- "mov v3.16b, v19.16b\n"
- "ldr s10, [%[wbptr], #32]\n"
- "mov v0.16b, v19.16b\n"
- "ldr s9, [%[wbptr], #36]\n"
- "ldr s25, [%[inptr0]]\n"
- "subs x19, x19, #1\n"
- "fmla v8.4s, v25.4s, v17.4s\n"
- "ldr s27, [x17]\n"
- "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
- "ldr s20, [x14]\n"
- "ldr s22, [x17, %[input_col_stride1]]\n"
- "ldr s28, [%[inptr0], x9]\n"
- "fmla v8.4s, v27.4s, v14.4s\n"
- "ldr s23, [x21]\n"
- "ldr s18, [x14, %[input_col_stride1]]\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x17, #64]\n"
- "prfm pldl1keep, [%[inptr0], x28]\n"
- "prfm pldl1keep, [x14, #64]\n"
- "prfm pldl1keep, [x17, x28]\n"
- "prfm pldl1keep, [%[inptr0], x22]\n"
- "prfm pldl1keep, [x21, #64]\n"
- "prfm pldl1keep, [x14, x28]\n"
- // Flags come from the subs on x19 above: only one scalar channel left.
- "beq 6f\n"
- // Scalar loop body: finishes one channel while loading the next one.
- "5:\n"
- "fmla v5.4s, v27.4s, v17.4s\n"
- "ldr s27, [x17, x9]\n"
- "fmla v8.4s, v26.4s, v16.4s\n"
- "ldr s30, [%[inptr0], x15]\n"
- "fmla v7.4s, v26.4s, v17.4s\n"
- "ldr s31, [x26]\n"
- "fmla v5.4s, v20.4s, v14.4s\n"
- "ldr s24, [x21, %[input_col_stride1]]\n"
- "fmla v8.4s, v20.4s, v11.4s\n"
- "prfm pldl1keep, [x17, x22]\n"
- "fmla v2.4s, v20.4s, v17.4s\n"
- "ldr s29, [x14, x9]\n"
- "fmla v5.4s, v22.4s, v16.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v8.4s, v22.4s, v13.4s\n"
- "prfm pldl1keep, [x26, #64]\n"
- "fmla v7.4s, v22.4s, v14.4s\n"
- "prfm pldl1keep, [x21, x28]\n"
- "fmla v4.4s, v22.4s, v17.4s\n"
- "ldr s21, [x17, x15]\n"
- "fmla v8.4s, v28.4s, v15.4s\n"
- "prfm pldl1keep, [x14, x22]\n"
- "fmla v7.4s, v28.4s, v16.4s\n"
- "prfm pldl1keep, [x17, x16]\n"
- "fmla v6.4s, v28.4s, v17.4s\n"
- "ldr s19, [%[inptr0], x24]\n"
- "fmla v5.4s, v23.4s, v11.4s\n"
- "prfm pldl1keep, [%[inptr0], x23]\n"
- "fmla v2.4s, v23.4s, v14.4s\n"
- "ldr s28, [x26, %[input_col_stride1]]\n"
- "fmla v8.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x26, x28]\n"
- "fmla v5.4s, v18.4s, v13.4s\n"
- "prfm pldl1keep, [x21, x22]\n"
- "fmla v7.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x14, x16]\n"
- "fmla v2.4s, v18.4s, v16.4s\n"
- "prfm pldl1keep, [x17, x23]\n"
- "fmla v4.4s, v18.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x22]\n"
- "fmla v1.4s, v18.4s, v17.4s\n"
- "ldr s25, [x21, x9]\n"
- "fmla v8.4s, v27.4s, v12.4s\n"
- "prfm pldl1keep, [x21, x16]\n"
- "fmla v5.4s, v27.4s, v15.4s\n"
- "prfm pldl1keep, [x14, x23]\n"
- "fmla v7.4s, v27.4s, v13.4s\n"
- "prfm pldl1keep, [x26, x16]\n"
- "fmla v4.4s, v27.4s, v16.4s\n"
- "prfm pldl1keep, [x21, x23]\n"
- "fmla v6.4s, v27.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x23]\n"
- "fmla v3.4s, v27.4s, v17.4s\n"
- "ldr s27, [x14, x15]\n"
- "fmla v7.4s, v30.4s, v15.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v6.4s, v30.4s, v16.4s\n"
- "ldr s26, [x17, x24]\n"
- "fmla v2.4s, v31.4s, v11.4s\n"
- "ldr s20, [x26, x9]\n"
- "fmla v5.4s, v24.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v4.4s, v24.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v2.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v1.4s, v24.4s, v14.4s\n"
- "ldr s18, [x21, x15]\n"
- "fmla v8.4s, v29.4s, v9.4s\n"
- "prfm pldl1keep, [%[inptr0], x28]\n"
- "fmla v5.4s, v29.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], x22]\n"
- "fmla v7.4s, v29.4s, v10.4s\n"
- "add x17, x17, #4\n"
- "fmla v2.4s, v29.4s, v15.4s\n"
- "prfm pldl1keep, [x17, #64]\n"
- "fmla v4.4s, v29.4s, v13.4s\n"
- "prfm pldl1keep, [x17, x28]\n"
- "fmla v6.4s, v29.4s, v11.4s\n"
- "subs x19, x19, #1\n"
- "fmla v1.4s, v29.4s, v16.4s\n"
- "fmla v3.4s, v29.4s, v14.4s\n"
- "fmla v0.4s, v29.4s, v17.4s\n"
- "ldr s22, [x14, x24]\n"
- "fmla v7.4s, v21.4s, v12.4s\n"
- "ldr s23, [x26, x15]\n"
- "fmla v4.4s, v21.4s, v15.4s\n"
- "add x14, x14, #4\n"
- "fmla v6.4s, v21.4s, v13.4s\n"
- "prfm pldl1keep, [x14, #64]\n"
- "fmla v3.4s, v21.4s, v16.4s\n"
- "ldr s24, [x21, x24]\n"
- "fmla v2.4s, v28.4s, v10.4s\n"
- "prfm pldl1keep, [x14, x28]\n"
- "fmla v6.4s, v19.4s, v15.4s\n"
- "ldr s21, [x26, x24]\n"
- "fmla v1.4s, v28.4s, v11.4s\n"
- "ldr s19, [%[wbptr]]\n"
- "fmla v5.4s, v25.4s, v9.4s\n"
- "add x21, x21, #4\n"
- "fmla v2.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x21, #64]\n"
- "fmla v4.4s, v25.4s, v10.4s\n"
- "add x26, x26, #4\n"
- "fmla v1.4s, v25.4s, v13.4s\n"
- "fmla v3.4s, v25.4s, v11.4s\n"
- "fmla v0.4s, v25.4s, v14.4s\n"
- "ldr s17, [%[wbptr], #4]\n"
- "fmla v7.4s, v27.4s, v9.4s\n"
- "ldr s25, [%[inptr0]]\n"
- "fmla v4.4s, v27.4s, v12.4s\n"
- "fmla v6.4s, v27.4s, v10.4s\n"
- "fmla v1.4s, v27.4s, v15.4s\n"
- "fmla v3.4s, v27.4s, v13.4s\n"
- "fmla v0.4s, v27.4s, v16.4s\n"
- "ldr s14, [%[wbptr], #16]\n"
- "fmla v6.4s, v26.4s, v12.4s\n"
- "ldr s27, [x17]\n"
- "fmla v3.4s, v26.4s, v15.4s\n"
- "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v2.4s, v20.4s, v9.4s\n"
- "fmla v1.4s, v20.4s, v10.4s\n"
- "fmla v0.4s, v20.4s, v11.4s\n"
- "ldr s16, [%[wbptr], #8]\n"
- "fmla v4.4s, v18.4s, v9.4s\n"
- "ldr s20, [x14]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "fmla v3.4s, v18.4s, v10.4s\n"
- "fmla v0.4s, v18.4s, v13.4s\n"
- "ldr s11, [%[wbptr], #28]\n"
- "fmla v6.4s, v22.4s, v9.4s\n"
- "movi v30.16b, #0\n"
- "fmla v3.4s, v22.4s, v12.4s\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "fmla v0.4s, v22.4s, v15.4s\n"
- "ldr s13, [%[wbptr], #20]\n"
- "fmov v29.4s, #6.0\n"
- "fmax v8.4s, v8.4s, v30.4s\n"
- "fmla v3.4s, v24.4s, v9.4s\n"
- "fmax v7.4s, v7.4s, v30.4s\n"
- "fmla v0.4s, v23.4s, v10.4s\n"
- "ldr s15, [%[wbptr], #12]\n"
- "fmin v8.4s, v8.4s, v29.4s\n"
- "ldr s22, [x17, %[input_col_stride1]]\n"
- "fmin v7.4s, v7.4s, v29.4s\n"
- "fmax v6.4s, v6.4s, v30.4s\n"
- "str s8, [%[outptr0]]\n"
- "fmla v0.4s, v24.4s, v12.4s\n"
- "str s7, [%[outptr0], %[output_col_stride1]]\n"
- "fmin v6.4s, v6.4s, v29.4s\n"
- "fmax v5.4s, v5.4s, v30.4s\n"
- "ldr s10, [%[wbptr], #32]\n"
- "str s6, [%[outptr0], x27]\n"
- "fmla v0.4s, v21.4s, v9.4s\n"
- "fmin v5.4s, v5.4s, v29.4s\n"
- "ldr s12, [%[wbptr], #24]\n"
- "fmax v4.4s, v4.4s, v30.4s\n"
- "ldr s28, [%[inptr0], x9]\n"
- "str s5, [x25]\n"
- "fmax v3.4s, v3.4s, v30.4s\n"
- "fmin v4.4s, v4.4s, v29.4s\n"
- "ldr s9, [%[wbptr], #36]\n"
- "fmin v3.4s, v3.4s, v29.4s\n"
- "ldr s23, [x21]\n"
- "str s4, [x25, %[output_col_stride1]]\n"
- "fmax v2.4s, v2.4s, v30.4s\n"
- "str s3, [x25, x27]\n"
- "fmax v1.4s, v1.4s, v30.4s\n"
- "fmin v2.4s, v2.4s, v29.4s\n"
- "ldr s18, [x14, %[input_col_stride1]]\n"
- "fmin v1.4s, v1.4s, v29.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "str s2, [x13]\n"
- "fmax v0.4s, v0.4s, v30.4s\n"
- "str s1, [x13, %[output_col_stride1]]\n"
- "mov v8.16b, v19.16b\n"
- "fmin v0.4s, v0.4s, v29.4s\n"
- "add x25, x25, #4\n"
- "mov v5.16b, v19.16b\n"
- "mov v7.16b, v19.16b\n"
- "str s0, [x13, x27]\n"
- "mov v2.16b, v19.16b\n"
- "mov v4.16b, v19.16b\n"
- "add x13, x13, #4\n"
- "mov v6.16b, v19.16b\n"
- "mov v1.16b, v19.16b\n"
- "mov v3.16b, v19.16b\n"
- "mov v0.16b, v19.16b\n"
- "fmla v8.4s, v25.4s, v17.4s\n"
- "fmla v8.4s, v27.4s, v14.4s\n"
- "bne 5b\n"
- // Final scalar channel: no loads for a following channel.
- "6:\n"
- "fmla v5.4s, v27.4s, v17.4s\n"
- "ldr s27, [x17, x9]\n"
- "fmla v8.4s, v26.4s, v16.4s\n"
- "ldr s30, [%[inptr0], x15]\n"
- "fmla v7.4s, v26.4s, v17.4s\n"
- "ldr s31, [x26]\n"
- "fmla v5.4s, v20.4s, v14.4s\n"
- "ldr s24, [x21, %[input_col_stride1]]\n"
- "fmla v8.4s, v20.4s, v11.4s\n"
- "prfm pldl1keep, [x17, x22]\n"
- "fmla v2.4s, v20.4s, v17.4s\n"
- "ldr s29, [x14, x9]\n"
- "fmla v5.4s, v22.4s, v16.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v8.4s, v22.4s, v13.4s\n"
- "prfm pldl1keep, [x26, #64]\n"
- "fmla v7.4s, v22.4s, v14.4s\n"
- "prfm pldl1keep, [x21, x28]\n"
- "fmla v4.4s, v22.4s, v17.4s\n"
- "ldr s21, [x17, x15]\n"
- "fmla v8.4s, v28.4s, v15.4s\n"
- "prfm pldl1keep, [x14, x22]\n"
- "fmla v7.4s, v28.4s, v16.4s\n"
- "prfm pldl1keep, [x17, x16]\n"
- "fmla v6.4s, v28.4s, v17.4s\n"
- "ldr s19, [%[inptr0], x24]\n"
- "fmla v5.4s, v23.4s, v11.4s\n"
- "prfm pldl1keep, [%[inptr0], x23]\n"
- "fmla v2.4s, v23.4s, v14.4s\n"
- "ldr s28, [x26, %[input_col_stride1]]\n"
- "fmla v8.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x26, x28]\n"
- "fmla v5.4s, v18.4s, v13.4s\n"
- "prfm pldl1keep, [x21, x22]\n"
- "fmla v7.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x14, x16]\n"
- "fmla v2.4s, v18.4s, v16.4s\n"
- "prfm pldl1keep, [x17, x23]\n"
- "fmla v4.4s, v18.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x22]\n"
- "fmla v1.4s, v18.4s, v17.4s\n"
- "ldr s25, [x21, x9]\n"
- "fmla v8.4s, v27.4s, v12.4s\n"
- "prfm pldl1keep, [x21, x16]\n"
- "fmla v5.4s, v27.4s, v15.4s\n"
- "prfm pldl1keep, [x14, x23]\n"
- "fmla v7.4s, v27.4s, v13.4s\n"
- "prfm pldl1keep, [x26, x16]\n"
- "fmla v4.4s, v27.4s, v16.4s\n"
- "prfm pldl1keep, [x21, x23]\n"
- "fmla v6.4s, v27.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x23]\n"
- "fmla v3.4s, v27.4s, v17.4s\n"
- "ldr s27, [x14, x15]\n"
- "fmla v7.4s, v30.4s, v15.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v6.4s, v30.4s, v16.4s\n"
- "ldr s26, [x17, x24]\n"
- "fmla v2.4s, v31.4s, v11.4s\n"
- "ldr s20, [x26, x9]\n"
- "fmla v5.4s, v24.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v4.4s, v24.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v2.4s, v24.4s, v13.4s\n"
- "add x17, x17, #4\n"
- "fmla v1.4s, v24.4s, v14.4s\n"
- "ldr s18, [x21, x15]\n"
- "fmla v8.4s, v29.4s, v9.4s\n"
- "fmla v5.4s, v29.4s, v12.4s\n"
- "fmla v7.4s, v29.4s, v10.4s\n"
- "fmla v2.4s, v29.4s, v15.4s\n"
- "fmla v4.4s, v29.4s, v13.4s\n"
- "fmla v6.4s, v29.4s, v11.4s\n"
- "fmla v1.4s, v29.4s, v16.4s\n"
- "fmla v3.4s, v29.4s, v14.4s\n"
- "fmla v0.4s, v29.4s, v17.4s\n"
- "ldr s22, [x14, x24]\n"
- "fmla v7.4s, v21.4s, v12.4s\n"
- "ldr s23, [x26, x15]\n"
- "fmla v4.4s, v21.4s, v15.4s\n"
- "add x14, x14, #4\n"
- "fmla v6.4s, v21.4s, v13.4s\n"
- "fmla v3.4s, v21.4s, v16.4s\n"
- "fmla v2.4s, v28.4s, v10.4s\n"
- "ldr s24, [x21, x24]\n"
- "fmla v1.4s, v28.4s, v11.4s\n"
- "ldr s21, [x26, x24]\n"
- "fmla v6.4s, v19.4s, v15.4s\n"
- "add x21, x21, #4\n"
- "fmla v5.4s, v25.4s, v9.4s\n"
- "add x26, x26, #4\n"
- "fmla v2.4s, v25.4s, v12.4s\n"
- "fmla v4.4s, v25.4s, v10.4s\n"
- "fmla v1.4s, v25.4s, v13.4s\n"
- "fmla v3.4s, v25.4s, v11.4s\n"
- "fmla v0.4s, v25.4s, v14.4s\n"
- "fmla v7.4s, v27.4s, v9.4s\n"
- "fmla v4.4s, v27.4s, v12.4s\n"
- "fmla v6.4s, v27.4s, v10.4s\n"
- "fmla v1.4s, v27.4s, v15.4s\n"
- "fmla v3.4s, v27.4s, v13.4s\n"
- "fmla v0.4s, v27.4s, v16.4s\n"
- "fmla v2.4s, v20.4s, v9.4s\n"
- "fmla v6.4s, v26.4s, v12.4s\n"
- "fmla v4.4s, v18.4s, v9.4s\n"
- "fmla v3.4s, v26.4s, v15.4s\n"
- "fmla v1.4s, v20.4s, v10.4s\n"
- "fmla v0.4s, v20.4s, v11.4s\n"
- "movi v30.16b, #0\n"
- "fmla v6.4s, v22.4s, v9.4s\n"
- "fmov v29.4s, #6.0\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "fmla v3.4s, v18.4s, v10.4s\n"
- "fmla v0.4s, v18.4s, v13.4s\n"
- "fmax v8.4s, v8.4s, v30.4s\n"
- "fmax v7.4s, v7.4s, v30.4s\n"
- "fmax v6.4s, v6.4s, v30.4s\n"
- "fmla v3.4s, v22.4s, v12.4s\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "fmla v0.4s, v22.4s, v15.4s\n"
- "fmin v8.4s, v8.4s, v29.4s\n"
- "fmin v7.4s, v7.4s, v29.4s\n"
- "fmin v6.4s, v6.4s, v29.4s\n"
- "str s8, [%[outptr0]]\n"
- "fmla v3.4s, v24.4s, v9.4s\n"
- "str s7, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v0.4s, v23.4s, v10.4s\n"
- "str s6, [%[outptr0], x27]\n"
- "fmax v5.4s, v5.4s, v30.4s\n"
- "fmax v4.4s, v4.4s, v30.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v0.4s, v24.4s, v12.4s\n"
- "fmin v5.4s, v5.4s, v29.4s\n"
- "fmin v4.4s, v4.4s, v29.4s\n"
- "fmax v3.4s, v3.4s, v30.4s\n"
- "str s5, [x25]\n"
- "fmax v2.4s, v2.4s, v30.4s\n"
- "str s4, [x25, %[output_col_stride1]]\n"
- "fmla v0.4s, v21.4s, v9.4s\n"
- "fmin v3.4s, v3.4s, v29.4s\n"
- "fmin v2.4s, v2.4s, v29.4s\n"
- "fmax v1.4s, v1.4s, v30.4s\n"
- "str s3, [x25, x27]\n"
- "str s2, [x13]\n"
- "fmin v1.4s, v1.4s, v29.4s\n"
- "fmax v0.4s, v0.4s, v30.4s\n"
- "add x25, x25, #4\n"
- "str s1, [x13, %[output_col_stride1]]\n"
- "fmin v0.4s, v0.4s, v29.4s\n"
- "str s0, [x13, x27]\n"
- "add x13, x13, #4\n"
- // Common exit.
- "7:\n"
- : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
- : [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels)
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
-#endif // __aarch64__
-
-template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>;
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
deleted file mode 100644
index b798b8cdbe..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
+++ /dev/null
@@ -1,769 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-namespace depthwise
-{
-
-using namespace neon_convolution_kernels;
-using Conv = DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
-
-#ifdef __aarch64__
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
- int n_channels,
- const void* weight_bias_ptr,
- const float* input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float* output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x15, %[inptr0], %[input_row_stride]\n"
- "add x26, %[input_col_stride1], %[input_col_stride1]\n"
- "add x21, %[outptr0], %[output_row_stride]\n"
- "add x16, x15, %[input_row_stride]\n"
- "add x27, x26, %[input_col_stride1]\n"
- "add x22, x21, %[output_row_stride]\n"
- "add x17, x16, %[input_row_stride]\n"
- "add x28, x27, %[input_col_stride1]\n"
- "add x23, %[output_col_stride1], %[output_col_stride1]\n"
- "add x9, x17, %[input_row_stride]\n"
- "add x13, x28, %[input_col_stride1]\n"
- "and x24, %[n_channels], #3\n"
- "add x19, x9, %[input_row_stride]\n"
- "add x14, x13, %[input_col_stride1]\n"
- "lsr x25, %[n_channels], #2\n"
- "add x20, x19, %[input_row_stride]\n"
- "cbz x25, 4f\n"
- "1:\n"
- "ldr q27, [%[wbptr]]\n"
- "subs x25, x25, #1\n"
- "mov v17.16b, v27.16b\n"
- "ldr q6, [%[wbptr], #16]\n"
- "mov v16.16b, v27.16b\n"
- "ldr q14, [%[wbptr], #32]\n"
- "mov v15.16b, v27.16b\n"
- "ldr q13, [%[wbptr], #48]\n"
- "mov v2.16b, v27.16b\n"
- "ldr q12, [%[wbptr], #64]\n"
- "mov v4.16b, v27.16b\n"
- "ldr q11, [%[wbptr], #80]\n"
- "mov v5.16b, v27.16b\n"
- "ldr q10, [%[wbptr], #96]\n"
- "mov v1.16b, v27.16b\n"
- "ldr q9, [%[wbptr], #112]\n"
- "mov v3.16b, v27.16b\n"
- "ldr q8, [%[wbptr], #128]\n"
- "mov v0.16b, v27.16b\n"
- "ldr q7, [%[wbptr], #144]\n"
- "ldr q29, [%[inptr0]]\n"
- "ldr q28, [x15]\n"
- "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
- "ldr q22, [x16]\n"
- "ldr q20, [x15, %[input_col_stride1]]\n"
- "ldr q19, [%[inptr0], x26]\n"
- "ldr q30, [x17]\n"
- "ldr q18, [x16, %[input_col_stride1]]\n"
- "beq 3f\n"
- "2:\n"
- "fmla v17.4s, v29.4s, v6.4s\n"
- "ldr q21, [x15, x26]\n"
- "fmla v16.4s, v22.4s, v6.4s\n"
- "ldr q27, [%[inptr0], x27]\n"
- "fmla v15.4s, v19.4s, v6.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v17.4s, v28.4s, v12.4s\n"
- "ldr q25, [x9]\n"
- "fmla v16.4s, v30.4s, v12.4s\n"
- "ldr q24, [x17, %[input_col_stride1]]\n"
- "fmla v15.4s, v21.4s, v12.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v17.4s, v26.4s, v14.4s\n"
- "ldr q23, [x16, x26]\n"
- "fmla v16.4s, v18.4s, v14.4s\n"
- "subs x25, x25, #1\n"
- "fmla v15.4s, v27.4s, v14.4s\n"
- "ldr q26, [x15, x27]\n"
- "fmla v17.4s, v22.4s, v9.4s\n"
- "ldr q22, [%[inptr0], x28]\n"
- "fmla v16.4s, v25.4s, v9.4s\n"
- "fmla v2.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v23.4s, v9.4s\n"
- "ldr q30, [x19]\n"
- "fmla v17.4s, v20.4s, v11.4s\n"
- "ldr q29, [x9, %[input_col_stride1]]\n"
- "fmla v16.4s, v24.4s, v11.4s\n"
- "ldr q28, [x17, x26]\n"
- "fmla v4.4s, v23.4s, v6.4s\n"
- "fmla v15.4s, v26.4s, v11.4s\n"
- "fmla v17.4s, v19.4s, v13.4s\n"
- "ldr q24, [x16, x27]\n"
- "fmla v16.4s, v23.4s, v13.4s\n"
- "ldr q25, [x15, x28]\n"
- "fmla v15.4s, v22.4s, v13.4s\n"
- "fmla v5.4s, v22.4s, v6.4s\n"
- "fmla v17.4s, v18.4s, v8.4s\n"
- "ldr q19, [%[inptr0], x13]\n"
- "fmla v2.4s, v30.4s, v12.4s\n"
- "ldr q18, [x20]\n"
- "fmla v16.4s, v29.4s, v8.4s\n"
- "ldr q22, [x19, %[input_col_stride1]]\n"
- "fmla v17.4s, v21.4s, v10.4s\n"
- "ldr q26, [x9, x26]\n"
- "fmla v2.4s, v29.4s, v14.4s\n"
- "ldr q20, [x17, x27]\n"
- "fmla v16.4s, v28.4s, v10.4s\n"
- "fmla v4.4s, v28.4s, v12.4s\n"
- "fmla v17.4s, v23.4s, v7.4s\n"
- "ldr q27, [x16, x28]\n"
- "fmla v15.4s, v24.4s, v8.4s\n"
- "ldr q30, [x15, x13]\n"
- "fmla v4.4s, v24.4s, v14.4s\n"
- "ldr q24, [%[inptr0], x14]\n"
- "str q17, [%[outptr0]]\n"
- "fmla v5.4s, v25.4s, v12.4s\n"
- "fmla v15.4s, v25.4s, v10.4s\n"
- "ldr q28, [x20, %[input_col_stride1]]\n"
- "fmla v2.4s, v18.4s, v9.4s\n"
- "ldr q17, [x19, x26]\n"
- "fmla v5.4s, v19.4s, v14.4s\n"
- "ldr q18, [x9, x27]\n"
- "fmla v16.4s, v26.4s, v7.4s\n"
- "ldr q25, [x17, x28]\n"
- "fmla v2.4s, v22.4s, v11.4s\n"
- "ldr q22, [x16, x13]\n"
- "fmla v4.4s, v26.4s, v9.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "str q16, [x21]\n"
- "fmla v1.4s, v26.4s, v6.4s\n"
- "fmla v2.4s, v26.4s, v13.4s\n"
- "ldr q21, [x15, x14]\n"
- "fmla v4.4s, v20.4s, v11.4s\n"
- "ldr q23, [x20, x26]\n"
- "fmla v15.4s, v27.4s, v7.4s\n"
- "ldr q19, [x19, x27]\n"
- "fmla v5.4s, v27.4s, v9.4s\n"
- "add x15, x15, #16\n"
- "fmla v4.4s, v27.4s, v13.4s\n"
- "fmla v3.4s, v27.4s, v6.4s\n"
- "str q15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v2.4s, v28.4s, v8.4s\n"
- "fmla v5.4s, v30.4s, v11.4s\n"
- "ldr q29, [x9, x28]\n"
- "fmla v1.4s, v17.4s, v12.4s\n"
- "ldr q27, [x17, x13]\n"
- "fmla v2.4s, v17.4s, v10.4s\n"
- "ldr q28, [x16, x14]\n"
- "fmla v5.4s, v24.4s, v13.4s\n"
- "ldr q26, [x20, x27]\n"
- "fmla v4.4s, v18.4s, v8.4s\n"
- "ldr q20, [x19, x28]\n"
- "fmla v1.4s, v18.4s, v14.4s\n"
- "ldr q17, [x9, x13]\n"
- "fmla v3.4s, v25.4s, v12.4s\n"
- "ldr q18, [x17, x14]\n"
- "fmla v4.4s, v25.4s, v10.4s\n"
- "ldr q16, [x20, x28]\n"
- "fmla v5.4s, v22.4s, v8.4s\n"
- "add x16, x16, #16\n"
- "fmla v3.4s, v22.4s, v14.4s\n"
- "ldr q15, [x19, x13]\n"
- "fmla v2.4s, v23.4s, v7.4s\n"
- "add x17, x17, #16\n"
- "fmla v5.4s, v21.4s, v10.4s\n"
- "ldr q21, [x9, x14]\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "ldr q23, [x20, x13]\n"
- "str q2, [x22]\n"
- "fmla v4.4s, v29.4s, v7.4s\n"
- "fmla v3.4s, v29.4s, v9.4s\n"
- "ldr q24, [x19, x14]\n"
- "fmla v1.4s, v19.4s, v11.4s\n"
- "ldr q25, [x20, x14]\n"
- "str q4, [x21, %[output_col_stride1]]\n"
- "fmla v0.4s, v29.4s, v6.4s\n"
- "fmla v3.4s, v27.4s, v11.4s\n"
- "ldr q27, [%[wbptr]]\n"
- "fmla v1.4s, v29.4s, v13.4s\n"
- "ldr q29, [%[inptr0]]\n"
- "fmla v5.4s, v28.4s, v7.4s\n"
- "ldr q6, [%[wbptr], #16]\n"
- "fmla v3.4s, v28.4s, v13.4s\n"
- "ldr q28, [x15]\n"
- "fmla v1.4s, v26.4s, v8.4s\n"
- "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
- "str q5, [%[outptr0], x23]\n"
- "fmla v0.4s, v20.4s, v12.4s\n"
- "fmla v3.4s, v17.4s, v8.4s\n"
- "ldr q22, [x16]\n"
- "fmla v1.4s, v20.4s, v10.4s\n"
- "ldr q20, [x15, %[input_col_stride1]]\n"
- "fmla v0.4s, v17.4s, v14.4s\n"
- "ldr q12, [%[wbptr], #64]\n"
- "fmla v3.4s, v18.4s, v10.4s\n"
- "ldr q19, [%[inptr0], x26]\n"
- "fmla v1.4s, v16.4s, v7.4s\n"
- "ldr q30, [x17]\n"
- "fmla v0.4s, v16.4s, v9.4s\n"
- "ldr q14, [%[wbptr], #32]\n"
- "fmla v3.4s, v21.4s, v7.4s\n"
- "ldr q18, [x16, %[input_col_stride1]]\n"
- "str q1, [x22, %[output_col_stride1]]\n"
- "mov v17.16b, v27.16b\n"
- "fmla v0.4s, v15.4s, v11.4s\n"
- "ldr q9, [%[wbptr], #112]\n"
- "str q3, [x21, x23]\n"
- "mov v16.16b, v27.16b\n"
- "mov v15.16b, v27.16b\n"
- "add x9, x9, #16\n"
- "fmla v0.4s, v21.4s, v13.4s\n"
- "ldr q11, [%[wbptr], #80]\n"
- "mov v2.16b, v27.16b\n"
- "add x19, x19, #16\n"
- "mov v4.16b, v27.16b\n"
- "add x20, x20, #16\n"
- "fmla v0.4s, v23.4s, v8.4s\n"
- "ldr q13, [%[wbptr], #48]\n"
- "mov v5.16b, v27.16b\n"
- "add %[outptr0], %[outptr0], #16\n"
- "mov v1.16b, v27.16b\n"
- "add x21, x21, #16\n"
- "fmla v0.4s, v24.4s, v10.4s\n"
- "ldr q8, [%[wbptr], #128]\n"
- "mov v3.16b, v27.16b\n"
- "fmla v0.4s, v25.4s, v7.4s\n"
- "ldr q10, [%[wbptr], #96]\n"
- "str q0, [x22, x23]\n"
- "mov v0.16b, v27.16b\n"
- "ldr q7, [%[wbptr], #144]\n"
- "add x22, x22, #16\n"
- "bne 2b\n"
- "3:\n"
- "fmla v17.4s, v29.4s, v6.4s\n"
- "ldr q21, [x15, x26]\n"
- "fmla v16.4s, v22.4s, v6.4s\n"
- "ldr q27, [%[inptr0], x27]\n"
- "fmla v15.4s, v19.4s, v6.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v17.4s, v28.4s, v12.4s\n"
- "ldr q25, [x9]\n"
- "fmla v16.4s, v30.4s, v12.4s\n"
- "ldr q24, [x17, %[input_col_stride1]]\n"
- "fmla v15.4s, v21.4s, v12.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v17.4s, v26.4s, v14.4s\n"
- "ldr q23, [x16, x26]\n"
- "fmla v16.4s, v18.4s, v14.4s\n"
- "fmla v2.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v27.4s, v14.4s\n"
- "ldr q26, [x15, x27]\n"
- "fmla v17.4s, v22.4s, v9.4s\n"
- "ldr q22, [%[inptr0], x28]\n"
- "fmla v16.4s, v25.4s, v9.4s\n"
- "ldr q30, [x19]\n"
- "fmla v15.4s, v23.4s, v9.4s\n"
- "fmla v4.4s, v23.4s, v6.4s\n"
- "fmla v17.4s, v20.4s, v11.4s\n"
- "ldr q29, [x9, %[input_col_stride1]]\n"
- "fmla v16.4s, v24.4s, v11.4s\n"
- "ldr q28, [x17, x26]\n"
- "fmla v15.4s, v26.4s, v11.4s\n"
- "ldr q24, [x16, x27]\n"
- "fmla v17.4s, v19.4s, v13.4s\n"
- "ldr q25, [x15, x28]\n"
- "fmla v16.4s, v23.4s, v13.4s\n"
- "fmla v5.4s, v22.4s, v6.4s\n"
- "fmla v15.4s, v22.4s, v13.4s\n"
- "ldr q19, [%[inptr0], x13]\n"
- "fmla v17.4s, v18.4s, v8.4s\n"
- "ldr q18, [x20]\n"
- "fmla v2.4s, v30.4s, v12.4s\n"
- "ldr q22, [x19, %[input_col_stride1]]\n"
- "fmla v16.4s, v29.4s, v8.4s\n"
- "fmla v4.4s, v28.4s, v12.4s\n"
- "fmla v17.4s, v21.4s, v10.4s\n"
- "ldr q26, [x9, x26]\n"
- "fmla v2.4s, v29.4s, v14.4s\n"
- "ldr q20, [x17, x27]\n"
- "fmla v16.4s, v28.4s, v10.4s\n"
- "ldr q27, [x16, x28]\n"
- "fmla v17.4s, v23.4s, v7.4s\n"
- "ldr q30, [x15, x13]\n"
- "fmla v15.4s, v24.4s, v8.4s\n"
- "fmla v4.4s, v24.4s, v14.4s\n"
- "fmla v5.4s, v25.4s, v12.4s\n"
- "ldr q24, [%[inptr0], x14]\n"
- "str q17, [%[outptr0]]\n"
- "fmla v2.4s, v18.4s, v9.4s\n"
- "fmla v15.4s, v25.4s, v10.4s\n"
- "ldr q28, [x20, %[input_col_stride1]]\n"
- "fmla v5.4s, v19.4s, v14.4s\n"
- "ldr q17, [x19, x26]\n"
- "fmla v2.4s, v22.4s, v11.4s\n"
- "ldr q18, [x9, x27]\n"
- "fmla v16.4s, v26.4s, v7.4s\n"
- "ldr q25, [x17, x28]\n"
- "fmla v4.4s, v26.4s, v9.4s\n"
- "ldr q22, [x16, x13]\n"
- "fmla v2.4s, v26.4s, v13.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "str q16, [x21]\n"
- "fmla v1.4s, v26.4s, v6.4s\n"
- "fmla v4.4s, v20.4s, v11.4s\n"
- "ldr q21, [x15, x14]\n"
- "fmla v15.4s, v27.4s, v7.4s\n"
- "ldr q23, [x20, x26]\n"
- "fmla v5.4s, v27.4s, v9.4s\n"
- "ldr q19, [x19, x27]\n"
- "fmla v4.4s, v27.4s, v13.4s\n"
- "add x15, x15, #16\n"
- "str q15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v3.4s, v27.4s, v6.4s\n"
- "fmla v5.4s, v30.4s, v11.4s\n"
- "ldr q29, [x9, x28]\n"
- "fmla v2.4s, v28.4s, v8.4s\n"
- "ldr q27, [x17, x13]\n"
- "fmla v1.4s, v17.4s, v12.4s\n"
- "ldr q28, [x16, x14]\n"
- "fmla v5.4s, v24.4s, v13.4s\n"
- "ldr q26, [x20, x27]\n"
- "fmla v2.4s, v17.4s, v10.4s\n"
- "ldr q20, [x19, x28]\n"
- "fmla v4.4s, v18.4s, v8.4s\n"
- "ldr q17, [x9, x13]\n"
- "fmla v1.4s, v18.4s, v14.4s\n"
- "ldr q18, [x17, x14]\n"
- "fmla v3.4s, v25.4s, v12.4s\n"
- "add x16, x16, #16\n"
- "fmla v4.4s, v25.4s, v10.4s\n"
- "ldr q16, [x20, x28]\n"
- "fmla v5.4s, v22.4s, v8.4s\n"
- "add x17, x17, #16\n"
- "fmla v3.4s, v22.4s, v14.4s\n"
- "ldr q15, [x19, x13]\n"
- "fmla v2.4s, v23.4s, v7.4s\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "fmla v5.4s, v21.4s, v10.4s\n"
- "ldr q21, [x9, x14]\n"
- "fmla v4.4s, v29.4s, v7.4s\n"
- "ldr q23, [x20, x13]\n"
- "str q2, [x22]\n"
- "fmla v1.4s, v19.4s, v11.4s\n"
- "fmla v3.4s, v29.4s, v9.4s\n"
- "ldr q24, [x19, x14]\n"
- "str q4, [x21, %[output_col_stride1]]\n"
- "fmla v0.4s, v29.4s, v6.4s\n"
- "fmla v1.4s, v29.4s, v13.4s\n"
- "ldr q25, [x20, x14]\n"
- "fmla v3.4s, v27.4s, v11.4s\n"
- "add x9, x9, #16\n"
- "fmla v5.4s, v28.4s, v7.4s\n"
- "add x19, x19, #16\n"
- "fmla v1.4s, v26.4s, v8.4s\n"
- "add x20, x20, #16\n"
- "fmla v3.4s, v28.4s, v13.4s\n"
- "fmla v0.4s, v20.4s, v12.4s\n"
- "str q5, [%[outptr0], x23]\n"
- "fmla v1.4s, v20.4s, v10.4s\n"
- "fmla v3.4s, v17.4s, v8.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v0.4s, v17.4s, v14.4s\n"
- "fmla v1.4s, v16.4s, v7.4s\n"
- "fmla v3.4s, v18.4s, v10.4s\n"
- "fmla v0.4s, v16.4s, v9.4s\n"
- "str q1, [x22, %[output_col_stride1]]\n"
- "fmla v3.4s, v21.4s, v7.4s\n"
- "fmla v0.4s, v15.4s, v11.4s\n"
- "str q3, [x21, x23]\n"
- "fmla v0.4s, v21.4s, v13.4s\n"
- "add x21, x21, #16\n"
- "fmla v0.4s, v23.4s, v8.4s\n"
- "fmla v0.4s, v24.4s, v10.4s\n"
- "fmla v0.4s, v25.4s, v7.4s\n"
- "str q0, [x22, x23]\n"
- "add x22, x22, #16\n"
- "4:\n"
- "cbz x24, 7f\n"
- "ldr s27, [%[wbptr]]\n"
- "mov v17.16b, v27.16b\n"
- "ldr s6, [%[wbptr], #4]\n"
- "mov v16.16b, v27.16b\n"
- "ldr s14, [%[wbptr], #8]\n"
- "mov v15.16b, v27.16b\n"
- "ldr s13, [%[wbptr], #12]\n"
- "mov v2.16b, v27.16b\n"
- "ldr s12, [%[wbptr], #16]\n"
- "mov v4.16b, v27.16b\n"
- "ldr s11, [%[wbptr], #20]\n"
- "mov v5.16b, v27.16b\n"
- "ldr s10, [%[wbptr], #24]\n"
- "mov v1.16b, v27.16b\n"
- "ldr s9, [%[wbptr], #28]\n"
- "mov v3.16b, v27.16b\n"
- "ldr s8, [%[wbptr], #32]\n"
- "mov v0.16b, v27.16b\n"
- "ldr s7, [%[wbptr], #36]\n"
- "ldr s29, [%[inptr0]]\n"
- "subs x24, x24, #1\n"
- "ldr s28, [x15]\n"
- "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
- "ldr s22, [x16]\n"
- "ldr s20, [x15, %[input_col_stride1]]\n"
- "ldr s19, [%[inptr0], x26]\n"
- "ldr s30, [x17]\n"
- "ldr s18, [x16, %[input_col_stride1]]\n"
- "beq 6f\n"
- "5:\n"
- "fmla v17.4s, v29.4s, v6.4s\n"
- "ldr s21, [x15, x26]\n"
- "fmla v16.4s, v22.4s, v6.4s\n"
- "ldr s27, [%[inptr0], x27]\n"
- "fmla v15.4s, v19.4s, v6.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v17.4s, v28.4s, v12.4s\n"
- "ldr s25, [x9]\n"
- "fmla v16.4s, v30.4s, v12.4s\n"
- "ldr s24, [x17, %[input_col_stride1]]\n"
- "fmla v15.4s, v21.4s, v12.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v17.4s, v26.4s, v14.4s\n"
- "ldr s23, [x16, x26]\n"
- "fmla v16.4s, v18.4s, v14.4s\n"
- "subs x24, x24, #1\n"
- "fmla v15.4s, v27.4s, v14.4s\n"
- "ldr s26, [x15, x27]\n"
- "fmla v17.4s, v22.4s, v9.4s\n"
- "ldr s22, [%[inptr0], x28]\n"
- "fmla v16.4s, v25.4s, v9.4s\n"
- "fmla v2.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v23.4s, v9.4s\n"
- "ldr s30, [x19]\n"
- "fmla v17.4s, v20.4s, v11.4s\n"
- "ldr s29, [x9, %[input_col_stride1]]\n"
- "fmla v16.4s, v24.4s, v11.4s\n"
- "ldr s28, [x17, x26]\n"
- "fmla v4.4s, v23.4s, v6.4s\n"
- "fmla v15.4s, v26.4s, v11.4s\n"
- "fmla v17.4s, v19.4s, v13.4s\n"
- "ldr s24, [x16, x27]\n"
- "fmla v16.4s, v23.4s, v13.4s\n"
- "ldr s25, [x15, x28]\n"
- "fmla v15.4s, v22.4s, v13.4s\n"
- "fmla v5.4s, v22.4s, v6.4s\n"
- "fmla v17.4s, v18.4s, v8.4s\n"
- "ldr s19, [%[inptr0], x13]\n"
- "fmla v2.4s, v30.4s, v12.4s\n"
- "ldr s18, [x20]\n"
- "fmla v16.4s, v29.4s, v8.4s\n"
- "ldr s22, [x19, %[input_col_stride1]]\n"
- "fmla v17.4s, v21.4s, v10.4s\n"
- "ldr s26, [x9, x26]\n"
- "fmla v2.4s, v29.4s, v14.4s\n"
- "ldr s20, [x17, x27]\n"
- "fmla v16.4s, v28.4s, v10.4s\n"
- "fmla v4.4s, v28.4s, v12.4s\n"
- "fmla v17.4s, v23.4s, v7.4s\n"
- "ldr s27, [x16, x28]\n"
- "fmla v15.4s, v24.4s, v8.4s\n"
- "ldr s30, [x15, x13]\n"
- "fmla v4.4s, v24.4s, v14.4s\n"
- "ldr s24, [%[inptr0], x14]\n"
- "str s17, [%[outptr0]]\n"
- "fmla v5.4s, v25.4s, v12.4s\n"
- "fmla v15.4s, v25.4s, v10.4s\n"
- "ldr s28, [x20, %[input_col_stride1]]\n"
- "fmla v2.4s, v18.4s, v9.4s\n"
- "ldr s17, [x19, x26]\n"
- "fmla v5.4s, v19.4s, v14.4s\n"
- "ldr s18, [x9, x27]\n"
- "fmla v16.4s, v26.4s, v7.4s\n"
- "ldr s25, [x17, x28]\n"
- "fmla v2.4s, v22.4s, v11.4s\n"
- "ldr s22, [x16, x13]\n"
- "fmla v4.4s, v26.4s, v9.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "str s16, [x21]\n"
- "fmla v1.4s, v26.4s, v6.4s\n"
- "fmla v2.4s, v26.4s, v13.4s\n"
- "ldr s21, [x15, x14]\n"
- "fmla v4.4s, v20.4s, v11.4s\n"
- "ldr s23, [x20, x26]\n"
- "fmla v15.4s, v27.4s, v7.4s\n"
- "ldr s19, [x19, x27]\n"
- "fmla v5.4s, v27.4s, v9.4s\n"
- "add x15, x15, #4\n"
- "fmla v4.4s, v27.4s, v13.4s\n"
- "fmla v3.4s, v27.4s, v6.4s\n"
- "str s15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v2.4s, v28.4s, v8.4s\n"
- "fmla v5.4s, v30.4s, v11.4s\n"
- "ldr s29, [x9, x28]\n"
- "fmla v1.4s, v17.4s, v12.4s\n"
- "ldr s27, [x17, x13]\n"
- "fmla v2.4s, v17.4s, v10.4s\n"
- "ldr s28, [x16, x14]\n"
- "fmla v5.4s, v24.4s, v13.4s\n"
- "ldr s26, [x20, x27]\n"
- "fmla v4.4s, v18.4s, v8.4s\n"
- "ldr s20, [x19, x28]\n"
- "fmla v1.4s, v18.4s, v14.4s\n"
- "ldr s17, [x9, x13]\n"
- "fmla v3.4s, v25.4s, v12.4s\n"
- "ldr s18, [x17, x14]\n"
- "fmla v4.4s, v25.4s, v10.4s\n"
- "ldr s16, [x20, x28]\n"
- "fmla v5.4s, v22.4s, v8.4s\n"
- "add x16, x16, #4\n"
- "fmla v3.4s, v22.4s, v14.4s\n"
- "ldr s15, [x19, x13]\n"
- "fmla v2.4s, v23.4s, v7.4s\n"
- "add x17, x17, #4\n"
- "fmla v5.4s, v21.4s, v10.4s\n"
- "ldr s21, [x9, x14]\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "ldr s23, [x20, x13]\n"
- "str s2, [x22]\n"
- "fmla v4.4s, v29.4s, v7.4s\n"
- "fmla v3.4s, v29.4s, v9.4s\n"
- "ldr s24, [x19, x14]\n"
- "fmla v1.4s, v19.4s, v11.4s\n"
- "ldr s25, [x20, x14]\n"
- "str s4, [x21, %[output_col_stride1]]\n"
- "fmla v0.4s, v29.4s, v6.4s\n"
- "fmla v3.4s, v27.4s, v11.4s\n"
- "ldr s27, [%[wbptr]]\n"
- "fmla v1.4s, v29.4s, v13.4s\n"
- "ldr s29, [%[inptr0]]\n"
- "fmla v5.4s, v28.4s, v7.4s\n"
- "ldr s6, [%[wbptr], #4]\n"
- "fmla v3.4s, v28.4s, v13.4s\n"
- "ldr s28, [x15]\n"
- "fmla v1.4s, v26.4s, v8.4s\n"
- "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
- "str s5, [%[outptr0], x23]\n"
- "fmla v0.4s, v20.4s, v12.4s\n"
- "fmla v3.4s, v17.4s, v8.4s\n"
- "ldr s22, [x16]\n"
- "fmla v1.4s, v20.4s, v10.4s\n"
- "ldr s20, [x15, %[input_col_stride1]]\n"
- "fmla v0.4s, v17.4s, v14.4s\n"
- "ldr s12, [%[wbptr], #16]\n"
- "fmla v3.4s, v18.4s, v10.4s\n"
- "ldr s19, [%[inptr0], x26]\n"
- "fmla v1.4s, v16.4s, v7.4s\n"
- "ldr s30, [x17]\n"
- "fmla v0.4s, v16.4s, v9.4s\n"
- "ldr s14, [%[wbptr], #8]\n"
- "fmla v3.4s, v21.4s, v7.4s\n"
- "ldr s18, [x16, %[input_col_stride1]]\n"
- "str s1, [x22, %[output_col_stride1]]\n"
- "mov v17.16b, v27.16b\n"
- "fmla v0.4s, v15.4s, v11.4s\n"
- "ldr s9, [%[wbptr], #28]\n"
- "str s3, [x21, x23]\n"
- "mov v16.16b, v27.16b\n"
- "mov v15.16b, v27.16b\n"
- "add x9, x9, #4\n"
- "fmla v0.4s, v21.4s, v13.4s\n"
- "ldr s11, [%[wbptr], #20]\n"
- "mov v2.16b, v27.16b\n"
- "add x19, x19, #4\n"
- "mov v4.16b, v27.16b\n"
- "add x20, x20, #4\n"
- "fmla v0.4s, v23.4s, v8.4s\n"
- "ldr s13, [%[wbptr], #12]\n"
- "mov v5.16b, v27.16b\n"
- "add %[outptr0], %[outptr0], #4\n"
- "mov v1.16b, v27.16b\n"
- "add x21, x21, #4\n"
- "fmla v0.4s, v24.4s, v10.4s\n"
- "ldr s8, [%[wbptr], #32]\n"
- "mov v3.16b, v27.16b\n"
- "fmla v0.4s, v25.4s, v7.4s\n"
- "ldr s10, [%[wbptr], #24]\n"
- "str s0, [x22, x23]\n"
- "mov v0.16b, v27.16b\n"
- "ldr s7, [%[wbptr], #36]\n"
- "add x22, x22, #4\n"
- "bne 5b\n"
- "6:\n"
- "fmla v17.4s, v29.4s, v6.4s\n"
- "ldr s21, [x15, x26]\n"
- "fmla v16.4s, v22.4s, v6.4s\n"
- "ldr s27, [%[inptr0], x27]\n"
- "fmla v15.4s, v19.4s, v6.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v17.4s, v28.4s, v12.4s\n"
- "ldr s25, [x9]\n"
- "fmla v16.4s, v30.4s, v12.4s\n"
- "ldr s24, [x17, %[input_col_stride1]]\n"
- "fmla v15.4s, v21.4s, v12.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v17.4s, v26.4s, v14.4s\n"
- "ldr s23, [x16, x26]\n"
- "fmla v16.4s, v18.4s, v14.4s\n"
- "fmla v2.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v27.4s, v14.4s\n"
- "ldr s26, [x15, x27]\n"
- "fmla v17.4s, v22.4s, v9.4s\n"
- "ldr s22, [%[inptr0], x28]\n"
- "fmla v16.4s, v25.4s, v9.4s\n"
- "ldr s30, [x19]\n"
- "fmla v15.4s, v23.4s, v9.4s\n"
- "fmla v4.4s, v23.4s, v6.4s\n"
- "fmla v17.4s, v20.4s, v11.4s\n"
- "ldr s29, [x9, %[input_col_stride1]]\n"
- "fmla v16.4s, v24.4s, v11.4s\n"
- "ldr s28, [x17, x26]\n"
- "fmla v15.4s, v26.4s, v11.4s\n"
- "ldr s24, [x16, x27]\n"
- "fmla v17.4s, v19.4s, v13.4s\n"
- "ldr s25, [x15, x28]\n"
- "fmla v16.4s, v23.4s, v13.4s\n"
- "fmla v5.4s, v22.4s, v6.4s\n"
- "fmla v15.4s, v22.4s, v13.4s\n"
- "ldr s19, [%[inptr0], x13]\n"
- "fmla v17.4s, v18.4s, v8.4s\n"
- "ldr s18, [x20]\n"
- "fmla v2.4s, v30.4s, v12.4s\n"
- "ldr s22, [x19, %[input_col_stride1]]\n"
- "fmla v16.4s, v29.4s, v8.4s\n"
- "fmla v4.4s, v28.4s, v12.4s\n"
- "fmla v17.4s, v21.4s, v10.4s\n"
- "ldr s26, [x9, x26]\n"
- "fmla v2.4s, v29.4s, v14.4s\n"
- "ldr s20, [x17, x27]\n"
- "fmla v16.4s, v28.4s, v10.4s\n"
- "ldr s27, [x16, x28]\n"
- "fmla v17.4s, v23.4s, v7.4s\n"
- "ldr s30, [x15, x13]\n"
- "fmla v15.4s, v24.4s, v8.4s\n"
- "fmla v4.4s, v24.4s, v14.4s\n"
- "fmla v5.4s, v25.4s, v12.4s\n"
- "ldr s24, [%[inptr0], x14]\n"
- "str s17, [%[outptr0]]\n"
- "fmla v2.4s, v18.4s, v9.4s\n"
- "fmla v15.4s, v25.4s, v10.4s\n"
- "ldr s28, [x20, %[input_col_stride1]]\n"
- "fmla v5.4s, v19.4s, v14.4s\n"
- "ldr s17, [x19, x26]\n"
- "fmla v2.4s, v22.4s, v11.4s\n"
- "ldr s18, [x9, x27]\n"
- "fmla v16.4s, v26.4s, v7.4s\n"
- "ldr s25, [x17, x28]\n"
- "fmla v4.4s, v26.4s, v9.4s\n"
- "ldr s22, [x16, x13]\n"
- "fmla v2.4s, v26.4s, v13.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "str s16, [x21]\n"
- "fmla v1.4s, v26.4s, v6.4s\n"
- "fmla v4.4s, v20.4s, v11.4s\n"
- "ldr s21, [x15, x14]\n"
- "fmla v15.4s, v27.4s, v7.4s\n"
- "ldr s23, [x20, x26]\n"
- "fmla v5.4s, v27.4s, v9.4s\n"
- "ldr s19, [x19, x27]\n"
- "fmla v4.4s, v27.4s, v13.4s\n"
- "add x15, x15, #4\n"
- "str s15, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v3.4s, v27.4s, v6.4s\n"
- "fmla v5.4s, v30.4s, v11.4s\n"
- "ldr s29, [x9, x28]\n"
- "fmla v2.4s, v28.4s, v8.4s\n"
- "ldr s27, [x17, x13]\n"
- "fmla v1.4s, v17.4s, v12.4s\n"
- "ldr s28, [x16, x14]\n"
- "fmla v5.4s, v24.4s, v13.4s\n"
- "ldr s26, [x20, x27]\n"
- "fmla v2.4s, v17.4s, v10.4s\n"
- "ldr s20, [x19, x28]\n"
- "fmla v4.4s, v18.4s, v8.4s\n"
- "ldr s17, [x9, x13]\n"
- "fmla v1.4s, v18.4s, v14.4s\n"
- "ldr s18, [x17, x14]\n"
- "fmla v3.4s, v25.4s, v12.4s\n"
- "add x16, x16, #4\n"
- "fmla v4.4s, v25.4s, v10.4s\n"
- "ldr s16, [x20, x28]\n"
- "fmla v5.4s, v22.4s, v8.4s\n"
- "add x17, x17, #4\n"
- "fmla v3.4s, v22.4s, v14.4s\n"
- "ldr s15, [x19, x13]\n"
- "fmla v2.4s, v23.4s, v7.4s\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "fmla v5.4s, v21.4s, v10.4s\n"
- "ldr s21, [x9, x14]\n"
- "fmla v4.4s, v29.4s, v7.4s\n"
- "ldr s23, [x20, x13]\n"
- "str s2, [x22]\n"
- "fmla v1.4s, v19.4s, v11.4s\n"
- "fmla v3.4s, v29.4s, v9.4s\n"
- "ldr s24, [x19, x14]\n"
- "str s4, [x21, %[output_col_stride1]]\n"
- "fmla v0.4s, v29.4s, v6.4s\n"
- "fmla v1.4s, v29.4s, v13.4s\n"
- "ldr s25, [x20, x14]\n"
- "fmla v3.4s, v27.4s, v11.4s\n"
- "add x9, x9, #4\n"
- "fmla v5.4s, v28.4s, v7.4s\n"
- "add x19, x19, #4\n"
- "fmla v1.4s, v26.4s, v8.4s\n"
- "add x20, x20, #4\n"
- "fmla v3.4s, v28.4s, v13.4s\n"
- "fmla v0.4s, v20.4s, v12.4s\n"
- "str s5, [%[outptr0], x23]\n"
- "fmla v1.4s, v20.4s, v10.4s\n"
- "fmla v3.4s, v17.4s, v8.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v0.4s, v17.4s, v14.4s\n"
- "fmla v1.4s, v16.4s, v7.4s\n"
- "fmla v3.4s, v18.4s, v10.4s\n"
- "fmla v0.4s, v16.4s, v9.4s\n"
- "str s1, [x22, %[output_col_stride1]]\n"
- "fmla v3.4s, v21.4s, v7.4s\n"
- "fmla v0.4s, v15.4s, v11.4s\n"
- "str s3, [x21, x23]\n"
- "fmla v0.4s, v21.4s, v13.4s\n"
- "add x21, x21, #4\n"
- "fmla v0.4s, v23.4s, v8.4s\n"
- "fmla v0.4s, v24.4s, v10.4s\n"
- "fmla v0.4s, v25.4s, v7.4s\n"
- "str s0, [x22, x23]\n"
- "add x22, x22, #4\n"
- "7:\n"
- : [wbptr] "+r" (weight_bias_ptr), [inptr0] "+r" (input), [outptr0] "+r" (output)
- : [n_channels] "r" ((long long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x13", "x14", "memory"
- );
-}
-#endif // __aarch64__
-
-template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
deleted file mode 100644
index 89d1f2238b..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
+++ /dev/null
@@ -1,6018 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-namespace depthwise
-{
-
-using namespace neon_convolution_kernels;
-using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>;
-
-#ifdef __aarch64__
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x8, %[inptr0], %[input_row_stride]\n"
- "add x15, %[input_col_stride1], %[input_col_stride1]\n"
- "add x23, %[outptr0], %[output_row_stride]\n"
- "add x9, x8, %[input_row_stride]\n"
- "add x16, x15, #64\n"
- "add x17, x15, %[input_col_stride1]\n"
- "add x10, x9, %[input_row_stride]\n"
- "add x7, x17, #64\n"
- "add x19, x17, %[input_col_stride1]\n"
- "add x11, x10, %[input_row_stride]\n"
- "add x20, x19, #64\n"
- "add x21, x19, %[input_col_stride1]\n"
- "add x12, x11, %[input_row_stride]\n"
- "add x22, x21, #64\n"
- "add x24, x23, %[output_row_stride]\n"
- "add x25, x24, %[output_row_stride]\n"
- "add x26, %[output_col_stride1], %[output_col_stride1]\n"
- "and x13, %[n_channels], #3\n"
- "add x27, x26, %[output_col_stride1]\n"
- "lsr x14, %[n_channels], #2\n"
- "cbz x14, 4f\n"
- "1:\n"
- "ldr q14, [%[wbptr]]\n"
- "subs x14, x14, #1\n"
- "mov v17.16b, v14.16b\n"
- "ldr q12, [%[wbptr], #16]\n"
- "mov v23.16b, v14.16b\n"
- "ldr q11, [%[wbptr], #32]\n"
- "mov v24.16b, v14.16b\n"
- "ldr q10, [%[wbptr], #48]\n"
- "mov v20.16b, v14.16b\n"
- "ldr q9, [%[wbptr], #64]\n"
- "mov v16.16b, v14.16b\n"
- "ldr q8, [%[wbptr], #80]\n"
- "mov v13.16b, v14.16b\n"
- "ldr q7, [%[wbptr], #96]\n"
- "mov v0.16b, v14.16b\n"
- "ldr q6, [%[wbptr], #112]\n"
- "mov v1.16b, v14.16b\n"
- "ldr q5, [%[wbptr], #128]\n"
- "mov v2.16b, v14.16b\n"
- "ldr q4, [%[wbptr], #144]\n"
- "mov v3.16b, v14.16b\n"
- "ldr q29, [%[inptr0]]\n"
- "fmla v17.4s, v29.4s, v12.4s\n"
- "ldr q28, [x8]\n"
- "ldr q30, [%[inptr0], %[input_col_stride1]]\n"
- "ldr q25, [x9]\n"
- "ldr q26, [x8, %[input_col_stride1]]\n"
- "ldr q27, [%[inptr0], x15]\n"
- "ldr q15, [x10]\n"
- "ldr q18, [x9, %[input_col_stride1]]\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x8, #64]\n"
- "prfm pldl1keep, [%[inptr0], x28]\n"
- "prfm pldl1keep, [x9, #64]\n"
- "prfm pldl1keep, [x8, x28]\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "prfm pldl1keep, [x10, #64]\n"
- "prfm pldl1keep, [x9, x28]\n"
- "beq 3f\n"
- "2:\n"
- "fmla v17.4s, v28.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x16]\n"
- "fmla v23.4s, v28.4s, v12.4s\n"
- "ldr q22, [x8, x15]\n"
- "fmla v24.4s, v30.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], x7]\n"
- "fmla v17.4s, v30.4s, v11.4s\n"
- "ldr q29, [%[inptr0], x17]\n"
- "fmla v23.4s, v25.4s, v9.4s\n"
- "prfm pldl1keep, [x11, #64]\n"
- "fmla v20.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x10, x28]\n"
- "fmla v17.4s, v25.4s, v6.4s\n"
- "ldr q25, [x11]\n"
- "fmla v23.4s, v26.4s, v11.4s\n"
- "prfm pldl1keep, [x9, x16]\n"
- "fmla v24.4s, v26.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x7]\n"
- "fmla v17.4s, v26.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x20]\n"
- "fmla v16.4s, v26.4s, v12.4s\n"
- "ldr q28, [x10, %[input_col_stride1]]\n"
- "fmla v24.4s, v27.4s, v11.4s\n"
- "prfm pldl1keep, [x12, #64]\n"
- "fmla v17.4s, v27.4s, v10.4s\n"
- "prfm pldl1keep, [x11, x28]\n"
- "fmla v13.4s, v27.4s, v12.4s\n"
- "ldr q19, [x9, x15]\n"
- "fmla v23.4s, v15.4s, v6.4s\n"
- "prfm pldl1keep, [x10, x16]\n"
- "fmla v20.4s, v15.4s, v9.4s\n"
- "prfm pldl1keep, [x9, x7]\n"
- "fmla v0.4s, v15.4s, v12.4s\n"
- "ldr q21, [x8, x17]\n"
- "fmla v17.4s, v18.4s, v5.4s\n"
- "prfm pldl1keep, [x8, x20]\n"
- "fmla v23.4s, v18.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x22]\n"
- "fmla v24.4s, v18.4s, v6.4s\n"
- "prfm pldl1keep, [x12, x28]\n"
- "fmla v20.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x16]\n"
- "fmla v16.4s, v18.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x7]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "ldr q27, [%[inptr0], x19]\n"
- "fmla v17.4s, v22.4s, v7.4s\n"
- "prfm pldl1keep, [x9, x20]\n"
- "fmla v23.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [x8, x22]\n"
- "fmla v24.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [x12, x16]\n"
- "fmla v16.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x7]\n"
- "fmla v13.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x20]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "ldr q18, [x12]\n"
- "fmla v24.4s, v29.4s, v10.4s\n"
- "prfm pldl1keep, [x9, x22]\n"
- "fmla v13.4s, v29.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x7]\n"
- "fmla v3.4s, v29.4s, v12.4s\n"
- "ldr q22, [x11, %[input_col_stride1]]\n"
- "fmla v20.4s, v25.4s, v6.4s\n"
- "prfm pldl1keep, [x11, x20]\n"
- "fmla v0.4s, v25.4s, v9.4s\n"
- "ldr q25, [x10, x15]\n"
- "fmla v23.4s, v28.4s, v5.4s\n"
- "prfm pldl1keep, [x10, x22]\n"
- "fmla v20.4s, v28.4s, v8.4s\n"
- "prfm pldl1keep, [x12, x20]\n"
- "fmla v16.4s, v28.4s, v6.4s\n"
- "prfm pldl1keep, [x11, x22]\n"
- "fmla v0.4s, v28.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x22]\n"
- "fmla v1.4s, v28.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v17.4s, v19.4s, v4.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v23.4s, v19.4s, v7.4s\n"
- "subs x14, x14, #1\n"
- "fmla v24.4s, v19.4s, v5.4s\n"
- "fmla v20.4s, v19.4s, v10.4s\n"
- "str q17, [%[outptr0]]\n"
- "mov v15.16b, v14.16b\n"
- "fmla v16.4s, v19.4s, v8.4s\n"
- "fmla v13.4s, v19.4s, v6.4s\n"
- "fmla v15.4s, v28.4s, v12.4s\n"
- "ldr q29, [x9, x17]\n"
- "fmla v1.4s, v19.4s, v11.4s\n"
- "fmla v2.4s, v19.4s, v9.4s\n"
- "fmla v24.4s, v21.4s, v7.4s\n"
- "fmla v16.4s, v21.4s, v10.4s\n"
- "fmla v13.4s, v21.4s, v8.4s\n"
- "fmla v3.4s, v21.4s, v9.4s\n"
- "fmla v2.4s, v21.4s, v11.4s\n"
- "fmla v0.4s, v18.4s, v6.4s\n"
- "mov v18.16b, v14.16b\n"
- "fmla v20.4s, v22.4s, v5.4s\n"
- "fmla v13.4s, v27.4s, v10.4s\n"
- "fmla v3.4s, v27.4s, v11.4s\n"
- "mov v17.16b, v14.16b\n"
- "fmla v18.4s, v19.4s, v12.4s\n"
- "mov v19.16b, v14.16b\n"
- "fmla v0.4s, v22.4s, v8.4s\n"
- "fmla v17.4s, v21.4s, v12.4s\n"
- "ldr q26, [x8, x19]\n"
- "fmla v1.4s, v22.4s, v6.4s\n"
- "fmla v15.4s, v22.4s, v9.4s\n"
- "mov v22.16b, v14.16b\n"
- "mov v21.16b, v14.16b\n"
- "fmla v23.4s, v25.4s, v4.4s\n"
- "fmla v20.4s, v25.4s, v7.4s\n"
- "fmla v16.4s, v25.4s, v5.4s\n"
- "fmla v0.4s, v25.4s, v10.4s\n"
- "fmla v1.4s, v25.4s, v8.4s\n"
- "fmla v2.4s, v25.4s, v6.4s\n"
- "str q23, [x23]\n"
- "fmla v15.4s, v25.4s, v11.4s\n"
- "fmla v18.4s, v25.4s, v9.4s\n"
- "ldr q28, [%[inptr0], x21]\n"
- "fmla v19.4s, v25.4s, v12.4s\n"
- "ldr q30, [x12, %[input_col_stride1]]\n"
- "fmla v24.4s, v29.4s, v4.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v16.4s, v29.4s, v7.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v13.4s, v29.4s, v5.4s\n"
- "prfm pldl1keep, [%[inptr0], x28]\n"
- "str q24, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v1.4s, v29.4s, v10.4s\n"
- "fmla v2.4s, v29.4s, v8.4s\n"
- "ldr q27, [x11, x15]\n"
- "fmla v3.4s, v29.4s, v6.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v18.4s, v29.4s, v11.4s\n"
- "fmla v17.4s, v29.4s, v9.4s\n"
- "fmla v22.4s, v29.4s, v12.4s\n"
- "ldr q23, [x10, x17]\n"
- "fmla v13.4s, v26.4s, v7.4s\n"
- "fmla v2.4s, v26.4s, v10.4s\n"
- "fmla v3.4s, v26.4s, v8.4s\n"
- "fmla v17.4s, v26.4s, v11.4s\n"
- "fmla v0.4s, v30.4s, v5.4s\n"
- "ldr q24, [x9, x19]\n"
- "fmla v15.4s, v30.4s, v6.4s\n"
- "ldr q29, [x8, x21]\n"
- "fmla v3.4s, v28.4s, v10.4s\n"
- "ldr q14, [x12, x15]\n"
- "fmla v20.4s, v27.4s, v4.4s\n"
- "add x8, x8, #16\n"
- "fmla v0.4s, v27.4s, v7.4s\n"
- "prfm pldl1keep, [x8, #64]\n"
- "fmla v1.4s, v27.4s, v5.4s\n"
- "prfm pldl1keep, [x8, x28]\n"
- "str q20, [x24]\n"
- "fmla v15.4s, v27.4s, v8.4s\n"
- "fmla v18.4s, v27.4s, v6.4s\n"
- "ldr q25, [x11, x17]\n"
- "fmla v19.4s, v27.4s, v9.4s\n"
- "ldr q30, [x10, x19]\n"
- "fmla v16.4s, v23.4s, v4.4s\n"
- "fmla v1.4s, v23.4s, v7.4s\n"
- "fmla v2.4s, v23.4s, v5.4s\n"
- "fmla v15.4s, v23.4s, v10.4s\n"
- "fmla v18.4s, v23.4s, v8.4s\n"
- "fmla v17.4s, v23.4s, v6.4s\n"
- "str q16, [x23, %[output_col_stride1]]\n"
- "fmla v19.4s, v23.4s, v11.4s\n"
- "fmla v22.4s, v23.4s, v9.4s\n"
- "ldr q26, [x9, x21]\n"
- "fmla v21.4s, v23.4s, v12.4s\n"
- "ldr q27, [x12, x17]\n"
- "fmla v13.4s, v24.4s, v4.4s\n"
- "ldr q20, [x11, x19]\n"
- "fmla v2.4s, v24.4s, v7.4s\n"
- "add x9, x9, #16\n"
- "fmla v3.4s, v24.4s, v5.4s\n"
- "prfm pldl1keep, [x9, #64]\n"
- "str q13, [%[outptr0], x26]\n"
- "fmla v18.4s, v24.4s, v10.4s\n"
- "fmla v17.4s, v24.4s, v8.4s\n"
- "ldr q23, [x10, x21]\n"
- "fmla v22.4s, v24.4s, v11.4s\n"
- "ldr q24, [x12, x19]\n"
- "fmla v3.4s, v29.4s, v7.4s\n"
- "prfm pldl1keep, [x9, x28]\n"
- "fmla v17.4s, v29.4s, v10.4s\n"
- "ldr q16, [x11, x21]\n"
- "fmla v0.4s, v14.4s, v4.4s\n"
- "add x10, x10, #16\n"
- "fmla v15.4s, v14.4s, v5.4s\n"
- "prfm pldl1keep, [x10, #64]\n"
- "fmla v19.4s, v14.4s, v6.4s\n"
- "ldr q13, [x12, x21]\n"
- "str q0, [x25]\n"
- "fmla v1.4s, v25.4s, v4.4s\n"
- "fmla v15.4s, v25.4s, v7.4s\n"
- "ldr q14, [%[wbptr]]\n"
- "fmla v18.4s, v25.4s, v5.4s\n"
- "add x11, x11, #16\n"
- "str q1, [x24, %[output_col_stride1]]\n"
- "fmla v19.4s, v25.4s, v8.4s\n"
- "fmla v22.4s, v25.4s, v6.4s\n"
- "ldr q12, [%[wbptr], #16]\n"
- "fmla v21.4s, v25.4s, v9.4s\n"
- "ldr q29, [%[inptr0]]\n"
- "fmla v2.4s, v30.4s, v4.4s\n"
- "ldr q28, [x8]\n"
- "fmla v18.4s, v30.4s, v7.4s\n"
- "add x12, x12, #16\n"
- "fmla v17.4s, v30.4s, v5.4s\n"
- "fmla v19.4s, v30.4s, v10.4s\n"
- "str q2, [x23, x26]\n"
- "fmla v22.4s, v30.4s, v8.4s\n"
- "fmla v21.4s, v30.4s, v11.4s\n"
- "ldr q9, [%[wbptr], #64]\n"
- "fmla v3.4s, v26.4s, v4.4s\n"
- "ldr q30, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v17.4s, v26.4s, v7.4s\n"
- "ldr q25, [x9]\n"
- "fmla v22.4s, v26.4s, v10.4s\n"
- "ldr q11, [%[wbptr], #32]\n"
- "str q3, [%[outptr0], x27]\n"
- "fmla v15.4s, v27.4s, v4.4s\n"
- "fmla v19.4s, v27.4s, v5.4s\n"
- "ldr q26, [x8, %[input_col_stride1]]\n"
- "fmla v21.4s, v27.4s, v6.4s\n"
- "ldr q27, [%[inptr0], x15]\n"
- "str q15, [x25, %[output_col_stride1]]\n"
- "fmla v18.4s, v20.4s, v4.4s\n"
- "fmla v19.4s, v20.4s, v7.4s\n"
- "ldr q15, [x10]\n"
- "fmla v22.4s, v20.4s, v5.4s\n"
- "ldr q6, [%[wbptr], #112]\n"
- "str q18, [x24, x26]\n"
- "fmla v21.4s, v20.4s, v8.4s\n"
- "fmla v17.4s, v23.4s, v4.4s\n"
- "ldr q18, [x9, %[input_col_stride1]]\n"
- "fmla v22.4s, v23.4s, v7.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v21.4s, v23.4s, v10.4s\n"
- "ldr q8, [%[wbptr], #80]\n"
- "str q17, [x23, x27]\n"
- "fmla v19.4s, v24.4s, v4.4s\n"
- "fmla v22.4s, v16.4s, v4.4s\n"
- "add x23, x23, #16\n"
- "fmla v21.4s, v24.4s, v5.4s\n"
- "ldr q10, [%[wbptr], #48]\n"
- "str q19, [x25, x26]\n"
- "mov v17.16b, v14.16b\n"
- "str q22, [x24, x27]\n"
- "mov v23.16b, v14.16b\n"
- "fmla v21.4s, v16.4s, v7.4s\n"
- "ldr q5, [%[wbptr], #128]\n"
- "mov v24.16b, v14.16b\n"
- "add x24, x24, #16\n"
- "mov v20.16b, v14.16b\n"
- "mov v16.16b, v14.16b\n"
- "fmla v21.4s, v13.4s, v4.4s\n"
- "ldr q7, [%[wbptr], #96]\n"
- "mov v13.16b, v14.16b\n"
- "mov v0.16b, v14.16b\n"
- "mov v1.16b, v14.16b\n"
- "mov v2.16b, v14.16b\n"
- "str q21, [x25, x27]\n"
- "mov v3.16b, v14.16b\n"
- "ldr q4, [%[wbptr], #144]\n"
- "add x25, x25, #16\n"
- "fmla v17.4s, v29.4s, v12.4s\n"
- "bne 2b\n"
- "3:\n"
- "fmla v17.4s, v28.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x16]\n"
- "fmla v23.4s, v28.4s, v12.4s\n"
- "ldr q22, [x8, x15]\n"
- "fmla v24.4s, v30.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], x7]\n"
- "fmla v17.4s, v30.4s, v11.4s\n"
- "ldr q29, [%[inptr0], x17]\n"
- "fmla v23.4s, v25.4s, v9.4s\n"
- "prfm pldl1keep, [x11, #64]\n"
- "fmla v20.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x10, x28]\n"
- "fmla v17.4s, v25.4s, v6.4s\n"
- "ldr q25, [x11]\n"
- "fmla v23.4s, v26.4s, v11.4s\n"
- "prfm pldl1keep, [x9, x16]\n"
- "fmla v24.4s, v26.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x7]\n"
- "fmla v17.4s, v26.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x20]\n"
- "fmla v16.4s, v26.4s, v12.4s\n"
- "ldr q28, [x10, %[input_col_stride1]]\n"
- "fmla v24.4s, v27.4s, v11.4s\n"
- "prfm pldl1keep, [x12, #64]\n"
- "fmla v17.4s, v27.4s, v10.4s\n"
- "prfm pldl1keep, [x11, x28]\n"
- "fmla v13.4s, v27.4s, v12.4s\n"
- "ldr q19, [x9, x15]\n"
- "fmla v23.4s, v15.4s, v6.4s\n"
- "prfm pldl1keep, [x10, x16]\n"
- "fmla v20.4s, v15.4s, v9.4s\n"
- "prfm pldl1keep, [x9, x7]\n"
- "fmla v0.4s, v15.4s, v12.4s\n"
- "ldr q21, [x8, x17]\n"
- "fmla v17.4s, v18.4s, v5.4s\n"
- "prfm pldl1keep, [x8, x20]\n"
- "fmla v23.4s, v18.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x22]\n"
- "fmla v24.4s, v18.4s, v6.4s\n"
- "prfm pldl1keep, [x12, x28]\n"
- "fmla v20.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x16]\n"
- "fmla v16.4s, v18.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x7]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "ldr q27, [%[inptr0], x19]\n"
- "fmla v17.4s, v22.4s, v7.4s\n"
- "prfm pldl1keep, [x9, x20]\n"
- "fmla v23.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [x8, x22]\n"
- "fmla v24.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [x12, x16]\n"
- "fmla v16.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x7]\n"
- "fmla v13.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x20]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "ldr q18, [x12]\n"
- "fmla v24.4s, v29.4s, v10.4s\n"
- "prfm pldl1keep, [x9, x22]\n"
- "fmla v13.4s, v29.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x7]\n"
- "fmla v3.4s, v29.4s, v12.4s\n"
- "ldr q22, [x11, %[input_col_stride1]]\n"
- "fmla v20.4s, v25.4s, v6.4s\n"
- "prfm pldl1keep, [x11, x20]\n"
- "fmla v0.4s, v25.4s, v9.4s\n"
- "ldr q25, [x10, x15]\n"
- "fmla v23.4s, v28.4s, v5.4s\n"
- "prfm pldl1keep, [x10, x22]\n"
- "fmla v20.4s, v28.4s, v8.4s\n"
- "prfm pldl1keep, [x12, x20]\n"
- "fmla v16.4s, v28.4s, v6.4s\n"
- "prfm pldl1keep, [x11, x22]\n"
- "fmla v0.4s, v28.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x22]\n"
- "fmla v1.4s, v28.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v17.4s, v19.4s, v4.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v23.4s, v19.4s, v7.4s\n"
- "fmla v24.4s, v19.4s, v5.4s\n"
- "fmla v20.4s, v19.4s, v10.4s\n"
- "fmla v16.4s, v19.4s, v8.4s\n"
- "str q17, [%[outptr0]]\n"
- "mov v15.16b, v14.16b\n"
- "fmla v13.4s, v19.4s, v6.4s\n"
- "fmla v1.4s, v19.4s, v11.4s\n"
- "fmla v15.4s, v28.4s, v12.4s\n"
- "ldr q29, [x9, x17]\n"
- "fmla v2.4s, v19.4s, v9.4s\n"
- "fmla v24.4s, v21.4s, v7.4s\n"
- "fmla v16.4s, v21.4s, v10.4s\n"
- "fmla v13.4s, v21.4s, v8.4s\n"
- "fmla v3.4s, v21.4s, v9.4s\n"
- "fmla v0.4s, v18.4s, v6.4s\n"
- "mov v18.16b, v14.16b\n"
- "fmla v2.4s, v21.4s, v11.4s\n"
- "fmla v13.4s, v27.4s, v10.4s\n"
- "fmla v20.4s, v22.4s, v5.4s\n"
- "fmla v18.4s, v19.4s, v12.4s\n"
- "ldr q26, [x8, x19]\n"
- "fmla v3.4s, v27.4s, v11.4s\n"
- "ldr q28, [%[inptr0], x21]\n"
- "fmla v0.4s, v22.4s, v8.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v1.4s, v22.4s, v6.4s\n"
- "fmla v15.4s, v22.4s, v9.4s\n"
- "mov v17.16b, v14.16b\n"
- "fmla v23.4s, v25.4s, v4.4s\n"
- "fmla v20.4s, v25.4s, v7.4s\n"
- "fmla v16.4s, v25.4s, v5.4s\n"
- "fmla v17.4s, v21.4s, v12.4s\n"
- "ldr q30, [x12, %[input_col_stride1]]\n"
- "str q23, [x23]\n"
- "mov v19.16b, v14.16b\n"
- "fmla v0.4s, v25.4s, v10.4s\n"
- "fmla v1.4s, v25.4s, v8.4s\n"
- "fmla v2.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v25.4s, v11.4s\n"
- "fmla v18.4s, v25.4s, v9.4s\n"
- "fmla v19.4s, v25.4s, v12.4s\n"
- "mov v22.16b, v14.16b\n"
- "mov v21.16b, v14.16b\n"
- "fmla v24.4s, v29.4s, v4.4s\n"
- "fmla v16.4s, v29.4s, v7.4s\n"
- "fmla v13.4s, v29.4s, v5.4s\n"
- "fmla v1.4s, v29.4s, v10.4s\n"
- "fmla v2.4s, v29.4s, v8.4s\n"
- "fmla v3.4s, v29.4s, v6.4s\n"
- "str q24, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v18.4s, v29.4s, v11.4s\n"
- "fmla v17.4s, v29.4s, v9.4s\n"
- "ldr q27, [x11, x15]\n"
- "fmla v22.4s, v29.4s, v12.4s\n"
- "ldr q23, [x10, x17]\n"
- "fmla v13.4s, v26.4s, v7.4s\n"
- "fmla v2.4s, v26.4s, v10.4s\n"
- "fmla v3.4s, v26.4s, v8.4s\n"
- "fmla v17.4s, v26.4s, v11.4s\n"
- "fmla v0.4s, v30.4s, v5.4s\n"
- "ldr q24, [x9, x19]\n"
- "fmla v15.4s, v30.4s, v6.4s\n"
- "ldr q29, [x8, x21]\n"
- "fmla v3.4s, v28.4s, v10.4s\n"
- "ldr q14, [x12, x15]\n"
- "fmla v20.4s, v27.4s, v4.4s\n"
- "add x8, x8, #16\n"
- "fmla v0.4s, v27.4s, v7.4s\n"
- "fmla v1.4s, v27.4s, v5.4s\n"
- "fmla v15.4s, v27.4s, v8.4s\n"
- "fmla v18.4s, v27.4s, v6.4s\n"
- "str q20, [x24]\n"
- "fmla v19.4s, v27.4s, v9.4s\n"
- "fmla v16.4s, v23.4s, v4.4s\n"
- "ldr q25, [x11, x17]\n"
- "fmla v1.4s, v23.4s, v7.4s\n"
- "ldr q30, [x10, x19]\n"
- "fmla v2.4s, v23.4s, v5.4s\n"
- "fmla v15.4s, v23.4s, v10.4s\n"
- "str q16, [x23, %[output_col_stride1]]\n"
- "fmla v18.4s, v23.4s, v8.4s\n"
- "fmla v17.4s, v23.4s, v6.4s\n"
- "ldr q26, [x9, x21]\n"
- "fmla v19.4s, v23.4s, v11.4s\n"
- "add x9, x9, #16\n"
- "fmla v22.4s, v23.4s, v9.4s\n"
- "fmla v21.4s, v23.4s, v12.4s\n"
- "fmla v13.4s, v24.4s, v4.4s\n"
- "ldr q27, [x12, x17]\n"
- "fmla v2.4s, v24.4s, v7.4s\n"
- "ldr q20, [x11, x19]\n"
- "fmla v3.4s, v24.4s, v5.4s\n"
- "fmla v18.4s, v24.4s, v10.4s\n"
- "str q13, [%[outptr0], x26]\n"
- "fmla v17.4s, v24.4s, v8.4s\n"
- "fmla v22.4s, v24.4s, v11.4s\n"
- "ldr q23, [x10, x21]\n"
- "fmla v3.4s, v29.4s, v7.4s\n"
- "ldr q24, [x12, x19]\n"
- "fmla v17.4s, v29.4s, v10.4s\n"
- "ldr q16, [x11, x21]\n"
- "fmla v0.4s, v14.4s, v4.4s\n"
- "add x10, x10, #16\n"
- "fmla v15.4s, v14.4s, v5.4s\n"
- "add x11, x11, #16\n"
- "fmla v19.4s, v14.4s, v6.4s\n"
- "ldr q13, [x12, x21]\n"
- "str q0, [x25]\n"
- "fmla v1.4s, v25.4s, v4.4s\n"
- "fmla v15.4s, v25.4s, v7.4s\n"
- "add x12, x12, #16\n"
- "fmla v18.4s, v25.4s, v5.4s\n"
- "fmla v19.4s, v25.4s, v8.4s\n"
- "str q1, [x24, %[output_col_stride1]]\n"
- "fmla v22.4s, v25.4s, v6.4s\n"
- "fmla v21.4s, v25.4s, v9.4s\n"
- "fmla v2.4s, v30.4s, v4.4s\n"
- "fmla v18.4s, v30.4s, v7.4s\n"
- "fmla v17.4s, v30.4s, v5.4s\n"
- "fmla v19.4s, v30.4s, v10.4s\n"
- "fmla v22.4s, v30.4s, v8.4s\n"
- "str q2, [x23, x26]\n"
- "fmla v21.4s, v30.4s, v11.4s\n"
- "fmla v3.4s, v26.4s, v4.4s\n"
- "fmla v17.4s, v26.4s, v7.4s\n"
- "fmla v22.4s, v26.4s, v10.4s\n"
- "fmla v15.4s, v27.4s, v4.4s\n"
- "fmla v19.4s, v27.4s, v5.4s\n"
- "fmla v21.4s, v27.4s, v6.4s\n"
- "str q3, [%[outptr0], x27]\n"
- "fmla v18.4s, v20.4s, v4.4s\n"
- "str q15, [x25, %[output_col_stride1]]\n"
- "fmla v22.4s, v20.4s, v5.4s\n"
- "fmla v19.4s, v20.4s, v7.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "str q18, [x24, x26]\n"
- "fmla v21.4s, v20.4s, v8.4s\n"
- "fmla v17.4s, v23.4s, v4.4s\n"
- "fmla v22.4s, v23.4s, v7.4s\n"
- "fmla v19.4s, v24.4s, v4.4s\n"
- "fmla v21.4s, v23.4s, v10.4s\n"
- "str q17, [x23, x27]\n"
- "fmla v22.4s, v16.4s, v4.4s\n"
- "str q19, [x25, x26]\n"
- "add x23, x23, #16\n"
- "fmla v21.4s, v24.4s, v5.4s\n"
- "str q22, [x24, x27]\n"
- "add x24, x24, #16\n"
- "fmla v21.4s, v16.4s, v7.4s\n"
- "fmla v21.4s, v13.4s, v4.4s\n"
- "str q21, [x25, x27]\n"
- "add x25, x25, #16\n"
- "4:\n"
- "cbz x13, 7f\n"
- "ldr s14, [%[wbptr]]\n"
- "mov v17.16b, v14.16b\n"
- "ldr s12, [%[wbptr], #4]\n"
- "mov v23.16b, v14.16b\n"
- "ldr s11, [%[wbptr], #8]\n"
- "mov v24.16b, v14.16b\n"
- "ldr s10, [%[wbptr], #12]\n"
- "mov v20.16b, v14.16b\n"
- "ldr s9, [%[wbptr], #16]\n"
- "mov v16.16b, v14.16b\n"
- "ldr s8, [%[wbptr], #20]\n"
- "mov v13.16b, v14.16b\n"
- "ldr s7, [%[wbptr], #24]\n"
- "mov v0.16b, v14.16b\n"
- "ldr s6, [%[wbptr], #28]\n"
- "mov v1.16b, v14.16b\n"
- "ldr s5, [%[wbptr], #32]\n"
- "mov v2.16b, v14.16b\n"
- "ldr s4, [%[wbptr], #36]\n"
- "mov v3.16b, v14.16b\n"
- "ldr s29, [%[inptr0]]\n"
- "fmla v17.4s, v29.4s, v12.4s\n"
- "ldr s28, [x8]\n"
- "ldr s30, [%[inptr0], %[input_col_stride1]]\n"
- "subs x13, x13, #1\n"
- "ldr s25, [x9]\n"
- "ldr s26, [x8, %[input_col_stride1]]\n"
- "ldr s27, [%[inptr0], x15]\n"
- "ldr s15, [x10]\n"
- "ldr s18, [x9, %[input_col_stride1]]\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x8, #64]\n"
- "prfm pldl1keep, [%[inptr0], x28]\n"
- "prfm pldl1keep, [x9, #64]\n"
- "prfm pldl1keep, [x8, x28]\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "prfm pldl1keep, [x10, #64]\n"
- "prfm pldl1keep, [x9, x28]\n"
- "beq 6f\n"
- "5:\n"
- "fmla v17.4s, v28.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x16]\n"
- "fmla v23.4s, v28.4s, v12.4s\n"
- "ldr s22, [x8, x15]\n"
- "fmla v24.4s, v30.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], x7]\n"
- "fmla v17.4s, v30.4s, v11.4s\n"
- "ldr s29, [%[inptr0], x17]\n"
- "fmla v23.4s, v25.4s, v9.4s\n"
- "prfm pldl1keep, [x11, #64]\n"
- "fmla v20.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x10, x28]\n"
- "fmla v17.4s, v25.4s, v6.4s\n"
- "ldr s25, [x11]\n"
- "fmla v23.4s, v26.4s, v11.4s\n"
- "prfm pldl1keep, [x9, x16]\n"
- "fmla v24.4s, v26.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x7]\n"
- "fmla v17.4s, v26.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x20]\n"
- "fmla v16.4s, v26.4s, v12.4s\n"
- "ldr s28, [x10, %[input_col_stride1]]\n"
- "fmla v24.4s, v27.4s, v11.4s\n"
- "prfm pldl1keep, [x12, #64]\n"
- "fmla v17.4s, v27.4s, v10.4s\n"
- "prfm pldl1keep, [x11, x28]\n"
- "fmla v13.4s, v27.4s, v12.4s\n"
- "ldr s19, [x9, x15]\n"
- "fmla v23.4s, v15.4s, v6.4s\n"
- "prfm pldl1keep, [x10, x16]\n"
- "fmla v20.4s, v15.4s, v9.4s\n"
- "prfm pldl1keep, [x9, x7]\n"
- "fmla v0.4s, v15.4s, v12.4s\n"
- "ldr s21, [x8, x17]\n"
- "fmla v17.4s, v18.4s, v5.4s\n"
- "prfm pldl1keep, [x8, x20]\n"
- "fmla v23.4s, v18.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x22]\n"
- "fmla v24.4s, v18.4s, v6.4s\n"
- "prfm pldl1keep, [x12, x28]\n"
- "fmla v20.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x16]\n"
- "fmla v16.4s, v18.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x7]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "ldr s27, [%[inptr0], x19]\n"
- "fmla v17.4s, v22.4s, v7.4s\n"
- "prfm pldl1keep, [x9, x20]\n"
- "fmla v23.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [x8, x22]\n"
- "fmla v24.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [x12, x16]\n"
- "fmla v16.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x7]\n"
- "fmla v13.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x20]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "ldr s18, [x12]\n"
- "fmla v24.4s, v29.4s, v10.4s\n"
- "prfm pldl1keep, [x9, x22]\n"
- "fmla v13.4s, v29.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x7]\n"
- "fmla v3.4s, v29.4s, v12.4s\n"
- "ldr s22, [x11, %[input_col_stride1]]\n"
- "fmla v20.4s, v25.4s, v6.4s\n"
- "prfm pldl1keep, [x11, x20]\n"
- "fmla v0.4s, v25.4s, v9.4s\n"
- "ldr s25, [x10, x15]\n"
- "fmla v23.4s, v28.4s, v5.4s\n"
- "prfm pldl1keep, [x10, x22]\n"
- "fmla v20.4s, v28.4s, v8.4s\n"
- "prfm pldl1keep, [x12, x20]\n"
- "fmla v16.4s, v28.4s, v6.4s\n"
- "prfm pldl1keep, [x11, x22]\n"
- "fmla v0.4s, v28.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x22]\n"
- "fmla v1.4s, v28.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v17.4s, v19.4s, v4.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v23.4s, v19.4s, v7.4s\n"
- "subs x13, x13, #1\n"
- "fmla v24.4s, v19.4s, v5.4s\n"
- "fmla v20.4s, v19.4s, v10.4s\n"
- "str s17, [%[outptr0]]\n"
- "mov v15.16b, v14.16b\n"
- "fmla v16.4s, v19.4s, v8.4s\n"
- "fmla v13.4s, v19.4s, v6.4s\n"
- "fmla v15.4s, v28.4s, v12.4s\n"
- "ldr s29, [x9, x17]\n"
- "fmla v1.4s, v19.4s, v11.4s\n"
- "fmla v2.4s, v19.4s, v9.4s\n"
- "fmla v24.4s, v21.4s, v7.4s\n"
- "fmla v16.4s, v21.4s, v10.4s\n"
- "fmla v13.4s, v21.4s, v8.4s\n"
- "fmla v3.4s, v21.4s, v9.4s\n"
- "fmla v2.4s, v21.4s, v11.4s\n"
- "fmla v0.4s, v18.4s, v6.4s\n"
- "mov v18.16b, v14.16b\n"
- "fmla v20.4s, v22.4s, v5.4s\n"
- "fmla v13.4s, v27.4s, v10.4s\n"
- "fmla v3.4s, v27.4s, v11.4s\n"
- "mov v17.16b, v14.16b\n"
- "fmla v18.4s, v19.4s, v12.4s\n"
- "mov v19.16b, v14.16b\n"
- "fmla v0.4s, v22.4s, v8.4s\n"
- "fmla v17.4s, v21.4s, v12.4s\n"
- "ldr s26, [x8, x19]\n"
- "fmla v1.4s, v22.4s, v6.4s\n"
- "fmla v15.4s, v22.4s, v9.4s\n"
- "mov v22.16b, v14.16b\n"
- "mov v21.16b, v14.16b\n"
- "fmla v23.4s, v25.4s, v4.4s\n"
- "fmla v20.4s, v25.4s, v7.4s\n"
- "fmla v16.4s, v25.4s, v5.4s\n"
- "fmla v0.4s, v25.4s, v10.4s\n"
- "fmla v1.4s, v25.4s, v8.4s\n"
- "fmla v2.4s, v25.4s, v6.4s\n"
- "str s23, [x23]\n"
- "fmla v15.4s, v25.4s, v11.4s\n"
- "fmla v18.4s, v25.4s, v9.4s\n"
- "ldr s28, [%[inptr0], x21]\n"
- "fmla v19.4s, v25.4s, v12.4s\n"
- "ldr s30, [x12, %[input_col_stride1]]\n"
- "fmla v24.4s, v29.4s, v4.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v16.4s, v29.4s, v7.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v13.4s, v29.4s, v5.4s\n"
- "prfm pldl1keep, [%[inptr0], x28]\n"
- "str s24, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v1.4s, v29.4s, v10.4s\n"
- "fmla v2.4s, v29.4s, v8.4s\n"
- "ldr s27, [x11, x15]\n"
- "fmla v3.4s, v29.4s, v6.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v18.4s, v29.4s, v11.4s\n"
- "fmla v17.4s, v29.4s, v9.4s\n"
- "fmla v22.4s, v29.4s, v12.4s\n"
- "ldr s23, [x10, x17]\n"
- "fmla v13.4s, v26.4s, v7.4s\n"
- "fmla v2.4s, v26.4s, v10.4s\n"
- "fmla v3.4s, v26.4s, v8.4s\n"
- "fmla v17.4s, v26.4s, v11.4s\n"
- "fmla v0.4s, v30.4s, v5.4s\n"
- "ldr s24, [x9, x19]\n"
- "fmla v15.4s, v30.4s, v6.4s\n"
- "ldr s29, [x8, x21]\n"
- "fmla v3.4s, v28.4s, v10.4s\n"
- "ldr s14, [x12, x15]\n"
- "fmla v20.4s, v27.4s, v4.4s\n"
- "add x8, x8, #4\n"
- "fmla v0.4s, v27.4s, v7.4s\n"
- "prfm pldl1keep, [x8, #64]\n"
- "fmla v1.4s, v27.4s, v5.4s\n"
- "prfm pldl1keep, [x8, x28]\n"
- "str s20, [x24]\n"
- "fmla v15.4s, v27.4s, v8.4s\n"
- "fmla v18.4s, v27.4s, v6.4s\n"
- "ldr s25, [x11, x17]\n"
- "fmla v19.4s, v27.4s, v9.4s\n"
- "ldr s30, [x10, x19]\n"
- "fmla v16.4s, v23.4s, v4.4s\n"
- "fmla v1.4s, v23.4s, v7.4s\n"
- "fmla v2.4s, v23.4s, v5.4s\n"
- "fmla v15.4s, v23.4s, v10.4s\n"
- "fmla v18.4s, v23.4s, v8.4s\n"
- "fmla v17.4s, v23.4s, v6.4s\n"
- "str s16, [x23, %[output_col_stride1]]\n"
- "fmla v19.4s, v23.4s, v11.4s\n"
- "fmla v22.4s, v23.4s, v9.4s\n"
- "ldr s26, [x9, x21]\n"
- "fmla v21.4s, v23.4s, v12.4s\n"
- "ldr s27, [x12, x17]\n"
- "fmla v13.4s, v24.4s, v4.4s\n"
- "ldr s20, [x11, x19]\n"
- "fmla v2.4s, v24.4s, v7.4s\n"
- "add x9, x9, #4\n"
- "fmla v3.4s, v24.4s, v5.4s\n"
- "prfm pldl1keep, [x9, #64]\n"
- "str s13, [%[outptr0], x26]\n"
- "fmla v18.4s, v24.4s, v10.4s\n"
- "fmla v17.4s, v24.4s, v8.4s\n"
- "ldr s23, [x10, x21]\n"
- "fmla v22.4s, v24.4s, v11.4s\n"
- "ldr s24, [x12, x19]\n"
- "fmla v3.4s, v29.4s, v7.4s\n"
- "prfm pldl1keep, [x9, x28]\n"
- "fmla v17.4s, v29.4s, v10.4s\n"
- "ldr s16, [x11, x21]\n"
- "fmla v0.4s, v14.4s, v4.4s\n"
- "add x10, x10, #4\n"
- "fmla v15.4s, v14.4s, v5.4s\n"
- "prfm pldl1keep, [x10, #64]\n"
- "fmla v19.4s, v14.4s, v6.4s\n"
- "ldr s13, [x12, x21]\n"
- "str s0, [x25]\n"
- "fmla v1.4s, v25.4s, v4.4s\n"
- "fmla v15.4s, v25.4s, v7.4s\n"
- "ldr s14, [%[wbptr]]\n"
- "fmla v18.4s, v25.4s, v5.4s\n"
- "add x11, x11, #4\n"
- "str s1, [x24, %[output_col_stride1]]\n"
- "fmla v19.4s, v25.4s, v8.4s\n"
- "fmla v22.4s, v25.4s, v6.4s\n"
- "ldr s12, [%[wbptr], #4]\n"
- "fmla v21.4s, v25.4s, v9.4s\n"
- "ldr s29, [%[inptr0]]\n"
- "fmla v2.4s, v30.4s, v4.4s\n"
- "ldr s28, [x8]\n"
- "fmla v18.4s, v30.4s, v7.4s\n"
- "add x12, x12, #4\n"
- "fmla v17.4s, v30.4s, v5.4s\n"
- "fmla v19.4s, v30.4s, v10.4s\n"
- "str s2, [x23, x26]\n"
- "fmla v22.4s, v30.4s, v8.4s\n"
- "fmla v21.4s, v30.4s, v11.4s\n"
- "ldr s9, [%[wbptr], #16]\n"
- "fmla v3.4s, v26.4s, v4.4s\n"
- "ldr s30, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v17.4s, v26.4s, v7.4s\n"
- "ldr s25, [x9]\n"
- "fmla v22.4s, v26.4s, v10.4s\n"
- "ldr s11, [%[wbptr], #8]\n"
- "str s3, [%[outptr0], x27]\n"
- "fmla v15.4s, v27.4s, v4.4s\n"
- "fmla v19.4s, v27.4s, v5.4s\n"
- "ldr s26, [x8, %[input_col_stride1]]\n"
- "fmla v21.4s, v27.4s, v6.4s\n"
- "ldr s27, [%[inptr0], x15]\n"
- "str s15, [x25, %[output_col_stride1]]\n"
- "fmla v18.4s, v20.4s, v4.4s\n"
- "fmla v19.4s, v20.4s, v7.4s\n"
- "ldr s15, [x10]\n"
- "fmla v22.4s, v20.4s, v5.4s\n"
- "ldr s6, [%[wbptr], #28]\n"
- "str s18, [x24, x26]\n"
- "fmla v21.4s, v20.4s, v8.4s\n"
- "fmla v17.4s, v23.4s, v4.4s\n"
- "ldr s18, [x9, %[input_col_stride1]]\n"
- "fmla v22.4s, v23.4s, v7.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v21.4s, v23.4s, v10.4s\n"
- "ldr s8, [%[wbptr], #20]\n"
- "str s17, [x23, x27]\n"
- "fmla v19.4s, v24.4s, v4.4s\n"
- "fmla v22.4s, v16.4s, v4.4s\n"
- "add x23, x23, #4\n"
- "fmla v21.4s, v24.4s, v5.4s\n"
- "ldr s10, [%[wbptr], #12]\n"
- "str s19, [x25, x26]\n"
- "mov v17.16b, v14.16b\n"
- "str s22, [x24, x27]\n"
- "mov v23.16b, v14.16b\n"
- "fmla v21.4s, v16.4s, v7.4s\n"
- "ldr s5, [%[wbptr], #32]\n"
- "mov v24.16b, v14.16b\n"
- "add x24, x24, #4\n"
- "mov v20.16b, v14.16b\n"
- "mov v16.16b, v14.16b\n"
- "fmla v21.4s, v13.4s, v4.4s\n"
- "ldr s7, [%[wbptr], #24]\n"
- "mov v13.16b, v14.16b\n"
- "mov v0.16b, v14.16b\n"
- "mov v1.16b, v14.16b\n"
- "mov v2.16b, v14.16b\n"
- "str s21, [x25, x27]\n"
- "mov v3.16b, v14.16b\n"
- "ldr s4, [%[wbptr], #36]\n"
- "add x25, x25, #4\n"
- "fmla v17.4s, v29.4s, v12.4s\n"
- "bne 5b\n"
- "6:\n"
- "fmla v17.4s, v28.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x16]\n"
- "fmla v23.4s, v28.4s, v12.4s\n"
- "ldr s22, [x8, x15]\n"
- "fmla v24.4s, v30.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], x7]\n"
- "fmla v17.4s, v30.4s, v11.4s\n"
- "ldr s29, [%[inptr0], x17]\n"
- "fmla v23.4s, v25.4s, v9.4s\n"
- "prfm pldl1keep, [x11, #64]\n"
- "fmla v20.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x10, x28]\n"
- "fmla v17.4s, v25.4s, v6.4s\n"
- "ldr s25, [x11]\n"
- "fmla v23.4s, v26.4s, v11.4s\n"
- "prfm pldl1keep, [x9, x16]\n"
- "fmla v24.4s, v26.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x7]\n"
- "fmla v17.4s, v26.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x20]\n"
- "fmla v16.4s, v26.4s, v12.4s\n"
- "ldr s28, [x10, %[input_col_stride1]]\n"
- "fmla v24.4s, v27.4s, v11.4s\n"
- "prfm pldl1keep, [x12, #64]\n"
- "fmla v17.4s, v27.4s, v10.4s\n"
- "prfm pldl1keep, [x11, x28]\n"
- "fmla v13.4s, v27.4s, v12.4s\n"
- "ldr s19, [x9, x15]\n"
- "fmla v23.4s, v15.4s, v6.4s\n"
- "prfm pldl1keep, [x10, x16]\n"
- "fmla v20.4s, v15.4s, v9.4s\n"
- "prfm pldl1keep, [x9, x7]\n"
- "fmla v0.4s, v15.4s, v12.4s\n"
- "ldr s21, [x8, x17]\n"
- "fmla v17.4s, v18.4s, v5.4s\n"
- "prfm pldl1keep, [x8, x20]\n"
- "fmla v23.4s, v18.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x22]\n"
- "fmla v24.4s, v18.4s, v6.4s\n"
- "prfm pldl1keep, [x12, x28]\n"
- "fmla v20.4s, v18.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x16]\n"
- "fmla v16.4s, v18.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x7]\n"
- "fmla v1.4s, v18.4s, v12.4s\n"
- "ldr s27, [%[inptr0], x19]\n"
- "fmla v17.4s, v22.4s, v7.4s\n"
- "prfm pldl1keep, [x9, x20]\n"
- "fmla v23.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [x8, x22]\n"
- "fmla v24.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [x12, x16]\n"
- "fmla v16.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x7]\n"
- "fmla v13.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x20]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "ldr s18, [x12]\n"
- "fmla v24.4s, v29.4s, v10.4s\n"
- "prfm pldl1keep, [x9, x22]\n"
- "fmla v13.4s, v29.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x7]\n"
- "fmla v3.4s, v29.4s, v12.4s\n"
- "ldr s22, [x11, %[input_col_stride1]]\n"
- "fmla v20.4s, v25.4s, v6.4s\n"
- "prfm pldl1keep, [x11, x20]\n"
- "fmla v0.4s, v25.4s, v9.4s\n"
- "ldr s25, [x10, x15]\n"
- "fmla v23.4s, v28.4s, v5.4s\n"
- "prfm pldl1keep, [x10, x22]\n"
- "fmla v20.4s, v28.4s, v8.4s\n"
- "prfm pldl1keep, [x12, x20]\n"
- "fmla v16.4s, v28.4s, v6.4s\n"
- "prfm pldl1keep, [x11, x22]\n"
- "fmla v0.4s, v28.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x22]\n"
- "fmla v1.4s, v28.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v17.4s, v19.4s, v4.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v23.4s, v19.4s, v7.4s\n"
- "fmla v24.4s, v19.4s, v5.4s\n"
- "fmla v20.4s, v19.4s, v10.4s\n"
- "fmla v16.4s, v19.4s, v8.4s\n"
- "str s17, [%[outptr0]]\n"
- "mov v15.16b, v14.16b\n"
- "fmla v13.4s, v19.4s, v6.4s\n"
- "fmla v1.4s, v19.4s, v11.4s\n"
- "fmla v15.4s, v28.4s, v12.4s\n"
- "ldr s29, [x9, x17]\n"
- "fmla v2.4s, v19.4s, v9.4s\n"
- "fmla v24.4s, v21.4s, v7.4s\n"
- "fmla v16.4s, v21.4s, v10.4s\n"
- "fmla v13.4s, v21.4s, v8.4s\n"
- "fmla v3.4s, v21.4s, v9.4s\n"
- "fmla v0.4s, v18.4s, v6.4s\n"
- "mov v18.16b, v14.16b\n"
- "fmla v2.4s, v21.4s, v11.4s\n"
- "fmla v13.4s, v27.4s, v10.4s\n"
- "fmla v20.4s, v22.4s, v5.4s\n"
- "fmla v18.4s, v19.4s, v12.4s\n"
- "ldr s26, [x8, x19]\n"
- "fmla v3.4s, v27.4s, v11.4s\n"
- "ldr s28, [%[inptr0], x21]\n"
- "fmla v0.4s, v22.4s, v8.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v1.4s, v22.4s, v6.4s\n"
- "fmla v15.4s, v22.4s, v9.4s\n"
- "mov v17.16b, v14.16b\n"
- "fmla v23.4s, v25.4s, v4.4s\n"
- "fmla v20.4s, v25.4s, v7.4s\n"
- "fmla v16.4s, v25.4s, v5.4s\n"
- "fmla v17.4s, v21.4s, v12.4s\n"
- "ldr s30, [x12, %[input_col_stride1]]\n"
- "str s23, [x23]\n"
- "mov v19.16b, v14.16b\n"
- "fmla v0.4s, v25.4s, v10.4s\n"
- "fmla v1.4s, v25.4s, v8.4s\n"
- "fmla v2.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v25.4s, v11.4s\n"
- "fmla v18.4s, v25.4s, v9.4s\n"
- "fmla v19.4s, v25.4s, v12.4s\n"
- "mov v22.16b, v14.16b\n"
- "mov v21.16b, v14.16b\n"
- "fmla v24.4s, v29.4s, v4.4s\n"
- "fmla v16.4s, v29.4s, v7.4s\n"
- "fmla v13.4s, v29.4s, v5.4s\n"
- "fmla v1.4s, v29.4s, v10.4s\n"
- "fmla v2.4s, v29.4s, v8.4s\n"
- "fmla v3.4s, v29.4s, v6.4s\n"
- "str s24, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v18.4s, v29.4s, v11.4s\n"
- "fmla v17.4s, v29.4s, v9.4s\n"
- "ldr s27, [x11, x15]\n"
- "fmla v22.4s, v29.4s, v12.4s\n"
- "ldr s23, [x10, x17]\n"
- "fmla v13.4s, v26.4s, v7.4s\n"
- "fmla v2.4s, v26.4s, v10.4s\n"
- "fmla v3.4s, v26.4s, v8.4s\n"
- "fmla v17.4s, v26.4s, v11.4s\n"
- "fmla v0.4s, v30.4s, v5.4s\n"
- "ldr s24, [x9, x19]\n"
- "fmla v15.4s, v30.4s, v6.4s\n"
- "ldr s29, [x8, x21]\n"
- "fmla v3.4s, v28.4s, v10.4s\n"
- "ldr s14, [x12, x15]\n"
- "fmla v20.4s, v27.4s, v4.4s\n"
- "add x8, x8, #4\n"
- "fmla v0.4s, v27.4s, v7.4s\n"
- "fmla v1.4s, v27.4s, v5.4s\n"
- "fmla v15.4s, v27.4s, v8.4s\n"
- "fmla v18.4s, v27.4s, v6.4s\n"
- "str s20, [x24]\n"
- "fmla v19.4s, v27.4s, v9.4s\n"
- "fmla v16.4s, v23.4s, v4.4s\n"
- "ldr s25, [x11, x17]\n"
- "fmla v1.4s, v23.4s, v7.4s\n"
- "ldr s30, [x10, x19]\n"
- "fmla v2.4s, v23.4s, v5.4s\n"
- "fmla v15.4s, v23.4s, v10.4s\n"
- "str s16, [x23, %[output_col_stride1]]\n"
- "fmla v18.4s, v23.4s, v8.4s\n"
- "fmla v17.4s, v23.4s, v6.4s\n"
- "ldr s26, [x9, x21]\n"
- "fmla v19.4s, v23.4s, v11.4s\n"
- "add x9, x9, #4\n"
- "fmla v22.4s, v23.4s, v9.4s\n"
- "fmla v21.4s, v23.4s, v12.4s\n"
- "fmla v13.4s, v24.4s, v4.4s\n"
- "ldr s27, [x12, x17]\n"
- "fmla v2.4s, v24.4s, v7.4s\n"
- "ldr s20, [x11, x19]\n"
- "fmla v3.4s, v24.4s, v5.4s\n"
- "fmla v18.4s, v24.4s, v10.4s\n"
- "str s13, [%[outptr0], x26]\n"
- "fmla v17.4s, v24.4s, v8.4s\n"
- "fmla v22.4s, v24.4s, v11.4s\n"
- "ldr s23, [x10, x21]\n"
- "fmla v3.4s, v29.4s, v7.4s\n"
- "ldr s24, [x12, x19]\n"
- "fmla v17.4s, v29.4s, v10.4s\n"
- "ldr s16, [x11, x21]\n"
- "fmla v0.4s, v14.4s, v4.4s\n"
- "add x10, x10, #4\n"
- "fmla v15.4s, v14.4s, v5.4s\n"
- "add x11, x11, #4\n"
- "fmla v19.4s, v14.4s, v6.4s\n"
- "ldr s13, [x12, x21]\n"
- "str s0, [x25]\n"
- "fmla v1.4s, v25.4s, v4.4s\n"
- "fmla v15.4s, v25.4s, v7.4s\n"
- "add x12, x12, #4\n"
- "fmla v18.4s, v25.4s, v5.4s\n"
- "fmla v19.4s, v25.4s, v8.4s\n"
- "str s1, [x24, %[output_col_stride1]]\n"
- "fmla v22.4s, v25.4s, v6.4s\n"
- "fmla v21.4s, v25.4s, v9.4s\n"
- "fmla v2.4s, v30.4s, v4.4s\n"
- "fmla v18.4s, v30.4s, v7.4s\n"
- "fmla v17.4s, v30.4s, v5.4s\n"
- "fmla v19.4s, v30.4s, v10.4s\n"
- "fmla v22.4s, v30.4s, v8.4s\n"
- "str s2, [x23, x26]\n"
- "fmla v21.4s, v30.4s, v11.4s\n"
- "fmla v3.4s, v26.4s, v4.4s\n"
- "fmla v17.4s, v26.4s, v7.4s\n"
- "fmla v22.4s, v26.4s, v10.4s\n"
- "fmla v15.4s, v27.4s, v4.4s\n"
- "fmla v19.4s, v27.4s, v5.4s\n"
- "fmla v21.4s, v27.4s, v6.4s\n"
- "str s3, [%[outptr0], x27]\n"
- "fmla v18.4s, v20.4s, v4.4s\n"
- "str s15, [x25, %[output_col_stride1]]\n"
- "fmla v22.4s, v20.4s, v5.4s\n"
- "fmla v19.4s, v20.4s, v7.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "str s18, [x24, x26]\n"
- "fmla v21.4s, v20.4s, v8.4s\n"
- "fmla v17.4s, v23.4s, v4.4s\n"
- "fmla v22.4s, v23.4s, v7.4s\n"
- "fmla v19.4s, v24.4s, v4.4s\n"
- "fmla v21.4s, v23.4s, v10.4s\n"
- "str s17, [x23, x27]\n"
- "fmla v22.4s, v16.4s, v4.4s\n"
- "str s19, [x25, x26]\n"
- "add x23, x23, #4\n"
- "fmla v21.4s, v24.4s, v5.4s\n"
- "str s22, [x24, x27]\n"
- "add x24, x24, #4\n"
- "fmla v21.4s, v16.4s, v7.4s\n"
- "fmla v21.4s, v13.4s, v4.4s\n"
- "str s21, [x25, x27]\n"
- "add x25, x25, #4\n"
- "7:\n"
- : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
- : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels)
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
- );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *inptrs[6][6],
- float *outptrs[4][4]
-)
-{
- __asm __volatile(
- "mov x27, xzr\n"
- "mov x28, xzr\n"
- "and x15, %[n_channels], #3\n"
- "lsr x16, %[n_channels], #2\n"
- "cbz x16, 4f\n"
- "1:\n"
- "ldr q13, [%[wbptr]]\n"
- "ldr x17, [%[inptrs], 0]\n"
- "mov v18.16b, v13.16b\n"
- "ldr q12, [%[wbptr], #16]\n"
- "mov v22.16b, v13.16b\n"
- "ldr q11, [%[wbptr], #32]\n"
- "mov v23.16b, v13.16b\n"
- "ldr q10, [%[wbptr], #48]\n"
- "mov v19.16b, v13.16b\n"
- "ldr q9, [%[wbptr], #64]\n"
- "mov v17.16b, v13.16b\n"
- "ldr q8, [%[wbptr], #80]\n"
- "mov v14.16b, v13.16b\n"
- "ldr q7, [%[wbptr], #96]\n"
- "mov v0.16b, v13.16b\n"
- "ldr q6, [%[wbptr], #112]\n"
- "mov v1.16b, v13.16b\n"
- "ldr q5, [%[wbptr], #128]\n"
- "mov v2.16b, v13.16b\n"
- "ldr q4, [%[wbptr], #144]\n"
- "ldr q29, [x17, x27]\n"
- "ldr x7, [%[inptrs], 48]\n"
- "fmla v18.4s, v29.4s, v12.4s\n"
- "ldr x17, [%[inptrs], 8]\n"
- "ldr q27, [x7, x27]\n"
- "ldr x19, [%[inptrs], 96]\n"
- "ldr q28, [x17, x27]\n"
- "ldr x7, [%[inptrs], 56]\n"
- "ldr q25, [x19, x27]\n"
- "ldr x17, [%[inptrs], 16]\n"
- "ldr q16, [x7, x27]\n"
- "ldr x20, [%[inptrs], 144]\n"
- "ldr q15, [x17, x27]\n"
- "ldr x19, [%[inptrs], 104]\n"
- "ldr q21, [x20, x27]\n"
- "subs x16, x16, #1\n"
- "ldr q29, [x19, x27]\n"
- "beq 3f\n"
- "2:\n"
- "mov v3.16b, v13.16b\n"
- "ldr x7, [%[inptrs], 64]\n"
- "fmla v18.4s, v27.4s, v9.4s\n"
- "ldr x17, [%[inptrs], 24]\n"
- "fmla v22.4s, v27.4s, v12.4s\n"
- "ldr q30, [x7, x27]\n"
- "fmla v23.4s, v28.4s, v12.4s\n"
- "ldr x21, [%[inptrs], 192]\n"
- "fmla v19.4s, v25.4s, v12.4s\n"
- "ldr x20, [%[inptrs], 152]\n"
- "fmla v18.4s, v28.4s, v11.4s\n"
- "ldr q24, [x17, x27]\n"
- "fmla v22.4s, v25.4s, v9.4s\n"
- "ldr x19, [%[inptrs], 112]\n"
- "fmla v23.4s, v16.4s, v9.4s\n"
- "ldr x7, [%[inptrs], 72]\n"
- "fmla v17.4s, v16.4s, v12.4s\n"
- "ldr x17, [%[inptrs], 32]\n"
- "fmla v18.4s, v25.4s, v6.4s\n"
- "ldr q31, [x21, x27]\n"
- "fmla v22.4s, v16.4s, v11.4s\n"
- "ldr x22, [%[inptrs], 240]\n"
- "fmla v23.4s, v15.4s, v11.4s\n"
- "ldr x21, [%[inptrs], 200]\n"
- "fmla v14.4s, v15.4s, v12.4s\n"
- "ldr x23, [%[outptrs], 0]\n"
- "fmla v18.4s, v16.4s, v8.4s\n"
- "ldr q25, [x20, x27]\n"
- "fmla v22.4s, v21.4s, v6.4s\n"
- "ldr x20, [%[inptrs], 160]\n"
- "fmla v19.4s, v21.4s, v9.4s\n"
- "ldr x24, [%[outptrs], 32]\n"
- "fmla v0.4s, v21.4s, v12.4s\n"
- "ldr q21, [x19, x27]\n"
- "fmla v18.4s, v15.4s, v10.4s\n"
- "ldr q20, [x7, x27]\n"
- "fmla v22.4s, v29.4s, v8.4s\n"
- "ldr x19, [%[inptrs], 120]\n"
- "fmla v23.4s, v29.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 80]\n"
- "fmla v19.4s, v29.4s, v11.4s\n"
- "ldr x25, [%[outptrs], 64]\n"
- "fmla v18.4s, v29.4s, v5.4s\n"
- "ldr x26, [%[outptrs], 96]\n"
- "fmla v17.4s, v29.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v1.4s, v29.4s, v12.4s\n"
- "ldr q26, [x17, x27]\n"
- "fmla v22.4s, v30.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v18.4s, v30.4s, v7.4s\n"
- "ldr x17, [%[inptrs], 40]\n"
- "fmla v23.4s, v30.4s, v8.4s\n"
- "subs x16, x16, #1\n"
- "fmla v17.4s, v30.4s, v11.4s\n"
- "fmla v14.4s, v30.4s, v9.4s\n"
- "fmla v2.4s, v30.4s, v12.4s\n"
- "ldr q27, [x22, x27]\n"
- "fmla v3.4s, v24.4s, v12.4s\n"
- "ldr x22, [%[inptrs], 248]\n"
- "fmla v23.4s, v24.4s, v10.4s\n"
- "fmla v19.4s, v31.4s, v6.4s\n"
- "fmla v14.4s, v24.4s, v11.4s\n"
- "ldr q30, [x21, x27]\n"
- "fmla v0.4s, v31.4s, v9.4s\n"
- "ldr q24, [x20, x27]\n"
- "fmla v22.4s, v25.4s, v5.4s\n"
- "ldr x21, [%[inptrs], 208]\n"
- "fmla v19.4s, v25.4s, v8.4s\n"
- "ldr x20, [%[inptrs], 168]\n"
- "fmla v17.4s, v25.4s, v6.4s\n"
- "fmla v1.4s, v25.4s, v9.4s\n"
- "fmla v0.4s, v25.4s, v11.4s\n"
- "fmla v18.4s, v21.4s, v4.4s\n"
- "fmla v22.4s, v21.4s, v7.4s\n"
- "fmla v23.4s, v21.4s, v5.4s\n"
- "fmla v19.4s, v21.4s, v10.4s\n"
- "fmla v14.4s, v21.4s, v6.4s\n"
- "fmla v17.4s, v21.4s, v8.4s\n"
- "fmla v1.4s, v21.4s, v11.4s\n"
- "str q18, [x23, x28]\n"
- "mov v16.16b, v13.16b\n"
- "fmla v2.4s, v21.4s, v9.4s\n"
- "ldr x23, [%[outptrs], 8]\n"
- "fmla v23.4s, v20.4s, v7.4s\n"
- "fmla v14.4s, v20.4s, v8.4s\n"
- "fmla v16.4s, v25.4s, v12.4s\n"
- "ldr q25, [x19, x27]\n"
- "fmla v17.4s, v20.4s, v10.4s\n"
- "ldr x19, [%[inptrs], 128]\n"
- "fmla v2.4s, v20.4s, v11.4s\n"
- "fmla v3.4s, v20.4s, v9.4s\n"
- "fmla v14.4s, v26.4s, v10.4s\n"
- "fmla v0.4s, v27.4s, v6.4s\n"
- "mov v15.16b, v13.16b\n"
- "fmla v19.4s, v30.4s, v5.4s\n"
- "fmla v1.4s, v30.4s, v6.4s\n"
- "fmla v16.4s, v30.4s, v9.4s\n"
- "fmla v3.4s, v26.4s, v11.4s\n"
- "ldr q29, [x7, x27]\n"
- "fmla v15.4s, v21.4s, v12.4s\n"
- "ldr q27, [x17, x27]\n"
- "fmla v0.4s, v30.4s, v8.4s\n"
- "ldr q28, [x22, x27]\n"
- "fmla v22.4s, v24.4s, v4.4s\n"
- "ldr x7, [%[inptrs], 88]\n"
- "fmla v19.4s, v24.4s, v7.4s\n"
- "ldr x22, [%[inptrs], 256]\n"
- "fmla v17.4s, v24.4s, v5.4s\n"
- "ldr x17, [%[inptrs], 0]\n"
- "fmla v0.4s, v24.4s, v10.4s\n"
- "fmla v1.4s, v24.4s, v8.4s\n"
- "str q22, [x24, x28]\n"
- "mov v18.16b, v13.16b\n"
- "fmla v2.4s, v24.4s, v6.4s\n"
- "ldr x24, [%[outptrs], 40]\n"
- "fmla v16.4s, v24.4s, v11.4s\n"
- "fmla v15.4s, v24.4s, v9.4s\n"
- "fmla v18.4s, v20.4s, v12.4s\n"
- "ldr q22, [x21, x27]\n"
- "fmla v23.4s, v25.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 216]\n"
- "fmla v17.4s, v25.4s, v7.4s\n"
- "fmla v14.4s, v25.4s, v5.4s\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "fmla v2.4s, v25.4s, v8.4s\n"
- "fmla v3.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v25.4s, v11.4s\n"
- "str q23, [x23, x28]\n"
- "mov v21.16b, v13.16b\n"
- "fmla v18.4s, v25.4s, v9.4s\n"
- "ldr x23, [%[outptrs], 16]\n"
- "fmla v14.4s, v29.4s, v7.4s\n"
- "fmla v2.4s, v29.4s, v10.4s\n"
- "fmla v21.4s, v24.4s, v12.4s\n"
- "ldr q30, [x20, x27]\n"
- "fmla v3.4s, v29.4s, v8.4s\n"
- "ldr x20, [%[inptrs], 176]\n"
- "fmla v18.4s, v29.4s, v11.4s\n"
- "ldr q31, [x19, x27]\n"
- "fmla v0.4s, v28.4s, v5.4s\n"
- "ldr x19, [%[inptrs], 136]\n"
- "fmla v16.4s, v28.4s, v6.4s\n"
- "ldr q26, [x7, x27]\n"
- "fmla v3.4s, v27.4s, v10.4s\n"
- "ldr q23, [x22, x27]\n"
- "fmla v19.4s, v22.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 264]\n"
- "fmla v0.4s, v22.4s, v7.4s\n"
- "ldr x7, [%[inptrs], 48]\n"
- "fmla v1.4s, v22.4s, v5.4s\n"
- "fmla v16.4s, v22.4s, v8.4s\n"
- "fmla v15.4s, v22.4s, v6.4s\n"
- "fmla v21.4s, v22.4s, v9.4s\n"
- "str q19, [x25, x28]\n"
- "mov v24.16b, v13.16b\n"
- "mov v20.16b, v13.16b\n"
- "ldr q27, [x21, x27]\n"
- "fmla v17.4s, v30.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 224]\n"
- "fmla v24.4s, v25.4s, v12.4s\n"
- "ldr q28, [x20, x27]\n"
- "fmla v1.4s, v30.4s, v7.4s\n"
- "ldr x20, [%[inptrs], 184]\n"
- "fmla v2.4s, v30.4s, v5.4s\n"
- "ldr x25, [%[outptrs], 72]\n"
- "str q17, [x24, x28]\n"
- "fmla v16.4s, v30.4s, v10.4s\n"
- "fmla v15.4s, v30.4s, v8.4s\n"
- "ldr q22, [x19, x27]\n"
- "fmla v18.4s, v30.4s, v6.4s\n"
- "ldr x24, [%[outptrs], 48]\n"
- "fmla v21.4s, v30.4s, v11.4s\n"
- "ldr x19, [%[inptrs], 96]\n"
- "fmla v24.4s, v30.4s, v9.4s\n"
- "fmla v20.4s, v30.4s, v12.4s\n"
- "fmla v14.4s, v31.4s, v4.4s\n"
- "ldr q30, [x22, x27]\n"
- "fmla v2.4s, v31.4s, v7.4s\n"
- "ldr q19, [x21, x27]\n"
- "fmla v3.4s, v31.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 272]\n"
- "fmla v15.4s, v31.4s, v10.4s\n"
- "ldr x21, [%[inptrs], 232]\n"
- "str q14, [x23, x28]\n"
- "fmla v18.4s, v31.4s, v8.4s\n"
- "fmla v24.4s, v31.4s, v11.4s\n"
- "ldr q31, [x20, x27]\n"
- "fmla v3.4s, v26.4s, v7.4s\n"
- "ldr q17, [x22, x27]\n"
- "fmla v0.4s, v23.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 280]\n"
- "fmla v18.4s, v26.4s, v10.4s\n"
- "ldr q14, [x21, x27]\n"
- "fmla v16.4s, v23.4s, v5.4s\n"
- "ldr x23, [%[outptrs], 24]\n"
- "fmla v21.4s, v23.4s, v6.4s\n"
- "ldr q26, [x22, x27]\n"
- "str q0, [x26, x28]\n"
- "fmla v1.4s, v27.4s, v4.4s\n"
- "fmla v15.4s, v27.4s, v5.4s\n"
- "ldr q13, [%[wbptr]]\n"
- "fmla v16.4s, v27.4s, v7.4s\n"
- "ldr x26, [%[outptrs], 104]\n"
- "fmla v21.4s, v27.4s, v8.4s\n"
- "add x27, x27, #16\n"
- "str q1, [x25, x28]\n"
- "fmla v24.4s, v27.4s, v6.4s\n"
- "fmla v20.4s, v27.4s, v9.4s\n"
- "ldr q12, [%[wbptr], #16]\n"
- "fmla v2.4s, v28.4s, v4.4s\n"
- "ldr q29, [x17, x27]\n"
- "fmla v15.4s, v28.4s, v7.4s\n"
- "ldr q27, [x7, x27]\n"
- "fmla v18.4s, v28.4s, v5.4s\n"
- "ldr x25, [%[outptrs], 80]\n"
- "fmla v21.4s, v28.4s, v10.4s\n"
- "ldr x17, [%[inptrs], 8]\n"
- "str q2, [x24, x28]\n"
- "fmla v24.4s, v28.4s, v8.4s\n"
- "fmla v20.4s, v28.4s, v11.4s\n"
- "ldr q9, [%[wbptr], #64]\n"
- "fmla v3.4s, v22.4s, v4.4s\n"
- "ldr q28, [x17, x27]\n"
- "fmla v18.4s, v22.4s, v7.4s\n"
- "ldr q25, [x19, x27]\n"
- "fmla v24.4s, v22.4s, v10.4s\n"
- "ldr x24, [%[outptrs], 56]\n"
- "fmla v16.4s, v30.4s, v4.4s\n"
- "ldr q11, [%[wbptr], #32]\n"
- "str q3, [x23, x28]\n"
- "fmla v21.4s, v30.4s, v5.4s\n"
- "fmla v20.4s, v30.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 56]\n"
- "fmla v15.4s, v19.4s, v4.4s\n"
- "ldr x17, [%[inptrs], 16]\n"
- "str q16, [x26, x28]\n"
- "fmla v24.4s, v19.4s, v5.4s\n"
- "fmla v21.4s, v19.4s, v7.4s\n"
- "ldr q16, [x7, x27]\n"
- "fmla v20.4s, v19.4s, v8.4s\n"
- "ldr q6, [%[wbptr], #112]\n"
- "str q15, [x25, x28]\n"
- "fmla v18.4s, v31.4s, v4.4s\n"
- "fmla v24.4s, v31.4s, v7.4s\n"
- "ldr q15, [x17, x27]\n"
- "fmla v21.4s, v17.4s, v4.4s\n"
- "ldr x25, [%[outptrs], 88]\n"
- "fmla v20.4s, v31.4s, v10.4s\n"
- "ldr q8, [%[wbptr], #80]\n"
- "str q18, [x24, x28]\n"
- "mov v18.16b, v13.16b\n"
- "fmla v24.4s, v14.4s, v4.4s\n"
- "ldr x26, [%[outptrs], 112]\n"
- "mov v22.16b, v13.16b\n"
- "ldr x20, [%[inptrs], 144]\n"
- "str q21, [x26, x28]\n"
- "fmla v20.4s, v17.4s, v5.4s\n"
- "mov v23.16b, v13.16b\n"
- "ldr q10, [%[wbptr], #48]\n"
- "str q24, [x25, x28]\n"
- "mov v19.16b, v13.16b\n"
- "mov v17.16b, v13.16b\n"
- "ldr q21, [x20, x27]\n"
- "fmla v20.4s, v14.4s, v7.4s\n"
- "ldr q5, [%[wbptr], #128]\n"
- "mov v14.16b, v13.16b\n"
- "ldr x26, [%[outptrs], 120]\n"
- "mov v0.16b, v13.16b\n"
- "ldr x19, [%[inptrs], 104]\n"
- "mov v1.16b, v13.16b\n"
- "mov v2.16b, v13.16b\n"
- "fmla v20.4s, v26.4s, v4.4s\n"
- "ldr q7, [%[wbptr], #96]\n"
- "fmla v18.4s, v29.4s, v12.4s\n"
- "ldr q29, [x19, x27]\n"
- "str q20, [x26, x28]\n"
- "ldr q4, [%[wbptr], #144]\n"
- "add x28, x28, #16\n"
- "bne 2b\n"
- "3:\n"
- "mov v3.16b, v13.16b\n"
- "ldr x7, [%[inptrs], 64]\n"
- "fmla v18.4s, v27.4s, v9.4s\n"
- "ldr x17, [%[inptrs], 24]\n"
- "fmla v22.4s, v27.4s, v12.4s\n"
- "ldr q30, [x7, x27]\n"
- "fmla v23.4s, v28.4s, v12.4s\n"
- "ldr x21, [%[inptrs], 192]\n"
- "fmla v19.4s, v25.4s, v12.4s\n"
- "ldr x20, [%[inptrs], 152]\n"
- "fmla v18.4s, v28.4s, v11.4s\n"
- "ldr q24, [x17, x27]\n"
- "fmla v22.4s, v25.4s, v9.4s\n"
- "ldr x19, [%[inptrs], 112]\n"
- "fmla v23.4s, v16.4s, v9.4s\n"
- "ldr x7, [%[inptrs], 72]\n"
- "fmla v17.4s, v16.4s, v12.4s\n"
- "ldr x17, [%[inptrs], 32]\n"
- "fmla v18.4s, v25.4s, v6.4s\n"
- "ldr q31, [x21, x27]\n"
- "fmla v22.4s, v16.4s, v11.4s\n"
- "ldr x22, [%[inptrs], 240]\n"
- "fmla v23.4s, v15.4s, v11.4s\n"
- "ldr x21, [%[inptrs], 200]\n"
- "fmla v14.4s, v15.4s, v12.4s\n"
- "ldr x23, [%[outptrs], 0]\n"
- "fmla v18.4s, v16.4s, v8.4s\n"
- "ldr q25, [x20, x27]\n"
- "fmla v22.4s, v21.4s, v6.4s\n"
- "ldr x20, [%[inptrs], 160]\n"
- "fmla v19.4s, v21.4s, v9.4s\n"
- "ldr x24, [%[outptrs], 32]\n"
- "fmla v0.4s, v21.4s, v12.4s\n"
- "ldr q21, [x19, x27]\n"
- "fmla v18.4s, v15.4s, v10.4s\n"
- "ldr q20, [x7, x27]\n"
- "fmla v22.4s, v29.4s, v8.4s\n"
- "ldr x19, [%[inptrs], 120]\n"
- "fmla v23.4s, v29.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 80]\n"
- "fmla v19.4s, v29.4s, v11.4s\n"
- "ldr x25, [%[outptrs], 64]\n"
- "fmla v18.4s, v29.4s, v5.4s\n"
- "ldr x26, [%[outptrs], 96]\n"
- "fmla v17.4s, v29.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v1.4s, v29.4s, v12.4s\n"
- "ldr q26, [x17, x27]\n"
- "fmla v22.4s, v30.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v18.4s, v30.4s, v7.4s\n"
- "ldr x17, [%[inptrs], 40]\n"
- "fmla v23.4s, v30.4s, v8.4s\n"
- "fmla v17.4s, v30.4s, v11.4s\n"
- "fmla v14.4s, v30.4s, v9.4s\n"
- "fmla v2.4s, v30.4s, v12.4s\n"
- "mov v16.16b, v13.16b\n"
- "fmla v3.4s, v24.4s, v12.4s\n"
- "fmla v19.4s, v31.4s, v6.4s\n"
- "fmla v0.4s, v31.4s, v9.4s\n"
- "mov v15.16b, v13.16b\n"
- "fmla v23.4s, v24.4s, v10.4s\n"
- "fmla v14.4s, v24.4s, v11.4s\n"
- "ldr q27, [x22, x27]\n"
- "fmla v22.4s, v25.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 248]\n"
- "fmla v19.4s, v25.4s, v8.4s\n"
- "fmla v17.4s, v25.4s, v6.4s\n"
- "fmla v0.4s, v25.4s, v11.4s\n"
- "fmla v1.4s, v25.4s, v9.4s\n"
- "fmla v16.4s, v25.4s, v12.4s\n"
- "ldr q30, [x21, x27]\n"
- "fmla v18.4s, v21.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 208]\n"
- "fmla v22.4s, v21.4s, v7.4s\n"
- "fmla v23.4s, v21.4s, v5.4s\n"
- "fmla v19.4s, v21.4s, v10.4s\n"
- "fmla v17.4s, v21.4s, v8.4s\n"
- "fmla v14.4s, v21.4s, v6.4s\n"
- "fmla v1.4s, v21.4s, v11.4s\n"
- "str q18, [x23, x28]\n"
- "mov v18.16b, v13.16b\n"
- "fmla v2.4s, v21.4s, v9.4s\n"
- "ldr x23, [%[outptrs], 8]\n"
- "fmla v15.4s, v21.4s, v12.4s\n"
- "ldr q24, [x20, x27]\n"
- "fmla v23.4s, v20.4s, v7.4s\n"
- "ldr x20, [%[inptrs], 168]\n"
- "fmla v17.4s, v20.4s, v10.4s\n"
- "fmla v14.4s, v20.4s, v8.4s\n"
- "fmla v2.4s, v20.4s, v11.4s\n"
- "fmla v3.4s, v20.4s, v9.4s\n"
- "fmla v18.4s, v20.4s, v12.4s\n"
- "ldr q25, [x19, x27]\n"
- "fmla v0.4s, v27.4s, v6.4s\n"
- "ldr q29, [x7, x27]\n"
- "fmla v14.4s, v26.4s, v10.4s\n"
- "ldr x19, [%[inptrs], 128]\n"
- "fmla v3.4s, v26.4s, v11.4s\n"
- "ldr q27, [x17, x27]\n"
- "fmla v19.4s, v30.4s, v5.4s\n"
- "ldr x7, [%[inptrs], 88]\n"
- "fmla v0.4s, v30.4s, v8.4s\n"
- "fmla v1.4s, v30.4s, v6.4s\n"
- "fmla v16.4s, v30.4s, v9.4s\n"
- "ldr q28, [x22, x27]\n"
- "fmla v22.4s, v24.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 256]\n"
- "fmla v19.4s, v24.4s, v7.4s\n"
- "fmla v17.4s, v24.4s, v5.4s\n"
- "fmla v0.4s, v24.4s, v10.4s\n"
- "fmla v1.4s, v24.4s, v8.4s\n"
- "fmla v2.4s, v24.4s, v6.4s\n"
- "fmla v16.4s, v24.4s, v11.4s\n"
- "str q22, [x24, x28]\n"
- "mov v21.16b, v13.16b\n"
- "fmla v15.4s, v24.4s, v9.4s\n"
- "ldr x24, [%[outptrs], 40]\n"
- "fmla v23.4s, v25.4s, v4.4s\n"
- "fmla v17.4s, v25.4s, v7.4s\n"
- "fmla v21.4s, v24.4s, v12.4s\n"
- "ldr q22, [x21, x27]\n"
- "fmla v14.4s, v25.4s, v5.4s\n"
- "ldr x21, [%[inptrs], 216]\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "fmla v2.4s, v25.4s, v8.4s\n"
- "str q23, [x23, x28]\n"
- "mov v24.16b, v13.16b\n"
- "mov v20.16b, v13.16b\n"
- "ldr x23, [%[outptrs], 16]\n"
- "fmla v3.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v25.4s, v11.4s\n"
- "fmla v18.4s, v25.4s, v9.4s\n"
- "fmla v24.4s, v25.4s, v12.4s\n"
- "fmla v14.4s, v29.4s, v7.4s\n"
- "ldr q30, [x20, x27]\n"
- "fmla v2.4s, v29.4s, v10.4s\n"
- "ldr x20, [%[inptrs], 176]\n"
- "fmla v3.4s, v29.4s, v8.4s\n"
- "fmla v0.4s, v28.4s, v5.4s\n"
- "fmla v18.4s, v29.4s, v11.4s\n"
- "ldr q31, [x19, x27]\n"
- "fmla v16.4s, v28.4s, v6.4s\n"
- "ldr q26, [x7, x27]\n"
- "fmla v19.4s, v22.4s, v4.4s\n"
- "ldr x19, [%[inptrs], 136]\n"
- "fmla v3.4s, v27.4s, v10.4s\n"
- "ldr q23, [x22, x27]\n"
- "fmla v0.4s, v22.4s, v7.4s\n"
- "ldr x22, [%[inptrs], 264]\n"
- "fmla v1.4s, v22.4s, v5.4s\n"
- "fmla v16.4s, v22.4s, v8.4s\n"
- "str q19, [x25, x28]\n"
- "fmla v15.4s, v22.4s, v6.4s\n"
- "fmla v21.4s, v22.4s, v9.4s\n"
- "ldr q27, [x21, x27]\n"
- "fmla v17.4s, v30.4s, v4.4s\n"
- "ldr q28, [x20, x27]\n"
- "fmla v1.4s, v30.4s, v7.4s\n"
- "ldr x21, [%[inptrs], 224]\n"
- "fmla v2.4s, v30.4s, v5.4s\n"
- "ldr x20, [%[inptrs], 184]\n"
- "fmla v16.4s, v30.4s, v10.4s\n"
- "ldr x25, [%[outptrs], 72]\n"
- "str q17, [x24, x28]\n"
- "fmla v15.4s, v30.4s, v8.4s\n"
- "fmla v18.4s, v30.4s, v6.4s\n"
- "ldr q22, [x19, x27]\n"
- "fmla v21.4s, v30.4s, v11.4s\n"
- "ldr x24, [%[outptrs], 48]\n"
- "fmla v24.4s, v30.4s, v9.4s\n"
- "fmla v20.4s, v30.4s, v12.4s\n"
- "fmla v14.4s, v31.4s, v4.4s\n"
- "ldr q30, [x22, x27]\n"
- "fmla v2.4s, v31.4s, v7.4s\n"
- "ldr q19, [x21, x27]\n"
- "fmla v3.4s, v31.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 272]\n"
- "fmla v15.4s, v31.4s, v10.4s\n"
- "ldr x21, [%[inptrs], 232]\n"
- "str q14, [x23, x28]\n"
- "fmla v18.4s, v31.4s, v8.4s\n"
- "fmla v24.4s, v31.4s, v11.4s\n"
- "ldr q31, [x20, x27]\n"
- "fmla v3.4s, v26.4s, v7.4s\n"
- "ldr q17, [x22, x27]\n"
- "fmla v0.4s, v23.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 280]\n"
- "fmla v18.4s, v26.4s, v10.4s\n"
- "ldr q14, [x21, x27]\n"
- "fmla v16.4s, v23.4s, v5.4s\n"
- "ldr x23, [%[outptrs], 24]\n"
- "fmla v21.4s, v23.4s, v6.4s\n"
- "ldr q26, [x22, x27]\n"
- "str q0, [x26, x28]\n"
- "fmla v1.4s, v27.4s, v4.4s\n"
- "fmla v15.4s, v27.4s, v5.4s\n"
- "ldr x26, [%[outptrs], 104]\n"
- "fmla v16.4s, v27.4s, v7.4s\n"
- "add x27, x27, #16\n"
- "fmla v21.4s, v27.4s, v8.4s\n"
- "fmla v24.4s, v27.4s, v6.4s\n"
- "str q1, [x25, x28]\n"
- "fmla v20.4s, v27.4s, v9.4s\n"
- "fmla v2.4s, v28.4s, v4.4s\n"
- "ldr x25, [%[outptrs], 80]\n"
- "fmla v15.4s, v28.4s, v7.4s\n"
- "fmla v18.4s, v28.4s, v5.4s\n"
- "fmla v21.4s, v28.4s, v10.4s\n"
- "fmla v24.4s, v28.4s, v8.4s\n"
- "fmla v20.4s, v28.4s, v11.4s\n"
- "fmla v3.4s, v22.4s, v4.4s\n"
- "str q2, [x24, x28]\n"
- "fmla v16.4s, v30.4s, v4.4s\n"
- "fmla v18.4s, v22.4s, v7.4s\n"
- "ldr x24, [%[outptrs], 56]\n"
- "fmla v24.4s, v22.4s, v10.4s\n"
- "fmla v21.4s, v30.4s, v5.4s\n"
- "str q3, [x23, x28]\n"
- "fmla v20.4s, v30.4s, v6.4s\n"
- "str q16, [x26, x28]\n"
- "fmla v15.4s, v19.4s, v4.4s\n"
- "fmla v18.4s, v31.4s, v4.4s\n"
- "ldr x26, [%[outptrs], 112]\n"
- "fmla v21.4s, v19.4s, v7.4s\n"
- "fmla v24.4s, v19.4s, v5.4s\n"
- "fmla v20.4s, v19.4s, v8.4s\n"
- "str q15, [x25, x28]\n"
- "str q18, [x24, x28]\n"
- "ldr x25, [%[outptrs], 88]\n"
- "fmla v24.4s, v31.4s, v7.4s\n"
- "fmla v21.4s, v17.4s, v4.4s\n"
- "fmla v20.4s, v31.4s, v10.4s\n"
- "str q21, [x26, x28]\n"
- "fmla v20.4s, v17.4s, v5.4s\n"
- "ldr x26, [%[outptrs], 120]\n"
- "fmla v24.4s, v14.4s, v4.4s\n"
- "fmla v20.4s, v14.4s, v7.4s\n"
- "str q24, [x25, x28]\n"
- "fmla v20.4s, v26.4s, v4.4s\n"
- "str q20, [x26, x28]\n"
- "add x28, x28, #16\n"
- "4:\n"
- "cbz x15, 7f\n"
- "ldr s13, [%[wbptr]]\n"
- "mov v18.16b, v13.16b\n"
- "ldr s12, [%[wbptr], #4]\n"
- "mov v22.16b, v13.16b\n"
- "ldr s11, [%[wbptr], #8]\n"
- "mov v23.16b, v13.16b\n"
- "ldr s10, [%[wbptr], #12]\n"
- "mov v19.16b, v13.16b\n"
- "ldr s9, [%[wbptr], #16]\n"
- "mov v17.16b, v13.16b\n"
- "ldr s8, [%[wbptr], #20]\n"
- "mov v14.16b, v13.16b\n"
- "ldr s7, [%[wbptr], #24]\n"
- "mov v0.16b, v13.16b\n"
- "ldr s6, [%[wbptr], #28]\n"
- "mov v1.16b, v13.16b\n"
- "ldr s5, [%[wbptr], #32]\n"
- "mov v2.16b, v13.16b\n"
- "ldr s4, [%[wbptr], #36]\n"
- "ldr x17, [%[inptrs], 0]\n"
- "ldr x7, [%[inptrs], 48]\n"
- "ldr x19, [%[inptrs], 96]\n"
- "ldr x20, [%[inptrs], 144]\n"
- "subs x15, x15, #1\n"
- "ldr s29, [x17, x27]\n"
- "fmla v18.4s, v29.4s, v12.4s\n"
- "ldr s27, [x7, x27]\n"
- "ldr s25, [x19, x27]\n"
- "ldr x17, [%[inptrs], 8]\n"
- "ldr s21, [x20, x27]\n"
- "ldr x7, [%[inptrs], 56]\n"
- "ldr s28, [x17, x27]\n"
- "ldr x19, [%[inptrs], 104]\n"
- "ldr s16, [x7, x27]\n"
- "ldr x17, [%[inptrs], 16]\n"
- "ldr s29, [x19, x27]\n"
- "ldr s15, [x17, x27]\n"
- "beq 6f\n"
- "5:\n"
- "mov v3.16b, v13.16b\n"
- "ldr x7, [%[inptrs], 64]\n"
- "fmla v18.4s, v27.4s, v9.4s\n"
- "ldr x17, [%[inptrs], 24]\n"
- "fmla v22.4s, v27.4s, v12.4s\n"
- "ldr s30, [x7, x27]\n"
- "fmla v23.4s, v28.4s, v12.4s\n"
- "ldr x21, [%[inptrs], 192]\n"
- "fmla v19.4s, v25.4s, v12.4s\n"
- "ldr x20, [%[inptrs], 152]\n"
- "fmla v18.4s, v28.4s, v11.4s\n"
- "ldr s24, [x17, x27]\n"
- "fmla v22.4s, v25.4s, v9.4s\n"
- "ldr x19, [%[inptrs], 112]\n"
- "fmla v23.4s, v16.4s, v9.4s\n"
- "ldr x7, [%[inptrs], 72]\n"
- "fmla v17.4s, v16.4s, v12.4s\n"
- "ldr x17, [%[inptrs], 32]\n"
- "fmla v18.4s, v25.4s, v6.4s\n"
- "ldr s31, [x21, x27]\n"
- "fmla v22.4s, v16.4s, v11.4s\n"
- "ldr x22, [%[inptrs], 240]\n"
- "fmla v23.4s, v15.4s, v11.4s\n"
- "ldr x21, [%[inptrs], 200]\n"
- "fmla v14.4s, v15.4s, v12.4s\n"
- "ldr x23, [%[outptrs], 0]\n"
- "fmla v18.4s, v16.4s, v8.4s\n"
- "ldr s25, [x20, x27]\n"
- "fmla v22.4s, v21.4s, v6.4s\n"
- "ldr x20, [%[inptrs], 160]\n"
- "fmla v19.4s, v21.4s, v9.4s\n"
- "ldr x24, [%[outptrs], 32]\n"
- "fmla v0.4s, v21.4s, v12.4s\n"
- "ldr s21, [x19, x27]\n"
- "fmla v18.4s, v15.4s, v10.4s\n"
- "ldr s20, [x7, x27]\n"
- "fmla v22.4s, v29.4s, v8.4s\n"
- "ldr x19, [%[inptrs], 120]\n"
- "fmla v23.4s, v29.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 80]\n"
- "fmla v19.4s, v29.4s, v11.4s\n"
- "ldr x25, [%[outptrs], 64]\n"
- "fmla v18.4s, v29.4s, v5.4s\n"
- "ldr x26, [%[outptrs], 96]\n"
- "fmla v17.4s, v29.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v1.4s, v29.4s, v12.4s\n"
- "ldr s26, [x17, x27]\n"
- "fmla v22.4s, v30.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v18.4s, v30.4s, v7.4s\n"
- "ldr x17, [%[inptrs], 40]\n"
- "fmla v23.4s, v30.4s, v8.4s\n"
- "subs x15, x15, #1\n"
- "fmla v17.4s, v30.4s, v11.4s\n"
- "fmla v14.4s, v30.4s, v9.4s\n"
- "fmla v2.4s, v30.4s, v12.4s\n"
- "ldr s27, [x22, x27]\n"
- "fmla v3.4s, v24.4s, v12.4s\n"
- "ldr x22, [%[inptrs], 248]\n"
- "fmla v23.4s, v24.4s, v10.4s\n"
- "fmla v19.4s, v31.4s, v6.4s\n"
- "fmla v14.4s, v24.4s, v11.4s\n"
- "ldr s30, [x21, x27]\n"
- "fmla v0.4s, v31.4s, v9.4s\n"
- "ldr s24, [x20, x27]\n"
- "fmla v22.4s, v25.4s, v5.4s\n"
- "ldr x21, [%[inptrs], 208]\n"
- "fmla v19.4s, v25.4s, v8.4s\n"
- "ldr x20, [%[inptrs], 168]\n"
- "fmla v17.4s, v25.4s, v6.4s\n"
- "fmla v1.4s, v25.4s, v9.4s\n"
- "fmla v0.4s, v25.4s, v11.4s\n"
- "fmla v18.4s, v21.4s, v4.4s\n"
- "fmla v22.4s, v21.4s, v7.4s\n"
- "fmla v23.4s, v21.4s, v5.4s\n"
- "fmla v19.4s, v21.4s, v10.4s\n"
- "fmla v14.4s, v21.4s, v6.4s\n"
- "fmla v17.4s, v21.4s, v8.4s\n"
- "fmla v1.4s, v21.4s, v11.4s\n"
- "str s18, [x23, x28]\n"
- "mov v16.16b, v13.16b\n"
- "fmla v2.4s, v21.4s, v9.4s\n"
- "ldr x23, [%[outptrs], 8]\n"
- "fmla v23.4s, v20.4s, v7.4s\n"
- "fmla v14.4s, v20.4s, v8.4s\n"
- "fmla v16.4s, v25.4s, v12.4s\n"
- "ldr s25, [x19, x27]\n"
- "fmla v17.4s, v20.4s, v10.4s\n"
- "ldr x19, [%[inptrs], 128]\n"
- "fmla v2.4s, v20.4s, v11.4s\n"
- "fmla v3.4s, v20.4s, v9.4s\n"
- "fmla v14.4s, v26.4s, v10.4s\n"
- "fmla v0.4s, v27.4s, v6.4s\n"
- "mov v15.16b, v13.16b\n"
- "fmla v19.4s, v30.4s, v5.4s\n"
- "fmla v1.4s, v30.4s, v6.4s\n"
- "fmla v16.4s, v30.4s, v9.4s\n"
- "fmla v3.4s, v26.4s, v11.4s\n"
- "ldr s29, [x7, x27]\n"
- "fmla v15.4s, v21.4s, v12.4s\n"
- "ldr s27, [x17, x27]\n"
- "fmla v0.4s, v30.4s, v8.4s\n"
- "ldr s28, [x22, x27]\n"
- "fmla v22.4s, v24.4s, v4.4s\n"
- "ldr x7, [%[inptrs], 88]\n"
- "fmla v19.4s, v24.4s, v7.4s\n"
- "ldr x22, [%[inptrs], 256]\n"
- "fmla v17.4s, v24.4s, v5.4s\n"
- "ldr x17, [%[inptrs], 0]\n"
- "fmla v0.4s, v24.4s, v10.4s\n"
- "fmla v1.4s, v24.4s, v8.4s\n"
- "str s22, [x24, x28]\n"
- "mov v18.16b, v13.16b\n"
- "fmla v2.4s, v24.4s, v6.4s\n"
- "ldr x24, [%[outptrs], 40]\n"
- "fmla v16.4s, v24.4s, v11.4s\n"
- "fmla v15.4s, v24.4s, v9.4s\n"
- "fmla v18.4s, v20.4s, v12.4s\n"
- "ldr s22, [x21, x27]\n"
- "fmla v23.4s, v25.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 216]\n"
- "fmla v17.4s, v25.4s, v7.4s\n"
- "fmla v14.4s, v25.4s, v5.4s\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "fmla v2.4s, v25.4s, v8.4s\n"
- "fmla v3.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v25.4s, v11.4s\n"
- "str s23, [x23, x28]\n"
- "mov v21.16b, v13.16b\n"
- "fmla v18.4s, v25.4s, v9.4s\n"
- "ldr x23, [%[outptrs], 16]\n"
- "fmla v14.4s, v29.4s, v7.4s\n"
- "fmla v2.4s, v29.4s, v10.4s\n"
- "fmla v21.4s, v24.4s, v12.4s\n"
- "ldr s30, [x20, x27]\n"
- "fmla v3.4s, v29.4s, v8.4s\n"
- "ldr x20, [%[inptrs], 176]\n"
- "fmla v18.4s, v29.4s, v11.4s\n"
- "ldr s31, [x19, x27]\n"
- "fmla v0.4s, v28.4s, v5.4s\n"
- "ldr x19, [%[inptrs], 136]\n"
- "fmla v16.4s, v28.4s, v6.4s\n"
- "ldr s26, [x7, x27]\n"
- "fmla v3.4s, v27.4s, v10.4s\n"
- "ldr s23, [x22, x27]\n"
- "fmla v19.4s, v22.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 264]\n"
- "fmla v0.4s, v22.4s, v7.4s\n"
- "ldr x7, [%[inptrs], 48]\n"
- "fmla v1.4s, v22.4s, v5.4s\n"
- "fmla v16.4s, v22.4s, v8.4s\n"
- "fmla v15.4s, v22.4s, v6.4s\n"
- "fmla v21.4s, v22.4s, v9.4s\n"
- "str s19, [x25, x28]\n"
- "mov v24.16b, v13.16b\n"
- "mov v20.16b, v13.16b\n"
- "ldr s27, [x21, x27]\n"
- "fmla v17.4s, v30.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 224]\n"
- "fmla v24.4s, v25.4s, v12.4s\n"
- "ldr s28, [x20, x27]\n"
- "fmla v1.4s, v30.4s, v7.4s\n"
- "ldr x20, [%[inptrs], 184]\n"
- "fmla v2.4s, v30.4s, v5.4s\n"
- "ldr x25, [%[outptrs], 72]\n"
- "str s17, [x24, x28]\n"
- "fmla v16.4s, v30.4s, v10.4s\n"
- "fmla v15.4s, v30.4s, v8.4s\n"
- "ldr s22, [x19, x27]\n"
- "fmla v18.4s, v30.4s, v6.4s\n"
- "ldr x24, [%[outptrs], 48]\n"
- "fmla v21.4s, v30.4s, v11.4s\n"
- "ldr x19, [%[inptrs], 96]\n"
- "fmla v24.4s, v30.4s, v9.4s\n"
- "fmla v20.4s, v30.4s, v12.4s\n"
- "fmla v14.4s, v31.4s, v4.4s\n"
- "ldr s30, [x22, x27]\n"
- "fmla v2.4s, v31.4s, v7.4s\n"
- "ldr s19, [x21, x27]\n"
- "fmla v3.4s, v31.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 272]\n"
- "fmla v15.4s, v31.4s, v10.4s\n"
- "ldr x21, [%[inptrs], 232]\n"
- "str s14, [x23, x28]\n"
- "fmla v18.4s, v31.4s, v8.4s\n"
- "fmla v24.4s, v31.4s, v11.4s\n"
- "ldr s31, [x20, x27]\n"
- "fmla v3.4s, v26.4s, v7.4s\n"
- "ldr s17, [x22, x27]\n"
- "fmla v0.4s, v23.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 280]\n"
- "fmla v18.4s, v26.4s, v10.4s\n"
- "ldr s14, [x21, x27]\n"
- "fmla v16.4s, v23.4s, v5.4s\n"
- "ldr x23, [%[outptrs], 24]\n"
- "fmla v21.4s, v23.4s, v6.4s\n"
- "ldr s26, [x22, x27]\n"
- "str s0, [x26, x28]\n"
- "fmla v1.4s, v27.4s, v4.4s\n"
- "fmla v15.4s, v27.4s, v5.4s\n"
- "ldr s13, [%[wbptr]]\n"
- "fmla v16.4s, v27.4s, v7.4s\n"
- "ldr x26, [%[outptrs], 104]\n"
- "fmla v21.4s, v27.4s, v8.4s\n"
- "add x27, x27, #4\n"
- "str s1, [x25, x28]\n"
- "fmla v24.4s, v27.4s, v6.4s\n"
- "fmla v20.4s, v27.4s, v9.4s\n"
- "ldr s12, [%[wbptr], #4]\n"
- "fmla v2.4s, v28.4s, v4.4s\n"
- "ldr s29, [x17, x27]\n"
- "fmla v15.4s, v28.4s, v7.4s\n"
- "ldr s27, [x7, x27]\n"
- "fmla v18.4s, v28.4s, v5.4s\n"
- "ldr x25, [%[outptrs], 80]\n"
- "fmla v21.4s, v28.4s, v10.4s\n"
- "ldr x17, [%[inptrs], 8]\n"
- "str s2, [x24, x28]\n"
- "fmla v24.4s, v28.4s, v8.4s\n"
- "fmla v20.4s, v28.4s, v11.4s\n"
- "ldr s9, [%[wbptr], #16]\n"
- "fmla v3.4s, v22.4s, v4.4s\n"
- "ldr s28, [x17, x27]\n"
- "fmla v18.4s, v22.4s, v7.4s\n"
- "ldr s25, [x19, x27]\n"
- "fmla v24.4s, v22.4s, v10.4s\n"
- "ldr x24, [%[outptrs], 56]\n"
- "fmla v16.4s, v30.4s, v4.4s\n"
- "ldr s11, [%[wbptr], #8]\n"
- "str s3, [x23, x28]\n"
- "fmla v21.4s, v30.4s, v5.4s\n"
- "fmla v20.4s, v30.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 56]\n"
- "fmla v15.4s, v19.4s, v4.4s\n"
- "ldr x17, [%[inptrs], 16]\n"
- "str s16, [x26, x28]\n"
- "fmla v24.4s, v19.4s, v5.4s\n"
- "fmla v21.4s, v19.4s, v7.4s\n"
- "ldr s16, [x7, x27]\n"
- "fmla v20.4s, v19.4s, v8.4s\n"
- "ldr s6, [%[wbptr], #28]\n"
- "str s15, [x25, x28]\n"
- "fmla v18.4s, v31.4s, v4.4s\n"
- "fmla v24.4s, v31.4s, v7.4s\n"
- "ldr s15, [x17, x27]\n"
- "fmla v21.4s, v17.4s, v4.4s\n"
- "ldr x25, [%[outptrs], 88]\n"
- "fmla v20.4s, v31.4s, v10.4s\n"
- "ldr s8, [%[wbptr], #20]\n"
- "str s18, [x24, x28]\n"
- "mov v18.16b, v13.16b\n"
- "fmla v24.4s, v14.4s, v4.4s\n"
- "ldr x26, [%[outptrs], 112]\n"
- "mov v22.16b, v13.16b\n"
- "ldr x20, [%[inptrs], 144]\n"
- "str s21, [x26, x28]\n"
- "fmla v20.4s, v17.4s, v5.4s\n"
- "mov v23.16b, v13.16b\n"
- "ldr s10, [%[wbptr], #12]\n"
- "str s24, [x25, x28]\n"
- "mov v19.16b, v13.16b\n"
- "mov v17.16b, v13.16b\n"
- "ldr s21, [x20, x27]\n"
- "fmla v20.4s, v14.4s, v7.4s\n"
- "ldr s5, [%[wbptr], #32]\n"
- "mov v14.16b, v13.16b\n"
- "ldr x26, [%[outptrs], 120]\n"
- "mov v0.16b, v13.16b\n"
- "ldr x19, [%[inptrs], 104]\n"
- "mov v1.16b, v13.16b\n"
- "mov v2.16b, v13.16b\n"
- "fmla v20.4s, v26.4s, v4.4s\n"
- "ldr s7, [%[wbptr], #24]\n"
- "fmla v18.4s, v29.4s, v12.4s\n"
- "ldr s29, [x19, x27]\n"
- "str s20, [x26, x28]\n"
- "ldr s4, [%[wbptr], #36]\n"
- "add x28, x28, #4\n"
- "bne 5b\n"
- "6:\n"
- "mov v3.16b, v13.16b\n"
- "ldr x7, [%[inptrs], 64]\n"
- "fmla v18.4s, v27.4s, v9.4s\n"
- "ldr x17, [%[inptrs], 24]\n"
- "fmla v22.4s, v27.4s, v12.4s\n"
- "ldr s30, [x7, x27]\n"
- "fmla v23.4s, v28.4s, v12.4s\n"
- "ldr x21, [%[inptrs], 192]\n"
- "fmla v19.4s, v25.4s, v12.4s\n"
- "ldr x20, [%[inptrs], 152]\n"
- "fmla v18.4s, v28.4s, v11.4s\n"
- "ldr s24, [x17, x27]\n"
- "fmla v22.4s, v25.4s, v9.4s\n"
- "ldr x19, [%[inptrs], 112]\n"
- "fmla v23.4s, v16.4s, v9.4s\n"
- "ldr x7, [%[inptrs], 72]\n"
- "fmla v17.4s, v16.4s, v12.4s\n"
- "ldr x17, [%[inptrs], 32]\n"
- "fmla v18.4s, v25.4s, v6.4s\n"
- "ldr s31, [x21, x27]\n"
- "fmla v22.4s, v16.4s, v11.4s\n"
- "ldr x22, [%[inptrs], 240]\n"
- "fmla v23.4s, v15.4s, v11.4s\n"
- "ldr x21, [%[inptrs], 200]\n"
- "fmla v14.4s, v15.4s, v12.4s\n"
- "ldr x23, [%[outptrs], 0]\n"
- "fmla v18.4s, v16.4s, v8.4s\n"
- "ldr s25, [x20, x27]\n"
- "fmla v22.4s, v21.4s, v6.4s\n"
- "ldr x20, [%[inptrs], 160]\n"
- "fmla v19.4s, v21.4s, v9.4s\n"
- "ldr x24, [%[outptrs], 32]\n"
- "fmla v0.4s, v21.4s, v12.4s\n"
- "ldr s21, [x19, x27]\n"
- "fmla v18.4s, v15.4s, v10.4s\n"
- "ldr s20, [x7, x27]\n"
- "fmla v22.4s, v29.4s, v8.4s\n"
- "ldr x19, [%[inptrs], 120]\n"
- "fmla v23.4s, v29.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 80]\n"
- "fmla v19.4s, v29.4s, v11.4s\n"
- "ldr x25, [%[outptrs], 64]\n"
- "fmla v18.4s, v29.4s, v5.4s\n"
- "ldr x26, [%[outptrs], 96]\n"
- "fmla v17.4s, v29.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v1.4s, v29.4s, v12.4s\n"
- "ldr s26, [x17, x27]\n"
- "fmla v22.4s, v30.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v18.4s, v30.4s, v7.4s\n"
- "ldr x17, [%[inptrs], 40]\n"
- "fmla v23.4s, v30.4s, v8.4s\n"
- "fmla v17.4s, v30.4s, v11.4s\n"
- "fmla v14.4s, v30.4s, v9.4s\n"
- "fmla v2.4s, v30.4s, v12.4s\n"
- "mov v16.16b, v13.16b\n"
- "fmla v3.4s, v24.4s, v12.4s\n"
- "fmla v19.4s, v31.4s, v6.4s\n"
- "fmla v0.4s, v31.4s, v9.4s\n"
- "mov v15.16b, v13.16b\n"
- "fmla v23.4s, v24.4s, v10.4s\n"
- "fmla v14.4s, v24.4s, v11.4s\n"
- "ldr s27, [x22, x27]\n"
- "fmla v22.4s, v25.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 248]\n"
- "fmla v19.4s, v25.4s, v8.4s\n"
- "fmla v17.4s, v25.4s, v6.4s\n"
- "fmla v0.4s, v25.4s, v11.4s\n"
- "fmla v1.4s, v25.4s, v9.4s\n"
- "fmla v16.4s, v25.4s, v12.4s\n"
- "ldr s30, [x21, x27]\n"
- "fmla v18.4s, v21.4s, v4.4s\n"
- "ldr x21, [%[inptrs], 208]\n"
- "fmla v22.4s, v21.4s, v7.4s\n"
- "fmla v23.4s, v21.4s, v5.4s\n"
- "fmla v19.4s, v21.4s, v10.4s\n"
- "fmla v17.4s, v21.4s, v8.4s\n"
- "fmla v14.4s, v21.4s, v6.4s\n"
- "fmla v1.4s, v21.4s, v11.4s\n"
- "str s18, [x23, x28]\n"
- "mov v18.16b, v13.16b\n"
- "fmla v2.4s, v21.4s, v9.4s\n"
- "ldr x23, [%[outptrs], 8]\n"
- "fmla v15.4s, v21.4s, v12.4s\n"
- "ldr s24, [x20, x27]\n"
- "fmla v23.4s, v20.4s, v7.4s\n"
- "ldr x20, [%[inptrs], 168]\n"
- "fmla v17.4s, v20.4s, v10.4s\n"
- "fmla v14.4s, v20.4s, v8.4s\n"
- "fmla v2.4s, v20.4s, v11.4s\n"
- "fmla v3.4s, v20.4s, v9.4s\n"
- "fmla v18.4s, v20.4s, v12.4s\n"
- "ldr s25, [x19, x27]\n"
- "fmla v0.4s, v27.4s, v6.4s\n"
- "ldr s29, [x7, x27]\n"
- "fmla v14.4s, v26.4s, v10.4s\n"
- "ldr x19, [%[inptrs], 128]\n"
- "fmla v3.4s, v26.4s, v11.4s\n"
- "ldr s27, [x17, x27]\n"
- "fmla v19.4s, v30.4s, v5.4s\n"
- "ldr x7, [%[inptrs], 88]\n"
- "fmla v0.4s, v30.4s, v8.4s\n"
- "fmla v1.4s, v30.4s, v6.4s\n"
- "fmla v16.4s, v30.4s, v9.4s\n"
- "ldr s28, [x22, x27]\n"
- "fmla v22.4s, v24.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 256]\n"
- "fmla v19.4s, v24.4s, v7.4s\n"
- "fmla v17.4s, v24.4s, v5.4s\n"
- "fmla v0.4s, v24.4s, v10.4s\n"
- "fmla v1.4s, v24.4s, v8.4s\n"
- "fmla v2.4s, v24.4s, v6.4s\n"
- "fmla v16.4s, v24.4s, v11.4s\n"
- "str s22, [x24, x28]\n"
- "mov v21.16b, v13.16b\n"
- "fmla v15.4s, v24.4s, v9.4s\n"
- "ldr x24, [%[outptrs], 40]\n"
- "fmla v23.4s, v25.4s, v4.4s\n"
- "fmla v17.4s, v25.4s, v7.4s\n"
- "fmla v21.4s, v24.4s, v12.4s\n"
- "ldr s22, [x21, x27]\n"
- "fmla v14.4s, v25.4s, v5.4s\n"
- "ldr x21, [%[inptrs], 216]\n"
- "fmla v1.4s, v25.4s, v10.4s\n"
- "fmla v2.4s, v25.4s, v8.4s\n"
- "str s23, [x23, x28]\n"
- "mov v24.16b, v13.16b\n"
- "mov v20.16b, v13.16b\n"
- "ldr x23, [%[outptrs], 16]\n"
- "fmla v3.4s, v25.4s, v6.4s\n"
- "fmla v15.4s, v25.4s, v11.4s\n"
- "fmla v18.4s, v25.4s, v9.4s\n"
- "fmla v24.4s, v25.4s, v12.4s\n"
- "fmla v14.4s, v29.4s, v7.4s\n"
- "ldr s30, [x20, x27]\n"
- "fmla v2.4s, v29.4s, v10.4s\n"
- "ldr x20, [%[inptrs], 176]\n"
- "fmla v3.4s, v29.4s, v8.4s\n"
- "fmla v0.4s, v28.4s, v5.4s\n"
- "fmla v18.4s, v29.4s, v11.4s\n"
- "ldr s31, [x19, x27]\n"
- "fmla v16.4s, v28.4s, v6.4s\n"
- "ldr s26, [x7, x27]\n"
- "fmla v19.4s, v22.4s, v4.4s\n"
- "ldr x19, [%[inptrs], 136]\n"
- "fmla v3.4s, v27.4s, v10.4s\n"
- "ldr s23, [x22, x27]\n"
- "fmla v0.4s, v22.4s, v7.4s\n"
- "ldr x22, [%[inptrs], 264]\n"
- "fmla v1.4s, v22.4s, v5.4s\n"
- "fmla v16.4s, v22.4s, v8.4s\n"
- "str s19, [x25, x28]\n"
- "fmla v15.4s, v22.4s, v6.4s\n"
- "fmla v21.4s, v22.4s, v9.4s\n"
- "ldr s27, [x21, x27]\n"
- "fmla v17.4s, v30.4s, v4.4s\n"
- "ldr s28, [x20, x27]\n"
- "fmla v1.4s, v30.4s, v7.4s\n"
- "ldr x21, [%[inptrs], 224]\n"
- "fmla v2.4s, v30.4s, v5.4s\n"
- "ldr x20, [%[inptrs], 184]\n"
- "fmla v16.4s, v30.4s, v10.4s\n"
- "ldr x25, [%[outptrs], 72]\n"
- "str s17, [x24, x28]\n"
- "fmla v15.4s, v30.4s, v8.4s\n"
- "fmla v18.4s, v30.4s, v6.4s\n"
- "ldr s22, [x19, x27]\n"
- "fmla v21.4s, v30.4s, v11.4s\n"
- "ldr x24, [%[outptrs], 48]\n"
- "fmla v24.4s, v30.4s, v9.4s\n"
- "fmla v20.4s, v30.4s, v12.4s\n"
- "fmla v14.4s, v31.4s, v4.4s\n"
- "ldr s30, [x22, x27]\n"
- "fmla v2.4s, v31.4s, v7.4s\n"
- "ldr s19, [x21, x27]\n"
- "fmla v3.4s, v31.4s, v5.4s\n"
- "ldr x22, [%[inptrs], 272]\n"
- "fmla v15.4s, v31.4s, v10.4s\n"
- "ldr x21, [%[inptrs], 232]\n"
- "str s14, [x23, x28]\n"
- "fmla v18.4s, v31.4s, v8.4s\n"
- "fmla v24.4s, v31.4s, v11.4s\n"
- "ldr s31, [x20, x27]\n"
- "fmla v3.4s, v26.4s, v7.4s\n"
- "ldr s17, [x22, x27]\n"
- "fmla v0.4s, v23.4s, v4.4s\n"
- "ldr x22, [%[inptrs], 280]\n"
- "fmla v18.4s, v26.4s, v10.4s\n"
- "ldr s14, [x21, x27]\n"
- "fmla v16.4s, v23.4s, v5.4s\n"
- "ldr x23, [%[outptrs], 24]\n"
- "fmla v21.4s, v23.4s, v6.4s\n"
- "ldr s26, [x22, x27]\n"
- "str s0, [x26, x28]\n"
- "fmla v1.4s, v27.4s, v4.4s\n"
- "fmla v15.4s, v27.4s, v5.4s\n"
- "ldr x26, [%[outptrs], 104]\n"
- "fmla v16.4s, v27.4s, v7.4s\n"
- "add x27, x27, #4\n"
- "fmla v21.4s, v27.4s, v8.4s\n"
- "fmla v24.4s, v27.4s, v6.4s\n"
- "str s1, [x25, x28]\n"
- "fmla v20.4s, v27.4s, v9.4s\n"
- "fmla v2.4s, v28.4s, v4.4s\n"
- "ldr x25, [%[outptrs], 80]\n"
- "fmla v15.4s, v28.4s, v7.4s\n"
- "fmla v18.4s, v28.4s, v5.4s\n"
- "fmla v21.4s, v28.4s, v10.4s\n"
- "fmla v24.4s, v28.4s, v8.4s\n"
- "fmla v20.4s, v28.4s, v11.4s\n"
- "fmla v3.4s, v22.4s, v4.4s\n"
- "str s2, [x24, x28]\n"
- "fmla v16.4s, v30.4s, v4.4s\n"
- "fmla v18.4s, v22.4s, v7.4s\n"
- "ldr x24, [%[outptrs], 56]\n"
- "fmla v24.4s, v22.4s, v10.4s\n"
- "fmla v21.4s, v30.4s, v5.4s\n"
- "str s3, [x23, x28]\n"
- "fmla v20.4s, v30.4s, v6.4s\n"
- "str s16, [x26, x28]\n"
- "fmla v15.4s, v19.4s, v4.4s\n"
- "fmla v18.4s, v31.4s, v4.4s\n"
- "ldr x26, [%[outptrs], 112]\n"
- "fmla v21.4s, v19.4s, v7.4s\n"
- "fmla v24.4s, v19.4s, v5.4s\n"
- "fmla v20.4s, v19.4s, v8.4s\n"
- "str s15, [x25, x28]\n"
- "str s18, [x24, x28]\n"
- "ldr x25, [%[outptrs], 88]\n"
- "fmla v24.4s, v31.4s, v7.4s\n"
- "fmla v21.4s, v17.4s, v4.4s\n"
- "fmla v20.4s, v31.4s, v10.4s\n"
- "str s21, [x26, x28]\n"
- "fmla v20.4s, v17.4s, v5.4s\n"
- "ldr x26, [%[outptrs], 120]\n"
- "fmla v24.4s, v14.4s, v4.4s\n"
- "fmla v20.4s, v14.4s, v7.4s\n"
- "str s24, [x25, x28]\n"
- "fmla v20.4s, v26.4s, v4.4s\n"
- "str s20, [x26, x28]\n"
- "add x28, x28, #4\n"
- "7:\n"
- : [wbptr] "+r" (weight_bias_ptr)
- : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs)
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x9, %[inptr0], %[input_row_stride]\n"
- "add x28, %[input_col_stride1], %[input_col_stride1]\n"
- "add x16, %[outptr0], %[output_row_stride]\n"
- "add x24, x9, %[input_row_stride]\n"
- "add x25, x28, #64\n"
- "add x23, x28, %[input_col_stride1]\n"
- "add x26, x24, %[input_row_stride]\n"
- "add x11, x23, #64\n"
- "add x12, x23, %[input_col_stride1]\n"
- "add x10, x26, %[input_row_stride]\n"
- "add x13, x12, #64\n"
- "add x14, x12, %[input_col_stride1]\n"
- "add x27, x10, %[input_row_stride]\n"
- "add x15, x14, #64\n"
- "add x17, x16, %[output_row_stride]\n"
- "add x7, x17, %[output_row_stride]\n"
- "add x19, %[output_col_stride1], %[output_col_stride1]\n"
- "and x21, %[n_channels], #3\n"
- "add x20, x19, %[output_col_stride1]\n"
- "lsr x22, %[n_channels], #2\n"
- "cbz x22, 4f\n"
- "1:\n"
- "ldr q21, [%[wbptr]]\n"
- "subs x22, x22, #1\n"
- "mov v7.16b, v21.16b\n"
- "ldr q20, [%[wbptr], #16]\n"
- "mov v3.16b, v21.16b\n"
- "ldr q14, [%[wbptr], #32]\n"
- "mov v6.16b, v21.16b\n"
- "ldr q13, [%[wbptr], #48]\n"
- "mov v15.16b, v21.16b\n"
- "ldr q17, [%[wbptr], #64]\n"
- "mov v2.16b, v21.16b\n"
- "ldr q12, [%[wbptr], #80]\n"
- "mov v5.16b, v21.16b\n"
- "ldr q11, [%[wbptr], #96]\n"
- "mov v0.16b, v21.16b\n"
- "ldr q10, [%[wbptr], #112]\n"
- "mov v16.16b, v21.16b\n"
- "ldr q9, [%[wbptr], #128]\n"
- "mov v1.16b, v21.16b\n"
- "ldr q8, [%[wbptr], #144]\n"
- "mov v4.16b, v21.16b\n"
- "ldr q22, [%[inptr0]]\n"
- "fmla v7.4s, v22.4s, v20.4s\n"
- "ldr q19, [x9]\n"
- "fmla v3.4s, v19.4s, v20.4s\n"
- "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v6.4s, v23.4s, v20.4s\n"
- "ldr q18, [x24]\n"
- "fmla v7.4s, v19.4s, v17.4s\n"
- "ldr q27, [x9, %[input_col_stride1]]\n"
- "fmla v3.4s, v18.4s, v17.4s\n"
- "ldr q28, [%[inptr0], x28]\n"
- "fmla v15.4s, v18.4s, v20.4s\n"
- "ldr q25, [x26]\n"
- "fmla v7.4s, v23.4s, v14.4s\n"
- "ldr q22, [x24, %[input_col_stride1]]\n"
- "fmla v3.4s, v27.4s, v14.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x9, #64]\n"
- "prfm pldl1keep, [%[inptr0], x8]\n"
- "fmla v7.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x24, #64]\n"
- "prfm pldl1keep, [x9, x8]\n"
- "prfm pldl1keep, [%[inptr0], x25]\n"
- "prfm pldl1keep, [x26, #64]\n"
- "prfm pldl1keep, [x24, x8]\n"
- "fmla v7.4s, v27.4s, v12.4s\n"
- "beq 3f\n"
- "2:\n"
- "mov v18.16b, v21.16b\n"
- "ldr q23, [x9, x28]\n"
- "mov v19.16b, v21.16b\n"
- "prfm pldl1keep, [x9, x25]\n"
- "fmla v6.4s, v27.4s, v17.4s\n"
- "prfm pldl1keep, [%[inptr0], x11]\n"
- "fmla v2.4s, v27.4s, v20.4s\n"
- "ldr q24, [%[inptr0], x23]\n"
- "fmla v7.4s, v28.4s, v13.4s\n"
- "prfm pldl1keep, [x10, #64]\n"
- "fmla v6.4s, v28.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x8]\n"
- "fmla v5.4s, v28.4s, v20.4s\n"
- "ldr q26, [x10]\n"
- "fmla v3.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x25]\n"
- "fmla v15.4s, v25.4s, v17.4s\n"
- "prfm pldl1keep, [x9, x11]\n"
- "fmla v0.4s, v25.4s, v20.4s\n"
- "ldr q25, [x26, %[input_col_stride1]]\n"
- "fmla v7.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [%[inptr0], x13]\n"
- "fmla v3.4s, v22.4s, v12.4s\n"
- "prfm pldl1keep, [x27, #64]\n"
- "fmla v6.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [x10, x8]\n"
- "fmla v15.4s, v22.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x25]\n"
- "fmla v2.4s, v22.4s, v17.4s\n"
- "prfm pldl1keep, [x24, x11]\n"
- "fmla v16.4s, v22.4s, v20.4s\n"
- "ldr q22, [x24, x28]\n"
- "fmla v7.4s, v23.4s, v11.4s\n"
- "prfm pldl1keep, [x9, x13]\n"
- "fmla v3.4s, v23.4s, v13.4s\n"
- "prfm pldl1keep, [%[inptr0], x15]\n"
- "fmla v6.4s, v23.4s, v12.4s\n"
- "prfm pldl1keep, [x27, x8]\n"
- "fmla v2.4s, v23.4s, v14.4s\n"
- "prfm pldl1keep, [x10, x25]\n"
- "fmla v5.4s, v23.4s, v17.4s\n"
- "prfm pldl1keep, [x26, x11]\n"
- "fmla v1.4s, v23.4s, v20.4s\n"
- "ldr q23, [x9, x23]\n"
- "fmla v6.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [x24, x13]\n"
- "fmla v5.4s, v24.4s, v14.4s\n"
- "prfm pldl1keep, [x9, x15]\n"
- "fmla v4.4s, v24.4s, v20.4s\n"
- "ldr q24, [%[inptr0], x12]\n"
- "fmla v15.4s, v26.4s, v10.4s\n"
- "prfm pldl1keep, [x27, x25]\n"
- "fmla v0.4s, v26.4s, v17.4s\n"
- "ldr q29, [x27]\n"
- "fmla v3.4s, v25.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x11]\n"
- "fmla v15.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x26, x13]\n"
- "fmla v2.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x15]\n"
- "fmla v0.4s, v25.4s, v14.4s\n"
- "prfm pldl1keep, [x27, x11]\n"
- "fmla v16.4s, v25.4s, v17.4s\n"
- "prfm pldl1keep, [x10, x13]\n"
- "fmla v18.4s, v25.4s, v20.4s\n"
- "ldr q26, [x10, %[input_col_stride1]]\n"
- "fmla v7.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [x26, x15]\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x27, x13]\n"
- "fmla v6.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x15]\n"
- "fmla v15.4s, v22.4s, v13.4s\n"
- "prfm pldl1keep, [x27, x15]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v5.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v16.4s, v22.4s, v14.4s\n"
- "subs x22, x22, #1\n"
- "fmla v1.4s, v22.4s, v17.4s\n"
- "fmla v19.4s, v22.4s, v20.4s\n"
- "mov v22.16b, v21.16b\n"
- "fmla v6.4s, v23.4s, v11.4s\n"
- "fmla v2.4s, v23.4s, v13.4s\n"
- "fmla v5.4s, v23.4s, v12.4s\n"
- "fmla v1.4s, v23.4s, v14.4s\n"
- "fmla v4.4s, v23.4s, v17.4s\n"
- "fmla v22.4s, v23.4s, v20.4s\n"
- "ldr q27, [x26, x28]\n"
- "fmla v5.4s, v24.4s, v13.4s\n"
- "fmla v0.4s, v29.4s, v10.4s\n"
- "mov v23.16b, v21.16b\n"
- "fmla v4.4s, v24.4s, v14.4s\n"
- "mov v25.16b, v21.16b\n"
- "mov v24.16b, v21.16b\n"
- "fmla v15.4s, v26.4s, v9.4s\n"
- "fmla v0.4s, v26.4s, v12.4s\n"
- "fmla v16.4s, v26.4s, v10.4s\n"
- "fmla v18.4s, v26.4s, v17.4s\n"
- "fmla v3.4s, v27.4s, v8.4s\n"
- "ldr q29, [x24, x23]\n"
- "fmla v15.4s, v27.4s, v11.4s\n"
- "fmla v2.4s, v27.4s, v9.4s\n"
- "fmla v0.4s, v27.4s, v13.4s\n"
- "fmla v16.4s, v27.4s, v12.4s\n"
- "fmla v1.4s, v27.4s, v10.4s\n"
- "fmla v18.4s, v27.4s, v14.4s\n"
- "fmla v19.4s, v27.4s, v17.4s\n"
- "fmla v23.4s, v27.4s, v20.4s\n"
- "fmla v6.4s, v29.4s, v8.4s\n"
- "ldr q28, [x9, x12]\n"
- "fmla v2.4s, v29.4s, v11.4s\n"
- "fmla v5.4s, v29.4s, v9.4s\n"
- "fmla v16.4s, v29.4s, v13.4s\n"
- "fmla v1.4s, v29.4s, v12.4s\n"
- "fmla v4.4s, v29.4s, v10.4s\n"
- "fmla v19.4s, v29.4s, v14.4s\n"
- "fmla v22.4s, v29.4s, v17.4s\n"
- "fmla v25.4s, v29.4s, v20.4s\n"
- "fmla v5.4s, v28.4s, v11.4s\n"
- "ldr q21, [%[inptr0], x14]\n"
- "fmla v1.4s, v28.4s, v13.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v4.4s, v28.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v22.4s, v28.4s, v14.4s\n"
- "ldr q26, [x27, %[input_col_stride1]]\n"
- "fmla v0.4s, v26.4s, v9.4s\n"
- "prfm pldl1keep, [%[inptr0], x8]\n"
- "fmla v4.4s, v21.4s, v13.4s\n"
- "ldr q21, [x10, x28]\n"
- "fmla v18.4s, v26.4s, v10.4s\n"
- "ldr q29, [x26, x23]\n"
- "fmla v15.4s, v21.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x25]\n"
- "fmla v0.4s, v21.4s, v11.4s\n"
- "fmla v16.4s, v21.4s, v9.4s\n"
- "fmla v18.4s, v21.4s, v12.4s\n"
- "fmla v19.4s, v21.4s, v10.4s\n"
- "fmla v23.4s, v21.4s, v17.4s\n"
- "ldr q21, [x24, x12]\n"
- "fmla v2.4s, v29.4s, v8.4s\n"
- "fmla v16.4s, v29.4s, v11.4s\n"
- "fmla v1.4s, v29.4s, v9.4s\n"
- "fmla v18.4s, v29.4s, v13.4s\n"
- "fmla v19.4s, v29.4s, v12.4s\n"
- "fmla v22.4s, v29.4s, v10.4s\n"
- "fmla v23.4s, v29.4s, v14.4s\n"
- "fmla v25.4s, v29.4s, v17.4s\n"
- "fmla v24.4s, v29.4s, v20.4s\n"
- "ldr q28, [x9, x14]\n"
- "fmla v5.4s, v21.4s, v8.4s\n"
- "ldr q27, [x27, x28]\n"
- "fmla v1.4s, v21.4s, v11.4s\n"
- "add x9, x9, #16\n"
- "fmla v4.4s, v21.4s, v9.4s\n"
- "prfm pldl1keep, [x9, #64]\n"
- "fmla v19.4s, v21.4s, v13.4s\n"
- "prfm pldl1keep, [x9, x8]\n"
- "fmla v22.4s, v21.4s, v12.4s\n"
- "fmla v25.4s, v21.4s, v14.4s\n"
- "fmla v4.4s, v28.4s, v11.4s\n"
- "ldr q20, [x10, x23]\n"
- "fmla v0.4s, v27.4s, v8.4s\n"
- "fmla v18.4s, v27.4s, v9.4s\n"
- "fmla v22.4s, v28.4s, v13.4s\n"
- "ldr q26, [x26, x12]\n"
- "fmla v23.4s, v27.4s, v10.4s\n"
- "ldr q21, [x24, x14]\n"
- "fmla v16.4s, v20.4s, v8.4s\n"
- "add x24, x24, #16\n"
- "fmla v18.4s, v20.4s, v11.4s\n"
- "prfm pldl1keep, [x24, #64]\n"
- "fmla v19.4s, v20.4s, v9.4s\n"
- "prfm pldl1keep, [x24, x8]\n"
- "fmla v23.4s, v20.4s, v12.4s\n"
- "fmla v25.4s, v20.4s, v10.4s\n"
- "fmla v24.4s, v20.4s, v17.4s\n"
- "ldr q28, [x27, x23]\n"
- "fmla v1.4s, v26.4s, v8.4s\n"
- "ldr q20, [x10, x12]\n"
- "fmla v19.4s, v26.4s, v11.4s\n"
- "fmla v22.4s, v26.4s, v9.4s\n"
- "fmla v23.4s, v26.4s, v13.4s\n"
- "fmla v25.4s, v26.4s, v12.4s\n"
- "fmla v24.4s, v26.4s, v14.4s\n"
- "ldr q17, [x26, x14]\n"
- "fmla v4.4s, v21.4s, v8.4s\n"
- "ldr q26, [x27, x12]\n"
- "fmla v22.4s, v21.4s, v11.4s\n"
- "add x26, x26, #16\n"
- "fmla v25.4s, v21.4s, v13.4s\n"
- "ldr q27, [x10, x14]\n"
- "fmla v18.4s, v28.4s, v8.4s\n"
- "prfm pldl1keep, [x26, #64]\n"
- "fmla v23.4s, v28.4s, v9.4s\n"
- "add x10, x10, #16\n"
- "fmla v24.4s, v28.4s, v10.4s\n"
- "ldr q28, [x27, x14]\n"
- "fmla v19.4s, v20.4s, v8.4s\n"
- "ldr q21, [%[wbptr]]\n"
- "fmla v23.4s, v20.4s, v11.4s\n"
- "add x27, x27, #16\n"
- "fmla v25.4s, v20.4s, v9.4s\n"
- "fmla v24.4s, v20.4s, v12.4s\n"
- "fmla v22.4s, v17.4s, v8.4s\n"
- "ldr q20, [%[wbptr], #16]\n"
- "fmla v23.4s, v26.4s, v8.4s\n"
- "ldr q14, [%[wbptr], #32]\n"
- "fmla v24.4s, v17.4s, v13.4s\n"
- "movi v29.16b, #0\n"
- "fmla v25.4s, v17.4s, v11.4s\n"
- "ldr q17, [%[wbptr], #64]\n"
- "fmax v7.4s, v7.4s, v29.4s\n"
- "fmax v6.4s, v6.4s, v29.4s\n"
- "fmla v24.4s, v26.4s, v9.4s\n"
- "ldr q13, [%[wbptr], #48]\n"
- "str q7, [%[outptr0]]\n"
- "fmla v25.4s, v27.4s, v8.4s\n"
- "str q6, [%[outptr0], %[output_col_stride1]]\n"
- "fmax v5.4s, v5.4s, v29.4s\n"
- "fmla v24.4s, v27.4s, v11.4s\n"
- "ldr q12, [%[wbptr], #80]\n"
- "str q5, [%[outptr0], x19]\n"
- "fmax v4.4s, v4.4s, v29.4s\n"
- "fmax v3.4s, v3.4s, v29.4s\n"
- "ldr q10, [%[wbptr], #112]\n"
- "str q4, [%[outptr0], x20]\n"
- "fmla v24.4s, v28.4s, v8.4s\n"
- "str q3, [x16]\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "ldr q11, [%[wbptr], #96]\n"
- "str q2, [x16, %[output_col_stride1]]\n"
- "fmax v22.4s, v22.4s, v29.4s\n"
- "str q1, [x16, x19]\n"
- "fmax v15.4s, v15.4s, v29.4s\n"
- "str q22, [x16, x20]\n"
- "fmax v16.4s, v16.4s, v29.4s\n"
- "str q15, [x17]\n"
- "fmax v19.4s, v19.4s, v29.4s\n"
- "str q16, [x17, %[output_col_stride1]]\n"
- "fmax v25.4s, v25.4s, v29.4s\n"
- "str q19, [x17, x19]\n"
- "fmax v0.4s, v0.4s, v29.4s\n"
- "str q25, [x17, x20]\n"
- "fmax v18.4s, v18.4s, v29.4s\n"
- "str q0, [x7]\n"
- "fmax v23.4s, v23.4s, v29.4s\n"
- "str q18, [x7, %[output_col_stride1]]\n"
- "fmax v24.4s, v24.4s, v29.4s\n"
- "str q23, [x7, x19]\n"
- "mov v7.16b, v21.16b\n"
- "str q24, [x7, x20]\n"
- "mov v3.16b, v21.16b\n"
- "mov v6.16b, v21.16b\n"
- "ldr q9, [%[wbptr], #128]\n"
- "mov v15.16b, v21.16b\n"
- "ldr q8, [%[wbptr], #144]\n"
- "mov v2.16b, v21.16b\n"
- "ldr q22, [%[inptr0]]\n"
- "mov v5.16b, v21.16b\n"
- "ldr q19, [x9]\n"
- "mov v0.16b, v21.16b\n"
- "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
- "mov v16.16b, v21.16b\n"
- "ldr q18, [x24]\n"
- "mov v1.16b, v21.16b\n"
- "ldr q27, [x9, %[input_col_stride1]]\n"
- "mov v4.16b, v21.16b\n"
- "ldr q28, [%[inptr0], x28]\n"
- "fmla v7.4s, v22.4s, v20.4s\n"
- "ldr q25, [x26]\n"
- "fmla v3.4s, v19.4s, v20.4s\n"
- "ldr q22, [x24, %[input_col_stride1]]\n"
- "fmla v6.4s, v23.4s, v20.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmla v7.4s, v19.4s, v17.4s\n"
- "add x16, x16, #16\n"
- "fmla v3.4s, v18.4s, v17.4s\n"
- "add x17, x17, #16\n"
- "fmla v15.4s, v18.4s, v20.4s\n"
- "add x7, x7, #16\n"
- "fmla v7.4s, v23.4s, v14.4s\n"
- "fmla v3.4s, v27.4s, v14.4s\n"
- "fmla v7.4s, v18.4s, v10.4s\n"
- "fmla v7.4s, v27.4s, v12.4s\n"
- "bne 2b\n"
- "3:\n"
- "mov v18.16b, v21.16b\n"
- "ldr q23, [x9, x28]\n"
- "mov v19.16b, v21.16b\n"
- "prfm pldl1keep, [x9, x25]\n"
- "fmla v6.4s, v27.4s, v17.4s\n"
- "prfm pldl1keep, [%[inptr0], x11]\n"
- "fmla v2.4s, v27.4s, v20.4s\n"
- "ldr q24, [%[inptr0], x23]\n"
- "fmla v7.4s, v28.4s, v13.4s\n"
- "prfm pldl1keep, [x10, #64]\n"
- "fmla v6.4s, v28.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x8]\n"
- "fmla v5.4s, v28.4s, v20.4s\n"
- "ldr q26, [x10]\n"
- "fmla v3.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x25]\n"
- "fmla v15.4s, v25.4s, v17.4s\n"
- "prfm pldl1keep, [x9, x11]\n"
- "fmla v0.4s, v25.4s, v20.4s\n"
- "ldr q25, [x26, %[input_col_stride1]]\n"
- "fmla v7.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [%[inptr0], x13]\n"
- "fmla v3.4s, v22.4s, v12.4s\n"
- "prfm pldl1keep, [x27, #64]\n"
- "fmla v6.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [x10, x8]\n"
- "fmla v15.4s, v22.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x25]\n"
- "fmla v2.4s, v22.4s, v17.4s\n"
- "prfm pldl1keep, [x24, x11]\n"
- "fmla v16.4s, v22.4s, v20.4s\n"
- "ldr q22, [x24, x28]\n"
- "fmla v7.4s, v23.4s, v11.4s\n"
- "prfm pldl1keep, [x9, x13]\n"
- "fmla v3.4s, v23.4s, v13.4s\n"
- "prfm pldl1keep, [%[inptr0], x15]\n"
- "fmla v6.4s, v23.4s, v12.4s\n"
- "prfm pldl1keep, [x27, x8]\n"
- "fmla v2.4s, v23.4s, v14.4s\n"
- "prfm pldl1keep, [x10, x25]\n"
- "fmla v5.4s, v23.4s, v17.4s\n"
- "prfm pldl1keep, [x26, x11]\n"
- "fmla v1.4s, v23.4s, v20.4s\n"
- "ldr q23, [x9, x23]\n"
- "fmla v6.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [x24, x13]\n"
- "fmla v5.4s, v24.4s, v14.4s\n"
- "prfm pldl1keep, [x9, x15]\n"
- "fmla v4.4s, v24.4s, v20.4s\n"
- "ldr q24, [%[inptr0], x12]\n"
- "fmla v15.4s, v26.4s, v10.4s\n"
- "prfm pldl1keep, [x27, x25]\n"
- "fmla v0.4s, v26.4s, v17.4s\n"
- "ldr q29, [x27]\n"
- "fmla v3.4s, v25.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x11]\n"
- "fmla v15.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x26, x13]\n"
- "fmla v2.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x15]\n"
- "fmla v0.4s, v25.4s, v14.4s\n"
- "prfm pldl1keep, [x27, x11]\n"
- "fmla v16.4s, v25.4s, v17.4s\n"
- "prfm pldl1keep, [x10, x13]\n"
- "fmla v18.4s, v25.4s, v20.4s\n"
- "ldr q26, [x10, %[input_col_stride1]]\n"
- "fmla v7.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [x26, x15]\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x27, x13]\n"
- "fmla v6.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x15]\n"
- "fmla v15.4s, v22.4s, v13.4s\n"
- "prfm pldl1keep, [x27, x15]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v5.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v16.4s, v22.4s, v14.4s\n"
- "fmla v1.4s, v22.4s, v17.4s\n"
- "fmla v19.4s, v22.4s, v20.4s\n"
- "ldr q27, [x26, x28]\n"
- "fmla v6.4s, v23.4s, v11.4s\n"
- "fmla v2.4s, v23.4s, v13.4s\n"
- "fmla v5.4s, v23.4s, v12.4s\n"
- "fmla v1.4s, v23.4s, v14.4s\n"
- "fmla v4.4s, v23.4s, v17.4s\n"
- "fmla v0.4s, v29.4s, v10.4s\n"
- "mov v22.16b, v21.16b\n"
- "fmla v15.4s, v26.4s, v9.4s\n"
- "fmla v5.4s, v24.4s, v13.4s\n"
- "fmla v16.4s, v26.4s, v10.4s\n"
- "fmla v22.4s, v23.4s, v20.4s\n"
- "ldr q29, [x24, x23]\n"
- "fmla v4.4s, v24.4s, v14.4s\n"
- "ldr q28, [x9, x12]\n"
- "fmla v0.4s, v26.4s, v12.4s\n"
- "fmla v18.4s, v26.4s, v17.4s\n"
- "mov v23.16b, v21.16b\n"
- "fmla v3.4s, v27.4s, v8.4s\n"
- "fmla v15.4s, v27.4s, v11.4s\n"
- "fmla v2.4s, v27.4s, v9.4s\n"
- "fmla v0.4s, v27.4s, v13.4s\n"
- "fmla v16.4s, v27.4s, v12.4s\n"
- "fmla v1.4s, v27.4s, v10.4s\n"
- "fmla v18.4s, v27.4s, v14.4s\n"
- "fmla v19.4s, v27.4s, v17.4s\n"
- "fmla v23.4s, v27.4s, v20.4s\n"
- "mov v25.16b, v21.16b\n"
- "mov v24.16b, v21.16b\n"
- "fmla v6.4s, v29.4s, v8.4s\n"
- "fmla v2.4s, v29.4s, v11.4s\n"
- "fmla v5.4s, v29.4s, v9.4s\n"
- "fmla v16.4s, v29.4s, v13.4s\n"
- "fmla v1.4s, v29.4s, v12.4s\n"
- "fmla v4.4s, v29.4s, v10.4s\n"
- "fmla v19.4s, v29.4s, v14.4s\n"
- "fmla v22.4s, v29.4s, v17.4s\n"
- "fmla v25.4s, v29.4s, v20.4s\n"
- "ldr q21, [%[inptr0], x14]\n"
- "fmla v5.4s, v28.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v1.4s, v28.4s, v13.4s\n"
- "fmla v4.4s, v28.4s, v12.4s\n"
- "fmla v22.4s, v28.4s, v14.4s\n"
- "ldr q26, [x27, %[input_col_stride1]]\n"
- "fmla v0.4s, v26.4s, v9.4s\n"
- "fmla v18.4s, v26.4s, v10.4s\n"
- "fmla v4.4s, v21.4s, v13.4s\n"
- "ldr q21, [x10, x28]\n"
- "fmla v15.4s, v21.4s, v8.4s\n"
- "ldr q29, [x26, x23]\n"
- "fmla v0.4s, v21.4s, v11.4s\n"
- "fmla v16.4s, v21.4s, v9.4s\n"
- "fmla v18.4s, v21.4s, v12.4s\n"
- "fmla v19.4s, v21.4s, v10.4s\n"
- "fmla v23.4s, v21.4s, v17.4s\n"
- "ldr q21, [x24, x12]\n"
- "fmla v2.4s, v29.4s, v8.4s\n"
- "fmla v16.4s, v29.4s, v11.4s\n"
- "fmla v1.4s, v29.4s, v9.4s\n"
- "fmla v18.4s, v29.4s, v13.4s\n"
- "fmla v19.4s, v29.4s, v12.4s\n"
- "fmla v22.4s, v29.4s, v10.4s\n"
- "fmla v23.4s, v29.4s, v14.4s\n"
- "fmla v25.4s, v29.4s, v17.4s\n"
- "fmla v24.4s, v29.4s, v20.4s\n"
- "ldr q28, [x9, x14]\n"
- "fmla v5.4s, v21.4s, v8.4s\n"
- "ldr q27, [x27, x28]\n"
- "fmla v1.4s, v21.4s, v11.4s\n"
- "add x9, x9, #16\n"
- "fmla v4.4s, v21.4s, v9.4s\n"
- "fmla v19.4s, v21.4s, v13.4s\n"
- "fmla v22.4s, v21.4s, v12.4s\n"
- "fmla v25.4s, v21.4s, v14.4s\n"
- "fmla v0.4s, v27.4s, v8.4s\n"
- "ldr q20, [x10, x23]\n"
- "fmla v4.4s, v28.4s, v11.4s\n"
- "fmla v18.4s, v27.4s, v9.4s\n"
- "fmla v22.4s, v28.4s, v13.4s\n"
- "ldr q26, [x26, x12]\n"
- "fmla v23.4s, v27.4s, v10.4s\n"
- "ldr q21, [x24, x14]\n"
- "fmla v16.4s, v20.4s, v8.4s\n"
- "add x24, x24, #16\n"
- "fmla v18.4s, v20.4s, v11.4s\n"
- "fmla v19.4s, v20.4s, v9.4s\n"
- "fmla v23.4s, v20.4s, v12.4s\n"
- "fmla v25.4s, v20.4s, v10.4s\n"
- "fmla v24.4s, v20.4s, v17.4s\n"
- "ldr q28, [x27, x23]\n"
- "fmla v1.4s, v26.4s, v8.4s\n"
- "ldr q20, [x10, x12]\n"
- "fmla v19.4s, v26.4s, v11.4s\n"
- "fmla v22.4s, v26.4s, v9.4s\n"
- "fmla v23.4s, v26.4s, v13.4s\n"
- "fmla v25.4s, v26.4s, v12.4s\n"
- "fmla v24.4s, v26.4s, v14.4s\n"
- "ldr q17, [x26, x14]\n"
- "fmla v4.4s, v21.4s, v8.4s\n"
- "ldr q26, [x27, x12]\n"
- "fmla v22.4s, v21.4s, v11.4s\n"
- "add x26, x26, #16\n"
- "fmla v25.4s, v21.4s, v13.4s\n"
- "ldr q27, [x10, x14]\n"
- "fmla v18.4s, v28.4s, v8.4s\n"
- "add x10, x10, #16\n"
- "fmla v23.4s, v28.4s, v9.4s\n"
- "fmla v24.4s, v28.4s, v10.4s\n"
- "fmla v19.4s, v20.4s, v8.4s\n"
- "ldr q28, [x27, x14]\n"
- "fmla v25.4s, v20.4s, v9.4s\n"
- "add x27, x27, #16\n"
- "fmla v23.4s, v20.4s, v11.4s\n"
- "fmla v24.4s, v20.4s, v12.4s\n"
- "fmla v22.4s, v17.4s, v8.4s\n"
- "movi v29.16b, #0\n"
- "fmla v25.4s, v17.4s, v11.4s\n"
- "fmla v24.4s, v17.4s, v13.4s\n"
- "fmla v23.4s, v26.4s, v8.4s\n"
- "fmax v7.4s, v7.4s, v29.4s\n"
- "fmla v25.4s, v27.4s, v8.4s\n"
- "fmax v6.4s, v6.4s, v29.4s\n"
- "str q7, [%[outptr0]]\n"
- "fmla v24.4s, v26.4s, v9.4s\n"
- "str q6, [%[outptr0], %[output_col_stride1]]\n"
- "fmax v5.4s, v5.4s, v29.4s\n"
- "fmax v4.4s, v4.4s, v29.4s\n"
- "fmax v3.4s, v3.4s, v29.4s\n"
- "str q5, [%[outptr0], x19]\n"
- "fmla v24.4s, v27.4s, v11.4s\n"
- "str q4, [%[outptr0], x20]\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "str q3, [x16]\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "str q2, [x16, %[output_col_stride1]]\n"
- "fmla v24.4s, v28.4s, v8.4s\n"
- "str q1, [x16, x19]\n"
- "fmax v22.4s, v22.4s, v29.4s\n"
- "fmax v15.4s, v15.4s, v29.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "str q22, [x16, x20]\n"
- "fmax v16.4s, v16.4s, v29.4s\n"
- "str q15, [x17]\n"
- "fmax v19.4s, v19.4s, v29.4s\n"
- "str q16, [x17, %[output_col_stride1]]\n"
- "fmax v25.4s, v25.4s, v29.4s\n"
- "str q19, [x17, x19]\n"
- "fmax v0.4s, v0.4s, v29.4s\n"
- "str q25, [x17, x20]\n"
- "fmax v18.4s, v18.4s, v29.4s\n"
- "str q0, [x7]\n"
- "fmax v23.4s, v23.4s, v29.4s\n"
- "str q18, [x7, %[output_col_stride1]]\n"
- "fmax v24.4s, v24.4s, v29.4s\n"
- "str q23, [x7, x19]\n"
- "add x16, x16, #16\n"
- "str q24, [x7, x20]\n"
- "add x17, x17, #16\n"
- "add x7, x7, #16\n"
- "4:\n"
- "cbz x21, 7f\n"
- "ldr s21, [%[wbptr]]\n"
- "mov v7.16b, v21.16b\n"
- "ldr s20, [%[wbptr], #4]\n"
- "mov v3.16b, v21.16b\n"
- "ldr s14, [%[wbptr], #8]\n"
- "mov v6.16b, v21.16b\n"
- "ldr s13, [%[wbptr], #12]\n"
- "mov v15.16b, v21.16b\n"
- "ldr s17, [%[wbptr], #16]\n"
- "mov v2.16b, v21.16b\n"
- "ldr s12, [%[wbptr], #20]\n"
- "mov v5.16b, v21.16b\n"
- "ldr s11, [%[wbptr], #24]\n"
- "mov v0.16b, v21.16b\n"
- "ldr s10, [%[wbptr], #28]\n"
- "mov v16.16b, v21.16b\n"
- "ldr s9, [%[wbptr], #32]\n"
- "mov v1.16b, v21.16b\n"
- "ldr s8, [%[wbptr], #36]\n"
- "mov v4.16b, v21.16b\n"
- "ldr s22, [%[inptr0]]\n"
- "fmla v7.4s, v22.4s, v20.4s\n"
- "ldr s19, [x9]\n"
- "fmla v3.4s, v19.4s, v20.4s\n"
- "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v6.4s, v23.4s, v20.4s\n"
- "ldr s18, [x24]\n"
- "fmla v7.4s, v19.4s, v17.4s\n"
- "ldr s27, [x9, %[input_col_stride1]]\n"
- "fmla v3.4s, v18.4s, v17.4s\n"
- "ldr s28, [%[inptr0], x28]\n"
- "fmla v15.4s, v18.4s, v20.4s\n"
- "ldr s25, [x26]\n"
- "fmla v7.4s, v23.4s, v14.4s\n"
- "ldr s22, [x24, %[input_col_stride1]]\n"
- "fmla v3.4s, v27.4s, v14.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x9, #64]\n"
- "subs x21, x21, #1\n"
- "prfm pldl1keep, [%[inptr0], x8]\n"
- "prfm pldl1keep, [x24, #64]\n"
- "fmla v7.4s, v18.4s, v10.4s\n"
- "prfm pldl1keep, [x9, x8]\n"
- "prfm pldl1keep, [%[inptr0], x25]\n"
- "prfm pldl1keep, [x26, #64]\n"
- "prfm pldl1keep, [x24, x8]\n"
- "fmla v7.4s, v27.4s, v12.4s\n"
- "beq 6f\n"
- "5:\n"
- "mov v18.16b, v21.16b\n"
- "ldr s23, [x9, x28]\n"
- "mov v19.16b, v21.16b\n"
- "prfm pldl1keep, [x9, x25]\n"
- "fmla v6.4s, v27.4s, v17.4s\n"
- "prfm pldl1keep, [%[inptr0], x11]\n"
- "fmla v2.4s, v27.4s, v20.4s\n"
- "ldr s24, [%[inptr0], x23]\n"
- "fmla v7.4s, v28.4s, v13.4s\n"
- "prfm pldl1keep, [x10, #64]\n"
- "fmla v6.4s, v28.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x8]\n"
- "fmla v5.4s, v28.4s, v20.4s\n"
- "ldr s26, [x10]\n"
- "fmla v3.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x25]\n"
- "fmla v15.4s, v25.4s, v17.4s\n"
- "prfm pldl1keep, [x9, x11]\n"
- "fmla v0.4s, v25.4s, v20.4s\n"
- "ldr s25, [x26, %[input_col_stride1]]\n"
- "fmla v7.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [%[inptr0], x13]\n"
- "fmla v3.4s, v22.4s, v12.4s\n"
- "prfm pldl1keep, [x27, #64]\n"
- "fmla v6.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [x10, x8]\n"
- "fmla v15.4s, v22.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x25]\n"
- "fmla v2.4s, v22.4s, v17.4s\n"
- "prfm pldl1keep, [x24, x11]\n"
- "fmla v16.4s, v22.4s, v20.4s\n"
- "ldr s22, [x24, x28]\n"
- "fmla v7.4s, v23.4s, v11.4s\n"
- "prfm pldl1keep, [x9, x13]\n"
- "fmla v3.4s, v23.4s, v13.4s\n"
- "prfm pldl1keep, [%[inptr0], x15]\n"
- "fmla v6.4s, v23.4s, v12.4s\n"
- "prfm pldl1keep, [x27, x8]\n"
- "fmla v2.4s, v23.4s, v14.4s\n"
- "prfm pldl1keep, [x10, x25]\n"
- "fmla v5.4s, v23.4s, v17.4s\n"
- "prfm pldl1keep, [x26, x11]\n"
- "fmla v1.4s, v23.4s, v20.4s\n"
- "ldr s23, [x9, x23]\n"
- "fmla v6.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [x24, x13]\n"
- "fmla v5.4s, v24.4s, v14.4s\n"
- "prfm pldl1keep, [x9, x15]\n"
- "fmla v4.4s, v24.4s, v20.4s\n"
- "ldr s24, [%[inptr0], x12]\n"
- "fmla v15.4s, v26.4s, v10.4s\n"
- "prfm pldl1keep, [x27, x25]\n"
- "fmla v0.4s, v26.4s, v17.4s\n"
- "ldr s29, [x27]\n"
- "fmla v3.4s, v25.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x11]\n"
- "fmla v15.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x26, x13]\n"
- "fmla v2.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x15]\n"
- "fmla v0.4s, v25.4s, v14.4s\n"
- "prfm pldl1keep, [x27, x11]\n"
- "fmla v16.4s, v25.4s, v17.4s\n"
- "prfm pldl1keep, [x10, x13]\n"
- "fmla v18.4s, v25.4s, v20.4s\n"
- "ldr s26, [x10, %[input_col_stride1]]\n"
- "fmla v7.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [x26, x15]\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x27, x13]\n"
- "fmla v6.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x15]\n"
- "fmla v15.4s, v22.4s, v13.4s\n"
- "prfm pldl1keep, [x27, x15]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v5.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v16.4s, v22.4s, v14.4s\n"
- "subs x21, x21, #1\n"
- "fmla v1.4s, v22.4s, v17.4s\n"
- "fmla v19.4s, v22.4s, v20.4s\n"
- "mov v22.16b, v21.16b\n"
- "fmla v6.4s, v23.4s, v11.4s\n"
- "fmla v2.4s, v23.4s, v13.4s\n"
- "fmla v5.4s, v23.4s, v12.4s\n"
- "fmla v1.4s, v23.4s, v14.4s\n"
- "fmla v4.4s, v23.4s, v17.4s\n"
- "fmla v22.4s, v23.4s, v20.4s\n"
- "ldr s27, [x26, x28]\n"
- "fmla v5.4s, v24.4s, v13.4s\n"
- "fmla v0.4s, v29.4s, v10.4s\n"
- "mov v23.16b, v21.16b\n"
- "fmla v4.4s, v24.4s, v14.4s\n"
- "mov v25.16b, v21.16b\n"
- "mov v24.16b, v21.16b\n"
- "fmla v15.4s, v26.4s, v9.4s\n"
- "fmla v0.4s, v26.4s, v12.4s\n"
- "fmla v16.4s, v26.4s, v10.4s\n"
- "fmla v18.4s, v26.4s, v17.4s\n"
- "fmla v3.4s, v27.4s, v8.4s\n"
- "ldr s29, [x24, x23]\n"
- "fmla v15.4s, v27.4s, v11.4s\n"
- "fmla v2.4s, v27.4s, v9.4s\n"
- "fmla v0.4s, v27.4s, v13.4s\n"
- "fmla v16.4s, v27.4s, v12.4s\n"
- "fmla v1.4s, v27.4s, v10.4s\n"
- "fmla v18.4s, v27.4s, v14.4s\n"
- "fmla v19.4s, v27.4s, v17.4s\n"
- "fmla v23.4s, v27.4s, v20.4s\n"
- "fmla v6.4s, v29.4s, v8.4s\n"
- "ldr s28, [x9, x12]\n"
- "fmla v2.4s, v29.4s, v11.4s\n"
- "fmla v5.4s, v29.4s, v9.4s\n"
- "fmla v16.4s, v29.4s, v13.4s\n"
- "fmla v1.4s, v29.4s, v12.4s\n"
- "fmla v4.4s, v29.4s, v10.4s\n"
- "fmla v19.4s, v29.4s, v14.4s\n"
- "fmla v22.4s, v29.4s, v17.4s\n"
- "fmla v25.4s, v29.4s, v20.4s\n"
- "fmla v5.4s, v28.4s, v11.4s\n"
- "ldr s21, [%[inptr0], x14]\n"
- "fmla v1.4s, v28.4s, v13.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v4.4s, v28.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v22.4s, v28.4s, v14.4s\n"
- "ldr s26, [x27, %[input_col_stride1]]\n"
- "fmla v0.4s, v26.4s, v9.4s\n"
- "prfm pldl1keep, [%[inptr0], x8]\n"
- "fmla v4.4s, v21.4s, v13.4s\n"
- "ldr s21, [x10, x28]\n"
- "fmla v18.4s, v26.4s, v10.4s\n"
- "ldr s29, [x26, x23]\n"
- "fmla v15.4s, v21.4s, v8.4s\n"
- "prfm pldl1keep, [%[inptr0], x25]\n"
- "fmla v0.4s, v21.4s, v11.4s\n"
- "fmla v16.4s, v21.4s, v9.4s\n"
- "fmla v18.4s, v21.4s, v12.4s\n"
- "fmla v19.4s, v21.4s, v10.4s\n"
- "fmla v23.4s, v21.4s, v17.4s\n"
- "ldr s21, [x24, x12]\n"
- "fmla v2.4s, v29.4s, v8.4s\n"
- "fmla v16.4s, v29.4s, v11.4s\n"
- "fmla v1.4s, v29.4s, v9.4s\n"
- "fmla v18.4s, v29.4s, v13.4s\n"
- "fmla v19.4s, v29.4s, v12.4s\n"
- "fmla v22.4s, v29.4s, v10.4s\n"
- "fmla v23.4s, v29.4s, v14.4s\n"
- "fmla v25.4s, v29.4s, v17.4s\n"
- "fmla v24.4s, v29.4s, v20.4s\n"
- "ldr s28, [x9, x14]\n"
- "fmla v5.4s, v21.4s, v8.4s\n"
- "ldr s27, [x27, x28]\n"
- "fmla v1.4s, v21.4s, v11.4s\n"
- "add x9, x9, #4\n"
- "fmla v4.4s, v21.4s, v9.4s\n"
- "prfm pldl1keep, [x9, #64]\n"
- "fmla v19.4s, v21.4s, v13.4s\n"
- "prfm pldl1keep, [x9, x8]\n"
- "fmla v22.4s, v21.4s, v12.4s\n"
- "fmla v25.4s, v21.4s, v14.4s\n"
- "fmla v4.4s, v28.4s, v11.4s\n"
- "ldr s20, [x10, x23]\n"
- "fmla v0.4s, v27.4s, v8.4s\n"
- "fmla v18.4s, v27.4s, v9.4s\n"
- "fmla v22.4s, v28.4s, v13.4s\n"
- "ldr s26, [x26, x12]\n"
- "fmla v23.4s, v27.4s, v10.4s\n"
- "ldr s21, [x24, x14]\n"
- "fmla v16.4s, v20.4s, v8.4s\n"
- "add x24, x24, #4\n"
- "fmla v18.4s, v20.4s, v11.4s\n"
- "prfm pldl1keep, [x24, #64]\n"
- "fmla v19.4s, v20.4s, v9.4s\n"
- "prfm pldl1keep, [x24, x8]\n"
- "fmla v23.4s, v20.4s, v12.4s\n"
- "fmla v25.4s, v20.4s, v10.4s\n"
- "fmla v24.4s, v20.4s, v17.4s\n"
- "ldr s28, [x27, x23]\n"
- "fmla v1.4s, v26.4s, v8.4s\n"
- "ldr s20, [x10, x12]\n"
- "fmla v19.4s, v26.4s, v11.4s\n"
- "fmla v22.4s, v26.4s, v9.4s\n"
- "fmla v23.4s, v26.4s, v13.4s\n"
- "fmla v25.4s, v26.4s, v12.4s\n"
- "fmla v24.4s, v26.4s, v14.4s\n"
- "ldr s17, [x26, x14]\n"
- "fmla v4.4s, v21.4s, v8.4s\n"
- "ldr s26, [x27, x12]\n"
- "fmla v22.4s, v21.4s, v11.4s\n"
- "add x26, x26, #4\n"
- "fmla v25.4s, v21.4s, v13.4s\n"
- "ldr s27, [x10, x14]\n"
- "fmla v18.4s, v28.4s, v8.4s\n"
- "prfm pldl1keep, [x26, #64]\n"
- "fmla v23.4s, v28.4s, v9.4s\n"
- "add x10, x10, #4\n"
- "fmla v24.4s, v28.4s, v10.4s\n"
- "ldr s28, [x27, x14]\n"
- "fmla v19.4s, v20.4s, v8.4s\n"
- "ldr s21, [%[wbptr]]\n"
- "fmla v23.4s, v20.4s, v11.4s\n"
- "add x27, x27, #4\n"
- "fmla v25.4s, v20.4s, v9.4s\n"
- "fmla v24.4s, v20.4s, v12.4s\n"
- "fmla v22.4s, v17.4s, v8.4s\n"
- "ldr s20, [%[wbptr], #4]\n"
- "fmla v23.4s, v26.4s, v8.4s\n"
- "ldr s14, [%[wbptr], #8]\n"
- "fmla v24.4s, v17.4s, v13.4s\n"
- "movi v29.16b, #0\n"
- "fmla v25.4s, v17.4s, v11.4s\n"
- "ldr s17, [%[wbptr], #16]\n"
- "fmax v7.4s, v7.4s, v29.4s\n"
- "fmax v6.4s, v6.4s, v29.4s\n"
- "fmla v24.4s, v26.4s, v9.4s\n"
- "ldr s13, [%[wbptr], #12]\n"
- "str s7, [%[outptr0]]\n"
- "fmla v25.4s, v27.4s, v8.4s\n"
- "str s6, [%[outptr0], %[output_col_stride1]]\n"
- "fmax v5.4s, v5.4s, v29.4s\n"
- "fmla v24.4s, v27.4s, v11.4s\n"
- "ldr s12, [%[wbptr], #20]\n"
- "str s5, [%[outptr0], x19]\n"
- "fmax v4.4s, v4.4s, v29.4s\n"
- "fmax v3.4s, v3.4s, v29.4s\n"
- "ldr s10, [%[wbptr], #28]\n"
- "str s4, [%[outptr0], x20]\n"
- "fmla v24.4s, v28.4s, v8.4s\n"
- "str s3, [x16]\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "ldr s11, [%[wbptr], #24]\n"
- "str s2, [x16, %[output_col_stride1]]\n"
- "fmax v22.4s, v22.4s, v29.4s\n"
- "str s1, [x16, x19]\n"
- "fmax v15.4s, v15.4s, v29.4s\n"
- "str s22, [x16, x20]\n"
- "fmax v16.4s, v16.4s, v29.4s\n"
- "str s15, [x17]\n"
- "fmax v19.4s, v19.4s, v29.4s\n"
- "str s16, [x17, %[output_col_stride1]]\n"
- "fmax v25.4s, v25.4s, v29.4s\n"
- "str s19, [x17, x19]\n"
- "fmax v0.4s, v0.4s, v29.4s\n"
- "str s25, [x17, x20]\n"
- "fmax v18.4s, v18.4s, v29.4s\n"
- "str s0, [x7]\n"
- "fmax v23.4s, v23.4s, v29.4s\n"
- "str s18, [x7, %[output_col_stride1]]\n"
- "fmax v24.4s, v24.4s, v29.4s\n"
- "str s23, [x7, x19]\n"
- "mov v7.16b, v21.16b\n"
- "str s24, [x7, x20]\n"
- "mov v3.16b, v21.16b\n"
- "mov v6.16b, v21.16b\n"
- "ldr s9, [%[wbptr], #32]\n"
- "mov v15.16b, v21.16b\n"
- "ldr s8, [%[wbptr], #36]\n"
- "mov v2.16b, v21.16b\n"
- "ldr s22, [%[inptr0]]\n"
- "mov v5.16b, v21.16b\n"
- "ldr s19, [x9]\n"
- "mov v0.16b, v21.16b\n"
- "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
- "mov v16.16b, v21.16b\n"
- "ldr s18, [x24]\n"
- "mov v1.16b, v21.16b\n"
- "ldr s27, [x9, %[input_col_stride1]]\n"
- "mov v4.16b, v21.16b\n"
- "ldr s28, [%[inptr0], x28]\n"
- "fmla v7.4s, v22.4s, v20.4s\n"
- "ldr s25, [x26]\n"
- "fmla v3.4s, v19.4s, v20.4s\n"
- "ldr s22, [x24, %[input_col_stride1]]\n"
- "fmla v6.4s, v23.4s, v20.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmla v7.4s, v19.4s, v17.4s\n"
- "add x16, x16, #4\n"
- "fmla v3.4s, v18.4s, v17.4s\n"
- "add x17, x17, #4\n"
- "fmla v15.4s, v18.4s, v20.4s\n"
- "add x7, x7, #4\n"
- "fmla v7.4s, v23.4s, v14.4s\n"
- "fmla v3.4s, v27.4s, v14.4s\n"
- "fmla v7.4s, v18.4s, v10.4s\n"
- "fmla v7.4s, v27.4s, v12.4s\n"
- "bne 5b\n"
- "6:\n"
- "mov v18.16b, v21.16b\n"
- "ldr s23, [x9, x28]\n"
- "mov v19.16b, v21.16b\n"
- "prfm pldl1keep, [x9, x25]\n"
- "fmla v6.4s, v27.4s, v17.4s\n"
- "prfm pldl1keep, [%[inptr0], x11]\n"
- "fmla v2.4s, v27.4s, v20.4s\n"
- "ldr s24, [%[inptr0], x23]\n"
- "fmla v7.4s, v28.4s, v13.4s\n"
- "prfm pldl1keep, [x10, #64]\n"
- "fmla v6.4s, v28.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x8]\n"
- "fmla v5.4s, v28.4s, v20.4s\n"
- "ldr s26, [x10]\n"
- "fmla v3.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x25]\n"
- "fmla v15.4s, v25.4s, v17.4s\n"
- "prfm pldl1keep, [x9, x11]\n"
- "fmla v0.4s, v25.4s, v20.4s\n"
- "ldr s25, [x26, %[input_col_stride1]]\n"
- "fmla v7.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [%[inptr0], x13]\n"
- "fmla v3.4s, v22.4s, v12.4s\n"
- "prfm pldl1keep, [x27, #64]\n"
- "fmla v6.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [x10, x8]\n"
- "fmla v15.4s, v22.4s, v14.4s\n"
- "prfm pldl1keep, [x26, x25]\n"
- "fmla v2.4s, v22.4s, v17.4s\n"
- "prfm pldl1keep, [x24, x11]\n"
- "fmla v16.4s, v22.4s, v20.4s\n"
- "ldr s22, [x24, x28]\n"
- "fmla v7.4s, v23.4s, v11.4s\n"
- "prfm pldl1keep, [x9, x13]\n"
- "fmla v3.4s, v23.4s, v13.4s\n"
- "prfm pldl1keep, [%[inptr0], x15]\n"
- "fmla v6.4s, v23.4s, v12.4s\n"
- "prfm pldl1keep, [x27, x8]\n"
- "fmla v2.4s, v23.4s, v14.4s\n"
- "prfm pldl1keep, [x10, x25]\n"
- "fmla v5.4s, v23.4s, v17.4s\n"
- "prfm pldl1keep, [x26, x11]\n"
- "fmla v1.4s, v23.4s, v20.4s\n"
- "ldr s23, [x9, x23]\n"
- "fmla v6.4s, v24.4s, v13.4s\n"
- "prfm pldl1keep, [x24, x13]\n"
- "fmla v5.4s, v24.4s, v14.4s\n"
- "prfm pldl1keep, [x9, x15]\n"
- "fmla v4.4s, v24.4s, v20.4s\n"
- "ldr s24, [%[inptr0], x12]\n"
- "fmla v15.4s, v26.4s, v10.4s\n"
- "prfm pldl1keep, [x27, x25]\n"
- "fmla v0.4s, v26.4s, v17.4s\n"
- "ldr s29, [x27]\n"
- "fmla v3.4s, v25.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x11]\n"
- "fmla v15.4s, v25.4s, v12.4s\n"
- "prfm pldl1keep, [x26, x13]\n"
- "fmla v2.4s, v25.4s, v10.4s\n"
- "prfm pldl1keep, [x24, x15]\n"
- "fmla v0.4s, v25.4s, v14.4s\n"
- "prfm pldl1keep, [x27, x11]\n"
- "fmla v16.4s, v25.4s, v17.4s\n"
- "prfm pldl1keep, [x10, x13]\n"
- "fmla v18.4s, v25.4s, v20.4s\n"
- "ldr s26, [x10, %[input_col_stride1]]\n"
- "fmla v7.4s, v22.4s, v8.4s\n"
- "prfm pldl1keep, [x26, x15]\n"
- "fmla v3.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x27, x13]\n"
- "fmla v6.4s, v22.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x15]\n"
- "fmla v15.4s, v22.4s, v13.4s\n"
- "prfm pldl1keep, [x27, x15]\n"
- "fmla v2.4s, v22.4s, v12.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v5.4s, v22.4s, v10.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v16.4s, v22.4s, v14.4s\n"
- "fmla v1.4s, v22.4s, v17.4s\n"
- "fmla v19.4s, v22.4s, v20.4s\n"
- "ldr s27, [x26, x28]\n"
- "fmla v6.4s, v23.4s, v11.4s\n"
- "fmla v2.4s, v23.4s, v13.4s\n"
- "fmla v5.4s, v23.4s, v12.4s\n"
- "fmla v1.4s, v23.4s, v14.4s\n"
- "fmla v4.4s, v23.4s, v17.4s\n"
- "fmla v0.4s, v29.4s, v10.4s\n"
- "mov v22.16b, v21.16b\n"
- "fmla v15.4s, v26.4s, v9.4s\n"
- "fmla v5.4s, v24.4s, v13.4s\n"
- "fmla v16.4s, v26.4s, v10.4s\n"
- "fmla v22.4s, v23.4s, v20.4s\n"
- "ldr s29, [x24, x23]\n"
- "fmla v4.4s, v24.4s, v14.4s\n"
- "ldr s28, [x9, x12]\n"
- "fmla v0.4s, v26.4s, v12.4s\n"
- "fmla v18.4s, v26.4s, v17.4s\n"
- "mov v23.16b, v21.16b\n"
- "fmla v3.4s, v27.4s, v8.4s\n"
- "fmla v15.4s, v27.4s, v11.4s\n"
- "fmla v2.4s, v27.4s, v9.4s\n"
- "fmla v0.4s, v27.4s, v13.4s\n"
- "fmla v16.4s, v27.4s, v12.4s\n"
- "fmla v1.4s, v27.4s, v10.4s\n"
- "fmla v18.4s, v27.4s, v14.4s\n"
- "fmla v19.4s, v27.4s, v17.4s\n"
- "fmla v23.4s, v27.4s, v20.4s\n"
- "mov v25.16b, v21.16b\n"
- "mov v24.16b, v21.16b\n"
- "fmla v6.4s, v29.4s, v8.4s\n"
- "fmla v2.4s, v29.4s, v11.4s\n"
- "fmla v5.4s, v29.4s, v9.4s\n"
- "fmla v16.4s, v29.4s, v13.4s\n"
- "fmla v1.4s, v29.4s, v12.4s\n"
- "fmla v4.4s, v29.4s, v10.4s\n"
- "fmla v19.4s, v29.4s, v14.4s\n"
- "fmla v22.4s, v29.4s, v17.4s\n"
- "fmla v25.4s, v29.4s, v20.4s\n"
- "ldr s21, [%[inptr0], x14]\n"
- "fmla v5.4s, v28.4s, v11.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v1.4s, v28.4s, v13.4s\n"
- "fmla v4.4s, v28.4s, v12.4s\n"
- "fmla v22.4s, v28.4s, v14.4s\n"
- "ldr s26, [x27, %[input_col_stride1]]\n"
- "fmla v0.4s, v26.4s, v9.4s\n"
- "fmla v18.4s, v26.4s, v10.4s\n"
- "fmla v4.4s, v21.4s, v13.4s\n"
- "ldr s21, [x10, x28]\n"
- "fmla v15.4s, v21.4s, v8.4s\n"
- "ldr s29, [x26, x23]\n"
- "fmla v0.4s, v21.4s, v11.4s\n"
- "fmla v16.4s, v21.4s, v9.4s\n"
- "fmla v18.4s, v21.4s, v12.4s\n"
- "fmla v19.4s, v21.4s, v10.4s\n"
- "fmla v23.4s, v21.4s, v17.4s\n"
- "ldr s21, [x24, x12]\n"
- "fmla v2.4s, v29.4s, v8.4s\n"
- "fmla v16.4s, v29.4s, v11.4s\n"
- "fmla v1.4s, v29.4s, v9.4s\n"
- "fmla v18.4s, v29.4s, v13.4s\n"
- "fmla v19.4s, v29.4s, v12.4s\n"
- "fmla v22.4s, v29.4s, v10.4s\n"
- "fmla v23.4s, v29.4s, v14.4s\n"
- "fmla v25.4s, v29.4s, v17.4s\n"
- "fmla v24.4s, v29.4s, v20.4s\n"
- "ldr s28, [x9, x14]\n"
- "fmla v5.4s, v21.4s, v8.4s\n"
- "ldr s27, [x27, x28]\n"
- "fmla v1.4s, v21.4s, v11.4s\n"
- "add x9, x9, #4\n"
- "fmla v4.4s, v21.4s, v9.4s\n"
- "fmla v19.4s, v21.4s, v13.4s\n"
- "fmla v22.4s, v21.4s, v12.4s\n"
- "fmla v25.4s, v21.4s, v14.4s\n"
- "fmla v0.4s, v27.4s, v8.4s\n"
- "ldr s20, [x10, x23]\n"
- "fmla v4.4s, v28.4s, v11.4s\n"
- "fmla v18.4s, v27.4s, v9.4s\n"
- "fmla v22.4s, v28.4s, v13.4s\n"
- "ldr s26, [x26, x12]\n"
- "fmla v23.4s, v27.4s, v10.4s\n"
- "ldr s21, [x24, x14]\n"
- "fmla v16.4s, v20.4s, v8.4s\n"
- "add x24, x24, #4\n"
- "fmla v18.4s, v20.4s, v11.4s\n"
- "fmla v19.4s, v20.4s, v9.4s\n"
- "fmla v23.4s, v20.4s, v12.4s\n"
- "fmla v25.4s, v20.4s, v10.4s\n"
- "fmla v24.4s, v20.4s, v17.4s\n"
- "ldr s28, [x27, x23]\n"
- "fmla v1.4s, v26.4s, v8.4s\n"
- "ldr s20, [x10, x12]\n"
- "fmla v19.4s, v26.4s, v11.4s\n"
- "fmla v22.4s, v26.4s, v9.4s\n"
- "fmla v23.4s, v26.4s, v13.4s\n"
- "fmla v25.4s, v26.4s, v12.4s\n"
- "fmla v24.4s, v26.4s, v14.4s\n"
- "ldr s17, [x26, x14]\n"
- "fmla v4.4s, v21.4s, v8.4s\n"
- "ldr s26, [x27, x12]\n"
- "fmla v22.4s, v21.4s, v11.4s\n"
- "add x26, x26, #4\n"
- "fmla v25.4s, v21.4s, v13.4s\n"
- "ldr s27, [x10, x14]\n"
- "fmla v18.4s, v28.4s, v8.4s\n"
- "add x10, x10, #4\n"
- "fmla v23.4s, v28.4s, v9.4s\n"
- "fmla v24.4s, v28.4s, v10.4s\n"
- "fmla v19.4s, v20.4s, v8.4s\n"
- "ldr s28, [x27, x14]\n"
- "fmla v25.4s, v20.4s, v9.4s\n"
- "add x27, x27, #4\n"
- "fmla v23.4s, v20.4s, v11.4s\n"
- "fmla v24.4s, v20.4s, v12.4s\n"
- "fmla v22.4s, v17.4s, v8.4s\n"
- "movi v29.16b, #0\n"
- "fmla v25.4s, v17.4s, v11.4s\n"
- "fmla v24.4s, v17.4s, v13.4s\n"
- "fmla v23.4s, v26.4s, v8.4s\n"
- "fmax v7.4s, v7.4s, v29.4s\n"
- "fmla v25.4s, v27.4s, v8.4s\n"
- "fmax v6.4s, v6.4s, v29.4s\n"
- "str s7, [%[outptr0]]\n"
- "fmla v24.4s, v26.4s, v9.4s\n"
- "str s6, [%[outptr0], %[output_col_stride1]]\n"
- "fmax v5.4s, v5.4s, v29.4s\n"
- "fmax v4.4s, v4.4s, v29.4s\n"
- "fmax v3.4s, v3.4s, v29.4s\n"
- "str s5, [%[outptr0], x19]\n"
- "fmla v24.4s, v27.4s, v11.4s\n"
- "str s4, [%[outptr0], x20]\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "str s3, [x16]\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "str s2, [x16, %[output_col_stride1]]\n"
- "fmla v24.4s, v28.4s, v8.4s\n"
- "str s1, [x16, x19]\n"
- "fmax v22.4s, v22.4s, v29.4s\n"
- "fmax v15.4s, v15.4s, v29.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "str s22, [x16, x20]\n"
- "fmax v16.4s, v16.4s, v29.4s\n"
- "str s15, [x17]\n"
- "fmax v19.4s, v19.4s, v29.4s\n"
- "str s16, [x17, %[output_col_stride1]]\n"
- "fmax v25.4s, v25.4s, v29.4s\n"
- "str s19, [x17, x19]\n"
- "fmax v0.4s, v0.4s, v29.4s\n"
- "str s25, [x17, x20]\n"
- "fmax v18.4s, v18.4s, v29.4s\n"
- "str s0, [x7]\n"
- "fmax v23.4s, v23.4s, v29.4s\n"
- "str s18, [x7, %[output_col_stride1]]\n"
- "fmax v24.4s, v24.4s, v29.4s\n"
- "str s23, [x7, x19]\n"
- "add x16, x16, #4\n"
- "str s24, [x7, x20]\n"
- "add x17, x17, #4\n"
- "add x7, x7, #4\n"
- "7:\n"
- : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
- : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
- );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *inptrs[6][6],
- float *outptrs[4][4]
-)
-{
- __asm __volatile(
- "mov x27, xzr\n"
- "mov x28, xzr\n"
- "and x19, %[n_channels], #3\n"
- "lsr x26, %[n_channels], #2\n"
- "cbz x26, 4f\n"
- "1:\n"
- "ldr q25, [%[wbptr]]\n"
- "ldr x25, [%[inptrs], 0]\n"
- "mov v2.16b, v25.16b\n"
- "ldr q22, [%[wbptr], #16]\n"
- "mov v16.16b, v25.16b\n"
- "ldr q9, [%[wbptr], #32]\n"
- "mov v18.16b, v25.16b\n"
- "ldr q8, [%[wbptr], #48]\n"
- "mov v13.16b, v25.16b\n"
- "ldr q19, [%[wbptr], #64]\n"
- "mov v0.16b, v25.16b\n"
- "ldr q7, [%[wbptr], #80]\n"
- "mov v17.16b, v25.16b\n"
- "ldr q6, [%[wbptr], #96]\n"
- "mov v14.16b, v25.16b\n"
- "ldr q5, [%[wbptr], #112]\n"
- "mov v12.16b, v25.16b\n"
- "ldr q4, [%[wbptr], #128]\n"
- "mov v15.16b, v25.16b\n"
- "ldr q3, [%[wbptr], #144]\n"
- "ldr q27, [x25, x27]\n"
- "ldr x17, [%[inptrs], 48]\n"
- "fmla v2.4s, v27.4s, v22.4s\n"
- "ldr x25, [%[inptrs], 8]\n"
- "ldr q26, [x17, x27]\n"
- "ldr x24, [%[inptrs], 96]\n"
- "fmla v16.4s, v26.4s, v22.4s\n"
- "ldr q31, [x25, x27]\n"
- "ldr q28, [x24, x27]\n"
- "ldr x17, [%[inptrs], 56]\n"
- "fmla v2.4s, v26.4s, v19.4s\n"
- "ldr x25, [%[inptrs], 16]\n"
- "ldr q29, [x17, x27]\n"
- "ldr x7, [%[inptrs], 144]\n"
- "ldr x24, [%[inptrs], 104]\n"
- "subs x26, x26, #1\n"
- "ldr q30, [x25, x27]\n"
- "ldr q27, [x7, x27]\n"
- "ldr q21, [x24, x27]\n"
- "fmla v2.4s, v31.4s, v9.4s\n"
- "beq 3f\n"
- "2:\n"
- "mov v1.16b, v25.16b\n"
- "ldr x17, [%[inptrs], 64]\n"
- "mov v10.16b, v25.16b\n"
- "ldr x25, [%[inptrs], 24]\n"
- "fmla v18.4s, v31.4s, v22.4s\n"
- "ldr q23, [x17, x27]\n"
- "fmla v2.4s, v28.4s, v5.4s\n"
- "ldr x15, [%[inptrs], 192]\n"
- "fmla v16.4s, v28.4s, v19.4s\n"
- "ldr x7, [%[inptrs], 152]\n"
- "fmla v13.4s, v28.4s, v22.4s\n"
- "ldr q26, [x25, x27]\n"
- "fmla v18.4s, v29.4s, v19.4s\n"
- "ldr x24, [%[inptrs], 112]\n"
- "fmla v2.4s, v29.4s, v7.4s\n"
- "ldr x17, [%[inptrs], 72]\n"
- "fmla v16.4s, v29.4s, v9.4s\n"
- "ldr x25, [%[inptrs], 32]\n"
- "fmla v0.4s, v29.4s, v22.4s\n"
- "ldr q28, [x15, x27]\n"
- "fmla v18.4s, v30.4s, v9.4s\n"
- "ldr x16, [%[inptrs], 240]\n"
- "fmla v2.4s, v30.4s, v8.4s\n"
- "ldr x15, [%[inptrs], 200]\n"
- "fmla v17.4s, v30.4s, v22.4s\n"
- "ldr q29, [x7, x27]\n"
- "fmla v16.4s, v27.4s, v5.4s\n"
- "ldr x7, [%[inptrs], 160]\n"
- "fmla v13.4s, v27.4s, v19.4s\n"
- "ldr x20, [%[outptrs], 0]\n"
- "fmla v14.4s, v27.4s, v22.4s\n"
- "ldr q20, [x24, x27]\n"
- "fmla v2.4s, v21.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 120]\n"
- "fmla v16.4s, v21.4s, v7.4s\n"
- "ldr x21, [%[outptrs], 32]\n"
- "fmla v18.4s, v21.4s, v5.4s\n"
- "ldr x22, [%[outptrs], 64]\n"
- "fmla v13.4s, v21.4s, v9.4s\n"
- "ldr x23, [%[outptrs], 96]\n"
- "fmla v0.4s, v21.4s, v19.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v12.4s, v21.4s, v22.4s\n"
- "ldr q24, [x17, x27]\n"
- "fmla v2.4s, v23.4s, v6.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v16.4s, v23.4s, v8.4s\n"
- "ldr x17, [%[inptrs], 80]\n"
- "fmla v18.4s, v23.4s, v7.4s\n"
- "subs x26, x26, #1\n"
- "fmla v0.4s, v23.4s, v9.4s\n"
- "fmla v17.4s, v23.4s, v19.4s\n"
- "fmla v15.4s, v23.4s, v22.4s\n"
- "ldr q23, [x25, x27]\n"
- "fmla v1.4s, v26.4s, v22.4s\n"
- "ldr x25, [%[inptrs], 40]\n"
- "fmla v18.4s, v26.4s, v8.4s\n"
- "fmla v13.4s, v28.4s, v5.4s\n"
- "fmla v17.4s, v26.4s, v9.4s\n"
- "ldr q30, [x16, x27]\n"
- "fmla v14.4s, v28.4s, v19.4s\n"
- "ldr q26, [x15, x27]\n"
- "fmla v16.4s, v29.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 248]\n"
- "fmla v13.4s, v29.4s, v7.4s\n"
- "ldr x15, [%[inptrs], 208]\n"
- "fmla v0.4s, v29.4s, v5.4s\n"
- "fmla v12.4s, v29.4s, v19.4s\n"
- "fmla v14.4s, v29.4s, v9.4s\n"
- "fmla v10.4s, v29.4s, v22.4s\n"
- "mov v11.16b, v25.16b\n"
- "fmla v2.4s, v20.4s, v3.4s\n"
- "fmla v16.4s, v20.4s, v6.4s\n"
- "fmla v18.4s, v20.4s, v4.4s\n"
- "fmla v13.4s, v20.4s, v8.4s\n"
- "fmla v0.4s, v20.4s, v7.4s\n"
- "fmla v17.4s, v20.4s, v5.4s\n"
- "fmla v12.4s, v20.4s, v9.4s\n"
- "fmla v15.4s, v20.4s, v19.4s\n"
- "fmla v11.4s, v20.4s, v22.4s\n"
- "mov v21.16b, v25.16b\n"
- "fmla v18.4s, v24.4s, v6.4s\n"
- "fmla v0.4s, v24.4s, v8.4s\n"
- "fmla v1.4s, v24.4s, v19.4s\n"
- "fmla v17.4s, v24.4s, v7.4s\n"
- "fmla v14.4s, v30.4s, v5.4s\n"
- "mov v20.16b, v25.16b\n"
- "fmla v15.4s, v24.4s, v9.4s\n"
- "fmla v21.4s, v24.4s, v22.4s\n"
- "ldr q27, [x7, x27]\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "ldr x7, [%[inptrs], 168]\n"
- "fmla v17.4s, v23.4s, v8.4s\n"
- "ldr q30, [x24, x27]\n"
- "fmla v13.4s, v26.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 128]\n"
- "fmla v14.4s, v26.4s, v7.4s\n"
- "fmla v12.4s, v26.4s, v5.4s\n"
- "fmla v10.4s, v26.4s, v19.4s\n"
- "ldr q31, [x17, x27]\n"
- "fmla v16.4s, v27.4s, v3.4s\n"
- "ldr x17, [%[inptrs], 88]\n"
- "fmla v13.4s, v27.4s, v6.4s\n"
- "fmla v0.4s, v27.4s, v4.4s\n"
- "fmla v14.4s, v27.4s, v8.4s\n"
- "fmla v12.4s, v27.4s, v7.4s\n"
- "fmla v15.4s, v27.4s, v5.4s\n"
- "fmla v10.4s, v27.4s, v9.4s\n"
- "fmla v11.4s, v27.4s, v19.4s\n"
- "fmla v20.4s, v27.4s, v22.4s\n"
- "mov v24.16b, v25.16b\n"
- "mov v23.16b, v25.16b\n"
- "fmla v18.4s, v30.4s, v3.4s\n"
- "fmla v0.4s, v30.4s, v6.4s\n"
- "fmla v17.4s, v30.4s, v4.4s\n"
- "fmla v12.4s, v30.4s, v8.4s\n"
- "fmla v15.4s, v30.4s, v7.4s\n"
- "fmla v1.4s, v30.4s, v5.4s\n"
- "fmla v11.4s, v30.4s, v9.4s\n"
- "fmla v21.4s, v30.4s, v19.4s\n"
- "fmla v24.4s, v30.4s, v22.4s\n"
- "ldr q25, [x25, x27]\n"
- "fmla v17.4s, v31.4s, v6.4s\n"
- "ldr x25, [%[inptrs], 0]\n"
- "fmla v15.4s, v31.4s, v8.4s\n"
- "fmla v1.4s, v31.4s, v7.4s\n"
- "fmla v21.4s, v31.4s, v9.4s\n"
- "ldr q26, [x16, x27]\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 256]\n"
- "fmla v10.4s, v26.4s, v5.4s\n"
- "ldr q31, [x15, x27]\n"
- "fmla v1.4s, v25.4s, v8.4s\n"
- "ldr q29, [x7, x27]\n"
- "fmla v13.4s, v31.4s, v3.4s\n"
- "ldr x15, [%[inptrs], 216]\n"
- "fmla v14.4s, v31.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 176]\n"
- "fmla v12.4s, v31.4s, v4.4s\n"
- "fmla v10.4s, v31.4s, v7.4s\n"
- "fmla v11.4s, v31.4s, v5.4s\n"
- "fmla v20.4s, v31.4s, v19.4s\n"
- "fmla v0.4s, v29.4s, v3.4s\n"
- "ldr q28, [x24, x27]\n"
- "fmla v15.4s, v29.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 136]\n"
- "fmla v12.4s, v29.4s, v6.4s\n"
- "fmla v10.4s, v29.4s, v8.4s\n"
- "fmla v11.4s, v29.4s, v7.4s\n"
- "fmla v21.4s, v29.4s, v5.4s\n"
- "fmla v20.4s, v29.4s, v9.4s\n"
- "fmla v24.4s, v29.4s, v19.4s\n"
- "fmla v23.4s, v29.4s, v22.4s\n"
- "ldr q25, [x17, x27]\n"
- "fmla v17.4s, v28.4s, v3.4s\n"
- "ldr q29, [x16, x27]\n"
- "fmla v15.4s, v28.4s, v6.4s\n"
- "ldr x16, [%[inptrs], 264]\n"
- "fmla v1.4s, v28.4s, v4.4s\n"
- "ldr x17, [%[inptrs], 48]\n"
- "fmla v11.4s, v28.4s, v8.4s\n"
- "fmla v21.4s, v28.4s, v7.4s\n"
- "fmla v24.4s, v28.4s, v9.4s\n"
- "ldr q22, [x15, x27]\n"
- "fmla v14.4s, v29.4s, v3.4s\n"
- "ldr x15, [%[inptrs], 224]\n"
- "fmla v1.4s, v25.4s, v6.4s\n"
- "fmla v10.4s, v29.4s, v4.4s\n"
- "fmla v21.4s, v25.4s, v8.4s\n"
- "ldr q27, [x7, x27]\n"
- "fmla v20.4s, v29.4s, v5.4s\n"
- "ldr q26, [x24, x27]\n"
- "fmla v12.4s, v22.4s, v3.4s\n"
- "ldr x7, [%[inptrs], 184]\n"
- "fmla v10.4s, v22.4s, v6.4s\n"
- "ldr x24, [%[inptrs], 96]\n"
- "fmla v11.4s, v22.4s, v4.4s\n"
- "fmla v24.4s, v22.4s, v5.4s\n"
- "fmla v20.4s, v22.4s, v7.4s\n"
- "fmla v23.4s, v22.4s, v19.4s\n"
- "fmla v15.4s, v27.4s, v3.4s\n"
- "ldr q25, [x16, x27]\n"
- "fmla v21.4s, v27.4s, v4.4s\n"
- "ldr q31, [x15, x27]\n"
- "fmla v11.4s, v27.4s, v6.4s\n"
- "ldr x16, [%[inptrs], 272]\n"
- "fmla v20.4s, v27.4s, v8.4s\n"
- "ldr x15, [%[inptrs], 232]\n"
- "fmla v24.4s, v27.4s, v7.4s\n"
- "fmla v23.4s, v27.4s, v9.4s\n"
- "fmla v1.4s, v26.4s, v3.4s\n"
- "ldr q22, [x7, x27]\n"
- "fmla v21.4s, v26.4s, v6.4s\n"
- "ldr q19, [x16, x27]\n"
- "fmla v10.4s, v25.4s, v3.4s\n"
- "ldr x16, [%[inptrs], 280]\n"
- "fmla v24.4s, v26.4s, v8.4s\n"
- "ldr q28, [x15, x27]\n"
- "fmla v20.4s, v25.4s, v4.4s\n"
- "ldr x7, [%[inptrs], 144]\n"
- "fmla v23.4s, v25.4s, v5.4s\n"
- "ldr q30, [x16, x27]\n"
- "fmla v11.4s, v31.4s, v3.4s\n"
- "add x27, x27, #16\n"
- "fmla v24.4s, v31.4s, v4.4s\n"
- "ldr q27, [x25, x27]\n"
- "fmla v20.4s, v31.4s, v6.4s\n"
- "ldr x25, [%[inptrs], 8]\n"
- "fmla v23.4s, v31.4s, v7.4s\n"
- "movi v29.16b, #0\n"
- "fmla v21.4s, v22.4s, v3.4s\n"
- "ldr q26, [x17, x27]\n"
- "fmla v24.4s, v22.4s, v6.4s\n"
- "ldr x17, [%[inptrs], 56]\n"
- "fmla v20.4s, v19.4s, v3.4s\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "fmla v23.4s, v22.4s, v8.4s\n"
- "ldr q25, [%[wbptr]]\n"
- "fmax v18.4s, v18.4s, v29.4s\n"
- "ldr q22, [%[wbptr], #16]\n"
- "str q2, [x20, x28]\n"
- "fmla v24.4s, v28.4s, v3.4s\n"
- "fmax v17.4s, v17.4s, v29.4s\n"
- "ldr q9, [%[wbptr], #32]\n"
- "fmla v23.4s, v19.4s, v4.4s\n"
- "ldr q8, [%[wbptr], #48]\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "ldr q19, [%[wbptr], #64]\n"
- "fmax v16.4s, v16.4s, v29.4s\n"
- "ldr x20, [%[outptrs], 8]\n"
- "fmax v0.4s, v0.4s, v29.4s\n"
- "fmax v15.4s, v15.4s, v29.4s\n"
- "str q18, [x20, x28]\n"
- "fmla v23.4s, v28.4s, v6.4s\n"
- "str q16, [x21, x28]\n"
- "fmax v21.4s, v21.4s, v29.4s\n"
- "fmax v13.4s, v13.4s, v29.4s\n"
- "ldr q7, [%[wbptr], #80]\n"
- "fmax v12.4s, v12.4s, v29.4s\n"
- "ldr q5, [%[wbptr], #112]\n"
- "fmla v23.4s, v30.4s, v3.4s\n"
- "ldr q6, [%[wbptr], #96]\n"
- "str q13, [x22, x28]\n"
- "fmax v11.4s, v11.4s, v29.4s\n"
- "fmax v24.4s, v24.4s, v29.4s\n"
- "ldr q4, [%[wbptr], #128]\n"
- "fmax v14.4s, v14.4s, v29.4s\n"
- "ldr q31, [x25, x27]\n"
- "fmax v10.4s, v10.4s, v29.4s\n"
- "ldr q3, [%[wbptr], #144]\n"
- "fmax v20.4s, v20.4s, v29.4s\n"
- "ldr q28, [x24, x27]\n"
- "str q14, [x23, x28]\n"
- "fmax v23.4s, v23.4s, v29.4s\n"
- "mov v2.16b, v25.16b\n"
- "ldr q29, [x17, x27]\n"
- "ldr x20, [%[outptrs], 16]\n"
- "ldr x21, [%[outptrs], 40]\n"
- "ldr x22, [%[outptrs], 72]\n"
- "ldr x23, [%[outptrs], 104]\n"
- "ldr x25, [%[inptrs], 16]\n"
- "ldr x24, [%[inptrs], 104]\n"
- "str q17, [x20, x28]\n"
- "mov v16.16b, v25.16b\n"
- "str q0, [x21, x28]\n"
- "mov v18.16b, v25.16b\n"
- "str q12, [x22, x28]\n"
- "mov v13.16b, v25.16b\n"
- "str q10, [x23, x28]\n"
- "mov v0.16b, v25.16b\n"
- "fmla v2.4s, v27.4s, v22.4s\n"
- "ldr q30, [x25, x27]\n"
- "fmla v16.4s, v26.4s, v22.4s\n"
- "ldr x20, [%[outptrs], 24]\n"
- "mov v17.16b, v25.16b\n"
- "ldr x21, [%[outptrs], 48]\n"
- "str q1, [x20, x28]\n"
- "mov v14.16b, v25.16b\n"
- "str q15, [x21, x28]\n"
- "mov v12.16b, v25.16b\n"
- "mov v15.16b, v25.16b\n"
- "ldr x21, [%[outptrs], 56]\n"
- "fmla v2.4s, v26.4s, v19.4s\n"
- "ldr q27, [x7, x27]\n"
- "str q21, [x21, x28]\n"
- "ldr x22, [%[outptrs], 80]\n"
- "ldr q21, [x24, x27]\n"
- "ldr x23, [%[outptrs], 112]\n"
- "str q11, [x22, x28]\n"
- "fmla v2.4s, v31.4s, v9.4s\n"
- "str q20, [x23, x28]\n"
- "ldr x22, [%[outptrs], 88]\n"
- "ldr x23, [%[outptrs], 120]\n"
- "str q24, [x22, x28]\n"
- "str q23, [x23, x28]\n"
- "add x28, x28, #16\n"
- "bne 2b\n"
- "3:\n"
- "mov v1.16b, v25.16b\n"
- "ldr x17, [%[inptrs], 64]\n"
- "mov v10.16b, v25.16b\n"
- "ldr x25, [%[inptrs], 24]\n"
- "mov v11.16b, v25.16b\n"
- "ldr x15, [%[inptrs], 192]\n"
- "fmla v18.4s, v31.4s, v22.4s\n"
- "ldr q23, [x17, x27]\n"
- "fmla v2.4s, v28.4s, v5.4s\n"
- "ldr x7, [%[inptrs], 152]\n"
- "fmla v16.4s, v28.4s, v19.4s\n"
- "ldr x24, [%[inptrs], 112]\n"
- "fmla v13.4s, v28.4s, v22.4s\n"
- "ldr q26, [x25, x27]\n"
- "fmla v18.4s, v29.4s, v19.4s\n"
- "ldr x17, [%[inptrs], 72]\n"
- "fmla v2.4s, v29.4s, v7.4s\n"
- "ldr x25, [%[inptrs], 32]\n"
- "fmla v16.4s, v29.4s, v9.4s\n"
- "ldr x16, [%[inptrs], 240]\n"
- "fmla v0.4s, v29.4s, v22.4s\n"
- "ldr q28, [x15, x27]\n"
- "fmla v18.4s, v30.4s, v9.4s\n"
- "ldr x15, [%[inptrs], 200]\n"
- "fmla v2.4s, v30.4s, v8.4s\n"
- "ldr x20, [%[outptrs], 0]\n"
- "fmla v17.4s, v30.4s, v22.4s\n"
- "ldr q29, [x7, x27]\n"
- "fmla v16.4s, v27.4s, v5.4s\n"
- "ldr x7, [%[inptrs], 160]\n"
- "fmla v13.4s, v27.4s, v19.4s\n"
- "ldr x21, [%[outptrs], 32]\n"
- "fmla v14.4s, v27.4s, v22.4s\n"
- "ldr q20, [x24, x27]\n"
- "fmla v2.4s, v21.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 120]\n"
- "fmla v16.4s, v21.4s, v7.4s\n"
- "ldr x22, [%[outptrs], 64]\n"
- "fmla v18.4s, v21.4s, v5.4s\n"
- "ldr x23, [%[outptrs], 96]\n"
- "fmla v13.4s, v21.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v0.4s, v21.4s, v19.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v12.4s, v21.4s, v22.4s\n"
- "ldr q24, [x17, x27]\n"
- "fmla v2.4s, v23.4s, v6.4s\n"
- "ldr x17, [%[inptrs], 80]\n"
- "fmla v16.4s, v23.4s, v8.4s\n"
- "fmla v18.4s, v23.4s, v7.4s\n"
- "fmla v0.4s, v23.4s, v9.4s\n"
- "fmla v17.4s, v23.4s, v19.4s\n"
- "fmla v15.4s, v23.4s, v22.4s\n"
- "ldr q23, [x25, x27]\n"
- "fmla v1.4s, v26.4s, v22.4s\n"
- "ldr x25, [%[inptrs], 40]\n"
- "fmla v18.4s, v26.4s, v8.4s\n"
- "fmla v13.4s, v28.4s, v5.4s\n"
- "fmla v17.4s, v26.4s, v9.4s\n"
- "ldr q30, [x16, x27]\n"
- "fmla v14.4s, v28.4s, v19.4s\n"
- "ldr q26, [x15, x27]\n"
- "fmla v16.4s, v29.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 248]\n"
- "fmla v13.4s, v29.4s, v7.4s\n"
- "ldr x15, [%[inptrs], 208]\n"
- "fmla v0.4s, v29.4s, v5.4s\n"
- "fmla v12.4s, v29.4s, v19.4s\n"
- "fmla v14.4s, v29.4s, v9.4s\n"
- "fmla v10.4s, v29.4s, v22.4s\n"
- "mov v21.16b, v25.16b\n"
- "fmla v2.4s, v20.4s, v3.4s\n"
- "fmla v16.4s, v20.4s, v6.4s\n"
- "fmla v18.4s, v20.4s, v4.4s\n"
- "fmla v13.4s, v20.4s, v8.4s\n"
- "fmla v0.4s, v20.4s, v7.4s\n"
- "fmla v17.4s, v20.4s, v5.4s\n"
- "fmla v12.4s, v20.4s, v9.4s\n"
- "fmla v15.4s, v20.4s, v19.4s\n"
- "fmla v11.4s, v20.4s, v22.4s\n"
- "mov v20.16b, v25.16b\n"
- "fmla v18.4s, v24.4s, v6.4s\n"
- "fmla v0.4s, v24.4s, v8.4s\n"
- "fmla v1.4s, v24.4s, v19.4s\n"
- "fmla v17.4s, v24.4s, v7.4s\n"
- "fmla v21.4s, v24.4s, v22.4s\n"
- "fmla v15.4s, v24.4s, v9.4s\n"
- "ldr q27, [x7, x27]\n"
- "fmla v14.4s, v30.4s, v5.4s\n"
- "ldr q30, [x24, x27]\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "ldr x7, [%[inptrs], 168]\n"
- "fmla v17.4s, v23.4s, v8.4s\n"
- "ldr q31, [x17, x27]\n"
- "fmla v13.4s, v26.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 128]\n"
- "fmla v14.4s, v26.4s, v7.4s\n"
- "ldr x17, [%[inptrs], 88]\n"
- "fmla v12.4s, v26.4s, v5.4s\n"
- "fmla v10.4s, v26.4s, v19.4s\n"
- "mov v24.16b, v25.16b\n"
- "mov v23.16b, v25.16b\n"
- "fmla v16.4s, v27.4s, v3.4s\n"
- "fmla v13.4s, v27.4s, v6.4s\n"
- "fmla v0.4s, v27.4s, v4.4s\n"
- "fmla v14.4s, v27.4s, v8.4s\n"
- "fmla v12.4s, v27.4s, v7.4s\n"
- "fmla v15.4s, v27.4s, v5.4s\n"
- "fmla v10.4s, v27.4s, v9.4s\n"
- "fmla v11.4s, v27.4s, v19.4s\n"
- "fmla v20.4s, v27.4s, v22.4s\n"
- "ldr q25, [x25, x27]\n"
- "fmla v18.4s, v30.4s, v3.4s\n"
- "fmla v0.4s, v30.4s, v6.4s\n"
- "fmla v17.4s, v30.4s, v4.4s\n"
- "fmla v12.4s, v30.4s, v8.4s\n"
- "fmla v15.4s, v30.4s, v7.4s\n"
- "fmla v1.4s, v30.4s, v5.4s\n"
- "fmla v11.4s, v30.4s, v9.4s\n"
- "fmla v21.4s, v30.4s, v19.4s\n"
- "fmla v24.4s, v30.4s, v22.4s\n"
- "ldr q26, [x16, x27]\n"
- "fmla v17.4s, v31.4s, v6.4s\n"
- "ldr x16, [%[inptrs], 256]\n"
- "fmla v15.4s, v31.4s, v8.4s\n"
- "fmla v1.4s, v31.4s, v7.4s\n"
- "fmla v21.4s, v31.4s, v9.4s\n"
- "ldr q31, [x15, x27]\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "ldr x15, [%[inptrs], 216]\n"
- "fmla v10.4s, v26.4s, v5.4s\n"
- "ldr q29, [x7, x27]\n"
- "fmla v1.4s, v25.4s, v8.4s\n"
- "ldr q28, [x24, x27]\n"
- "fmla v13.4s, v31.4s, v3.4s\n"
- "ldr x7, [%[inptrs], 176]\n"
- "fmla v14.4s, v31.4s, v6.4s\n"
- "ldr x24, [%[inptrs], 136]\n"
- "fmla v12.4s, v31.4s, v4.4s\n"
- "fmla v10.4s, v31.4s, v7.4s\n"
- "fmla v11.4s, v31.4s, v5.4s\n"
- "fmla v20.4s, v31.4s, v19.4s\n"
- "fmla v0.4s, v29.4s, v3.4s\n"
- "ldr q25, [x17, x27]\n"
- "fmla v15.4s, v29.4s, v4.4s\n"
- "fmla v21.4s, v29.4s, v5.4s\n"
- "fmla v12.4s, v29.4s, v6.4s\n"
- "fmla v10.4s, v29.4s, v8.4s\n"
- "fmla v11.4s, v29.4s, v7.4s\n"
- "fmla v20.4s, v29.4s, v9.4s\n"
- "fmla v24.4s, v29.4s, v19.4s\n"
- "fmla v23.4s, v29.4s, v22.4s\n"
- "fmla v17.4s, v28.4s, v3.4s\n"
- "ldr q29, [x16, x27]\n"
- "fmla v15.4s, v28.4s, v6.4s\n"
- "ldr q22, [x15, x27]\n"
- "fmla v1.4s, v28.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 264]\n"
- "fmla v11.4s, v28.4s, v8.4s\n"
- "ldr x15, [%[inptrs], 224]\n"
- "fmla v21.4s, v28.4s, v7.4s\n"
- "fmla v24.4s, v28.4s, v9.4s\n"
- "fmla v14.4s, v29.4s, v3.4s\n"
- "ldr q27, [x7, x27]\n"
- "fmla v1.4s, v25.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 184]\n"
- "fmla v10.4s, v29.4s, v4.4s\n"
- "fmla v20.4s, v29.4s, v5.4s\n"
- "fmla v21.4s, v25.4s, v8.4s\n"
- "ldr q26, [x24, x27]\n"
- "fmla v12.4s, v22.4s, v3.4s\n"
- "ldr q25, [x16, x27]\n"
- "fmla v11.4s, v22.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 272]\n"
- "fmla v10.4s, v22.4s, v6.4s\n"
- "fmla v20.4s, v22.4s, v7.4s\n"
- "fmla v24.4s, v22.4s, v5.4s\n"
- "fmla v23.4s, v22.4s, v19.4s\n"
- "fmla v15.4s, v27.4s, v3.4s\n"
- "ldr q31, [x15, x27]\n"
- "fmla v11.4s, v27.4s, v6.4s\n"
- "ldr q22, [x7, x27]\n"
- "fmla v21.4s, v27.4s, v4.4s\n"
- "ldr x15, [%[inptrs], 232]\n"
- "fmla v20.4s, v27.4s, v8.4s\n"
- "fmla v24.4s, v27.4s, v7.4s\n"
- "fmla v23.4s, v27.4s, v9.4s\n"
- "ldr q19, [x16, x27]\n"
- "fmla v1.4s, v26.4s, v3.4s\n"
- "ldr q28, [x15, x27]\n"
- "fmla v21.4s, v26.4s, v6.4s\n"
- "ldr x16, [%[inptrs], 280]\n"
- "fmla v24.4s, v26.4s, v8.4s\n"
- "fmla v10.4s, v25.4s, v3.4s\n"
- "fmla v20.4s, v25.4s, v4.4s\n"
- "ldr q30, [x16, x27]\n"
- "fmla v23.4s, v25.4s, v5.4s\n"
- "add x27, x27, #16\n"
- "fmla v11.4s, v31.4s, v3.4s\n"
- "fmla v21.4s, v22.4s, v3.4s\n"
- "fmla v24.4s, v31.4s, v4.4s\n"
- "movi v29.16b, #0\n"
- "fmla v20.4s, v31.4s, v6.4s\n"
- "fmla v23.4s, v31.4s, v7.4s\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "fmax v18.4s, v18.4s, v29.4s\n"
- "fmla v24.4s, v22.4s, v6.4s\n"
- "fmax v17.4s, v17.4s, v29.4s\n"
- "fmla v20.4s, v19.4s, v3.4s\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "str q2, [x20, x28]\n"
- "fmla v23.4s, v22.4s, v8.4s\n"
- "fmax v16.4s, v16.4s, v29.4s\n"
- "ldr x20, [%[outptrs], 8]\n"
- "fmla v24.4s, v28.4s, v3.4s\n"
- "fmax v0.4s, v0.4s, v29.4s\n"
- "str q18, [x20, x28]\n"
- "fmax v15.4s, v15.4s, v29.4s\n"
- "str q16, [x21, x28]\n"
- "fmla v23.4s, v19.4s, v4.4s\n"
- "fmax v21.4s, v21.4s, v29.4s\n"
- "ldr x20, [%[outptrs], 16]\n"
- "fmax v13.4s, v13.4s, v29.4s\n"
- "ldr x21, [%[outptrs], 40]\n"
- "str q17, [x20, x28]\n"
- "fmax v12.4s, v12.4s, v29.4s\n"
- "str q0, [x21, x28]\n"
- "fmla v23.4s, v28.4s, v6.4s\n"
- "str q13, [x22, x28]\n"
- "fmax v11.4s, v11.4s, v29.4s\n"
- "fmax v24.4s, v24.4s, v29.4s\n"
- "ldr x20, [%[outptrs], 24]\n"
- "fmax v14.4s, v14.4s, v29.4s\n"
- "ldr x21, [%[outptrs], 48]\n"
- "str q1, [x20, x28]\n"
- "fmla v23.4s, v30.4s, v3.4s\n"
- "str q15, [x21, x28]\n"
- "fmax v10.4s, v10.4s, v29.4s\n"
- "str q14, [x23, x28]\n"
- "fmax v20.4s, v20.4s, v29.4s\n"
- "ldr x21, [%[outptrs], 56]\n"
- "ldr x22, [%[outptrs], 72]\n"
- "ldr x23, [%[outptrs], 104]\n"
- "fmax v23.4s, v23.4s, v29.4s\n"
- "str q21, [x21, x28]\n"
- "str q12, [x22, x28]\n"
- "str q10, [x23, x28]\n"
- "ldr x22, [%[outptrs], 80]\n"
- "ldr x23, [%[outptrs], 112]\n"
- "str q11, [x22, x28]\n"
- "str q20, [x23, x28]\n"
- "ldr x22, [%[outptrs], 88]\n"
- "ldr x23, [%[outptrs], 120]\n"
- "str q24, [x22, x28]\n"
- "str q23, [x23, x28]\n"
- "add x28, x28, #16\n"
- "4:\n"
- "cbz x19, 7f\n"
- "ldr s25, [%[wbptr]]\n"
- "mov v2.16b, v25.16b\n"
- "ldr s22, [%[wbptr], #4]\n"
- "mov v16.16b, v25.16b\n"
- "ldr s9, [%[wbptr], #8]\n"
- "mov v18.16b, v25.16b\n"
- "ldr s8, [%[wbptr], #12]\n"
- "mov v13.16b, v25.16b\n"
- "ldr s19, [%[wbptr], #16]\n"
- "mov v0.16b, v25.16b\n"
- "ldr s7, [%[wbptr], #20]\n"
- "mov v17.16b, v25.16b\n"
- "ldr s6, [%[wbptr], #24]\n"
- "mov v14.16b, v25.16b\n"
- "ldr s5, [%[wbptr], #28]\n"
- "mov v12.16b, v25.16b\n"
- "ldr s4, [%[wbptr], #32]\n"
- "mov v15.16b, v25.16b\n"
- "ldr s3, [%[wbptr], #36]\n"
- "ldr x25, [%[inptrs], 0]\n"
- "ldr x17, [%[inptrs], 48]\n"
- "ldr x24, [%[inptrs], 96]\n"
- "ldr x7, [%[inptrs], 144]\n"
- "subs x19, x19, #1\n"
- "ldr s27, [x25, x27]\n"
- "fmla v2.4s, v27.4s, v22.4s\n"
- "ldr s26, [x17, x27]\n"
- "fmla v16.4s, v26.4s, v22.4s\n"
- "ldr s28, [x24, x27]\n"
- "ldr s27, [x7, x27]\n"
- "ldr x25, [%[inptrs], 8]\n"
- "ldr x17, [%[inptrs], 56]\n"
- "ldr x24, [%[inptrs], 104]\n"
- "ldr s31, [x25, x27]\n"
- "fmla v2.4s, v26.4s, v19.4s\n"
- "ldr s29, [x17, x27]\n"
- "ldr s21, [x24, x27]\n"
- "ldr x25, [%[inptrs], 16]\n"
- "ldr s30, [x25, x27]\n"
- "fmla v2.4s, v31.4s, v9.4s\n"
- "beq 6f\n"
- "5:\n"
- "mov v1.16b, v25.16b\n"
- "ldr x17, [%[inptrs], 64]\n"
- "mov v10.16b, v25.16b\n"
- "ldr x25, [%[inptrs], 24]\n"
- "fmla v18.4s, v31.4s, v22.4s\n"
- "ldr s23, [x17, x27]\n"
- "fmla v2.4s, v28.4s, v5.4s\n"
- "ldr x15, [%[inptrs], 192]\n"
- "fmla v16.4s, v28.4s, v19.4s\n"
- "ldr x7, [%[inptrs], 152]\n"
- "fmla v13.4s, v28.4s, v22.4s\n"
- "ldr s26, [x25, x27]\n"
- "fmla v18.4s, v29.4s, v19.4s\n"
- "ldr x24, [%[inptrs], 112]\n"
- "fmla v2.4s, v29.4s, v7.4s\n"
- "ldr x17, [%[inptrs], 72]\n"
- "fmla v16.4s, v29.4s, v9.4s\n"
- "ldr x25, [%[inptrs], 32]\n"
- "fmla v0.4s, v29.4s, v22.4s\n"
- "ldr s28, [x15, x27]\n"
- "fmla v18.4s, v30.4s, v9.4s\n"
- "ldr x16, [%[inptrs], 240]\n"
- "fmla v2.4s, v30.4s, v8.4s\n"
- "ldr x15, [%[inptrs], 200]\n"
- "fmla v17.4s, v30.4s, v22.4s\n"
- "ldr s29, [x7, x27]\n"
- "fmla v16.4s, v27.4s, v5.4s\n"
- "ldr x7, [%[inptrs], 160]\n"
- "fmla v13.4s, v27.4s, v19.4s\n"
- "ldr x20, [%[outptrs], 0]\n"
- "fmla v14.4s, v27.4s, v22.4s\n"
- "ldr s20, [x24, x27]\n"
- "fmla v2.4s, v21.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 120]\n"
- "fmla v16.4s, v21.4s, v7.4s\n"
- "ldr x21, [%[outptrs], 32]\n"
- "fmla v18.4s, v21.4s, v5.4s\n"
- "ldr x22, [%[outptrs], 64]\n"
- "fmla v13.4s, v21.4s, v9.4s\n"
- "ldr x23, [%[outptrs], 96]\n"
- "fmla v0.4s, v21.4s, v19.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v12.4s, v21.4s, v22.4s\n"
- "ldr s24, [x17, x27]\n"
- "fmla v2.4s, v23.4s, v6.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v16.4s, v23.4s, v8.4s\n"
- "ldr x17, [%[inptrs], 80]\n"
- "fmla v18.4s, v23.4s, v7.4s\n"
- "subs x19, x19, #1\n"
- "fmla v0.4s, v23.4s, v9.4s\n"
- "fmla v17.4s, v23.4s, v19.4s\n"
- "fmla v15.4s, v23.4s, v22.4s\n"
- "ldr s23, [x25, x27]\n"
- "fmla v1.4s, v26.4s, v22.4s\n"
- "ldr x25, [%[inptrs], 40]\n"
- "fmla v18.4s, v26.4s, v8.4s\n"
- "fmla v13.4s, v28.4s, v5.4s\n"
- "fmla v17.4s, v26.4s, v9.4s\n"
- "ldr s30, [x16, x27]\n"
- "fmla v14.4s, v28.4s, v19.4s\n"
- "ldr s26, [x15, x27]\n"
- "fmla v16.4s, v29.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 248]\n"
- "fmla v13.4s, v29.4s, v7.4s\n"
- "ldr x15, [%[inptrs], 208]\n"
- "fmla v0.4s, v29.4s, v5.4s\n"
- "fmla v12.4s, v29.4s, v19.4s\n"
- "fmla v14.4s, v29.4s, v9.4s\n"
- "fmla v10.4s, v29.4s, v22.4s\n"
- "mov v11.16b, v25.16b\n"
- "fmla v2.4s, v20.4s, v3.4s\n"
- "fmla v16.4s, v20.4s, v6.4s\n"
- "fmla v18.4s, v20.4s, v4.4s\n"
- "fmla v13.4s, v20.4s, v8.4s\n"
- "fmla v0.4s, v20.4s, v7.4s\n"
- "fmla v17.4s, v20.4s, v5.4s\n"
- "fmla v12.4s, v20.4s, v9.4s\n"
- "fmla v15.4s, v20.4s, v19.4s\n"
- "fmla v11.4s, v20.4s, v22.4s\n"
- "mov v21.16b, v25.16b\n"
- "fmla v18.4s, v24.4s, v6.4s\n"
- "fmla v0.4s, v24.4s, v8.4s\n"
- "fmla v1.4s, v24.4s, v19.4s\n"
- "fmla v17.4s, v24.4s, v7.4s\n"
- "fmla v14.4s, v30.4s, v5.4s\n"
- "mov v20.16b, v25.16b\n"
- "fmla v15.4s, v24.4s, v9.4s\n"
- "fmla v21.4s, v24.4s, v22.4s\n"
- "ldr s27, [x7, x27]\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "ldr x7, [%[inptrs], 168]\n"
- "fmla v17.4s, v23.4s, v8.4s\n"
- "ldr s30, [x24, x27]\n"
- "fmla v13.4s, v26.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 128]\n"
- "fmla v14.4s, v26.4s, v7.4s\n"
- "fmla v12.4s, v26.4s, v5.4s\n"
- "fmla v10.4s, v26.4s, v19.4s\n"
- "ldr s31, [x17, x27]\n"
- "fmla v16.4s, v27.4s, v3.4s\n"
- "ldr x17, [%[inptrs], 88]\n"
- "fmla v13.4s, v27.4s, v6.4s\n"
- "fmla v0.4s, v27.4s, v4.4s\n"
- "fmla v14.4s, v27.4s, v8.4s\n"
- "fmla v12.4s, v27.4s, v7.4s\n"
- "fmla v15.4s, v27.4s, v5.4s\n"
- "fmla v10.4s, v27.4s, v9.4s\n"
- "fmla v11.4s, v27.4s, v19.4s\n"
- "fmla v20.4s, v27.4s, v22.4s\n"
- "mov v24.16b, v25.16b\n"
- "mov v23.16b, v25.16b\n"
- "fmla v18.4s, v30.4s, v3.4s\n"
- "fmla v0.4s, v30.4s, v6.4s\n"
- "fmla v17.4s, v30.4s, v4.4s\n"
- "fmla v12.4s, v30.4s, v8.4s\n"
- "fmla v15.4s, v30.4s, v7.4s\n"
- "fmla v1.4s, v30.4s, v5.4s\n"
- "fmla v11.4s, v30.4s, v9.4s\n"
- "fmla v21.4s, v30.4s, v19.4s\n"
- "fmla v24.4s, v30.4s, v22.4s\n"
- "ldr s25, [x25, x27]\n"
- "fmla v17.4s, v31.4s, v6.4s\n"
- "ldr x25, [%[inptrs], 0]\n"
- "fmla v15.4s, v31.4s, v8.4s\n"
- "fmla v1.4s, v31.4s, v7.4s\n"
- "fmla v21.4s, v31.4s, v9.4s\n"
- "ldr s26, [x16, x27]\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 256]\n"
- "fmla v10.4s, v26.4s, v5.4s\n"
- "ldr s31, [x15, x27]\n"
- "fmla v1.4s, v25.4s, v8.4s\n"
- "ldr s29, [x7, x27]\n"
- "fmla v13.4s, v31.4s, v3.4s\n"
- "ldr x15, [%[inptrs], 216]\n"
- "fmla v14.4s, v31.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 176]\n"
- "fmla v12.4s, v31.4s, v4.4s\n"
- "fmla v10.4s, v31.4s, v7.4s\n"
- "fmla v11.4s, v31.4s, v5.4s\n"
- "fmla v20.4s, v31.4s, v19.4s\n"
- "fmla v0.4s, v29.4s, v3.4s\n"
- "ldr s28, [x24, x27]\n"
- "fmla v15.4s, v29.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 136]\n"
- "fmla v12.4s, v29.4s, v6.4s\n"
- "fmla v10.4s, v29.4s, v8.4s\n"
- "fmla v11.4s, v29.4s, v7.4s\n"
- "fmla v21.4s, v29.4s, v5.4s\n"
- "fmla v20.4s, v29.4s, v9.4s\n"
- "fmla v24.4s, v29.4s, v19.4s\n"
- "fmla v23.4s, v29.4s, v22.4s\n"
- "ldr s25, [x17, x27]\n"
- "fmla v17.4s, v28.4s, v3.4s\n"
- "ldr s29, [x16, x27]\n"
- "fmla v15.4s, v28.4s, v6.4s\n"
- "ldr x16, [%[inptrs], 264]\n"
- "fmla v1.4s, v28.4s, v4.4s\n"
- "ldr x17, [%[inptrs], 48]\n"
- "fmla v11.4s, v28.4s, v8.4s\n"
- "fmla v21.4s, v28.4s, v7.4s\n"
- "fmla v24.4s, v28.4s, v9.4s\n"
- "ldr s22, [x15, x27]\n"
- "fmla v14.4s, v29.4s, v3.4s\n"
- "ldr x15, [%[inptrs], 224]\n"
- "fmla v1.4s, v25.4s, v6.4s\n"
- "fmla v10.4s, v29.4s, v4.4s\n"
- "fmla v21.4s, v25.4s, v8.4s\n"
- "ldr s27, [x7, x27]\n"
- "fmla v20.4s, v29.4s, v5.4s\n"
- "ldr s26, [x24, x27]\n"
- "fmla v12.4s, v22.4s, v3.4s\n"
- "ldr x7, [%[inptrs], 184]\n"
- "fmla v10.4s, v22.4s, v6.4s\n"
- "ldr x24, [%[inptrs], 96]\n"
- "fmla v11.4s, v22.4s, v4.4s\n"
- "fmla v24.4s, v22.4s, v5.4s\n"
- "fmla v20.4s, v22.4s, v7.4s\n"
- "fmla v23.4s, v22.4s, v19.4s\n"
- "fmla v15.4s, v27.4s, v3.4s\n"
- "ldr s25, [x16, x27]\n"
- "fmla v21.4s, v27.4s, v4.4s\n"
- "ldr s31, [x15, x27]\n"
- "fmla v11.4s, v27.4s, v6.4s\n"
- "ldr x16, [%[inptrs], 272]\n"
- "fmla v20.4s, v27.4s, v8.4s\n"
- "ldr x15, [%[inptrs], 232]\n"
- "fmla v24.4s, v27.4s, v7.4s\n"
- "fmla v23.4s, v27.4s, v9.4s\n"
- "fmla v1.4s, v26.4s, v3.4s\n"
- "ldr s22, [x7, x27]\n"
- "fmla v21.4s, v26.4s, v6.4s\n"
- "ldr s19, [x16, x27]\n"
- "fmla v10.4s, v25.4s, v3.4s\n"
- "ldr x16, [%[inptrs], 280]\n"
- "fmla v24.4s, v26.4s, v8.4s\n"
- "ldr s28, [x15, x27]\n"
- "fmla v20.4s, v25.4s, v4.4s\n"
- "ldr x7, [%[inptrs], 144]\n"
- "fmla v23.4s, v25.4s, v5.4s\n"
- "ldr s30, [x16, x27]\n"
- "fmla v11.4s, v31.4s, v3.4s\n"
- "add x27, x27, #4\n"
- "fmla v24.4s, v31.4s, v4.4s\n"
- "ldr s27, [x25, x27]\n"
- "fmla v20.4s, v31.4s, v6.4s\n"
- "ldr x25, [%[inptrs], 8]\n"
- "fmla v23.4s, v31.4s, v7.4s\n"
- "movi v29.16b, #0\n"
- "fmla v21.4s, v22.4s, v3.4s\n"
- "ldr s26, [x17, x27]\n"
- "fmla v24.4s, v22.4s, v6.4s\n"
- "ldr x17, [%[inptrs], 56]\n"
- "fmla v20.4s, v19.4s, v3.4s\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "fmla v23.4s, v22.4s, v8.4s\n"
- "ldr s25, [%[wbptr]]\n"
- "fmax v18.4s, v18.4s, v29.4s\n"
- "ldr s22, [%[wbptr], #4]\n"
- "str s2, [x20, x28]\n"
- "fmla v24.4s, v28.4s, v3.4s\n"
- "fmax v17.4s, v17.4s, v29.4s\n"
- "ldr s9, [%[wbptr], #8]\n"
- "fmla v23.4s, v19.4s, v4.4s\n"
- "ldr s8, [%[wbptr], #12]\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "ldr s19, [%[wbptr], #16]\n"
- "fmax v16.4s, v16.4s, v29.4s\n"
- "ldr x20, [%[outptrs], 8]\n"
- "fmax v0.4s, v0.4s, v29.4s\n"
- "fmax v15.4s, v15.4s, v29.4s\n"
- "str s18, [x20, x28]\n"
- "fmla v23.4s, v28.4s, v6.4s\n"
- "str s16, [x21, x28]\n"
- "fmax v21.4s, v21.4s, v29.4s\n"
- "fmax v13.4s, v13.4s, v29.4s\n"
- "ldr s7, [%[wbptr], #20]\n"
- "fmax v12.4s, v12.4s, v29.4s\n"
- "ldr s5, [%[wbptr], #28]\n"
- "fmla v23.4s, v30.4s, v3.4s\n"
- "ldr s6, [%[wbptr], #24]\n"
- "str s13, [x22, x28]\n"
- "fmax v11.4s, v11.4s, v29.4s\n"
- "fmax v24.4s, v24.4s, v29.4s\n"
- "ldr s4, [%[wbptr], #32]\n"
- "fmax v14.4s, v14.4s, v29.4s\n"
- "ldr s31, [x25, x27]\n"
- "fmax v10.4s, v10.4s, v29.4s\n"
- "ldr s3, [%[wbptr], #36]\n"
- "fmax v20.4s, v20.4s, v29.4s\n"
- "ldr s28, [x24, x27]\n"
- "str s14, [x23, x28]\n"
- "fmax v23.4s, v23.4s, v29.4s\n"
- "mov v2.16b, v25.16b\n"
- "ldr s29, [x17, x27]\n"
- "ldr x20, [%[outptrs], 16]\n"
- "ldr x21, [%[outptrs], 40]\n"
- "ldr x22, [%[outptrs], 72]\n"
- "ldr x23, [%[outptrs], 104]\n"
- "ldr x25, [%[inptrs], 16]\n"
- "ldr x24, [%[inptrs], 104]\n"
- "str s17, [x20, x28]\n"
- "mov v16.16b, v25.16b\n"
- "str s0, [x21, x28]\n"
- "mov v18.16b, v25.16b\n"
- "str s12, [x22, x28]\n"
- "mov v13.16b, v25.16b\n"
- "str s10, [x23, x28]\n"
- "mov v0.16b, v25.16b\n"
- "fmla v2.4s, v27.4s, v22.4s\n"
- "ldr s30, [x25, x27]\n"
- "fmla v16.4s, v26.4s, v22.4s\n"
- "ldr x20, [%[outptrs], 24]\n"
- "mov v17.16b, v25.16b\n"
- "ldr x21, [%[outptrs], 48]\n"
- "str s1, [x20, x28]\n"
- "mov v14.16b, v25.16b\n"
- "str s15, [x21, x28]\n"
- "mov v12.16b, v25.16b\n"
- "mov v15.16b, v25.16b\n"
- "ldr x21, [%[outptrs], 56]\n"
- "fmla v2.4s, v26.4s, v19.4s\n"
- "ldr s27, [x7, x27]\n"
- "str s21, [x21, x28]\n"
- "ldr x22, [%[outptrs], 80]\n"
- "ldr s21, [x24, x27]\n"
- "ldr x23, [%[outptrs], 112]\n"
- "str s11, [x22, x28]\n"
- "fmla v2.4s, v31.4s, v9.4s\n"
- "str s20, [x23, x28]\n"
- "ldr x22, [%[outptrs], 88]\n"
- "ldr x23, [%[outptrs], 120]\n"
- "str s24, [x22, x28]\n"
- "str s23, [x23, x28]\n"
- "add x28, x28, #4\n"
- "bne 5b\n"
- "6:\n"
- "mov v1.16b, v25.16b\n"
- "ldr x17, [%[inptrs], 64]\n"
- "mov v10.16b, v25.16b\n"
- "ldr x25, [%[inptrs], 24]\n"
- "mov v11.16b, v25.16b\n"
- "ldr x15, [%[inptrs], 192]\n"
- "fmla v18.4s, v31.4s, v22.4s\n"
- "ldr s23, [x17, x27]\n"
- "fmla v2.4s, v28.4s, v5.4s\n"
- "ldr x7, [%[inptrs], 152]\n"
- "fmla v16.4s, v28.4s, v19.4s\n"
- "ldr x24, [%[inptrs], 112]\n"
- "fmla v13.4s, v28.4s, v22.4s\n"
- "ldr s26, [x25, x27]\n"
- "fmla v18.4s, v29.4s, v19.4s\n"
- "ldr x17, [%[inptrs], 72]\n"
- "fmla v2.4s, v29.4s, v7.4s\n"
- "ldr x25, [%[inptrs], 32]\n"
- "fmla v16.4s, v29.4s, v9.4s\n"
- "ldr x16, [%[inptrs], 240]\n"
- "fmla v0.4s, v29.4s, v22.4s\n"
- "ldr s28, [x15, x27]\n"
- "fmla v18.4s, v30.4s, v9.4s\n"
- "ldr x15, [%[inptrs], 200]\n"
- "fmla v2.4s, v30.4s, v8.4s\n"
- "ldr x20, [%[outptrs], 0]\n"
- "fmla v17.4s, v30.4s, v22.4s\n"
- "ldr s29, [x7, x27]\n"
- "fmla v16.4s, v27.4s, v5.4s\n"
- "ldr x7, [%[inptrs], 160]\n"
- "fmla v13.4s, v27.4s, v19.4s\n"
- "ldr x21, [%[outptrs], 32]\n"
- "fmla v14.4s, v27.4s, v22.4s\n"
- "ldr s20, [x24, x27]\n"
- "fmla v2.4s, v21.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 120]\n"
- "fmla v16.4s, v21.4s, v7.4s\n"
- "ldr x22, [%[outptrs], 64]\n"
- "fmla v18.4s, v21.4s, v5.4s\n"
- "ldr x23, [%[outptrs], 96]\n"
- "fmla v13.4s, v21.4s, v9.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v0.4s, v21.4s, v19.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v12.4s, v21.4s, v22.4s\n"
- "ldr s24, [x17, x27]\n"
- "fmla v2.4s, v23.4s, v6.4s\n"
- "ldr x17, [%[inptrs], 80]\n"
- "fmla v16.4s, v23.4s, v8.4s\n"
- "fmla v18.4s, v23.4s, v7.4s\n"
- "fmla v0.4s, v23.4s, v9.4s\n"
- "fmla v17.4s, v23.4s, v19.4s\n"
- "fmla v15.4s, v23.4s, v22.4s\n"
- "ldr s23, [x25, x27]\n"
- "fmla v1.4s, v26.4s, v22.4s\n"
- "ldr x25, [%[inptrs], 40]\n"
- "fmla v18.4s, v26.4s, v8.4s\n"
- "fmla v13.4s, v28.4s, v5.4s\n"
- "fmla v17.4s, v26.4s, v9.4s\n"
- "ldr s30, [x16, x27]\n"
- "fmla v14.4s, v28.4s, v19.4s\n"
- "ldr s26, [x15, x27]\n"
- "fmla v16.4s, v29.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 248]\n"
- "fmla v13.4s, v29.4s, v7.4s\n"
- "ldr x15, [%[inptrs], 208]\n"
- "fmla v0.4s, v29.4s, v5.4s\n"
- "fmla v12.4s, v29.4s, v19.4s\n"
- "fmla v14.4s, v29.4s, v9.4s\n"
- "fmla v10.4s, v29.4s, v22.4s\n"
- "mov v21.16b, v25.16b\n"
- "fmla v2.4s, v20.4s, v3.4s\n"
- "fmla v16.4s, v20.4s, v6.4s\n"
- "fmla v18.4s, v20.4s, v4.4s\n"
- "fmla v13.4s, v20.4s, v8.4s\n"
- "fmla v0.4s, v20.4s, v7.4s\n"
- "fmla v17.4s, v20.4s, v5.4s\n"
- "fmla v12.4s, v20.4s, v9.4s\n"
- "fmla v15.4s, v20.4s, v19.4s\n"
- "fmla v11.4s, v20.4s, v22.4s\n"
- "mov v20.16b, v25.16b\n"
- "fmla v18.4s, v24.4s, v6.4s\n"
- "fmla v0.4s, v24.4s, v8.4s\n"
- "fmla v1.4s, v24.4s, v19.4s\n"
- "fmla v17.4s, v24.4s, v7.4s\n"
- "fmla v21.4s, v24.4s, v22.4s\n"
- "fmla v15.4s, v24.4s, v9.4s\n"
- "ldr s27, [x7, x27]\n"
- "fmla v14.4s, v30.4s, v5.4s\n"
- "ldr s30, [x24, x27]\n"
- "fmla v1.4s, v23.4s, v9.4s\n"
- "ldr x7, [%[inptrs], 168]\n"
- "fmla v17.4s, v23.4s, v8.4s\n"
- "ldr s31, [x17, x27]\n"
- "fmla v13.4s, v26.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 128]\n"
- "fmla v14.4s, v26.4s, v7.4s\n"
- "ldr x17, [%[inptrs], 88]\n"
- "fmla v12.4s, v26.4s, v5.4s\n"
- "fmla v10.4s, v26.4s, v19.4s\n"
- "mov v24.16b, v25.16b\n"
- "mov v23.16b, v25.16b\n"
- "fmla v16.4s, v27.4s, v3.4s\n"
- "fmla v13.4s, v27.4s, v6.4s\n"
- "fmla v0.4s, v27.4s, v4.4s\n"
- "fmla v14.4s, v27.4s, v8.4s\n"
- "fmla v12.4s, v27.4s, v7.4s\n"
- "fmla v15.4s, v27.4s, v5.4s\n"
- "fmla v10.4s, v27.4s, v9.4s\n"
- "fmla v11.4s, v27.4s, v19.4s\n"
- "fmla v20.4s, v27.4s, v22.4s\n"
- "ldr s25, [x25, x27]\n"
- "fmla v18.4s, v30.4s, v3.4s\n"
- "fmla v0.4s, v30.4s, v6.4s\n"
- "fmla v17.4s, v30.4s, v4.4s\n"
- "fmla v12.4s, v30.4s, v8.4s\n"
- "fmla v15.4s, v30.4s, v7.4s\n"
- "fmla v1.4s, v30.4s, v5.4s\n"
- "fmla v11.4s, v30.4s, v9.4s\n"
- "fmla v21.4s, v30.4s, v19.4s\n"
- "fmla v24.4s, v30.4s, v22.4s\n"
- "ldr s26, [x16, x27]\n"
- "fmla v17.4s, v31.4s, v6.4s\n"
- "ldr x16, [%[inptrs], 256]\n"
- "fmla v15.4s, v31.4s, v8.4s\n"
- "fmla v1.4s, v31.4s, v7.4s\n"
- "fmla v21.4s, v31.4s, v9.4s\n"
- "ldr s31, [x15, x27]\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "ldr x15, [%[inptrs], 216]\n"
- "fmla v10.4s, v26.4s, v5.4s\n"
- "ldr s29, [x7, x27]\n"
- "fmla v1.4s, v25.4s, v8.4s\n"
- "ldr s28, [x24, x27]\n"
- "fmla v13.4s, v31.4s, v3.4s\n"
- "ldr x7, [%[inptrs], 176]\n"
- "fmla v14.4s, v31.4s, v6.4s\n"
- "ldr x24, [%[inptrs], 136]\n"
- "fmla v12.4s, v31.4s, v4.4s\n"
- "fmla v10.4s, v31.4s, v7.4s\n"
- "fmla v11.4s, v31.4s, v5.4s\n"
- "fmla v20.4s, v31.4s, v19.4s\n"
- "fmla v0.4s, v29.4s, v3.4s\n"
- "ldr s25, [x17, x27]\n"
- "fmla v15.4s, v29.4s, v4.4s\n"
- "fmla v21.4s, v29.4s, v5.4s\n"
- "fmla v12.4s, v29.4s, v6.4s\n"
- "fmla v10.4s, v29.4s, v8.4s\n"
- "fmla v11.4s, v29.4s, v7.4s\n"
- "fmla v20.4s, v29.4s, v9.4s\n"
- "fmla v24.4s, v29.4s, v19.4s\n"
- "fmla v23.4s, v29.4s, v22.4s\n"
- "fmla v17.4s, v28.4s, v3.4s\n"
- "ldr s29, [x16, x27]\n"
- "fmla v15.4s, v28.4s, v6.4s\n"
- "ldr s22, [x15, x27]\n"
- "fmla v1.4s, v28.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 264]\n"
- "fmla v11.4s, v28.4s, v8.4s\n"
- "ldr x15, [%[inptrs], 224]\n"
- "fmla v21.4s, v28.4s, v7.4s\n"
- "fmla v24.4s, v28.4s, v9.4s\n"
- "fmla v14.4s, v29.4s, v3.4s\n"
- "ldr s27, [x7, x27]\n"
- "fmla v1.4s, v25.4s, v6.4s\n"
- "ldr x7, [%[inptrs], 184]\n"
- "fmla v10.4s, v29.4s, v4.4s\n"
- "fmla v20.4s, v29.4s, v5.4s\n"
- "fmla v21.4s, v25.4s, v8.4s\n"
- "ldr s26, [x24, x27]\n"
- "fmla v12.4s, v22.4s, v3.4s\n"
- "ldr s25, [x16, x27]\n"
- "fmla v11.4s, v22.4s, v4.4s\n"
- "ldr x16, [%[inptrs], 272]\n"
- "fmla v10.4s, v22.4s, v6.4s\n"
- "fmla v20.4s, v22.4s, v7.4s\n"
- "fmla v24.4s, v22.4s, v5.4s\n"
- "fmla v23.4s, v22.4s, v19.4s\n"
- "fmla v15.4s, v27.4s, v3.4s\n"
- "ldr s31, [x15, x27]\n"
- "fmla v11.4s, v27.4s, v6.4s\n"
- "ldr s22, [x7, x27]\n"
- "fmla v21.4s, v27.4s, v4.4s\n"
- "ldr x15, [%[inptrs], 232]\n"
- "fmla v20.4s, v27.4s, v8.4s\n"
- "fmla v24.4s, v27.4s, v7.4s\n"
- "fmla v23.4s, v27.4s, v9.4s\n"
- "ldr s19, [x16, x27]\n"
- "fmla v1.4s, v26.4s, v3.4s\n"
- "ldr s28, [x15, x27]\n"
- "fmla v21.4s, v26.4s, v6.4s\n"
- "ldr x16, [%[inptrs], 280]\n"
- "fmla v24.4s, v26.4s, v8.4s\n"
- "fmla v10.4s, v25.4s, v3.4s\n"
- "fmla v20.4s, v25.4s, v4.4s\n"
- "ldr s30, [x16, x27]\n"
- "fmla v23.4s, v25.4s, v5.4s\n"
- "add x27, x27, #4\n"
- "fmla v11.4s, v31.4s, v3.4s\n"
- "fmla v21.4s, v22.4s, v3.4s\n"
- "fmla v24.4s, v31.4s, v4.4s\n"
- "movi v29.16b, #0\n"
- "fmla v20.4s, v31.4s, v6.4s\n"
- "fmla v23.4s, v31.4s, v7.4s\n"
- "fmax v2.4s, v2.4s, v29.4s\n"
- "fmax v18.4s, v18.4s, v29.4s\n"
- "fmla v24.4s, v22.4s, v6.4s\n"
- "fmax v17.4s, v17.4s, v29.4s\n"
- "fmla v20.4s, v19.4s, v3.4s\n"
- "fmax v1.4s, v1.4s, v29.4s\n"
- "str s2, [x20, x28]\n"
- "fmla v23.4s, v22.4s, v8.4s\n"
- "fmax v16.4s, v16.4s, v29.4s\n"
- "ldr x20, [%[outptrs], 8]\n"
- "fmla v24.4s, v28.4s, v3.4s\n"
- "fmax v0.4s, v0.4s, v29.4s\n"
- "str s18, [x20, x28]\n"
- "fmax v15.4s, v15.4s, v29.4s\n"
- "str s16, [x21, x28]\n"
- "fmla v23.4s, v19.4s, v4.4s\n"
- "fmax v21.4s, v21.4s, v29.4s\n"
- "ldr x20, [%[outptrs], 16]\n"
- "fmax v13.4s, v13.4s, v29.4s\n"
- "ldr x21, [%[outptrs], 40]\n"
- "str s17, [x20, x28]\n"
- "fmax v12.4s, v12.4s, v29.4s\n"
- "str s0, [x21, x28]\n"
- "fmla v23.4s, v28.4s, v6.4s\n"
- "str s13, [x22, x28]\n"
- "fmax v11.4s, v11.4s, v29.4s\n"
- "fmax v24.4s, v24.4s, v29.4s\n"
- "ldr x20, [%[outptrs], 24]\n"
- "fmax v14.4s, v14.4s, v29.4s\n"
- "ldr x21, [%[outptrs], 48]\n"
- "str s1, [x20, x28]\n"
- "fmla v23.4s, v30.4s, v3.4s\n"
- "str s15, [x21, x28]\n"
- "fmax v10.4s, v10.4s, v29.4s\n"
- "str s14, [x23, x28]\n"
- "fmax v20.4s, v20.4s, v29.4s\n"
- "ldr x21, [%[outptrs], 56]\n"
- "ldr x22, [%[outptrs], 72]\n"
- "ldr x23, [%[outptrs], 104]\n"
- "fmax v23.4s, v23.4s, v29.4s\n"
- "str s21, [x21, x28]\n"
- "str s12, [x22, x28]\n"
- "str s10, [x23, x28]\n"
- "ldr x22, [%[outptrs], 80]\n"
- "ldr x23, [%[outptrs], 112]\n"
- "str s11, [x22, x28]\n"
- "str s20, [x23, x28]\n"
- "ldr x22, [%[outptrs], 88]\n"
- "ldr x23, [%[outptrs], 120]\n"
- "str s24, [x22, x28]\n"
- "str s23, [x23, x28]\n"
- "add x28, x28, #4\n"
- "7:\n"
- : [wbptr] "+r" (weight_bias_ptr)
- : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs)
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
- );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU6>(
- int n_channels,
- const void *weight_bias_ptr,
- const float *input,
- const unsigned int input_row_stride,
- const unsigned int input_col_stride,
- float *output,
- const unsigned int output_row_stride,
- const unsigned int output_col_stride
-)
-{
- __asm __volatile(
- "add x24, %[inptr0], %[input_row_stride]\n"
- "add x13, %[input_col_stride1], %[input_col_stride1]\n"
- "add x8, %[outptr0], %[output_row_stride]\n"
- "add x9, x24, %[input_row_stride]\n"
- "add x10, x13, #64\n"
- "add x19, x13, %[input_col_stride1]\n"
- "add x20, x9, %[input_row_stride]\n"
- "add x21, x19, #64\n"
- "add x17, x19, %[input_col_stride1]\n"
- "add x22, x20, %[input_row_stride]\n"
- "add x7, x17, #64\n"
- "add x11, x17, %[input_col_stride1]\n"
- "add x23, x22, %[input_row_stride]\n"
- "add x12, x11, #64\n"
- "add x25, x8, %[output_row_stride]\n"
- "add x26, x25, %[output_row_stride]\n"
- "add x27, %[output_col_stride1], %[output_col_stride1]\n"
- "and x14, %[n_channels], #3\n"
- "add x28, x27, %[output_col_stride1]\n"
- "lsr x15, %[n_channels], #2\n"
- "cbz x15, 4f\n"
- "1:\n"
- "ldr q23, [%[wbptr]]\n"
- "subs x15, x15, #1\n"
- "mov v12.16b, v23.16b\n"
- "ldr q20, [%[wbptr], #16]\n"
- "mov v8.16b, v23.16b\n"
- "ldr q6, [%[wbptr], #32]\n"
- "mov v11.16b, v23.16b\n"
- "ldr q5, [%[wbptr], #48]\n"
- "mov v16.16b, v23.16b\n"
- "ldr q19, [%[wbptr], #64]\n"
- "mov v7.16b, v23.16b\n"
- "ldr q4, [%[wbptr], #80]\n"
- "mov v10.16b, v23.16b\n"
- "ldr q3, [%[wbptr], #96]\n"
- "mov v14.16b, v23.16b\n"
- "ldr q2, [%[wbptr], #112]\n"
- "mov v15.16b, v23.16b\n"
- "ldr q1, [%[wbptr], #128]\n"
- "mov v17.16b, v23.16b\n"
- "ldr q0, [%[wbptr], #144]\n"
- "mov v9.16b, v23.16b\n"
- "ldr q28, [%[inptr0]]\n"
- "fmla v12.4s, v28.4s, v20.4s\n"
- "ldr q25, [x24]\n"
- "fmla v8.4s, v25.4s, v20.4s\n"
- "ldr q18, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v11.4s, v18.4s, v20.4s\n"
- "ldr q30, [x9]\n"
- "fmla v12.4s, v25.4s, v19.4s\n"
- "ldr q29, [x24, %[input_col_stride1]]\n"
- "fmla v8.4s, v30.4s, v19.4s\n"
- "ldr q24, [%[inptr0], x13]\n"
- "fmla v16.4s, v30.4s, v20.4s\n"
- "ldr q27, [x20]\n"
- "fmla v12.4s, v18.4s, v6.4s\n"
- "ldr q22, [x9, %[input_col_stride1]]\n"
- "fmla v8.4s, v29.4s, v6.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x24, #64]\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v12.4s, v30.4s, v2.4s\n"
- "prfm pldl1keep, [x9, #64]\n"
- "prfm pldl1keep, [x24, x16]\n"
- "prfm pldl1keep, [%[inptr0], x10]\n"
- "prfm pldl1keep, [x20, #64]\n"
- "prfm pldl1keep, [x9, x16]\n"
- "fmla v12.4s, v29.4s, v4.4s\n"
- "beq 3f\n"
- "2:\n"
- "mov v13.16b, v23.16b\n"
- "ldr q21, [x24, x13]\n"
- "mov v18.16b, v23.16b\n"
- "prfm pldl1keep, [x24, x10]\n"
- "fmla v11.4s, v29.4s, v19.4s\n"
- "prfm pldl1keep, [%[inptr0], x21]\n"
- "fmla v7.4s, v29.4s, v20.4s\n"
- "ldr q25, [%[inptr0], x19]\n"
- "fmla v12.4s, v24.4s, v5.4s\n"
- "prfm pldl1keep, [x22, #64]\n"
- "fmla v11.4s, v24.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x16]\n"
- "fmla v10.4s, v24.4s, v20.4s\n"
- "ldr q24, [x22]\n"
- "fmla v8.4s, v27.4s, v2.4s\n"
- "prfm pldl1keep, [x9, x10]\n"
- "fmla v16.4s, v27.4s, v19.4s\n"
- "prfm pldl1keep, [x24, x21]\n"
- "fmla v14.4s, v27.4s, v20.4s\n"
- "ldr q26, [x20, %[input_col_stride1]]\n"
- "fmla v12.4s, v22.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], x7]\n"
- "fmla v8.4s, v22.4s, v4.4s\n"
- "prfm pldl1keep, [x23, #64]\n"
- "fmla v11.4s, v22.4s, v2.4s\n"
- "prfm pldl1keep, [x22, x16]\n"
- "fmla v16.4s, v22.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x10]\n"
- "fmla v7.4s, v22.4s, v19.4s\n"
- "prfm pldl1keep, [x9, x21]\n"
- "fmla v15.4s, v22.4s, v20.4s\n"
- "ldr q30, [x9, x13]\n"
- "fmla v12.4s, v21.4s, v3.4s\n"
- "prfm pldl1keep, [x24, x7]\n"
- "fmla v8.4s, v21.4s, v5.4s\n"
- "prfm pldl1keep, [%[inptr0], x12]\n"
- "fmla v11.4s, v21.4s, v4.4s\n"
- "prfm pldl1keep, [x23, x16]\n"
- "fmla v7.4s, v21.4s, v6.4s\n"
- "prfm pldl1keep, [x22, x10]\n"
- "fmla v10.4s, v21.4s, v19.4s\n"
- "prfm pldl1keep, [x20, x21]\n"
- "fmla v17.4s, v21.4s, v20.4s\n"
- "ldr q22, [x24, x19]\n"
- "fmla v11.4s, v25.4s, v5.4s\n"
- "prfm pldl1keep, [x9, x7]\n"
- "fmla v10.4s, v25.4s, v6.4s\n"
- "prfm pldl1keep, [x24, x12]\n"
- "fmla v9.4s, v25.4s, v20.4s\n"
- "ldr q21, [%[inptr0], x17]\n"
- "fmla v16.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x10]\n"
- "fmla v14.4s, v24.4s, v19.4s\n"
- "ldr q24, [x23]\n"
- "fmla v8.4s, v26.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x21]\n"
- "fmla v16.4s, v26.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x7]\n"
- "fmla v7.4s, v26.4s, v2.4s\n"
- "prfm pldl1keep, [x9, x12]\n"
- "fmla v14.4s, v26.4s, v6.4s\n"
- "prfm pldl1keep, [x23, x21]\n"
- "fmla v15.4s, v26.4s, v19.4s\n"
- "prfm pldl1keep, [x22, x7]\n"
- "fmla v13.4s, v26.4s, v20.4s\n"
- "ldr q26, [x22, %[input_col_stride1]]\n"
- "fmla v12.4s, v30.4s, v0.4s\n"
- "prfm pldl1keep, [x20, x12]\n"
- "fmla v8.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x23, x7]\n"
- "fmla v11.4s, v30.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x12]\n"
- "fmla v16.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x23, x12]\n"
- "fmla v7.4s, v30.4s, v4.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v10.4s, v30.4s, v2.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v15.4s, v30.4s, v6.4s\n"
- "subs x15, x15, #1\n"
- "fmla v17.4s, v30.4s, v19.4s\n"
- "fmla v18.4s, v30.4s, v20.4s\n"
- "mov v25.16b, v23.16b\n"
- "fmla v11.4s, v22.4s, v3.4s\n"
- "fmla v7.4s, v22.4s, v5.4s\n"
- "fmla v10.4s, v22.4s, v4.4s\n"
- "fmla v17.4s, v22.4s, v6.4s\n"
- "fmla v9.4s, v22.4s, v19.4s\n"
- "fmla v25.4s, v22.4s, v20.4s\n"
- "ldr q27, [x20, x13]\n"
- "fmla v10.4s, v21.4s, v5.4s\n"
- "fmla v14.4s, v24.4s, v2.4s\n"
- "mov v22.16b, v23.16b\n"
- "fmla v9.4s, v21.4s, v6.4s\n"
- "mov v24.16b, v23.16b\n"
- "mov v21.16b, v23.16b\n"
- "fmla v16.4s, v26.4s, v1.4s\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "fmla v15.4s, v26.4s, v2.4s\n"
- "fmla v13.4s, v26.4s, v19.4s\n"
- "fmla v8.4s, v27.4s, v0.4s\n"
- "ldr q28, [x9, x19]\n"
- "fmla v16.4s, v27.4s, v3.4s\n"
- "fmla v7.4s, v27.4s, v1.4s\n"
- "fmla v14.4s, v27.4s, v5.4s\n"
- "fmla v15.4s, v27.4s, v4.4s\n"
- "fmla v17.4s, v27.4s, v2.4s\n"
- "fmla v13.4s, v27.4s, v6.4s\n"
- "fmla v18.4s, v27.4s, v19.4s\n"
- "fmla v22.4s, v27.4s, v20.4s\n"
- "fmla v11.4s, v28.4s, v0.4s\n"
- "ldr q29, [x24, x17]\n"
- "fmla v7.4s, v28.4s, v3.4s\n"
- "fmla v10.4s, v28.4s, v1.4s\n"
- "fmla v15.4s, v28.4s, v5.4s\n"
- "fmla v17.4s, v28.4s, v4.4s\n"
- "fmla v9.4s, v28.4s, v2.4s\n"
- "fmla v18.4s, v28.4s, v6.4s\n"
- "fmla v25.4s, v28.4s, v19.4s\n"
- "fmla v24.4s, v28.4s, v20.4s\n"
- "fmla v10.4s, v29.4s, v3.4s\n"
- "ldr q23, [%[inptr0], x11]\n"
- "fmla v17.4s, v29.4s, v5.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v9.4s, v29.4s, v4.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v25.4s, v29.4s, v6.4s\n"
- "ldr q30, [x23, %[input_col_stride1]]\n"
- "fmla v14.4s, v30.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v9.4s, v23.4s, v5.4s\n"
- "ldr q23, [x22, x13]\n"
- "fmla v13.4s, v30.4s, v2.4s\n"
- "ldr q29, [x20, x19]\n"
- "fmla v16.4s, v23.4s, v0.4s\n"
- "prfm pldl1keep, [%[inptr0], x10]\n"
- "fmla v14.4s, v23.4s, v3.4s\n"
- "fmla v15.4s, v23.4s, v1.4s\n"
- "fmla v13.4s, v23.4s, v4.4s\n"
- "fmla v18.4s, v23.4s, v2.4s\n"
- "fmla v22.4s, v23.4s, v19.4s\n"
- "ldr q23, [x9, x17]\n"
- "fmla v7.4s, v29.4s, v0.4s\n"
- "fmla v15.4s, v29.4s, v3.4s\n"
- "fmla v17.4s, v29.4s, v1.4s\n"
- "fmla v13.4s, v29.4s, v5.4s\n"
- "fmla v18.4s, v29.4s, v4.4s\n"
- "fmla v25.4s, v29.4s, v2.4s\n"
- "fmla v22.4s, v29.4s, v6.4s\n"
- "fmla v24.4s, v29.4s, v19.4s\n"
- "fmla v21.4s, v29.4s, v20.4s\n"
- "ldr q26, [x24, x11]\n"
- "fmla v10.4s, v23.4s, v0.4s\n"
- "ldr q28, [x23, x13]\n"
- "fmla v17.4s, v23.4s, v3.4s\n"
- "add x24, x24, #16\n"
- "fmla v9.4s, v23.4s, v1.4s\n"
- "prfm pldl1keep, [x24, #64]\n"
- "fmla v18.4s, v23.4s, v5.4s\n"
- "prfm pldl1keep, [x24, x16]\n"
- "fmla v25.4s, v23.4s, v4.4s\n"
- "fmla v24.4s, v23.4s, v6.4s\n"
- "fmla v9.4s, v26.4s, v3.4s\n"
- "ldr q20, [x22, x19]\n"
- "fmla v14.4s, v28.4s, v0.4s\n"
- "fmla v13.4s, v28.4s, v1.4s\n"
- "fmla v25.4s, v26.4s, v5.4s\n"
- "ldr q26, [x20, x17]\n"
- "fmla v22.4s, v28.4s, v2.4s\n"
- "ldr q23, [x9, x11]\n"
- "fmla v15.4s, v20.4s, v0.4s\n"
- "add x9, x9, #16\n"
- "fmla v13.4s, v20.4s, v3.4s\n"
- "prfm pldl1keep, [x9, #64]\n"
- "fmla v18.4s, v20.4s, v1.4s\n"
- "prfm pldl1keep, [x9, x16]\n"
- "fmla v22.4s, v20.4s, v4.4s\n"
- "fmla v24.4s, v20.4s, v2.4s\n"
- "fmla v21.4s, v20.4s, v19.4s\n"
- "ldr q27, [x23, x19]\n"
- "fmla v17.4s, v26.4s, v0.4s\n"
- "ldr q20, [x22, x17]\n"
- "fmla v18.4s, v26.4s, v3.4s\n"
- "fmla v25.4s, v26.4s, v1.4s\n"
- "fmla v22.4s, v26.4s, v5.4s\n"
- "fmla v24.4s, v26.4s, v4.4s\n"
- "fmla v21.4s, v26.4s, v6.4s\n"
- "ldr q19, [x20, x11]\n"
- "fmla v9.4s, v23.4s, v0.4s\n"
- "ldr q28, [x23, x17]\n"
- "fmla v25.4s, v23.4s, v3.4s\n"
- "add x20, x20, #16\n"
- "fmla v24.4s, v23.4s, v5.4s\n"
- "ldr q29, [x22, x11]\n"
- "fmla v13.4s, v27.4s, v0.4s\n"
- "prfm pldl1keep, [x20, #64]\n"
- "fmla v22.4s, v27.4s, v1.4s\n"
- "add x22, x22, #16\n"
- "fmla v21.4s, v27.4s, v2.4s\n"
- "ldr q30, [x23, x11]\n"
- "fmla v18.4s, v20.4s, v0.4s\n"
- "ldr q23, [%[wbptr]]\n"
- "fmla v22.4s, v20.4s, v3.4s\n"
- "add x23, x23, #16\n"
- "fmla v24.4s, v20.4s, v1.4s\n"
- "fmla v21.4s, v20.4s, v4.4s\n"
- "fmla v25.4s, v19.4s, v0.4s\n"
- "ldr q20, [%[wbptr], #16]\n"
- "fmla v22.4s, v28.4s, v0.4s\n"
- "ldr q6, [%[wbptr], #32]\n"
- "fmla v21.4s, v19.4s, v5.4s\n"
- "movi v26.16b, #0\n"
- "fmla v24.4s, v19.4s, v3.4s\n"
- "ldr q19, [%[wbptr], #64]\n"
- "fmax v12.4s, v12.4s, v26.4s\n"
- "fmax v11.4s, v11.4s, v26.4s\n"
- "fmla v21.4s, v28.4s, v1.4s\n"
- "ldr q5, [%[wbptr], #48]\n"
- "fmla v24.4s, v29.4s, v0.4s\n"
- "ldr q4, [%[wbptr], #80]\n"
- "fmax v10.4s, v10.4s, v26.4s\n"
- "fmax v9.4s, v9.4s, v26.4s\n"
- "fmla v21.4s, v29.4s, v3.4s\n"
- "ldr q2, [%[wbptr], #112]\n"
- "fmov v27.4s, #6.0\n"
- "fmax v8.4s, v8.4s, v26.4s\n"
- "fmax v7.4s, v7.4s, v26.4s\n"
- "fmax v17.4s, v17.4s, v26.4s\n"
- "fmla v21.4s, v30.4s, v0.4s\n"
- "ldr q3, [%[wbptr], #96]\n"
- "fmin v12.4s, v12.4s, v27.4s\n"
- "ldr q1, [%[wbptr], #128]\n"
- "fmin v11.4s, v11.4s, v27.4s\n"
- "fmin v10.4s, v10.4s, v27.4s\n"
- "str q12, [%[outptr0]]\n"
- "fmin v9.4s, v9.4s, v27.4s\n"
- "str q11, [%[outptr0], %[output_col_stride1]]\n"
- "fmin v8.4s, v8.4s, v27.4s\n"
- "str q10, [%[outptr0], x27]\n"
- "fmin v7.4s, v7.4s, v27.4s\n"
- "str q9, [%[outptr0], x28]\n"
- "fmin v17.4s, v17.4s, v27.4s\n"
- "str q8, [x8]\n"
- "fmax v25.4s, v25.4s, v26.4s\n"
- "str q7, [x8, %[output_col_stride1]]\n"
- "fmax v16.4s, v16.4s, v26.4s\n"
- "str q17, [x8, x27]\n"
- "fmin v25.4s, v25.4s, v27.4s\n"
- "fmin v16.4s, v16.4s, v27.4s\n"
- "ldr q0, [%[wbptr], #144]\n"
- "str q25, [x8, x28]\n"
- "fmax v15.4s, v15.4s, v26.4s\n"
- "str q16, [x25]\n"
- "fmax v18.4s, v18.4s, v26.4s\n"
- "fmin v15.4s, v15.4s, v27.4s\n"
- "ldr q28, [%[inptr0]]\n"
- "fmin v18.4s, v18.4s, v27.4s\n"
- "ldr q25, [x24]\n"
- "str q15, [x25, %[output_col_stride1]]\n"
- "fmax v24.4s, v24.4s, v26.4s\n"
- "str q18, [x25, x27]\n"
- "fmax v14.4s, v14.4s, v26.4s\n"
- "fmin v24.4s, v24.4s, v27.4s\n"
- "ldr q18, [%[inptr0], %[input_col_stride1]]\n"
- "fmin v14.4s, v14.4s, v27.4s\n"
- "ldr q30, [x9]\n"
- "str q24, [x25, x28]\n"
- "fmax v13.4s, v13.4s, v26.4s\n"
- "str q14, [x26]\n"
- "fmax v22.4s, v22.4s, v26.4s\n"
- "fmin v13.4s, v13.4s, v27.4s\n"
- "ldr q29, [x24, %[input_col_stride1]]\n"
- "fmin v22.4s, v22.4s, v27.4s\n"
- "ldr q24, [%[inptr0], x13]\n"
- "str q13, [x26, %[output_col_stride1]]\n"
- "fmax v21.4s, v21.4s, v26.4s\n"
- "str q22, [x26, x27]\n"
- "mov v12.16b, v23.16b\n"
- "fmin v21.4s, v21.4s, v27.4s\n"
- "ldr q27, [x20]\n"
- "mov v8.16b, v23.16b\n"
- "ldr q22, [x9, %[input_col_stride1]]\n"
- "str q21, [x26, x28]\n"
- "mov v11.16b, v23.16b\n"
- "mov v16.16b, v23.16b\n"
- "add %[outptr0], %[outptr0], #16\n"
- "mov v7.16b, v23.16b\n"
- "add x8, x8, #16\n"
- "mov v10.16b, v23.16b\n"
- "add x25, x25, #16\n"
- "mov v14.16b, v23.16b\n"
- "add x26, x26, #16\n"
- "mov v15.16b, v23.16b\n"
- "mov v17.16b, v23.16b\n"
- "mov v9.16b, v23.16b\n"
- "fmla v12.4s, v28.4s, v20.4s\n"
- "fmla v8.4s, v25.4s, v20.4s\n"
- "fmla v11.4s, v18.4s, v20.4s\n"
- "fmla v16.4s, v30.4s, v20.4s\n"
- "fmla v12.4s, v25.4s, v19.4s\n"
- "fmla v8.4s, v30.4s, v19.4s\n"
- "fmla v12.4s, v18.4s, v6.4s\n"
- "fmla v8.4s, v29.4s, v6.4s\n"
- "fmla v12.4s, v30.4s, v2.4s\n"
- "fmla v12.4s, v29.4s, v4.4s\n"
- "bne 2b\n"
- "3:\n"
- "mov v13.16b, v23.16b\n"
- "ldr q21, [x24, x13]\n"
- "mov v18.16b, v23.16b\n"
- "prfm pldl1keep, [x24, x10]\n"
- "fmla v11.4s, v29.4s, v19.4s\n"
- "prfm pldl1keep, [%[inptr0], x21]\n"
- "fmla v7.4s, v29.4s, v20.4s\n"
- "ldr q25, [%[inptr0], x19]\n"
- "fmla v12.4s, v24.4s, v5.4s\n"
- "prfm pldl1keep, [x22, #64]\n"
- "fmla v11.4s, v24.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x16]\n"
- "fmla v10.4s, v24.4s, v20.4s\n"
- "ldr q24, [x22]\n"
- "fmla v8.4s, v27.4s, v2.4s\n"
- "prfm pldl1keep, [x9, x10]\n"
- "fmla v16.4s, v27.4s, v19.4s\n"
- "prfm pldl1keep, [x24, x21]\n"
- "fmla v14.4s, v27.4s, v20.4s\n"
- "ldr q26, [x20, %[input_col_stride1]]\n"
- "fmla v12.4s, v22.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], x7]\n"
- "fmla v8.4s, v22.4s, v4.4s\n"
- "prfm pldl1keep, [x23, #64]\n"
- "fmla v11.4s, v22.4s, v2.4s\n"
- "prfm pldl1keep, [x22, x16]\n"
- "fmla v16.4s, v22.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x10]\n"
- "fmla v7.4s, v22.4s, v19.4s\n"
- "prfm pldl1keep, [x9, x21]\n"
- "fmla v15.4s, v22.4s, v20.4s\n"
- "ldr q30, [x9, x13]\n"
- "fmla v12.4s, v21.4s, v3.4s\n"
- "prfm pldl1keep, [x24, x7]\n"
- "fmla v8.4s, v21.4s, v5.4s\n"
- "prfm pldl1keep, [%[inptr0], x12]\n"
- "fmla v11.4s, v21.4s, v4.4s\n"
- "prfm pldl1keep, [x23, x16]\n"
- "fmla v7.4s, v21.4s, v6.4s\n"
- "prfm pldl1keep, [x22, x10]\n"
- "fmla v10.4s, v21.4s, v19.4s\n"
- "prfm pldl1keep, [x20, x21]\n"
- "fmla v17.4s, v21.4s, v20.4s\n"
- "ldr q22, [x24, x19]\n"
- "fmla v11.4s, v25.4s, v5.4s\n"
- "prfm pldl1keep, [x9, x7]\n"
- "fmla v10.4s, v25.4s, v6.4s\n"
- "prfm pldl1keep, [x24, x12]\n"
- "fmla v9.4s, v25.4s, v20.4s\n"
- "ldr q21, [%[inptr0], x17]\n"
- "fmla v16.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x10]\n"
- "fmla v14.4s, v24.4s, v19.4s\n"
- "ldr q24, [x23]\n"
- "fmla v8.4s, v26.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x21]\n"
- "fmla v16.4s, v26.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x7]\n"
- "fmla v7.4s, v26.4s, v2.4s\n"
- "prfm pldl1keep, [x9, x12]\n"
- "fmla v14.4s, v26.4s, v6.4s\n"
- "prfm pldl1keep, [x23, x21]\n"
- "fmla v15.4s, v26.4s, v19.4s\n"
- "prfm pldl1keep, [x22, x7]\n"
- "fmla v13.4s, v26.4s, v20.4s\n"
- "ldr q26, [x22, %[input_col_stride1]]\n"
- "fmla v12.4s, v30.4s, v0.4s\n"
- "prfm pldl1keep, [x20, x12]\n"
- "fmla v8.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x23, x7]\n"
- "fmla v11.4s, v30.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x12]\n"
- "fmla v16.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x23, x12]\n"
- "fmla v7.4s, v30.4s, v4.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v10.4s, v30.4s, v2.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v15.4s, v30.4s, v6.4s\n"
- "fmla v17.4s, v30.4s, v19.4s\n"
- "fmla v18.4s, v30.4s, v20.4s\n"
- "ldr q27, [x20, x13]\n"
- "fmla v11.4s, v22.4s, v3.4s\n"
- "fmla v7.4s, v22.4s, v5.4s\n"
- "fmla v10.4s, v22.4s, v4.4s\n"
- "fmla v17.4s, v22.4s, v6.4s\n"
- "fmla v9.4s, v22.4s, v19.4s\n"
- "fmla v14.4s, v24.4s, v2.4s\n"
- "mov v25.16b, v23.16b\n"
- "fmla v16.4s, v26.4s, v1.4s\n"
- "fmla v10.4s, v21.4s, v5.4s\n"
- "fmla v15.4s, v26.4s, v2.4s\n"
- "fmla v25.4s, v22.4s, v20.4s\n"
- "ldr q28, [x9, x19]\n"
- "fmla v9.4s, v21.4s, v6.4s\n"
- "ldr q29, [x24, x17]\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "fmla v13.4s, v26.4s, v19.4s\n"
- "mov v22.16b, v23.16b\n"
- "fmla v8.4s, v27.4s, v0.4s\n"
- "fmla v16.4s, v27.4s, v3.4s\n"
- "fmla v7.4s, v27.4s, v1.4s\n"
- "fmla v14.4s, v27.4s, v5.4s\n"
- "fmla v15.4s, v27.4s, v4.4s\n"
- "fmla v17.4s, v27.4s, v2.4s\n"
- "fmla v13.4s, v27.4s, v6.4s\n"
- "fmla v18.4s, v27.4s, v19.4s\n"
- "fmla v22.4s, v27.4s, v20.4s\n"
- "mov v24.16b, v23.16b\n"
- "mov v21.16b, v23.16b\n"
- "fmla v11.4s, v28.4s, v0.4s\n"
- "fmla v7.4s, v28.4s, v3.4s\n"
- "fmla v10.4s, v28.4s, v1.4s\n"
- "fmla v15.4s, v28.4s, v5.4s\n"
- "fmla v17.4s, v28.4s, v4.4s\n"
- "fmla v9.4s, v28.4s, v2.4s\n"
- "fmla v18.4s, v28.4s, v6.4s\n"
- "fmla v25.4s, v28.4s, v19.4s\n"
- "fmla v24.4s, v28.4s, v20.4s\n"
- "ldr q23, [%[inptr0], x11]\n"
- "fmla v10.4s, v29.4s, v3.4s\n"
- "add %[inptr0], %[inptr0], #16\n"
- "fmla v17.4s, v29.4s, v5.4s\n"
- "fmla v9.4s, v29.4s, v4.4s\n"
- "fmla v25.4s, v29.4s, v6.4s\n"
- "ldr q30, [x23, %[input_col_stride1]]\n"
- "fmla v14.4s, v30.4s, v1.4s\n"
- "fmla v13.4s, v30.4s, v2.4s\n"
- "fmla v9.4s, v23.4s, v5.4s\n"
- "ldr q23, [x22, x13]\n"
- "fmla v16.4s, v23.4s, v0.4s\n"
- "ldr q29, [x20, x19]\n"
- "fmla v14.4s, v23.4s, v3.4s\n"
- "fmla v15.4s, v23.4s, v1.4s\n"
- "fmla v13.4s, v23.4s, v4.4s\n"
- "fmla v18.4s, v23.4s, v2.4s\n"
- "fmla v22.4s, v23.4s, v19.4s\n"
- "ldr q23, [x9, x17]\n"
- "fmla v7.4s, v29.4s, v0.4s\n"
- "fmla v15.4s, v29.4s, v3.4s\n"
- "fmla v17.4s, v29.4s, v1.4s\n"
- "fmla v13.4s, v29.4s, v5.4s\n"
- "fmla v18.4s, v29.4s, v4.4s\n"
- "fmla v25.4s, v29.4s, v2.4s\n"
- "fmla v22.4s, v29.4s, v6.4s\n"
- "fmla v24.4s, v29.4s, v19.4s\n"
- "fmla v21.4s, v29.4s, v20.4s\n"
- "ldr q26, [x24, x11]\n"
- "fmla v10.4s, v23.4s, v0.4s\n"
- "ldr q28, [x23, x13]\n"
- "fmla v17.4s, v23.4s, v3.4s\n"
- "add x24, x24, #16\n"
- "fmla v9.4s, v23.4s, v1.4s\n"
- "fmla v18.4s, v23.4s, v5.4s\n"
- "fmla v25.4s, v23.4s, v4.4s\n"
- "fmla v24.4s, v23.4s, v6.4s\n"
- "fmla v14.4s, v28.4s, v0.4s\n"
- "ldr q20, [x22, x19]\n"
- "fmla v9.4s, v26.4s, v3.4s\n"
- "fmla v13.4s, v28.4s, v1.4s\n"
- "fmla v25.4s, v26.4s, v5.4s\n"
- "ldr q26, [x20, x17]\n"
- "fmla v22.4s, v28.4s, v2.4s\n"
- "ldr q23, [x9, x11]\n"
- "fmla v15.4s, v20.4s, v0.4s\n"
- "add x9, x9, #16\n"
- "fmla v13.4s, v20.4s, v3.4s\n"
- "fmla v18.4s, v20.4s, v1.4s\n"
- "fmla v22.4s, v20.4s, v4.4s\n"
- "fmla v24.4s, v20.4s, v2.4s\n"
- "fmla v21.4s, v20.4s, v19.4s\n"
- "ldr q27, [x23, x19]\n"
- "fmla v17.4s, v26.4s, v0.4s\n"
- "ldr q20, [x22, x17]\n"
- "fmla v18.4s, v26.4s, v3.4s\n"
- "fmla v25.4s, v26.4s, v1.4s\n"
- "fmla v22.4s, v26.4s, v5.4s\n"
- "fmla v24.4s, v26.4s, v4.4s\n"
- "fmla v21.4s, v26.4s, v6.4s\n"
- "ldr q19, [x20, x11]\n"
- "fmla v9.4s, v23.4s, v0.4s\n"
- "ldr q28, [x23, x17]\n"
- "fmla v25.4s, v23.4s, v3.4s\n"
- "add x20, x20, #16\n"
- "fmla v24.4s, v23.4s, v5.4s\n"
- "ldr q29, [x22, x11]\n"
- "fmla v13.4s, v27.4s, v0.4s\n"
- "add x22, x22, #16\n"
- "fmla v22.4s, v27.4s, v1.4s\n"
- "fmla v21.4s, v27.4s, v2.4s\n"
- "fmla v18.4s, v20.4s, v0.4s\n"
- "ldr q30, [x23, x11]\n"
- "fmla v24.4s, v20.4s, v1.4s\n"
- "add x23, x23, #16\n"
- "fmla v22.4s, v20.4s, v3.4s\n"
- "fmla v21.4s, v20.4s, v4.4s\n"
- "fmla v25.4s, v19.4s, v0.4s\n"
- "movi v26.16b, #0\n"
- "fmla v24.4s, v19.4s, v3.4s\n"
- "fmov v27.4s, #6.0\n"
- "fmla v21.4s, v19.4s, v5.4s\n"
- "fmla v22.4s, v28.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v26.4s\n"
- "fmax v11.4s, v11.4s, v26.4s\n"
- "fmla v24.4s, v29.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v26.4s\n"
- "fmla v21.4s, v28.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v27.4s\n"
- "fmin v11.4s, v11.4s, v27.4s\n"
- "fmin v10.4s, v10.4s, v27.4s\n"
- "str q12, [%[outptr0]]\n"
- "fmax v9.4s, v9.4s, v26.4s\n"
- "str q11, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v21.4s, v29.4s, v3.4s\n"
- "str q10, [%[outptr0], x27]\n"
- "fmin v9.4s, v9.4s, v27.4s\n"
- "fmax v8.4s, v8.4s, v26.4s\n"
- "fmax v7.4s, v7.4s, v26.4s\n"
- "str q9, [%[outptr0], x28]\n"
- "fmla v21.4s, v30.4s, v0.4s\n"
- "fmin v8.4s, v8.4s, v27.4s\n"
- "add %[outptr0], %[outptr0], #16\n"
- "fmin v7.4s, v7.4s, v27.4s\n"
- "fmax v17.4s, v17.4s, v26.4s\n"
- "str q8, [x8]\n"
- "fmax v25.4s, v25.4s, v26.4s\n"
- "str q7, [x8, %[output_col_stride1]]\n"
- "fmin v17.4s, v17.4s, v27.4s\n"
- "fmin v25.4s, v25.4s, v27.4s\n"
- "fmax v16.4s, v16.4s, v26.4s\n"
- "str q17, [x8, x27]\n"
- "fmax v15.4s, v15.4s, v26.4s\n"
- "str q25, [x8, x28]\n"
- "fmin v16.4s, v16.4s, v27.4s\n"
- "fmin v15.4s, v15.4s, v27.4s\n"
- "add x8, x8, #16\n"
- "str q16, [x25]\n"
- "fmax v18.4s, v18.4s, v26.4s\n"
- "str q15, [x25, %[output_col_stride1]]\n"
- "fmax v24.4s, v24.4s, v26.4s\n"
- "fmin v18.4s, v18.4s, v27.4s\n"
- "fmax v14.4s, v14.4s, v26.4s\n"
- "fmin v24.4s, v24.4s, v27.4s\n"
- "fmax v13.4s, v13.4s, v26.4s\n"
- "str q18, [x25, x27]\n"
- "fmin v14.4s, v14.4s, v27.4s\n"
- "str q24, [x25, x28]\n"
- "fmin v13.4s, v13.4s, v27.4s\n"
- "str q14, [x26]\n"
- "fmax v22.4s, v22.4s, v26.4s\n"
- "str q13, [x26, %[output_col_stride1]]\n"
- "fmax v21.4s, v21.4s, v26.4s\n"
- "fmin v22.4s, v22.4s, v27.4s\n"
- "add x25, x25, #16\n"
- "fmin v21.4s, v21.4s, v27.4s\n"
- "str q22, [x26, x27]\n"
- "str q21, [x26, x28]\n"
- "add x26, x26, #16\n"
- "4:\n"
- "cbz x14, 7f\n"
- "ldr s23, [%[wbptr]]\n"
- "mov v12.16b, v23.16b\n"
- "ldr s20, [%[wbptr], #4]\n"
- "mov v8.16b, v23.16b\n"
- "ldr s6, [%[wbptr], #8]\n"
- "mov v11.16b, v23.16b\n"
- "ldr s5, [%[wbptr], #12]\n"
- "mov v16.16b, v23.16b\n"
- "ldr s19, [%[wbptr], #16]\n"
- "mov v7.16b, v23.16b\n"
- "ldr s4, [%[wbptr], #20]\n"
- "mov v10.16b, v23.16b\n"
- "ldr s3, [%[wbptr], #24]\n"
- "mov v14.16b, v23.16b\n"
- "ldr s2, [%[wbptr], #28]\n"
- "mov v15.16b, v23.16b\n"
- "ldr s1, [%[wbptr], #32]\n"
- "mov v17.16b, v23.16b\n"
- "ldr s0, [%[wbptr], #36]\n"
- "mov v9.16b, v23.16b\n"
- "ldr s28, [%[inptr0]]\n"
- "fmla v12.4s, v28.4s, v20.4s\n"
- "ldr s25, [x24]\n"
- "fmla v8.4s, v25.4s, v20.4s\n"
- "ldr s18, [%[inptr0], %[input_col_stride1]]\n"
- "fmla v11.4s, v18.4s, v20.4s\n"
- "ldr s30, [x9]\n"
- "fmla v12.4s, v25.4s, v19.4s\n"
- "ldr s29, [x24, %[input_col_stride1]]\n"
- "fmla v8.4s, v30.4s, v19.4s\n"
- "ldr s24, [%[inptr0], x13]\n"
- "fmla v16.4s, v30.4s, v20.4s\n"
- "ldr s27, [x20]\n"
- "fmla v12.4s, v18.4s, v6.4s\n"
- "ldr s22, [x9, %[input_col_stride1]]\n"
- "fmla v8.4s, v29.4s, v6.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "prfm pldl1keep, [x24, #64]\n"
- "subs x14, x14, #1\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "prfm pldl1keep, [x9, #64]\n"
- "fmla v12.4s, v30.4s, v2.4s\n"
- "prfm pldl1keep, [x24, x16]\n"
- "prfm pldl1keep, [%[inptr0], x10]\n"
- "prfm pldl1keep, [x20, #64]\n"
- "prfm pldl1keep, [x9, x16]\n"
- "fmla v12.4s, v29.4s, v4.4s\n"
- "beq 6f\n"
- "5:\n"
- "mov v13.16b, v23.16b\n"
- "ldr s21, [x24, x13]\n"
- "mov v18.16b, v23.16b\n"
- "prfm pldl1keep, [x24, x10]\n"
- "fmla v11.4s, v29.4s, v19.4s\n"
- "prfm pldl1keep, [%[inptr0], x21]\n"
- "fmla v7.4s, v29.4s, v20.4s\n"
- "ldr s25, [%[inptr0], x19]\n"
- "fmla v12.4s, v24.4s, v5.4s\n"
- "prfm pldl1keep, [x22, #64]\n"
- "fmla v11.4s, v24.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x16]\n"
- "fmla v10.4s, v24.4s, v20.4s\n"
- "ldr s24, [x22]\n"
- "fmla v8.4s, v27.4s, v2.4s\n"
- "prfm pldl1keep, [x9, x10]\n"
- "fmla v16.4s, v27.4s, v19.4s\n"
- "prfm pldl1keep, [x24, x21]\n"
- "fmla v14.4s, v27.4s, v20.4s\n"
- "ldr s26, [x20, %[input_col_stride1]]\n"
- "fmla v12.4s, v22.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], x7]\n"
- "fmla v8.4s, v22.4s, v4.4s\n"
- "prfm pldl1keep, [x23, #64]\n"
- "fmla v11.4s, v22.4s, v2.4s\n"
- "prfm pldl1keep, [x22, x16]\n"
- "fmla v16.4s, v22.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x10]\n"
- "fmla v7.4s, v22.4s, v19.4s\n"
- "prfm pldl1keep, [x9, x21]\n"
- "fmla v15.4s, v22.4s, v20.4s\n"
- "ldr s30, [x9, x13]\n"
- "fmla v12.4s, v21.4s, v3.4s\n"
- "prfm pldl1keep, [x24, x7]\n"
- "fmla v8.4s, v21.4s, v5.4s\n"
- "prfm pldl1keep, [%[inptr0], x12]\n"
- "fmla v11.4s, v21.4s, v4.4s\n"
- "prfm pldl1keep, [x23, x16]\n"
- "fmla v7.4s, v21.4s, v6.4s\n"
- "prfm pldl1keep, [x22, x10]\n"
- "fmla v10.4s, v21.4s, v19.4s\n"
- "prfm pldl1keep, [x20, x21]\n"
- "fmla v17.4s, v21.4s, v20.4s\n"
- "ldr s22, [x24, x19]\n"
- "fmla v11.4s, v25.4s, v5.4s\n"
- "prfm pldl1keep, [x9, x7]\n"
- "fmla v10.4s, v25.4s, v6.4s\n"
- "prfm pldl1keep, [x24, x12]\n"
- "fmla v9.4s, v25.4s, v20.4s\n"
- "ldr s21, [%[inptr0], x17]\n"
- "fmla v16.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x10]\n"
- "fmla v14.4s, v24.4s, v19.4s\n"
- "ldr s24, [x23]\n"
- "fmla v8.4s, v26.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x21]\n"
- "fmla v16.4s, v26.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x7]\n"
- "fmla v7.4s, v26.4s, v2.4s\n"
- "prfm pldl1keep, [x9, x12]\n"
- "fmla v14.4s, v26.4s, v6.4s\n"
- "prfm pldl1keep, [x23, x21]\n"
- "fmla v15.4s, v26.4s, v19.4s\n"
- "prfm pldl1keep, [x22, x7]\n"
- "fmla v13.4s, v26.4s, v20.4s\n"
- "ldr s26, [x22, %[input_col_stride1]]\n"
- "fmla v12.4s, v30.4s, v0.4s\n"
- "prfm pldl1keep, [x20, x12]\n"
- "fmla v8.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x23, x7]\n"
- "fmla v11.4s, v30.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x12]\n"
- "fmla v16.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x23, x12]\n"
- "fmla v7.4s, v30.4s, v4.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v10.4s, v30.4s, v2.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v15.4s, v30.4s, v6.4s\n"
- "subs x14, x14, #1\n"
- "fmla v17.4s, v30.4s, v19.4s\n"
- "fmla v18.4s, v30.4s, v20.4s\n"
- "mov v25.16b, v23.16b\n"
- "fmla v11.4s, v22.4s, v3.4s\n"
- "fmla v7.4s, v22.4s, v5.4s\n"
- "fmla v10.4s, v22.4s, v4.4s\n"
- "fmla v17.4s, v22.4s, v6.4s\n"
- "fmla v9.4s, v22.4s, v19.4s\n"
- "fmla v25.4s, v22.4s, v20.4s\n"
- "ldr s27, [x20, x13]\n"
- "fmla v10.4s, v21.4s, v5.4s\n"
- "fmla v14.4s, v24.4s, v2.4s\n"
- "mov v22.16b, v23.16b\n"
- "fmla v9.4s, v21.4s, v6.4s\n"
- "mov v24.16b, v23.16b\n"
- "mov v21.16b, v23.16b\n"
- "fmla v16.4s, v26.4s, v1.4s\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "fmla v15.4s, v26.4s, v2.4s\n"
- "fmla v13.4s, v26.4s, v19.4s\n"
- "fmla v8.4s, v27.4s, v0.4s\n"
- "ldr s28, [x9, x19]\n"
- "fmla v16.4s, v27.4s, v3.4s\n"
- "fmla v7.4s, v27.4s, v1.4s\n"
- "fmla v14.4s, v27.4s, v5.4s\n"
- "fmla v15.4s, v27.4s, v4.4s\n"
- "fmla v17.4s, v27.4s, v2.4s\n"
- "fmla v13.4s, v27.4s, v6.4s\n"
- "fmla v18.4s, v27.4s, v19.4s\n"
- "fmla v22.4s, v27.4s, v20.4s\n"
- "fmla v11.4s, v28.4s, v0.4s\n"
- "ldr s29, [x24, x17]\n"
- "fmla v7.4s, v28.4s, v3.4s\n"
- "fmla v10.4s, v28.4s, v1.4s\n"
- "fmla v15.4s, v28.4s, v5.4s\n"
- "fmla v17.4s, v28.4s, v4.4s\n"
- "fmla v9.4s, v28.4s, v2.4s\n"
- "fmla v18.4s, v28.4s, v6.4s\n"
- "fmla v25.4s, v28.4s, v19.4s\n"
- "fmla v24.4s, v28.4s, v20.4s\n"
- "fmla v10.4s, v29.4s, v3.4s\n"
- "ldr s23, [%[inptr0], x11]\n"
- "fmla v17.4s, v29.4s, v5.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v9.4s, v29.4s, v4.4s\n"
- "prfm pldl1keep, [%[inptr0], #64]\n"
- "fmla v25.4s, v29.4s, v6.4s\n"
- "ldr s30, [x23, %[input_col_stride1]]\n"
- "fmla v14.4s, v30.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], x16]\n"
- "fmla v9.4s, v23.4s, v5.4s\n"
- "ldr s23, [x22, x13]\n"
- "fmla v13.4s, v30.4s, v2.4s\n"
- "ldr s29, [x20, x19]\n"
- "fmla v16.4s, v23.4s, v0.4s\n"
- "prfm pldl1keep, [%[inptr0], x10]\n"
- "fmla v14.4s, v23.4s, v3.4s\n"
- "fmla v15.4s, v23.4s, v1.4s\n"
- "fmla v13.4s, v23.4s, v4.4s\n"
- "fmla v18.4s, v23.4s, v2.4s\n"
- "fmla v22.4s, v23.4s, v19.4s\n"
- "ldr s23, [x9, x17]\n"
- "fmla v7.4s, v29.4s, v0.4s\n"
- "fmla v15.4s, v29.4s, v3.4s\n"
- "fmla v17.4s, v29.4s, v1.4s\n"
- "fmla v13.4s, v29.4s, v5.4s\n"
- "fmla v18.4s, v29.4s, v4.4s\n"
- "fmla v25.4s, v29.4s, v2.4s\n"
- "fmla v22.4s, v29.4s, v6.4s\n"
- "fmla v24.4s, v29.4s, v19.4s\n"
- "fmla v21.4s, v29.4s, v20.4s\n"
- "ldr s26, [x24, x11]\n"
- "fmla v10.4s, v23.4s, v0.4s\n"
- "ldr s28, [x23, x13]\n"
- "fmla v17.4s, v23.4s, v3.4s\n"
- "add x24, x24, #4\n"
- "fmla v9.4s, v23.4s, v1.4s\n"
- "prfm pldl1keep, [x24, #64]\n"
- "fmla v18.4s, v23.4s, v5.4s\n"
- "prfm pldl1keep, [x24, x16]\n"
- "fmla v25.4s, v23.4s, v4.4s\n"
- "fmla v24.4s, v23.4s, v6.4s\n"
- "fmla v9.4s, v26.4s, v3.4s\n"
- "ldr s20, [x22, x19]\n"
- "fmla v14.4s, v28.4s, v0.4s\n"
- "fmla v13.4s, v28.4s, v1.4s\n"
- "fmla v25.4s, v26.4s, v5.4s\n"
- "ldr s26, [x20, x17]\n"
- "fmla v22.4s, v28.4s, v2.4s\n"
- "ldr s23, [x9, x11]\n"
- "fmla v15.4s, v20.4s, v0.4s\n"
- "add x9, x9, #4\n"
- "fmla v13.4s, v20.4s, v3.4s\n"
- "prfm pldl1keep, [x9, #64]\n"
- "fmla v18.4s, v20.4s, v1.4s\n"
- "prfm pldl1keep, [x9, x16]\n"
- "fmla v22.4s, v20.4s, v4.4s\n"
- "fmla v24.4s, v20.4s, v2.4s\n"
- "fmla v21.4s, v20.4s, v19.4s\n"
- "ldr s27, [x23, x19]\n"
- "fmla v17.4s, v26.4s, v0.4s\n"
- "ldr s20, [x22, x17]\n"
- "fmla v18.4s, v26.4s, v3.4s\n"
- "fmla v25.4s, v26.4s, v1.4s\n"
- "fmla v22.4s, v26.4s, v5.4s\n"
- "fmla v24.4s, v26.4s, v4.4s\n"
- "fmla v21.4s, v26.4s, v6.4s\n"
- "ldr s19, [x20, x11]\n"
- "fmla v9.4s, v23.4s, v0.4s\n"
- "ldr s28, [x23, x17]\n"
- "fmla v25.4s, v23.4s, v3.4s\n"
- "add x20, x20, #4\n"
- "fmla v24.4s, v23.4s, v5.4s\n"
- "ldr s29, [x22, x11]\n"
- "fmla v13.4s, v27.4s, v0.4s\n"
- "prfm pldl1keep, [x20, #64]\n"
- "fmla v22.4s, v27.4s, v1.4s\n"
- "add x22, x22, #4\n"
- "fmla v21.4s, v27.4s, v2.4s\n"
- "ldr s30, [x23, x11]\n"
- "fmla v18.4s, v20.4s, v0.4s\n"
- "ldr s23, [%[wbptr]]\n"
- "fmla v22.4s, v20.4s, v3.4s\n"
- "add x23, x23, #4\n"
- "fmla v24.4s, v20.4s, v1.4s\n"
- "fmla v21.4s, v20.4s, v4.4s\n"
- "fmla v25.4s, v19.4s, v0.4s\n"
- "ldr s20, [%[wbptr], #4]\n"
- "fmla v22.4s, v28.4s, v0.4s\n"
- "ldr s6, [%[wbptr], #8]\n"
- "fmla v21.4s, v19.4s, v5.4s\n"
- "movi v26.16b, #0\n"
- "fmla v24.4s, v19.4s, v3.4s\n"
- "ldr s19, [%[wbptr], #16]\n"
- "fmax v12.4s, v12.4s, v26.4s\n"
- "fmax v11.4s, v11.4s, v26.4s\n"
- "fmla v21.4s, v28.4s, v1.4s\n"
- "ldr s5, [%[wbptr], #12]\n"
- "fmla v24.4s, v29.4s, v0.4s\n"
- "ldr s4, [%[wbptr], #20]\n"
- "fmax v10.4s, v10.4s, v26.4s\n"
- "fmax v9.4s, v9.4s, v26.4s\n"
- "fmla v21.4s, v29.4s, v3.4s\n"
- "ldr s2, [%[wbptr], #28]\n"
- "fmov v27.4s, #6.0\n"
- "fmax v8.4s, v8.4s, v26.4s\n"
- "fmax v7.4s, v7.4s, v26.4s\n"
- "fmax v17.4s, v17.4s, v26.4s\n"
- "fmla v21.4s, v30.4s, v0.4s\n"
- "ldr s3, [%[wbptr], #24]\n"
- "fmin v12.4s, v12.4s, v27.4s\n"
- "ldr s1, [%[wbptr], #32]\n"
- "fmin v11.4s, v11.4s, v27.4s\n"
- "fmin v10.4s, v10.4s, v27.4s\n"
- "str s12, [%[outptr0]]\n"
- "fmin v9.4s, v9.4s, v27.4s\n"
- "str s11, [%[outptr0], %[output_col_stride1]]\n"
- "fmin v8.4s, v8.4s, v27.4s\n"
- "str s10, [%[outptr0], x27]\n"
- "fmin v7.4s, v7.4s, v27.4s\n"
- "str s9, [%[outptr0], x28]\n"
- "fmin v17.4s, v17.4s, v27.4s\n"
- "str s8, [x8]\n"
- "fmax v25.4s, v25.4s, v26.4s\n"
- "str s7, [x8, %[output_col_stride1]]\n"
- "fmax v16.4s, v16.4s, v26.4s\n"
- "str s17, [x8, x27]\n"
- "fmin v25.4s, v25.4s, v27.4s\n"
- "fmin v16.4s, v16.4s, v27.4s\n"
- "ldr s0, [%[wbptr], #36]\n"
- "str s25, [x8, x28]\n"
- "fmax v15.4s, v15.4s, v26.4s\n"
- "str s16, [x25]\n"
- "fmax v18.4s, v18.4s, v26.4s\n"
- "fmin v15.4s, v15.4s, v27.4s\n"
- "ldr s28, [%[inptr0]]\n"
- "fmin v18.4s, v18.4s, v27.4s\n"
- "ldr s25, [x24]\n"
- "str s15, [x25, %[output_col_stride1]]\n"
- "fmax v24.4s, v24.4s, v26.4s\n"
- "str s18, [x25, x27]\n"
- "fmax v14.4s, v14.4s, v26.4s\n"
- "fmin v24.4s, v24.4s, v27.4s\n"
- "ldr s18, [%[inptr0], %[input_col_stride1]]\n"
- "fmin v14.4s, v14.4s, v27.4s\n"
- "ldr s30, [x9]\n"
- "str s24, [x25, x28]\n"
- "fmax v13.4s, v13.4s, v26.4s\n"
- "str s14, [x26]\n"
- "fmax v22.4s, v22.4s, v26.4s\n"
- "fmin v13.4s, v13.4s, v27.4s\n"
- "ldr s29, [x24, %[input_col_stride1]]\n"
- "fmin v22.4s, v22.4s, v27.4s\n"
- "ldr s24, [%[inptr0], x13]\n"
- "str s13, [x26, %[output_col_stride1]]\n"
- "fmax v21.4s, v21.4s, v26.4s\n"
- "str s22, [x26, x27]\n"
- "mov v12.16b, v23.16b\n"
- "fmin v21.4s, v21.4s, v27.4s\n"
- "ldr s27, [x20]\n"
- "mov v8.16b, v23.16b\n"
- "ldr s22, [x9, %[input_col_stride1]]\n"
- "str s21, [x26, x28]\n"
- "mov v11.16b, v23.16b\n"
- "mov v16.16b, v23.16b\n"
- "add %[outptr0], %[outptr0], #4\n"
- "mov v7.16b, v23.16b\n"
- "add x8, x8, #4\n"
- "mov v10.16b, v23.16b\n"
- "add x25, x25, #4\n"
- "mov v14.16b, v23.16b\n"
- "add x26, x26, #4\n"
- "mov v15.16b, v23.16b\n"
- "mov v17.16b, v23.16b\n"
- "mov v9.16b, v23.16b\n"
- "fmla v12.4s, v28.4s, v20.4s\n"
- "fmla v8.4s, v25.4s, v20.4s\n"
- "fmla v11.4s, v18.4s, v20.4s\n"
- "fmla v16.4s, v30.4s, v20.4s\n"
- "fmla v12.4s, v25.4s, v19.4s\n"
- "fmla v8.4s, v30.4s, v19.4s\n"
- "fmla v12.4s, v18.4s, v6.4s\n"
- "fmla v8.4s, v29.4s, v6.4s\n"
- "fmla v12.4s, v30.4s, v2.4s\n"
- "fmla v12.4s, v29.4s, v4.4s\n"
- "bne 5b\n"
- "6:\n"
- "mov v13.16b, v23.16b\n"
- "ldr s21, [x24, x13]\n"
- "mov v18.16b, v23.16b\n"
- "prfm pldl1keep, [x24, x10]\n"
- "fmla v11.4s, v29.4s, v19.4s\n"
- "prfm pldl1keep, [%[inptr0], x21]\n"
- "fmla v7.4s, v29.4s, v20.4s\n"
- "ldr s25, [%[inptr0], x19]\n"
- "fmla v12.4s, v24.4s, v5.4s\n"
- "prfm pldl1keep, [x22, #64]\n"
- "fmla v11.4s, v24.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x16]\n"
- "fmla v10.4s, v24.4s, v20.4s\n"
- "ldr s24, [x22]\n"
- "fmla v8.4s, v27.4s, v2.4s\n"
- "prfm pldl1keep, [x9, x10]\n"
- "fmla v16.4s, v27.4s, v19.4s\n"
- "prfm pldl1keep, [x24, x21]\n"
- "fmla v14.4s, v27.4s, v20.4s\n"
- "ldr s26, [x20, %[input_col_stride1]]\n"
- "fmla v12.4s, v22.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], x7]\n"
- "fmla v8.4s, v22.4s, v4.4s\n"
- "prfm pldl1keep, [x23, #64]\n"
- "fmla v11.4s, v22.4s, v2.4s\n"
- "prfm pldl1keep, [x22, x16]\n"
- "fmla v16.4s, v22.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x10]\n"
- "fmla v7.4s, v22.4s, v19.4s\n"
- "prfm pldl1keep, [x9, x21]\n"
- "fmla v15.4s, v22.4s, v20.4s\n"
- "ldr s30, [x9, x13]\n"
- "fmla v12.4s, v21.4s, v3.4s\n"
- "prfm pldl1keep, [x24, x7]\n"
- "fmla v8.4s, v21.4s, v5.4s\n"
- "prfm pldl1keep, [%[inptr0], x12]\n"
- "fmla v11.4s, v21.4s, v4.4s\n"
- "prfm pldl1keep, [x23, x16]\n"
- "fmla v7.4s, v21.4s, v6.4s\n"
- "prfm pldl1keep, [x22, x10]\n"
- "fmla v10.4s, v21.4s, v19.4s\n"
- "prfm pldl1keep, [x20, x21]\n"
- "fmla v17.4s, v21.4s, v20.4s\n"
- "ldr s22, [x24, x19]\n"
- "fmla v11.4s, v25.4s, v5.4s\n"
- "prfm pldl1keep, [x9, x7]\n"
- "fmla v10.4s, v25.4s, v6.4s\n"
- "prfm pldl1keep, [x24, x12]\n"
- "fmla v9.4s, v25.4s, v20.4s\n"
- "ldr s21, [%[inptr0], x17]\n"
- "fmla v16.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x10]\n"
- "fmla v14.4s, v24.4s, v19.4s\n"
- "ldr s24, [x23]\n"
- "fmla v8.4s, v26.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x21]\n"
- "fmla v16.4s, v26.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x7]\n"
- "fmla v7.4s, v26.4s, v2.4s\n"
- "prfm pldl1keep, [x9, x12]\n"
- "fmla v14.4s, v26.4s, v6.4s\n"
- "prfm pldl1keep, [x23, x21]\n"
- "fmla v15.4s, v26.4s, v19.4s\n"
- "prfm pldl1keep, [x22, x7]\n"
- "fmla v13.4s, v26.4s, v20.4s\n"
- "ldr s26, [x22, %[input_col_stride1]]\n"
- "fmla v12.4s, v30.4s, v0.4s\n"
- "prfm pldl1keep, [x20, x12]\n"
- "fmla v8.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x23, x7]\n"
- "fmla v11.4s, v30.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x12]\n"
- "fmla v16.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x23, x12]\n"
- "fmla v7.4s, v30.4s, v4.4s\n"
- "add %[wbptr], %[wbptr], #40\n"
- "fmla v10.4s, v30.4s, v2.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v15.4s, v30.4s, v6.4s\n"
- "fmla v17.4s, v30.4s, v19.4s\n"
- "fmla v18.4s, v30.4s, v20.4s\n"
- "ldr s27, [x20, x13]\n"
- "fmla v11.4s, v22.4s, v3.4s\n"
- "fmla v7.4s, v22.4s, v5.4s\n"
- "fmla v10.4s, v22.4s, v4.4s\n"
- "fmla v17.4s, v22.4s, v6.4s\n"
- "fmla v9.4s, v22.4s, v19.4s\n"
- "fmla v14.4s, v24.4s, v2.4s\n"
- "mov v25.16b, v23.16b\n"
- "fmla v16.4s, v26.4s, v1.4s\n"
- "fmla v10.4s, v21.4s, v5.4s\n"
- "fmla v15.4s, v26.4s, v2.4s\n"
- "fmla v25.4s, v22.4s, v20.4s\n"
- "ldr s28, [x9, x19]\n"
- "fmla v9.4s, v21.4s, v6.4s\n"
- "ldr s29, [x24, x17]\n"
- "fmla v14.4s, v26.4s, v4.4s\n"
- "fmla v13.4s, v26.4s, v19.4s\n"
- "mov v22.16b, v23.16b\n"
- "fmla v8.4s, v27.4s, v0.4s\n"
- "fmla v16.4s, v27.4s, v3.4s\n"
- "fmla v7.4s, v27.4s, v1.4s\n"
- "fmla v14.4s, v27.4s, v5.4s\n"
- "fmla v15.4s, v27.4s, v4.4s\n"
- "fmla v17.4s, v27.4s, v2.4s\n"
- "fmla v13.4s, v27.4s, v6.4s\n"
- "fmla v18.4s, v27.4s, v19.4s\n"
- "fmla v22.4s, v27.4s, v20.4s\n"
- "mov v24.16b, v23.16b\n"
- "mov v21.16b, v23.16b\n"
- "fmla v11.4s, v28.4s, v0.4s\n"
- "fmla v7.4s, v28.4s, v3.4s\n"
- "fmla v10.4s, v28.4s, v1.4s\n"
- "fmla v15.4s, v28.4s, v5.4s\n"
- "fmla v17.4s, v28.4s, v4.4s\n"
- "fmla v9.4s, v28.4s, v2.4s\n"
- "fmla v18.4s, v28.4s, v6.4s\n"
- "fmla v25.4s, v28.4s, v19.4s\n"
- "fmla v24.4s, v28.4s, v20.4s\n"
- "ldr s23, [%[inptr0], x11]\n"
- "fmla v10.4s, v29.4s, v3.4s\n"
- "add %[inptr0], %[inptr0], #4\n"
- "fmla v17.4s, v29.4s, v5.4s\n"
- "fmla v9.4s, v29.4s, v4.4s\n"
- "fmla v25.4s, v29.4s, v6.4s\n"
- "ldr s30, [x23, %[input_col_stride1]]\n"
- "fmla v14.4s, v30.4s, v1.4s\n"
- "fmla v13.4s, v30.4s, v2.4s\n"
- "fmla v9.4s, v23.4s, v5.4s\n"
- "ldr s23, [x22, x13]\n"
- "fmla v16.4s, v23.4s, v0.4s\n"
- "ldr s29, [x20, x19]\n"
- "fmla v14.4s, v23.4s, v3.4s\n"
- "fmla v15.4s, v23.4s, v1.4s\n"
- "fmla v13.4s, v23.4s, v4.4s\n"
- "fmla v18.4s, v23.4s, v2.4s\n"
- "fmla v22.4s, v23.4s, v19.4s\n"
- "ldr s23, [x9, x17]\n"
- "fmla v7.4s, v29.4s, v0.4s\n"
- "fmla v15.4s, v29.4s, v3.4s\n"
- "fmla v17.4s, v29.4s, v1.4s\n"
- "fmla v13.4s, v29.4s, v5.4s\n"
- "fmla v18.4s, v29.4s, v4.4s\n"
- "fmla v25.4s, v29.4s, v2.4s\n"
- "fmla v22.4s, v29.4s, v6.4s\n"
- "fmla v24.4s, v29.4s, v19.4s\n"
- "fmla v21.4s, v29.4s, v20.4s\n"
- "ldr s26, [x24, x11]\n"
- "fmla v10.4s, v23.4s, v0.4s\n"
- "ldr s28, [x23, x13]\n"
- "fmla v17.4s, v23.4s, v3.4s\n"
- "add x24, x24, #4\n"
- "fmla v9.4s, v23.4s, v1.4s\n"
- "fmla v18.4s, v23.4s, v5.4s\n"
- "fmla v25.4s, v23.4s, v4.4s\n"
- "fmla v24.4s, v23.4s, v6.4s\n"
- "fmla v14.4s, v28.4s, v0.4s\n"
- "ldr s20, [x22, x19]\n"
- "fmla v9.4s, v26.4s, v3.4s\n"
- "fmla v13.4s, v28.4s, v1.4s\n"
- "fmla v25.4s, v26.4s, v5.4s\n"
- "ldr s26, [x20, x17]\n"
- "fmla v22.4s, v28.4s, v2.4s\n"
- "ldr s23, [x9, x11]\n"
- "fmla v15.4s, v20.4s, v0.4s\n"
- "add x9, x9, #4\n"
- "fmla v13.4s, v20.4s, v3.4s\n"
- "fmla v18.4s, v20.4s, v1.4s\n"
- "fmla v22.4s, v20.4s, v4.4s\n"
- "fmla v24.4s, v20.4s, v2.4s\n"
- "fmla v21.4s, v20.4s, v19.4s\n"
- "ldr s27, [x23, x19]\n"
- "fmla v17.4s, v26.4s, v0.4s\n"
- "ldr s20, [x22, x17]\n"
- "fmla v18.4s, v26.4s, v3.4s\n"
- "fmla v25.4s, v26.4s, v1.4s\n"
- "fmla v22.4s, v26.4s, v5.4s\n"
- "fmla v24.4s, v26.4s, v4.4s\n"
- "fmla v21.4s, v26.4s, v6.4s\n"
- "ldr s19, [x20, x11]\n"
- "fmla v9.4s, v23.4s, v0.4s\n"
- "ldr s28, [x23, x17]\n"
- "fmla v25.4s, v23.4s, v3.4s\n"
- "add x20, x20, #4\n"
- "fmla v24.4s, v23.4s, v5.4s\n"
- "ldr s29, [x22, x11]\n"
- "fmla v13.4s, v27.4s, v0.4s\n"
- "add x22, x22, #4\n"
- "fmla v22.4s, v27.4s, v1.4s\n"
- "fmla v21.4s, v27.4s, v2.4s\n"
- "fmla v18.4s, v20.4s, v0.4s\n"
- "ldr s30, [x23, x11]\n"
- "fmla v24.4s, v20.4s, v1.4s\n"
- "add x23, x23, #4\n"
- "fmla v22.4s, v20.4s, v3.4s\n"
- "fmla v21.4s, v20.4s, v4.4s\n"
- "fmla v25.4s, v19.4s, v0.4s\n"
- "movi v26.16b, #0\n"
- "fmla v24.4s, v19.4s, v3.4s\n"
- "fmov v27.4s, #6.0\n"
- "fmla v21.4s, v19.4s, v5.4s\n"
- "fmla v22.4s, v28.4s, v0.4s\n"
- "fmax v12.4s, v12.4s, v26.4s\n"
- "fmax v11.4s, v11.4s, v26.4s\n"
- "fmla v24.4s, v29.4s, v0.4s\n"
- "fmax v10.4s, v10.4s, v26.4s\n"
- "fmla v21.4s, v28.4s, v1.4s\n"
- "fmin v12.4s, v12.4s, v27.4s\n"
- "fmin v11.4s, v11.4s, v27.4s\n"
- "fmin v10.4s, v10.4s, v27.4s\n"
- "str s12, [%[outptr0]]\n"
- "fmax v9.4s, v9.4s, v26.4s\n"
- "str s11, [%[outptr0], %[output_col_stride1]]\n"
- "fmla v21.4s, v29.4s, v3.4s\n"
- "str s10, [%[outptr0], x27]\n"
- "fmin v9.4s, v9.4s, v27.4s\n"
- "fmax v8.4s, v8.4s, v26.4s\n"
- "fmax v7.4s, v7.4s, v26.4s\n"
- "str s9, [%[outptr0], x28]\n"
- "fmla v21.4s, v30.4s, v0.4s\n"
- "fmin v8.4s, v8.4s, v27.4s\n"
- "add %[outptr0], %[outptr0], #4\n"
- "fmin v7.4s, v7.4s, v27.4s\n"
- "fmax v17.4s, v17.4s, v26.4s\n"
- "str s8, [x8]\n"
- "fmax v25.4s, v25.4s, v26.4s\n"
- "str s7, [x8, %[output_col_stride1]]\n"
- "fmin v17.4s, v17.4s, v27.4s\n"
- "fmin v25.4s, v25.4s, v27.4s\n"
- "fmax v16.4s, v16.4s, v26.4s\n"
- "str s17, [x8, x27]\n"
- "fmax v15.4s, v15.4s, v26.4s\n"
- "str s25, [x8, x28]\n"
- "fmin v16.4s, v16.4s, v27.4s\n"
- "fmin v15.4s, v15.4s, v27.4s\n"
- "add x8, x8, #4\n"
- "str s16, [x25]\n"
- "fmax v18.4s, v18.4s, v26.4s\n"
- "str s15, [x25, %[output_col_stride1]]\n"
- "fmax v24.4s, v24.4s, v26.4s\n"
- "fmin v18.4s, v18.4s, v27.4s\n"
- "fmax v14.4s, v14.4s, v26.4s\n"
- "fmin v24.4s, v24.4s, v27.4s\n"
- "fmax v13.4s, v13.4s, v26.4s\n"
- "str s18, [x25, x27]\n"
- "fmin v14.4s, v14.4s, v27.4s\n"
- "str s24, [x25, x28]\n"
- "fmin v13.4s, v13.4s, v27.4s\n"
- "str s14, [x26]\n"
- "fmax v22.4s, v22.4s, v26.4s\n"
- "str s13, [x26, %[output_col_stride1]]\n"
- "fmax v21.4s, v21.4s, v26.4s\n"
- "fmin v22.4s, v22.4s, v27.4s\n"
- "add x25, x25, #4\n"
- "fmin v21.4s, v21.4s, v27.4s\n"
- "str s22, [x26, x27]\n"
- "str s21, [x26, x28]\n"
- "add x26, x26, #4\n"
- "7:\n"
- : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
- : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
- );
-}
-
-#endif // __aarch64__
-
-template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>; // explicit instantiation; args are <OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, TIn, TBias, TOut>
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp
deleted file mode 100644
index 27bfb843f6..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "impl_dilated.hpp"
-
-template class depthwise::DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>; // explicit instantiations; args are <OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, TIn, TBias, TOut>
-template class depthwise::DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<4, 4, 5, 5, 1, 1, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float, float, float>;
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // the float16_t instantiations below are compiled only when this macro is defined
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>;
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>;
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 1, 1, float16_t, float16_t, float16_t>;
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float16_t, float16_t, float16_t>;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
deleted file mode 100644
index 1bae815613..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <deque>
-#include <functional>
-#include <memory>
-
-#include "depthwise.hpp"
-
-namespace depthwise
-{
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename TIn, typename TBias, typename TOut
->
-class DilatedDepthwiseConvolution : public IDepthwiseConvolution // dilated convolution built from non-dilated sub-convolutions (see _convs)
-{
- public:
- /** Create a new dilated depthwise convolution engine.
- *
- * This overload takes the input shape, dilation factor, activation and
- * padding, but no explicit output size.
- * NOTE(review): presumably the output size is derived from the input size
- * and padding (cf. get_output_size) - confirm in impl_dilated.hpp.
- */
- DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- /** Create a new dilated depthwise convolution engine.
- *
- * This overload additionally takes the output size explicitly
- * (n_output_rows, n_output_cols).
- */
- DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- // Cannot copy or move a DilatedDepthwiseConvolution.
- DilatedDepthwiseConvolution(DilatedDepthwiseConvolution&) = delete;
- DilatedDepthwiseConvolution operator=(DilatedDepthwiseConvolution&) = delete;
-
- /* Set input tensor and stride. The overloads take, in turn: no strides;
- * the column stride; the row and column strides; the batch, row and column
- * strides.
- */
- void set_input(const void *inptr) override;
- void set_input(const void *inptr, int column_stride) override;
- void set_input(const void *inptr, int row_stride, int column_stride) override;
- void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
-
- /* Set output tensor and stride. The overloads mirror those of set_input:
- * no strides; column; row and column; batch, row and column.
- */
- void set_output(void *outptr) override;
- void set_output(void *outptr, int column_stride) override;
- void set_output(void *outptr, int row_stride, int column_stride) override;
- void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
-
- static int get_output_size( // static variant: the dilation factor is passed as an argument
- int dim_size,
- unsigned int padding_before,
- unsigned int padding_after,
- int dilation_factor
- );
-
- int output_size( // member override: same leading arguments, no dilation argument
- int dim_size, unsigned int padding_before, unsigned int padding_after
- ) const override;
-
- /* Weights and biases are re-ordered to improve memory access patterns. Use
- * these methods to determine the size of the re-pack buffer and to set the
- * address (and implicitly reorder the weights and biases into) the buffer.
- */
- size_t get_packed_params_size(void) const override;
- void set_packed_params_buffer(void *) override;
-
- void pack_params(const void *weights, const void *biases=nullptr) const override;
- void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const override;
- void pack_params(
- void *buffer,
- const void* weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases=nullptr
- ) const override;
-
- /* Working space is used to pad tensors on the fly. Before running any
- * inference check the amount of space required, allocate and provide a
- * pointer to the convolution engine.
- */
- size_t get_working_space_size(unsigned int nthreads=1) const override;
- void set_working_space(void *) override;
-
- unsigned int get_window(void) const override;
- void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
-
- protected:
- /** Protected constructor which also accepts a function to construct a new
- * subconvolution
- *
- * NOTE(review): subconvfn takes six ints where this constructor takes
- * seven; presumably dilation_factor is not forwarded to the
- * subconvolution - confirm in impl_dilated.hpp.
- */
- DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right,
- std::function<IDepthwiseConvolution *(int, int, int, int, int, int, nck::ActivationFunction, unsigned int, unsigned int, unsigned int, unsigned int)> subconvfn
- );
-
- const int _dilation_factor;
- const int _n_input_rows, _n_input_cols, _n_channels;
- const int _padding_top, _padding_left;
- const int _n_output_rows, _n_output_cols;
-
- /* Dilated depthwise convolution is performed through repeated calls to
- * non-dilated convolutions. If the dilation factor is $n$, then we perform
- * $(n + 1)^2$ depthwise convolutions.
- *
- * NOTE(review): the $(n + 1)^2$ count cannot be checked from this header;
- * confirm it against the code that populates _convs in impl_dilated.hpp.
- *
- * BaseDepthwise names the non-dilated DepthwiseConvolution with the same
- * template arguments as this class.
- */
- using BaseDepthwise = DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- TIn, TBias, TOut
- >;
- std::deque<std::deque<std::unique_ptr<IDepthwiseConvolution>>> _convs; // owned sub-convolutions, held as a nested deque of unique_ptr
-};
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp
deleted file mode 100644
index e56583d6b3..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "depthwise_quantized_dilated.hpp"
-#include "impl_dilated.hpp"
-
-namespace depthwise {
-
-/* Constructor: output size and rescale parameters both derived.
- *
- * The output rows/columns are obtained from get_output_size() using the input
- * extent, the two paddings on that axis and the dilation factor; everything
- * is then forwarded to the overload which takes explicit output dimensions.
- */
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DilatedDepthwiseConvolution<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols
->::QAsymm8DilatedDepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  int dilation_factor,
-  nck::ActivationFunction activation,
-  const qasymm8::QAsymm8Params &weight_quantisation,
-  const qasymm8::QAsymm8Params &input_quantisation,
-  const qasymm8::QAsymm8Params &output_quantisation,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : QAsymm8DilatedDepthwiseConvolution(
-      n_batches, n_input_rows, n_input_cols, n_channels,
-      dilation_factor,
-      // Output rows
-      QAsymm8DilatedDepthwiseConvolution::get_output_size(
-        n_input_rows, padding_top, padding_bottom, dilation_factor
-      ),
-      // Output columns
-      QAsymm8DilatedDepthwiseConvolution::get_output_size(
-        n_input_cols, padding_left, padding_right, dilation_factor
-      ),
-      activation,
-      weight_quantisation, input_quantisation, output_quantisation,
-      padding_top, padding_left, padding_bottom, padding_right
-    )
-{
-}
-
-/* Constructor: explicit output size, rescale parameters derived.
- *
- * Builds the rescale parameters from the three quantisation descriptions via
- * QAsymm8RescaleParams::make_rescale_params() and forwards to the overload
- * which accepts them explicitly.
- */
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DilatedDepthwiseConvolution<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols
->::QAsymm8DilatedDepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  int dilation_factor,
-  int n_output_rows, int n_output_cols,
-  nck::ActivationFunction activation,
-  const qasymm8::QAsymm8Params &weight_quantisation,
-  const qasymm8::QAsymm8Params &input_quantisation,
-  const qasymm8::QAsymm8Params &output_quantisation,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : QAsymm8DilatedDepthwiseConvolution(
-      n_batches, n_input_rows, n_input_cols, n_channels,
-      dilation_factor,
-      n_output_rows, n_output_cols,
-      activation,
-      weight_quantisation, input_quantisation, output_quantisation,
-      // Rescale parameters computed from the quantisation information
-      qasymm8::QAsymm8RescaleParams::make_rescale_params(
-        weight_quantisation, input_quantisation, output_quantisation
-      ),
-      padding_top, padding_left, padding_bottom, padding_right
-    )
-{
-}
-
-/* Constructor: output size derived, rescale parameters supplied.
- *
- * Computes the output rows/columns with get_output_size() (input extent, the
- * two paddings on that axis, dilation factor) and forwards to the overload
- * which takes both the output dimensions and the rescale parameters.
- */
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DilatedDepthwiseConvolution<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols
->::QAsymm8DilatedDepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  int dilation_factor,
-  nck::ActivationFunction activation,
-  const qasymm8::QAsymm8Params &weight_quantisation,
-  const qasymm8::QAsymm8Params &input_quantisation,
-  const qasymm8::QAsymm8Params &output_quantisation,
-  const qasymm8::QAsymm8RescaleParams &rescale_parameters,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : QAsymm8DilatedDepthwiseConvolution(
-      n_batches, n_input_rows, n_input_cols, n_channels,
-      dilation_factor,
-      // Output rows
-      QAsymm8DilatedDepthwiseConvolution::get_output_size(
-        n_input_rows, padding_top, padding_bottom, dilation_factor
-      ),
-      // Output columns
-      QAsymm8DilatedDepthwiseConvolution::get_output_size(
-        n_input_cols, padding_left, padding_right, dilation_factor
-      ),
-      activation,
-      weight_quantisation, input_quantisation, output_quantisation,
-      rescale_parameters,
-      padding_top, padding_left, padding_bottom, padding_right
-    )
-{
-}
-
-/* Constructor: everything supplied explicitly.
- *
- * Initialises the DilatedDepthwiseConvolution base (uint8_t input, int32_t
- * bias, uint8_t output) and hands it a factory which the base uses to
- * construct the non-dilated QAsymm8 sub-convolutions.
- */
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DilatedDepthwiseConvolution<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols
->::QAsymm8DilatedDepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  int dilation_factor,
-  int n_output_rows, int n_output_cols,
-  nck::ActivationFunction activation,
-  const qasymm8::QAsymm8Params &weight_quantisation,
-  const qasymm8::QAsymm8Params &input_quantisation,
-  const qasymm8::QAsymm8Params &output_quantisation,
-  const qasymm8::QAsymm8RescaleParams &rescale_parameters,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : DilatedDepthwiseConvolution<
-      OutputTileRows, OutputTileCols,
-      KernelRows, KernelCols,
-      StrideRows, StrideCols,
-      uint8_t, int32_t, uint8_t
-    >(
-      n_batches, n_input_rows, n_input_cols, n_channels,
-      dilation_factor,
-      n_output_rows, n_output_cols,
-      activation,
-      padding_top, padding_left, padding_bottom, padding_right,
-      // Sub-convolution factory. The quantisation structures are captured by
-      // copy (no `&` in the capture list), so the closure does not refer
-      // back to the constructor arguments.
-      [weight_quantisation, input_quantisation, output_quantisation, rescale_parameters](
-        const int sub_n_batches,
-        const int sub_n_input_rows, const int sub_n_input_cols,
-        const int sub_n_channels,
-        const int sub_n_output_rows, const int sub_n_output_cols,
-        const nck::ActivationFunction sub_activation,
-        const unsigned int sub_padding_top,
-        const unsigned int sub_padding_left,
-        const unsigned int sub_padding_bottom,
-        const unsigned int sub_padding_right
-      ) -> IDepthwiseConvolution *
-      {
-        using SubConvolution = QAsymm8DepthwiseConvolution<
-          OutputTileRows, OutputTileCols,
-          KernelRows, KernelCols,
-          StrideRows, StrideCols
-        >;
-
-        // Raw pointer is required by the factory signature; presumably the
-        // base class takes ownership of it - TODO confirm in impl_dilated.hpp.
-        return new SubConvolution(
-          sub_n_batches, sub_n_input_rows, sub_n_input_cols, sub_n_channels,
-          sub_n_output_rows, sub_n_output_cols,
-          sub_activation,
-          weight_quantisation, input_quantisation, output_quantisation,
-          rescale_parameters,
-          sub_padding_top, sub_padding_left,
-          sub_padding_bottom, sub_padding_right
-        );
-      }
-    )
-{
-}
-
-} // namespace depthwise
-
-/* Explicit instantiations of the dilated QAsymm8 engine. Template arguments
- * are: output tile rows/cols, kernel rows/cols, stride rows/cols.
- */
-namespace depthwise
-{
-template class QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1>;
-template class QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2>;
-template class QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 1, 1>;
-template class QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 2, 2>;
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp
deleted file mode 100644
index 99f0f53792..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp16_fp16.hpp"
-
-/* Explicit instantiations of the half-precision depthwise kernels. They are
- * only compiled when the target advertises FP16 vector arithmetic.
- * Template arguments are: output tile rows/cols, kernel rows/cols,
- * stride rows/cols, then the input, bias and output element types.
- */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-namespace depthwise
-{
-template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>;
-template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>;
-template class DepthwiseConvolution<3, 3, 5, 5, 1, 1, float16_t, float16_t, float16_t>;
-template class DepthwiseConvolution<3, 3, 5, 5, 2, 2, float16_t, float16_t, float16_t>;
-} // namespace depthwise
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp
deleted file mode 100644
index c13dd70a61..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-/* Explicit instantiations of single-precision depthwise kernels.
- * Template arguments are: output tile rows/cols, kernel rows/cols,
- * stride rows/cols, then the input, bias and output element types.
- * NOTE(review): other fp32 configurations appear to be instantiated in their
- * own depthwise_*_fp32_fp32.cpp files - confirm before adding entries here.
- */
-namespace depthwise
-{
-template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float, float>;
-template class DepthwiseConvolution<4, 4, 5, 5, 1, 1, float, float, float>;
-template class DepthwiseConvolution<3, 3, 5, 5, 2, 2, float, float, float>;
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
deleted file mode 100644
index bddae51135..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "impl_base.hpp"
-
-// TODO Move to common utilities somewhere
-//
-// Maps an element size in bytes to an unsigned integer type of that size, so
-// that parameters can be copied as opaque bit patterns. Only sizes 1, 2 and 4
-// are supported; any other size has no `scalar_type` member and fails to
-// compile on use.
-//
-// Kept in an unnamed namespace: these helpers are local to this translation
-// unit, and giving them internal linkage avoids a clash with any other
-// global `DType` defined elsewhere in the program.
-namespace
-{
-template <size_t Size> struct DType { };
-template <> struct DType<1> { using scalar_type = uint8_t; };
-template <> struct DType<2> { using scalar_type = uint16_t; };
-template <> struct DType<4> { using scalar_type = uint32_t; };
-} // namespace
-
-namespace depthwise
-{
-
-/* Interleave biases and weights into a single packed parameter buffer.
- *
- * Channels are processed in groups of `16 / WeightSize`. For each full group
- * the buffer receives one bias per channel of the group, followed by, for
- * every kernel element in row-major order, one weight per channel of the
- * group. Left-over channels are packed one at a time: a single bias followed
- * by the KernelRows x KernelColumns weights of that channel.
- *
- * Values are copied as unsigned integers of the matching width, i.e. as raw
- * bit patterns. When `biases` is null, zero is written in place of each bias.
- *
- * n_channels         Number of channels to pack.
- * buffer             Destination for the packed parameters.
- * weights            Source weights; channels are adjacent in memory.
- * weight_row_stride  Element stride between kernel rows in `weights`.
- * weight_col_stride  Element stride between kernel columns in `weights`.
- * biases             Source biases (one per channel), or nullptr.
- */
-template <unsigned int KernelRows, unsigned int KernelColumns, size_t WeightSize, size_t BiasSize>
-void PackParameters<KernelRows, KernelColumns, WeightSize, BiasSize>::execute(
-  unsigned int n_channels,
-  void *buffer,
-  const void *weights,
-  const unsigned int weight_row_stride,
-  const unsigned int weight_col_stride,
-  const void *biases
-)
-{
-  using TWeight = typename DType<WeightSize>::scalar_type;
-  using TBias = typename DType<BiasSize>::scalar_type;
-
-  uint8_t *out = static_cast<uint8_t *>(buffer);
-  const TWeight *weight_src = static_cast<const TWeight *>(weights);
-  const TBias *bias_src = static_cast<const TBias *>(biases);
-
-  // Append the next bias (or zero if there are no biases) to the output.
-  const auto emit_bias = [&out, &bias_src]()
-  {
-    TBias value = 0x0;
-    if (bias_src != nullptr)
-    {
-      value = *bias_src;
-      ++bias_src;
-    }
-    *reinterpret_cast<TBias *>(out) = value;
-    out += BiasSize;
-  };
-
-  // Append a single weight to the output.
-  const auto emit_weight = [&out](const TWeight value)
-  {
-    *reinterpret_cast<TWeight *>(out) = value;
-    out += WeightSize;
-  };
-
-  // Full groups: number of weight elements which fit in 16 bytes.
-  constexpr unsigned int group_size = 16 / WeightSize;
-  while (n_channels >= group_size)
-  {
-    for (unsigned int c = 0; c < group_size; c++)
-    {
-      emit_bias();
-    }
-
-    for (unsigned int row = 0; row < KernelRows; row++)
-    {
-      for (unsigned int col = 0; col < KernelColumns; col++)
-      {
-        const unsigned int tap = row*weight_row_stride + col*weight_col_stride;
-        for (unsigned int c = 0; c < group_size; c++)
-        {
-          emit_weight(weight_src[tap + c]);
-        }
-      }
-    }
-
-    weight_src += group_size;
-    n_channels -= group_size;
-  }
-
-  // Remaining channels, one at a time.
-  while (n_channels != 0)
-  {
-    emit_bias();
-
-    for (unsigned int row = 0; row < KernelRows; row++)
-    {
-      for (unsigned int col = 0; col < KernelColumns; col++)
-      {
-        emit_weight(weight_src[row*weight_row_stride + col*weight_col_stride]);
-      }
-    }
-
-    ++weight_src;
-    --n_channels;
-  }
-}
-
-// Explicit instantiations. Template arguments are: kernel rows, kernel
-// columns, weight element size in bytes, bias element size in bytes.
-template struct PackParameters<3, 3, 2ul, 2ul>;
-template struct PackParameters<3, 3, 4ul, 4ul>;
-template struct PackParameters<5, 5, 2ul, 2ul>;
-template struct PackParameters<5, 5, 4ul, 4ul>;
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp
deleted file mode 100644
index b09f620475..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_qa8_qa8.hpp"
-
-namespace depthwise
-{
-template class QAsymm8DepthwiseConvolution<2, 2, 3, 3, 1, 1>;
-template class QAsymm8DepthwiseConvolution<2, 2, 3, 3, 2, 2>;
-template class QAsymm8DepthwiseConvolution<2, 2, 5, 5, 1, 1>;
-template class QAsymm8DepthwiseConvolution<2, 2, 5, 5, 2, 2>;
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp
deleted file mode 100644
index 1ae48b9417..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_qa8_qs8_per_channel.hpp"
-
-namespace depthwise {
-template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 1, 1>;
-template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 2, 2>;
-template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 1, 1>;
-template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 2, 2>;
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
deleted file mode 100644
index 4343f6ad45..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include "depthwise.hpp"
-#include "qasymm8.hpp"
-#include "qsymm8.hpp"
-#pragma once
-
-using namespace neon_convolution_kernels;
-using namespace qasymm8;
-
-/* Saturating, rounding, doubling multiply returning the high half of each
- * product (the VQRDMULH operation), for two vectors of four int32 lanes.
- */
-inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32x4_t& b)
-{
-  const int32x4_t high_half = vqrdmulhq_s32(a, b);
-  return high_half;
-}
-
-/* As above, but every lane of `a` is multiplied by the same scalar `b`. */
-inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
-{
-  const int32x4_t high_half = vqrdmulhq_n_s32(a, b);
-  return high_half;
-}
-
-/* Scalar form: `a` is broadcast into a two-lane vector, multiplied by `b`
- * with the same instruction, and lane 0 of the result is returned.
- */
-inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
-{
-  const int32x2_t broadcast = vdup_n_s32(a);
-  const int32x2_t high_half = vqrdmulh_n_s32(broadcast, b);
-  return vget_lane_s32(high_half, 0);
-}
-
-/* Rounding shift of four int32 lanes by per-lane shift counts.
- *
- * `shift` is passed straight to VRSHL, so a right shift (division) is
- * requested with a negative count. Lanes in which both `x` and `shift` are
- * negative are decremented (saturating) before the shift; this appears to
- * make exact halves round away from zero for negative inputs.
- */
-inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int32x4_t shift)
-{
-  const int32x4_t sign_mask = vandq_s32(x, shift);
-  const int32x4_t nudge = vshrq_n_s32(sign_mask, 31);  // -1 or 0 per lane
-  const int32x4_t adjusted = vqaddq_s32(x, nudge);
-  return vrshlq_s32(adjusted, shift);
-}
-
-/* Rounding division of four int32 lanes by 2^exponent. The exponent is
- * negated and broadcast, then handled by the vector-shift overload above.
- */
-inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
-{
-  return rounding_divide_by_exp2(x, vdupq_n_s32(-exponent));
-}
-
-/* Two-lane variant of the rounding division by 2^exponent. */
-inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
-{
-  const int32x2_t shift = vdup_n_s32(-exponent);
-  const int32x2_t sign_mask = vand_s32(x, shift);
-  const int32x2_t nudge = vshr_n_s32(sign_mask, 31);  // -1 or 0 per lane
-  const int32x2_t adjusted = vqadd_s32(x, nudge);
-  return vrshl_s32(adjusted, shift);
-}
-
-/* Scalar variant: broadcast `x`, use the two-lane variant, return lane 0. */
-inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
-{
-  const int32x2_t broadcast = vdup_n_s32(x);
-  const int32x2_t divided = rounding_divide_by_exp2(broadcast, exponent);
-  return vget_lane_s32(divided, 0);
-}
-
-namespace depthwise
-{
-
-namespace nck = neon_convolution_kernels;
-
-/** Depthwise convolution engine for asymmetrically quantised 8-bit data.
- *
- * Derives from DepthwiseConvolutionBase with uint8_t input, int32_t bias and
- * uint8_t output, passing itself as the final (Derived) template argument.
- * Only declarations live here; the member definitions are in a separate
- * implementation header.
- */
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- uint8_t, int32_t, uint8_t,
- QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
->
-{
- using Base = DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- uint8_t, int32_t, uint8_t,
- QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
- >;
- // The base is a friend so that it can reach the protected members below.
- friend Base;
- using InputType = typename Base::InputType;
- using OutputType = typename Base::OutputType;
-
- public:
- /* Construct without explicit output size or rescale parameters. */
- QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- /* Construct with an explicit output size. */
- QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- /* Construct with caller-supplied rescale parameters. */
- QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- const qasymm8::QAsymm8RescaleParams& rescale_parameters,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- /* Construct with an explicit output size and caller-supplied rescale
- * parameters.
- */
- QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- const qasymm8::QAsymm8RescaleParams& rescale_parameters,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- protected:
- /* NOTE(review): presumably the value used for padded input elements -
- * confirm against the implementation header.
- */
- uint8_t _input_padding_value(void) const;
-
- /* Pack weights (with the given row/column strides) and optional biases
- * into `buffer`; `biases` may be omitted.
- */
- void _pack_params(
- void *buffer,
- const void *weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases=nullptr
- ) const;
-
- /* Tile kernel taking base pointers plus row/column strides for the input
- * and output.
- */
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- uint8_t* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- );
-
- /* Tile kernel taking one pointer per input-tile and per output-tile
- * element instead of base pointers and strides.
- */
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
- );
-
- private:
- // Quantization parameters
- const qasymm8::QAsymm8Params _weights_quant, _inputs_quant, _output_quant;
- const qasymm8::QAsymm8RescaleParams rescale_parameters;
-};
-
-/** Depthwise convolution engine whose weights carry per-channel symmetric
- * 8-bit quantisation while input and output use asymmetric 8-bit
- * quantisation (see the constructor parameter types).
- *
- * Derives from DepthwiseConvolutionBase with uint8_t input, int32_t bias and
- * uint8_t output, passing itself as the final (Derived) template argument.
- */
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-class QSymm8HybridPerChannelDepthwiseConvolution : public DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- uint8_t, int32_t, uint8_t,
- QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
->
-{
- using Base = DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- uint8_t, int32_t, uint8_t,
- QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
- >;
- // The base is a friend so that it can reach the protected members below.
- friend Base;
- using InputType = typename Base::InputType;
- using OutputType = typename Base::OutputType;
-
- public:
- /* Construct without caller-supplied rescale parameters. */
- QSymm8HybridPerChannelDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- const qsymm8::QSymm8PerChannelParams& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- /* Construct with caller-supplied per-channel rescale parameters. */
- QSymm8HybridPerChannelDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- const qsymm8::QSymm8PerChannelParams& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- const qsymm8::QSymm8PerChannelRescaleParams& rescale_parameters,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- /* Size in bytes of the packed parameter buffer: per channel, one int8_t
- * for every kernel element plus three int32_t values.
- * NOTE(review): the three int32_t are presumably the bias and the
- * per-channel rescale multiplier and shift - confirm against _pack_params.
- */
- size_t get_packed_params_size(void) const override
- {
- return this->n_channels() * (sizeof(int8_t)*KernelRows*KernelCols + 3*sizeof(int32_t));
-
- }
-
- protected:
- /* NOTE(review): presumably the value used for padded input elements -
- * confirm against the implementation header.
- */
- uint8_t _input_padding_value(void) const;
-
- /* Pack weights (with the given row/column strides) and optional biases
- * into `buffer`; `biases` may be omitted.
- */
- void _pack_params(
- void *buffer,
- const void *weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases=nullptr
- ) const;
-
- /* Tile kernel taking base pointers plus row/column strides for the input
- * and output.
- */
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- uint8_t* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- );
-
- /* Tile kernel taking one pointer per input-tile and per output-tile
- * element instead of base pointers and strides.
- */
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
- );
-
- private:
- // Quantization parameters
- const qsymm8::QSymm8PerChannelParams _weights_quant;
- const qasymm8::QAsymm8Params _input_quant, _output_quant;
- const qsymm8::QSymm8PerChannelRescaleParams _rescale_parameters;
-};
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
deleted file mode 100644
index a11b0981c9..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include "depthwise_dilated.hpp"
-#include "depthwise_quantized.hpp"
-
-namespace depthwise {
-
-/** Dilated depthwise convolution engine for asymmetrically quantised 8-bit
- * data. Derives from DilatedDepthwiseConvolution with uint8_t input, int32_t
- * bias and uint8_t output, and adds constructors which accept the
- * quantisation information.
- */
-template <unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols>
-class QAsymm8DilatedDepthwiseConvolution
- : public DilatedDepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
- StrideCols, uint8_t, int32_t, uint8_t> {
-public:
- /** Create a new dilated depthwise convolution engine.
- *
- * Neither the output size nor the rescale parameters are supplied.
- */
- QAsymm8DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params &weight_quantisation,
- const qasymm8::QAsymm8Params &input_quantisation,
- const qasymm8::QAsymm8Params &output_quantisation,
- unsigned int padding_top, unsigned int padding_left,
- unsigned int padding_bottom, unsigned int padding_right);
-
- /** Create a new dilated depthwise convolution engine.
- *
- * The output size is supplied; the rescale parameters are not.
- */
- QAsymm8DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params &weight_quantisation,
- const qasymm8::QAsymm8Params &input_quantisation,
- const qasymm8::QAsymm8Params &output_quantisation,
- unsigned int padding_top, unsigned int padding_left,
- unsigned int padding_bottom, unsigned int padding_right);
-
- /** Create a new dilated depthwise convolution engine.
- *
- * The rescale parameters are supplied; the output size is not.
- */
- QAsymm8DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params &weight_quantisation,
- const qasymm8::QAsymm8Params &input_quantisation,
- const qasymm8::QAsymm8Params &output_quantisation,
- const qasymm8::QAsymm8RescaleParams &rescale_parameters,
- unsigned int padding_top, unsigned int padding_left,
- unsigned int padding_bottom, unsigned int padding_right);
-
- /** Create a new dilated depthwise convolution engine.
- *
- * Both the output size and the rescale parameters are supplied.
- */
- QAsymm8DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params &weight_quantisation,
- const qasymm8::QAsymm8Params &input_quantisation,
- const qasymm8::QAsymm8Params &output_quantisation,
- const qasymm8::QAsymm8RescaleParams& rescale_parameters,
- unsigned int padding_top, unsigned int padding_left,
- unsigned int padding_bottom, unsigned int padding_right);
-};
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp
deleted file mode 100644
index 266d13d6fc..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp
+++ /dev/null
@@ -1,505 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- * NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-
-#include <algorithm>
-#include <cstdint>
-#include "depthwise.hpp"
-#include "padding.hpp"
-#include "utils.hpp"
-
-#pragma once
-
- // Expands to the template header and the qualified class name needed to
- // define a DepthwiseConvolutionBase member out of line. RET is the member's
- // return type and is left empty when defining constructors.
- #define MEMBERFN(RET)                                                   \
-   template <unsigned int OutputTileRows, unsigned int OutputTileColumns, \
-             unsigned int KernelRows, unsigned int KernelColumns,         \
-             unsigned int StrideRows, unsigned int StrideColumns,         \
-             typename TIn, typename TBias, typename TOut,                 \
-             typename Derived>                                            \
-   RET DepthwiseConvolutionBase<OutputTileRows, OutputTileColumns,        \
-                                KernelRows, KernelColumns,                \
-                                StrideRows, StrideColumns,                \
-                                TIn, TBias, TOut, Derived>
-
-using namespace neon_convolution_kernels;
-
-namespace depthwise
-{
-
- // Packs weights and biases into a single buffer. The struct is keyed on the
- // kernel shape and on the byte sizes of one weight and one bias element;
- // `execute` is only declared here.
- template <
-   unsigned int KernelRows, unsigned int KernelColumns,
-   size_t WeightSize, size_t BiasSize
- >
- struct PackParameters
- {
-   static void execute(
-     unsigned int n_channels,         // Number of channels to pack
-     void *buffer,                    // Destination for the packed parameters
-     const void *weights,             // Source weights
-     unsigned int weight_row_stride,  // Stride between kernel rows of the weights
-     unsigned int weight_col_stride,  // Stride between kernel columns of the weights
-     const void *biases               // Source biases
-   );
- };
-
- // Number of channels in one unit of the work window (see get_window / run).
- constexpr unsigned int CHANNEL_BLOCK = 16;
-
- // Number of outputs along one dimension of size `dim_size` when
- // `padding_before` and `padding_after` padded elements are added to it.
- //
- // NOTE: the computation always uses the row kernel size and row stride
- // (KernelRows, StrideRows), including when it is called for the column
- // dimension, so column results are only correct for square kernels/strides.
- MEMBERFN(int)::get_output_size(
-   const int dim_size, const unsigned int padding_before, const unsigned int padding_after
- )
- {
-   // Do the arithmetic in signed integers: mixing `int` with the unsigned
-   // padding and template parameters would otherwise promote everything to
-   // unsigned and rely on wrap-around when the kernel exceeds the padded input.
-   const int padded_size = dim_size + static_cast<int>(padding_before) + static_cast<int>(padding_after);
-   const int n_positions = padded_size - static_cast<int>(KernelRows) + 1;
-   return iceildiv(n_positions, static_cast<int>(StrideRows));
- }
-
- // Instance-level wrapper around the static get_output_size.
- MEMBERFN(int)::output_size(
-   const int dim_size,
-   const unsigned int padding_before,
-   const unsigned int padding_after
- ) const
- {
-   const int n_outputs = get_output_size(dim_size, padding_before, padding_after);
-   return n_outputs;
- }
-
- // Construct an engine whose output size is derived from the input size and
- // the padding (via get_output_size); delegates to the constructor that takes
- // the output size explicitly.
- MEMBERFN()::DepthwiseConvolutionBase(
-   const int n_batches, const int n_input_rows, const int n_input_cols, const int n_channels,
-   ActivationFunction activation,
-   const unsigned int padding_top, const unsigned int padding_left,
-   const unsigned int padding_bottom, const unsigned int padding_right
- ) : DepthwiseConvolutionBase(
-       n_batches, n_input_rows, n_input_cols, n_channels,
-       /* n_output_rows = */ get_output_size(n_input_rows, padding_top, padding_bottom),
-       /* n_output_cols = */ get_output_size(n_input_cols, padding_left, padding_right),
-       activation,
-       padding_top, padding_left, padding_bottom, padding_right
-     )
- {
- }
-
- // Construct an engine with an explicitly specified output size. All tensor,
- // parameter and working-space pointers start out null and all strides start
- // at zero; they are provided later through the set_* methods.
- MEMBERFN()::DepthwiseConvolutionBase(
-   const int n_batches, const int n_input_rows, const int n_input_cols, const int n_channels,
-   const int n_output_rows, const int n_output_cols,
-   ActivationFunction activation,
-   const unsigned int padding_top, const unsigned int padding_left,
-   const unsigned int padding_bottom, const unsigned int padding_right
- ) : // Buffers are attached after construction
-     _input(nullptr), _output(nullptr),
-     _packed_parameters(nullptr),
-     _working_space(nullptr),
-     // Problem size
-     _n_batches(n_batches),
-     _n_input_rows(n_input_rows), _n_input_cols(n_input_cols),
-     _n_channels(n_channels),
-     _n_output_rows(n_output_rows), _n_output_cols(n_output_cols),
-     // Number of output tiles needed to cover the output in each dimension
-     _n_tile_rows(iceildiv(_n_output_rows, output_tile_rows)),
-     _n_tile_cols(iceildiv(_n_output_cols, output_tile_cols)),
-     // Padding and activation
-     _padding_top(padding_top), _padding_left(padding_left),
-     _padding_bottom(padding_bottom), _padding_right(padding_right),
-     _activation(activation),
-     // Strides are unknown until set_input / set_output are called
-     _input_col_stride(0), _input_row_stride(0), _input_batch_stride(0),
-     _output_col_stride(0), _output_row_stride(0), _output_batch_stride(0)
- {
- }
-
- // Set the input tensor, using the channel count as the column stride.
- MEMBERFN(void)::set_input(const void* const inptr)
- {
-   const int ld_col = _n_channels;
-   set_input(inptr, ld_col);
- }
-
- // Set the input tensor given its column stride; the row stride is taken to be
- // one full row of columns.
- MEMBERFN(void)::set_input(const void* const inptr, const int ld_col)
- {
-   const int ld_row = _n_input_cols * ld_col;
-   set_input(inptr, ld_row, ld_col);
- }
-
- // Set the input tensor given its row and column strides; the batch stride is
- // taken to be one full set of rows.
- MEMBERFN(void)::set_input(const void* const inptr, const int ld_row, const int ld_col)
- {
-   const int ld_batch = _n_input_rows * ld_row;
-   set_input(inptr, ld_batch, ld_row, ld_col);
- }
-
- // Record the input pointer (viewed as TIn) together with its batch, row and
- // column strides.
- MEMBERFN(void)::set_input(
-   const void* const inptr,
-   const int ld_batch,
-   const int ld_row,
-   const int ld_col
- )
- {
-   _input_col_stride = ld_col;
-   _input_row_stride = ld_row;
-   _input_batch_stride = ld_batch;
-   _input = static_cast<const TIn *>(inptr);
- }
-
- // Set the output tensor, using the channel count as the column stride.
- MEMBERFN(void)::set_output(void* const outptr)
- {
-   const int ld_col = _n_channels;
-   set_output(outptr, ld_col);
- }
-
- // Set the output tensor given its column stride; the row stride is taken to
- // be one full row of columns.
- MEMBERFN(void)::set_output(void* const outptr, const int ld_col)
- {
-   const int ld_row = _n_output_cols * ld_col;
-   set_output(outptr, ld_row, ld_col);
- }
-
- // Set the output tensor given its row and column strides; the batch stride is
- // taken to be one full set of rows.
- MEMBERFN(void)::set_output(void* const outptr, const int ld_row, const int ld_col)
- {
-   const int ld_batch = _n_output_rows * ld_row;
-   set_output(outptr, ld_batch, ld_row, ld_col);
- }
-
- // Record the output pointer (viewed as TOut) together with its batch, row and
- // column strides.
- MEMBERFN(void)::set_output(
-   void* const outptr,
-   const int ld_batch,
-   const int ld_row,
-   const int ld_col
- )
- {
-   _output_col_stride = ld_col;
-   _output_row_stride = ld_row;
-   _output_batch_stride = ld_batch;
-   _output = static_cast<TOut *>(outptr);
- }
-
- // Size in bytes of the packed parameter buffer: for every channel, one kernel
- // of TIn weights plus one TBias bias.
- MEMBERFN(size_t)::get_packed_params_size(void) const
- {
-   const size_t bytes_per_channel = sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias);
-   return _n_channels * bytes_per_channel;
- }
-
- // Remember the buffer that holds (or will hold) the packed parameters.
- MEMBERFN(void)::set_packed_params_buffer(void *buffer)
- {
-   this->_packed_parameters = buffer;
- }
-
- // Pack the weights and biases into the buffer previously registered with
- // set_packed_params_buffer, dispatching through the derived class.
- MEMBERFN(void)::pack_params(const void *weights, const void *biases) const
- {
-   const Derived * const self = static_cast<const Derived *>(this);
-   self->pack_params(_packed_parameters, weights, biases);
- }
-
- // Pack the weights and biases into `buffer`, using default weight strides:
- // the column stride is the channel count and the row stride is one full row
- // of kernel columns.
- MEMBERFN(void)::pack_params(void *buffer, const void *weights, const void *biases) const
- {
-   const unsigned int col_stride = _n_channels;
-   const unsigned int row_stride = KernelColumns * col_stride;
-
-   const Derived * const self = static_cast<const Derived *>(this);
-   self->pack_params(buffer, weights, row_stride, col_stride, biases);
- }
-
- // Pack the weights and biases into `buffer` with explicit weight strides by
- // forwarding to the derived class's _pack_params.
- MEMBERFN(void)::pack_params(
-   void * const buffer,
-   const void * const weights,
-   const unsigned int weight_row_stride,
-   const unsigned int weight_col_stride,
-   const void * const biases
- ) const
- {
-   const Derived * const self = static_cast<const Derived *>(this);
-   self->_pack_params(buffer, weights, weight_row_stride, weight_col_stride, biases);
- }
-
- // Default parameter packing: delegates to the PackParameters helper selected
- // by the kernel shape and by the byte sizes of one weight and one bias.
- MEMBERFN(void)::_pack_params(
-   void * const buffer,
-   const void * const weights,
-   const unsigned int weight_row_stride,
-   const unsigned int weight_col_stride,
-   const void * const biases
- ) const
- {
-   // The bias element size must be that of TBias (not TOut) so that the layout
-   // agrees with the size reported by get_packed_params_size.
-   PackParameters<KernelRows, KernelColumns, sizeof(TIn), sizeof(TBias)>::execute(
-     _n_channels, buffer, weights, weight_row_stride, weight_col_stride, biases
-   );
- }
-
- // Total working space in bytes for `nthreads` threads; each thread gets one
- // input region followed by one output region.
- MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const
- {
-   const size_t bytes_per_thread =
-     _get_input_working_space_size() + _get_output_working_space_size();
-   return nthreads * bytes_per_thread;
- }
-
- // Remember the working space buffer shared by all threads.
- MEMBERFN(void)::set_working_space(void *buffer)
- {
-   this->_working_space = buffer;
- }
-
- // Bytes of input working space per thread: one TIn element per channel.
- MEMBERFN(size_t)::_get_input_working_space_size(void) const
- {
-   const size_t element_size = sizeof(TIn);
-   return element_size * _n_channels;
- }
-
- // Bytes of output working space per thread: one TOut element per channel.
- MEMBERFN(size_t)::_get_output_working_space_size(void) const
- {
-   const size_t element_size = sizeof(TOut);
-   return element_size * _n_channels;
- }
-
- // Start of the input working space belonging to thread `threadid`. Each
- // thread's slice of the shared buffer holds its input region followed by its
- // output region.
- MEMBERFN(void *)::_get_input_working_space(const unsigned int threadid) const
- {
-   const size_t bytes_per_thread =
-     _get_input_working_space_size() + _get_output_working_space_size();
-   uint8_t * const base = static_cast<uint8_t*>(_working_space);
-   return base + threadid * bytes_per_thread;
- }
-
- // Start of the output working space belonging to thread `threadid`; it
- // immediately follows that thread's input working space.
- MEMBERFN(void *)::_get_output_working_space(const unsigned int threadid) const
- {
-   uint8_t * const input_space = static_cast<uint8_t*>(_get_input_working_space(threadid));
-   return input_space + _get_input_working_space_size();
- }
-
- // Size of the work window: the number of CHANNEL_BLOCK-sized blocks needed to
- // cover all channels (work is parallelised over blocks of channels).
- MEMBERFN(unsigned int)::get_window() const
- {
-   const unsigned int n_channel_blocks = iceildiv(_n_channels, CHANNEL_BLOCK);
-   return n_channel_blocks;
- }
-
- // Execute the convolution for the channel blocks [start, stop) of the work
- // window on behalf of thread `threadid`.
- //
- // Every batch is processed as rows of output tiles; each row of tiles is
- // handed to process_tile_row together with the input/output padding that
- // applies to it.
- MEMBERFN(void)::run(
-   const unsigned int start,
-   const unsigned int stop,
-   const unsigned int threadid
- )
- {
-   // Fill this thread's input padding buffer with the padding value; padded
-   // input elements are pointed at this buffer by process_tile.
-   TIn *buf = static_cast<TIn *>(_get_input_working_space(threadid));
-   const TIn pad_value = static_cast<Derived *>(this)->_input_padding_value();
-   std::fill_n(buf, _n_channels, pad_value);
-
-   // Parallelise over blocks of channels
-   const auto start_channel = CHANNEL_BLOCK * start;
-   const auto stop_channel = std::min<unsigned int>(_n_channels, CHANNEL_BLOCK * stop);
-   if (stop_channel <= start_channel)
-   {
-     // Empty channel range: bail out before `stop_channel - start_channel`
-     // can underflow and before dividing by a zero channel count below.
-     return;
-   }
-   const auto params_size_per_channel = this->get_packed_params_size()/_n_channels;
-
-   // Offset into the packed parameters for the first channel handled here; it
-   // does not depend on the batch or tile row, so compute it once.
-   const auto params_ptr = static_cast<const uint8_t*>(_packed_parameters) +
-                           start_channel*params_size_per_channel;
-
-   // Compute top and bottom padding for input and output
-   const int input_pad_top = _padding_top;
-   const int input_pad_left = _padding_left;
-   constexpr int tile_overlap = kernel_rows - stride_rows;
-
-   // Perform the convolution by calling `process_tile_row` for each tile row in
-   // each batch.
-   for (int batch = 0; batch < _n_batches; batch++)
-   {
-     const TIn* const inptr_batch = _input + batch*_input_batch_stride;
-     TOut* const outptr_batch = _output + batch*_output_batch_stride;
-
-     // Loop over rows of tiles
-     for (int tile_i = 0; tile_i < _n_tile_rows; tile_i++)
-     {
-       // Pointer to the row. Only the first tile row overlaps the top padding,
-       // so every later row starts `input_pad_top` real rows earlier.
-       const int input_row_offset = (tile_i == 0) ? 0 : input_pad_top;
-       const TIn* const inptr_row = (inptr_batch + ((inner_tile_rows - tile_overlap)*tile_i - input_row_offset)*_input_row_stride);
-       TOut* const outptr_row = outptr_batch + output_tile_rows * tile_i * _output_row_stride;
-
-       // Input padding (top + bottom) for the row
-       const int input_row_top = tile_i*(inner_tile_rows - tile_overlap) - input_pad_top;
-       const int input_row_bottom = input_row_top + inner_tile_rows;
-       const int input_row_pad_top = (tile_i == 0) ? input_pad_top : 0;
-       const int input_row_pad_bottom = std::max(0, input_row_bottom - _n_input_rows);
-
-       // Output padding (bottom) for the row
-       const int output_row_bottom = (tile_i + 1)*output_tile_rows;
-       const int output_row_pad_bottom = std::max(0, output_row_bottom - _n_output_rows);
-
-       // Process the row
-       process_tile_row(
-         threadid,
-         stop_channel - start_channel,
-         params_ptr,
-         inptr_row + start_channel,
-         outptr_row + start_channel,
-         input_row_pad_top, input_pad_left, input_row_pad_bottom,
-         output_row_pad_bottom,
-         _n_tile_cols, _n_input_cols, _n_output_cols
-       );
-     }
-   }
- }
-
- // Process one row of output tiles: for each tile work out its left/right
- // input padding, its right output padding and its input/output pointers, then
- // hand it to process_tile.
- MEMBERFN(void)::process_tile_row(
-   const unsigned int threadid,
-   const int n_channels,
-   const void* const packed_params,
-   const TIn* const inptr,
-   TOut* const outptr,
-   const int row_pad_in_top,
-   const int row_pad_in_left,
-   const int row_pad_in_bottom,
-   const int row_pad_out_bottom,
-   const int n_tiles,
-   const int n_input_cols,
-   const int n_output_cols
- )
- {
-   constexpr int tile_overlap = kernel_cols - stride_cols;
-
-   // Walk the tiles from left to right
-   for (int tile_j = 0; tile_j < n_tiles; tile_j++)
-   {
-     const bool is_first_tile = (tile_j == 0);
-
-     // Columns of the (padded) input covered by this tile
-     const int in_col_begin = tile_j*(inner_tile_cols - tile_overlap) - row_pad_in_left;
-     const int in_col_end = in_col_begin + inner_tile_cols;
-
-     // Only the first tile touches the left padding; any tile may run past
-     // the right-hand edge of the input.
-     const int tile_pad_in_left = is_first_tile ? row_pad_in_left : 0;
-     const int tile_pad_in_right = std::max(0, in_col_end - n_input_cols);
-
-     // Columns of output produced by this tile which fall beyond the output
-     const int out_col_end = (tile_j + 1) * output_tile_cols;
-     const int tile_pad_out_right = std::max(0, out_col_end - n_output_cols);
-
-     // First real input element and first output element of the tile
-     const int in_col_adjust = is_first_tile ? 0 : row_pad_in_left;
-     const TIn* const tile_inptr = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - in_col_adjust)*_input_col_stride);
-     TOut* const tile_outptr = outptr + tile_j * output_tile_cols * _output_col_stride;
-
-     process_tile(
-       threadid, n_channels, packed_params, tile_inptr, tile_outptr,
-       row_pad_in_top, tile_pad_in_left, row_pad_in_bottom, tile_pad_in_right,  // Input paddings
-       row_pad_out_bottom, tile_pad_out_right  // Output paddings
-     );
-   }
- }
-
- // Default value used to fill padded input elements: zero, converted to TIn.
- MEMBERFN(TIn)::_input_padding_value(void) const
- {
-   const TIn zero = static_cast<TIn>(0);
-   return zero;
- }
-
- // Process a single tile by dispatching to the derived class's execute_tile
- // for the configured activation function.
- //
- // Tiles with any input or output padding are executed through arrays of
- // per-element pointers in which padded elements are redirected to this
- // thread's working-space buffers; all other tiles are executed directly on
- // the strided tensors.
- MEMBERFN(void)::process_tile(
-   const unsigned int threadid,
-   const int n_channels,
-   const void* const packed_params,
-   const TIn* const inptr,
-   TOut* const outptr,
-   const int pad_in_top,
-   const int pad_in_left,
-   const int pad_in_bottom,
-   const int pad_in_right,
-   const int pad_out_bottom,
-   const int pad_out_right
- )
- {
-   Derived * const self = static_cast<Derived *>(this);
-   const bool has_input_padding = pad_in_top || pad_in_left || pad_in_bottom || pad_in_right;
-   const bool has_output_padding = pad_out_bottom || pad_out_right;
-
-   if (has_input_padding || has_output_padding)
-   {
-     // Build the array of input pointers; elements inside the padding read
-     // from the input padding buffer.
-     const TIn * const padding_src = static_cast<const TIn *>(_get_input_working_space(threadid));
-     const TIn *inptrs[inner_tile_rows][inner_tile_cols];
-     for (int i = 0; i < inner_tile_rows; i++)
-     {
-       const bool row_in_padding = i < pad_in_top || (inner_tile_rows - pad_in_bottom) <= i;
-       for (int j = 0; j < inner_tile_cols; j++)
-       {
-         const bool col_in_padding = j < pad_in_left || (inner_tile_cols - pad_in_right) <= j;
-         if (row_in_padding || col_in_padding)
-         {
-           inptrs[i][j] = padding_src;
-         }
-         else
-         {
-           inptrs[i][j] = inptr + (i - pad_in_top)*_input_row_stride + (j - pad_in_left)*_input_col_stride;
-         }
-       }
-     }
-
-     // Build the array of output pointers; elements beyond the bottom/right of
-     // the output are written to the output working space instead.
-     TOut *outptrs[output_tile_rows][output_tile_cols];
-     for (int i = 0; i < output_tile_rows; i++)
-     {
-       const bool row_in_output = i < (output_tile_rows - pad_out_bottom);
-       for (int j = 0; j < output_tile_cols; j++)
-       {
-         const bool col_in_output = j < (output_tile_cols - pad_out_right);
-         if (row_in_output && col_in_output)
-         {
-           outptrs[i][j] = outptr + i*_output_row_stride + j*_output_col_stride;
-         }
-         else
-         {
-           outptrs[i][j] = static_cast<TOut *>(_get_output_working_space(threadid));
-         }
-       }
-     }
-
-     switch(_activation)
-     {
-       case ActivationFunction::ReLU:
-         self->template execute_tile<ActivationFunction::ReLU>(
-           n_channels, packed_params, inptrs, outptrs
-         );
-         break;
-       case ActivationFunction::ReLU6:
-         self->template execute_tile<ActivationFunction::ReLU6>(
-           n_channels, packed_params, inptrs, outptrs
-         );
-         break;
-       default:
-         self->template execute_tile<ActivationFunction::None>(
-           n_channels, packed_params, inptrs, outptrs
-         );
-         break;
-     }
-     return;
-   }
-
-   // No padding at all: run directly on the strided input and output.
-   switch(_activation)
-   {
-     case ActivationFunction::ReLU:
-       self->template execute_tile<ActivationFunction::ReLU>(
-         n_channels, packed_params,
-         inptr, _input_row_stride, _input_col_stride,
-         outptr, _output_row_stride, _output_col_stride
-       );
-       break;
-     case ActivationFunction::ReLU6:
-       self->template execute_tile<ActivationFunction::ReLU6>(
-         n_channels, packed_params,
-         inptr, _input_row_stride, _input_col_stride,
-         outptr, _output_row_stride, _output_col_stride
-       );
-       break;
-     default:
-       self->template execute_tile<ActivationFunction::None>(
-         n_channels, packed_params,
-         inptr, _input_row_stride, _input_col_stride,
-         outptr, _output_row_stride, _output_col_stride
-       );
-       break;
-   }
- }
-
- // Number of channels this engine was constructed for.
- MEMBERFN(int)::n_channels(void) const
- {
-   return this->_n_channels;
- }
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp
deleted file mode 100644
index 4130188187..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp
+++ /dev/null
@@ -1,295 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "depthwise_dilated.hpp"
-#include "utils.hpp"
-
- // Expands to the template header and the qualified class name needed to
- // define a DilatedDepthwiseConvolution member out of line. RET is the
- // member's return type and is left empty when defining constructors.
- #define MEMBERFN(RET) template <\
-   unsigned int OutputTileRows, unsigned int OutputTileColumns,\
-   unsigned int KernelRows, unsigned int KernelColumns,\
-   unsigned int StrideRows, unsigned int StrideColumns,\
-   typename TIn, typename TBias, typename TOut\
- > RET DilatedDepthwiseConvolution<\
-   OutputTileRows, OutputTileColumns,\
-   KernelRows, KernelColumns,\
-   StrideRows, StrideColumns,\
-   TIn, TBias, TOut\
- >
-
-namespace depthwise {
-
- // Construct a dilated engine whose output size is derived from the input
- // size, padding and dilation factor (via get_output_size); delegates to the
- // constructor that takes the output size explicitly.
- MEMBERFN()::DilatedDepthwiseConvolution(
-     const int n_batches, const int n_input_rows, const int n_input_cols,
-     const int n_channels, const int dilation_factor,
-     nck::ActivationFunction activation,
-     const unsigned int padding_top, const unsigned int padding_left,
-     const unsigned int padding_bottom, const unsigned int padding_right)
-     : DilatedDepthwiseConvolution(
-           n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
-           /* n_output_rows = */
-           DilatedDepthwiseConvolution::get_output_size(
-               n_input_rows, padding_top, padding_bottom, dilation_factor),
-           /* n_output_cols = */
-           DilatedDepthwiseConvolution::get_output_size(
-               n_input_cols, padding_left, padding_right, dilation_factor),
-           activation,
-           padding_top, padding_left, padding_bottom, padding_right) {}
-
- // Construct a dilated engine with an explicit output size. The bottom and
- // right padding arguments are ignored (zeros are forwarded in their place),
- // and the base engines are created as standard DepthwiseConvolution objects
- // with the same tile, kernel, stride and type parameters.
- MEMBERFN()::DilatedDepthwiseConvolution(
-     const int n_batches, const int n_input_rows, const int n_input_cols,
-     const int n_channels, const int dilation_factor,
-     const int n_output_rows, const int n_output_cols,
-     nck::ActivationFunction activation,
-     const unsigned int padding_top, const unsigned int padding_left,
-     const unsigned int,  // padding_bottom
-     const unsigned int   // padding_right
-     )
-     : DilatedDepthwiseConvolution(
-           n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
-           n_output_rows, n_output_cols, activation, padding_top, padding_left,
-           0, 0,
-           // Factory for one (non-dilated) base convolution
-           [](const int sub_batches, const int sub_input_rows,
-              const int sub_input_cols, const int sub_channels,
-              const int sub_output_rows, const int sub_output_cols,
-              const nck::ActivationFunction sub_activation,
-              const unsigned int sub_pad_top, const unsigned int sub_pad_left,
-              const unsigned int sub_pad_bottom,
-              const unsigned int sub_pad_right) -> IDepthwiseConvolution * {
-             using BaseConvolution = DepthwiseConvolution<
-                 OutputTileRows, OutputTileColumns, KernelRows, KernelColumns,
-                 StrideRows, StrideColumns, TIn, TBias, TOut>;
-             return new BaseConvolution(
-                 sub_batches, sub_input_rows, sub_input_cols, sub_channels,
-                 sub_output_rows, sub_output_cols, sub_activation,
-                 sub_pad_top, sub_pad_left, sub_pad_bottom, sub_pad_right);
-           }) {}
-
- // Construct a dilated engine from a factory (`subconvfn`) that creates the
- // base convolutions.
- //
- // One base convolution is created for every (row, column) offset in
- // [0, dilation_factor) x [0, dilation_factor); each is given the size of its
- // sub-sampled input and output and the amount of top/left padding which
- // applies to it. The bottom and right padding arguments are ignored.
- MEMBERFN()::DilatedDepthwiseConvolution(
-     const int n_batches, const int n_input_rows, const int n_input_cols,
-     const int n_channels, const int dilation_factor, const int n_output_rows,
-     const int n_output_cols, nck::ActivationFunction activation,
-     const unsigned int padding_top, const unsigned int padding_left,
-     const unsigned int,  // padding_bottom
-     const unsigned int,  // padding_right
-     std::function<IDepthwiseConvolution *(
-         int, int, int, int, int, int, nck::ActivationFunction, unsigned int,
-         unsigned int, unsigned int, unsigned int)>
-         subconvfn  // Function to create a new convolution
-     )
-     : _dilation_factor(dilation_factor), _n_input_rows(n_input_rows),
-       _n_input_cols(n_input_cols), _n_channels(n_channels),
-       _padding_top(static_cast<int>(padding_top)),
-       _padding_left(static_cast<int>(padding_left)),
-       _n_output_rows(n_output_rows), _n_output_cols(n_output_cols),
-       // Sized from the parameter rather than the member so that this does not
-       // depend on the order in which the members are declared.
-       _convs(dilation_factor) {
-   // Signed copies of the unsigned template parameters, so that offsets which
-   // fall inside the padding are computed as negative values directly.
-   const int stride_rows = static_cast<int>(StrideRows);
-   const int stride_cols = static_cast<int>(StrideColumns);
-
-   // Instantiate the base convolutions
-   for (int i = 0; i < dilation_factor; i++) {
-     // Compute properties of this row of base convolutions
-     const int row_top = i * stride_rows - _padding_top;  // -ve values are in the padding
-     const int row_pad_top =
-         row_top < 0 ? iceildiv(-row_top, dilation_factor) : 0;
-
-     // Locals deliberately do not reuse the `_member` names.
-     const int sub_input_rows = iceildiv(n_input_rows - i, dilation_factor);
-     const int sub_output_rows = iceildiv(n_output_rows - i, dilation_factor);
-
-     for (int j = 0; j < dilation_factor; j++) {
-       // Compute properties of the base convolution
-       const int col_left = j * stride_cols - _padding_left;  // -ve values are in the padding
-       const int col_pad_left =
-           col_left < 0 ? iceildiv(-col_left, dilation_factor) : 0;
-
-       const int sub_input_cols = iceildiv(n_input_cols - j, dilation_factor);
-       const int sub_output_cols = iceildiv(n_output_cols - j, dilation_factor);
-
-       // Create new depthwise convolution engine and include it in the vector
-       // of engines. The new depthwise convolution engine is created by calling
-       // the delegate function we received as an argument.
-       _convs[i].emplace_back(subconvfn(
-           n_batches, sub_input_rows, sub_input_cols, n_channels,
-           sub_output_rows, sub_output_cols, activation,
-           // Note: since we have computed the output tensor size we don't need
-           // to explicitly provide bottom and right padding values to the
-           // depthwise convolution.
-           row_pad_top, col_pad_left, 0, 0));
-     }
-   }
- }
-
- // Set the input tensor, using the channel count as the column stride.
- MEMBERFN(void)::set_input(const void *const inptr) {
-   const int ldcol = _n_channels;
-   set_input(inptr, ldcol);
- }
-
- // Set the input tensor given its column stride; the row stride is taken to be
- // one full row of columns.
- MEMBERFN(void)::set_input(const void *const inptr, const int ldcol) {
-   const int ldrow = _n_input_cols * ldcol;
-   set_input(inptr, ldrow, ldcol);
- }
-
- // Set the input tensor given its row and column strides; the batch stride is
- // taken to be one full set of rows.
- MEMBERFN(void)::set_input(const void *const inptr, const int ldrow,
-                           const int ldcol) {
-   const int ldbatch = _n_input_rows * ldrow;
-   set_input(inptr, ldbatch, ldrow, ldcol);
- }
-
- // Set the input tensor for every base convolution. Each base convolution sees
- // the input sub-sampled by the dilation factor: its row and column strides
- // are scaled by that factor and its pointer is moved to the first element at
- // or after its (row, column) offset that lies outside the top/left padding.
- MEMBERFN(void)::set_input(const void *const inptr, const int ldbatch,
-                           const int ldrow, const int ldcol) {
-   const TIn *const input = static_cast<const TIn *>(inptr);
-
-   // Strides between consecutive elements of a sub-sampled tensor
-   const int sub_ldrow = ldrow * _dilation_factor;
-   const int sub_ldcol = ldcol * _dilation_factor;
-
-   for (uint32_t i = 0; i < static_cast<uint32_t>(_dilation_factor); i++) {
-     // First input row for this row of engines; when it lies in the top
-     // padding, step forward by whole multiples of the dilation factor until
-     // it no longer does.
-     const int first_row = static_cast<int>(i * StrideRows) - _padding_top;
-     int top_pos = first_row;
-     if (first_row < 0) {
-       top_pos += iceildiv(-first_row, _dilation_factor) * _dilation_factor;
-     }
-     const TIn *const input_row = input + top_pos * ldrow;
-
-     for (uint32_t j = 0; j < static_cast<uint32_t>(_dilation_factor); j++) {
-       // Same adjustment for the column, done one step at a time.
-       int left_pos = j * StrideColumns - _padding_left;
-       for (; left_pos < 0; left_pos += _dilation_factor) {
-       }
-
-       const void *const input_ij = input_row + left_pos * ldcol;
-       _convs[i][j]->set_input(input_ij, ldbatch, sub_ldrow, sub_ldcol);
-     }
-   }
- }
-
- // Set the output tensor, using the channel count as the column stride.
- MEMBERFN(void)::set_output(void *const outptr) {
-   const int ldcol = _n_channels;
-   set_output(outptr, ldcol);
- }
-
- // Set the output tensor given its column stride; the row stride is taken to
- // be one full row of columns.
- MEMBERFN(void)::set_output(void *const outptr, const int ldcol) {
-   const int ldrow = _n_output_cols * ldcol;
-   set_output(outptr, ldrow, ldcol);
- }
-
- // Set the output tensor given its row and column strides; the batch stride is
- // taken to be one full set of rows.
- MEMBERFN(void)::set_output(void *const outptr, const int ldrow,
-                            const int ldcol) {
-   const int ldbatch = _n_output_rows * ldrow;
-   set_output(outptr, ldbatch, ldrow, ldcol);
- }
-
- // Set the output tensor for every base convolution. Base convolution (i, j)
- // writes the output elements starting at row i, column j, with row and column
- // strides scaled by the dilation factor.
- MEMBERFN(void)::set_output(void *const outptr, const int ldbatch,
-                            const int ldrow, const int ldcol) {
-   TOut *const output = static_cast<TOut *>(outptr);
-
-   // Strides between consecutive elements of a sub-sampled tensor
-   const int sub_ldrow = ldrow * _dilation_factor;
-   const int sub_ldcol = ldcol * _dilation_factor;
-
-   // Pass output parameters on to base convolutions
-   const uint32_t n_offsets = static_cast<uint32_t>(_dilation_factor);
-   for (uint32_t i = 0; i < n_offsets; i++) {
-     for (uint32_t j = 0; j < n_offsets; j++) {
-       // Point at the first output element owned by this engine, then set the
-       // output for it.
-       void *const output_ij = output + i * ldrow + j * ldcol;
-       _convs[i][j]->set_output(output_ij, ldbatch, sub_ldrow, sub_ldcol);
-     }
-   }
- }
-
- // Number of outputs along one dimension of size `dim_size` with the given
- // padding, for a kernel dilated by `dilation_factor`. Uses the row kernel
- // size and row stride (KernelRows, StrideRows).
- MEMBERFN(int)::get_output_size(const int dim_size,
-                                const unsigned int padding_before,
-                                const unsigned int padding_after,
-                                const int dilation_factor) {
-   // Extent of the dilated kernel
-   const int dilated_kernel_size = (KernelRows - 1) * dilation_factor + 1;
-   // Extent of the input including padding
-   const int padded_size =
-       dim_size + static_cast<int>(padding_before + padding_after);
-   return iceildiv(padded_size - dilated_kernel_size + 1, StrideRows);
- }
-
- // Instance-level wrapper around get_output_size using this engine's dilation
- // factor.
- MEMBERFN(int)::output_size(const int dim_size,
-                            const unsigned int padding_before,
-                            const unsigned int padding_after) const {
-   const int n_outputs = get_output_size(dim_size, padding_before,
-                                         padding_after, _dilation_factor);
-   return n_outputs;
- }
-
- // Packed parameter size, as reported by the first base convolution.
- MEMBERFN(size_t)::get_packed_params_size(void) const {
-   const auto &first_conv = _convs[0][0];
-   return first_conv->get_packed_params_size();
- }
-
- // Give every base convolution the same packed parameters buffer.
- MEMBERFN(void)::set_packed_params_buffer(void *buffer) {
-   for (auto &conv_row : _convs) {
-     for (auto &engine : conv_row) {
-       engine->set_packed_params_buffer(buffer);
-     }
-   }
- }
-
- // Pack the parameters through the first base convolution.
- MEMBERFN(void)::pack_params(const void *const weights,
-                             const void *const biases) const {
-   const auto &first_conv = _convs[0][0];
-   first_conv->pack_params(weights, biases);
- }
-
- // Pack the parameters into `buffer` through the first base convolution.
- MEMBERFN(void)::pack_params(void *const buffer, const void *const weights,
-                             const void *const biases) const {
-   const auto &first_conv = _convs[0][0];
-   first_conv->pack_params(buffer, weights, biases);
- }
-
- // Pack the parameters into `buffer` with explicit weight strides through the
- // first base convolution.
- MEMBERFN(void)::pack_params(void *const buffer, const void *const weights,
-                             const unsigned int ldrow,
-                             const unsigned int ldcol,
-                             const void *const biases) const {
-   const auto &first_conv = _convs[0][0];
-   first_conv->pack_params(buffer, weights, ldrow, ldcol, biases);
- }
-
- // Working space size for `nthreads` threads, as reported by the first base
- // convolution.
- MEMBERFN(size_t)::get_working_space_size(unsigned int nthreads) const {
-   const auto &first_conv = _convs[0][0];
-   return first_conv->get_working_space_size(nthreads);
- }
-
- // Give every base convolution the same working space buffer.
- MEMBERFN(void)::set_working_space(void *const ws) {
-   for (auto &conv_row : _convs) {
-     for (auto &engine : conv_row) {
-       engine->set_working_space(ws);
-     }
-   }
- }
-
- // Size of the work window, as reported by the first base convolution.
- MEMBERFN(unsigned int)::get_window(void) const {
-   const auto &first_conv = _convs[0][0];
-   return first_conv->get_window();
- }
-
- // Run the window range [start, stop) on every base convolution, one after
- // another, on behalf of thread `threadid`.
- MEMBERFN(void)::run(const unsigned int start, const unsigned int stop,
-                     const unsigned int threadid) {
-   for (auto &conv_row : _convs) {
-     for (auto &engine : conv_row) {
-       engine->run(start, stop, threadid);
-     }
-   }
- }
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
deleted file mode 100644
index a00a1ef04a..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
+++ /dev/null
@@ -1,439 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- * NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include "arm.hpp"
-#include "impl_base.hpp"
-
-#pragma once
-
-using namespace neon_convolution_kernels;
-
-namespace depthwise
-{
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols, StrideRows, StrideCols,
- float16_t, float16_t, float16_t
->::DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : Base( // fp16 constructor without explicit output sizes: every argument is forwarded unchanged to Base
- n_batches, n_input_rows, n_input_cols, n_channels, activation,
- padding_top, padding_left, padding_bottom, padding_right
- )
-{
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols, StrideRows, StrideCols,
- float16_t, float16_t, float16_t
->::DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : Base( // fp16 constructor with caller-supplied output sizes: every argument is forwarded unchanged to Base
- n_batches, n_input_rows, n_input_cols, n_channels,
- n_output_rows, n_output_cols, activation,
- padding_top, padding_left, padding_bottom, padding_right
- )
-{
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols, StrideRows, StrideCols,
- float16_t, float16_t, float16_t
->::execute_tile(
- int n_channels,
- const void *weights_biases_ptr, // read below as float16_t: bias value(s) followed by the kernel weights
- const float16_t *input,
- const unsigned int in_row_stride, // element (not byte) offset between consecutive input rows
- const unsigned int in_col_stride, // element (not byte) offset between consecutive input columns
- float16_t *output,
- const unsigned int out_row_stride, // element (not byte) offset between consecutive output rows
- const unsigned int out_col_stride // element (not byte) offset between consecutive output columns
-)
-{
- // Set up restrict-qualified working pointers; the packed parameters are read as float16_t
- const float16_t* __restrict__ inptr_base = input;
- float16_t* __restrict__ outptr_base = output;
- const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr);
-
- // Vector loop of the depthwise convolution: 8 channels are processed per iteration
- int channels_remaining = n_channels;
- for (; channels_remaining >= 8; channels_remaining -= 8)
- {
- // Load the inner input tile: one 8-lane vector per (row, column) position
- float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols];
- for (int i = 0; i < Base::inner_tile_rows; i++)
- {
- const float16_t* const inptr_row = inptr_base + i*in_row_stride;
- for (int j = 0; j < Base::inner_tile_cols; j++)
- {
- u[i][j] = vld1q_f16(inptr_row + j*in_col_stride);
- }
- }
- inptr_base += 8; // step to the next group of 8 channels
-
- // Load 8 biases, then 8 values for each of the KernelRows x KernelCols weights, reading params sequentially
- float16x8_t vbias = vld1q_f16(params);
- params += 8;
-
- float16x8_t w[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- w[i][j] = vld1q_f16(params);
- params += 8;
- }
- }
-
- // Compute every output point of the tile
- float16x8_t v[OutputTileRows][OutputTileCols];
- for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
- {
- for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
- {
- v[out_i][out_j] = vbias; // the accumulator starts at the bias
-
- // First input co-ordinate (row, column) read for this output point
- const int base_i = out_i * StrideRows;
- const int base_j = out_j * StrideCols;
-
- // Accumulate weight * input over the whole kernel window
- for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
- {
- const unsigned int i = base_i + in_i;
- for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
- {
- const unsigned int j = base_j + in_j;
-
- // v[out_i][out_j] += w[in_i][in_j] * u[i][j]; (done as a separate multiply and add)
- v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
- }
- }
-
- // Apply the activation: ReLU and ReLU6 clamp below at 0, ReLU6 additionally clamps above at 6
- if (Activation == ActivationFunction::ReLU ||
- Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f));
- }
- if (Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f));
- }
- }
- }
-
- // Store the output tile: one 8-lane vector per output point
- for (unsigned int i = 0; i < OutputTileRows; i++)
- {
- float16_t* const outptr_row = outptr_base + i*out_row_stride;
- for (unsigned int j = 0; j < OutputTileCols; j++)
- {
- vst1q_f16(outptr_row + j*out_col_stride, v[i][j]);
- }
- }
- outptr_base += 8; // step to the next group of 8 channels
- }
- for (; channels_remaining; channels_remaining--) // scalar tail: leftover channels are handled one at a time
- {
- // Load the inner input tile for a single channel
- float16_t u[Base::inner_tile_rows][Base::inner_tile_cols];
- for (int i = 0; i < Base::inner_tile_rows; i++)
- {
- const float16_t* const inptr_row = inptr_base + i*in_row_stride;
- for (int j = 0; j < Base::inner_tile_cols; j++)
- {
- u[i][j] = *(inptr_row + j*in_col_stride);
- }
- }
- inptr_base++;
-
- // Load one bias, then the KernelRows x KernelCols weights of this channel
- float16_t bias = *(params++);
- float16_t w[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- w[i][j] = *(params++);
- }
- }
-
- // Compute every output point of the tile
- float16_t v[OutputTileRows][OutputTileCols];
- for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
- {
- for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
- {
- // Initialise the accumulator with the bias
- v[out_i][out_j] = bias;
-
- // First input co-ordinate (row, column) read for this output point
- const int base_i = out_i * StrideRows;
- const int base_j = out_j * StrideCols;
-
- // Accumulate weight * input over the whole kernel window
- for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
- {
- const unsigned int i = base_i + in_i;
- for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
- {
- const int j = base_j + in_j;
- v[out_i][out_j] += w[in_i][in_j] * u[i][j];
- }
- }
-
- // Apply the activation: ReLU and ReLU6 clamp below at 0, ReLU6 additionally clamps above at 6
- if (Activation == ActivationFunction::ReLU ||
- Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]);
- }
- if (Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]);
- }
- }
- }
-
- // Store the output tile for this single channel
- for (unsigned int i = 0; i < OutputTileRows; i++)
- {
- float16_t* const outptr_row = outptr_base + i*out_row_stride;
- for (unsigned int j = 0; j < OutputTileCols; j++)
- {
- *(outptr_row + j*out_col_stride) = v[i][j];
- }
- }
- outptr_base++;
- }
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols, StrideRows, StrideCols,
- float16_t, float16_t, float16_t
->::execute_tile(
- int n_channels,
- const void *weights_biases_ptr, // read below as float16_t: bias value(s) followed by the kernel weights
- const float16_t * inptrs[Base::inner_tile_rows][Base::inner_tile_cols], // one input pointer per inner-tile position
- float16_t *outptrs[Base::output_tile_rows][Base::output_tile_cols] // one output pointer per output-tile position
-)
-{
- // Packed parameters are read as float16_t; n is the channel offset applied to every input/output pointer
- const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr);
- int n = 0;
-
- // Vector loop of the depthwise convolution: 8 channels are processed per iteration
- int channels_remaining = n_channels;
- for (; channels_remaining >= 8; channels_remaining -= 8, n += 8)
- {
- // Load the inner input tile: one 8-lane vector per position, read at channel offset n
- float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols];
- for (int i = 0; i < Base::inner_tile_rows; i++)
- {
- for (int j = 0; j < Base::inner_tile_cols; j++)
- {
- u[i][j] = vld1q_f16(inptrs[i][j] + n);
- }
- }
-
- // Load 8 biases, then 8 values for each of the KernelRows x KernelCols weights, reading params sequentially
- float16x8_t vbias = vld1q_f16(params);
- params += 8;
-
- float16x8_t w[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- w[i][j] = vld1q_f16(params);
- params += 8;
- }
- }
-
- // Compute every output point of the tile
- float16x8_t v[OutputTileRows][OutputTileCols];
- for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
- {
- for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
- {
- v[out_i][out_j] = vbias; // the accumulator starts at the bias
-
- // First input co-ordinate (row, column) read for this output point
- const int base_i = out_i * StrideRows;
- const int base_j = out_j * StrideCols;
-
- // Accumulate weight * input over the whole kernel window
- for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
- {
- const unsigned int i = base_i + in_i;
- for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
- {
- const unsigned int j = base_j + in_j;
-
- // v[out_i][out_j] += w[in_i][in_j] * u[i][j]; (done as a separate multiply and add)
- v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
- }
- }
-
- // Apply the activation: ReLU and ReLU6 clamp below at 0, ReLU6 additionally clamps above at 6
- if (Activation == ActivationFunction::ReLU ||
- Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f));
- }
- if (Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f));
- }
- }
- }
-
- // Store the output tile: one 8-lane vector per output pointer, written at channel offset n
- for (unsigned int i = 0; i < OutputTileRows; i++)
- {
- for (unsigned int j = 0; j < OutputTileCols; j++)
- {
- vst1q_f16(outptrs[i][j] + n, v[i][j]);
- }
- }
- }
- for (; channels_remaining; channels_remaining--, n++) // scalar tail: leftover channels are handled one at a time
- {
- // Load the inner input tile for a single channel at offset n
- float16_t u[Base::inner_tile_rows][Base::inner_tile_cols];
- for (int i = 0; i < Base::inner_tile_rows; i++)
- {
- for (int j = 0; j < Base::inner_tile_cols; j++)
- {
- u[i][j] = *(inptrs[i][j] + n);
- }
- }
-
- // Load one bias, then the KernelRows x KernelCols weights of this channel
- float16_t bias = *(params++);
- float16_t w[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- w[i][j] = *(params++);
- }
- }
-
- // Compute every output point of the tile
- float16_t v[OutputTileRows][OutputTileCols];
- for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
- {
- for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
- {
- // Initialise the accumulator with the bias
- v[out_i][out_j] = bias;
-
- // First input co-ordinate (row, column) read for this output point
- const int base_i = out_i * StrideRows;
- const int base_j = out_j * StrideCols;
-
- // Accumulate weight * input over the whole kernel window
- for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
- {
- const unsigned int i = base_i + in_i;
- for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
- {
- const int j = base_j + in_j;
- v[out_i][out_j] += w[in_i][in_j] * u[i][j];
- }
- }
-
- // Apply the activation: ReLU and ReLU6 clamp below at 0, ReLU6 additionally clamps above at 6
- if (Activation == ActivationFunction::ReLU ||
- Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]);
- }
- if (Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]);
- }
- }
- }
-
- // Store the output tile for this single channel at offset n
- for (unsigned int i = 0; i < OutputTileRows; i++)
- {
- for (unsigned int j = 0; j < OutputTileCols; j++)
- {
- *(outptrs[i][j] + n) = v[i][j];
- }
- }
- }
-}
-
-} // namespace depthwise
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
deleted file mode 100644
index b0d8126a40..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
+++ /dev/null
@@ -1,438 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- * NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-
-#include "arm.hpp"
-#include "impl_base.hpp"
-
-#pragma once
-
-using namespace neon_convolution_kernels;
-
-namespace depthwise
-{
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols, StrideRows, StrideCols,
- float, float, float
->::DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : Base( // fp32 constructor without explicit output sizes: every argument is forwarded unchanged to Base
- n_batches, n_input_rows, n_input_cols, n_channels, activation,
- padding_top, padding_left, padding_bottom, padding_right
- )
-{
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols, StrideRows, StrideCols,
- float, float, float
->::DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : Base( // fp32 constructor with caller-supplied output sizes: every argument is forwarded unchanged to Base
- n_batches, n_input_rows, n_input_cols, n_channels,
- n_output_rows, n_output_cols, activation,
- padding_top, padding_left, padding_bottom, padding_right
- )
-{
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols, StrideRows, StrideCols,
- float, float, float
->::execute_tile(
- int n_channels,
- const void *weights_biases_ptr, // read below as float: bias value(s) followed by the kernel weights
- const float *input,
- const unsigned int in_row_stride, // element (not byte) offset between consecutive input rows
- const unsigned int in_col_stride, // element (not byte) offset between consecutive input columns
- float *output,
- const unsigned int out_row_stride, // element (not byte) offset between consecutive output rows
- const unsigned int out_col_stride // element (not byte) offset between consecutive output columns
-)
-{
- // Set up restrict-qualified working pointers; the packed parameters are read as float
- const float* __restrict__ inptr_base = input;
- float* __restrict__ outptr_base = output;
- const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr);
-
- // Vector loop of the depthwise convolution: 4 channels are processed per iteration
- int channels_remaining = n_channels;
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Load the inner input tile: one 4-lane vector per (row, column) position
- float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols];
- for (int i = 0; i < Base::inner_tile_rows; i++)
- {
- const float* const inptr_row = inptr_base + i*in_row_stride;
- for (int j = 0; j < Base::inner_tile_cols; j++)
- {
- u[i][j] = vld1q_f32(inptr_row + j*in_col_stride);
- }
- }
- inptr_base += 4; // step to the next group of 4 channels
-
- // Load 4 biases, then 4 values for each of the KernelRows x KernelCols weights, reading params sequentially
- float32x4_t vbias = vld1q_f32(params);
- params += 4;
-
- float32x4_t w[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- w[i][j] = vld1q_f32(params);
- params += 4;
- }
- }
-
- // Compute every output point of the tile
- float32x4_t v[OutputTileRows][OutputTileCols];
- for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
- {
- for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
- {
- v[out_i][out_j] = vbias; // the accumulator starts at the bias
-
- // First input co-ordinate (row, column) read for this output point
- const int base_i = out_i * StrideRows;
- const int base_j = out_j * StrideCols;
-
- // Accumulate weight * input over the whole kernel window
- for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
- {
- const unsigned int i = base_i + in_i;
- for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
- {
- const unsigned int j = base_j + in_j;
-
- // v[out_i][out_j] += w[in_i][in_j] * u[i][j]; (done with a multiply-accumulate intrinsic)
- v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
- }
- }
-
- // Apply the activation: ReLU and ReLU6 clamp below at 0, ReLU6 additionally clamps above at 6
- if (Activation == ActivationFunction::ReLU ||
- Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f));
- }
- if (Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f));
- }
- }
- }
-
- // Store the output tile: one 4-lane vector per output point
- for (unsigned int i = 0; i < OutputTileRows; i++)
- {
- float* const outptr_row = outptr_base + i*out_row_stride;
- for (unsigned int j = 0; j < OutputTileCols; j++)
- {
- vst1q_f32(outptr_row + j*out_col_stride, v[i][j]);
- }
- }
- outptr_base += 4; // step to the next group of 4 channels
- }
- for (; channels_remaining; channels_remaining--) // scalar tail: leftover channels are handled one at a time
- {
- // Load the inner input tile for a single channel
- float u[Base::inner_tile_rows][Base::inner_tile_cols];
- for (int i = 0; i < Base::inner_tile_rows; i++)
- {
- const float* const inptr_row = inptr_base + i*in_row_stride;
- for (int j = 0; j < Base::inner_tile_cols; j++)
- {
- u[i][j] = *(inptr_row + j*in_col_stride);
- }
- }
- inptr_base++;
-
- // Load one bias, then the KernelRows x KernelCols weights of this channel
- float bias = *(params++);
- float w[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- w[i][j] = *(params++);
- }
- }
-
- // Compute every output point of the tile
- float v[OutputTileRows][OutputTileCols];
- for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
- {
- for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
- {
- // Initialise the accumulator with the bias
- v[out_i][out_j] = bias;
-
- // First input co-ordinate (row, column) read for this output point
- const int base_i = out_i * StrideRows;
- const int base_j = out_j * StrideCols;
-
- // Accumulate weight * input over the whole kernel window
- for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
- {
- const unsigned int i = base_i + in_i;
- for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
- {
- const int j = base_j + in_j;
- v[out_i][out_j] += w[in_i][in_j] * u[i][j];
- }
- }
-
- // Apply the activation: ReLU and ReLU6 clamp below at 0, ReLU6 additionally clamps above at 6
- if (Activation == ActivationFunction::ReLU ||
- Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]);
- }
- if (Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]);
- }
- }
- }
-
- // Store the output tile for this single channel
- for (unsigned int i = 0; i < OutputTileRows; i++)
- {
- float* const outptr_row = outptr_base + i*out_row_stride;
- for (unsigned int j = 0; j < OutputTileCols; j++)
- {
- *(outptr_row + j*out_col_stride) = v[i][j];
- }
- }
- outptr_base++;
- }
-}
-
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols, StrideRows, StrideCols,
- float, float, float
->::execute_tile(
- int n_channels,
- const void *weights_biases_ptr, // read below as float: bias value(s) followed by the kernel weights
- const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols], // one input pointer per inner-tile position
- float *outptrs[Base::output_tile_rows][Base::output_tile_cols] // one output pointer per output-tile position
-)
-{
- const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr);
-
- // Vector loop of the depthwise convolution: 4 channels are processed per iteration
- int channels_remaining = n_channels;
- int n = 0; // channel offset applied to every input/output pointer
- for (; channels_remaining >= 4; channels_remaining -= 4, n += 4)
- {
- // Load the inner input tile: one 4-lane vector per position, read at channel offset n
- float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols];
- for (int i = 0; i < Base::inner_tile_rows; i++)
- {
- for (int j = 0; j < Base::inner_tile_cols; j++)
- {
- u[i][j] = vld1q_f32(inptrs[i][j] + n);
- }
- }
-
- // Load 4 biases, then 4 values for each of the KernelRows x KernelCols weights, reading params sequentially
- float32x4_t vbias = vld1q_f32(params);
- params += 4;
-
- float32x4_t w[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- w[i][j] = vld1q_f32(params);
- params += 4;
- }
- }
-
- // Compute every output point of the tile
- float32x4_t v[OutputTileRows][OutputTileCols];
- for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
- {
- for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
- {
- v[out_i][out_j] = vbias; // the accumulator starts at the bias
-
- // First input co-ordinate (row, column) read for this output point
- const int base_i = out_i * StrideRows;
- const int base_j = out_j * StrideCols;
-
- // Accumulate weight * input over the whole kernel window
- for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
- {
- const unsigned int i = base_i + in_i;
- for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
- {
- const unsigned int j = base_j + in_j;
-
- // v[out_i][out_j] += w[in_i][in_j] * u[i][j]; (done with a multiply-accumulate intrinsic)
- v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
- }
- }
-
- // Apply the activation: ReLU and ReLU6 clamp below at 0, ReLU6 additionally clamps above at 6
- if (Activation == ActivationFunction::ReLU ||
- Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f));
- }
- if (Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f));
- }
- }
- }
-
- // Store the output tile: one 4-lane vector per output pointer, written at channel offset n
- for (unsigned int i = 0; i < OutputTileRows; i++)
- {
- for (unsigned int j = 0; j < OutputTileCols; j++)
- {
- vst1q_f32(outptrs[i][j] + n, v[i][j]);
- }
- }
- }
- for (; channels_remaining; channels_remaining--, n++) // scalar tail: leftover channels are handled one at a time
- {
- // Load the inner input tile for a single channel at offset n
- float u[Base::inner_tile_rows][Base::inner_tile_cols];
- for (int i = 0; i < Base::inner_tile_rows; i++)
- {
- for (int j = 0; j < Base::inner_tile_cols; j++)
- {
- u[i][j] = *(inptrs[i][j] + n);
- }
- }
-
- // Load one bias, then the KernelRows x KernelCols weights of this channel
- float bias = *(params++);
- float w[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- w[i][j] = *(params++);
- }
- }
-
- // Compute every output point of the tile
- float v[OutputTileRows][OutputTileCols];
- for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
- {
- for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
- {
- // Initialise the accumulator with the bias
- v[out_i][out_j] = bias;
-
- // First input co-ordinate (row, column) read for this output point
- const int base_i = out_i * StrideRows;
- const int base_j = out_j * StrideCols;
-
- // Accumulate weight * input over the whole kernel window
- for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
- {
- const unsigned int i = base_i + in_i;
- for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
- {
- const int j = base_j + in_j;
- v[out_i][out_j] += w[in_i][in_j] * u[i][j];
- }
- }
-
- // Apply the activation: ReLU and ReLU6 clamp below at 0, ReLU6 additionally clamps above at 6
- if (Activation == ActivationFunction::ReLU ||
- Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]);
- }
- if (Activation == ActivationFunction::ReLU6)
- {
- v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]);
- }
- }
- }
-
- // Store the output tile for this single channel at offset n
- for (unsigned int i = 0; i < OutputTileRows; i++)
- {
- for (unsigned int j = 0; j < OutputTileCols; j++)
- {
- *(outptrs[i][j] + n) = v[i][j];
- }
- }
- }
-}
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
deleted file mode 100644
index e8b4c7bc0f..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- * NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-
-#include <limits>
-
-#include "arm.hpp"
-#include "impl_base.hpp"
-#include "depthwise_quantized.hpp"
-
-namespace depthwise
-{
-template < // Constructor overload: no explicit output size and no explicit rescale parameters.
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- const ActivationFunction activation,
- const QAsymm8Params& weight_quantisation,
- const QAsymm8Params& input_quantisation,
- const QAsymm8Params& output_quantisation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : QAsymm8DepthwiseConvolution( // Delegates to the overload that also takes a QAsymm8RescaleParams.
- n_batches, n_input_rows, n_input_cols, n_channels,
- activation, weight_quantisation, input_quantisation, output_quantisation,
- QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation), // Rescale parameters derived from the three quantisations.
- padding_top, padding_left, padding_bottom, padding_right
- )
-{ // Intentionally empty: the delegated-to constructor performs all initialisation.
-}
-/*
- * Constructor overload taking an explicit output size (n_output_rows,
- * n_output_cols) but no rescale parameters.
- *
- * The rescale parameters are computed from the weight, input and output
- * quantisations with QAsymm8RescaleParams::make_rescale_params, and all
- * arguments are then forwarded to the overload that accepts them.
- */
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- const ActivationFunction activation,
- const QAsymm8Params& weight_quantisation,
- const QAsymm8Params& input_quantisation,
- const QAsymm8Params& output_quantisation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : QAsymm8DepthwiseConvolution( // Delegating constructor call.
- n_batches, n_input_rows, n_input_cols, n_channels,
- n_output_rows, n_output_cols,
- activation, weight_quantisation, input_quantisation, output_quantisation,
- QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
- padding_top, padding_left, padding_bottom, padding_right
- )
-{ // Intentionally empty: the delegated-to constructor performs all initialisation.
-}
-/*
- * Constructor overload taking caller-supplied rescale parameters.
- *
- * The tensor geometry, activation and padding are passed to the Base
- * constructor; the three quantisation descriptions and the rescale parameters
- * are copied into the corresponding members.
- */
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- const ActivationFunction activation,
- const QAsymm8Params& weight_quantisation,
- const QAsymm8Params& input_quantisation,
- const QAsymm8Params& output_quantisation,
- const QAsymm8RescaleParams& rescale_params,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : Base(
- n_batches, n_input_rows, n_input_cols, n_channels, activation,
- padding_top, padding_left, padding_bottom, padding_right
- ),
- _weights_quant(weight_quantisation),
- _inputs_quant(input_quantisation),
- _output_quant(output_quantisation),
- rescale_parameters(rescale_params) // Used as given; not recomputed from the quantisations.
-{
-}
-/*
- * Constructor overload taking both an explicit output size and
- * caller-supplied rescale parameters.
- *
- * The tensor geometry (including n_output_rows / n_output_cols), activation
- * and padding are passed to the Base constructor; the three quantisation
- * descriptions and the rescale parameters are copied into the corresponding
- * members.
- */
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- const ActivationFunction activation,
- const QAsymm8Params& weight_quantisation,
- const QAsymm8Params& input_quantisation,
- const QAsymm8Params& output_quantisation,
- const QAsymm8RescaleParams& rescale_params,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : Base(
- n_batches, n_input_rows, n_input_cols, n_channels,
- n_output_rows, n_output_cols, activation,
- padding_top, padding_left, padding_bottom, padding_right
- ),
- _weights_quant(weight_quantisation),
- _inputs_quant(input_quantisation),
- _output_quant(output_quantisation),
- rescale_parameters(rescale_params) // Used as given; not recomputed from the quantisations.
-{
-}
-
-/*
- * Returns the offset of the input quantisation, converted to uint8_t.
- * Judging by the name, this is the value used for padded input elements.
- */
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-uint8_t QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_input_padding_value(void) const
-{
-  const uint8_t input_zero_point = _inputs_quant.offset;
-  return input_zero_point;
-}
-/*
- * Pack the biases and weights into `buffer`, grouped by channel.
- *
- * Channels are packed in groups of `veclen`: 8 while at least 8 channels
- * remain, otherwise 1.  Each group is laid out as
- *   1. veclen int32_t biases (0 for every channel when `biases` is null),
- *   2. for each kernel element (i, j) in row-major order, the veclen uint8_t
- *      weights of that element, one per channel of the group.
- *
- * buffer            : destination; its size is not checked here.
- * weights           : uint8_t weights.  Element (i, j) of a channel is read
- *                     at i*weight_row_stride + j*weight_col_stride and the
- *                     next channel is read one byte further on.
- * weight_row_stride : offset between kernel rows, in elements of `weights`.
- * weight_col_stride : offset between kernel columns, in elements of `weights`.
- * biases            : int32_t biases, one per channel, or null.
- */
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-void QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_pack_params(
- void * const buffer,
- const void * const weights,
- const unsigned int weight_row_stride,
- const unsigned int weight_col_stride,
- const void * const biases
-) const
-{
- const uint8_t *wptr = static_cast<const uint8_t *>(weights);
- const int32_t *bptr = static_cast<const int32_t *>(biases);
- uint8_t *outptr = static_cast<uint8_t *>(buffer);
-
- // Start by packing 8 channels per group on both Aarch64 and Aarch32
- // ("doubles" presumably meant 64-bit registers). NOTE For SVE set this to half the vector length.
- unsigned int veclen = 8;
-
- // While there are channels left to process, pack a vector length of them at
- // a time and reduce the size of vector used as the size of the tensor
- // decreases. Each iteration advances outptr past the group just written.
- for (
- unsigned int n_channels = this->n_channels(); n_channels;
- n_channels -= veclen,
- outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
- )
- {
- // NOTE Ignore this section if using SVE, the vector length remains the
- // same and we just don't fill a full register for the tail.
- while (n_channels < veclen)
- {
- // Reduce the vector length to either 8 or 1 (scalar); starting from 8 this always yields 1.
- // TODO Support more vector lengths in `execute_tile`.
- veclen = (veclen == 16) ? 8 : 1;
- }
-
- // Get pointers to bias and weight portions of the output structure.
- int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr); // NOTE(review): scalar groups advance outptr by 4 + kernel_rows*kernel_cols bytes, so this may not be 4-byte aligned - confirm.
- uint8_t *out_wptr = outptr + veclen*sizeof(int32_t);
-
- // Copy a vector length of elements
- for (unsigned int n = 0; n < veclen && n < n_channels; n++)
- {
- const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0; // A null bias pointer packs zero biases.
- out_bptr[n] = bias;
-
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- uint8_t *row_outptr = out_wptr + i*KernelCols*veclen;
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- uint8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
- row_outptr[j*veclen + n] = w; // Channel-interleaved: the veclen weights of element (i, j) are contiguous.
- }
- }
- wptr++; // Move on to the next channel of the source weights.
- }
- }
-}
-/*
- * Compute one output tile of a depthwise convolution on asymmetric 8-bit
- * quantised data.
- *
- * Channels are processed 8 at a time with NEON while at least 8 remain and
- * one at a time in plain C++ afterwards.  For each group the packed
- * parameters are read as the int32 bias(es) followed by the uint8 weights.
- * weight_offset and input_offset are subtracted from the weights and inputs,
- * the products are accumulated in 32 bits on top of the bias, and the
- * accumulator is requantised (saturating_doubling_high_mul followed by
- * rounding_divide_by_exp2), offset by output_offset, clamped to
- * [clamp_min, clamp_max] and stored as uint8.
- *
- * get_input_ptr(i, j, channel) and get_output_ptr(i, j, channel) are called
- * to obtain the address of element (i, j) of the input / output tile for the
- * given channel.
- *
- * Note the parameter order: clamp_max comes before clamp_min.
- */
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename FInput, typename FOutput
->
-static inline void tilefn(
- int n_channels,
- const void* packed_params,
- FInput &get_input_ptr,
- FOutput &get_output_ptr,
- const int32_t clamp_max,
- const int32_t clamp_min,
- const uint8_t input_offset,
- const uint8_t weight_offset,
- const uint8_t output_offset,
- const int32_t requant_multiplier,
- const int32_t requant_shift
-)
-{
- constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows; // Input rows read per output tile.
- constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols; // Input columns read per output tile.
-
- // Offset into channels
- int channel = 0;
-
- // Byte type pointer to weights and biases
- const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);
-
- for (; n_channels >= 8; n_channels -= 8, channel += 8) // Vector path: 8 channels per iteration.
- {
- const int32x4_t biases[2] = { // 8 int32 biases: channels 0-3 and 4-7 of the group.
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
- };
- wbptr += 8*sizeof(int32_t);
-
- int16x8_t weights[KernelRows][KernelCols];
- const uint8x8_t woffset = vdup_n_u8(weight_offset);
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- const uint8x8_t w = vld1_u8(wbptr); // 8 weights of kernel element (i, j), one per channel.
- weights[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(w, woffset)); // Widening subtract; the 16-bit result is reinterpreted as signed.
- wbptr += 8;
- }
- }
-
- int16x8_t inputs[InnerTileRows][InnerTileCols];
- const uint8x8_t ioffset = vdup_n_u8(input_offset);
- for (unsigned int i = 0; i < InnerTileRows; i++)
- {
- for (unsigned int j = 0; j < InnerTileCols; j++)
- {
- const auto x = vld1_u8(get_input_ptr(i, j, channel));
- inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset)); // Same offset removal as for the weights.
- }
- }
-
- for (unsigned int oi = 0; oi < OutputTileRows; oi++)
- {
- for (unsigned int oj = 0; oj < OutputTileCols; oj++)
- {
- int32x4_t acc_a = biases[0], acc_b = biases[1]; // Accumulators start from the biases.
-
- for (unsigned int wi = 0; wi < KernelRows; wi++)
- {
- for (unsigned int wj = 0; wj < KernelCols; wj++)
- {
- const auto w = weights[wi][wj];
- const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
-#ifndef __aarch64__ // AArch32: widening multiply-accumulate through intrinsics.
- acc_a = vmlal_s16(acc_a, vget_low_s16(w), vget_low_s16(x));
- acc_b = vmlal_s16(acc_b, vget_high_s16(w), vget_high_s16(x));
-#else // AArch64: the same two multiply-accumulates written as inline assembly (smlal / smlal2).
- asm("smlal %[acc_a].4s, %[w].4h, %[x].4h\n"
- "smlal2 %[acc_b].4s, %[w].8h, %[x].8h\n"
- : [acc_a] "+w"(acc_a), [acc_b] "+w"(acc_b)
- : [w] "w"(w), [x] "w"(x));
-#endif // __aarch64__
- }
- }
-
- int32x4_t final_accs[2];
- for (unsigned int i = 0; i < 2; i++)
- {
- const int32x4_t y = rounding_divide_by_exp2( // Requantise: fixed-point multiply, then rounding divide by 2^requant_shift.
- saturating_doubling_high_mul((i == 0 ? acc_a : acc_b), requant_multiplier),
- requant_shift);
- const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset));
- final_accs[i] = vaddq_s32(y, offset);
- final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
- final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
- }
-
-#ifndef __aarch64__ // Narrow 8 x int32 to 8 x uint8 by keeping the even 16-bit lanes, then the even 8-bit lanes.
- const int16x8x2_t zelems = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
- vreinterpretq_s16_s32(final_accs[1]));
- const int8x16_t elems = vreinterpretq_s8_s16(zelems.val[0]);
-
- const int8x16x2_t zoutput = vuzpq_s8(elems, elems);
- const uint8x8_t output =
- vget_low_u8(vreinterpretq_u8_s8(zoutput.val[0]));
- vst1_u8(get_output_ptr(oi, oj, channel), output);
-#else // AArch64: same narrowing using the vuzp1q forms.
- const int8x16_t elems = vreinterpretq_s8_s16(
- vuzp1q_s16(vreinterpretq_s16_s32(final_accs[0]),
- vreinterpretq_s16_s32(final_accs[1])));
- const uint8x8_t output =
- vget_low_u8(vreinterpretq_u8_s8(vuzp1q_s8(elems, elems)));
- vst1_u8(get_output_ptr(oi, oj, channel), output);
-#endif // __aarch64__
- }
- }
- }
- for (; n_channels; n_channels--, channel++) // Scalar path: remaining channels, one per iteration.
- {
- // Load bias (NOTE(review): wbptr advances by 4 + KernelRows*KernelCols bytes per channel, so this int32 read may be unaligned - confirm)
- const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
- wbptr += sizeof(int32_t);
-
- // Load weights, removing the weight offset
- int16_t weights[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- weights[i][j] = *(wbptr++) - weight_offset;
- }
- }
-
- // Load the input activations, removing the input offset
- int16_t inputs[InnerTileRows][InnerTileCols];
- for (unsigned int i = 0; i < InnerTileRows; i++)
- {
- for (unsigned int j = 0; j < InnerTileCols; j++)
- {
- inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
- }
- }
-
- // Perform the convolution
- for (unsigned int oi = 0; oi < OutputTileRows; oi++)
- {
- for (unsigned int oj = 0; oj < OutputTileCols; oj++)
- {
- int32_t acc = bias;
-
- for (unsigned int wi = 0; wi < KernelRows; wi++)
- {
- for (unsigned int wj = 0; wj < KernelCols; wj++)
- {
- const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
- acc += w * x;
- }
- }
-
- // Requantize, add the output offset and clamp
- acc = rounding_divide_by_exp2(
- saturating_doubling_high_mul(acc, requant_multiplier),
- requant_shift);
- acc += output_offset;
- acc = std::max(acc, clamp_min);
- acc = std::min(acc, clamp_max);
- uint8_t output = static_cast<uint8_t>(acc); // Lossless only if the clamp limits lie within [0, 255].
- *(get_output_ptr(oi, oj, channel)) = output;
- }
- }
- }
-}
-
-/*
- * Turn the activation function into clamp limits and run tilefn.
- *
- * The limits start as the full uint8_t range.  ReLU and ReLU6 raise the lower
- * limit to the output quantisation offset; ReLU6 additionally lowers the upper
- * limit to the quantised value of 6.0f.  The offsets of the three
- * quantisations and the requantisation multiplier / shift are passed through
- * to tilefn unchanged.
- */
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols,
-  typename FInput, typename FOutput
->
-static inline void execute_tilefn(
-  int n_channels,
-  const void* packed_params,
-  const nck::ActivationFunction actfn,
-  FInput &get_input_ptr,
-  FOutput &get_output_ptr,
-  const QAsymm8Params &input_quant,
-  const QAsymm8Params &weight_quant,
-  const QAsymm8Params &output_quant,
-  const QAsymm8RescaleParams &requant
-)
-{
-  const int32_t u8_lowest = std::numeric_limits<uint8_t>::min();
-  const int32_t u8_highest = std::numeric_limits<uint8_t>::max();
-
-  const bool is_relu = (actfn == nck::ActivationFunction::ReLU);
-  const bool is_relu6 = (actfn == nck::ActivationFunction::ReLU6);
-
-  // Lower rail: the output zero point whenever the activation rectifies.
-  const int32_t lower_limit = (is_relu || is_relu6)
-    ? std::max<int32_t>(u8_lowest, output_quant.offset)
-    : u8_lowest;
-
-  // Upper rail: the quantised representation of 6.0 for ReLU6 only.
-  const int32_t upper_limit = is_relu6
-    ? std::min<int32_t>(u8_highest, output_quant.quantize(6.0f))
-    : u8_highest;
-
-  // tilefn takes the upper limit before the lower one.
-  tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
-         StrideCols>(
-    n_channels, packed_params, get_input_ptr, get_output_ptr,
-    upper_limit, lower_limit,
-    input_quant.offset, weight_quant.offset, output_quant.offset,
-    requant.multiplier, requant.shift);
-}
-
-/*
- * Process one tile whose input and output are each described by a base
- * pointer, a row stride and a column stride.
- *
- * Two address helpers are built from the pointers and strides and handed,
- * together with the stored quantisation and rescale parameters, to
- * execute_tilefn.
- */
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-template <nck::ActivationFunction Activation>
-void QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
-  int n_channels,
-  const void* packed_params,
-  const uint8_t* inptr,
-  unsigned int in_row_stride,
-  unsigned int in_col_stride,
-  uint8_t* outptr,
-  unsigned int out_row_stride,
-  unsigned int out_col_stride
-)
-{
-  // Address of input element (row, col) for channel c.
-  const auto input_at = [inptr, in_row_stride, in_col_stride](
-      const int row, const int col, const int c) {
-    const uint8_t *const row_start = inptr + row * in_row_stride;
-    const uint8_t *const element = row_start + col * in_col_stride;
-    return element + c;
-  };
-
-  // Address of output element (row, col) for channel c.
-  const auto output_at = [outptr, out_row_stride, out_col_stride](
-      const int row, const int col, const int c) {
-    uint8_t *const row_start = outptr + row * out_row_stride;
-    uint8_t *const element = row_start + col * out_col_stride;
-    return element + c;
-  };
-
-  execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-                 StrideRows, StrideCols>(
-    n_channels, packed_params, Activation, input_at, output_at,
-    _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
-}
-
-/*
- * Process one tile whose input and output elements are given as arrays of
- * per-element pointers (inptrs[i][j] / outptrs[i][j]).
- *
- * The channel index is added to the selected pointer to address a single
- * value; the helpers are handed, together with the stored quantisation and
- * rescale parameters, to execute_tilefn.
- */
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-template <nck::ActivationFunction Activation>
-void QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
-  int n_channels,
-  const void* packed_params,
-  const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-  uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-)
-{
-  // Address of input element (row, col) for channel c.
-  const auto input_at = [inptrs](const int row, const int col, const int c) {
-    const uint8_t *const element = inptrs[row][col];
-    return element + c;
-  };
-
-  // Address of output element (row, col) for channel c.
-  const auto output_at = [outptrs](const int row, const int col, const int c) {
-    uint8_t *const element = outptrs[row][col];
-    return element + c;
-  };
-
-  execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-                 StrideRows, StrideCols>(
-    n_channels, packed_params, Activation, input_at, output_at,
-    _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
-}
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp
deleted file mode 100644
index 68e20d98a9..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp
+++ /dev/null
@@ -1,457 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- * NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-
-#include <limits>
-
-#include "arm.hpp"
-#include "impl_base.hpp"
-#include "depthwise_quantized.hpp"
-
-#pragma once
-
-namespace {
-/*
- * Compute one output tile of a depthwise convolution with uint8 inputs and
- * outputs and int8 weights that carry per-channel requantisation parameters.
- *
- * Channels are processed 8 at a time with NEON while at least 8 remain and
- * one at a time in plain C++ afterwards.  For each group the packed
- * parameters are read as the int32 biases, the int32 multipliers and the
- * int32 shifts of the group, followed by the int8 weights.  Weights are
- * sign-extended without any offset; input_offset is subtracted from the
- * inputs.  Products are accumulated in 32 bits on top of the bias, and the
- * accumulator is requantised (saturating_doubling_high_mul followed by
- * rounding_divide_by_exp2) with the channel's multiplier and shift, offset by
- * output_offset, clamped to [clamp_min, clamp_max] and stored as uint8.
- *
- * get_input_ptr(i, j, channel) and get_output_ptr(i, j, channel) are called
- * to obtain the address of element (i, j) of the input / output tile for the
- * given channel.
- */
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename FInput, typename FOutput
->
-static inline void tilefn_hybrid(
- int n_channels,
- const void* packed_params,
- FInput &get_input_ptr,
- FOutput &get_output_ptr,
- int32_t clamp_min,
- int32_t clamp_max,
- uint8_t input_offset,
- uint8_t output_offset
-)
-{
- constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows; // Input rows read per output tile.
- constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols; // Input columns read per output tile.
-
- // Offset into channels
- int channel = 0;
-
- // Byte type pointer to weights and biases
- const int8_t *wbptr = static_cast<const int8_t *>(packed_params);
-
- for (; n_channels >= 8; n_channels -= 8, channel += 8) // Vector path: 8 channels per iteration.
- {
- const int32x4_t biases[2] = { // int32 words 0-7 of the group.
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
- };
- const int32x4_t multipliers[2] = { // int32 words 8-15 of the group.
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12),
- };
- const int32x4_t shifts[2] = { // int32 words 16-23 of the group.
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 16),
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 20),
- };
- wbptr += 24*sizeof(int32_t);
-
- int16x8_t weights[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- const auto w = vld1_s8(wbptr); // 8 weights of kernel element (i, j), one per channel.
- weights[i][j] = reinterpret_cast<int16x8_t>(vmovl_s8(w)); // Sign-extend to 16 bits; no weight offset is applied.
- wbptr += 8;
- }
- }
-
- int16x8_t inputs[InnerTileRows][InnerTileCols];
- const uint8x8_t ioffset = vdup_n_u8(input_offset);
- for (unsigned int i = 0; i < InnerTileRows; i++)
- {
- for (unsigned int j = 0; j < InnerTileCols; j++)
- {
- const auto x = vld1_u8(get_input_ptr(i, j, channel));
- inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset)); // Widening subtract; the 16-bit result is reinterpreted as signed.
- }
- }
-
- for (unsigned int oi = 0; oi < OutputTileRows; oi++)
- {
- for (unsigned int oj = 0; oj < OutputTileCols; oj++)
- {
- int32x4_t accs[2];
- for (unsigned int i = 0; i < 2; i++)
- {
- accs[i] = biases[i]; // Accumulators start from the biases.
- }
-
- for (unsigned int wi = 0; wi < KernelRows; wi++)
- {
- for (unsigned int wj = 0; wj < KernelCols; wj++)
- {
- const auto w = weights[wi][wj];
- const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
- accs[0] = vmlal_s16(accs[0], vget_low_s16(w), vget_low_s16(x));
- accs[1] = vmlal_s16(accs[1], vget_high_s16(w), vget_high_s16(x));
- }
- }
-
- int32x4_t final_accs[2];
- for (unsigned int i = 0; i < 2; i++)
- {
- const int32x4_t y = rounding_divide_by_exp2( // Requantise with the per-channel multipliers and shifts exactly as packed.
- saturating_doubling_high_mul(accs[i], multipliers[i]),
- shifts[i]);
- const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset));
- final_accs[i] = vaddq_s32(y, offset);
- final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
- final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
- }
-
- const auto elems_s16 = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]), // Narrow 8 x int32 to 8 x uint8: keep the even 16-bit lanes, then the even 8-bit lanes.
- vreinterpretq_s16_s32(final_accs[1]));
- const int8x16_t elems = vreinterpretq_s8_s16(elems_s16.val[0]);
- const uint8x8_t output =
- vget_low_u8(vreinterpretq_u8_s8(vuzpq_s8(elems, elems).val[0]));
-
- vst1_u8(get_output_ptr(oi, oj, channel), output);
- }
- }
- }
-
- for (; n_channels; n_channels--, channel++) // Scalar path: remaining channels, one per iteration.
- {
- // Load bias, multiplier and shift (NOTE(review): wbptr advances by 12 + KernelRows*KernelCols bytes per channel, so these int32 reads may be unaligned - confirm)
- const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
- const int32_t multiplier = *reinterpret_cast<const int32_t *>(wbptr + sizeof(int32_t));
- const int32_t shift = *reinterpret_cast<const int32_t *>(wbptr + 2*sizeof(int32_t));
-
- wbptr += 3*sizeof(int32_t);
-
- // Load weights (signed, no offset)
- int16_t weights[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- weights[i][j] = *(wbptr++);
- }
- }
-
- // Load the input activations, removing the input offset
- int16_t inputs[InnerTileRows][InnerTileCols];
- for (unsigned int i = 0; i < InnerTileRows; i++)
- {
- for (unsigned int j = 0; j < InnerTileCols; j++)
- {
- inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
- }
- }
-
- // Perform the convolution
- for (unsigned int oi = 0; oi < OutputTileRows; oi++)
- {
- for (unsigned int oj = 0; oj < OutputTileCols; oj++)
- {
- int32_t acc = bias;
-
- for (unsigned int wi = 0; wi < KernelRows; wi++)
- {
- for (unsigned int wj = 0; wj < KernelCols; wj++)
- {
- const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
- acc += w * x;
- }
- }
-
- // Requantize, add the output offset and clamp
- acc = rounding_divide_by_exp2(
- saturating_doubling_high_mul(acc, multiplier),
- -shift); // Unlike the vector path, the packed shift is negated here - presumably the scalar overload takes the exponent with the opposite sign; TODO confirm.
- acc += output_offset;
- acc = std::max(acc, clamp_min);
- acc = std::min(acc, clamp_max);
- uint8_t output = static_cast<uint8_t>(acc); // Lossless only if the clamp limits lie within [0, 255].
- *(get_output_ptr(oi, oj, channel)) = output;
- }
- }
- }
-}
-
-/*
- * Turn the activation function into clamp limits and run tilefn_hybrid.
- *
- * The limits start as the full uint8_t range.  ReLU and ReLU6 both raise the
- * lower limit to the output quantisation offset (the representation of real
- * zero); ReLU6 additionally lowers the upper limit to the quantised value of
- * 6.0f.  Previously ReLU6 left the lower limit at 0, so results below real
- * zero were not rectified whenever the output offset was non-zero.
- *
- * n_channels     : number of channels to process.
- * packed_params  : parameters in the layout read by tilefn_hybrid.
- * actfn          : activation applied through the clamp limits.
- * input_quant    : supplies the input offset.
- * output_quant   : supplies the output offset and the quantisation of 6.0f.
- * get_input_ptr  : callable (i, j, channel) giving an input element address.
- * get_output_ptr : callable (i, j, channel) giving an output element address.
- */
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols,
-  typename FInput, typename FOutput
->
-static inline void execute_tilefn_hybrid(
-  int n_channels,
-  const void* packed_params,
-  const ActivationFunction actfn,
-  const qasymm8::QAsymm8Params &input_quant,
-  const qasymm8::QAsymm8Params &output_quant,
-  FInput &get_input_ptr,
-  FOutput &get_output_ptr) {
-
-  // Compute min/max clamp values
-  int32_t clamp_min = std::numeric_limits<uint8_t>::min();
-  int32_t clamp_max = std::numeric_limits<uint8_t>::max();
-
-  // Both rectifying activations cut off everything below real zero.
-  if (actfn == ActivationFunction::ReLU ||
-      actfn == ActivationFunction::ReLU6) {
-    const int32_t bottom_rail = output_quant.offset;
-    clamp_min = std::max(clamp_min, bottom_rail);
-  }
-
-  // ReLU6 additionally cuts off everything above real six.
-  if (actfn == ActivationFunction::ReLU6) {
-    const int32_t top_rail = output_quant.quantize(6.0f);
-    clamp_max = std::min(clamp_max, top_rail);
-  }
-
-  // Call the tile execution method
-  tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
-                StrideCols>(n_channels, packed_params, get_input_ptr, get_output_ptr, clamp_min, clamp_max, input_quant.offset, output_quant.offset);
-}
-}
-
-
-
-namespace depthwise {
-using namespace qsymm8;
-template < // Constructor overload without explicit rescale parameters.
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-QSymm8HybridPerChannelDepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QSymm8HybridPerChannelDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- const ActivationFunction activation,
- const QSymm8PerChannelParams& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : QSymm8HybridPerChannelDepthwiseConvolution( // Delegates to the overload that also takes a QSymm8PerChannelRescaleParams.
- n_batches, n_input_rows, n_input_cols, n_channels,
- activation, weight_quantisation, input_quantisation, output_quantisation,
- QSymm8PerChannelRescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation), // Rescale parameters derived from the three quantisations.
- padding_top, padding_left, padding_bottom, padding_right
- )
-{ // Intentionally empty: the delegated-to constructor performs all initialisation.
-}
-/*
- * Constructor overload taking caller-supplied per-channel rescale
- * parameters.
- *
- * The tensor geometry, activation and padding are passed to the Base
- * constructor; the weight, input and output quantisation descriptions and the
- * rescale parameters are copied into the corresponding members.
- */
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-QSymm8HybridPerChannelDepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QSymm8HybridPerChannelDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- const ActivationFunction activation,
- const QSymm8PerChannelParams& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- const QSymm8PerChannelRescaleParams& rescale_params,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : Base(
- n_batches, n_input_rows, n_input_cols, n_channels, activation,
- padding_top, padding_left, padding_bottom, padding_right
- ),
- _weights_quant(weight_quantisation),
- _input_quant(input_quantisation),
- _output_quant(output_quantisation),
- _rescale_parameters(rescale_params) // Used as given; not recomputed from the quantisations.
-{
-}
-
-/*
- * Returns the offset of the input quantisation, converted to uint8_t.
- * Judging by the name, this is the value used for padded input elements.
- */
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-uint8_t QSymm8HybridPerChannelDepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_input_padding_value(void) const
-{
-  const uint8_t input_zero_point = _input_quant.offset;
-  return input_zero_point;
-}
-/*
- * Pack biases, per-channel requantisation parameters and weights into
- * `buffer`, grouped by channel.
- *
- * Channels are packed in groups of `veclen`: 8 while at least 8 channels
- * remain, otherwise 1.  Each group is laid out as
- *   1. veclen int32_t biases (0 for every channel when `biases` is null),
- *   2. veclen int32_t multipliers taken from _rescale_parameters.multipliers,
- *   3. veclen int32_t shifts taken from _rescale_parameters.shifts and stored
- *      negated,
- *   4. for each kernel element (i, j) in row-major order, the veclen int8_t
- *      weights of that element, one per channel of the group.
- *
- * buffer            : destination; its size is not checked here.
- * weights           : int8_t weights.  Element (i, j) of a channel is read at
- *                     i*weight_row_stride + j*weight_col_stride and the next
- *                     channel is read one byte further on.
- * weight_row_stride : offset between kernel rows, in elements of `weights`.
- * weight_col_stride : offset between kernel columns, in elements of `weights`.
- * biases            : int32_t biases, one per channel, or null.
- */
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-void QSymm8HybridPerChannelDepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_pack_params(
- void * const buffer,
- const void * const weights,
- const unsigned int weight_row_stride,
- const unsigned int weight_col_stride,
- const void * const biases
-) const
-{
- const int8_t *wptr = static_cast<const int8_t *>(weights);
- const int32_t *bptr = static_cast<const int32_t *>(biases);
- const int32_t *mptr = static_cast<const int32_t *>(_rescale_parameters.multipliers.data());
- const int32_t *sptr = static_cast<const int32_t *>(_rescale_parameters.shifts.data());
- int8_t *outptr = static_cast<int8_t *>(buffer);
-
- // Start by packing 8 channels per group on both Aarch64 and Aarch32
- // ("doubles" presumably meant 64-bit registers). NOTE For SVE set this to half the vector length.
- unsigned int veclen = 8;
-
- // While there are channels left to process, pack a vector length of them at
- // a time and reduce the size of vector used as the size of the tensor
- // decreases. Each iteration advances outptr past the group just written.
- for (
- unsigned int n_channels = this->n_channels(); n_channels;
- n_channels -= veclen,
- outptr += veclen*(3*sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
- )
- {
- // NOTE Ignore this section if using SVE, the vector length remains the
- // same and we just don't fill a full register for the tail.
- while (n_channels < veclen)
- {
- // Reduce the vector length to either 8 or 1 (scalar); starting from 8 this always yields 1.
- // TODO Support more vector lengths in `execute_tile`.
- veclen = (veclen == 16) ? 8 : 1;
- }
-
- // Get pointers to the bias, multiplier, shift and weight portions of the output structure.
- int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr); // NOTE(review): scalar groups advance outptr by 12 + kernel_rows*kernel_cols bytes, so these int32 pointers may not be 4-byte aligned - confirm.
- int32_t *out_mptr = reinterpret_cast<int32_t *>(outptr + veclen*sizeof(int32_t));
- int32_t *out_sptr = reinterpret_cast<int32_t *>(outptr + 2*veclen*sizeof(int32_t));
- int8_t *out_wptr = outptr + 3*veclen*sizeof(int32_t);
-
- // Copy a vector length of elements
- for (unsigned int n = 0; n < veclen && n < n_channels; n++)
- {
- const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0; // A null pointer packs zeros.
- const int32_t multiplier = (mptr != nullptr) ? *(mptr++) : 0;
- const int32_t shift = (sptr != nullptr) ? *(sptr++) : 0;
-
- out_bptr[n] = bias;
- out_mptr[n] = multiplier;
- out_sptr[n] = -shift; // Stored with the sign flipped.
-
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- int8_t *row_outptr = out_wptr + i*KernelCols*veclen;
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- int8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
- row_outptr[j*veclen + n] = w; // Channel-interleaved: the veclen weights of element (i, j) are contiguous.
- }
- }
- wptr++; // Move on to the next channel of the source weights.
- }
- }
-}
-
-
-/*
- * Process one tile whose input and output are each described by a base
- * pointer, a row stride and a column stride.
- *
- * Two address helpers are built from the pointers and strides and handed,
- * together with the stored input and output quantisations, to
- * execute_tilefn_hybrid.
- */
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void QSymm8HybridPerChannelDepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
-  int n_channels,
-  const void* packed_params,
-  const uint8_t* inptr,
-  unsigned int in_row_stride,
-  unsigned int in_col_stride,
-  uint8_t* outptr,
-  unsigned int out_row_stride,
-  unsigned int out_col_stride
-)
-{
-  // Address of input element (row, col) for channel c.
-  const auto input_at = [inptr, in_row_stride, in_col_stride](
-      const int row, const int col, const int c) {
-    const uint8_t *const row_start = inptr + row * in_row_stride;
-    const uint8_t *const element = row_start + col * in_col_stride;
-    return element + c;
-  };
-
-  // Address of output element (row, col) for channel c.
-  const auto output_at = [outptr, out_row_stride, out_col_stride](
-      const int row, const int col, const int c) {
-    uint8_t *const row_start = outptr + row * out_row_stride;
-    uint8_t *const element = row_start + col * out_col_stride;
-    return element + c;
-  };
-
-  execute_tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-                        StrideRows, StrideCols>(
-    n_channels, packed_params, Activation, _input_quant, _output_quant,
-    input_at, output_at);
-}
-
-/*
- * Process one tile whose input and output elements are given as arrays of
- * per-element pointers (inptrs[i][j] / outptrs[i][j]).
- *
- * The channel index is added to the selected pointer to address a single
- * value; the helpers are handed, together with the stored input and output
- * quantisations, to execute_tilefn_hybrid.
- */
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void QSymm8HybridPerChannelDepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
-  int n_channels,
-  const void* packed_params,
-  const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-  uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-)
-{
-  // Address of input element (row, col) for channel c.
-  const auto input_at = [inptrs](const int row, const int col, const int c) {
-    const uint8_t *const element = inptrs[row][col];
-    return element + c;
-  };
-
-  // Address of output element (row, col) for channel c.
-  const auto output_at = [outptrs](const int row, const int col, const int c) {
-    uint8_t *const element = outptrs[row][col];
-    return element + c;
-  };
-
-  execute_tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-                        StrideRows, StrideCols>(
-    n_channels, packed_params, Activation, _input_quant, _output_quant,
-    input_at, output_at);
-}
-
-} // namespace depthwise