diff options
author | Michele Di Giorgio <michele.digiorgio@arm.com> | 2021-01-22 09:47:04 +0000 |
---|---|---|
committer | Michele Di Giorgio <michele.digiorgio@arm.com> | 2021-06-18 10:33:48 +0000 |
commit | d02d5edfa15ba6c04a9986a8a362a945cb38ac31 (patch) | |
tree | ced4f49691d6c7038e347a8709b315bff59c64cf /src/core/NEON/kernels/convolution | |
parent | b014c27ba6db9840e4a72519760d51a87a2af7e7 (diff) | |
download | ComputeLibrary-d02d5edfa15ba6c04a9986a8a362a945cb38ac31.tar.gz |
Integrate improved CPU depthwise convolution kernels
* Replace assembly kernels for depthwise convolution with more optimized
ones.
* Add int8 assembly kernels.
* Fix implicit padding on optimized kernels
Resolves: COMPMID-3867, COMPMID-4361
Change-Id: I0b0867e05f61be4f368f62190d55e14d0ab3ebf2
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5622
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/convolution')
22 files changed, 0 insertions, 17252 deletions
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp deleted file mode 100644 index 70d6689731..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp +++ /dev/null @@ -1,551 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once - -#include <arm_neon.h> -#include "activation.hpp" -#include "padding.hpp" - -namespace depthwise -{ - -namespace nck = neon_convolution_kernels; - -class IDepthwiseConvolution -{ - public: - virtual ~IDepthwiseConvolution() = default; - - virtual int output_size( - int dim_size, - unsigned int padding_before, - unsigned int padding_after - ) const = 0; - - /* Set input tensor and stride. */ - virtual void set_input(const void *inptr) = 0; - virtual void set_input(const void *inptr, int column_stride) = 0; - virtual void set_input(const void *inptr, int row_stride, int column_stride) = 0; - virtual void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) = 0; - - /* Set output tensor and stride. */ - virtual void set_output(void *outptr) = 0; - virtual void set_output(void *outptr, int column_stride) = 0; - virtual void set_output(void *outptr, int row_stride, int column_stride) = 0; - virtual void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) = 0; - - /* Weights and biases are re-ordered to improve memory access patterns. Use - * these methods to determine the size of the re-pack buffer and to set the - * address (and implicitly reorder the weights and biases into) the buffer. - */ - virtual size_t get_packed_params_size(void) const = 0; - virtual void set_packed_params_buffer(void *) = 0; - - virtual void pack_params(const void *weights, const void *biases=nullptr) const = 0; - virtual void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const = 0; - virtual void pack_params( - void *buffer, - const void* weights, - unsigned int weight_row_stride, - unsigned int weight_col_stride, - const void *biases=nullptr - ) const = 0; - - /* Working space is used to pad tensors on the fly. Before running any - * inference check the amount of space required, allocate and provide a - * pointer to the convolution engine. - */ - virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0; - virtual void set_working_space(void *) = 0; - - virtual unsigned int get_window(void) const = 0; - virtual void run( - unsigned int start, - unsigned int stop, - unsigned int threadid=0 - ) = 0; -}; - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols, - typename TIn, typename TBias, typename TOut, - typename Derived -> -class DepthwiseConvolutionBase : public IDepthwiseConvolution -{ - public: - // Information about the specific convolution instance - using InputType = TIn; - using BiasType = TBias; - using OutputType = TOut; - static constexpr int output_tile_rows = OutputTileRows; - static constexpr int output_tile_cols = OutputTileCols; - static constexpr int kernel_rows = KernelRows; - static constexpr int kernel_cols = KernelCols; - static constexpr int stride_rows = StrideRows; - static constexpr int stride_cols = StrideCols; - static constexpr int inner_tile_rows = stride_rows * (output_tile_rows - 1) + kernel_rows; - static constexpr int inner_tile_cols = stride_cols * (output_tile_cols - 1) + kernel_cols; - - /** Create a new depthwise convolution engine. - * - * @param[in] n_batches Number of batches tensors. - * @param[in] n_input_rows Number of rows in input tensor. - * @param[in] n_input_cols Number of columns in input tensor. - * @param[in] n_channels Number of channels in input and output tensors. - */ - DepthwiseConvolutionBase( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - nck::ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - /** Create a new depthwise convolution engine. - * - * @param[in] n_batches Number of batches tensors. - * @param[in] n_input_rows Number of rows in input tensor. - * @param[in] n_input_cols Number of columns in input tensor. - * @param[in] n_channels Number of channels in input and output tensors. - */ - DepthwiseConvolutionBase( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - // Cannot copy or move a DepthwiseConvolution. - DepthwiseConvolutionBase(DepthwiseConvolutionBase&) = delete; - DepthwiseConvolutionBase operator=(DepthwiseConvolutionBase&) = delete; - - /* Set input tensor and stride. */ - void set_input(const void *inptr) override; - void set_input(const void *inptr, int column_stride) override; - void set_input(const void *inptr, int row_stride, int column_stride) override; - void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override; - - /* Set output tensor and stride. */ - void set_output(void *outptr) override; - void set_output(void *outptr, int column_stride) override; - void set_output(void *outptr, int row_stride, int column_stride) override; - void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override; - - /** Get the number of output rows/columns. - * - * @param[in] dim_size Number of elements in the dimension (rows/columns) - * @param[in] same_padding True if the padding is SAME, otherwise false. - */ - static int get_output_size( - int dim_size, unsigned int padding_before, unsigned int padding_after - ); - - int output_size( - int dim_size, unsigned int padding_before, unsigned int padding_after - ) const override; - - /* Determine how much memory is required to store the packed weights and - * biases. - */ - size_t get_packed_params_size(void) const override; - - /* Set the buffer for the packed weights and biases, and perform the - * packing. - */ - void set_packed_params_buffer(void *buffer) override; - - void pack_params(const void *weights, const void *biases=nullptr) const override; - - void pack_params( - void *buffer, - const void *weights, - const void *biases=nullptr - ) const override; - - void pack_params( - void *buffer, - const void *weights, - unsigned int weight_row_stride, - unsigned int weight_col_stride, - const void *biases=nullptr - ) const override; - - /** Query the amount of working space required. - * @param[in] The largest number of threads which will be used to execute - * the kernel. - */ - size_t get_working_space_size(unsigned int n_threads=1) const override; - - /** Set the working space buffer. - */ - void set_working_space(void *buffer) override; - - /** Get the window of work to be performed by an instance of the operator. - */ - unsigned int get_window(void) const override; - - /** Perform a portion of the work associated with the operator. - * - * Will perform the window of work described by $[start, stop)$. - * - * @param[in] start Start of the window of work to perform. - * @param[in] stop End of the work to perform. - * @param[in] ID of the thread performing the work. - */ - void run( - unsigned int start, - unsigned int stop, - unsigned int threadid=0 - ) override; - - protected: - /** Get the value to use to pad the tensor. - */ - TIn _input_padding_value(void) const; - - /** Implementation of the parameter packing. - */ - void _pack_params( - void *buffer, - const void *weights, - unsigned int weight_row_stride, - unsigned int weight_col_stride, - const void *biases=nullptr - ) const; - - /** Process a tile-row of the tensors. - */ - void process_tile_row( - unsigned int threadid, - int n_channels, - const void* packed_params, - const InputType* inptr, - OutputType* outptr, - int row_pad_in_top, - int row_pad_in_left, - int row_pad_in_bottom, - int row_pad_out_bottom, - int n_tiles, - int n_input_cols, - int n_output_cols - ); - - /** Process a single tile of the tensor. - * - * This method will apply input/output padding (if required) and call the - * depthwise tile implementation. - */ - void process_tile( - unsigned int threadid, - int n_channels, - const void* packed_params, - const InputType* inptr, - OutputType* outptr, - int pad_in_top, - int pad_in_left, - int pad_in_bottom, - int pad_in_right, - int pad_out_bottom, - int pad_out_right - ); - - /** Perform depthwise convolution on a single tile. - */ - template <nck::ActivationFunction Activation> - void execute_tile( - int n_channels, - const void* packed_params, - const InputType* inptr, - unsigned int in_row_stride, - unsigned int in_col_stride, - OutputType* outptr, - unsigned int out_row_stride, - unsigned int out_col_stride - ); - - template <nck::ActivationFunction Activation> - void execute_tile( - int n_channels, - const void* packed_params, - const InputType* inptrs[inner_tile_rows][inner_tile_cols], - OutputType* outptrs[output_tile_rows][output_tile_cols] - ); - - int n_channels(void) const; - - private: - // Member variables of instances of a convolution engine. - const InputType* _input; - OutputType* _output; - void* _packed_parameters; - void* _working_space; // Per-thread working space - const int _n_batches, _n_input_rows, _n_input_cols, _n_channels, - _n_output_rows, _n_output_cols, _n_tile_rows, _n_tile_cols; - const unsigned int _padding_top, _padding_left, _padding_bottom, _padding_right; - const nck::ActivationFunction _activation; - - // Stride information for a convolution instance - int _input_col_stride, _input_row_stride, _input_batch_stride; - int _output_col_stride, _output_row_stride, _output_batch_stride; - - // Methods for getting access to working space - size_t _get_input_working_space_size(void) const; - size_t _get_output_working_space_size(void) const; - - void *_get_input_working_space(unsigned int threadid) const; - void *_get_output_working_space(unsigned int threadid) const; -}; - - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols, - typename TIn, typename TBias, typename TOut -> -class DepthwiseConvolution : public DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - TIn, TBias, TOut, - DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - TIn, TBias, TOut - > -> -{ - using Base = DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - TIn, TBias, TOut, - DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - TIn, TBias, TOut - > >; - friend Base; - using InputType = typename Base::InputType; - using OutputType = typename Base::OutputType; - - public: - using Base::DepthwiseConvolutionBase; - - protected: - template <nck::ActivationFunction Activation> - void execute_tile( - int n_channels, - const void* packed_params, - const TIn* inptr, - unsigned int in_row_stride, - unsigned int in_col_stride, - TOut* outptr, - unsigned int out_row_stride, - unsigned int out_col_stride - ); - - template <nck::ActivationFunction Activation> - void execute_tile( - int n_channels, - const void* packed_params, - const InputType* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - OutputType* outptrs[Base::output_tile_rows][Base::output_tile_cols] - ); -}; - - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -class DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float, float, float -> : public DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float, float, float, - DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float, float, float - > -> -{ - using Base = DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float, float, float, - DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float, float, float - > >; - friend Base; - using InputType = typename Base::InputType; - using OutputType = typename Base::OutputType; - - public: - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - nck::ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - protected: - template <nck::ActivationFunction Activation> - void execute_tile( - int n_channels, - const void* packed_params, - const float* inptr, - unsigned int in_row_stride, - unsigned int in_col_stride, - float* outptr, - unsigned int out_row_stride, - unsigned int out_col_stride - ); - - template <nck::ActivationFunction Activation> - void execute_tile( - int n_channels, - const void* packed_params, - const float* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - float* outptrs[Base::output_tile_rows][Base::output_tile_cols] - ); -}; - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -class DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float16_t, float16_t, float16_t -> : public DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float16_t, float16_t, float16_t, - DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float16_t, float16_t, float16_t - > -> -{ - using Base = DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float16_t, float16_t, float16_t, - DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float16_t, float16_t, float16_t - > >; - friend Base; - using InputType = typename Base::InputType; - using OutputType = typename Base::OutputType; - - public: - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - nck::ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - protected: - template <nck::ActivationFunction Activation> - void execute_tile( - int n_channels, - const void* packed_params, - const float16_t* inptr, - unsigned int in_row_stride, - unsigned int in_col_stride, - float16_t* outptr, - unsigned int out_row_stride, - unsigned int out_col_stride - ); - - template <nck::ActivationFunction Activation> - void execute_tile( - int n_channels, - const void* packed_params, - const float16_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - float16_t* outptrs[Base::output_tile_rows][Base::output_tile_cols] - ); -}; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp deleted file mode 100644 index 864c6e24a0..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp +++ /dev/null @@ -1,1168 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "impl_fp32_fp32.hpp" - -namespace depthwise -{ - -using namespace neon_convolution_kernels; -using Conv = DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>; - -#ifdef __aarch64__ -template <> -template <> -void Conv::execute_tile<ActivationFunction::None>( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x26, %[inptr0], %[input_row_stride]\n" - "add x21, %[input_col_stride1], %[input_col_stride1]\n" - "add x23, %[outptr0], %[output_row_stride]\n" - "add x27, x26, %[input_row_stride]\n" - "add x22, x21, %[input_col_stride1]\n" - "and x24, %[n_channels], #3\n" - "add x28, x27, %[input_row_stride]\n" - "lsr x25, %[n_channels], #2\n" - "cbz x25, 4f\n" - "1:\n" - "ldr q15, [%[wbptr]]\n" - "subs x25, x25, #1\n" - "mov v3.16b, v15.16b\n" - "ldr q14, [%[wbptr], #16]\n" - "mov v1.16b, v15.16b\n" - "ldr q13, [%[wbptr], #32]\n" - "mov v2.16b, v15.16b\n" - "ldr q12, [%[wbptr], #48]\n" - "mov v0.16b, v15.16b\n" - "ldr q11, [%[wbptr], #64]\n" - "ldr q10, [%[wbptr], #80]\n" - "ldr q9, [%[wbptr], #96]\n" - "ldr q8, [%[wbptr], #112]\n" - "ldr q7, [%[wbptr], #128]\n" - "ldr q6, [%[wbptr], #144]\n" - "ldr q24, [%[inptr0]]\n" - "fmla v3.4s, v24.4s, v14.4s\n" - "ldr q22, [x26]\n" - "fmla v1.4s, v22.4s, v14.4s\n" - "ldr q19, [%[inptr0], %[input_col_stride1]]\n" - "fmla v2.4s, v19.4s, v14.4s\n" - "ldr q18, [x27]\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "ldr q21, [x26, %[input_col_stride1]]\n" - "fmla v1.4s, v18.4s, v11.4s\n" - "ldr q17, [%[inptr0], x21]\n" - "ldr q20, [x28]\n" - "ldr q5, [x27, %[input_col_stride1]]\n" - "fmla v3.4s, v19.4s, v13.4s\n" - "fmla v3.4s, v18.4s, v8.4s\n" - "beq 3f\n" - "2:\n" - "fmla v3.4s, v21.4s, v10.4s\n" - "ldr q19, [x26, x21]\n" - "fmla v1.4s, v21.4s, v13.4s\n" - "ldr q23, [%[inptr0], x22]\n" - "fmla v2.4s, v21.4s, v11.4s\n" - "ldr q22, [x28, %[input_col_stride1]]\n" - "fmla v0.4s, v21.4s, v14.4s\n" - "ldr q21, [x27, x21]\n" - "fmla v3.4s, v17.4s, v12.4s\n" - "ldr q18, [x26, x22]\n" - "fmla v2.4s, v17.4s, v13.4s\n" - "ldr q16, [x28, x21]\n" - "fmla v1.4s, v20.4s, v8.4s\n" - "ldr q20, [x27, x22]\n" - "fmla v3.4s, v5.4s, v7.4s\n" - "ldr q4, [x28, x22]\n" - "fmla v2.4s, v5.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v1.4s, v5.4s, v10.4s\n" - "ldr q15, [%[wbptr]]\n" - "fmla v0.4s, v5.4s, v11.4s\n" - "ldr q14, [%[wbptr], #16]\n" - "fmla v3.4s, v19.4s, v9.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v1.4s, v19.4s, v12.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v2.4s, v19.4s, v10.4s\n" - "ldr q11, [%[wbptr], #64]\n" - "fmla v0.4s, v19.4s, v13.4s\n" - "ldr q24, [%[inptr0]]\n" - "fmla v1.4s, v22.4s, v7.4s\n" - "ldr q19, [%[inptr0], %[input_col_stride1]]\n" - "fmla v2.4s, v23.4s, v12.4s\n" - "ldr q17, [%[inptr0], x21]\n" - "fmla v0.4s, v22.4s, v8.4s\n" - "ldr q13, [%[wbptr], #32]\n" - "fmla v3.4s, v21.4s, v6.4s\n" - "add x26, x26, #16\n" - "fmla v1.4s, v21.4s, v9.4s\n" - "ldr q22, [x26]\n" - "fmla v2.4s, v21.4s, v7.4s\n" - "ldr q8, [%[wbptr], #112]\n" - "str q3, [%[outptr0]]\n" - "fmla v0.4s, v21.4s, v10.4s\n" - "fmla v1.4s, v16.4s, v6.4s\n" - "ldr q21, [x26, %[input_col_stride1]]\n" - "fmla v2.4s, v18.4s, v9.4s\n" - "add x27, x27, #16\n" - "fmla v0.4s, v18.4s, v12.4s\n" - "ldr q10, [%[wbptr], #80]\n" - "str q1, [x23]\n" - "mov v3.16b, v15.16b\n" - "fmla v2.4s, v20.4s, v6.4s\n" - "ldr q18, [x27]\n" - "fmla v0.4s, v16.4s, v7.4s\n" - "ldr q12, [%[wbptr], #48]\n" - "mov v1.16b, v15.16b\n" - "ldr q5, [x27, %[input_col_stride1]]\n" - "str q2, [%[outptr0], %[output_col_stride1]]\n" - "fmla v3.4s, v24.4s, v14.4s\n" - "fmla v0.4s, v20.4s, v9.4s\n" - "ldr q7, [%[wbptr], #128]\n" - "mov v2.16b, v15.16b\n" - "add x28, x28, #16\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "ldr q20, [x28]\n" - "fmla v0.4s, v4.4s, v6.4s\n" - "ldr q9, [%[wbptr], #96]\n" - "fmla v1.4s, v22.4s, v14.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v3.4s, v19.4s, v13.4s\n" - "subs x25, x25, #1\n" - "str q0, [x23, %[output_col_stride1]]\n" - "fmla v2.4s, v19.4s, v14.4s\n" - "ldr q6, [%[wbptr], #144]\n" - "add x23, x23, #16\n" - "fmla v3.4s, v18.4s, v8.4s\n" - "fmla v1.4s, v18.4s, v11.4s\n" - "mov v0.16b, v15.16b\n" - "bne 2b\n" - "3:\n" - "fmla v3.4s, v21.4s, v10.4s\n" - "ldr q19, [x26, x21]\n" - "fmla v1.4s, v21.4s, v13.4s\n" - "ldr q23, [%[inptr0], x22]\n" - "fmla v2.4s, v21.4s, v11.4s\n" - "ldr q22, [x28, %[input_col_stride1]]\n" - "fmla v0.4s, v21.4s, v14.4s\n" - "ldr q21, [x27, x21]\n" - "fmla v3.4s, v17.4s, v12.4s\n" - "ldr q18, [x26, x22]\n" - "fmla v2.4s, v17.4s, v13.4s\n" - "ldr q16, [x28, x21]\n" - "fmla v1.4s, v20.4s, v8.4s\n" - "ldr q20, [x27, x22]\n" - "fmla v3.4s, v5.4s, v7.4s\n" - "ldr q4, [x28, x22]\n" - "fmla v2.4s, v5.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v1.4s, v5.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v5.4s, v11.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v3.4s, v19.4s, v9.4s\n" - "add x26, x26, #16\n" - "fmla v1.4s, v19.4s, v12.4s\n" - "add x27, x27, #16\n" - "fmla v2.4s, v19.4s, v10.4s\n" - "add x28, x28, #16\n" - "fmla v0.4s, v19.4s, v13.4s\n" - "fmla v3.4s, v21.4s, v6.4s\n" - "fmla v1.4s, v22.4s, v7.4s\n" - "fmla v2.4s, v23.4s, v12.4s\n" - "str q3, [%[outptr0]]\n" - "fmla v0.4s, v22.4s, v8.4s\n" - "fmla v1.4s, v21.4s, v9.4s\n" - "fmla v2.4s, v21.4s, v7.4s\n" - "fmla v0.4s, v21.4s, v10.4s\n" - "fmla v1.4s, v16.4s, v6.4s\n" - "fmla v2.4s, v18.4s, v9.4s\n" - "fmla v0.4s, v18.4s, v12.4s\n" - "str q1, [x23]\n" - "fmla v2.4s, v20.4s, v6.4s\n" - "fmla v0.4s, v16.4s, v7.4s\n" - "str q2, [%[outptr0], %[output_col_stride1]]\n" - "fmla v0.4s, v20.4s, v9.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v0.4s, v4.4s, v6.4s\n" - "str q0, [x23, %[output_col_stride1]]\n" - "add x23, x23, #16\n" - "4:\n" - "cbz x24, 7f\n" - "ldr s15, [%[wbptr]]\n" - "mov v3.16b, v15.16b\n" - "ldr s14, [%[wbptr], #4]\n" - "mov v1.16b, v15.16b\n" - "ldr s13, [%[wbptr], #8]\n" - "mov v2.16b, v15.16b\n" - "ldr s12, [%[wbptr], #12]\n" - "mov v0.16b, v15.16b\n" - "ldr s11, [%[wbptr], #16]\n" - "ldr s10, [%[wbptr], #20]\n" - "subs x24, x24, #1\n" - "ldr s9, [%[wbptr], #24]\n" - "ldr s8, [%[wbptr], #28]\n" - "ldr s7, [%[wbptr], #32]\n" - "ldr s6, [%[wbptr], #36]\n" - "ldr s24, [%[inptr0]]\n" - "ldr s22, [x26]\n" - "fmla v3.4s, v24.4s, v14.4s\n" - "ldr s19, [%[inptr0], %[input_col_stride1]]\n" - "fmla v1.4s, v22.4s, v14.4s\n" - "ldr s18, [x27]\n" - "fmla v2.4s, v19.4s, v14.4s\n" - "ldr s21, [x26, %[input_col_stride1]]\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "ldr s17, [%[inptr0], x21]\n" - "fmla v1.4s, v18.4s, v11.4s\n" - "ldr s20, [x28]\n" - "ldr s5, [x27, %[input_col_stride1]]\n" - "fmla v3.4s, v19.4s, v13.4s\n" - "fmla v3.4s, v18.4s, v8.4s\n" - "beq 6f\n" - "5:\n" - "fmla v3.4s, v21.4s, v10.4s\n" - "ldr s19, [x26, x21]\n" - "fmla v1.4s, v21.4s, v13.4s\n" - "ldr s23, [%[inptr0], x22]\n" - "fmla v2.4s, v21.4s, v11.4s\n" - "ldr s22, [x28, %[input_col_stride1]]\n" - "fmla v0.4s, v21.4s, v14.4s\n" - "ldr s21, [x27, x21]\n" - "fmla v3.4s, v17.4s, v12.4s\n" - "ldr s18, [x26, x22]\n" - "fmla v2.4s, v17.4s, v13.4s\n" - "ldr s16, [x28, x21]\n" - "fmla v1.4s, v20.4s, v8.4s\n" - "ldr s20, [x27, x22]\n" - "fmla v3.4s, v5.4s, v7.4s\n" - "ldr s4, [x28, x22]\n" - "fmla v2.4s, v5.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v1.4s, v5.4s, v10.4s\n" - "ldr s15, [%[wbptr]]\n" - "fmla v0.4s, v5.4s, v11.4s\n" - "ldr s14, [%[wbptr], #4]\n" - "fmla v3.4s, v19.4s, v9.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v1.4s, v19.4s, v12.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v2.4s, v19.4s, v10.4s\n" - "ldr s11, [%[wbptr], #16]\n" - "fmla v0.4s, v19.4s, v13.4s\n" - "ldr s24, [%[inptr0]]\n" - "fmla v1.4s, v22.4s, v7.4s\n" - "ldr s19, [%[inptr0], %[input_col_stride1]]\n" - "fmla v2.4s, v23.4s, v12.4s\n" - "ldr s17, [%[inptr0], x21]\n" - "fmla v0.4s, v22.4s, v8.4s\n" - "ldr s13, [%[wbptr], #8]\n" - "fmla v3.4s, v21.4s, v6.4s\n" - "add x26, x26, #4\n" - "fmla v1.4s, v21.4s, v9.4s\n" - "ldr s22, [x26]\n" - "fmla v2.4s, v21.4s, v7.4s\n" - "ldr s8, [%[wbptr], #28]\n" - "str s3, [%[outptr0]]\n" - "fmla v0.4s, v21.4s, v10.4s\n" - "fmla v1.4s, v16.4s, v6.4s\n" - "ldr s21, [x26, %[input_col_stride1]]\n" - "fmla v2.4s, v18.4s, v9.4s\n" - "add x27, x27, #4\n" - "fmla v0.4s, v18.4s, v12.4s\n" - "ldr s10, [%[wbptr], #20]\n" - "str s1, [x23]\n" - "mov v3.16b, v15.16b\n" - "fmla v2.4s, v20.4s, v6.4s\n" - "ldr s18, [x27]\n" - "fmla v0.4s, v16.4s, v7.4s\n" - "ldr s12, [%[wbptr], #12]\n" - "mov v1.16b, v15.16b\n" - "ldr s5, [x27, %[input_col_stride1]]\n" - "str s2, [%[outptr0], %[output_col_stride1]]\n" - "fmla v3.4s, v24.4s, v14.4s\n" - "fmla v0.4s, v20.4s, v9.4s\n" - "ldr s7, [%[wbptr], #32]\n" - "mov v2.16b, v15.16b\n" - "add x28, x28, #4\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "ldr s20, [x28]\n" - "fmla v0.4s, v4.4s, v6.4s\n" - "ldr s9, [%[wbptr], #24]\n" - "fmla v1.4s, v22.4s, v14.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v3.4s, v19.4s, v13.4s\n" - "subs x24, x24, #1\n" - "str s0, [x23, %[output_col_stride1]]\n" - "fmla v2.4s, v19.4s, v14.4s\n" - "ldr s6, [%[wbptr], #36]\n" - "add x23, x23, #4\n" - "fmla v3.4s, v18.4s, v8.4s\n" - "fmla v1.4s, v18.4s, v11.4s\n" - "mov v0.16b, v15.16b\n" - "bne 5b\n" - "6:\n" - "fmla v3.4s, v21.4s, v10.4s\n" - "ldr s19, [x26, x21]\n" - "fmla v1.4s, v21.4s, v13.4s\n" - "ldr s23, [%[inptr0], x22]\n" - "fmla v2.4s, v21.4s, v11.4s\n" - "ldr s22, [x28, %[input_col_stride1]]\n" - "fmla v0.4s, v21.4s, v14.4s\n" - "ldr s21, [x27, x21]\n" - "fmla v3.4s, v17.4s, v12.4s\n" - "ldr s18, [x26, x22]\n" - "fmla v2.4s, v17.4s, v13.4s\n" - "ldr s16, [x28, x21]\n" - "fmla v1.4s, v20.4s, v8.4s\n" - "ldr s20, [x27, x22]\n" - "fmla v3.4s, v5.4s, v7.4s\n" - "ldr s4, [x28, x22]\n" - "fmla v2.4s, v5.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v1.4s, v5.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v5.4s, v11.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v3.4s, v19.4s, v9.4s\n" - "add x26, x26, #4\n" - "fmla v1.4s, v19.4s, v12.4s\n" - "add x27, x27, #4\n" - "fmla v2.4s, v19.4s, v10.4s\n" - "add x28, x28, #4\n" - "fmla v0.4s, v19.4s, v13.4s\n" - "fmla v3.4s, v21.4s, v6.4s\n" - "fmla v1.4s, v22.4s, v7.4s\n" - "fmla v2.4s, v23.4s, v12.4s\n" - "str s3, [%[outptr0]]\n" - "fmla v0.4s, v22.4s, v8.4s\n" - "fmla v1.4s, v21.4s, v9.4s\n" - "fmla v2.4s, v21.4s, v7.4s\n" - "fmla v0.4s, v21.4s, v10.4s\n" - "fmla v1.4s, v16.4s, v6.4s\n" - "fmla v2.4s, v18.4s, v9.4s\n" - "fmla v0.4s, v18.4s, v12.4s\n" - "str s1, [x23]\n" - "fmla v2.4s, v20.4s, v6.4s\n" - "fmla v0.4s, v16.4s, v7.4s\n" - "str s2, [%[outptr0], %[output_col_stride1]]\n" - "fmla v0.4s, v20.4s, v9.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v0.4s, v4.4s, v6.4s\n" - "str s0, [x23, %[output_col_stride1]]\n" - "add x23, x23, #4\n" - "7:\n" - : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr) - : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile<ActivationFunction::ReLU>( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x21, %[inptr0], %[input_row_stride]\n" - "add x24, %[input_col_stride1], %[input_col_stride1]\n" - "add x22, %[outptr0], %[output_row_stride]\n" - "add x23, x21, %[input_row_stride]\n" - "add x27, x24, %[input_col_stride1]\n" - "and x25, %[n_channels], #3\n" - "add x28, x23, %[input_row_stride]\n" - "lsr x26, %[n_channels], #2\n" - "cbz x26, 4f\n" - "1:\n" - "ldr q11, [%[wbptr]]\n" - "subs x26, x26, #1\n" - "mov v17.16b, v11.16b\n" - "ldr q13, [%[wbptr], #16]\n" - "mov v15.16b, v11.16b\n" - "ldr q4, [%[wbptr], #32]\n" - "mov v16.16b, v11.16b\n" - "ldr q2, [%[wbptr], #48]\n" - "mov v14.16b, v11.16b\n" - "ldr q5, [%[wbptr], #64]\n" - "ldr q10, [%[wbptr], #80]\n" - "ldr q1, [%[wbptr], #96]\n" - "ldr q12, [%[wbptr], #112]\n" - "ldr q0, [%[wbptr], #128]\n" - "ldr q3, [%[wbptr], #144]\n" - "ldr q6, [%[inptr0]]\n" - "fmla v17.4s, v6.4s, v13.4s\n" - "ldr q27, [x21]\n" - "fmla v15.4s, v27.4s, v13.4s\n" - "ldr q23, [%[inptr0], %[input_col_stride1]]\n" - "fmla v16.4s, v23.4s, v13.4s\n" - "ldr q24, [x23]\n" - "fmla v17.4s, v27.4s, v5.4s\n" - "ldr q22, [x21, %[input_col_stride1]]\n" - "ldr q9, [%[inptr0], x24]\n" - "ldr q8, [x28]\n" - "ldr q20, [x23, %[input_col_stride1]]\n" - "fmla v17.4s, v23.4s, v4.4s\n" - "beq 3f\n" - "2:\n" - "fmla v17.4s, v24.4s, v12.4s\n" - "ldr q26, [x21, x24]\n" - "fmla v15.4s, v24.4s, v5.4s\n" - "ldr q27, [%[inptr0], x27]\n" - "fmla v16.4s, v22.4s, v5.4s\n" - "ldr q25, [x28, %[input_col_stride1]]\n" - "fmla v17.4s, v22.4s, v10.4s\n" - "ldr q24, [x23, x24]\n" - "fmla v15.4s, v22.4s, v4.4s\n" - "ldr q21, [x21, x27]\n" - "fmla v14.4s, v22.4s, v13.4s\n" - "ldr q7, [x28, x24]\n" - "fmla v17.4s, v9.4s, v2.4s\n" - "ldr q19, [x23, x27]\n" - "fmla v16.4s, v9.4s, v4.4s\n" - "ldr q18, [x28, x27]\n" - "fmla v15.4s, v8.4s, v12.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v17.4s, v20.4s, v0.4s\n" - "ldr q11, [%[wbptr]]\n" - "fmla v16.4s, v20.4s, v12.4s\n" - "ldr q13, [%[wbptr], #16]\n" - "fmla v15.4s, v20.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v14.4s, v20.4s, v5.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v17.4s, v26.4s, v1.4s\n" - "ldr q6, [%[inptr0]]\n" - "fmla v15.4s, v26.4s, v2.4s\n" - "ldr q23, [%[inptr0], %[input_col_stride1]]\n" - "fmla v16.4s, v26.4s, v10.4s\n" - "ldr q5, [%[wbptr], #64]\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "ldr q9, [%[inptr0], x24]\n" - "fmla v15.4s, v25.4s, v0.4s\n" - "add x21, x21, #16\n" - "fmla v16.4s, v27.4s, v2.4s\n" - "ldr q27, [x21]\n" - "fmla v14.4s, v25.4s, v12.4s\n" - "ldr q4, [%[wbptr], #32]\n" - "fmla v17.4s, v24.4s, v3.4s\n" - "ldr q22, [x21, %[input_col_stride1]]\n" - "fmla v15.4s, v24.4s, v1.4s\n" - "add x23, x23, #16\n" - "fmla v16.4s, v24.4s, v0.4s\n" - "ldr q12, [%[wbptr], #112]\n" - "fmla v14.4s, v24.4s, v10.4s\n" - "ldr q24, [x23]\n" - "fmla v15.4s, v7.4s, v3.4s\n" - "ldr q20, [x23, %[input_col_stride1]]\n" - "fmla v16.4s, v21.4s, v1.4s\n" - "add x28, x28, #16\n" - "fmla v14.4s, v21.4s, v2.4s\n" - "ldr q10, [%[wbptr], #80]\n" - "movi v26.16b, #0\n" - "ldr q8, [x28]\n" - "fmla v16.4s, v19.4s, v3.4s\n" - "subs x26, x26, #1\n" - "fmla v14.4s, v7.4s, v0.4s\n" - "ldr q2, [%[wbptr], #48]\n" - "fmax v17.4s, v17.4s, v26.4s\n" - "fmax v15.4s, v15.4s, v26.4s\n" - "fmax v16.4s, v16.4s, v26.4s\n" - "str q17, [%[outptr0]]\n" - "str q16, [%[outptr0], %[output_col_stride1]]\n" - "fmla v14.4s, v19.4s, v1.4s\n" - "str q15, [x22]\n" - "mov v17.16b, v11.16b\n" - "mov v15.16b, v11.16b\n" - "ldr q0, [%[wbptr], #128]\n" - "fmla v14.4s, v18.4s, v3.4s\n" - "ldr q1, [%[wbptr], #96]\n" - "mov v16.16b, v11.16b\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v17.4s, v6.4s, v13.4s\n" - "fmla v15.4s, v27.4s, v13.4s\n" - "fmax v14.4s, v14.4s, v26.4s\n" - "ldr q3, [%[wbptr], #144]\n" - "fmla v16.4s, v23.4s, v13.4s\n" - "str q14, [x22, %[output_col_stride1]]\n" - "mov v14.16b, v11.16b\n" - "add x22, x22, #16\n" - "fmla v17.4s, v27.4s, v5.4s\n" - "fmla v17.4s, v23.4s, v4.4s\n" - "bne 2b\n" - "3:\n" - "fmla v17.4s, v24.4s, v12.4s\n" - "ldr q26, [x21, x24]\n" - "fmla v15.4s, v24.4s, v5.4s\n" - "ldr q27, [%[inptr0], x27]\n" - "fmla v16.4s, v22.4s, v5.4s\n" - "ldr q25, [x28, %[input_col_stride1]]\n" - "fmla v17.4s, v22.4s, v10.4s\n" - "ldr q24, [x23, x24]\n" - "fmla v15.4s, v22.4s, v4.4s\n" - "ldr q21, [x21, x27]\n" - "fmla v14.4s, v22.4s, v13.4s\n" - "ldr q7, [x28, x24]\n" - "fmla v17.4s, v9.4s, v2.4s\n" - "ldr q19, [x23, x27]\n" - "fmla v16.4s, v9.4s, v4.4s\n" - "ldr q18, [x28, x27]\n" - "fmla v15.4s, v8.4s, v12.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v17.4s, v20.4s, v0.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v16.4s, v20.4s, v12.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v15.4s, v20.4s, v10.4s\n" - "add x21, x21, #16\n" - "fmla v14.4s, v20.4s, v5.4s\n" - "add x23, x23, #16\n" - "fmla v17.4s, v26.4s, v1.4s\n" - "add x28, x28, #16\n" - "fmla v15.4s, v26.4s, v2.4s\n" - "fmla v16.4s, v26.4s, v10.4s\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "movi v26.16b, #0\n" - "fmla v17.4s, v24.4s, v3.4s\n" - "fmla v16.4s, v27.4s, v2.4s\n" - "fmla v15.4s, v25.4s, v0.4s\n" - "fmla v14.4s, v25.4s, v12.4s\n" - "fmax v17.4s, v17.4s, v26.4s\n" - "fmla v16.4s, v24.4s, v0.4s\n" - "str q17, [%[outptr0]]\n" - "fmla v15.4s, v24.4s, v1.4s\n" - "fmla v14.4s, v24.4s, v10.4s\n" - "fmla v16.4s, v21.4s, v1.4s\n" - "fmla v15.4s, v7.4s, v3.4s\n" - "fmla v14.4s, v21.4s, v2.4s\n" - "fmla v16.4s, v19.4s, v3.4s\n" - "fmax v15.4s, v15.4s, v26.4s\n" - "fmla v14.4s, v7.4s, v0.4s\n" - "str q15, [x22]\n" - "fmax v16.4s, v16.4s, v26.4s\n" - "fmla v14.4s, v19.4s, v1.4s\n" - "str q16, [%[outptr0], %[output_col_stride1]]\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v14.4s, v18.4s, v3.4s\n" - "fmax v14.4s, v14.4s, v26.4s\n" - "str q14, [x22, %[output_col_stride1]]\n" - "add x22, x22, #16\n" - "4:\n" - "cbz x25, 7f\n" - "ldr s11, [%[wbptr]]\n" - "mov v17.16b, v11.16b\n" - "ldr s13, [%[wbptr], #4]\n" - "mov v15.16b, v11.16b\n" - "ldr s4, [%[wbptr], #8]\n" - "mov v16.16b, v11.16b\n" - "ldr s2, [%[wbptr], #12]\n" - "mov v14.16b, v11.16b\n" - "ldr s5, [%[wbptr], #16]\n" - "ldr s10, [%[wbptr], #20]\n" - "subs x25, x25, #1\n" - "ldr s1, [%[wbptr], #24]\n" - "ldr s12, [%[wbptr], #28]\n" - "ldr s0, [%[wbptr], #32]\n" - "ldr s3, [%[wbptr], #36]\n" - "ldr s6, [%[inptr0]]\n" - "ldr s27, [x21]\n" - "fmla v17.4s, v6.4s, v13.4s\n" - "ldr s23, [%[inptr0], %[input_col_stride1]]\n" - "fmla v15.4s, v27.4s, v13.4s\n" - "ldr s24, [x23]\n" - "fmla v16.4s, v23.4s, v13.4s\n" - "ldr s22, [x21, %[input_col_stride1]]\n" - "fmla v17.4s, v27.4s, v5.4s\n" - "ldr s9, [%[inptr0], x24]\n" - "ldr s8, [x28]\n" - "ldr s20, [x23, %[input_col_stride1]]\n" - "fmla v17.4s, v23.4s, v4.4s\n" - "beq 6f\n" - "5:\n" - "fmla v17.4s, v24.4s, v12.4s\n" - "ldr s26, [x21, x24]\n" - "fmla v15.4s, v24.4s, v5.4s\n" - "ldr s27, [%[inptr0], x27]\n" - "fmla v16.4s, v22.4s, v5.4s\n" - "ldr s25, [x28, %[input_col_stride1]]\n" - "fmla v17.4s, v22.4s, v10.4s\n" - "ldr s24, [x23, x24]\n" - "fmla v15.4s, v22.4s, v4.4s\n" - "ldr s21, [x21, x27]\n" - "fmla v14.4s, v22.4s, v13.4s\n" - "ldr s7, [x28, x24]\n" - "fmla v17.4s, v9.4s, v2.4s\n" - "ldr s19, [x23, x27]\n" - "fmla v16.4s, v9.4s, v4.4s\n" - "ldr s18, [x28, x27]\n" - "fmla v15.4s, v8.4s, v12.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v17.4s, v20.4s, v0.4s\n" - "ldr s11, [%[wbptr]]\n" - "fmla v16.4s, v20.4s, v12.4s\n" - "ldr s13, [%[wbptr], #4]\n" - "fmla v15.4s, v20.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v14.4s, v20.4s, v5.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v17.4s, v26.4s, v1.4s\n" - "ldr s6, [%[inptr0]]\n" - "fmla v15.4s, v26.4s, v2.4s\n" - "ldr s23, [%[inptr0], %[input_col_stride1]]\n" - "fmla v16.4s, v26.4s, v10.4s\n" - "ldr s5, [%[wbptr], #16]\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "ldr s9, [%[inptr0], x24]\n" - "fmla v15.4s, v25.4s, v0.4s\n" - "add x21, x21, #4\n" - "fmla v16.4s, v27.4s, v2.4s\n" - "ldr s27, [x21]\n" - "fmla v14.4s, v25.4s, v12.4s\n" - "ldr s4, [%[wbptr], #8]\n" - "fmla v17.4s, v24.4s, v3.4s\n" - "ldr s22, [x21, %[input_col_stride1]]\n" - "fmla v15.4s, v24.4s, v1.4s\n" - "add x23, x23, #4\n" - "fmla v16.4s, v24.4s, v0.4s\n" - "ldr s12, [%[wbptr], #28]\n" - "fmla v14.4s, v24.4s, v10.4s\n" - "ldr s24, [x23]\n" - "fmla v15.4s, v7.4s, v3.4s\n" - "ldr s20, [x23, %[input_col_stride1]]\n" - "fmla v16.4s, v21.4s, v1.4s\n" - "add x28, x28, #4\n" - "fmla v14.4s, v21.4s, v2.4s\n" - "ldr s10, [%[wbptr], #20]\n" - "movi v26.16b, #0\n" - "ldr s8, [x28]\n" - "fmla v16.4s, v19.4s, v3.4s\n" - "subs x25, x25, #1\n" - "fmla v14.4s, v7.4s, v0.4s\n" - "ldr s2, [%[wbptr], #12]\n" - "fmax v17.4s, v17.4s, v26.4s\n" - "fmax v15.4s, v15.4s, v26.4s\n" - "fmax v16.4s, v16.4s, v26.4s\n" - "str s17, [%[outptr0]]\n" - "str s16, [%[outptr0], %[output_col_stride1]]\n" - "fmla v14.4s, v19.4s, v1.4s\n" - "str s15, [x22]\n" - "mov v17.16b, v11.16b\n" - "mov v15.16b, v11.16b\n" - "ldr s0, [%[wbptr], #32]\n" - "fmla v14.4s, v18.4s, v3.4s\n" - "ldr s1, [%[wbptr], #24]\n" - "mov v16.16b, v11.16b\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v17.4s, v6.4s, v13.4s\n" - "fmla v15.4s, v27.4s, v13.4s\n" - "fmax v14.4s, v14.4s, v26.4s\n" - "ldr s3, [%[wbptr], #36]\n" - "fmla v16.4s, v23.4s, v13.4s\n" - "str s14, [x22, %[output_col_stride1]]\n" - "mov v14.16b, v11.16b\n" - "add x22, x22, #4\n" - "fmla v17.4s, v27.4s, v5.4s\n" - "fmla v17.4s, v23.4s, v4.4s\n" - "bne 5b\n" - "6:\n" - "fmla v17.4s, v24.4s, v12.4s\n" - "ldr s26, [x21, x24]\n" - "fmla v15.4s, v24.4s, v5.4s\n" - "ldr s27, [%[inptr0], x27]\n" - "fmla v16.4s, v22.4s, v5.4s\n" - "ldr s25, [x28, %[input_col_stride1]]\n" - "fmla v17.4s, v22.4s, v10.4s\n" - "ldr s24, [x23, x24]\n" - "fmla v15.4s, v22.4s, v4.4s\n" - "ldr s21, [x21, x27]\n" - "fmla v14.4s, v22.4s, v13.4s\n" - "ldr s7, [x28, x24]\n" - "fmla v17.4s, v9.4s, v2.4s\n" - "ldr s19, [x23, x27]\n" - "fmla v16.4s, v9.4s, v4.4s\n" - "ldr s18, [x28, x27]\n" - "fmla v15.4s, v8.4s, v12.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v17.4s, v20.4s, v0.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v16.4s, v20.4s, v12.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v15.4s, v20.4s, v10.4s\n" - "add x21, x21, #4\n" - "fmla v14.4s, v20.4s, v5.4s\n" - "add x23, x23, #4\n" - "fmla v17.4s, v26.4s, v1.4s\n" - "add x28, x28, #4\n" - "fmla v15.4s, v26.4s, v2.4s\n" - "fmla v16.4s, v26.4s, v10.4s\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "movi v26.16b, #0\n" - "fmla v17.4s, v24.4s, v3.4s\n" - "fmla v16.4s, v27.4s, v2.4s\n" - "fmla v15.4s, v25.4s, v0.4s\n" - "fmla v14.4s, v25.4s, v12.4s\n" - "fmax v17.4s, v17.4s, v26.4s\n" - "fmla v16.4s, v24.4s, v0.4s\n" - "str s17, [%[outptr0]]\n" - "fmla v15.4s, v24.4s, v1.4s\n" - "fmla v14.4s, v24.4s, v10.4s\n" - "fmla v16.4s, v21.4s, v1.4s\n" - "fmla v15.4s, v7.4s, v3.4s\n" - "fmla v14.4s, v21.4s, v2.4s\n" - "fmla v16.4s, v19.4s, v3.4s\n" - "fmax v15.4s, v15.4s, v26.4s\n" - "fmla v14.4s, v7.4s, v0.4s\n" - "str s15, [x22]\n" - "fmax v16.4s, v16.4s, v26.4s\n" - "fmla v14.4s, v19.4s, v1.4s\n" - "str s16, [%[outptr0], %[output_col_stride1]]\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v14.4s, v18.4s, v3.4s\n" - "fmax v14.4s, v14.4s, v26.4s\n" - "str s14, [x22, %[output_col_stride1]]\n" - "add x22, x22, #4\n" - "7:\n" - : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr) - : [n_channels] "r" ((long) n_channels), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile<ActivationFunction::ReLU6>( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x21, %[inptr0], %[input_row_stride]\n" - "add x23, %[input_col_stride1], %[input_col_stride1]\n" - "add x24, %[outptr0], %[output_row_stride]\n" - "add x27, x21, %[input_row_stride]\n" - "add x22, x23, %[input_col_stride1]\n" - "and x25, %[n_channels], #3\n" - "add x28, x27, %[input_row_stride]\n" - "lsr x26, %[n_channels], #2\n" - "cbz x26, 4f\n" - "1:\n" - "ldr q19, [%[wbptr]]\n" - "subs x26, x26, #1\n" - "mov v3.16b, v19.16b\n" - "ldr q12, [%[wbptr], #16]\n" - "mov v1.16b, v19.16b\n" - "ldr q11, [%[wbptr], #32]\n" - "mov v2.16b, v19.16b\n" - "ldr q10, [%[wbptr], #48]\n" - "mov v0.16b, v19.16b\n" - "ldr q13, [%[wbptr], #64]\n" - "ldr q23, [%[wbptr], #80]\n" - "ldr q15, [%[wbptr], #96]\n" - "ldr q20, [%[wbptr], #112]\n" - "ldr q21, [%[wbptr], #128]\n" - "ldr q14, [%[wbptr], #144]\n" - "ldr q16, [%[inptr0]]\n" - "fmla v3.4s, v16.4s, v12.4s\n" - "ldr q28, [x21]\n" - "fmla v1.4s, v28.4s, v12.4s\n" - "ldr q22, [%[inptr0], %[input_col_stride1]]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "ldr q24, [x27]\n" - "fmla v3.4s, v28.4s, v13.4s\n" - "ldr q8, [x21, %[input_col_stride1]]\n" - "ldr q9, [%[inptr0], x23]\n" - "ldr q18, [x28]\n" - "ldr q6, [x27, %[input_col_stride1]]\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "beq 3f\n" - "2:\n" - "fmla v3.4s, v24.4s, v20.4s\n" - "ldr q25, [x21, x23]\n" - "fmla v1.4s, v24.4s, v13.4s\n" - "ldr q28, [%[inptr0], x22]\n" - "fmla v2.4s, v8.4s, v13.4s\n" - "ldr q24, [x28, %[input_col_stride1]]\n" - "fmla v3.4s, v8.4s, v23.4s\n" - "ldr q27, [x27, x23]\n" - "fmla v1.4s, v8.4s, v11.4s\n" - "ldr q7, [x21, x22]\n" - "fmla v0.4s, v8.4s, v12.4s\n" - "ldr q17, [x28, x23]\n" - "fmla v3.4s, v9.4s, v10.4s\n" - "ldr q5, [x27, x22]\n" - "fmla v2.4s, v9.4s, v11.4s\n" - "ldr q4, [x28, x22]\n" - "fmla v1.4s, v18.4s, v20.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v3.4s, v6.4s, v21.4s\n" - "ldr q19, [%[wbptr]]\n" - "fmla v2.4s, v6.4s, v20.4s\n" - "ldr q12, [%[wbptr], #16]\n" - "fmla v1.4s, v6.4s, v23.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v6.4s, v13.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v3.4s, v25.4s, v15.4s\n" - "ldr q16, [%[inptr0]]\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "ldr q22, [%[inptr0], %[input_col_stride1]]\n" - "fmla v2.4s, v25.4s, v23.4s\n" - "ldr q13, [%[wbptr], #64]\n" - "fmla v0.4s, v25.4s, v11.4s\n" - "ldr q9, [%[inptr0], x23]\n" - "fmla v1.4s, v24.4s, v21.4s\n" - "add x21, x21, #16\n" - "fmla v2.4s, v28.4s, v10.4s\n" - "ldr q28, [x21]\n" - "fmla v0.4s, v24.4s, v20.4s\n" - "ldr q11, [%[wbptr], #32]\n" - "fmla v3.4s, v27.4s, v14.4s\n" - "ldr q8, [x21, %[input_col_stride1]]\n" - "fmla v1.4s, v27.4s, v15.4s\n" - "add x27, x27, #16\n" - "fmla v2.4s, v27.4s, v21.4s\n" - "ldr q20, [%[wbptr], #112]\n" - "fmla v0.4s, v27.4s, v23.4s\n" - "ldr q24, [x27]\n" - "fmla v1.4s, v17.4s, v14.4s\n" - "ldr q6, [x27, %[input_col_stride1]]\n" - "fmla v2.4s, v7.4s, v15.4s\n" - "add x28, x28, #16\n" - "fmla v0.4s, v7.4s, v10.4s\n" - "ldr q23, [%[wbptr], #80]\n" - "movi v25.16b, #0\n" - "ldr q18, [x28]\n" - "fmla v2.4s, v5.4s, v14.4s\n" - "subs x26, x26, #1\n" - "fmla v0.4s, v17.4s, v21.4s\n" - "ldr q10, [%[wbptr], #48]\n" - "fmov v26.4s, #6.0\n" - "fmax v3.4s, v3.4s, v25.4s\n" - "fmax v2.4s, v2.4s, v25.4s\n" - "fmax v1.4s, v1.4s, v25.4s\n" - "fmla v0.4s, v5.4s, v15.4s\n" - "ldr q21, [%[wbptr], #128]\n" - "fmin v3.4s, v3.4s, v26.4s\n" - "fmin v2.4s, v2.4s, v26.4s\n" - "fmin v1.4s, v1.4s, v26.4s\n" - "str q3, [%[outptr0]]\n" - "str q2, [%[outptr0], %[output_col_stride1]]\n" - "fmla v0.4s, v4.4s, v14.4s\n" - "str q1, [x24]\n" - "mov v3.16b, v19.16b\n" - "mov v1.16b, v19.16b\n" - "ldr q15, [%[wbptr], #96]\n" - "fmax v0.4s, v0.4s, v25.4s\n" - "ldr q14, [%[wbptr], #144]\n" - "mov v2.16b, v19.16b\n" - "add %[outptr0], %[outptr0], #16\n" - "fmin v0.4s, v0.4s, v26.4s\n" - "fmla v3.4s, v16.4s, v12.4s\n" - "fmla v1.4s, v28.4s, v12.4s\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "str q0, [x24, %[output_col_stride1]]\n" - "mov v0.16b, v19.16b\n" - "fmla v3.4s, v28.4s, v13.4s\n" - "add x24, x24, #16\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "bne 2b\n" - "3:\n" - "fmla v3.4s, v24.4s, v20.4s\n" - "ldr q25, [x21, x23]\n" - "fmla v1.4s, v24.4s, v13.4s\n" - "ldr q28, [%[inptr0], x22]\n" - "fmla v2.4s, v8.4s, v13.4s\n" - "ldr q24, [x28, %[input_col_stride1]]\n" - "fmla v3.4s, v8.4s, v23.4s\n" - "ldr q27, [x27, x23]\n" - "fmla v1.4s, v8.4s, v11.4s\n" - "ldr q7, [x21, x22]\n" - "fmla v0.4s, v8.4s, v12.4s\n" - "ldr q17, [x28, x23]\n" - "fmla v3.4s, v9.4s, v10.4s\n" - "ldr q5, [x27, x22]\n" - "fmla v2.4s, v9.4s, v11.4s\n" - "ldr q4, [x28, x22]\n" - "fmla v1.4s, v18.4s, v20.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v3.4s, v6.4s, v21.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v2.4s, v6.4s, v20.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v1.4s, v6.4s, v23.4s\n" - "add x21, x21, #16\n" - "fmla v0.4s, v6.4s, v13.4s\n" - "add x27, x27, #16\n" - "fmla v3.4s, v25.4s, v15.4s\n" - "add x28, x28, #16\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "fmla v2.4s, v25.4s, v23.4s\n" - "fmla v0.4s, v25.4s, v11.4s\n" - "movi v25.16b, #0\n" - "fmla v3.4s, v27.4s, v14.4s\n" - "fmov v26.4s, #6.0\n" - "fmla v2.4s, v28.4s, v10.4s\n" - "fmla v1.4s, v24.4s, v21.4s\n" - "fmla v0.4s, v24.4s, v20.4s\n" - "fmax v3.4s, v3.4s, v25.4s\n" - "fmla v1.4s, v27.4s, v15.4s\n" - "fmla v2.4s, v27.4s, v21.4s\n" - "fmla v0.4s, v27.4s, v23.4s\n" - "fmin v3.4s, v3.4s, v26.4s\n" - "str q3, [%[outptr0]]\n" - "fmla v2.4s, v7.4s, v15.4s\n" - "fmla v0.4s, v7.4s, v10.4s\n" - "fmla v1.4s, v17.4s, v14.4s\n" - "fmla v2.4s, v5.4s, v14.4s\n" - "fmla v0.4s, v17.4s, v21.4s\n" - "fmax v1.4s, v1.4s, v25.4s\n" - "fmax v2.4s, v2.4s, v25.4s\n" - "fmla v0.4s, v5.4s, v15.4s\n" - "fmin v1.4s, v1.4s, v26.4s\n" - "fmin v2.4s, v2.4s, v26.4s\n" - "str q1, [x24]\n" - "str q2, [%[outptr0], %[output_col_stride1]]\n" - "fmla v0.4s, v4.4s, v14.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmax v0.4s, v0.4s, v25.4s\n" - "fmin v0.4s, v0.4s, v26.4s\n" - "str q0, [x24, %[output_col_stride1]]\n" - "add x24, x24, #16\n" - "4:\n" - "cbz x25, 7f\n" - "ldr s19, [%[wbptr]]\n" - "mov v3.16b, v19.16b\n" - "ldr s12, [%[wbptr], #4]\n" - "mov v1.16b, v19.16b\n" - "ldr s11, [%[wbptr], #8]\n" - "mov v2.16b, v19.16b\n" - "ldr s10, [%[wbptr], #12]\n" - "mov v0.16b, v19.16b\n" - "ldr s13, [%[wbptr], #16]\n" - "ldr s23, [%[wbptr], #20]\n" - "subs x25, x25, #1\n" - "ldr s15, [%[wbptr], #24]\n" - "ldr s20, [%[wbptr], #28]\n" - "ldr s21, [%[wbptr], #32]\n" - "ldr s14, [%[wbptr], #36]\n" - "ldr s16, [%[inptr0]]\n" - "ldr s28, [x21]\n" - "fmla v3.4s, v16.4s, v12.4s\n" - "ldr s22, [%[inptr0], %[input_col_stride1]]\n" - "fmla v1.4s, v28.4s, v12.4s\n" - "ldr s24, [x27]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "ldr s8, [x21, %[input_col_stride1]]\n" - "fmla v3.4s, v28.4s, v13.4s\n" - "ldr s9, [%[inptr0], x23]\n" - "ldr s18, [x28]\n" - "ldr s6, [x27, %[input_col_stride1]]\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "beq 6f\n" - "5:\n" - "fmla v3.4s, v24.4s, v20.4s\n" - "ldr s25, [x21, x23]\n" - "fmla v1.4s, v24.4s, v13.4s\n" - "ldr s28, [%[inptr0], x22]\n" - "fmla v2.4s, v8.4s, v13.4s\n" - "ldr s24, [x28, %[input_col_stride1]]\n" - "fmla v3.4s, v8.4s, v23.4s\n" - "ldr s27, [x27, x23]\n" - "fmla v1.4s, v8.4s, v11.4s\n" - "ldr s7, [x21, x22]\n" - "fmla v0.4s, v8.4s, v12.4s\n" - "ldr s17, [x28, x23]\n" - "fmla v3.4s, v9.4s, v10.4s\n" - "ldr s5, [x27, x22]\n" - "fmla v2.4s, v9.4s, v11.4s\n" - "ldr s4, [x28, x22]\n" - "fmla v1.4s, v18.4s, v20.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v3.4s, v6.4s, v21.4s\n" - "ldr s19, [%[wbptr]]\n" - "fmla v2.4s, v6.4s, v20.4s\n" - "ldr s12, [%[wbptr], #4]\n" - "fmla v1.4s, v6.4s, v23.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v6.4s, v13.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v3.4s, v25.4s, v15.4s\n" - "ldr s16, [%[inptr0]]\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "ldr s22, [%[inptr0], %[input_col_stride1]]\n" - "fmla v2.4s, v25.4s, v23.4s\n" - "ldr s13, [%[wbptr], #16]\n" - "fmla v0.4s, v25.4s, v11.4s\n" - "ldr s9, [%[inptr0], x23]\n" - "fmla v1.4s, v24.4s, v21.4s\n" - "add x21, x21, #4\n" - "fmla v2.4s, v28.4s, v10.4s\n" - "ldr s28, [x21]\n" - "fmla v0.4s, v24.4s, v20.4s\n" - "ldr s11, [%[wbptr], #8]\n" - "fmla v3.4s, v27.4s, v14.4s\n" - "ldr s8, [x21, %[input_col_stride1]]\n" - "fmla v1.4s, v27.4s, v15.4s\n" - "add x27, x27, #4\n" - "fmla v2.4s, v27.4s, v21.4s\n" - "ldr s20, [%[wbptr], #28]\n" - "fmla v0.4s, v27.4s, v23.4s\n" - "ldr s24, [x27]\n" - "fmla v1.4s, v17.4s, v14.4s\n" - "ldr s6, [x27, %[input_col_stride1]]\n" - "fmla v2.4s, v7.4s, v15.4s\n" - "add x28, x28, #4\n" - "fmla v0.4s, v7.4s, v10.4s\n" - "ldr s23, [%[wbptr], #20]\n" - "movi v25.16b, #0\n" - "ldr s18, [x28]\n" - "fmla v2.4s, v5.4s, v14.4s\n" - "subs x25, x25, #1\n" - "fmla v0.4s, v17.4s, v21.4s\n" - "ldr s10, [%[wbptr], #12]\n" - "fmov v26.4s, #6.0\n" - "fmax v3.4s, v3.4s, v25.4s\n" - "fmax v2.4s, v2.4s, v25.4s\n" - "fmax v1.4s, v1.4s, v25.4s\n" - "fmla v0.4s, v5.4s, v15.4s\n" - "ldr s21, [%[wbptr], #32]\n" - "fmin v3.4s, v3.4s, v26.4s\n" - "fmin v2.4s, v2.4s, v26.4s\n" - "fmin v1.4s, v1.4s, v26.4s\n" - "str s3, [%[outptr0]]\n" - "str s2, [%[outptr0], %[output_col_stride1]]\n" - "fmla v0.4s, v4.4s, v14.4s\n" - "str s1, [x24]\n" - "mov v3.16b, v19.16b\n" - "mov v1.16b, v19.16b\n" - "ldr s15, [%[wbptr], #24]\n" - "fmax v0.4s, v0.4s, v25.4s\n" - "ldr s14, [%[wbptr], #36]\n" - "mov v2.16b, v19.16b\n" - "add %[outptr0], %[outptr0], #4\n" - "fmin v0.4s, v0.4s, v26.4s\n" - "fmla v3.4s, v16.4s, v12.4s\n" - "fmla v1.4s, v28.4s, v12.4s\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "str s0, [x24, %[output_col_stride1]]\n" - "mov v0.16b, v19.16b\n" - "fmla v3.4s, v28.4s, v13.4s\n" - "add x24, x24, #4\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "bne 5b\n" - "6:\n" - "fmla v3.4s, v24.4s, v20.4s\n" - "ldr s25, [x21, x23]\n" - "fmla v1.4s, v24.4s, v13.4s\n" - "ldr s28, [%[inptr0], x22]\n" - "fmla v2.4s, v8.4s, v13.4s\n" - "ldr s24, [x28, %[input_col_stride1]]\n" - "fmla v3.4s, v8.4s, v23.4s\n" - "ldr s27, [x27, x23]\n" - "fmla v1.4s, v8.4s, v11.4s\n" - "ldr s7, [x21, x22]\n" - "fmla v0.4s, v8.4s, v12.4s\n" - "ldr s17, [x28, x23]\n" - "fmla v3.4s, v9.4s, v10.4s\n" - "ldr s5, [x27, x22]\n" - "fmla v2.4s, v9.4s, v11.4s\n" - "ldr s4, [x28, x22]\n" - "fmla v1.4s, v18.4s, v20.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v3.4s, v6.4s, v21.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v2.4s, v6.4s, v20.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v1.4s, v6.4s, v23.4s\n" - "add x21, x21, #4\n" - "fmla v0.4s, v6.4s, v13.4s\n" - "add x27, x27, #4\n" - "fmla v3.4s, v25.4s, v15.4s\n" - "add x28, x28, #4\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "fmla v2.4s, v25.4s, v23.4s\n" - "fmla v0.4s, v25.4s, v11.4s\n" - "movi v25.16b, #0\n" - "fmla v3.4s, v27.4s, v14.4s\n" - "fmov v26.4s, #6.0\n" - "fmla v2.4s, v28.4s, v10.4s\n" - "fmla v1.4s, v24.4s, v21.4s\n" - "fmla v0.4s, v24.4s, v20.4s\n" - "fmax v3.4s, v3.4s, v25.4s\n" - "fmla v1.4s, v27.4s, v15.4s\n" - "fmla v2.4s, v27.4s, v21.4s\n" - "fmla v0.4s, v27.4s, v23.4s\n" - "fmin v3.4s, v3.4s, v26.4s\n" - "str s3, [%[outptr0]]\n" - "fmla v2.4s, v7.4s, v15.4s\n" - "fmla v0.4s, v7.4s, v10.4s\n" - "fmla v1.4s, v17.4s, v14.4s\n" - "fmla v2.4s, v5.4s, v14.4s\n" - "fmla v0.4s, v17.4s, v21.4s\n" - "fmax v1.4s, v1.4s, v25.4s\n" - "fmax v2.4s, v2.4s, v25.4s\n" - "fmla v0.4s, v5.4s, v15.4s\n" - "fmin v1.4s, v1.4s, v26.4s\n" - "fmin v2.4s, v2.4s, v26.4s\n" - "str s1, [x24]\n" - "str s2, [%[outptr0], %[output_col_stride1]]\n" - "fmla v0.4s, v4.4s, v14.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmax v0.4s, v0.4s, v25.4s\n" - "fmin v0.4s, v0.4s, v26.4s\n" - "str s0, [x24, %[output_col_stride1]]\n" - "add x24, x24, #4\n" - "7:\n" - : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr) - : [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -#endif // __aarch64__ - -template class DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>; - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp deleted file mode 100644 index 2554436172..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp +++ /dev/null @@ -1,2809 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "impl_fp32_fp32.hpp" - -namespace depthwise -{ - -using namespace neon_convolution_kernels; -using Conv = DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>; - -#ifdef __aarch64__ -template <> -template <> -void Conv::execute_tile<ActivationFunction::None>( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x23, %[inptr0], %[input_row_stride]\n" - "add x19, %[input_col_stride1], %[input_col_stride1]\n" - "add x22, %[outptr0], %[output_row_stride]\n" - "add x24, x23, %[input_row_stride]\n" - "add x20, x19, %[input_col_stride1]\n" - "and x27, %[n_channels], #3\n" - "add x25, x24, %[input_row_stride]\n" - "add x21, x20, %[input_col_stride1]\n" - "lsr x28, %[n_channels], #2\n" - "add x26, x25, %[input_row_stride]\n" - "cbz x28, 4f\n" - "1:\n" - "ldr q14, [%[wbptr]]\n" - "subs x28, x28, #1\n" - "mov v12.16b, v14.16b\n" - "ldr q8, [%[wbptr], #16]\n" - "mov v10.16b, v14.16b\n" - "ldr q7, [%[wbptr], #32]\n" - "mov v11.16b, v14.16b\n" - "ldr q6, [%[wbptr], #48]\n" - "mov v9.16b, v14.16b\n" - "ldr q5, [%[wbptr], #64]\n" - "ldr q4, [%[wbptr], #80]\n" - "ldr q3, [%[wbptr], #96]\n" - "ldr q2, [%[wbptr], #112]\n" - "ldr q1, [%[wbptr], #128]\n" - "ldr q0, [%[wbptr], #144]\n" - "ldr q15, [%[inptr0]]\n" - "fmla v12.4s, v15.4s, v8.4s\n" - "ldr q20, [x23]\n" - "ldr q13, [%[inptr0], %[input_col_stride1]]\n" - "ldr q17, [x24]\n" - "fmla v10.4s, v17.4s, v8.4s\n" - "ldr q16, [x23, %[input_col_stride1]]\n" - "fmla v12.4s, v20.4s, v5.4s\n" - "ldr q18, [%[inptr0], x19]\n" - "ldr q14, [x25]\n" - "ldr q15, [x24, %[input_col_stride1]]\n" - "fmla v12.4s, v13.4s, v7.4s\n" - "fmla v12.4s, v17.4s, v2.4s\n" - "fmla v12.4s, v16.4s, v4.4s\n" - "fmla v12.4s, v18.4s, v6.4s\n" - "beq 3f\n" - "2:\n" - "fmla v11.4s, v18.4s, v8.4s\n" - "ldr q19, [x23, x19]\n" - "fmla v10.4s, v14.4s, v5.4s\n" - "ldr q20, [%[inptr0], x20]\n" - "fmla v12.4s, v15.4s, v1.4s\n" - "ldr q14, [x26]\n" - "fmla v11.4s, v19.4s, v5.4s\n" - "ldr q13, [x25, %[input_col_stride1]]\n" - "fmla v10.4s, v15.4s, v7.4s\n" - "ldr q17, [x24, x19]\n" - "fmla v12.4s, v19.4s, v3.4s\n" - "ldr q19, [x23, x20]\n" - "fmla v11.4s, v20.4s, v7.4s\n" - "ldr q18, [%[inptr0], x21]\n" - "fmla v10.4s, v14.4s, v2.4s\n" - "ldr q16, [x26, %[input_col_stride1]]\n" - "fmla v12.4s, v17.4s, v0.4s\n" - "ldr q14, [x25, x19]\n" - "fmla v11.4s, v17.4s, v2.4s\n" - "ldr q15, [x24, x20]\n" - "fmla v10.4s, v13.4s, v4.4s\n" - "ldr q13, [x23, x21]\n" - "str q12, [%[outptr0]]\n" - "fmla v9.4s, v17.4s, v8.4s\n" - "fmla v11.4s, v19.4s, v4.4s\n" - "ldr q12, [x26, x19]\n" - "fmla v10.4s, v17.4s, v6.4s\n" - "ldr q20, [x25, x20]\n" - "fmla v9.4s, v14.4s, v5.4s\n" - "ldr q17, [x24, x21]\n" - "fmla v11.4s, v18.4s, v6.4s\n" - "ldr q19, [x26, x20]\n" - "fmla v10.4s, v16.4s, v1.4s\n" - "ldr q18, [x25, x21]\n" - "fmla v9.4s, v15.4s, v7.4s\n" - "ldr q16, [x26, x21]\n" - "fmla v11.4s, v15.4s, v1.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v10.4s, v14.4s, v3.4s\n" - "ldr q14, [%[wbptr]]\n" - "fmla v9.4s, v12.4s, v2.4s\n" - "ldr q8, [%[wbptr], #16]\n" - "fmla v11.4s, v13.4s, v3.4s\n" - "ldr q7, [%[wbptr], #32]\n" - "fmla v10.4s, v12.4s, v0.4s\n" - "ldr q5, [%[wbptr], #64]\n" - "fmla v9.4s, v20.4s, v4.4s\n" - "ldr q2, [%[wbptr], #112]\n" - "fmla v11.4s, v17.4s, v0.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "str q10, [x22]\n" - "mov v12.16b, v14.16b\n" - "fmla v9.4s, v17.4s, v6.4s\n" - "ldr q4, [%[wbptr], #80]\n" - "str q11, [%[outptr0], %[output_col_stride1]]\n" - "mov v10.16b, v14.16b\n" - "mov v11.16b, v14.16b\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "ldr q6, [%[wbptr], #48]\n" - "ldr q15, [%[inptr0]]\n" - "add x23, x23, #16\n" - "fmla v12.4s, v15.4s, v8.4s\n" - "ldr q20, [x23]\n" - "fmla v9.4s, v18.4s, v3.4s\n" - "ldr q1, [%[wbptr], #128]\n" - "ldr q13, [%[inptr0], %[input_col_stride1]]\n" - "add x24, x24, #16\n" - "fmla v12.4s, v20.4s, v5.4s\n" - "ldr q17, [x24]\n" - "fmla v9.4s, v16.4s, v0.4s\n" - "ldr q3, [%[wbptr], #96]\n" - "fmla v10.4s, v17.4s, v8.4s\n" - "ldr q16, [x23, %[input_col_stride1]]\n" - "fmla v12.4s, v13.4s, v7.4s\n" - "ldr q18, [%[inptr0], x19]\n" - "str q9, [x22, %[output_col_stride1]]\n" - "add x25, x25, #16\n" - "mov v9.16b, v14.16b\n" - "ldr q0, [%[wbptr], #144]\n" - "fmla v12.4s, v17.4s, v2.4s\n" - "ldr q14, [x25]\n" - "ldr q15, [x24, %[input_col_stride1]]\n" - "add x26, x26, #16\n" - "add %[outptr0], %[outptr0], #16\n" - "add x22, x22, #16\n" - "subs x28, x28, #1\n" - "fmla v12.4s, v16.4s, v4.4s\n" - "fmla v12.4s, v18.4s, v6.4s\n" - "bne 2b\n" - "3:\n" - "fmla v11.4s, v18.4s, v8.4s\n" - "ldr q19, [x23, x19]\n" - "fmla v10.4s, v14.4s, v5.4s\n" - "ldr q20, [%[inptr0], x20]\n" - "fmla v12.4s, v15.4s, v1.4s\n" - "ldr q14, [x26]\n" - "fmla v11.4s, v19.4s, v5.4s\n" - "ldr q13, [x25, %[input_col_stride1]]\n" - "fmla v10.4s, v15.4s, v7.4s\n" - "ldr q17, [x24, x19]\n" - "fmla v12.4s, v19.4s, v3.4s\n" - "ldr q19, [x23, x20]\n" - "fmla v11.4s, v20.4s, v7.4s\n" - "ldr q18, [%[inptr0], x21]\n" - "fmla v10.4s, v14.4s, v2.4s\n" - "ldr q16, [x26, %[input_col_stride1]]\n" - "fmla v12.4s, v17.4s, v0.4s\n" - "ldr q14, [x25, x19]\n" - "fmla v11.4s, v17.4s, v2.4s\n" - "ldr q15, [x24, x20]\n" - "fmla v10.4s, v13.4s, v4.4s\n" - "ldr q13, [x23, x21]\n" - "str q12, [%[outptr0]]\n" - "fmla v9.4s, v17.4s, v8.4s\n" - "fmla v11.4s, v19.4s, v4.4s\n" - "ldr q12, [x26, x19]\n" - "fmla v10.4s, v17.4s, v6.4s\n" - "ldr q20, [x25, x20]\n" - "fmla v9.4s, v14.4s, v5.4s\n" - "ldr q17, [x24, x21]\n" - "fmla v11.4s, v18.4s, v6.4s\n" - "ldr q19, [x26, x20]\n" - "fmla v10.4s, v16.4s, v1.4s\n" - "ldr q18, [x25, x21]\n" - "fmla v9.4s, v15.4s, v7.4s\n" - "ldr q16, [x26, x21]\n" - "fmla v11.4s, v15.4s, v1.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v10.4s, v14.4s, v3.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v9.4s, v12.4s, v2.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v11.4s, v13.4s, v3.4s\n" - "add x23, x23, #16\n" - "fmla v10.4s, v12.4s, v0.4s\n" - "add x24, x24, #16\n" - "fmla v9.4s, v20.4s, v4.4s\n" - "add x25, x25, #16\n" - "fmla v11.4s, v17.4s, v0.4s\n" - "add x26, x26, #16\n" - "str q10, [x22]\n" - "fmla v9.4s, v17.4s, v6.4s\n" - "str q11, [%[outptr0], %[output_col_stride1]]\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "fmla v9.4s, v18.4s, v3.4s\n" - "fmla v9.4s, v16.4s, v0.4s\n" - "str q9, [x22, %[output_col_stride1]]\n" - "add x22, x22, #16\n" - "4:\n" - "cbz x27, 7f\n" - "ldr s14, [%[wbptr]]\n" - "mov v12.16b, v14.16b\n" - "ldr s8, [%[wbptr], #4]\n" - "mov v10.16b, v14.16b\n" - "ldr s7, [%[wbptr], #8]\n" - "mov v11.16b, v14.16b\n" - "ldr s6, [%[wbptr], #12]\n" - "mov v9.16b, v14.16b\n" - "ldr s5, [%[wbptr], #16]\n" - "ldr s4, [%[wbptr], #20]\n" - "subs x27, x27, #1\n" - "ldr s3, [%[wbptr], #24]\n" - "ldr s2, [%[wbptr], #28]\n" - "ldr s1, [%[wbptr], #32]\n" - "ldr s0, [%[wbptr], #36]\n" - "ldr s15, [%[inptr0]]\n" - "ldr s20, [x23]\n" - "fmla v12.4s, v15.4s, v8.4s\n" - "ldr s13, [%[inptr0], %[input_col_stride1]]\n" - "ldr s17, [x24]\n" - "ldr s16, [x23, %[input_col_stride1]]\n" - "fmla v10.4s, v17.4s, v8.4s\n" - "ldr s18, [%[inptr0], x19]\n" - "fmla v12.4s, v20.4s, v5.4s\n" - "ldr s14, [x25]\n" - "ldr s15, [x24, %[input_col_stride1]]\n" - "fmla v12.4s, v13.4s, v7.4s\n" - "fmla v12.4s, v17.4s, v2.4s\n" - "fmla v12.4s, v16.4s, v4.4s\n" - "fmla v12.4s, v18.4s, v6.4s\n" - "beq 6f\n" - "5:\n" - "fmla v11.4s, v18.4s, v8.4s\n" - "ldr s19, [x23, x19]\n" - "fmla v10.4s, v14.4s, v5.4s\n" - "ldr s20, [%[inptr0], x20]\n" - "fmla v12.4s, v15.4s, v1.4s\n" - "ldr s14, [x26]\n" - "fmla v11.4s, v19.4s, v5.4s\n" - "ldr s13, [x25, %[input_col_stride1]]\n" - "fmla v10.4s, v15.4s, v7.4s\n" - "ldr s17, [x24, x19]\n" - "fmla v12.4s, v19.4s, v3.4s\n" - "ldr s19, [x23, x20]\n" - "fmla v11.4s, v20.4s, v7.4s\n" - "ldr s18, [%[inptr0], x21]\n" - "fmla v10.4s, v14.4s, v2.4s\n" - "ldr s16, [x26, %[input_col_stride1]]\n" - "fmla v12.4s, v17.4s, v0.4s\n" - "ldr s14, [x25, x19]\n" - "fmla v11.4s, v17.4s, v2.4s\n" - "ldr s15, [x24, x20]\n" - "fmla v10.4s, v13.4s, v4.4s\n" - "ldr s13, [x23, x21]\n" - "str s12, [%[outptr0]]\n" - "fmla v9.4s, v17.4s, v8.4s\n" - "fmla v11.4s, v19.4s, v4.4s\n" - "ldr s12, [x26, x19]\n" - "fmla v10.4s, v17.4s, v6.4s\n" - "ldr s20, [x25, x20]\n" - "fmla v9.4s, v14.4s, v5.4s\n" - "ldr s17, [x24, x21]\n" - "fmla v11.4s, v18.4s, v6.4s\n" - "ldr s19, [x26, x20]\n" - "fmla v10.4s, v16.4s, v1.4s\n" - "ldr s18, [x25, x21]\n" - "fmla v9.4s, v15.4s, v7.4s\n" - "ldr s16, [x26, x21]\n" - "fmla v11.4s, v15.4s, v1.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v10.4s, v14.4s, v3.4s\n" - "ldr s14, [%[wbptr]]\n" - "fmla v9.4s, v12.4s, v2.4s\n" - "ldr s8, [%[wbptr], #4]\n" - "fmla v11.4s, v13.4s, v3.4s\n" - "ldr s7, [%[wbptr], #8]\n" - "fmla v10.4s, v12.4s, v0.4s\n" - "ldr s5, [%[wbptr], #16]\n" - "fmla v9.4s, v20.4s, v4.4s\n" - "ldr s2, [%[wbptr], #28]\n" - "fmla v11.4s, v17.4s, v0.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "str s10, [x22]\n" - "mov v12.16b, v14.16b\n" - "fmla v9.4s, v17.4s, v6.4s\n" - "ldr s4, [%[wbptr], #20]\n" - "str s11, [%[outptr0], %[output_col_stride1]]\n" - "mov v10.16b, v14.16b\n" - "mov v11.16b, v14.16b\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "ldr s6, [%[wbptr], #12]\n" - "ldr s15, [%[inptr0]]\n" - "add x23, x23, #4\n" - "fmla v12.4s, v15.4s, v8.4s\n" - "ldr s20, [x23]\n" - "fmla v9.4s, v18.4s, v3.4s\n" - "ldr s1, [%[wbptr], #32]\n" - "ldr s13, [%[inptr0], %[input_col_stride1]]\n" - "add x24, x24, #4\n" - "fmla v12.4s, v20.4s, v5.4s\n" - "ldr s17, [x24]\n" - "fmla v9.4s, v16.4s, v0.4s\n" - "ldr s3, [%[wbptr], #24]\n" - "fmla v10.4s, v17.4s, v8.4s\n" - "ldr s16, [x23, %[input_col_stride1]]\n" - "fmla v12.4s, v13.4s, v7.4s\n" - "ldr s18, [%[inptr0], x19]\n" - "str s9, [x22, %[output_col_stride1]]\n" - "add x25, x25, #4\n" - "mov v9.16b, v14.16b\n" - "ldr s0, [%[wbptr], #36]\n" - "fmla v12.4s, v17.4s, v2.4s\n" - "ldr s14, [x25]\n" - "ldr s15, [x24, %[input_col_stride1]]\n" - "add x26, x26, #4\n" - "add %[outptr0], %[outptr0], #4\n" - "add x22, x22, #4\n" - "subs x27, x27, #1\n" - "fmla v12.4s, v16.4s, v4.4s\n" - "fmla v12.4s, v18.4s, v6.4s\n" - "bne 5b\n" - "6:\n" - "fmla v11.4s, v18.4s, v8.4s\n" - "ldr s19, [x23, x19]\n" - "fmla v10.4s, v14.4s, v5.4s\n" - "ldr s20, [%[inptr0], x20]\n" - "fmla v12.4s, v15.4s, v1.4s\n" - "ldr s14, [x26]\n" - "fmla v11.4s, v19.4s, v5.4s\n" - "ldr s13, [x25, %[input_col_stride1]]\n" - "fmla v10.4s, v15.4s, v7.4s\n" - "ldr s17, [x24, x19]\n" - "fmla v12.4s, v19.4s, v3.4s\n" - "ldr s19, [x23, x20]\n" - "fmla v11.4s, v20.4s, v7.4s\n" - "ldr s18, [%[inptr0], x21]\n" - "fmla v10.4s, v14.4s, v2.4s\n" - "ldr s16, [x26, %[input_col_stride1]]\n" - "fmla v12.4s, v17.4s, v0.4s\n" - "ldr s14, [x25, x19]\n" - "fmla v11.4s, v17.4s, v2.4s\n" - "ldr s15, [x24, x20]\n" - "fmla v10.4s, v13.4s, v4.4s\n" - "ldr s13, [x23, x21]\n" - "str s12, [%[outptr0]]\n" - "fmla v9.4s, v17.4s, v8.4s\n" - "fmla v11.4s, v19.4s, v4.4s\n" - "ldr s12, [x26, x19]\n" - "fmla v10.4s, v17.4s, v6.4s\n" - "ldr s20, [x25, x20]\n" - "fmla v9.4s, v14.4s, v5.4s\n" - "ldr s17, [x24, x21]\n" - "fmla v11.4s, v18.4s, v6.4s\n" - "ldr s19, [x26, x20]\n" - "fmla v10.4s, v16.4s, v1.4s\n" - "ldr s18, [x25, x21]\n" - "fmla v9.4s, v15.4s, v7.4s\n" - "ldr s16, [x26, x21]\n" - "fmla v11.4s, v15.4s, v1.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v10.4s, v14.4s, v3.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v9.4s, v12.4s, v2.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v11.4s, v13.4s, v3.4s\n" - "add x23, x23, #4\n" - "fmla v10.4s, v12.4s, v0.4s\n" - "add x24, x24, #4\n" - "fmla v9.4s, v20.4s, v4.4s\n" - "add x25, x25, #4\n" - "fmla v11.4s, v17.4s, v0.4s\n" - "add x26, x26, #4\n" - "str s10, [x22]\n" - "fmla v9.4s, v17.4s, v6.4s\n" - "str s11, [%[outptr0], %[output_col_stride1]]\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "fmla v9.4s, v18.4s, v3.4s\n" - "fmla v9.4s, v16.4s, v0.4s\n" - "str s9, [x22, %[output_col_stride1]]\n" - "add x22, x22, #4\n" - "7:\n" - : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr) - : [n_channels] "r" ((long) n_channels), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile<ActivationFunction::None>( - int n_channels, - const void *weight_bias_ptr, - const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - float *outptrs[Base::output_tile_rows][Base::output_tile_cols] -) -{ - __asm __volatile( - "mov x23, xzr\n" - "mov x24, xzr\n" - "and x25, %[n_channels], #3\n" - "lsr x26, %[n_channels], #2\n" - "cbz x26, 4f\n" - "1:\n" - "ldr q13, [%[wbptr]]\n" - "ldr x19, [%[inptrs], 0]\n" - "mov v10.16b, v13.16b\n" - "ldr q12, [%[wbptr], #16]\n" - "mov v8.16b, v13.16b\n" - "ldr q6, [%[wbptr], #32]\n" - "mov v9.16b, v13.16b\n" - "ldr q5, [%[wbptr], #48]\n" - "mov v7.16b, v13.16b\n" - "ldr q11, [%[wbptr], #64]\n" - "ldr q4, [%[wbptr], #80]\n" - "ldr x20, [%[inptrs], 40]\n" - "ldr q3, [%[wbptr], #96]\n" - "ldr x21, [%[inptrs], 80]\n" - "ldr q2, [%[wbptr], #112]\n" - "ldr x27, [%[inptrs], 120]\n" - "ldr q1, [%[wbptr], #128]\n" - "subs x26, x26, #1\n" - "ldr q0, [%[wbptr], #144]\n" - "ldr q14, [x19, x23]\n" - "fmla v10.4s, v14.4s, v12.4s\n" - "ldr q18, [x20, x23]\n" - "ldr q14, [x21, x23]\n" - "ldr x19, [%[inptrs], 8]\n" - "ldr q16, [x27, x23]\n" - "ldr x20, [%[inptrs], 48]\n" - "ldr q19, [x19, x23]\n" - "ldr x21, [%[inptrs], 88]\n" - "fmla v10.4s, v18.4s, v11.4s\n" - "ldr q15, [x20, x23]\n" - "ldr q18, [x21, x23]\n" - "ldr x19, [%[inptrs], 16]\n" - "ldr q13, [x19, x23]\n" - "fmla v10.4s, v19.4s, v6.4s\n" - "fmla v10.4s, v14.4s, v2.4s\n" - "beq 3f\n" - "2:\n" - "fmla v8.4s, v14.4s, v12.4s\n" - "ldr x20, [%[inptrs], 56]\n" - "fmla v10.4s, v15.4s, v4.4s\n" - "ldr x19, [%[inptrs], 24]\n" - "fmla v9.4s, v13.4s, v12.4s\n" - "ldr q14, [x20, x23]\n" - "ldr q17, [x19, x23]\n" - "ldr x22, [%[inptrs], 160]\n" - "fmla v8.4s, v16.4s, v11.4s\n" - "ldr x27, [%[inptrs], 128]\n" - "fmla v10.4s, v13.4s, v5.4s\n" - "ldr q15, [x22, x23]\n" - "fmla v9.4s, v14.4s, v11.4s\n" - "ldr q19, [x27, x23]\n" - "ldr x21, [%[inptrs], 96]\n" - "ldr x20, [%[inptrs], 64]\n" - "ldr x19, [%[inptrs], 32]\n" - "fmla v8.4s, v18.4s, v6.4s\n" - "ldr x22, [%[inptrs], 168]\n" - "fmla v10.4s, v18.4s, v1.4s\n" - "ldr q13, [x21, x23]\n" - "fmla v9.4s, v17.4s, v6.4s\n" - "ldr q18, [x20, x23]\n" - "fmla v7.4s, v13.4s, v12.4s\n" - "ldr q17, [x19, x23]\n" - "fmla v8.4s, v15.4s, v2.4s\n" - "ldr q15, [x22, x23]\n" - "fmla v10.4s, v14.4s, v3.4s\n" - "ldr x27, [%[inptrs], 136]\n" - "fmla v9.4s, v13.4s, v2.4s\n" - "ldr x21, [%[inptrs], 104]\n" - "ldr q16, [x27, x23]\n" - "ldr x20, [%[inptrs], 72]\n" - "fmla v8.4s, v19.4s, v4.4s\n" - "ldr q19, [x21, x23]\n" - "fmla v10.4s, v13.4s, v0.4s\n" - "ldr q12, [x20, x23]\n" - "fmla v9.4s, v18.4s, v4.4s\n" - "ldr x22, [%[inptrs], 176]\n" - "fmla v7.4s, v16.4s, v11.4s\n" - "ldr x27, [%[inptrs], 144]\n" - "fmla v8.4s, v13.4s, v5.4s\n" - "ldr q11, [x22, x23]\n" - "ldr q13, [x27, x23]\n" - "ldr x21, [%[inptrs], 112]\n" - "fmla v9.4s, v17.4s, v5.4s\n" - "ldr x22, [%[inptrs], 184]\n" - "fmla v7.4s, v19.4s, v6.4s\n" - "ldr q14, [x21, x23]\n" - "fmla v8.4s, v15.4s, v1.4s\n" - "ldr q17, [x22, x23]\n" - "ldr x27, [%[inptrs], 152]\n" - "ldr x22, [%[inptrs], 192]\n" - "ldr x21, [%[outptrs], 0]\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "ldr x28, [%[outptrs], 16]\n" - "str q10, [x21, x24]\n" - "fmla v7.4s, v11.4s, v2.4s\n" - "fmla v8.4s, v16.4s, v3.4s\n" - "ldr q16, [x27, x23]\n" - "ldr q15, [x22, x23]\n" - "ldr x21, [%[outptrs], 8]\n" - "fmla v9.4s, v12.4s, v3.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v7.4s, v13.4s, v4.4s\n" - "ldr q13, [%[wbptr]]\n" - "fmla v8.4s, v11.4s, v0.4s\n" - "ldr q12, [%[wbptr], #16]\n" - "mov v10.16b, v13.16b\n" - "ldr q6, [%[wbptr], #32]\n" - "fmla v9.4s, v14.4s, v0.4s\n" - "ldr q11, [%[wbptr], #64]\n" - "fmla v7.4s, v14.4s, v5.4s\n" - "ldr q4, [%[wbptr], #80]\n" - "str q8, [x28, x24]\n" - "add x23, x23, #16\n" - "mov v8.16b, v13.16b\n" - "ldr q2, [%[wbptr], #112]\n" - "str q9, [x21, x24]\n" - "ldr x28, [%[outptrs], 24]\n" - "fmla v7.4s, v17.4s, v1.4s\n" - "ldr q5, [%[wbptr], #48]\n" - "mov v9.16b, v13.16b\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "ldr x19, [%[inptrs], 0]\n" - "ldr x20, [%[inptrs], 40]\n" - "ldr x21, [%[inptrs], 80]\n" - "ldr x27, [%[inptrs], 120]\n" - "subs x26, x26, #1\n" - "fmla v7.4s, v16.4s, v3.4s\n" - "ldr q1, [%[wbptr], #128]\n" - "ldr q14, [x19, x23]\n" - "fmla v10.4s, v14.4s, v12.4s\n" - "ldr q18, [x20, x23]\n" - "ldr q14, [x21, x23]\n" - "ldr x19, [%[inptrs], 8]\n" - "fmla v7.4s, v15.4s, v0.4s\n" - "ldr q3, [%[wbptr], #96]\n" - "ldr q19, [x19, x23]\n" - "ldr x20, [%[inptrs], 48]\n" - "fmla v10.4s, v18.4s, v11.4s\n" - "ldr q16, [x27, x23]\n" - "ldr q15, [x20, x23]\n" - "ldr x19, [%[inptrs], 16]\n" - "str q7, [x28, x24]\n" - "ldr x21, [%[inptrs], 88]\n" - "mov v7.16b, v13.16b\n" - "ldr q0, [%[wbptr], #144]\n" - "fmla v10.4s, v19.4s, v6.4s\n" - "ldr q13, [x19, x23]\n" - "ldr q18, [x21, x23]\n" - "add x24, x24, #16\n" - "fmla v10.4s, v14.4s, v2.4s\n" - "bne 2b\n" - "3:\n" - "fmla v8.4s, v14.4s, v12.4s\n" - "ldr x20, [%[inptrs], 56]\n" - "fmla v10.4s, v15.4s, v4.4s\n" - "ldr x19, [%[inptrs], 24]\n" - "fmla v9.4s, v13.4s, v12.4s\n" - "ldr q14, [x20, x23]\n" - "ldr q17, [x19, x23]\n" - "ldr x22, [%[inptrs], 160]\n" - "fmla v8.4s, v16.4s, v11.4s\n" - "ldr x27, [%[inptrs], 128]\n" - "fmla v10.4s, v13.4s, v5.4s\n" - "ldr q15, [x22, x23]\n" - "fmla v9.4s, v14.4s, v11.4s\n" - "ldr q19, [x27, x23]\n" - "ldr x21, [%[inptrs], 96]\n" - "ldr x20, [%[inptrs], 64]\n" - "ldr x19, [%[inptrs], 32]\n" - "fmla v8.4s, v18.4s, v6.4s\n" - "ldr x22, [%[inptrs], 168]\n" - "fmla v10.4s, v18.4s, v1.4s\n" - "ldr q13, [x21, x23]\n" - "fmla v9.4s, v17.4s, v6.4s\n" - "ldr q18, [x20, x23]\n" - "fmla v7.4s, v13.4s, v12.4s\n" - "ldr q17, [x19, x23]\n" - "fmla v8.4s, v15.4s, v2.4s\n" - "ldr q15, [x22, x23]\n" - "fmla v10.4s, v14.4s, v3.4s\n" - "ldr x27, [%[inptrs], 136]\n" - "fmla v9.4s, v13.4s, v2.4s\n" - "ldr x21, [%[inptrs], 104]\n" - "ldr q16, [x27, x23]\n" - "ldr x20, [%[inptrs], 72]\n" - "fmla v8.4s, v19.4s, v4.4s\n" - "ldr q19, [x21, x23]\n" - "fmla v10.4s, v13.4s, v0.4s\n" - "ldr q12, [x20, x23]\n" - "fmla v9.4s, v18.4s, v4.4s\n" - "ldr x22, [%[inptrs], 176]\n" - "fmla v7.4s, v16.4s, v11.4s\n" - "ldr x27, [%[inptrs], 144]\n" - "fmla v8.4s, v13.4s, v5.4s\n" - "ldr q11, [x22, x23]\n" - "ldr q13, [x27, x23]\n" - "ldr x21, [%[inptrs], 112]\n" - "fmla v9.4s, v17.4s, v5.4s\n" - "ldr x22, [%[inptrs], 184]\n" - "fmla v7.4s, v19.4s, v6.4s\n" - "ldr q14, [x21, x23]\n" - "fmla v8.4s, v15.4s, v1.4s\n" - "ldr q17, [x22, x23]\n" - "ldr x27, [%[inptrs], 152]\n" - "ldr x22, [%[inptrs], 192]\n" - "ldr x21, [%[outptrs], 0]\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "ldr x28, [%[outptrs], 16]\n" - "str q10, [x21, x24]\n" - "fmla v7.4s, v11.4s, v2.4s\n" - "fmla v8.4s, v16.4s, v3.4s\n" - "ldr q16, [x27, x23]\n" - "ldr q15, [x22, x23]\n" - "ldr x21, [%[outptrs], 8]\n" - "fmla v9.4s, v12.4s, v3.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v7.4s, v13.4s, v4.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v8.4s, v11.4s, v0.4s\n" - "add x23, x23, #16\n" - "fmla v9.4s, v14.4s, v0.4s\n" - "fmla v7.4s, v14.4s, v5.4s\n" - "str q8, [x28, x24]\n" - "ldr x28, [%[outptrs], 24]\n" - "str q9, [x21, x24]\n" - "fmla v7.4s, v17.4s, v1.4s\n" - "fmla v7.4s, v16.4s, v3.4s\n" - "fmla v7.4s, v15.4s, v0.4s\n" - "str q7, [x28, x24]\n" - "add x24, x24, #16\n" - "4:\n" - "cbz x25, 7f\n" - "ldr s13, [%[wbptr]]\n" - "mov v10.16b, v13.16b\n" - "ldr s12, [%[wbptr], #4]\n" - "mov v8.16b, v13.16b\n" - "ldr s6, [%[wbptr], #8]\n" - "mov v9.16b, v13.16b\n" - "ldr s5, [%[wbptr], #12]\n" - "mov v7.16b, v13.16b\n" - "ldr s11, [%[wbptr], #16]\n" - "ldr s4, [%[wbptr], #20]\n" - "ldr x19, [%[inptrs], 0]\n" - "ldr s3, [%[wbptr], #24]\n" - "ldr x20, [%[inptrs], 40]\n" - "ldr s2, [%[wbptr], #28]\n" - "ldr x21, [%[inptrs], 80]\n" - "ldr s1, [%[wbptr], #32]\n" - "ldr x27, [%[inptrs], 120]\n" - "ldr s0, [%[wbptr], #36]\n" - "subs x25, x25, #1\n" - "ldr s14, [x19, x23]\n" - "ldr s18, [x20, x23]\n" - "fmla v10.4s, v14.4s, v12.4s\n" - "ldr s14, [x21, x23]\n" - "ldr s16, [x27, x23]\n" - "ldr x19, [%[inptrs], 8]\n" - "ldr x20, [%[inptrs], 48]\n" - "ldr x21, [%[inptrs], 88]\n" - "ldr s19, [x19, x23]\n" - "fmla v10.4s, v18.4s, v11.4s\n" - "ldr s15, [x20, x23]\n" - "ldr s18, [x21, x23]\n" - "ldr x19, [%[inptrs], 16]\n" - "ldr s13, [x19, x23]\n" - "fmla v10.4s, v19.4s, v6.4s\n" - "fmla v10.4s, v14.4s, v2.4s\n" - "beq 6f\n" - "5:\n" - "fmla v8.4s, v14.4s, v12.4s\n" - "ldr x20, [%[inptrs], 56]\n" - "fmla v10.4s, v15.4s, v4.4s\n" - "ldr x19, [%[inptrs], 24]\n" - "fmla v9.4s, v13.4s, v12.4s\n" - "ldr s14, [x20, x23]\n" - "ldr s17, [x19, x23]\n" - "ldr x22, [%[inptrs], 160]\n" - "fmla v8.4s, v16.4s, v11.4s\n" - "ldr x27, [%[inptrs], 128]\n" - "fmla v10.4s, v13.4s, v5.4s\n" - "ldr s15, [x22, x23]\n" - "fmla v9.4s, v14.4s, v11.4s\n" - "ldr s19, [x27, x23]\n" - "ldr x21, [%[inptrs], 96]\n" - "ldr x20, [%[inptrs], 64]\n" - "ldr x19, [%[inptrs], 32]\n" - "fmla v8.4s, v18.4s, v6.4s\n" - "ldr x22, [%[inptrs], 168]\n" - "fmla v10.4s, v18.4s, v1.4s\n" - "ldr s13, [x21, x23]\n" - "fmla v9.4s, v17.4s, v6.4s\n" - "ldr s18, [x20, x23]\n" - "fmla v7.4s, v13.4s, v12.4s\n" - "ldr s17, [x19, x23]\n" - "fmla v8.4s, v15.4s, v2.4s\n" - "ldr s15, [x22, x23]\n" - "fmla v10.4s, v14.4s, v3.4s\n" - "ldr x27, [%[inptrs], 136]\n" - "fmla v9.4s, v13.4s, v2.4s\n" - "ldr x21, [%[inptrs], 104]\n" - "ldr s16, [x27, x23]\n" - "ldr x20, [%[inptrs], 72]\n" - "fmla v8.4s, v19.4s, v4.4s\n" - "ldr s19, [x21, x23]\n" - "fmla v10.4s, v13.4s, v0.4s\n" - "ldr s12, [x20, x23]\n" - "fmla v9.4s, v18.4s, v4.4s\n" - "ldr x22, [%[inptrs], 176]\n" - "fmla v7.4s, v16.4s, v11.4s\n" - "ldr x27, [%[inptrs], 144]\n" - "fmla v8.4s, v13.4s, v5.4s\n" - "ldr s11, [x22, x23]\n" - "ldr s13, [x27, x23]\n" - "ldr x21, [%[inptrs], 112]\n" - "fmla v9.4s, v17.4s, v5.4s\n" - "ldr x22, [%[inptrs], 184]\n" - "fmla v7.4s, v19.4s, v6.4s\n" - "ldr s14, [x21, x23]\n" - "fmla v8.4s, v15.4s, v1.4s\n" - "ldr s17, [x22, x23]\n" - "ldr x27, [%[inptrs], 152]\n" - "ldr x22, [%[inptrs], 192]\n" - "ldr x21, [%[outptrs], 0]\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "ldr x28, [%[outptrs], 16]\n" - "str s10, [x21, x24]\n" - "fmla v7.4s, v11.4s, v2.4s\n" - "fmla v8.4s, v16.4s, v3.4s\n" - "ldr s16, [x27, x23]\n" - "ldr s15, [x22, x23]\n" - "ldr x21, [%[outptrs], 8]\n" - "fmla v9.4s, v12.4s, v3.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v7.4s, v13.4s, v4.4s\n" - "ldr s13, [%[wbptr]]\n" - "fmla v8.4s, v11.4s, v0.4s\n" - "ldr s12, [%[wbptr], #4]\n" - "mov v10.16b, v13.16b\n" - "ldr s6, [%[wbptr], #8]\n" - "fmla v9.4s, v14.4s, v0.4s\n" - "ldr s11, [%[wbptr], #16]\n" - "fmla v7.4s, v14.4s, v5.4s\n" - "ldr s4, [%[wbptr], #20]\n" - "str s8, [x28, x24]\n" - "add x23, x23, #4\n" - "mov v8.16b, v13.16b\n" - "ldr s2, [%[wbptr], #28]\n" - "str s9, [x21, x24]\n" - "ldr x28, [%[outptrs], 24]\n" - "fmla v7.4s, v17.4s, v1.4s\n" - "ldr s5, [%[wbptr], #12]\n" - "mov v9.16b, v13.16b\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "ldr x19, [%[inptrs], 0]\n" - "ldr x20, [%[inptrs], 40]\n" - "ldr x21, [%[inptrs], 80]\n" - "ldr x27, [%[inptrs], 120]\n" - "subs x25, x25, #1\n" - "fmla v7.4s, v16.4s, v3.4s\n" - "ldr s1, [%[wbptr], #32]\n" - "ldr s14, [x19, x23]\n" - "fmla v10.4s, v14.4s, v12.4s\n" - "ldr s18, [x20, x23]\n" - "ldr s14, [x21, x23]\n" - "ldr x19, [%[inptrs], 8]\n" - "fmla v7.4s, v15.4s, v0.4s\n" - "ldr s3, [%[wbptr], #24]\n" - "ldr s19, [x19, x23]\n" - "ldr x20, [%[inptrs], 48]\n" - "fmla v10.4s, v18.4s, v11.4s\n" - "ldr s16, [x27, x23]\n" - "ldr s15, [x20, x23]\n" - "ldr x19, [%[inptrs], 16]\n" - "str s7, [x28, x24]\n" - "ldr x21, [%[inptrs], 88]\n" - "mov v7.16b, v13.16b\n" - "ldr s0, [%[wbptr], #36]\n" - "fmla v10.4s, v19.4s, v6.4s\n" - "ldr s13, [x19, x23]\n" - "ldr s18, [x21, x23]\n" - "add x24, x24, #4\n" - "fmla v10.4s, v14.4s, v2.4s\n" - "bne 5b\n" - "6:\n" - "fmla v8.4s, v14.4s, v12.4s\n" - "ldr x20, [%[inptrs], 56]\n" - "fmla v10.4s, v15.4s, v4.4s\n" - "ldr x19, [%[inptrs], 24]\n" - "fmla v9.4s, v13.4s, v12.4s\n" - "ldr s14, [x20, x23]\n" - "ldr s17, [x19, x23]\n" - "ldr x22, [%[inptrs], 160]\n" - "fmla v8.4s, v16.4s, v11.4s\n" - "ldr x27, [%[inptrs], 128]\n" - "fmla v10.4s, v13.4s, v5.4s\n" - "ldr s15, [x22, x23]\n" - "fmla v9.4s, v14.4s, v11.4s\n" - "ldr s19, [x27, x23]\n" - "ldr x21, [%[inptrs], 96]\n" - "ldr x20, [%[inptrs], 64]\n" - "ldr x19, [%[inptrs], 32]\n" - "fmla v8.4s, v18.4s, v6.4s\n" - "ldr x22, [%[inptrs], 168]\n" - "fmla v10.4s, v18.4s, v1.4s\n" - "ldr s13, [x21, x23]\n" - "fmla v9.4s, v17.4s, v6.4s\n" - "ldr s18, [x20, x23]\n" - "fmla v7.4s, v13.4s, v12.4s\n" - "ldr s17, [x19, x23]\n" - "fmla v8.4s, v15.4s, v2.4s\n" - "ldr s15, [x22, x23]\n" - "fmla v10.4s, v14.4s, v3.4s\n" - "ldr x27, [%[inptrs], 136]\n" - "fmla v9.4s, v13.4s, v2.4s\n" - "ldr x21, [%[inptrs], 104]\n" - "ldr s16, [x27, x23]\n" - "ldr x20, [%[inptrs], 72]\n" - "fmla v8.4s, v19.4s, v4.4s\n" - "ldr s19, [x21, x23]\n" - "fmla v10.4s, v13.4s, v0.4s\n" - "ldr s12, [x20, x23]\n" - "fmla v9.4s, v18.4s, v4.4s\n" - "ldr x22, [%[inptrs], 176]\n" - "fmla v7.4s, v16.4s, v11.4s\n" - "ldr x27, [%[inptrs], 144]\n" - "fmla v8.4s, v13.4s, v5.4s\n" - "ldr s11, [x22, x23]\n" - "ldr s13, [x27, x23]\n" - "ldr x21, [%[inptrs], 112]\n" - "fmla v9.4s, v17.4s, v5.4s\n" - "ldr x22, [%[inptrs], 184]\n" - "fmla v7.4s, v19.4s, v6.4s\n" - "ldr s14, [x21, x23]\n" - "fmla v8.4s, v15.4s, v1.4s\n" - "ldr s17, [x22, x23]\n" - "ldr x27, [%[inptrs], 152]\n" - "ldr x22, [%[inptrs], 192]\n" - "ldr x21, [%[outptrs], 0]\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "ldr x28, [%[outptrs], 16]\n" - "str s10, [x21, x24]\n" - "fmla v7.4s, v11.4s, v2.4s\n" - "fmla v8.4s, v16.4s, v3.4s\n" - "ldr s16, [x27, x23]\n" - "ldr s15, [x22, x23]\n" - "ldr x21, [%[outptrs], 8]\n" - "fmla v9.4s, v12.4s, v3.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v7.4s, v13.4s, v4.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v8.4s, v11.4s, v0.4s\n" - "add x23, x23, #4\n" - "fmla v9.4s, v14.4s, v0.4s\n" - "fmla v7.4s, v14.4s, v5.4s\n" - "str s8, [x28, x24]\n" - "ldr x28, [%[outptrs], 24]\n" - "str s9, [x21, x24]\n" - "fmla v7.4s, v17.4s, v1.4s\n" - "fmla v7.4s, v16.4s, v3.4s\n" - "fmla v7.4s, v15.4s, v0.4s\n" - "str s7, [x28, x24]\n" - "add x24, x24, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr) - : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile<ActivationFunction::ReLU>( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x24, %[inptr0], %[input_row_stride]\n" - "add x27, %[input_col_stride1], %[input_col_stride1]\n" - "add x19, %[outptr0], %[output_row_stride]\n" - "add x25, x24, %[input_row_stride]\n" - "add x23, x27, %[input_col_stride1]\n" - "and x20, %[n_channels], #3\n" - "add x28, x25, %[input_row_stride]\n" - "add x22, x23, %[input_col_stride1]\n" - "lsr x21, %[n_channels], #2\n" - "add x26, x28, %[input_row_stride]\n" - "cbz x21, 4f\n" - "1:\n" - "ldr q16, [%[wbptr]]\n" - "subs x21, x21, #1\n" - "mov v3.16b, v16.16b\n" - "ldr q4, [%[wbptr], #16]\n" - "mov v1.16b, v16.16b\n" - "ldr q5, [%[wbptr], #32]\n" - "mov v2.16b, v16.16b\n" - "ldr q12, [%[wbptr], #48]\n" - "mov v0.16b, v16.16b\n" - "ldr q11, [%[wbptr], #64]\n" - "ldr q10, [%[wbptr], #80]\n" - "ldr q6, [%[wbptr], #96]\n" - "ldr q9, [%[wbptr], #112]\n" - "ldr q8, [%[wbptr], #128]\n" - "ldr q7, [%[wbptr], #144]\n" - "ldr q21, [%[inptr0]]\n" - "fmla v3.4s, v21.4s, v4.4s\n" - "ldr q23, [x24]\n" - "ldr q19, [%[inptr0], %[input_col_stride1]]\n" - "ldr q14, [x25]\n" - "fmla v1.4s, v14.4s, v4.4s\n" - "ldr q13, [x24, %[input_col_stride1]]\n" - "fmla v3.4s, v23.4s, v11.4s\n" - "ldr q18, [%[inptr0], x27]\n" - "ldr q15, [x28]\n" - "ldr q22, [x25, %[input_col_stride1]]\n" - "fmla v3.4s, v19.4s, v5.4s\n" - "fmla v3.4s, v14.4s, v9.4s\n" - "beq 3f\n" - "2:\n" - "fmla v3.4s, v13.4s, v10.4s\n" - "ldr q17, [x24, x27]\n" - "fmla v2.4s, v18.4s, v4.4s\n" - "ldr q20, [%[inptr0], x23]\n" - "fmla v1.4s, v15.4s, v11.4s\n" - "ldr q19, [x26]\n" - "fmla v3.4s, v18.4s, v12.4s\n" - "ldr q13, [x28, %[input_col_stride1]]\n" - "fmla v2.4s, v17.4s, v11.4s\n" - "ldr q14, [x25, x27]\n" - "fmla v1.4s, v22.4s, v5.4s\n" - "ldr q15, [x24, x23]\n" - "fmla v3.4s, v22.4s, v8.4s\n" - "ldr q16, [%[inptr0], x22]\n" - "fmla v2.4s, v20.4s, v5.4s\n" - "ldr q20, [x26, %[input_col_stride1]]\n" - "fmla v1.4s, v19.4s, v9.4s\n" - "ldr q19, [x28, x27]\n" - "fmla v3.4s, v17.4s, v6.4s\n" - "ldr q21, [x25, x23]\n" - "fmla v2.4s, v14.4s, v9.4s\n" - "ldr q22, [x24, x22]\n" - "fmla v1.4s, v13.4s, v10.4s\n" - "ldr q23, [x26, x27]\n" - "fmla v3.4s, v14.4s, v7.4s\n" - "ldr q18, [x28, x23]\n" - "fmla v0.4s, v14.4s, v4.4s\n" - "ldr q13, [x25, x22]\n" - "fmla v1.4s, v14.4s, v12.4s\n" - "ldr q14, [x26, x23]\n" - "fmla v2.4s, v15.4s, v10.4s\n" - "ldr q17, [x28, x22]\n" - "fmla v0.4s, v19.4s, v11.4s\n" - "ldr q15, [x26, x22]\n" - "fmla v1.4s, v20.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v2.4s, v16.4s, v12.4s\n" - "ldr q16, [%[wbptr]]\n" - "fmla v0.4s, v21.4s, v5.4s\n" - "ldr q4, [%[wbptr], #16]\n" - "fmla v1.4s, v19.4s, v6.4s\n" - "ldr q11, [%[wbptr], #64]\n" - "fmla v2.4s, v21.4s, v8.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v23.4s, v9.4s\n" - "ldr q5, [%[wbptr], #32]\n" - "fmla v1.4s, v23.4s, v7.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v2.4s, v22.4s, v6.4s\n" - "ldr q21, [%[inptr0]]\n" - "fmla v0.4s, v18.4s, v10.4s\n" - "ldr q9, [%[wbptr], #112]\n" - "movi v20.16b, #0\n" - "ldr q19, [%[inptr0], %[input_col_stride1]]\n" - "fmla v2.4s, v13.4s, v7.4s\n" - "ldr q18, [%[inptr0], x27]\n" - "fmla v0.4s, v13.4s, v12.4s\n" - "ldr q10, [%[wbptr], #80]\n" - "fmax v3.4s, v3.4s, v20.4s\n" - "add x24, x24, #16\n" - "fmax v2.4s, v2.4s, v20.4s\n" - "ldr q23, [x24]\n" - "str q3, [%[outptr0]]\n" - "fmla v0.4s, v14.4s, v8.4s\n" - "str q2, [%[outptr0], %[output_col_stride1]]\n" - "fmax v1.4s, v1.4s, v20.4s\n" - "mov v3.16b, v16.16b\n" - "ldr q12, [%[wbptr], #48]\n" - "str q1, [x19]\n" - "fmla v0.4s, v17.4s, v6.4s\n" - "mov v1.16b, v16.16b\n" - "ldr q8, [%[wbptr], #128]\n" - "mov v2.16b, v16.16b\n" - "ldr q13, [x24, %[input_col_stride1]]\n" - "fmla v0.4s, v15.4s, v7.4s\n" - "ldr q6, [%[wbptr], #96]\n" - "fmla v3.4s, v21.4s, v4.4s\n" - "add x25, x25, #16\n" - "ldr q14, [x25]\n" - "add x28, x28, #16\n" - "fmax v0.4s, v0.4s, v20.4s\n" - "ldr q7, [%[wbptr], #144]\n" - "fmla v3.4s, v23.4s, v11.4s\n" - "ldr q15, [x28]\n" - "str q0, [x19, %[output_col_stride1]]\n" - "fmla v1.4s, v14.4s, v4.4s\n" - "mov v0.16b, v16.16b\n" - "ldr q22, [x25, %[input_col_stride1]]\n" - "fmla v3.4s, v19.4s, v5.4s\n" - "add x26, x26, #16\n" - "add %[outptr0], %[outptr0], #16\n" - "add x19, x19, #16\n" - "subs x21, x21, #1\n" - "fmla v3.4s, v14.4s, v9.4s\n" - "bne 2b\n" - "3:\n" - "fmla v3.4s, v13.4s, v10.4s\n" - "ldr q17, [x24, x27]\n" - "fmla v2.4s, v18.4s, v4.4s\n" - "ldr q20, [%[inptr0], x23]\n" - "fmla v1.4s, v15.4s, v11.4s\n" - "ldr q19, [x26]\n" - "fmla v3.4s, v18.4s, v12.4s\n" - "ldr q13, [x28, %[input_col_stride1]]\n" - "fmla v2.4s, v17.4s, v11.4s\n" - "ldr q14, [x25, x27]\n" - "fmla v1.4s, v22.4s, v5.4s\n" - "ldr q15, [x24, x23]\n" - "fmla v3.4s, v22.4s, v8.4s\n" - "ldr q16, [%[inptr0], x22]\n" - "fmla v2.4s, v20.4s, v5.4s\n" - "ldr q20, [x26, %[input_col_stride1]]\n" - "fmla v1.4s, v19.4s, v9.4s\n" - "ldr q19, [x28, x27]\n" - "fmla v3.4s, v17.4s, v6.4s\n" - "ldr q21, [x25, x23]\n" - "fmla v2.4s, v14.4s, v9.4s\n" - "ldr q22, [x24, x22]\n" - "fmla v1.4s, v13.4s, v10.4s\n" - "ldr q23, [x26, x27]\n" - "fmla v3.4s, v14.4s, v7.4s\n" - "ldr q18, [x28, x23]\n" - "fmla v0.4s, v14.4s, v4.4s\n" - "ldr q13, [x25, x22]\n" - "fmla v1.4s, v14.4s, v12.4s\n" - "ldr q14, [x26, x23]\n" - "fmla v2.4s, v15.4s, v10.4s\n" - "ldr q17, [x28, x22]\n" - "fmla v0.4s, v19.4s, v11.4s\n" - "ldr q15, [x26, x22]\n" - "fmla v1.4s, v20.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v2.4s, v16.4s, v12.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v21.4s, v5.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v1.4s, v19.4s, v6.4s\n" - "add x24, x24, #16\n" - "fmla v2.4s, v21.4s, v8.4s\n" - "add x25, x25, #16\n" - "fmla v0.4s, v23.4s, v9.4s\n" - "add x28, x28, #16\n" - "fmla v1.4s, v23.4s, v7.4s\n" - "add x26, x26, #16\n" - "fmla v2.4s, v22.4s, v6.4s\n" - "movi v20.16b, #0\n" - "fmla v0.4s, v18.4s, v10.4s\n" - "fmax v3.4s, v3.4s, v20.4s\n" - "fmla v2.4s, v13.4s, v7.4s\n" - "fmax v1.4s, v1.4s, v20.4s\n" - "str q3, [%[outptr0]]\n" - "fmla v0.4s, v13.4s, v12.4s\n" - "str q1, [x19]\n" - "fmax v2.4s, v2.4s, v20.4s\n" - "fmla v0.4s, v14.4s, v8.4s\n" - "str q2, [%[outptr0], %[output_col_stride1]]\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v0.4s, v17.4s, v6.4s\n" - "fmla v0.4s, v15.4s, v7.4s\n" - "fmax v0.4s, v0.4s, v20.4s\n" - "str q0, [x19, %[output_col_stride1]]\n" - "add x19, x19, #16\n" - "4:\n" - "cbz x20, 7f\n" - "ldr s16, [%[wbptr]]\n" - "mov v3.16b, v16.16b\n" - "ldr s4, [%[wbptr], #4]\n" - "mov v1.16b, v16.16b\n" - "ldr s5, [%[wbptr], #8]\n" - "mov v2.16b, v16.16b\n" - "ldr s12, [%[wbptr], #12]\n" - "mov v0.16b, v16.16b\n" - "ldr s11, [%[wbptr], #16]\n" - "ldr s10, [%[wbptr], #20]\n" - "subs x20, x20, #1\n" - "ldr s6, [%[wbptr], #24]\n" - "ldr s9, [%[wbptr], #28]\n" - "ldr s8, [%[wbptr], #32]\n" - "ldr s7, [%[wbptr], #36]\n" - "ldr s21, [%[inptr0]]\n" - "ldr s23, [x24]\n" - "fmla v3.4s, v21.4s, v4.4s\n" - "ldr s19, [%[inptr0], %[input_col_stride1]]\n" - "ldr s14, [x25]\n" - "ldr s13, [x24, %[input_col_stride1]]\n" - "fmla v1.4s, v14.4s, v4.4s\n" - "ldr s18, [%[inptr0], x27]\n" - "fmla v3.4s, v23.4s, v11.4s\n" - "ldr s15, [x28]\n" - "ldr s22, [x25, %[input_col_stride1]]\n" - "fmla v3.4s, v19.4s, v5.4s\n" - "fmla v3.4s, v14.4s, v9.4s\n" - "beq 6f\n" - "5:\n" - "fmla v3.4s, v13.4s, v10.4s\n" - "ldr s17, [x24, x27]\n" - "fmla v2.4s, v18.4s, v4.4s\n" - "ldr s20, [%[inptr0], x23]\n" - "fmla v1.4s, v15.4s, v11.4s\n" - "ldr s19, [x26]\n" - "fmla v3.4s, v18.4s, v12.4s\n" - "ldr s13, [x28, %[input_col_stride1]]\n" - "fmla v2.4s, v17.4s, v11.4s\n" - "ldr s14, [x25, x27]\n" - "fmla v1.4s, v22.4s, v5.4s\n" - "ldr s15, [x24, x23]\n" - "fmla v3.4s, v22.4s, v8.4s\n" - "ldr s16, [%[inptr0], x22]\n" - "fmla v2.4s, v20.4s, v5.4s\n" - "ldr s20, [x26, %[input_col_stride1]]\n" - "fmla v1.4s, v19.4s, v9.4s\n" - "ldr s19, [x28, x27]\n" - "fmla v3.4s, v17.4s, v6.4s\n" - "ldr s21, [x25, x23]\n" - "fmla v2.4s, v14.4s, v9.4s\n" - "ldr s22, [x24, x22]\n" - "fmla v1.4s, v13.4s, v10.4s\n" - "ldr s23, [x26, x27]\n" - "fmla v3.4s, v14.4s, v7.4s\n" - "ldr s18, [x28, x23]\n" - "fmla v0.4s, v14.4s, v4.4s\n" - "ldr s13, [x25, x22]\n" - "fmla v1.4s, v14.4s, v12.4s\n" - "ldr s14, [x26, x23]\n" - "fmla v2.4s, v15.4s, v10.4s\n" - "ldr s17, [x28, x22]\n" - "fmla v0.4s, v19.4s, v11.4s\n" - "ldr s15, [x26, x22]\n" - "fmla v1.4s, v20.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v2.4s, v16.4s, v12.4s\n" - "ldr s16, [%[wbptr]]\n" - "fmla v0.4s, v21.4s, v5.4s\n" - "ldr s4, [%[wbptr], #4]\n" - "fmla v1.4s, v19.4s, v6.4s\n" - "ldr s11, [%[wbptr], #16]\n" - "fmla v2.4s, v21.4s, v8.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v23.4s, v9.4s\n" - "ldr s5, [%[wbptr], #8]\n" - "fmla v1.4s, v23.4s, v7.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v2.4s, v22.4s, v6.4s\n" - "ldr s21, [%[inptr0]]\n" - "fmla v0.4s, v18.4s, v10.4s\n" - "ldr s9, [%[wbptr], #28]\n" - "movi v20.16b, #0\n" - "ldr s19, [%[inptr0], %[input_col_stride1]]\n" - "fmla v2.4s, v13.4s, v7.4s\n" - "ldr s18, [%[inptr0], x27]\n" - "fmla v0.4s, v13.4s, v12.4s\n" - "ldr s10, [%[wbptr], #20]\n" - "fmax v3.4s, v3.4s, v20.4s\n" - "add x24, x24, #4\n" - "fmax v2.4s, v2.4s, v20.4s\n" - "ldr s23, [x24]\n" - "str s3, [%[outptr0]]\n" - "fmla v0.4s, v14.4s, v8.4s\n" - "str s2, [%[outptr0], %[output_col_stride1]]\n" - "fmax v1.4s, v1.4s, v20.4s\n" - "mov v3.16b, v16.16b\n" - "ldr s12, [%[wbptr], #12]\n" - "str s1, [x19]\n" - "fmla v0.4s, v17.4s, v6.4s\n" - "mov v1.16b, v16.16b\n" - "ldr s8, [%[wbptr], #32]\n" - "mov v2.16b, v16.16b\n" - "ldr s13, [x24, %[input_col_stride1]]\n" - "fmla v0.4s, v15.4s, v7.4s\n" - "ldr s6, [%[wbptr], #24]\n" - "fmla v3.4s, v21.4s, v4.4s\n" - "add x25, x25, #4\n" - "ldr s14, [x25]\n" - "add x28, x28, #4\n" - "fmax v0.4s, v0.4s, v20.4s\n" - "ldr s7, [%[wbptr], #36]\n" - "fmla v3.4s, v23.4s, v11.4s\n" - "ldr s15, [x28]\n" - "str s0, [x19, %[output_col_stride1]]\n" - "fmla v1.4s, v14.4s, v4.4s\n" - "mov v0.16b, v16.16b\n" - "ldr s22, [x25, %[input_col_stride1]]\n" - "fmla v3.4s, v19.4s, v5.4s\n" - "add x26, x26, #4\n" - "add %[outptr0], %[outptr0], #4\n" - "add x19, x19, #4\n" - "subs x20, x20, #1\n" - "fmla v3.4s, v14.4s, v9.4s\n" - "bne 5b\n" - "6:\n" - "fmla v3.4s, v13.4s, v10.4s\n" - "ldr s17, [x24, x27]\n" - "fmla v2.4s, v18.4s, v4.4s\n" - "ldr s20, [%[inptr0], x23]\n" - "fmla v1.4s, v15.4s, v11.4s\n" - "ldr s19, [x26]\n" - "fmla v3.4s, v18.4s, v12.4s\n" - "ldr s13, [x28, %[input_col_stride1]]\n" - "fmla v2.4s, v17.4s, v11.4s\n" - "ldr s14, [x25, x27]\n" - "fmla v1.4s, v22.4s, v5.4s\n" - "ldr s15, [x24, x23]\n" - "fmla v3.4s, v22.4s, v8.4s\n" - "ldr s16, [%[inptr0], x22]\n" - "fmla v2.4s, v20.4s, v5.4s\n" - "ldr s20, [x26, %[input_col_stride1]]\n" - "fmla v1.4s, v19.4s, v9.4s\n" - "ldr s19, [x28, x27]\n" - "fmla v3.4s, v17.4s, v6.4s\n" - "ldr s21, [x25, x23]\n" - "fmla v2.4s, v14.4s, v9.4s\n" - "ldr s22, [x24, x22]\n" - "fmla v1.4s, v13.4s, v10.4s\n" - "ldr s23, [x26, x27]\n" - "fmla v3.4s, v14.4s, v7.4s\n" - "ldr s18, [x28, x23]\n" - "fmla v0.4s, v14.4s, v4.4s\n" - "ldr s13, [x25, x22]\n" - "fmla v1.4s, v14.4s, v12.4s\n" - "ldr s14, [x26, x23]\n" - "fmla v2.4s, v15.4s, v10.4s\n" - "ldr s17, [x28, x22]\n" - "fmla v0.4s, v19.4s, v11.4s\n" - "ldr s15, [x26, x22]\n" - "fmla v1.4s, v20.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v2.4s, v16.4s, v12.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v21.4s, v5.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v1.4s, v19.4s, v6.4s\n" - "add x24, x24, #4\n" - "fmla v2.4s, v21.4s, v8.4s\n" - "add x25, x25, #4\n" - "fmla v0.4s, v23.4s, v9.4s\n" - "add x28, x28, #4\n" - "fmla v1.4s, v23.4s, v7.4s\n" - "add x26, x26, #4\n" - "fmla v2.4s, v22.4s, v6.4s\n" - "movi v20.16b, #0\n" - "fmla v0.4s, v18.4s, v10.4s\n" - "fmax v3.4s, v3.4s, v20.4s\n" - "fmla v2.4s, v13.4s, v7.4s\n" - "fmax v1.4s, v1.4s, v20.4s\n" - "str s3, [%[outptr0]]\n" - "fmla v0.4s, v13.4s, v12.4s\n" - "str s1, [x19]\n" - "fmax v2.4s, v2.4s, v20.4s\n" - "fmla v0.4s, v14.4s, v8.4s\n" - "str s2, [%[outptr0], %[output_col_stride1]]\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v0.4s, v17.4s, v6.4s\n" - "fmla v0.4s, v15.4s, v7.4s\n" - "fmax v0.4s, v0.4s, v20.4s\n" - "str s0, [x19, %[output_col_stride1]]\n" - "add x19, x19, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input) - : [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile<ActivationFunction::ReLU>( - int n_channels, - const void *weight_bias_ptr, - const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - float *outptrs[Base::output_tile_rows][Base::output_tile_cols] -) -{ - __asm __volatile( - "mov x22, xzr\n" - "mov x26, xzr\n" - "and x23, %[n_channels], #3\n" - "lsr x24, %[n_channels], #2\n" - "cbz x24, 4f\n" - "1:\n" - "ldr q14, [%[wbptr]]\n" - "ldr x19, [%[inptrs], 0]\n" - "mov v3.16b, v14.16b\n" - "ldr q13, [%[wbptr], #16]\n" - "mov v1.16b, v14.16b\n" - "ldr q11, [%[wbptr], #32]\n" - "mov v2.16b, v14.16b\n" - "ldr q4, [%[wbptr], #48]\n" - "mov v0.16b, v14.16b\n" - "ldr q12, [%[wbptr], #64]\n" - "ldr q9, [%[wbptr], #80]\n" - "ldr x20, [%[inptrs], 40]\n" - "ldr q8, [%[wbptr], #96]\n" - "ldr x21, [%[inptrs], 80]\n" - "ldr q7, [%[wbptr], #112]\n" - "ldr x25, [%[inptrs], 120]\n" - "ldr q6, [%[wbptr], #128]\n" - "subs x24, x24, #1\n" - "ldr q5, [%[wbptr], #144]\n" - "ldr q15, [x19, x22]\n" - "fmla v3.4s, v15.4s, v13.4s\n" - "ldr q17, [x20, x22]\n" - "ldr q16, [x21, x22]\n" - "ldr x19, [%[inptrs], 8]\n" - "ldr q15, [x25, x22]\n" - "ldr x20, [%[inptrs], 48]\n" - "ldr q10, [x19, x22]\n" - "ldr x21, [%[inptrs], 88]\n" - "fmla v3.4s, v17.4s, v12.4s\n" - "ldr q17, [x20, x22]\n" - "ldr q14, [x21, x22]\n" - "ldr x19, [%[inptrs], 16]\n" - "ldr q18, [x19, x22]\n" - "fmla v3.4s, v10.4s, v11.4s\n" - "fmla v3.4s, v16.4s, v7.4s\n" - "beq 3f\n" - "2:\n" - "fmla v1.4s, v16.4s, v13.4s\n" - "ldr x20, [%[inptrs], 56]\n" - "fmla v3.4s, v17.4s, v9.4s\n" - "ldr x19, [%[inptrs], 24]\n" - "fmla v2.4s, v18.4s, v13.4s\n" - "ldr q16, [x20, x22]\n" - "movi v10.16b, #0\n" - "ldr q17, [x19, x22]\n" - "fmla v1.4s, v15.4s, v12.4s\n" - "ldr x27, [%[inptrs], 160]\n" - "fmla v3.4s, v18.4s, v4.4s\n" - "ldr x25, [%[inptrs], 128]\n" - "fmla v2.4s, v16.4s, v12.4s\n" - "ldr q18, [x27, x22]\n" - "ldr q15, [x25, x22]\n" - "ldr x21, [%[inptrs], 96]\n" - "fmla v1.4s, v14.4s, v11.4s\n" - "ldr x20, [%[inptrs], 64]\n" - "fmla v3.4s, v14.4s, v6.4s\n" - "ldr q14, [x21, x22]\n" - "fmla v2.4s, v17.4s, v11.4s\n" - "ldr q17, [x20, x22]\n" - "fmla v0.4s, v14.4s, v13.4s\n" - "ldr x19, [%[inptrs], 32]\n" - "fmla v1.4s, v18.4s, v7.4s\n" - "ldr x27, [%[inptrs], 168]\n" - "fmla v3.4s, v16.4s, v8.4s\n" - "ldr q18, [x19, x22]\n" - "fmla v2.4s, v14.4s, v7.4s\n" - "ldr q13, [x27, x22]\n" - "ldr x25, [%[inptrs], 136]\n" - "ldr x21, [%[inptrs], 104]\n" - "ldr x20, [%[inptrs], 72]\n" - "fmla v1.4s, v15.4s, v9.4s\n" - "ldr x27, [%[inptrs], 176]\n" - "fmla v3.4s, v14.4s, v5.4s\n" - "ldr q16, [x25, x22]\n" - "fmla v2.4s, v17.4s, v9.4s\n" - "ldr q17, [x21, x22]\n" - "fmla v0.4s, v16.4s, v12.4s\n" - "ldr q12, [x20, x22]\n" - "fmla v1.4s, v14.4s, v4.4s\n" - "ldr q15, [x27, x22]\n" - "fmax v3.4s, v3.4s, v10.4s\n" - "ldr x25, [%[inptrs], 144]\n" - "fmla v2.4s, v18.4s, v4.4s\n" - "ldr x21, [%[inptrs], 112]\n" - "fmla v0.4s, v17.4s, v11.4s\n" - "ldr q14, [x25, x22]\n" - "fmla v1.4s, v13.4s, v6.4s\n" - "ldr q11, [x21, x22]\n" - "ldr x27, [%[inptrs], 184]\n" - "ldr x25, [%[inptrs], 152]\n" - "ldr x21, [%[outptrs], 0]\n" - "fmla v2.4s, v17.4s, v6.4s\n" - "ldr x28, [%[outptrs], 16]\n" - "str q3, [x21, x26]\n" - "fmla v0.4s, v15.4s, v7.4s\n" - "fmla v1.4s, v16.4s, v8.4s\n" - "ldr q18, [x27, x22]\n" - "ldr q17, [x25, x22]\n" - "ldr x27, [%[inptrs], 192]\n" - "fmla v2.4s, v12.4s, v8.4s\n" - "ldr x21, [%[outptrs], 8]\n" - "fmla v0.4s, v14.4s, v9.4s\n" - "ldr q16, [x27, x22]\n" - "fmla v1.4s, v15.4s, v5.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "ldr q14, [%[wbptr]]\n" - "add x22, x22, #16\n" - "fmla v2.4s, v11.4s, v5.4s\n" - "ldr q13, [%[wbptr], #16]\n" - "fmla v0.4s, v11.4s, v4.4s\n" - "ldr q11, [%[wbptr], #32]\n" - "fmax v1.4s, v1.4s, v10.4s\n" - "ldr q12, [%[wbptr], #64]\n" - "mov v3.16b, v14.16b\n" - "ldr q9, [%[wbptr], #80]\n" - "fmax v2.4s, v2.4s, v10.4s\n" - "ldr q7, [%[wbptr], #112]\n" - "str q1, [x28, x26]\n" - "fmla v0.4s, v18.4s, v6.4s\n" - "mov v1.16b, v14.16b\n" - "ldr q4, [%[wbptr], #48]\n" - "str q2, [x21, x26]\n" - "ldr x28, [%[outptrs], 24]\n" - "mov v2.16b, v14.16b\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v17.4s, v8.4s\n" - "ldr q6, [%[wbptr], #128]\n" - "ldr x19, [%[inptrs], 0]\n" - "ldr x20, [%[inptrs], 40]\n" - "ldr x21, [%[inptrs], 80]\n" - "ldr x25, [%[inptrs], 120]\n" - "subs x24, x24, #1\n" - "ldr q15, [x19, x22]\n" - "fmla v0.4s, v16.4s, v5.4s\n" - "ldr q8, [%[wbptr], #96]\n" - "fmla v3.4s, v15.4s, v13.4s\n" - "ldr q17, [x20, x22]\n" - "ldr q16, [x21, x22]\n" - "ldr x19, [%[inptrs], 8]\n" - "ldr q15, [x25, x22]\n" - "ldr x20, [%[inptrs], 48]\n" - "fmax v0.4s, v0.4s, v10.4s\n" - "ldr q5, [%[wbptr], #144]\n" - "fmla v3.4s, v17.4s, v12.4s\n" - "ldr q10, [x19, x22]\n" - "ldr q17, [x20, x22]\n" - "ldr x19, [%[inptrs], 16]\n" - "str q0, [x28, x26]\n" - "ldr x21, [%[inptrs], 88]\n" - "mov v0.16b, v14.16b\n" - "ldr q18, [x19, x22]\n" - "fmla v3.4s, v10.4s, v11.4s\n" - "ldr q14, [x21, x22]\n" - "add x26, x26, #16\n" - "fmla v3.4s, v16.4s, v7.4s\n" - "bne 2b\n" - "3:\n" - "fmla v1.4s, v16.4s, v13.4s\n" - "ldr x20, [%[inptrs], 56]\n" - "fmla v3.4s, v17.4s, v9.4s\n" - "ldr x19, [%[inptrs], 24]\n" - "fmla v2.4s, v18.4s, v13.4s\n" - "ldr q16, [x20, x22]\n" - "movi v10.16b, #0\n" - "ldr q17, [x19, x22]\n" - "fmla v1.4s, v15.4s, v12.4s\n" - "ldr x27, [%[inptrs], 160]\n" - "fmla v3.4s, v18.4s, v4.4s\n" - "ldr x25, [%[inptrs], 128]\n" - "fmla v2.4s, v16.4s, v12.4s\n" - "ldr q18, [x27, x22]\n" - "ldr q15, [x25, x22]\n" - "ldr x21, [%[inptrs], 96]\n" - "fmla v1.4s, v14.4s, v11.4s\n" - "ldr x20, [%[inptrs], 64]\n" - "fmla v3.4s, v14.4s, v6.4s\n" - "ldr q14, [x21, x22]\n" - "fmla v2.4s, v17.4s, v11.4s\n" - "ldr q17, [x20, x22]\n" - "fmla v0.4s, v14.4s, v13.4s\n" - "ldr x19, [%[inptrs], 32]\n" - "fmla v1.4s, v18.4s, v7.4s\n" - "ldr x27, [%[inptrs], 168]\n" - "fmla v3.4s, v16.4s, v8.4s\n" - "ldr q18, [x19, x22]\n" - "fmla v2.4s, v14.4s, v7.4s\n" - "ldr q13, [x27, x22]\n" - "ldr x25, [%[inptrs], 136]\n" - "ldr x21, [%[inptrs], 104]\n" - "ldr x20, [%[inptrs], 72]\n" - "fmla v1.4s, v15.4s, v9.4s\n" - "ldr x27, [%[inptrs], 176]\n" - "fmla v3.4s, v14.4s, v5.4s\n" - "ldr q16, [x25, x22]\n" - "fmla v2.4s, v17.4s, v9.4s\n" - "ldr q17, [x21, x22]\n" - "fmla v0.4s, v16.4s, v12.4s\n" - "ldr q12, [x20, x22]\n" - "fmla v1.4s, v14.4s, v4.4s\n" - "ldr q15, [x27, x22]\n" - "fmax v3.4s, v3.4s, v10.4s\n" - "ldr x25, [%[inptrs], 144]\n" - "fmla v2.4s, v18.4s, v4.4s\n" - "ldr x21, [%[inptrs], 112]\n" - "fmla v0.4s, v17.4s, v11.4s\n" - "ldr q14, [x25, x22]\n" - "fmla v1.4s, v13.4s, v6.4s\n" - "ldr q11, [x21, x22]\n" - "ldr x27, [%[inptrs], 184]\n" - "ldr x25, [%[inptrs], 152]\n" - "ldr x21, [%[outptrs], 0]\n" - "fmla v2.4s, v17.4s, v6.4s\n" - "ldr x28, [%[outptrs], 16]\n" - "str q3, [x21, x26]\n" - "fmla v0.4s, v15.4s, v7.4s\n" - "fmla v1.4s, v16.4s, v8.4s\n" - "ldr q18, [x27, x22]\n" - "ldr q17, [x25, x22]\n" - "ldr x27, [%[inptrs], 192]\n" - "fmla v2.4s, v12.4s, v8.4s\n" - "ldr x21, [%[outptrs], 8]\n" - "fmla v0.4s, v14.4s, v9.4s\n" - "ldr q16, [x27, x22]\n" - "fmla v1.4s, v15.4s, v5.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "add x22, x22, #16\n" - "fmla v2.4s, v11.4s, v5.4s\n" - "fmla v0.4s, v11.4s, v4.4s\n" - "fmax v1.4s, v1.4s, v10.4s\n" - "fmax v2.4s, v2.4s, v10.4s\n" - "str q1, [x28, x26]\n" - "fmla v0.4s, v18.4s, v6.4s\n" - "ldr x28, [%[outptrs], 24]\n" - "str q2, [x21, x26]\n" - "fmla v0.4s, v17.4s, v8.4s\n" - "fmla v0.4s, v16.4s, v5.4s\n" - "fmax v0.4s, v0.4s, v10.4s\n" - "str q0, [x28, x26]\n" - "add x26, x26, #16\n" - "4:\n" - "cbz x23, 7f\n" - "ldr s14, [%[wbptr]]\n" - "mov v3.16b, v14.16b\n" - "ldr s13, [%[wbptr], #4]\n" - "mov v1.16b, v14.16b\n" - "ldr s11, [%[wbptr], #8]\n" - "mov v2.16b, v14.16b\n" - "ldr s4, [%[wbptr], #12]\n" - "mov v0.16b, v14.16b\n" - "ldr s12, [%[wbptr], #16]\n" - "ldr s9, [%[wbptr], #20]\n" - "ldr x19, [%[inptrs], 0]\n" - "ldr s8, [%[wbptr], #24]\n" - "ldr x20, [%[inptrs], 40]\n" - "ldr s7, [%[wbptr], #28]\n" - "ldr x21, [%[inptrs], 80]\n" - "ldr s6, [%[wbptr], #32]\n" - "ldr x25, [%[inptrs], 120]\n" - "ldr s5, [%[wbptr], #36]\n" - "subs x23, x23, #1\n" - "ldr s15, [x19, x22]\n" - "ldr s17, [x20, x22]\n" - "fmla v3.4s, v15.4s, v13.4s\n" - "ldr s16, [x21, x22]\n" - "ldr s15, [x25, x22]\n" - "ldr x19, [%[inptrs], 8]\n" - "ldr x20, [%[inptrs], 48]\n" - "ldr x21, [%[inptrs], 88]\n" - "ldr s10, [x19, x22]\n" - "fmla v3.4s, v17.4s, v12.4s\n" - "ldr s17, [x20, x22]\n" - "ldr s14, [x21, x22]\n" - "ldr x19, [%[inptrs], 16]\n" - "ldr s18, [x19, x22]\n" - "fmla v3.4s, v10.4s, v11.4s\n" - "fmla v3.4s, v16.4s, v7.4s\n" - "beq 6f\n" - "5:\n" - "fmla v1.4s, v16.4s, v13.4s\n" - "ldr x20, [%[inptrs], 56]\n" - "fmla v3.4s, v17.4s, v9.4s\n" - "ldr x19, [%[inptrs], 24]\n" - "fmla v2.4s, v18.4s, v13.4s\n" - "ldr s16, [x20, x22]\n" - "movi v10.16b, #0\n" - "ldr s17, [x19, x22]\n" - "fmla v1.4s, v15.4s, v12.4s\n" - "ldr x27, [%[inptrs], 160]\n" - "fmla v3.4s, v18.4s, v4.4s\n" - "ldr x25, [%[inptrs], 128]\n" - "fmla v2.4s, v16.4s, v12.4s\n" - "ldr s18, [x27, x22]\n" - "ldr s15, [x25, x22]\n" - "ldr x21, [%[inptrs], 96]\n" - "fmla v1.4s, v14.4s, v11.4s\n" - "ldr x20, [%[inptrs], 64]\n" - "fmla v3.4s, v14.4s, v6.4s\n" - "ldr s14, [x21, x22]\n" - "fmla v2.4s, v17.4s, v11.4s\n" - "ldr s17, [x20, x22]\n" - "fmla v0.4s, v14.4s, v13.4s\n" - "ldr x19, [%[inptrs], 32]\n" - "fmla v1.4s, v18.4s, v7.4s\n" - "ldr x27, [%[inptrs], 168]\n" - "fmla v3.4s, v16.4s, v8.4s\n" - "ldr s18, [x19, x22]\n" - "fmla v2.4s, v14.4s, v7.4s\n" - "ldr s13, [x27, x22]\n" - "ldr x25, [%[inptrs], 136]\n" - "ldr x21, [%[inptrs], 104]\n" - "ldr x20, [%[inptrs], 72]\n" - "fmla v1.4s, v15.4s, v9.4s\n" - "ldr x27, [%[inptrs], 176]\n" - "fmla v3.4s, v14.4s, v5.4s\n" - "ldr s16, [x25, x22]\n" - "fmla v2.4s, v17.4s, v9.4s\n" - "ldr s17, [x21, x22]\n" - "fmla v0.4s, v16.4s, v12.4s\n" - "ldr s12, [x20, x22]\n" - "fmla v1.4s, v14.4s, v4.4s\n" - "ldr s15, [x27, x22]\n" - "fmax v3.4s, v3.4s, v10.4s\n" - "ldr x25, [%[inptrs], 144]\n" - "fmla v2.4s, v18.4s, v4.4s\n" - "ldr x21, [%[inptrs], 112]\n" - "fmla v0.4s, v17.4s, v11.4s\n" - "ldr s14, [x25, x22]\n" - "fmla v1.4s, v13.4s, v6.4s\n" - "ldr s11, [x21, x22]\n" - "ldr x27, [%[inptrs], 184]\n" - "ldr x25, [%[inptrs], 152]\n" - "ldr x21, [%[outptrs], 0]\n" - "fmla v2.4s, v17.4s, v6.4s\n" - "ldr x28, [%[outptrs], 16]\n" - "str s3, [x21, x26]\n" - "fmla v0.4s, v15.4s, v7.4s\n" - "fmla v1.4s, v16.4s, v8.4s\n" - "ldr s18, [x27, x22]\n" - "ldr s17, [x25, x22]\n" - "ldr x27, [%[inptrs], 192]\n" - "fmla v2.4s, v12.4s, v8.4s\n" - "ldr x21, [%[outptrs], 8]\n" - "fmla v0.4s, v14.4s, v9.4s\n" - "ldr s16, [x27, x22]\n" - "fmla v1.4s, v15.4s, v5.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "ldr s14, [%[wbptr]]\n" - "add x22, x22, #4\n" - "fmla v2.4s, v11.4s, v5.4s\n" - "ldr s13, [%[wbptr], #4]\n" - "fmla v0.4s, v11.4s, v4.4s\n" - "ldr s11, [%[wbptr], #8]\n" - "fmax v1.4s, v1.4s, v10.4s\n" - "ldr s12, [%[wbptr], #16]\n" - "mov v3.16b, v14.16b\n" - "ldr s9, [%[wbptr], #20]\n" - "fmax v2.4s, v2.4s, v10.4s\n" - "ldr s7, [%[wbptr], #28]\n" - "str s1, [x28, x26]\n" - "fmla v0.4s, v18.4s, v6.4s\n" - "mov v1.16b, v14.16b\n" - "ldr s4, [%[wbptr], #12]\n" - "str s2, [x21, x26]\n" - "ldr x28, [%[outptrs], 24]\n" - "mov v2.16b, v14.16b\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v17.4s, v8.4s\n" - "ldr s6, [%[wbptr], #32]\n" - "ldr x19, [%[inptrs], 0]\n" - "ldr x20, [%[inptrs], 40]\n" - "ldr x21, [%[inptrs], 80]\n" - "ldr x25, [%[inptrs], 120]\n" - "subs x23, x23, #1\n" - "ldr s15, [x19, x22]\n" - "fmla v0.4s, v16.4s, v5.4s\n" - "ldr s8, [%[wbptr], #24]\n" - "fmla v3.4s, v15.4s, v13.4s\n" - "ldr s17, [x20, x22]\n" - "ldr s16, [x21, x22]\n" - "ldr x19, [%[inptrs], 8]\n" - "ldr s15, [x25, x22]\n" - "ldr x20, [%[inptrs], 48]\n" - "fmax v0.4s, v0.4s, v10.4s\n" - "ldr s5, [%[wbptr], #36]\n" - "fmla v3.4s, v17.4s, v12.4s\n" - "ldr s10, [x19, x22]\n" - "ldr s17, [x20, x22]\n" - "ldr x19, [%[inptrs], 16]\n" - "str s0, [x28, x26]\n" - "ldr x21, [%[inptrs], 88]\n" - "mov v0.16b, v14.16b\n" - "ldr s18, [x19, x22]\n" - "fmla v3.4s, v10.4s, v11.4s\n" - "ldr s14, [x21, x22]\n" - "add x26, x26, #4\n" - "fmla v3.4s, v16.4s, v7.4s\n" - "bne 5b\n" - "6:\n" - "fmla v1.4s, v16.4s, v13.4s\n" - "ldr x20, [%[inptrs], 56]\n" - "fmla v3.4s, v17.4s, v9.4s\n" - "ldr x19, [%[inptrs], 24]\n" - "fmla v2.4s, v18.4s, v13.4s\n" - "ldr s16, [x20, x22]\n" - "movi v10.16b, #0\n" - "ldr s17, [x19, x22]\n" - "fmla v1.4s, v15.4s, v12.4s\n" - "ldr x27, [%[inptrs], 160]\n" - "fmla v3.4s, v18.4s, v4.4s\n" - "ldr x25, [%[inptrs], 128]\n" - "fmla v2.4s, v16.4s, v12.4s\n" - "ldr s18, [x27, x22]\n" - "ldr s15, [x25, x22]\n" - "ldr x21, [%[inptrs], 96]\n" - "fmla v1.4s, v14.4s, v11.4s\n" - "ldr x20, [%[inptrs], 64]\n" - "fmla v3.4s, v14.4s, v6.4s\n" - "ldr s14, [x21, x22]\n" - "fmla v2.4s, v17.4s, v11.4s\n" - "ldr s17, [x20, x22]\n" - "fmla v0.4s, v14.4s, v13.4s\n" - "ldr x19, [%[inptrs], 32]\n" - "fmla v1.4s, v18.4s, v7.4s\n" - "ldr x27, [%[inptrs], 168]\n" - "fmla v3.4s, v16.4s, v8.4s\n" - "ldr s18, [x19, x22]\n" - "fmla v2.4s, v14.4s, v7.4s\n" - "ldr s13, [x27, x22]\n" - "ldr x25, [%[inptrs], 136]\n" - "ldr x21, [%[inptrs], 104]\n" - "ldr x20, [%[inptrs], 72]\n" - "fmla v1.4s, v15.4s, v9.4s\n" - "ldr x27, [%[inptrs], 176]\n" - "fmla v3.4s, v14.4s, v5.4s\n" - "ldr s16, [x25, x22]\n" - "fmla v2.4s, v17.4s, v9.4s\n" - "ldr s17, [x21, x22]\n" - "fmla v0.4s, v16.4s, v12.4s\n" - "ldr s12, [x20, x22]\n" - "fmla v1.4s, v14.4s, v4.4s\n" - "ldr s15, [x27, x22]\n" - "fmax v3.4s, v3.4s, v10.4s\n" - "ldr x25, [%[inptrs], 144]\n" - "fmla v2.4s, v18.4s, v4.4s\n" - "ldr x21, [%[inptrs], 112]\n" - "fmla v0.4s, v17.4s, v11.4s\n" - "ldr s14, [x25, x22]\n" - "fmla v1.4s, v13.4s, v6.4s\n" - "ldr s11, [x21, x22]\n" - "ldr x27, [%[inptrs], 184]\n" - "ldr x25, [%[inptrs], 152]\n" - "ldr x21, [%[outptrs], 0]\n" - "fmla v2.4s, v17.4s, v6.4s\n" - "ldr x28, [%[outptrs], 16]\n" - "str s3, [x21, x26]\n" - "fmla v0.4s, v15.4s, v7.4s\n" - "fmla v1.4s, v16.4s, v8.4s\n" - "ldr s18, [x27, x22]\n" - "ldr s17, [x25, x22]\n" - "ldr x27, [%[inptrs], 192]\n" - "fmla v2.4s, v12.4s, v8.4s\n" - "ldr x21, [%[outptrs], 8]\n" - "fmla v0.4s, v14.4s, v9.4s\n" - "ldr s16, [x27, x22]\n" - "fmla v1.4s, v15.4s, v5.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "add x22, x22, #4\n" - "fmla v2.4s, v11.4s, v5.4s\n" - "fmla v0.4s, v11.4s, v4.4s\n" - "fmax v1.4s, v1.4s, v10.4s\n" - "fmax v2.4s, v2.4s, v10.4s\n" - "str s1, [x28, x26]\n" - "fmla v0.4s, v18.4s, v6.4s\n" - "ldr x28, [%[outptrs], 24]\n" - "str s2, [x21, x26]\n" - "fmla v0.4s, v17.4s, v8.4s\n" - "fmla v0.4s, v16.4s, v5.4s\n" - "fmax v0.4s, v0.4s, v10.4s\n" - "str s0, [x28, x26]\n" - "add x26, x26, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr) - : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile<ActivationFunction::ReLU6>( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x21, %[inptr0], %[input_row_stride]\n" - "add x23, %[input_col_stride1], %[input_col_stride1]\n" - "add x24, %[outptr0], %[output_row_stride]\n" - "add x28, x21, %[input_row_stride]\n" - "add x26, x23, %[input_col_stride1]\n" - "and x19, %[n_channels], #3\n" - "add x27, x28, %[input_row_stride]\n" - "add x25, x26, %[input_col_stride1]\n" - "lsr x20, %[n_channels], #2\n" - "add x22, x27, %[input_row_stride]\n" - "cbz x20, 4f\n" - "1:\n" - "ldr q14, [%[wbptr]]\n" - "subs x20, x20, #1\n" - "mov v5.16b, v14.16b\n" - "ldr q0, [%[wbptr], #16]\n" - "mov v11.16b, v14.16b\n" - "ldr q1, [%[wbptr], #32]\n" - "mov v12.16b, v14.16b\n" - "ldr q2, [%[wbptr], #48]\n" - "mov v10.16b, v14.16b\n" - "ldr q6, [%[wbptr], #64]\n" - "ldr q3, [%[wbptr], #80]\n" - "ldr q7, [%[wbptr], #96]\n" - "ldr q4, [%[wbptr], #112]\n" - "ldr q8, [%[wbptr], #128]\n" - "ldr q9, [%[wbptr], #144]\n" - "ldr q19, [%[inptr0]]\n" - "fmla v5.4s, v19.4s, v0.4s\n" - "ldr q15, [x21]\n" - "ldr q21, [%[inptr0], %[input_col_stride1]]\n" - "ldr q16, [x28]\n" - "fmla v11.4s, v16.4s, v0.4s\n" - "ldr q23, [x21, %[input_col_stride1]]\n" - "fmla v5.4s, v15.4s, v6.4s\n" - "ldr q18, [%[inptr0], x23]\n" - "ldr q17, [x27]\n" - "ldr q13, [x28, %[input_col_stride1]]\n" - "fmla v5.4s, v21.4s, v1.4s\n" - "fmla v5.4s, v16.4s, v4.4s\n" - "beq 3f\n" - "2:\n" - "fmla v5.4s, v23.4s, v3.4s\n" - "ldr q21, [x21, x23]\n" - "fmla v12.4s, v18.4s, v0.4s\n" - "ldr q20, [%[inptr0], x26]\n" - "fmla v11.4s, v17.4s, v6.4s\n" - "ldr q19, [x22]\n" - "fmla v5.4s, v18.4s, v2.4s\n" - "ldr q15, [x27, %[input_col_stride1]]\n" - "fmla v12.4s, v21.4s, v6.4s\n" - "ldr q16, [x28, x23]\n" - "fmla v11.4s, v13.4s, v1.4s\n" - "ldr q17, [x21, x26]\n" - "fmla v5.4s, v13.4s, v8.4s\n" - "ldr q14, [%[inptr0], x25]\n" - "fmla v12.4s, v20.4s, v1.4s\n" - "ldr q20, [x22, %[input_col_stride1]]\n" - "fmla v11.4s, v19.4s, v4.4s\n" - "ldr q19, [x27, x23]\n" - "fmla v5.4s, v21.4s, v7.4s\n" - "ldr q22, [x28, x26]\n" - "fmla v12.4s, v16.4s, v4.4s\n" - "ldr q21, [x21, x25]\n" - "fmla v11.4s, v15.4s, v3.4s\n" - "ldr q23, [x22, x23]\n" - "fmla v5.4s, v16.4s, v9.4s\n" - "ldr q18, [x27, x26]\n" - "fmla v10.4s, v16.4s, v0.4s\n" - "ldr q15, [x28, x25]\n" - "fmla v11.4s, v16.4s, v2.4s\n" - "ldr q16, [x22, x26]\n" - "fmla v12.4s, v17.4s, v3.4s\n" - "ldr q17, [x27, x25]\n" - "fmla v10.4s, v19.4s, v6.4s\n" - "ldr q13, [x22, x25]\n" - "fmla v11.4s, v20.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v12.4s, v14.4s, v2.4s\n" - "ldr q14, [%[wbptr]]\n" - "fmla v10.4s, v22.4s, v1.4s\n" - "ldr q0, [%[wbptr], #16]\n" - "fmla v11.4s, v19.4s, v7.4s\n" - "ldr q6, [%[wbptr], #64]\n" - "fmla v12.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v10.4s, v23.4s, v4.4s\n" - "ldr q1, [%[wbptr], #32]\n" - "fmla v11.4s, v23.4s, v9.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v12.4s, v21.4s, v7.4s\n" - "ldr q19, [%[inptr0]]\n" - "fmla v10.4s, v18.4s, v3.4s\n" - "ldr q4, [%[wbptr], #112]\n" - "movi v20.16b, #0\n" - "ldr q21, [%[inptr0], %[input_col_stride1]]\n" - "fmla v12.4s, v15.4s, v9.4s\n" - "ldr q18, [%[inptr0], x23]\n" - "fmla v10.4s, v15.4s, v2.4s\n" - "ldr q3, [%[wbptr], #80]\n" - "fmov v22.4s, #6.0\n" - "add x21, x21, #16\n" - "fmax v5.4s, v5.4s, v20.4s\n" - "ldr q15, [x21]\n" - "fmla v10.4s, v16.4s, v8.4s\n" - "ldr q2, [%[wbptr], #48]\n" - "fmin v5.4s, v5.4s, v22.4s\n" - "ldr q23, [x21, %[input_col_stride1]]\n" - "fmax v12.4s, v12.4s, v20.4s\n" - "add x28, x28, #16\n" - "str q5, [%[outptr0]]\n" - "fmla v10.4s, v17.4s, v7.4s\n" - "fmin v12.4s, v12.4s, v22.4s\n" - "ldr q8, [%[wbptr], #128]\n" - "fmax v11.4s, v11.4s, v20.4s\n" - "ldr q16, [x28]\n" - "str q12, [%[outptr0], %[output_col_stride1]]\n" - "fmla v10.4s, v13.4s, v9.4s\n" - "fmin v11.4s, v11.4s, v22.4s\n" - "ldr q7, [%[wbptr], #96]\n" - "mov v5.16b, v14.16b\n" - "ldr q13, [x28, %[input_col_stride1]]\n" - "str q11, [x24]\n" - "fmax v10.4s, v10.4s, v20.4s\n" - "mov v11.16b, v14.16b\n" - "ldr q9, [%[wbptr], #144]\n" - "fmin v10.4s, v10.4s, v22.4s\n" - "add x27, x27, #16\n" - "mov v12.16b, v14.16b\n" - "ldr q17, [x27]\n" - "str q10, [x24, %[output_col_stride1]]\n" - "fmla v5.4s, v19.4s, v0.4s\n" - "mov v10.16b, v14.16b\n" - "add x22, x22, #16\n" - "fmla v11.4s, v16.4s, v0.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v5.4s, v15.4s, v6.4s\n" - "add x24, x24, #16\n" - "subs x20, x20, #1\n" - "fmla v5.4s, v21.4s, v1.4s\n" - "fmla v5.4s, v16.4s, v4.4s\n" - "bne 2b\n" - "3:\n" - "fmla v5.4s, v23.4s, v3.4s\n" - "ldr q21, [x21, x23]\n" - "fmla v12.4s, v18.4s, v0.4s\n" - "ldr q20, [%[inptr0], x26]\n" - "fmla v11.4s, v17.4s, v6.4s\n" - "ldr q19, [x22]\n" - "fmla v5.4s, v18.4s, v2.4s\n" - "ldr q15, [x27, %[input_col_stride1]]\n" - "fmla v12.4s, v21.4s, v6.4s\n" - "ldr q16, [x28, x23]\n" - "fmla v11.4s, v13.4s, v1.4s\n" - "ldr q17, [x21, x26]\n" - "fmla v5.4s, v13.4s, v8.4s\n" - "ldr q14, [%[inptr0], x25]\n" - "fmla v12.4s, v20.4s, v1.4s\n" - "ldr q20, [x22, %[input_col_stride1]]\n" - "fmla v11.4s, v19.4s, v4.4s\n" - "ldr q19, [x27, x23]\n" - "fmla v5.4s, v21.4s, v7.4s\n" - "ldr q22, [x28, x26]\n" - "fmla v12.4s, v16.4s, v4.4s\n" - "ldr q21, [x21, x25]\n" - "fmla v11.4s, v15.4s, v3.4s\n" - "ldr q23, [x22, x23]\n" - "fmla v5.4s, v16.4s, v9.4s\n" - "ldr q18, [x27, x26]\n" - "fmla v10.4s, v16.4s, v0.4s\n" - "ldr q15, [x28, x25]\n" - "fmla v11.4s, v16.4s, v2.4s\n" - "ldr q16, [x22, x26]\n" - "fmla v12.4s, v17.4s, v3.4s\n" - "ldr q17, [x27, x25]\n" - "fmla v10.4s, v19.4s, v6.4s\n" - "ldr q13, [x22, x25]\n" - "fmla v11.4s, v20.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v12.4s, v14.4s, v2.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v10.4s, v22.4s, v1.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v11.4s, v19.4s, v7.4s\n" - "add x21, x21, #16\n" - "fmla v12.4s, v22.4s, v8.4s\n" - "add x28, x28, #16\n" - "fmla v10.4s, v23.4s, v4.4s\n" - "add x27, x27, #16\n" - "fmla v11.4s, v23.4s, v9.4s\n" - "add x22, x22, #16\n" - "fmla v12.4s, v21.4s, v7.4s\n" - "movi v20.16b, #0\n" - "fmla v10.4s, v18.4s, v3.4s\n" - "fmov v22.4s, #6.0\n" - "fmax v5.4s, v5.4s, v20.4s\n" - "fmax v11.4s, v11.4s, v20.4s\n" - "fmla v12.4s, v15.4s, v9.4s\n" - "fmla v10.4s, v15.4s, v2.4s\n" - "fmin v5.4s, v5.4s, v22.4s\n" - "fmin v11.4s, v11.4s, v22.4s\n" - "fmax v12.4s, v12.4s, v20.4s\n" - "str q5, [%[outptr0]]\n" - "str q11, [x24]\n" - "fmla v10.4s, v16.4s, v8.4s\n" - "fmin v12.4s, v12.4s, v22.4s\n" - "str q12, [%[outptr0], %[output_col_stride1]]\n" - "fmla v10.4s, v17.4s, v7.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v10.4s, v13.4s, v9.4s\n" - "fmax v10.4s, v10.4s, v20.4s\n" - "fmin v10.4s, v10.4s, v22.4s\n" - "str q10, [x24, %[output_col_stride1]]\n" - "add x24, x24, #16\n" - "4:\n" - "cbz x19, 7f\n" - "ldr s14, [%[wbptr]]\n" - "mov v5.16b, v14.16b\n" - "ldr s0, [%[wbptr], #4]\n" - "mov v11.16b, v14.16b\n" - "ldr s1, [%[wbptr], #8]\n" - "mov v12.16b, v14.16b\n" - "ldr s2, [%[wbptr], #12]\n" - "mov v10.16b, v14.16b\n" - "ldr s6, [%[wbptr], #16]\n" - "ldr s3, [%[wbptr], #20]\n" - "subs x19, x19, #1\n" - "ldr s7, [%[wbptr], #24]\n" - "ldr s4, [%[wbptr], #28]\n" - "ldr s8, [%[wbptr], #32]\n" - "ldr s9, [%[wbptr], #36]\n" - "ldr s19, [%[inptr0]]\n" - "ldr s15, [x21]\n" - "fmla v5.4s, v19.4s, v0.4s\n" - "ldr s21, [%[inptr0], %[input_col_stride1]]\n" - "ldr s16, [x28]\n" - "ldr s23, [x21, %[input_col_stride1]]\n" - "fmla v11.4s, v16.4s, v0.4s\n" - "ldr s18, [%[inptr0], x23]\n" - "fmla v5.4s, v15.4s, v6.4s\n" - "ldr s17, [x27]\n" - "ldr s13, [x28, %[input_col_stride1]]\n" - "fmla v5.4s, v21.4s, v1.4s\n" - "fmla v5.4s, v16.4s, v4.4s\n" - "beq 6f\n" - "5:\n" - "fmla v5.4s, v23.4s, v3.4s\n" - "ldr s21, [x21, x23]\n" - "fmla v12.4s, v18.4s, v0.4s\n" - "ldr s20, [%[inptr0], x26]\n" - "fmla v11.4s, v17.4s, v6.4s\n" - "ldr s19, [x22]\n" - "fmla v5.4s, v18.4s, v2.4s\n" - "ldr s15, [x27, %[input_col_stride1]]\n" - "fmla v12.4s, v21.4s, v6.4s\n" - "ldr s16, [x28, x23]\n" - "fmla v11.4s, v13.4s, v1.4s\n" - "ldr s17, [x21, x26]\n" - "fmla v5.4s, v13.4s, v8.4s\n" - "ldr s14, [%[inptr0], x25]\n" - "fmla v12.4s, v20.4s, v1.4s\n" - "ldr s20, [x22, %[input_col_stride1]]\n" - "fmla v11.4s, v19.4s, v4.4s\n" - "ldr s19, [x27, x23]\n" - "fmla v5.4s, v21.4s, v7.4s\n" - "ldr s22, [x28, x26]\n" - "fmla v12.4s, v16.4s, v4.4s\n" - "ldr s21, [x21, x25]\n" - "fmla v11.4s, v15.4s, v3.4s\n" - "ldr s23, [x22, x23]\n" - "fmla v5.4s, v16.4s, v9.4s\n" - "ldr s18, [x27, x26]\n" - "fmla v10.4s, v16.4s, v0.4s\n" - "ldr s15, [x28, x25]\n" - "fmla v11.4s, v16.4s, v2.4s\n" - "ldr s16, [x22, x26]\n" - "fmla v12.4s, v17.4s, v3.4s\n" - "ldr s17, [x27, x25]\n" - "fmla v10.4s, v19.4s, v6.4s\n" - "ldr s13, [x22, x25]\n" - "fmla v11.4s, v20.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v12.4s, v14.4s, v2.4s\n" - "ldr s14, [%[wbptr]]\n" - "fmla v10.4s, v22.4s, v1.4s\n" - "ldr s0, [%[wbptr], #4]\n" - "fmla v11.4s, v19.4s, v7.4s\n" - "ldr s6, [%[wbptr], #16]\n" - "fmla v12.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v10.4s, v23.4s, v4.4s\n" - "ldr s1, [%[wbptr], #8]\n" - "fmla v11.4s, v23.4s, v9.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v12.4s, v21.4s, v7.4s\n" - "ldr s19, [%[inptr0]]\n" - "fmla v10.4s, v18.4s, v3.4s\n" - "ldr s4, [%[wbptr], #28]\n" - "movi v20.16b, #0\n" - "ldr s21, [%[inptr0], %[input_col_stride1]]\n" - "fmla v12.4s, v15.4s, v9.4s\n" - "ldr s18, [%[inptr0], x23]\n" - "fmla v10.4s, v15.4s, v2.4s\n" - "ldr s3, [%[wbptr], #20]\n" - "fmov v22.4s, #6.0\n" - "add x21, x21, #4\n" - "fmax v5.4s, v5.4s, v20.4s\n" - "ldr s15, [x21]\n" - "fmla v10.4s, v16.4s, v8.4s\n" - "ldr s2, [%[wbptr], #12]\n" - "fmin v5.4s, v5.4s, v22.4s\n" - "ldr s23, [x21, %[input_col_stride1]]\n" - "fmax v12.4s, v12.4s, v20.4s\n" - "add x28, x28, #4\n" - "str s5, [%[outptr0]]\n" - "fmla v10.4s, v17.4s, v7.4s\n" - "fmin v12.4s, v12.4s, v22.4s\n" - "ldr s8, [%[wbptr], #32]\n" - "fmax v11.4s, v11.4s, v20.4s\n" - "ldr s16, [x28]\n" - "str s12, [%[outptr0], %[output_col_stride1]]\n" - "fmla v10.4s, v13.4s, v9.4s\n" - "fmin v11.4s, v11.4s, v22.4s\n" - "ldr s7, [%[wbptr], #24]\n" - "mov v5.16b, v14.16b\n" - "ldr s13, [x28, %[input_col_stride1]]\n" - "str s11, [x24]\n" - "fmax v10.4s, v10.4s, v20.4s\n" - "mov v11.16b, v14.16b\n" - "ldr s9, [%[wbptr], #36]\n" - "fmin v10.4s, v10.4s, v22.4s\n" - "add x27, x27, #4\n" - "mov v12.16b, v14.16b\n" - "ldr s17, [x27]\n" - "str s10, [x24, %[output_col_stride1]]\n" - "fmla v5.4s, v19.4s, v0.4s\n" - "mov v10.16b, v14.16b\n" - "add x22, x22, #4\n" - "fmla v11.4s, v16.4s, v0.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v5.4s, v15.4s, v6.4s\n" - "add x24, x24, #4\n" - "subs x19, x19, #1\n" - "fmla v5.4s, v21.4s, v1.4s\n" - "fmla v5.4s, v16.4s, v4.4s\n" - "bne 5b\n" - "6:\n" - "fmla v5.4s, v23.4s, v3.4s\n" - "ldr s21, [x21, x23]\n" - "fmla v12.4s, v18.4s, v0.4s\n" - "ldr s20, [%[inptr0], x26]\n" - "fmla v11.4s, v17.4s, v6.4s\n" - "ldr s19, [x22]\n" - "fmla v5.4s, v18.4s, v2.4s\n" - "ldr s15, [x27, %[input_col_stride1]]\n" - "fmla v12.4s, v21.4s, v6.4s\n" - "ldr s16, [x28, x23]\n" - "fmla v11.4s, v13.4s, v1.4s\n" - "ldr s17, [x21, x26]\n" - "fmla v5.4s, v13.4s, v8.4s\n" - "ldr s14, [%[inptr0], x25]\n" - "fmla v12.4s, v20.4s, v1.4s\n" - "ldr s20, [x22, %[input_col_stride1]]\n" - "fmla v11.4s, v19.4s, v4.4s\n" - "ldr s19, [x27, x23]\n" - "fmla v5.4s, v21.4s, v7.4s\n" - "ldr s22, [x28, x26]\n" - "fmla v12.4s, v16.4s, v4.4s\n" - "ldr s21, [x21, x25]\n" - "fmla v11.4s, v15.4s, v3.4s\n" - "ldr s23, [x22, x23]\n" - "fmla v5.4s, v16.4s, v9.4s\n" - "ldr s18, [x27, x26]\n" - "fmla v10.4s, v16.4s, v0.4s\n" - "ldr s15, [x28, x25]\n" - "fmla v11.4s, v16.4s, v2.4s\n" - "ldr s16, [x22, x26]\n" - "fmla v12.4s, v17.4s, v3.4s\n" - "ldr s17, [x27, x25]\n" - "fmla v10.4s, v19.4s, v6.4s\n" - "ldr s13, [x22, x25]\n" - "fmla v11.4s, v20.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v12.4s, v14.4s, v2.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v10.4s, v22.4s, v1.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v11.4s, v19.4s, v7.4s\n" - "add x21, x21, #4\n" - "fmla v12.4s, v22.4s, v8.4s\n" - "add x28, x28, #4\n" - "fmla v10.4s, v23.4s, v4.4s\n" - "add x27, x27, #4\n" - "fmla v11.4s, v23.4s, v9.4s\n" - "add x22, x22, #4\n" - "fmla v12.4s, v21.4s, v7.4s\n" - "movi v20.16b, #0\n" - "fmla v10.4s, v18.4s, v3.4s\n" - "fmov v22.4s, #6.0\n" - "fmax v5.4s, v5.4s, v20.4s\n" - "fmax v11.4s, v11.4s, v20.4s\n" - "fmla v12.4s, v15.4s, v9.4s\n" - "fmla v10.4s, v15.4s, v2.4s\n" - "fmin v5.4s, v5.4s, v22.4s\n" - "fmin v11.4s, v11.4s, v22.4s\n" - "fmax v12.4s, v12.4s, v20.4s\n" - "str s5, [%[outptr0]]\n" - "str s11, [x24]\n" - "fmla v10.4s, v16.4s, v8.4s\n" - "fmin v12.4s, v12.4s, v22.4s\n" - "str s12, [%[outptr0], %[output_col_stride1]]\n" - "fmla v10.4s, v17.4s, v7.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v10.4s, v13.4s, v9.4s\n" - "fmax v10.4s, v10.4s, v20.4s\n" - "fmin v10.4s, v10.4s, v22.4s\n" - "str s10, [x24, %[output_col_stride1]]\n" - "add x24, x24, #4\n" - "7:\n" - : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr) - : [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile<ActivationFunction::ReLU6>( - int n_channels, - const void *weight_bias_ptr, - const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - float *outptrs[Base::output_tile_rows][Base::output_tile_cols] -) -{ - __asm __volatile( - "mov x27, xzr\n" - "mov x28, xzr\n" - "and x26, %[n_channels], #3\n" - "lsr x25, %[n_channels], #2\n" - "cbz x25, 4f\n" - "1:\n" - "ldr q15, [%[wbptr]]\n" - "ldr x21, [%[inptrs], 0]\n" - "mov v8.16b, v15.16b\n" - "ldr q14, [%[wbptr], #16]\n" - "mov v3.16b, v15.16b\n" - "ldr q10, [%[wbptr], #32]\n" - "mov v2.16b, v15.16b\n" - "ldr q7, [%[wbptr], #48]\n" - "mov v4.16b, v15.16b\n" - "ldr q13, [%[wbptr], #64]\n" - "ldr q5, [%[wbptr], #80]\n" - "ldr x22, [%[inptrs], 40]\n" - "ldr q0, [%[wbptr], #96]\n" - "ldr x20, [%[inptrs], 80]\n" - "ldr q9, [%[wbptr], #112]\n" - "ldr x23, [%[inptrs], 120]\n" - "ldr q6, [%[wbptr], #128]\n" - "subs x25, x25, #1\n" - "ldr q1, [%[wbptr], #144]\n" - "ldr q17, [x21, x27]\n" - "fmla v8.4s, v17.4s, v14.4s\n" - "ldr q18, [x22, x27]\n" - "ldr q16, [x20, x27]\n" - "ldr x21, [%[inptrs], 8]\n" - "ldr q17, [x23, x27]\n" - "ldr x22, [%[inptrs], 48]\n" - "ldr q11, [x21, x27]\n" - "ldr x20, [%[inptrs], 88]\n" - "fmla v8.4s, v18.4s, v13.4s\n" - "ldr q19, [x22, x27]\n" - "ldr q15, [x20, x27]\n" - "ldr x21, [%[inptrs], 16]\n" - "ldr q12, [x21, x27]\n" - "fmla v8.4s, v11.4s, v10.4s\n" - "fmla v8.4s, v16.4s, v9.4s\n" - "beq 3f\n" - "2:\n" - "fmla v3.4s, v16.4s, v14.4s\n" - "ldr x22, [%[inptrs], 56]\n" - "fmla v8.4s, v19.4s, v5.4s\n" - "ldr x21, [%[inptrs], 24]\n" - "fmla v2.4s, v12.4s, v14.4s\n" - "ldr q16, [x22, x27]\n" - "movi v11.16b, #0\n" - "ldr q18, [x21, x27]\n" - "fmla v3.4s, v17.4s, v13.4s\n" - "ldr x20, [%[inptrs], 160]\n" - "fmla v8.4s, v12.4s, v7.4s\n" - "ldr x23, [%[inptrs], 128]\n" - "fmla v2.4s, v16.4s, v13.4s\n" - "ldr q19, [x20, x27]\n" - "fmov v12.4s, #6.0\n" - "ldr q17, [x23, x27]\n" - "fmla v3.4s, v15.4s, v10.4s\n" - "ldr x20, [%[inptrs], 96]\n" - "fmla v8.4s, v15.4s, v6.4s\n" - "ldr x22, [%[inptrs], 64]\n" - "fmla v2.4s, v18.4s, v10.4s\n" - "ldr q15, [x20, x27]\n" - "fmla v4.4s, v15.4s, v14.4s\n" - "ldr q18, [x22, x27]\n" - "fmla v3.4s, v19.4s, v9.4s\n" - "ldr x21, [%[inptrs], 32]\n" - "fmla v8.4s, v16.4s, v0.4s\n" - "ldr x20, [%[inptrs], 168]\n" - "fmla v2.4s, v15.4s, v9.4s\n" - "ldr q19, [x21, x27]\n" - "ldr q16, [x20, x27]\n" - "ldr x23, [%[inptrs], 136]\n" - "fmla v3.4s, v17.4s, v5.4s\n" - "ldr x20, [%[inptrs], 104]\n" - "fmla v8.4s, v15.4s, v1.4s\n" - "ldr q14, [x23, x27]\n" - "fmla v2.4s, v18.4s, v5.4s\n" - "ldr q17, [x20, x27]\n" - "fmla v4.4s, v14.4s, v13.4s\n" - "ldr x22, [%[inptrs], 72]\n" - "fmla v3.4s, v15.4s, v7.4s\n" - "ldr x20, [%[inptrs], 176]\n" - "fmax v8.4s, v8.4s, v11.4s\n" - "ldr q18, [x22, x27]\n" - "fmla v2.4s, v19.4s, v7.4s\n" - "ldr q13, [x20, x27]\n" - "fmla v4.4s, v17.4s, v10.4s\n" - "ldr x23, [%[inptrs], 144]\n" - "fmla v3.4s, v16.4s, v6.4s\n" - "ldr x20, [%[inptrs], 112]\n" - "fmin v8.4s, v8.4s, v12.4s\n" - "ldr q10, [x23, x27]\n" - "fmla v2.4s, v17.4s, v6.4s\n" - "ldr q15, [x20, x27]\n" - "fmla v4.4s, v13.4s, v9.4s\n" - "ldr x20, [%[inptrs], 184]\n" - "fmla v3.4s, v14.4s, v0.4s\n" - "ldr x23, [%[inptrs], 152]\n" - "ldr q9, [x20, x27]\n" - "ldr x22, [%[outptrs], 0]\n" - "fmla v2.4s, v18.4s, v0.4s\n" - "ldr q19, [x23, x27]\n" - "str q8, [x22, x28]\n" - "fmla v4.4s, v10.4s, v5.4s\n" - "fmla v3.4s, v13.4s, v1.4s\n" - "ldr x20, [%[inptrs], 192]\n" - "ldr x22, [%[outptrs], 8]\n" - "ldr x24, [%[outptrs], 16]\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v2.4s, v15.4s, v1.4s\n" - "ldr q16, [x20, x27]\n" - "fmla v4.4s, v15.4s, v7.4s\n" - "ldr q15, [%[wbptr]]\n" - "fmax v3.4s, v3.4s, v11.4s\n" - "ldr q14, [%[wbptr], #16]\n" - "mov v8.16b, v15.16b\n" - "ldr q10, [%[wbptr], #32]\n" - "fmax v2.4s, v2.4s, v11.4s\n" - "ldr q13, [%[wbptr], #64]\n" - "fmla v4.4s, v9.4s, v6.4s\n" - "ldr q7, [%[wbptr], #48]\n" - "fmin v3.4s, v3.4s, v12.4s\n" - "ldr q5, [%[wbptr], #80]\n" - "fmin v2.4s, v2.4s, v12.4s\n" - "ldr q9, [%[wbptr], #112]\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "add x27, x27, #16\n" - "str q3, [x24, x28]\n" - "fmla v4.4s, v19.4s, v0.4s\n" - "str q2, [x22, x28]\n" - "mov v3.16b, v15.16b\n" - "mov v2.16b, v15.16b\n" - "ldr q6, [%[wbptr], #128]\n" - "ldr x24, [%[outptrs], 24]\n" - "ldr x21, [%[inptrs], 0]\n" - "ldr x22, [%[inptrs], 40]\n" - "fmla v4.4s, v16.4s, v1.4s\n" - "ldr q0, [%[wbptr], #96]\n" - "ldr q17, [x21, x27]\n" - "ldr x20, [%[inptrs], 80]\n" - "fmla v8.4s, v17.4s, v14.4s\n" - "ldr q18, [x22, x27]\n" - "ldr q16, [x20, x27]\n" - "ldr x21, [%[inptrs], 8]\n" - "fmax v4.4s, v4.4s, v11.4s\n" - "ldr q1, [%[wbptr], #144]\n" - "ldr q11, [x21, x27]\n" - "ldr x22, [%[inptrs], 48]\n" - "fmla v8.4s, v18.4s, v13.4s\n" - "ldr x21, [%[inptrs], 16]\n" - "fmin v4.4s, v4.4s, v12.4s\n" - "ldr q19, [x22, x27]\n" - "ldr q12, [x21, x27]\n" - "ldr x23, [%[inptrs], 120]\n" - "ldr x20, [%[inptrs], 88]\n" - "subs x25, x25, #1\n" - "str q4, [x24, x28]\n" - "mov v4.16b, v15.16b\n" - "ldr q17, [x23, x27]\n" - "fmla v8.4s, v11.4s, v10.4s\n" - "ldr q15, [x20, x27]\n" - "add x28, x28, #16\n" - "fmla v8.4s, v16.4s, v9.4s\n" - "bne 2b\n" - "3:\n" - "fmla v3.4s, v16.4s, v14.4s\n" - "ldr x22, [%[inptrs], 56]\n" - "fmla v8.4s, v19.4s, v5.4s\n" - "ldr x21, [%[inptrs], 24]\n" - "fmla v2.4s, v12.4s, v14.4s\n" - "ldr q16, [x22, x27]\n" - "movi v11.16b, #0\n" - "ldr q18, [x21, x27]\n" - "fmla v3.4s, v17.4s, v13.4s\n" - "ldr x20, [%[inptrs], 160]\n" - "fmla v8.4s, v12.4s, v7.4s\n" - "ldr x23, [%[inptrs], 128]\n" - "fmla v2.4s, v16.4s, v13.4s\n" - "ldr q19, [x20, x27]\n" - "fmov v12.4s, #6.0\n" - "ldr q17, [x23, x27]\n" - "fmla v3.4s, v15.4s, v10.4s\n" - "ldr x20, [%[inptrs], 96]\n" - "fmla v8.4s, v15.4s, v6.4s\n" - "ldr x22, [%[inptrs], 64]\n" - "fmla v2.4s, v18.4s, v10.4s\n" - "ldr q15, [x20, x27]\n" - "fmla v4.4s, v15.4s, v14.4s\n" - "ldr q18, [x22, x27]\n" - "fmla v3.4s, v19.4s, v9.4s\n" - "ldr x21, [%[inptrs], 32]\n" - "fmla v8.4s, v16.4s, v0.4s\n" - "ldr x20, [%[inptrs], 168]\n" - "fmla v2.4s, v15.4s, v9.4s\n" - "ldr q19, [x21, x27]\n" - "ldr q16, [x20, x27]\n" - "ldr x23, [%[inptrs], 136]\n" - "fmla v3.4s, v17.4s, v5.4s\n" - "ldr x20, [%[inptrs], 104]\n" - "fmla v8.4s, v15.4s, v1.4s\n" - "ldr q14, [x23, x27]\n" - "fmla v2.4s, v18.4s, v5.4s\n" - "ldr q17, [x20, x27]\n" - "fmla v4.4s, v14.4s, v13.4s\n" - "ldr x22, [%[inptrs], 72]\n" - "fmla v3.4s, v15.4s, v7.4s\n" - "ldr x20, [%[inptrs], 176]\n" - "fmax v8.4s, v8.4s, v11.4s\n" - "ldr q18, [x22, x27]\n" - "fmla v2.4s, v19.4s, v7.4s\n" - "ldr q13, [x20, x27]\n" - "fmla v4.4s, v17.4s, v10.4s\n" - "ldr x23, [%[inptrs], 144]\n" - "fmla v3.4s, v16.4s, v6.4s\n" - "ldr x20, [%[inptrs], 112]\n" - "fmin v8.4s, v8.4s, v12.4s\n" - "ldr q10, [x23, x27]\n" - "fmla v2.4s, v17.4s, v6.4s\n" - "ldr q15, [x20, x27]\n" - "fmla v4.4s, v13.4s, v9.4s\n" - "ldr x20, [%[inptrs], 184]\n" - "fmla v3.4s, v14.4s, v0.4s\n" - "ldr x23, [%[inptrs], 152]\n" - "ldr q9, [x20, x27]\n" - "ldr x22, [%[outptrs], 0]\n" - "fmla v2.4s, v18.4s, v0.4s\n" - "ldr q19, [x23, x27]\n" - "str q8, [x22, x28]\n" - "fmla v4.4s, v10.4s, v5.4s\n" - "fmla v3.4s, v13.4s, v1.4s\n" - "ldr x20, [%[inptrs], 192]\n" - "ldr x22, [%[outptrs], 8]\n" - "ldr x24, [%[outptrs], 16]\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v2.4s, v15.4s, v1.4s\n" - "ldr q16, [x20, x27]\n" - "fmla v4.4s, v15.4s, v7.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmax v3.4s, v3.4s, v11.4s\n" - "add x27, x27, #16\n" - "fmax v2.4s, v2.4s, v11.4s\n" - "fmla v4.4s, v9.4s, v6.4s\n" - "fmin v3.4s, v3.4s, v12.4s\n" - "fmin v2.4s, v2.4s, v12.4s\n" - "str q3, [x24, x28]\n" - "fmla v4.4s, v19.4s, v0.4s\n" - "str q2, [x22, x28]\n" - "ldr x24, [%[outptrs], 24]\n" - "fmla v4.4s, v16.4s, v1.4s\n" - "fmax v4.4s, v4.4s, v11.4s\n" - "fmin v4.4s, v4.4s, v12.4s\n" - "str q4, [x24, x28]\n" - "add x28, x28, #16\n" - "4:\n" - "cbz x26, 7f\n" - "ldr s15, [%[wbptr]]\n" - "mov v8.16b, v15.16b\n" - "ldr s14, [%[wbptr], #4]\n" - "mov v3.16b, v15.16b\n" - "ldr s10, [%[wbptr], #8]\n" - "mov v2.16b, v15.16b\n" - "ldr s7, [%[wbptr], #12]\n" - "mov v4.16b, v15.16b\n" - "ldr s13, [%[wbptr], #16]\n" - "ldr s5, [%[wbptr], #20]\n" - "ldr x21, [%[inptrs], 0]\n" - "ldr s0, [%[wbptr], #24]\n" - "ldr x22, [%[inptrs], 40]\n" - "ldr s9, [%[wbptr], #28]\n" - "ldr x20, [%[inptrs], 80]\n" - "ldr s6, [%[wbptr], #32]\n" - "ldr x23, [%[inptrs], 120]\n" - "ldr s1, [%[wbptr], #36]\n" - "subs x26, x26, #1\n" - "ldr s17, [x21, x27]\n" - "ldr s18, [x22, x27]\n" - "fmla v8.4s, v17.4s, v14.4s\n" - "ldr s16, [x20, x27]\n" - "ldr s17, [x23, x27]\n" - "ldr x21, [%[inptrs], 8]\n" - "ldr x22, [%[inptrs], 48]\n" - "ldr x20, [%[inptrs], 88]\n" - "ldr s11, [x21, x27]\n" - "fmla v8.4s, v18.4s, v13.4s\n" - "ldr s19, [x22, x27]\n" - "ldr s15, [x20, x27]\n" - "ldr x21, [%[inptrs], 16]\n" - "ldr s12, [x21, x27]\n" - "fmla v8.4s, v11.4s, v10.4s\n" - "fmla v8.4s, v16.4s, v9.4s\n" - "beq 6f\n" - "5:\n" - "fmla v3.4s, v16.4s, v14.4s\n" - "ldr x22, [%[inptrs], 56]\n" - "fmla v8.4s, v19.4s, v5.4s\n" - "ldr x21, [%[inptrs], 24]\n" - "fmla v2.4s, v12.4s, v14.4s\n" - "ldr s16, [x22, x27]\n" - "movi v11.16b, #0\n" - "ldr s18, [x21, x27]\n" - "fmla v3.4s, v17.4s, v13.4s\n" - "ldr x20, [%[inptrs], 160]\n" - "fmla v8.4s, v12.4s, v7.4s\n" - "ldr x23, [%[inptrs], 128]\n" - "fmla v2.4s, v16.4s, v13.4s\n" - "ldr s19, [x20, x27]\n" - "fmov v12.4s, #6.0\n" - "ldr s17, [x23, x27]\n" - "fmla v3.4s, v15.4s, v10.4s\n" - "ldr x20, [%[inptrs], 96]\n" - "fmla v8.4s, v15.4s, v6.4s\n" - "ldr x22, [%[inptrs], 64]\n" - "fmla v2.4s, v18.4s, v10.4s\n" - "ldr s15, [x20, x27]\n" - "fmla v4.4s, v15.4s, v14.4s\n" - "ldr s18, [x22, x27]\n" - "fmla v3.4s, v19.4s, v9.4s\n" - "ldr x21, [%[inptrs], 32]\n" - "fmla v8.4s, v16.4s, v0.4s\n" - "ldr x20, [%[inptrs], 168]\n" - "fmla v2.4s, v15.4s, v9.4s\n" - "ldr s19, [x21, x27]\n" - "ldr s16, [x20, x27]\n" - "ldr x23, [%[inptrs], 136]\n" - "fmla v3.4s, v17.4s, v5.4s\n" - "ldr x20, [%[inptrs], 104]\n" - "fmla v8.4s, v15.4s, v1.4s\n" - "ldr s14, [x23, x27]\n" - "fmla v2.4s, v18.4s, v5.4s\n" - "ldr s17, [x20, x27]\n" - "fmla v4.4s, v14.4s, v13.4s\n" - "ldr x22, [%[inptrs], 72]\n" - "fmla v3.4s, v15.4s, v7.4s\n" - "ldr x20, [%[inptrs], 176]\n" - "fmax v8.4s, v8.4s, v11.4s\n" - "ldr s18, [x22, x27]\n" - "fmla v2.4s, v19.4s, v7.4s\n" - "ldr s13, [x20, x27]\n" - "fmla v4.4s, v17.4s, v10.4s\n" - "ldr x23, [%[inptrs], 144]\n" - "fmla v3.4s, v16.4s, v6.4s\n" - "ldr x20, [%[inptrs], 112]\n" - "fmin v8.4s, v8.4s, v12.4s\n" - "ldr s10, [x23, x27]\n" - "fmla v2.4s, v17.4s, v6.4s\n" - "ldr s15, [x20, x27]\n" - "fmla v4.4s, v13.4s, v9.4s\n" - "ldr x20, [%[inptrs], 184]\n" - "fmla v3.4s, v14.4s, v0.4s\n" - "ldr x23, [%[inptrs], 152]\n" - "ldr s9, [x20, x27]\n" - "ldr x22, [%[outptrs], 0]\n" - "fmla v2.4s, v18.4s, v0.4s\n" - "ldr s19, [x23, x27]\n" - "str s8, [x22, x28]\n" - "fmla v4.4s, v10.4s, v5.4s\n" - "fmla v3.4s, v13.4s, v1.4s\n" - "ldr x20, [%[inptrs], 192]\n" - "ldr x22, [%[outptrs], 8]\n" - "ldr x24, [%[outptrs], 16]\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v2.4s, v15.4s, v1.4s\n" - "ldr s16, [x20, x27]\n" - "fmla v4.4s, v15.4s, v7.4s\n" - "ldr s15, [%[wbptr]]\n" - "fmax v3.4s, v3.4s, v11.4s\n" - "ldr s14, [%[wbptr], #4]\n" - "mov v8.16b, v15.16b\n" - "ldr s10, [%[wbptr], #8]\n" - "fmax v2.4s, v2.4s, v11.4s\n" - "ldr s13, [%[wbptr], #16]\n" - "fmla v4.4s, v9.4s, v6.4s\n" - "ldr s7, [%[wbptr], #12]\n" - "fmin v3.4s, v3.4s, v12.4s\n" - "ldr s5, [%[wbptr], #20]\n" - "fmin v2.4s, v2.4s, v12.4s\n" - "ldr s9, [%[wbptr], #28]\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "add x27, x27, #4\n" - "str s3, [x24, x28]\n" - "fmla v4.4s, v19.4s, v0.4s\n" - "str s2, [x22, x28]\n" - "mov v3.16b, v15.16b\n" - "mov v2.16b, v15.16b\n" - "ldr s6, [%[wbptr], #32]\n" - "ldr x24, [%[outptrs], 24]\n" - "ldr x21, [%[inptrs], 0]\n" - "ldr x22, [%[inptrs], 40]\n" - "fmla v4.4s, v16.4s, v1.4s\n" - "ldr s0, [%[wbptr], #24]\n" - "ldr s17, [x21, x27]\n" - "ldr x20, [%[inptrs], 80]\n" - "fmla v8.4s, v17.4s, v14.4s\n" - "ldr s18, [x22, x27]\n" - "ldr s16, [x20, x27]\n" - "ldr x21, [%[inptrs], 8]\n" - "fmax v4.4s, v4.4s, v11.4s\n" - "ldr s1, [%[wbptr], #36]\n" - "ldr s11, [x21, x27]\n" - "ldr x22, [%[inptrs], 48]\n" - "fmla v8.4s, v18.4s, v13.4s\n" - "ldr x21, [%[inptrs], 16]\n" - "fmin v4.4s, v4.4s, v12.4s\n" - "ldr s19, [x22, x27]\n" - "ldr s12, [x21, x27]\n" - "ldr x23, [%[inptrs], 120]\n" - "ldr x20, [%[inptrs], 88]\n" - "subs x26, x26, #1\n" - "str s4, [x24, x28]\n" - "mov v4.16b, v15.16b\n" - "ldr s17, [x23, x27]\n" - "fmla v8.4s, v11.4s, v10.4s\n" - "ldr s15, [x20, x27]\n" - "add x28, x28, #4\n" - "fmla v8.4s, v16.4s, v9.4s\n" - "bne 5b\n" - "6:\n" - "fmla v3.4s, v16.4s, v14.4s\n" - "ldr x22, [%[inptrs], 56]\n" - "fmla v8.4s, v19.4s, v5.4s\n" - "ldr x21, [%[inptrs], 24]\n" - "fmla v2.4s, v12.4s, v14.4s\n" - "ldr s16, [x22, x27]\n" - "movi v11.16b, #0\n" - "ldr s18, [x21, x27]\n" - "fmla v3.4s, v17.4s, v13.4s\n" - "ldr x20, [%[inptrs], 160]\n" - "fmla v8.4s, v12.4s, v7.4s\n" - "ldr x23, [%[inptrs], 128]\n" - "fmla v2.4s, v16.4s, v13.4s\n" - "ldr s19, [x20, x27]\n" - "fmov v12.4s, #6.0\n" - "ldr s17, [x23, x27]\n" - "fmla v3.4s, v15.4s, v10.4s\n" - "ldr x20, [%[inptrs], 96]\n" - "fmla v8.4s, v15.4s, v6.4s\n" - "ldr x22, [%[inptrs], 64]\n" - "fmla v2.4s, v18.4s, v10.4s\n" - "ldr s15, [x20, x27]\n" - "fmla v4.4s, v15.4s, v14.4s\n" - "ldr s18, [x22, x27]\n" - "fmla v3.4s, v19.4s, v9.4s\n" - "ldr x21, [%[inptrs], 32]\n" - "fmla v8.4s, v16.4s, v0.4s\n" - "ldr x20, [%[inptrs], 168]\n" - "fmla v2.4s, v15.4s, v9.4s\n" - "ldr s19, [x21, x27]\n" - "ldr s16, [x20, x27]\n" - "ldr x23, [%[inptrs], 136]\n" - "fmla v3.4s, v17.4s, v5.4s\n" - "ldr x20, [%[inptrs], 104]\n" - "fmla v8.4s, v15.4s, v1.4s\n" - "ldr s14, [x23, x27]\n" - "fmla v2.4s, v18.4s, v5.4s\n" - "ldr s17, [x20, x27]\n" - "fmla v4.4s, v14.4s, v13.4s\n" - "ldr x22, [%[inptrs], 72]\n" - "fmla v3.4s, v15.4s, v7.4s\n" - "ldr x20, [%[inptrs], 176]\n" - "fmax v8.4s, v8.4s, v11.4s\n" - "ldr s18, [x22, x27]\n" - "fmla v2.4s, v19.4s, v7.4s\n" - "ldr s13, [x20, x27]\n" - "fmla v4.4s, v17.4s, v10.4s\n" - "ldr x23, [%[inptrs], 144]\n" - "fmla v3.4s, v16.4s, v6.4s\n" - "ldr x20, [%[inptrs], 112]\n" - "fmin v8.4s, v8.4s, v12.4s\n" - "ldr s10, [x23, x27]\n" - "fmla v2.4s, v17.4s, v6.4s\n" - "ldr s15, [x20, x27]\n" - "fmla v4.4s, v13.4s, v9.4s\n" - "ldr x20, [%[inptrs], 184]\n" - "fmla v3.4s, v14.4s, v0.4s\n" - "ldr x23, [%[inptrs], 152]\n" - "ldr s9, [x20, x27]\n" - "ldr x22, [%[outptrs], 0]\n" - "fmla v2.4s, v18.4s, v0.4s\n" - "ldr s19, [x23, x27]\n" - "str s8, [x22, x28]\n" - "fmla v4.4s, v10.4s, v5.4s\n" - "fmla v3.4s, v13.4s, v1.4s\n" - "ldr x20, [%[inptrs], 192]\n" - "ldr x22, [%[outptrs], 8]\n" - "ldr x24, [%[outptrs], 16]\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v2.4s, v15.4s, v1.4s\n" - "ldr s16, [x20, x27]\n" - "fmla v4.4s, v15.4s, v7.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmax v3.4s, v3.4s, v11.4s\n" - "add x27, x27, #4\n" - "fmax v2.4s, v2.4s, v11.4s\n" - "fmla v4.4s, v9.4s, v6.4s\n" - "fmin v3.4s, v3.4s, v12.4s\n" - "fmin v2.4s, v2.4s, v12.4s\n" - "str s3, [x24, x28]\n" - "fmla v4.4s, v19.4s, v0.4s\n" - "str s2, [x22, x28]\n" - "ldr x24, [%[outptrs], 24]\n" - "fmla v4.4s, v16.4s, v1.4s\n" - "fmax v4.4s, v4.4s, v11.4s\n" - "fmin v4.4s, v4.4s, v12.4s\n" - "str s4, [x24, x28]\n" - "add x28, x28, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr) - : [inptrs] "r" (inptrs), [outptrs] "r" (outptrs), [n_channels] "r" ((long) n_channels) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -#endif // __aarch64__ - -template class DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>; - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp deleted file mode 100644 index 2142c431ac..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp +++ /dev/null @@ -1,2341 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "impl_fp32_fp32.hpp" - -namespace depthwise -{ - -using namespace neon_convolution_kernels; -using Conv = DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>; - -#ifdef __aarch64__ -template <> -template <> -void Conv::execute_tile<ActivationFunction::None>( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x20, %[inptr0], %[input_row_stride]\n" - "add x13, %[input_col_stride1], %[input_col_stride1]\n" - "add x24, %[outptr0], %[output_row_stride]\n" - "add x21, x20, %[input_row_stride]\n" - "add x14, x13, #64\n" - "add x15, x13, %[input_col_stride1]\n" - "add x22, x21, %[input_row_stride]\n" - "add x16, x15, #64\n" - "add x17, x15, %[input_col_stride1]\n" - "add x23, x22, %[input_row_stride]\n" - "add x9, x17, #64\n" - "add x25, x24, %[output_row_stride]\n" - "add x26, %[output_col_stride1], %[output_col_stride1]\n" - "and x27, %[n_channels], #3\n" - "lsr x28, %[n_channels], #2\n" - "cbz x28, 4f\n" - "1:\n" - "ldr q25, [%[wbptr]]\n" - "subs x28, x28, #1\n" - "mov v17.16b, v25.16b\n" - "ldr q16, [%[wbptr], #16]\n" - "mov v13.16b, v25.16b\n" - "ldr q7, [%[wbptr], #32]\n" - "mov v15.16b, v25.16b\n" - "ldr q6, [%[wbptr], #48]\n" - "mov v10.16b, v25.16b\n" - "ldr q5, [%[wbptr], #64]\n" - "mov v12.16b, v25.16b\n" - "ldr q4, [%[wbptr], #80]\n" - "mov v14.16b, v25.16b\n" - "ldr q3, [%[wbptr], #96]\n" - "mov v9.16b, v25.16b\n" - "ldr q2, [%[wbptr], #112]\n" - "mov v11.16b, v25.16b\n" - "ldr q1, [%[wbptr], #128]\n" - "mov v8.16b, v25.16b\n" - "ldr q0, [%[wbptr], #144]\n" - "ldr q26, [%[inptr0]]\n" - "ldr q28, [x20]\n" - "fmla v17.4s, v26.4s, v16.4s\n" - "ldr q29, [%[inptr0], %[input_col_stride1]]\n" - "fmla v13.4s, v28.4s, v16.4s\n" - "ldr q27, [x21]\n" - "fmla v15.4s, v29.4s, v16.4s\n" - "ldr q21, [x20, %[input_col_stride1]]\n" - "fmla v17.4s, v28.4s, v5.4s\n" - "ldr q20, [%[inptr0], x13]\n" - "ldr q23, [x22]\n" - "ldr q19, [x21, %[input_col_stride1]]\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "prfm pldl1keep, [x20, #64]\n" - "fmla v17.4s, v29.4s, v7.4s\n" - "prfm pldl1keep, [%[inptr0], x19]\n" - "prfm pldl1keep, [x21, #64]\n" - "prfm pldl1keep, [x20, x19]\n" - "prfm pldl1keep, [%[inptr0], x14]\n" - "prfm pldl1keep, [x22, #64]\n" - "prfm pldl1keep, [x21, x19]\n" - "beq 3f\n" - "2:\n" - "fmla v17.4s, v27.4s, v2.4s\n" - "ldr q30, [x20, x13]\n" - "fmla v13.4s, v27.4s, v5.4s\n" - "ldr q29, [%[inptr0], x15]\n" - "fmla v10.4s, v27.4s, v16.4s\n" - "ldr q28, [x23]\n" - "fmla v17.4s, v21.4s, v4.4s\n" - "ldr q24, [x22, %[input_col_stride1]]\n" - "fmla v13.4s, v21.4s, v7.4s\n" - "ldr q18, [x21, x13]\n" - "fmla v15.4s, v21.4s, v5.4s\n" - "prfm pldl1keep, [x20, x14]\n" - "fmla v12.4s, v21.4s, v16.4s\n" - "ldr q22, [x20, x15]\n" - "fmla v17.4s, v20.4s, v6.4s\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "fmla v15.4s, v20.4s, v7.4s\n" - "prfm pldl1keep, [x23, #64]\n" - "fmla v14.4s, v20.4s, v16.4s\n" - "ldr q25, [%[inptr0], x17]\n" - "fmla v13.4s, v23.4s, v2.4s\n" - "prfm pldl1keep, [x22, x19]\n" - "fmla v10.4s, v23.4s, v5.4s\n" - "ldr q26, [x23, %[input_col_stride1]]\n" - "fmla v17.4s, v19.4s, v1.4s\n" - "prfm pldl1keep, [x21, x14]\n" - "fmla v13.4s, v19.4s, v4.4s\n" - "prfm pldl1keep, [x20, x16]\n" - "fmla v15.4s, v19.4s, v2.4s\n" - "prfm pldl1keep, [%[inptr0], x9]\n" - "fmla v10.4s, v19.4s, v7.4s\n" - "prfm pldl1keep, [x23, x19]\n" - "fmla v12.4s, v19.4s, v5.4s\n" - "prfm pldl1keep, [x22, x14]\n" - "fmla v9.4s, v19.4s, v16.4s\n" - "ldr q27, [x22, x13]\n" - "fmla v17.4s, v30.4s, v3.4s\n" - "prfm pldl1keep, [x21, x16]\n" - "fmla v13.4s, v30.4s, v6.4s\n" - "prfm pldl1keep, [x20, x9]\n" - "fmla v15.4s, v30.4s, v4.4s\n" - "prfm pldl1keep, [x23, x14]\n" - "fmla v12.4s, v30.4s, v7.4s\n" - "prfm pldl1keep, [x22, x16]\n" - "fmla v14.4s, v30.4s, v5.4s\n" - "prfm pldl1keep, [x21, x9]\n" - "fmla v11.4s, v30.4s, v16.4s\n" - "ldr q21, [x21, x15]\n" - "fmla v15.4s, v29.4s, v6.4s\n" - "prfm pldl1keep, [x23, x16]\n" - "fmla v14.4s, v29.4s, v7.4s\n" - "ldr q20, [x20, x17]\n" - "fmla v10.4s, v28.4s, v2.4s\n" - "ldr q19, [x23, x13]\n" - "fmla v13.4s, v24.4s, v1.4s\n" - "prfm pldl1keep, [x22, x9]\n" - "fmla v12.4s, v24.4s, v2.4s\n" - "prfm pldl1keep, [x23, x9]\n" - "fmla v10.4s, v24.4s, v4.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v9.4s, v24.4s, v5.4s\n" - "ldr q23, [x22, x15]\n" - "fmla v17.4s, v18.4s, v0.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v13.4s, v18.4s, v3.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v15.4s, v18.4s, v1.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "str q17, [%[outptr0]]\n" - "fmla v10.4s, v18.4s, v6.4s\n" - "fmla v12.4s, v18.4s, v4.4s\n" - "ldr q17, [x21, x17]\n" - "fmla v14.4s, v18.4s, v2.4s\n" - "prfm pldl1keep, [%[inptr0], x19]\n" - "fmla v9.4s, v18.4s, v7.4s\n" - "prfm pldl1keep, [%[inptr0], x14]\n" - "fmla v11.4s, v18.4s, v5.4s\n" - "add x20, x20, #16\n" - "fmla v8.4s, v18.4s, v16.4s\n" - "ldr q24, [x23, x15]\n" - "fmla v15.4s, v22.4s, v3.4s\n" - "ldr q18, [x22, x17]\n" - "fmla v12.4s, v22.4s, v6.4s\n" - "prfm pldl1keep, [x20, #64]\n" - "fmla v14.4s, v22.4s, v4.4s\n" - "prfm pldl1keep, [x20, x19]\n" - "fmla v11.4s, v22.4s, v7.4s\n" - "ldr q22, [x23, x17]\n" - "fmla v10.4s, v26.4s, v1.4s\n" - "add x21, x21, #16\n" - "fmla v14.4s, v25.4s, v6.4s\n" - "ldr q25, [%[wbptr]]\n" - "fmla v9.4s, v26.4s, v2.4s\n" - "ldr q16, [%[wbptr], #16]\n" - "fmla v13.4s, v27.4s, v0.4s\n" - "prfm pldl1keep, [x21, #64]\n" - "fmla v10.4s, v27.4s, v3.4s\n" - "prfm pldl1keep, [x21, x19]\n" - "fmla v12.4s, v27.4s, v1.4s\n" - "add x22, x22, #16\n" - "str q13, [x24]\n" - "fmla v9.4s, v27.4s, v4.4s\n" - "fmla v11.4s, v27.4s, v2.4s\n" - "ldr q26, [%[inptr0]]\n" - "fmla v8.4s, v27.4s, v5.4s\n" - "ldr q28, [x20]\n" - "fmla v15.4s, v21.4s, v0.4s\n" - "ldr q29, [%[inptr0], %[input_col_stride1]]\n" - "fmla v12.4s, v21.4s, v3.4s\n" - "prfm pldl1keep, [x22, #64]\n" - "fmla v14.4s, v21.4s, v1.4s\n" - "add x23, x23, #16\n" - "str q15, [%[outptr0], %[output_col_stride1]]\n" - "fmla v9.4s, v21.4s, v6.4s\n" - "fmla v11.4s, v21.4s, v4.4s\n" - "ldr q5, [%[wbptr], #64]\n" - "fmla v8.4s, v21.4s, v7.4s\n" - "ldr q27, [x21]\n" - "fmla v14.4s, v20.4s, v3.4s\n" - "ldr q21, [x20, %[input_col_stride1]]\n" - "fmla v11.4s, v20.4s, v6.4s\n" - "ldr q20, [%[inptr0], x13]\n" - "fmla v10.4s, v19.4s, v0.4s\n" - "subs x28, x28, #1\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "fmla v8.4s, v19.4s, v2.4s\n" - "fmla v12.4s, v23.4s, v0.4s\n" - "ldr q7, [%[wbptr], #32]\n" - "str q10, [x25]\n" - "fmla v11.4s, v23.4s, v1.4s\n" - "fmla v9.4s, v23.4s, v3.4s\n" - "ldr q2, [%[wbptr], #112]\n" - "str q12, [x24, %[output_col_stride1]]\n" - "fmla v8.4s, v23.4s, v4.4s\n" - "fmla v14.4s, v17.4s, v0.4s\n" - "ldr q23, [x22]\n" - "fmla v11.4s, v17.4s, v3.4s\n" - "ldr q19, [x21, %[input_col_stride1]]\n" - "fmla v8.4s, v17.4s, v6.4s\n" - "ldr q4, [%[wbptr], #80]\n" - "str q14, [%[outptr0], x26]\n" - "fmla v9.4s, v24.4s, v0.4s\n" - "fmla v11.4s, v18.4s, v0.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v8.4s, v24.4s, v1.4s\n" - "ldr q6, [%[wbptr], #48]\n" - "str q9, [x25, %[output_col_stride1]]\n" - "mov v17.16b, v25.16b\n" - "str q11, [x24, x26]\n" - "mov v13.16b, v25.16b\n" - "fmla v8.4s, v18.4s, v3.4s\n" - "ldr q1, [%[wbptr], #128]\n" - "mov v15.16b, v25.16b\n" - "add x24, x24, #16\n" - "mov v10.16b, v25.16b\n" - "mov v12.16b, v25.16b\n" - "fmla v8.4s, v22.4s, v0.4s\n" - "ldr q3, [%[wbptr], #96]\n" - "mov v14.16b, v25.16b\n" - "mov v9.16b, v25.16b\n" - "mov v11.16b, v25.16b\n" - "fmla v17.4s, v26.4s, v16.4s\n" - "str q8, [x25, x26]\n" - "fmla v13.4s, v28.4s, v16.4s\n" - "mov v8.16b, v25.16b\n" - "ldr q0, [%[wbptr], #144]\n" - "fmla v17.4s, v28.4s, v5.4s\n" - "fmla v15.4s, v29.4s, v16.4s\n" - "add x25, x25, #16\n" - "fmla v17.4s, v29.4s, v7.4s\n" - "bne 2b\n" - "3:\n" - "fmla v17.4s, v27.4s, v2.4s\n" - "ldr q30, [x20, x13]\n" - "fmla v13.4s, v27.4s, v5.4s\n" - "ldr q29, [%[inptr0], x15]\n" - "fmla v10.4s, v27.4s, v16.4s\n" - "ldr q28, [x23]\n" - "fmla v17.4s, v21.4s, v4.4s\n" - "ldr q24, [x22, %[input_col_stride1]]\n" - "fmla v13.4s, v21.4s, v7.4s\n" - "ldr q18, [x21, x13]\n" - "fmla v15.4s, v21.4s, v5.4s\n" - "prfm pldl1keep, [x20, x14]\n" - "fmla v12.4s, v21.4s, v16.4s\n" - "ldr q22, [x20, x15]\n" - "fmla v17.4s, v20.4s, v6.4s\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "fmla v15.4s, v20.4s, v7.4s\n" - "prfm pldl1keep, [x23, #64]\n" - "fmla v14.4s, v20.4s, v16.4s\n" - "ldr q25, [%[inptr0], x17]\n" - "fmla v13.4s, v23.4s, v2.4s\n" - "prfm pldl1keep, [x22, x19]\n" - "fmla v10.4s, v23.4s, v5.4s\n" - "ldr q26, [x23, %[input_col_stride1]]\n" - "fmla v17.4s, v19.4s, v1.4s\n" - "prfm pldl1keep, [x21, x14]\n" - "fmla v13.4s, v19.4s, v4.4s\n" - "prfm pldl1keep, [x20, x16]\n" - "fmla v15.4s, v19.4s, v2.4s\n" - "prfm pldl1keep, [%[inptr0], x9]\n" - "fmla v10.4s, v19.4s, v7.4s\n" - "prfm pldl1keep, [x23, x19]\n" - "fmla v12.4s, v19.4s, v5.4s\n" - "prfm pldl1keep, [x22, x14]\n" - "fmla v9.4s, v19.4s, v16.4s\n" - "ldr q27, [x22, x13]\n" - "fmla v17.4s, v30.4s, v3.4s\n" - "prfm pldl1keep, [x21, x16]\n" - "fmla v13.4s, v30.4s, v6.4s\n" - "prfm pldl1keep, [x20, x9]\n" - "fmla v15.4s, v30.4s, v4.4s\n" - "prfm pldl1keep, [x23, x14]\n" - "fmla v12.4s, v30.4s, v7.4s\n" - "prfm pldl1keep, [x22, x16]\n" - "fmla v14.4s, v30.4s, v5.4s\n" - "prfm pldl1keep, [x21, x9]\n" - "fmla v11.4s, v30.4s, v16.4s\n" - "ldr q21, [x21, x15]\n" - "fmla v15.4s, v29.4s, v6.4s\n" - "prfm pldl1keep, [x23, x16]\n" - "fmla v14.4s, v29.4s, v7.4s\n" - "ldr q20, [x20, x17]\n" - "fmla v10.4s, v28.4s, v2.4s\n" - "ldr q19, [x23, x13]\n" - "fmla v13.4s, v24.4s, v1.4s\n" - "prfm pldl1keep, [x22, x9]\n" - "fmla v12.4s, v24.4s, v2.4s\n" - "prfm pldl1keep, [x23, x9]\n" - "fmla v10.4s, v24.4s, v4.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v9.4s, v24.4s, v5.4s\n" - "ldr q23, [x22, x15]\n" - "fmla v17.4s, v18.4s, v0.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v13.4s, v18.4s, v3.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v15.4s, v18.4s, v1.4s\n" - "add x20, x20, #16\n" - "str q17, [%[outptr0]]\n" - "fmla v10.4s, v18.4s, v6.4s\n" - "fmla v12.4s, v18.4s, v4.4s\n" - "ldr q17, [x21, x17]\n" - "fmla v14.4s, v18.4s, v2.4s\n" - "add x21, x21, #16\n" - "fmla v9.4s, v18.4s, v7.4s\n" - "fmla v11.4s, v18.4s, v5.4s\n" - "fmla v8.4s, v18.4s, v16.4s\n" - "ldr q24, [x23, x15]\n" - "fmla v15.4s, v22.4s, v3.4s\n" - "ldr q18, [x22, x17]\n" - "fmla v12.4s, v22.4s, v6.4s\n" - "add x22, x22, #16\n" - "fmla v14.4s, v22.4s, v4.4s\n" - "fmla v11.4s, v22.4s, v7.4s\n" - "fmla v10.4s, v26.4s, v1.4s\n" - "ldr q22, [x23, x17]\n" - "fmla v9.4s, v26.4s, v2.4s\n" - "add x23, x23, #16\n" - "fmla v14.4s, v25.4s, v6.4s\n" - "fmla v13.4s, v27.4s, v0.4s\n" - "fmla v10.4s, v27.4s, v3.4s\n" - "fmla v12.4s, v27.4s, v1.4s\n" - "fmla v9.4s, v27.4s, v4.4s\n" - "fmla v11.4s, v27.4s, v2.4s\n" - "str q13, [x24]\n" - "fmla v8.4s, v27.4s, v5.4s\n" - "fmla v15.4s, v21.4s, v0.4s\n" - "fmla v12.4s, v21.4s, v3.4s\n" - "fmla v14.4s, v21.4s, v1.4s\n" - "fmla v9.4s, v21.4s, v6.4s\n" - "fmla v11.4s, v21.4s, v4.4s\n" - "fmla v8.4s, v21.4s, v7.4s\n" - "str q15, [%[outptr0], %[output_col_stride1]]\n" - "fmla v10.4s, v19.4s, v0.4s\n" - "fmla v14.4s, v20.4s, v3.4s\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "fmla v11.4s, v20.4s, v6.4s\n" - "fmla v8.4s, v19.4s, v2.4s\n" - "str q10, [x25]\n" - "fmla v12.4s, v23.4s, v0.4s\n" - "fmla v9.4s, v23.4s, v3.4s\n" - "fmla v14.4s, v17.4s, v0.4s\n" - "fmla v11.4s, v23.4s, v1.4s\n" - "fmla v8.4s, v23.4s, v4.4s\n" - "str q12, [x24, %[output_col_stride1]]\n" - "fmla v9.4s, v24.4s, v0.4s\n" - "str q14, [%[outptr0], x26]\n" - "fmla v11.4s, v17.4s, v3.4s\n" - "fmla v8.4s, v17.4s, v6.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "str q9, [x25, %[output_col_stride1]]\n" - "fmla v11.4s, v18.4s, v0.4s\n" - "fmla v8.4s, v24.4s, v1.4s\n" - "str q11, [x24, x26]\n" - "fmla v8.4s, v18.4s, v3.4s\n" - "add x24, x24, #16\n" - "fmla v8.4s, v22.4s, v0.4s\n" - "str q8, [x25, x26]\n" - "add x25, x25, #16\n" - "4:\n" - "cbz x27, 7f\n" - "ldr s25, [%[wbptr]]\n" - "mov v17.16b, v25.16b\n" - "ldr s16, [%[wbptr], #4]\n" - "mov v13.16b, v25.16b\n" - "ldr s7, [%[wbptr], #8]\n" - "mov v15.16b, v25.16b\n" - "ldr s6, [%[wbptr], #12]\n" - "mov v10.16b, v25.16b\n" - "ldr s5, [%[wbptr], #16]\n" - "mov v12.16b, v25.16b\n" - "ldr s4, [%[wbptr], #20]\n" - "mov v14.16b, v25.16b\n" - "ldr s3, [%[wbptr], #24]\n" - "mov v9.16b, v25.16b\n" - "ldr s2, [%[wbptr], #28]\n" - "mov v11.16b, v25.16b\n" - "ldr s1, [%[wbptr], #32]\n" - "mov v8.16b, v25.16b\n" - "ldr s0, [%[wbptr], #36]\n" - "ldr s26, [%[inptr0]]\n" - "subs x27, x27, #1\n" - "fmla v17.4s, v26.4s, v16.4s\n" - "ldr s28, [x20]\n" - "fmla v13.4s, v28.4s, v16.4s\n" - "ldr s29, [%[inptr0], %[input_col_stride1]]\n" - "fmla v15.4s, v29.4s, v16.4s\n" - "ldr s27, [x21]\n" - "fmla v17.4s, v28.4s, v5.4s\n" - "ldr s21, [x20, %[input_col_stride1]]\n" - "ldr s20, [%[inptr0], x13]\n" - "ldr s23, [x22]\n" - "ldr s19, [x21, %[input_col_stride1]]\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "fmla v17.4s, v29.4s, v7.4s\n" - "prfm pldl1keep, [x20, #64]\n" - "prfm pldl1keep, [%[inptr0], x19]\n" - "prfm pldl1keep, [x21, #64]\n" - "prfm pldl1keep, [x20, x19]\n" - "prfm pldl1keep, [%[inptr0], x14]\n" - "prfm pldl1keep, [x22, #64]\n" - "prfm pldl1keep, [x21, x19]\n" - "beq 6f\n" - "5:\n" - "fmla v17.4s, v27.4s, v2.4s\n" - "ldr s30, [x20, x13]\n" - "fmla v13.4s, v27.4s, v5.4s\n" - "ldr s29, [%[inptr0], x15]\n" - "fmla v10.4s, v27.4s, v16.4s\n" - "ldr s28, [x23]\n" - "fmla v17.4s, v21.4s, v4.4s\n" - "ldr s24, [x22, %[input_col_stride1]]\n" - "fmla v13.4s, v21.4s, v7.4s\n" - "ldr s18, [x21, x13]\n" - "fmla v15.4s, v21.4s, v5.4s\n" - "prfm pldl1keep, [x20, x14]\n" - "fmla v12.4s, v21.4s, v16.4s\n" - "ldr s22, [x20, x15]\n" - "fmla v17.4s, v20.4s, v6.4s\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "fmla v15.4s, v20.4s, v7.4s\n" - "prfm pldl1keep, [x23, #64]\n" - "fmla v14.4s, v20.4s, v16.4s\n" - "ldr s25, [%[inptr0], x17]\n" - "fmla v13.4s, v23.4s, v2.4s\n" - "prfm pldl1keep, [x22, x19]\n" - "fmla v10.4s, v23.4s, v5.4s\n" - "ldr s26, [x23, %[input_col_stride1]]\n" - "fmla v17.4s, v19.4s, v1.4s\n" - "prfm pldl1keep, [x21, x14]\n" - "fmla v13.4s, v19.4s, v4.4s\n" - "prfm pldl1keep, [x20, x16]\n" - "fmla v15.4s, v19.4s, v2.4s\n" - "prfm pldl1keep, [%[inptr0], x9]\n" - "fmla v10.4s, v19.4s, v7.4s\n" - "prfm pldl1keep, [x23, x19]\n" - "fmla v12.4s, v19.4s, v5.4s\n" - "prfm pldl1keep, [x22, x14]\n" - "fmla v9.4s, v19.4s, v16.4s\n" - "ldr s27, [x22, x13]\n" - "fmla v17.4s, v30.4s, v3.4s\n" - "prfm pldl1keep, [x21, x16]\n" - "fmla v13.4s, v30.4s, v6.4s\n" - "prfm pldl1keep, [x20, x9]\n" - "fmla v15.4s, v30.4s, v4.4s\n" - "prfm pldl1keep, [x23, x14]\n" - "fmla v12.4s, v30.4s, v7.4s\n" - "prfm pldl1keep, [x22, x16]\n" - "fmla v14.4s, v30.4s, v5.4s\n" - "prfm pldl1keep, [x21, x9]\n" - "fmla v11.4s, v30.4s, v16.4s\n" - "ldr s21, [x21, x15]\n" - "fmla v15.4s, v29.4s, v6.4s\n" - "prfm pldl1keep, [x23, x16]\n" - "fmla v14.4s, v29.4s, v7.4s\n" - "ldr s20, [x20, x17]\n" - "fmla v10.4s, v28.4s, v2.4s\n" - "ldr s19, [x23, x13]\n" - "fmla v13.4s, v24.4s, v1.4s\n" - "prfm pldl1keep, [x22, x9]\n" - "fmla v12.4s, v24.4s, v2.4s\n" - "prfm pldl1keep, [x23, x9]\n" - "fmla v10.4s, v24.4s, v4.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v9.4s, v24.4s, v5.4s\n" - "ldr s23, [x22, x15]\n" - "fmla v17.4s, v18.4s, v0.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v13.4s, v18.4s, v3.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v15.4s, v18.4s, v1.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "str s17, [%[outptr0]]\n" - "fmla v10.4s, v18.4s, v6.4s\n" - "fmla v12.4s, v18.4s, v4.4s\n" - "ldr s17, [x21, x17]\n" - "fmla v14.4s, v18.4s, v2.4s\n" - "prfm pldl1keep, [%[inptr0], x19]\n" - "fmla v9.4s, v18.4s, v7.4s\n" - "prfm pldl1keep, [%[inptr0], x14]\n" - "fmla v11.4s, v18.4s, v5.4s\n" - "add x20, x20, #4\n" - "fmla v8.4s, v18.4s, v16.4s\n" - "ldr s24, [x23, x15]\n" - "fmla v15.4s, v22.4s, v3.4s\n" - "ldr s18, [x22, x17]\n" - "fmla v12.4s, v22.4s, v6.4s\n" - "prfm pldl1keep, [x20, #64]\n" - "fmla v14.4s, v22.4s, v4.4s\n" - "prfm pldl1keep, [x20, x19]\n" - "fmla v11.4s, v22.4s, v7.4s\n" - "ldr s22, [x23, x17]\n" - "fmla v10.4s, v26.4s, v1.4s\n" - "add x21, x21, #4\n" - "fmla v14.4s, v25.4s, v6.4s\n" - "ldr s25, [%[wbptr]]\n" - "fmla v9.4s, v26.4s, v2.4s\n" - "ldr s16, [%[wbptr], #4]\n" - "fmla v13.4s, v27.4s, v0.4s\n" - "prfm pldl1keep, [x21, #64]\n" - "fmla v10.4s, v27.4s, v3.4s\n" - "prfm pldl1keep, [x21, x19]\n" - "fmla v12.4s, v27.4s, v1.4s\n" - "add x22, x22, #4\n" - "str s13, [x24]\n" - "fmla v9.4s, v27.4s, v4.4s\n" - "fmla v11.4s, v27.4s, v2.4s\n" - "ldr s26, [%[inptr0]]\n" - "fmla v8.4s, v27.4s, v5.4s\n" - "ldr s28, [x20]\n" - "fmla v15.4s, v21.4s, v0.4s\n" - "ldr s29, [%[inptr0], %[input_col_stride1]]\n" - "fmla v12.4s, v21.4s, v3.4s\n" - "prfm pldl1keep, [x22, #64]\n" - "fmla v14.4s, v21.4s, v1.4s\n" - "add x23, x23, #4\n" - "str s15, [%[outptr0], %[output_col_stride1]]\n" - "fmla v9.4s, v21.4s, v6.4s\n" - "fmla v11.4s, v21.4s, v4.4s\n" - "ldr s5, [%[wbptr], #16]\n" - "fmla v8.4s, v21.4s, v7.4s\n" - "ldr s27, [x21]\n" - "fmla v14.4s, v20.4s, v3.4s\n" - "ldr s21, [x20, %[input_col_stride1]]\n" - "fmla v11.4s, v20.4s, v6.4s\n" - "ldr s20, [%[inptr0], x13]\n" - "fmla v10.4s, v19.4s, v0.4s\n" - "subs x27, x27, #1\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "fmla v8.4s, v19.4s, v2.4s\n" - "fmla v12.4s, v23.4s, v0.4s\n" - "ldr s7, [%[wbptr], #8]\n" - "str s10, [x25]\n" - "fmla v11.4s, v23.4s, v1.4s\n" - "fmla v9.4s, v23.4s, v3.4s\n" - "ldr s2, [%[wbptr], #28]\n" - "str s12, [x24, %[output_col_stride1]]\n" - "fmla v8.4s, v23.4s, v4.4s\n" - "fmla v14.4s, v17.4s, v0.4s\n" - "ldr s23, [x22]\n" - "fmla v11.4s, v17.4s, v3.4s\n" - "ldr s19, [x21, %[input_col_stride1]]\n" - "fmla v8.4s, v17.4s, v6.4s\n" - "ldr s4, [%[wbptr], #20]\n" - "str s14, [%[outptr0], x26]\n" - "fmla v9.4s, v24.4s, v0.4s\n" - "fmla v11.4s, v18.4s, v0.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v8.4s, v24.4s, v1.4s\n" - "ldr s6, [%[wbptr], #12]\n" - "str s9, [x25, %[output_col_stride1]]\n" - "mov v17.16b, v25.16b\n" - "str s11, [x24, x26]\n" - "mov v13.16b, v25.16b\n" - "fmla v8.4s, v18.4s, v3.4s\n" - "ldr s1, [%[wbptr], #32]\n" - "mov v15.16b, v25.16b\n" - "add x24, x24, #4\n" - "mov v10.16b, v25.16b\n" - "mov v12.16b, v25.16b\n" - "fmla v8.4s, v22.4s, v0.4s\n" - "ldr s3, [%[wbptr], #24]\n" - "mov v14.16b, v25.16b\n" - "mov v9.16b, v25.16b\n" - "mov v11.16b, v25.16b\n" - "fmla v17.4s, v26.4s, v16.4s\n" - "str s8, [x25, x26]\n" - "fmla v13.4s, v28.4s, v16.4s\n" - "mov v8.16b, v25.16b\n" - "ldr s0, [%[wbptr], #36]\n" - "fmla v17.4s, v28.4s, v5.4s\n" - "fmla v15.4s, v29.4s, v16.4s\n" - "add x25, x25, #4\n" - "fmla v17.4s, v29.4s, v7.4s\n" - "bne 5b\n" - "6:\n" - "fmla v17.4s, v27.4s, v2.4s\n" - "ldr s30, [x20, x13]\n" - "fmla v13.4s, v27.4s, v5.4s\n" - "ldr s29, [%[inptr0], x15]\n" - "fmla v10.4s, v27.4s, v16.4s\n" - "ldr s28, [x23]\n" - "fmla v17.4s, v21.4s, v4.4s\n" - "ldr s24, [x22, %[input_col_stride1]]\n" - "fmla v13.4s, v21.4s, v7.4s\n" - "ldr s18, [x21, x13]\n" - "fmla v15.4s, v21.4s, v5.4s\n" - "prfm pldl1keep, [x20, x14]\n" - "fmla v12.4s, v21.4s, v16.4s\n" - "ldr s22, [x20, x15]\n" - "fmla v17.4s, v20.4s, v6.4s\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "fmla v15.4s, v20.4s, v7.4s\n" - "prfm pldl1keep, [x23, #64]\n" - "fmla v14.4s, v20.4s, v16.4s\n" - "ldr s25, [%[inptr0], x17]\n" - "fmla v13.4s, v23.4s, v2.4s\n" - "prfm pldl1keep, [x22, x19]\n" - "fmla v10.4s, v23.4s, v5.4s\n" - "ldr s26, [x23, %[input_col_stride1]]\n" - "fmla v17.4s, v19.4s, v1.4s\n" - "prfm pldl1keep, [x21, x14]\n" - "fmla v13.4s, v19.4s, v4.4s\n" - "prfm pldl1keep, [x20, x16]\n" - "fmla v15.4s, v19.4s, v2.4s\n" - "prfm pldl1keep, [%[inptr0], x9]\n" - "fmla v10.4s, v19.4s, v7.4s\n" - "prfm pldl1keep, [x23, x19]\n" - "fmla v12.4s, v19.4s, v5.4s\n" - "prfm pldl1keep, [x22, x14]\n" - "fmla v9.4s, v19.4s, v16.4s\n" - "ldr s27, [x22, x13]\n" - "fmla v17.4s, v30.4s, v3.4s\n" - "prfm pldl1keep, [x21, x16]\n" - "fmla v13.4s, v30.4s, v6.4s\n" - "prfm pldl1keep, [x20, x9]\n" - "fmla v15.4s, v30.4s, v4.4s\n" - "prfm pldl1keep, [x23, x14]\n" - "fmla v12.4s, v30.4s, v7.4s\n" - "prfm pldl1keep, [x22, x16]\n" - "fmla v14.4s, v30.4s, v5.4s\n" - "prfm pldl1keep, [x21, x9]\n" - "fmla v11.4s, v30.4s, v16.4s\n" - "ldr s21, [x21, x15]\n" - "fmla v15.4s, v29.4s, v6.4s\n" - "prfm pldl1keep, [x23, x16]\n" - "fmla v14.4s, v29.4s, v7.4s\n" - "ldr s20, [x20, x17]\n" - "fmla v10.4s, v28.4s, v2.4s\n" - "ldr s19, [x23, x13]\n" - "fmla v13.4s, v24.4s, v1.4s\n" - "prfm pldl1keep, [x22, x9]\n" - "fmla v12.4s, v24.4s, v2.4s\n" - "prfm pldl1keep, [x23, x9]\n" - "fmla v10.4s, v24.4s, v4.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v9.4s, v24.4s, v5.4s\n" - "ldr s23, [x22, x15]\n" - "fmla v17.4s, v18.4s, v0.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v13.4s, v18.4s, v3.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v15.4s, v18.4s, v1.4s\n" - "add x20, x20, #4\n" - "str s17, [%[outptr0]]\n" - "fmla v10.4s, v18.4s, v6.4s\n" - "fmla v12.4s, v18.4s, v4.4s\n" - "ldr s17, [x21, x17]\n" - "fmla v14.4s, v18.4s, v2.4s\n" - "add x21, x21, #4\n" - "fmla v9.4s, v18.4s, v7.4s\n" - "fmla v11.4s, v18.4s, v5.4s\n" - "fmla v8.4s, v18.4s, v16.4s\n" - "ldr s24, [x23, x15]\n" - "fmla v15.4s, v22.4s, v3.4s\n" - "ldr s18, [x22, x17]\n" - "fmla v12.4s, v22.4s, v6.4s\n" - "add x22, x22, #4\n" - "fmla v14.4s, v22.4s, v4.4s\n" - "fmla v11.4s, v22.4s, v7.4s\n" - "fmla v10.4s, v26.4s, v1.4s\n" - "ldr s22, [x23, x17]\n" - "fmla v9.4s, v26.4s, v2.4s\n" - "add x23, x23, #4\n" - "fmla v14.4s, v25.4s, v6.4s\n" - "fmla v13.4s, v27.4s, v0.4s\n" - "fmla v10.4s, v27.4s, v3.4s\n" - "fmla v12.4s, v27.4s, v1.4s\n" - "fmla v9.4s, v27.4s, v4.4s\n" - "fmla v11.4s, v27.4s, v2.4s\n" - "str s13, [x24]\n" - "fmla v8.4s, v27.4s, v5.4s\n" - "fmla v15.4s, v21.4s, v0.4s\n" - "fmla v12.4s, v21.4s, v3.4s\n" - "fmla v14.4s, v21.4s, v1.4s\n" - "fmla v9.4s, v21.4s, v6.4s\n" - "fmla v11.4s, v21.4s, v4.4s\n" - "fmla v8.4s, v21.4s, v7.4s\n" - "str s15, [%[outptr0], %[output_col_stride1]]\n" - "fmla v10.4s, v19.4s, v0.4s\n" - "fmla v14.4s, v20.4s, v3.4s\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "fmla v11.4s, v20.4s, v6.4s\n" - "fmla v8.4s, v19.4s, v2.4s\n" - "str s10, [x25]\n" - "fmla v12.4s, v23.4s, v0.4s\n" - "fmla v9.4s, v23.4s, v3.4s\n" - "fmla v14.4s, v17.4s, v0.4s\n" - "fmla v11.4s, v23.4s, v1.4s\n" - "fmla v8.4s, v23.4s, v4.4s\n" - "str s12, [x24, %[output_col_stride1]]\n" - "fmla v9.4s, v24.4s, v0.4s\n" - "str s14, [%[outptr0], x26]\n" - "fmla v11.4s, v17.4s, v3.4s\n" - "fmla v8.4s, v17.4s, v6.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "str s9, [x25, %[output_col_stride1]]\n" - "fmla v11.4s, v18.4s, v0.4s\n" - "fmla v8.4s, v24.4s, v1.4s\n" - "str s11, [x24, x26]\n" - "fmla v8.4s, v18.4s, v3.4s\n" - "add x24, x24, #4\n" - "fmla v8.4s, v22.4s, v0.4s\n" - "str s8, [x25, x26]\n" - "add x25, x25, #4\n" - "7:\n" - : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr) - : [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile<ActivationFunction::ReLU>( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x25, %[inptr0], %[input_row_stride]\n" - "add x16, %[input_col_stride1], %[input_col_stride1]\n" - "add x21, %[outptr0], %[output_row_stride]\n" - "add x22, x25, %[input_row_stride]\n" - "add x23, x16, #64\n" - "add x26, x16, %[input_col_stride1]\n" - "add x13, x22, %[input_row_stride]\n" - "add x20, x26, #64\n" - "add x9, x26, %[input_col_stride1]\n" - "add x24, x13, %[input_row_stride]\n" - "add x15, x9, #64\n" - "add x14, x21, %[output_row_stride]\n" - "add x19, %[output_col_stride1], %[output_col_stride1]\n" - "and x27, %[n_channels], #3\n" - "lsr x28, %[n_channels], #2\n" - "cbz x28, 4f\n" - "1:\n" - "ldr q20, [%[wbptr]]\n" - "subs x28, x28, #1\n" - "mov v4.16b, v20.16b\n" - "ldr q15, [%[wbptr], #16]\n" - "mov v1.16b, v20.16b\n" - "ldr q0, [%[wbptr], #32]\n" - "mov v3.16b, v20.16b\n" - "ldr q13, [%[wbptr], #48]\n" - "mov v7.16b, v20.16b\n" - "ldr q16, [%[wbptr], #64]\n" - "mov v9.16b, v20.16b\n" - "ldr q12, [%[wbptr], #80]\n" - "mov v2.16b, v20.16b\n" - "ldr q17, [%[wbptr], #96]\n" - "mov v6.16b, v20.16b\n" - "ldr q11, [%[wbptr], #112]\n" - "mov v8.16b, v20.16b\n" - "ldr q10, [%[wbptr], #128]\n" - "mov v5.16b, v20.16b\n" - "ldr q14, [%[wbptr], #144]\n" - "ldr q27, [%[inptr0]]\n" - "ldr q24, [x25]\n" - "fmla v4.4s, v27.4s, v15.4s\n" - "ldr q22, [%[inptr0], %[input_col_stride1]]\n" - "ldr q21, [x22]\n" - "ldr q19, [x25, %[input_col_stride1]]\n" - "ldr q31, [%[inptr0], x16]\n" - "ldr q28, [x13]\n" - "fmla v4.4s, v24.4s, v16.4s\n" - "ldr q18, [x22, %[input_col_stride1]]\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "prfm pldl1keep, [x25, #64]\n" - "prfm pldl1keep, [%[inptr0], x17]\n" - "prfm pldl1keep, [x22, #64]\n" - "prfm pldl1keep, [x25, x17]\n" - "prfm pldl1keep, [%[inptr0], x23]\n" - "prfm pldl1keep, [x13, #64]\n" - "prfm pldl1keep, [x22, x17]\n" - "beq 3f\n" - "2:\n" - "fmla v1.4s, v24.4s, v15.4s\n" - "ldr q24, [x25, x16]\n" - "fmla v4.4s, v22.4s, v0.4s\n" - "ldr q29, [%[inptr0], x26]\n" - "fmla v3.4s, v22.4s, v15.4s\n" - "ldr q30, [x24]\n" - "fmla v1.4s, v21.4s, v16.4s\n" - "ldr q25, [x13, %[input_col_stride1]]\n" - "fmla v4.4s, v21.4s, v11.4s\n" - "prfm pldl1keep, [x25, x23]\n" - "fmla v7.4s, v21.4s, v15.4s\n" - "ldr q26, [x22, x16]\n" - "fmla v1.4s, v19.4s, v0.4s\n" - "prfm pldl1keep, [%[inptr0], x20]\n" - "fmla v4.4s, v19.4s, v12.4s\n" - "prfm pldl1keep, [x24, #64]\n" - "fmla v3.4s, v19.4s, v16.4s\n" - "prfm pldl1keep, [x13, x17]\n" - "fmla v9.4s, v19.4s, v15.4s\n" - "ldr q23, [x25, x26]\n" - "fmla v4.4s, v31.4s, v13.4s\n" - "prfm pldl1keep, [x22, x23]\n" - "fmla v3.4s, v31.4s, v0.4s\n" - "prfm pldl1keep, [x25, x20]\n" - "fmla v2.4s, v31.4s, v15.4s\n" - "ldr q20, [%[inptr0], x9]\n" - "fmla v1.4s, v28.4s, v11.4s\n" - "prfm pldl1keep, [%[inptr0], x15]\n" - "fmla v7.4s, v28.4s, v16.4s\n" - "ldr q28, [x24, %[input_col_stride1]]\n" - "fmla v4.4s, v18.4s, v10.4s\n" - "prfm pldl1keep, [x24, x17]\n" - "fmla v1.4s, v18.4s, v12.4s\n" - "prfm pldl1keep, [x13, x23]\n" - "fmla v3.4s, v18.4s, v11.4s\n" - "prfm pldl1keep, [x22, x20]\n" - "fmla v7.4s, v18.4s, v0.4s\n" - "prfm pldl1keep, [x25, x15]\n" - "fmla v9.4s, v18.4s, v16.4s\n" - "prfm pldl1keep, [x24, x23]\n" - "fmla v6.4s, v18.4s, v15.4s\n" - "ldr q27, [x13, x16]\n" - "fmla v4.4s, v24.4s, v17.4s\n" - "prfm pldl1keep, [x13, x20]\n" - "fmla v1.4s, v24.4s, v13.4s\n" - "prfm pldl1keep, [x22, x15]\n" - "fmla v3.4s, v24.4s, v12.4s\n" - "prfm pldl1keep, [x24, x20]\n" - "fmla v9.4s, v24.4s, v0.4s\n" - "prfm pldl1keep, [x13, x15]\n" - "fmla v2.4s, v24.4s, v16.4s\n" - "prfm pldl1keep, [x24, x15]\n" - "fmla v8.4s, v24.4s, v15.4s\n" - "ldr q24, [x22, x26]\n" - "fmla v3.4s, v29.4s, v13.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v2.4s, v29.4s, v0.4s\n" - "ldr q22, [x25, x9]\n" - "fmla v7.4s, v30.4s, v11.4s\n" - "ldr q21, [x24, x16]\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v9.4s, v25.4s, v11.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v7.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "fmla v6.4s, v25.4s, v16.4s\n" - "ldr q19, [x13, x26]\n" - "fmla v4.4s, v26.4s, v14.4s\n" - "prfm pldl1keep, [%[inptr0], x17]\n" - "fmla v1.4s, v26.4s, v17.4s\n" - "prfm pldl1keep, [%[inptr0], x23]\n" - "fmla v3.4s, v26.4s, v10.4s\n" - "add x25, x25, #16\n" - "fmla v7.4s, v26.4s, v13.4s\n" - "prfm pldl1keep, [x25, #64]\n" - "fmla v9.4s, v26.4s, v12.4s\n" - "prfm pldl1keep, [x25, x17]\n" - "fmla v2.4s, v26.4s, v11.4s\n" - "subs x28, x28, #1\n" - "fmla v6.4s, v26.4s, v0.4s\n" - "fmla v8.4s, v26.4s, v16.4s\n" - "fmla v5.4s, v26.4s, v15.4s\n" - "ldr q26, [x22, x9]\n" - "fmla v3.4s, v23.4s, v17.4s\n" - "ldr q18, [x24, x26]\n" - "fmla v9.4s, v23.4s, v13.4s\n" - "add x22, x22, #16\n" - "fmla v2.4s, v23.4s, v12.4s\n" - "prfm pldl1keep, [x22, #64]\n" - "fmla v8.4s, v23.4s, v0.4s\n" - "ldr q23, [x13, x9]\n" - "fmla v7.4s, v28.4s, v10.4s\n" - "prfm pldl1keep, [x22, x17]\n" - "fmla v2.4s, v20.4s, v13.4s\n" - "ldr q25, [x24, x9]\n" - "fmla v6.4s, v28.4s, v11.4s\n" - "ldr q20, [%[wbptr]]\n" - "fmla v1.4s, v27.4s, v14.4s\n" - "add x13, x13, #16\n" - "fmla v7.4s, v27.4s, v17.4s\n" - "prfm pldl1keep, [x13, #64]\n" - "fmla v9.4s, v27.4s, v10.4s\n" - "add x24, x24, #16\n" - "fmla v6.4s, v27.4s, v12.4s\n" - "fmla v8.4s, v27.4s, v11.4s\n" - "fmla v5.4s, v27.4s, v16.4s\n" - "ldr q15, [%[wbptr], #16]\n" - "fmla v3.4s, v24.4s, v14.4s\n" - "ldr q27, [%[inptr0]]\n" - "fmla v9.4s, v24.4s, v17.4s\n" - "fmla v2.4s, v24.4s, v10.4s\n" - "fmla v6.4s, v24.4s, v13.4s\n" - "fmla v8.4s, v24.4s, v12.4s\n" - "fmla v5.4s, v24.4s, v0.4s\n" - "ldr q16, [%[wbptr], #64]\n" - "fmla v2.4s, v22.4s, v17.4s\n" - "ldr q24, [x25]\n" - "fmla v8.4s, v22.4s, v13.4s\n" - "ldr q22, [%[inptr0], %[input_col_stride1]]\n" - "fmla v7.4s, v21.4s, v14.4s\n" - "fmla v6.4s, v21.4s, v10.4s\n" - "fmla v5.4s, v21.4s, v11.4s\n" - "ldr q0, [%[wbptr], #32]\n" - "fmla v9.4s, v19.4s, v14.4s\n" - "ldr q21, [x22]\n" - "fmla v6.4s, v19.4s, v17.4s\n" - "fmla v8.4s, v19.4s, v10.4s\n" - "fmla v5.4s, v19.4s, v12.4s\n" - "ldr q11, [%[wbptr], #112]\n" - "fmla v2.4s, v26.4s, v14.4s\n" - "movi v29.16b, #0\n" - "fmla v8.4s, v26.4s, v17.4s\n" - "fmla v6.4s, v18.4s, v14.4s\n" - "fmla v5.4s, v26.4s, v13.4s\n" - "ldr q12, [%[wbptr], #80]\n" - "fmax v4.4s, v4.4s, v29.4s\n" - "ldr q19, [x25, %[input_col_stride1]]\n" - "fmla v8.4s, v23.4s, v14.4s\n" - "fmax v3.4s, v3.4s, v29.4s\n" - "str q4, [%[outptr0]]\n" - "fmla v5.4s, v18.4s, v10.4s\n" - "str q3, [%[outptr0], %[output_col_stride1]]\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "ldr q13, [%[wbptr], #48]\n" - "str q2, [%[outptr0], x19]\n" - "fmla v5.4s, v23.4s, v17.4s\n" - "str q1, [x21]\n" - "fmax v9.4s, v9.4s, v29.4s\n" - "fmax v8.4s, v8.4s, v29.4s\n" - "ldr q10, [%[wbptr], #128]\n" - "str q9, [x21, %[output_col_stride1]]\n" - "fmla v5.4s, v25.4s, v14.4s\n" - "str q8, [x21, x19]\n" - "fmax v7.4s, v7.4s, v29.4s\n" - "fmax v6.4s, v6.4s, v29.4s\n" - "ldr q17, [%[wbptr], #96]\n" - "str q7, [x14]\n" - "fmax v5.4s, v5.4s, v29.4s\n" - "str q6, [x14, %[output_col_stride1]]\n" - "mov v4.16b, v20.16b\n" - "str q5, [x14, x19]\n" - "mov v1.16b, v20.16b\n" - "mov v3.16b, v20.16b\n" - "ldr q14, [%[wbptr], #144]\n" - "mov v7.16b, v20.16b\n" - "ldr q31, [%[inptr0], x16]\n" - "mov v9.16b, v20.16b\n" - "ldr q28, [x13]\n" - "mov v2.16b, v20.16b\n" - "ldr q18, [x22, %[input_col_stride1]]\n" - "mov v6.16b, v20.16b\n" - "add %[outptr0], %[outptr0], #16\n" - "mov v8.16b, v20.16b\n" - "add x21, x21, #16\n" - "mov v5.16b, v20.16b\n" - "add x14, x14, #16\n" - "fmla v4.4s, v27.4s, v15.4s\n" - "fmla v4.4s, v24.4s, v16.4s\n" - "bne 2b\n" - "3:\n" - "fmla v1.4s, v24.4s, v15.4s\n" - "ldr q24, [x25, x16]\n" - "fmla v4.4s, v22.4s, v0.4s\n" - "ldr q29, [%[inptr0], x26]\n" - "fmla v3.4s, v22.4s, v15.4s\n" - "ldr q30, [x24]\n" - "fmla v1.4s, v21.4s, v16.4s\n" - "ldr q25, [x13, %[input_col_stride1]]\n" - "fmla v4.4s, v21.4s, v11.4s\n" - "prfm pldl1keep, [x25, x23]\n" - "fmla v7.4s, v21.4s, v15.4s\n" - "ldr q26, [x22, x16]\n" - "fmla v1.4s, v19.4s, v0.4s\n" - "prfm pldl1keep, [%[inptr0], x20]\n" - "fmla v4.4s, v19.4s, v12.4s\n" - "prfm pldl1keep, [x24, #64]\n" - "fmla v3.4s, v19.4s, v16.4s\n" - "prfm pldl1keep, [x13, x17]\n" - "fmla v9.4s, v19.4s, v15.4s\n" - "ldr q23, [x25, x26]\n" - "fmla v4.4s, v31.4s, v13.4s\n" - "prfm pldl1keep, [x22, x23]\n" - "fmla v3.4s, v31.4s, v0.4s\n" - "prfm pldl1keep, [x25, x20]\n" - "fmla v2.4s, v31.4s, v15.4s\n" - "ldr q20, [%[inptr0], x9]\n" - "fmla v1.4s, v28.4s, v11.4s\n" - "prfm pldl1keep, [%[inptr0], x15]\n" - "fmla v7.4s, v28.4s, v16.4s\n" - "ldr q28, [x24, %[input_col_stride1]]\n" - "fmla v4.4s, v18.4s, v10.4s\n" - "prfm pldl1keep, [x24, x17]\n" - "fmla v1.4s, v18.4s, v12.4s\n" - "prfm pldl1keep, [x13, x23]\n" - "fmla v3.4s, v18.4s, v11.4s\n" - "prfm pldl1keep, [x22, x20]\n" - "fmla v7.4s, v18.4s, v0.4s\n" - "prfm pldl1keep, [x25, x15]\n" - "fmla v9.4s, v18.4s, v16.4s\n" - "prfm pldl1keep, [x24, x23]\n" - "fmla v6.4s, v18.4s, v15.4s\n" - "ldr q27, [x13, x16]\n" - "fmla v4.4s, v24.4s, v17.4s\n" - "prfm pldl1keep, [x13, x20]\n" - "fmla v1.4s, v24.4s, v13.4s\n" - "prfm pldl1keep, [x22, x15]\n" - "fmla v3.4s, v24.4s, v12.4s\n" - "prfm pldl1keep, [x24, x20]\n" - "fmla v9.4s, v24.4s, v0.4s\n" - "prfm pldl1keep, [x13, x15]\n" - "fmla v2.4s, v24.4s, v16.4s\n" - "prfm pldl1keep, [x24, x15]\n" - "fmla v8.4s, v24.4s, v15.4s\n" - "ldr q24, [x22, x26]\n" - "fmla v3.4s, v29.4s, v13.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v2.4s, v29.4s, v0.4s\n" - "ldr q22, [x25, x9]\n" - "fmla v7.4s, v30.4s, v11.4s\n" - "ldr q21, [x24, x16]\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v9.4s, v25.4s, v11.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v7.4s, v25.4s, v12.4s\n" - "add x25, x25, #16\n" - "fmla v6.4s, v25.4s, v16.4s\n" - "ldr q19, [x13, x26]\n" - "fmla v4.4s, v26.4s, v14.4s\n" - "fmla v1.4s, v26.4s, v17.4s\n" - "fmla v3.4s, v26.4s, v10.4s\n" - "fmla v7.4s, v26.4s, v13.4s\n" - "fmla v9.4s, v26.4s, v12.4s\n" - "fmla v2.4s, v26.4s, v11.4s\n" - "fmla v6.4s, v26.4s, v0.4s\n" - "fmla v8.4s, v26.4s, v16.4s\n" - "fmla v5.4s, v26.4s, v15.4s\n" - "ldr q26, [x22, x9]\n" - "fmla v3.4s, v23.4s, v17.4s\n" - "ldr q18, [x24, x26]\n" - "fmla v9.4s, v23.4s, v13.4s\n" - "add x22, x22, #16\n" - "fmla v2.4s, v23.4s, v12.4s\n" - "fmla v8.4s, v23.4s, v0.4s\n" - "fmla v7.4s, v28.4s, v10.4s\n" - "ldr q23, [x13, x9]\n" - "fmla v6.4s, v28.4s, v11.4s\n" - "ldr q25, [x24, x9]\n" - "fmla v2.4s, v20.4s, v13.4s\n" - "add x13, x13, #16\n" - "fmla v1.4s, v27.4s, v14.4s\n" - "add x24, x24, #16\n" - "fmla v7.4s, v27.4s, v17.4s\n" - "fmla v9.4s, v27.4s, v10.4s\n" - "fmla v6.4s, v27.4s, v12.4s\n" - "fmla v8.4s, v27.4s, v11.4s\n" - "fmla v5.4s, v27.4s, v16.4s\n" - "fmla v3.4s, v24.4s, v14.4s\n" - "fmla v9.4s, v24.4s, v17.4s\n" - "fmla v2.4s, v24.4s, v10.4s\n" - "fmla v6.4s, v24.4s, v13.4s\n" - "fmla v8.4s, v24.4s, v12.4s\n" - "fmla v5.4s, v24.4s, v0.4s\n" - "fmla v7.4s, v21.4s, v14.4s\n" - "fmla v2.4s, v22.4s, v17.4s\n" - "fmla v9.4s, v19.4s, v14.4s\n" - "fmla v8.4s, v22.4s, v13.4s\n" - "fmla v6.4s, v21.4s, v10.4s\n" - "fmla v5.4s, v21.4s, v11.4s\n" - "movi v29.16b, #0\n" - "fmla v2.4s, v26.4s, v14.4s\n" - "fmla v6.4s, v19.4s, v17.4s\n" - "fmla v8.4s, v19.4s, v10.4s\n" - "fmla v5.4s, v19.4s, v12.4s\n" - "fmax v4.4s, v4.4s, v29.4s\n" - "fmax v3.4s, v3.4s, v29.4s\n" - "fmla v6.4s, v18.4s, v14.4s\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "str q4, [%[outptr0]]\n" - "fmla v8.4s, v26.4s, v17.4s\n" - "str q3, [%[outptr0], %[output_col_stride1]]\n" - "fmla v5.4s, v26.4s, v13.4s\n" - "str q2, [%[outptr0], x19]\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "fmla v8.4s, v23.4s, v14.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "str q1, [x21]\n" - "fmla v5.4s, v18.4s, v10.4s\n" - "fmax v9.4s, v9.4s, v29.4s\n" - "fmax v7.4s, v7.4s, v29.4s\n" - "fmax v8.4s, v8.4s, v29.4s\n" - "fmax v6.4s, v6.4s, v29.4s\n" - "str q9, [x21, %[output_col_stride1]]\n" - "fmla v5.4s, v23.4s, v17.4s\n" - "str q8, [x21, x19]\n" - "str q7, [x14]\n" - "str q6, [x14, %[output_col_stride1]]\n" - "add x21, x21, #16\n" - "fmla v5.4s, v25.4s, v14.4s\n" - "fmax v5.4s, v5.4s, v29.4s\n" - "str q5, [x14, x19]\n" - "add x14, x14, #16\n" - "4:\n" - "cbz x27, 7f\n" - "ldr s20, [%[wbptr]]\n" - "mov v4.16b, v20.16b\n" - "ldr s15, [%[wbptr], #4]\n" - "mov v1.16b, v20.16b\n" - "ldr s0, [%[wbptr], #8]\n" - "mov v3.16b, v20.16b\n" - "ldr s13, [%[wbptr], #12]\n" - "mov v7.16b, v20.16b\n" - "ldr s16, [%[wbptr], #16]\n" - "mov v9.16b, v20.16b\n" - "ldr s12, [%[wbptr], #20]\n" - "mov v2.16b, v20.16b\n" - "ldr s17, [%[wbptr], #24]\n" - "mov v6.16b, v20.16b\n" - "ldr s11, [%[wbptr], #28]\n" - "mov v8.16b, v20.16b\n" - "ldr s10, [%[wbptr], #32]\n" - "mov v5.16b, v20.16b\n" - "ldr s14, [%[wbptr], #36]\n" - "ldr s27, [%[inptr0]]\n" - "subs x27, x27, #1\n" - "fmla v4.4s, v27.4s, v15.4s\n" - "ldr s24, [x25]\n" - "ldr s22, [%[inptr0], %[input_col_stride1]]\n" - "ldr s21, [x22]\n" - "ldr s19, [x25, %[input_col_stride1]]\n" - "ldr s31, [%[inptr0], x16]\n" - "fmla v4.4s, v24.4s, v16.4s\n" - "ldr s28, [x13]\n" - "ldr s18, [x22, %[input_col_stride1]]\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "prfm pldl1keep, [x25, #64]\n" - "prfm pldl1keep, [%[inptr0], x17]\n" - "prfm pldl1keep, [x22, #64]\n" - "prfm pldl1keep, [x25, x17]\n" - "prfm pldl1keep, [%[inptr0], x23]\n" - "prfm pldl1keep, [x13, #64]\n" - "prfm pldl1keep, [x22, x17]\n" - "beq 6f\n" - "5:\n" - "fmla v1.4s, v24.4s, v15.4s\n" - "ldr s24, [x25, x16]\n" - "fmla v4.4s, v22.4s, v0.4s\n" - "ldr s29, [%[inptr0], x26]\n" - "fmla v3.4s, v22.4s, v15.4s\n" - "ldr s30, [x24]\n" - "fmla v1.4s, v21.4s, v16.4s\n" - "ldr s25, [x13, %[input_col_stride1]]\n" - "fmla v4.4s, v21.4s, v11.4s\n" - "prfm pldl1keep, [x25, x23]\n" - "fmla v7.4s, v21.4s, v15.4s\n" - "ldr s26, [x22, x16]\n" - "fmla v1.4s, v19.4s, v0.4s\n" - "prfm pldl1keep, [%[inptr0], x20]\n" - "fmla v4.4s, v19.4s, v12.4s\n" - "prfm pldl1keep, [x24, #64]\n" - "fmla v3.4s, v19.4s, v16.4s\n" - "prfm pldl1keep, [x13, x17]\n" - "fmla v9.4s, v19.4s, v15.4s\n" - "ldr s23, [x25, x26]\n" - "fmla v4.4s, v31.4s, v13.4s\n" - "prfm pldl1keep, [x22, x23]\n" - "fmla v3.4s, v31.4s, v0.4s\n" - "prfm pldl1keep, [x25, x20]\n" - "fmla v2.4s, v31.4s, v15.4s\n" - "ldr s20, [%[inptr0], x9]\n" - "fmla v1.4s, v28.4s, v11.4s\n" - "prfm pldl1keep, [%[inptr0], x15]\n" - "fmla v7.4s, v28.4s, v16.4s\n" - "ldr s28, [x24, %[input_col_stride1]]\n" - "fmla v4.4s, v18.4s, v10.4s\n" - "prfm pldl1keep, [x24, x17]\n" - "fmla v1.4s, v18.4s, v12.4s\n" - "prfm pldl1keep, [x13, x23]\n" - "fmla v3.4s, v18.4s, v11.4s\n" - "prfm pldl1keep, [x22, x20]\n" - "fmla v7.4s, v18.4s, v0.4s\n" - "prfm pldl1keep, [x25, x15]\n" - "fmla v9.4s, v18.4s, v16.4s\n" - "prfm pldl1keep, [x24, x23]\n" - "fmla v6.4s, v18.4s, v15.4s\n" - "ldr s27, [x13, x16]\n" - "fmla v4.4s, v24.4s, v17.4s\n" - "prfm pldl1keep, [x13, x20]\n" - "fmla v1.4s, v24.4s, v13.4s\n" - "prfm pldl1keep, [x22, x15]\n" - "fmla v3.4s, v24.4s, v12.4s\n" - "prfm pldl1keep, [x24, x20]\n" - "fmla v9.4s, v24.4s, v0.4s\n" - "prfm pldl1keep, [x13, x15]\n" - "fmla v2.4s, v24.4s, v16.4s\n" - "prfm pldl1keep, [x24, x15]\n" - "fmla v8.4s, v24.4s, v15.4s\n" - "ldr s24, [x22, x26]\n" - "fmla v3.4s, v29.4s, v13.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v2.4s, v29.4s, v0.4s\n" - "ldr s22, [x25, x9]\n" - "fmla v7.4s, v30.4s, v11.4s\n" - "ldr s21, [x24, x16]\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v9.4s, v25.4s, v11.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v7.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "fmla v6.4s, v25.4s, v16.4s\n" - "ldr s19, [x13, x26]\n" - "fmla v4.4s, v26.4s, v14.4s\n" - "prfm pldl1keep, [%[inptr0], x17]\n" - "fmla v1.4s, v26.4s, v17.4s\n" - "prfm pldl1keep, [%[inptr0], x23]\n" - "fmla v3.4s, v26.4s, v10.4s\n" - "add x25, x25, #4\n" - "fmla v7.4s, v26.4s, v13.4s\n" - "prfm pldl1keep, [x25, #64]\n" - "fmla v9.4s, v26.4s, v12.4s\n" - "prfm pldl1keep, [x25, x17]\n" - "fmla v2.4s, v26.4s, v11.4s\n" - "subs x27, x27, #1\n" - "fmla v6.4s, v26.4s, v0.4s\n" - "fmla v8.4s, v26.4s, v16.4s\n" - "fmla v5.4s, v26.4s, v15.4s\n" - "ldr s26, [x22, x9]\n" - "fmla v3.4s, v23.4s, v17.4s\n" - "ldr s18, [x24, x26]\n" - "fmla v9.4s, v23.4s, v13.4s\n" - "add x22, x22, #4\n" - "fmla v2.4s, v23.4s, v12.4s\n" - "prfm pldl1keep, [x22, #64]\n" - "fmla v8.4s, v23.4s, v0.4s\n" - "ldr s23, [x13, x9]\n" - "fmla v7.4s, v28.4s, v10.4s\n" - "prfm pldl1keep, [x22, x17]\n" - "fmla v2.4s, v20.4s, v13.4s\n" - "ldr s25, [x24, x9]\n" - "fmla v6.4s, v28.4s, v11.4s\n" - "ldr s20, [%[wbptr]]\n" - "fmla v1.4s, v27.4s, v14.4s\n" - "add x13, x13, #4\n" - "fmla v7.4s, v27.4s, v17.4s\n" - "prfm pldl1keep, [x13, #64]\n" - "fmla v9.4s, v27.4s, v10.4s\n" - "add x24, x24, #4\n" - "fmla v6.4s, v27.4s, v12.4s\n" - "fmla v8.4s, v27.4s, v11.4s\n" - "fmla v5.4s, v27.4s, v16.4s\n" - "ldr s15, [%[wbptr], #4]\n" - "fmla v3.4s, v24.4s, v14.4s\n" - "ldr s27, [%[inptr0]]\n" - "fmla v9.4s, v24.4s, v17.4s\n" - "fmla v2.4s, v24.4s, v10.4s\n" - "fmla v6.4s, v24.4s, v13.4s\n" - "fmla v8.4s, v24.4s, v12.4s\n" - "fmla v5.4s, v24.4s, v0.4s\n" - "ldr s16, [%[wbptr], #16]\n" - "fmla v2.4s, v22.4s, v17.4s\n" - "ldr s24, [x25]\n" - "fmla v8.4s, v22.4s, v13.4s\n" - "ldr s22, [%[inptr0], %[input_col_stride1]]\n" - "fmla v7.4s, v21.4s, v14.4s\n" - "fmla v6.4s, v21.4s, v10.4s\n" - "fmla v5.4s, v21.4s, v11.4s\n" - "ldr s0, [%[wbptr], #8]\n" - "fmla v9.4s, v19.4s, v14.4s\n" - "ldr s21, [x22]\n" - "fmla v6.4s, v19.4s, v17.4s\n" - "fmla v8.4s, v19.4s, v10.4s\n" - "fmla v5.4s, v19.4s, v12.4s\n" - "ldr s11, [%[wbptr], #28]\n" - "fmla v2.4s, v26.4s, v14.4s\n" - "movi v29.16b, #0\n" - "fmla v8.4s, v26.4s, v17.4s\n" - "fmla v6.4s, v18.4s, v14.4s\n" - "fmla v5.4s, v26.4s, v13.4s\n" - "ldr s12, [%[wbptr], #20]\n" - "fmax v4.4s, v4.4s, v29.4s\n" - "ldr s19, [x25, %[input_col_stride1]]\n" - "fmla v8.4s, v23.4s, v14.4s\n" - "fmax v3.4s, v3.4s, v29.4s\n" - "str s4, [%[outptr0]]\n" - "fmla v5.4s, v18.4s, v10.4s\n" - "str s3, [%[outptr0], %[output_col_stride1]]\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "ldr s13, [%[wbptr], #12]\n" - "str s2, [%[outptr0], x19]\n" - "fmla v5.4s, v23.4s, v17.4s\n" - "str s1, [x21]\n" - "fmax v9.4s, v9.4s, v29.4s\n" - "fmax v8.4s, v8.4s, v29.4s\n" - "ldr s10, [%[wbptr], #32]\n" - "str s9, [x21, %[output_col_stride1]]\n" - "fmla v5.4s, v25.4s, v14.4s\n" - "str s8, [x21, x19]\n" - "fmax v7.4s, v7.4s, v29.4s\n" - "fmax v6.4s, v6.4s, v29.4s\n" - "ldr s17, [%[wbptr], #24]\n" - "str s7, [x14]\n" - "fmax v5.4s, v5.4s, v29.4s\n" - "str s6, [x14, %[output_col_stride1]]\n" - "mov v4.16b, v20.16b\n" - "str s5, [x14, x19]\n" - "mov v1.16b, v20.16b\n" - "mov v3.16b, v20.16b\n" - "ldr s14, [%[wbptr], #36]\n" - "mov v7.16b, v20.16b\n" - "ldr s31, [%[inptr0], x16]\n" - "mov v9.16b, v20.16b\n" - "ldr s28, [x13]\n" - "mov v2.16b, v20.16b\n" - "ldr s18, [x22, %[input_col_stride1]]\n" - "mov v6.16b, v20.16b\n" - "add %[outptr0], %[outptr0], #4\n" - "mov v8.16b, v20.16b\n" - "add x21, x21, #4\n" - "mov v5.16b, v20.16b\n" - "add x14, x14, #4\n" - "fmla v4.4s, v27.4s, v15.4s\n" - "fmla v4.4s, v24.4s, v16.4s\n" - "bne 5b\n" - "6:\n" - "fmla v1.4s, v24.4s, v15.4s\n" - "ldr s24, [x25, x16]\n" - "fmla v4.4s, v22.4s, v0.4s\n" - "ldr s29, [%[inptr0], x26]\n" - "fmla v3.4s, v22.4s, v15.4s\n" - "ldr s30, [x24]\n" - "fmla v1.4s, v21.4s, v16.4s\n" - "ldr s25, [x13, %[input_col_stride1]]\n" - "fmla v4.4s, v21.4s, v11.4s\n" - "prfm pldl1keep, [x25, x23]\n" - "fmla v7.4s, v21.4s, v15.4s\n" - "ldr s26, [x22, x16]\n" - "fmla v1.4s, v19.4s, v0.4s\n" - "prfm pldl1keep, [%[inptr0], x20]\n" - "fmla v4.4s, v19.4s, v12.4s\n" - "prfm pldl1keep, [x24, #64]\n" - "fmla v3.4s, v19.4s, v16.4s\n" - "prfm pldl1keep, [x13, x17]\n" - "fmla v9.4s, v19.4s, v15.4s\n" - "ldr s23, [x25, x26]\n" - "fmla v4.4s, v31.4s, v13.4s\n" - "prfm pldl1keep, [x22, x23]\n" - "fmla v3.4s, v31.4s, v0.4s\n" - "prfm pldl1keep, [x25, x20]\n" - "fmla v2.4s, v31.4s, v15.4s\n" - "ldr s20, [%[inptr0], x9]\n" - "fmla v1.4s, v28.4s, v11.4s\n" - "prfm pldl1keep, [%[inptr0], x15]\n" - "fmla v7.4s, v28.4s, v16.4s\n" - "ldr s28, [x24, %[input_col_stride1]]\n" - "fmla v4.4s, v18.4s, v10.4s\n" - "prfm pldl1keep, [x24, x17]\n" - "fmla v1.4s, v18.4s, v12.4s\n" - "prfm pldl1keep, [x13, x23]\n" - "fmla v3.4s, v18.4s, v11.4s\n" - "prfm pldl1keep, [x22, x20]\n" - "fmla v7.4s, v18.4s, v0.4s\n" - "prfm pldl1keep, [x25, x15]\n" - "fmla v9.4s, v18.4s, v16.4s\n" - "prfm pldl1keep, [x24, x23]\n" - "fmla v6.4s, v18.4s, v15.4s\n" - "ldr s27, [x13, x16]\n" - "fmla v4.4s, v24.4s, v17.4s\n" - "prfm pldl1keep, [x13, x20]\n" - "fmla v1.4s, v24.4s, v13.4s\n" - "prfm pldl1keep, [x22, x15]\n" - "fmla v3.4s, v24.4s, v12.4s\n" - "prfm pldl1keep, [x24, x20]\n" - "fmla v9.4s, v24.4s, v0.4s\n" - "prfm pldl1keep, [x13, x15]\n" - "fmla v2.4s, v24.4s, v16.4s\n" - "prfm pldl1keep, [x24, x15]\n" - "fmla v8.4s, v24.4s, v15.4s\n" - "ldr s24, [x22, x26]\n" - "fmla v3.4s, v29.4s, v13.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v2.4s, v29.4s, v0.4s\n" - "ldr s22, [x25, x9]\n" - "fmla v7.4s, v30.4s, v11.4s\n" - "ldr s21, [x24, x16]\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v9.4s, v25.4s, v11.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v7.4s, v25.4s, v12.4s\n" - "add x25, x25, #4\n" - "fmla v6.4s, v25.4s, v16.4s\n" - "ldr s19, [x13, x26]\n" - "fmla v4.4s, v26.4s, v14.4s\n" - "fmla v1.4s, v26.4s, v17.4s\n" - "fmla v3.4s, v26.4s, v10.4s\n" - "fmla v7.4s, v26.4s, v13.4s\n" - "fmla v9.4s, v26.4s, v12.4s\n" - "fmla v2.4s, v26.4s, v11.4s\n" - "fmla v6.4s, v26.4s, v0.4s\n" - "fmla v8.4s, v26.4s, v16.4s\n" - "fmla v5.4s, v26.4s, v15.4s\n" - "ldr s26, [x22, x9]\n" - "fmla v3.4s, v23.4s, v17.4s\n" - "ldr s18, [x24, x26]\n" - "fmla v9.4s, v23.4s, v13.4s\n" - "add x22, x22, #4\n" - "fmla v2.4s, v23.4s, v12.4s\n" - "fmla v8.4s, v23.4s, v0.4s\n" - "fmla v7.4s, v28.4s, v10.4s\n" - "ldr s23, [x13, x9]\n" - "fmla v6.4s, v28.4s, v11.4s\n" - "ldr s25, [x24, x9]\n" - "fmla v2.4s, v20.4s, v13.4s\n" - "add x13, x13, #4\n" - "fmla v1.4s, v27.4s, v14.4s\n" - "add x24, x24, #4\n" - "fmla v7.4s, v27.4s, v17.4s\n" - "fmla v9.4s, v27.4s, v10.4s\n" - "fmla v6.4s, v27.4s, v12.4s\n" - "fmla v8.4s, v27.4s, v11.4s\n" - "fmla v5.4s, v27.4s, v16.4s\n" - "fmla v3.4s, v24.4s, v14.4s\n" - "fmla v9.4s, v24.4s, v17.4s\n" - "fmla v2.4s, v24.4s, v10.4s\n" - "fmla v6.4s, v24.4s, v13.4s\n" - "fmla v8.4s, v24.4s, v12.4s\n" - "fmla v5.4s, v24.4s, v0.4s\n" - "fmla v7.4s, v21.4s, v14.4s\n" - "fmla v2.4s, v22.4s, v17.4s\n" - "fmla v9.4s, v19.4s, v14.4s\n" - "fmla v8.4s, v22.4s, v13.4s\n" - "fmla v6.4s, v21.4s, v10.4s\n" - "fmla v5.4s, v21.4s, v11.4s\n" - "movi v29.16b, #0\n" - "fmla v2.4s, v26.4s, v14.4s\n" - "fmla v6.4s, v19.4s, v17.4s\n" - "fmla v8.4s, v19.4s, v10.4s\n" - "fmla v5.4s, v19.4s, v12.4s\n" - "fmax v4.4s, v4.4s, v29.4s\n" - "fmax v3.4s, v3.4s, v29.4s\n" - "fmla v6.4s, v18.4s, v14.4s\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "str s4, [%[outptr0]]\n" - "fmla v8.4s, v26.4s, v17.4s\n" - "str s3, [%[outptr0], %[output_col_stride1]]\n" - "fmla v5.4s, v26.4s, v13.4s\n" - "str s2, [%[outptr0], x19]\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "fmla v8.4s, v23.4s, v14.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "str s1, [x21]\n" - "fmla v5.4s, v18.4s, v10.4s\n" - "fmax v9.4s, v9.4s, v29.4s\n" - "fmax v7.4s, v7.4s, v29.4s\n" - "fmax v8.4s, v8.4s, v29.4s\n" - "fmax v6.4s, v6.4s, v29.4s\n" - "str s9, [x21, %[output_col_stride1]]\n" - "fmla v5.4s, v23.4s, v17.4s\n" - "str s8, [x21, x19]\n" - "str s7, [x14]\n" - "str s6, [x14, %[output_col_stride1]]\n" - "add x21, x21, #4\n" - "fmla v5.4s, v25.4s, v14.4s\n" - "fmax v5.4s, v5.4s, v29.4s\n" - "str s5, [x14, x19]\n" - "add x14, x14, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input) - : [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile<ActivationFunction::ReLU6>( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x17, %[inptr0], %[input_row_stride]\n" - "add x9, %[input_col_stride1], %[input_col_stride1]\n" - "add x25, %[outptr0], %[output_row_stride]\n" - "add x14, x17, %[input_row_stride]\n" - "add x22, x9, #64\n" - "add x15, x9, %[input_col_stride1]\n" - "add x21, x14, %[input_row_stride]\n" - "add x16, x15, #64\n" - "add x24, x15, %[input_col_stride1]\n" - "add x26, x21, %[input_row_stride]\n" - "add x23, x24, #64\n" - "add x13, x25, %[output_row_stride]\n" - "add x27, %[output_col_stride1], %[output_col_stride1]\n" - "and x19, %[n_channels], #3\n" - "lsr x20, %[n_channels], #2\n" - "cbz x20, 4f\n" - "1:\n" - "ldr q19, [%[wbptr]]\n" - "subs x20, x20, #1\n" - "mov v8.16b, v19.16b\n" - "ldr q17, [%[wbptr], #16]\n" - "mov v5.16b, v19.16b\n" - "ldr q16, [%[wbptr], #32]\n" - "mov v7.16b, v19.16b\n" - "ldr q15, [%[wbptr], #48]\n" - "mov v2.16b, v19.16b\n" - "ldr q14, [%[wbptr], #64]\n" - "mov v4.16b, v19.16b\n" - "ldr q13, [%[wbptr], #80]\n" - "mov v6.16b, v19.16b\n" - "ldr q12, [%[wbptr], #96]\n" - "mov v1.16b, v19.16b\n" - "ldr q11, [%[wbptr], #112]\n" - "mov v3.16b, v19.16b\n" - "ldr q10, [%[wbptr], #128]\n" - "mov v0.16b, v19.16b\n" - "ldr q9, [%[wbptr], #144]\n" - "ldr q25, [%[inptr0]]\n" - "ldr q27, [x17]\n" - "fmla v8.4s, v25.4s, v17.4s\n" - "ldr q26, [%[inptr0], %[input_col_stride1]]\n" - "ldr q20, [x14]\n" - "ldr q22, [x17, %[input_col_stride1]]\n" - "ldr q28, [%[inptr0], x9]\n" - "ldr q23, [x21]\n" - "fmla v8.4s, v27.4s, v14.4s\n" - "ldr q18, [x14, %[input_col_stride1]]\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "prfm pldl1keep, [x17, #64]\n" - "prfm pldl1keep, [%[inptr0], x28]\n" - "prfm pldl1keep, [x14, #64]\n" - "prfm pldl1keep, [x17, x28]\n" - "prfm pldl1keep, [%[inptr0], x22]\n" - "prfm pldl1keep, [x21, #64]\n" - "prfm pldl1keep, [x14, x28]\n" - "beq 3f\n" - "2:\n" - "fmla v5.4s, v27.4s, v17.4s\n" - "ldr q27, [x17, x9]\n" - "fmla v8.4s, v26.4s, v16.4s\n" - "ldr q30, [%[inptr0], x15]\n" - "fmla v7.4s, v26.4s, v17.4s\n" - "ldr q31, [x26]\n" - "fmla v5.4s, v20.4s, v14.4s\n" - "ldr q24, [x21, %[input_col_stride1]]\n" - "fmla v8.4s, v20.4s, v11.4s\n" - "prfm pldl1keep, [x17, x22]\n" - "fmla v2.4s, v20.4s, v17.4s\n" - "ldr q29, [x14, x9]\n" - "fmla v5.4s, v22.4s, v16.4s\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "fmla v8.4s, v22.4s, v13.4s\n" - "prfm pldl1keep, [x26, #64]\n" - "fmla v7.4s, v22.4s, v14.4s\n" - "prfm pldl1keep, [x21, x28]\n" - "fmla v4.4s, v22.4s, v17.4s\n" - "ldr q21, [x17, x15]\n" - "fmla v8.4s, v28.4s, v15.4s\n" - "prfm pldl1keep, [x14, x22]\n" - "fmla v7.4s, v28.4s, v16.4s\n" - "prfm pldl1keep, [x17, x16]\n" - "fmla v6.4s, v28.4s, v17.4s\n" - "ldr q19, [%[inptr0], x24]\n" - "fmla v5.4s, v23.4s, v11.4s\n" - "prfm pldl1keep, [%[inptr0], x23]\n" - "fmla v2.4s, v23.4s, v14.4s\n" - "ldr q28, [x26, %[input_col_stride1]]\n" - "fmla v8.4s, v18.4s, v10.4s\n" - "prfm pldl1keep, [x26, x28]\n" - "fmla v5.4s, v18.4s, v13.4s\n" - "prfm pldl1keep, [x21, x22]\n" - "fmla v7.4s, v18.4s, v11.4s\n" - "prfm pldl1keep, [x14, x16]\n" - "fmla v2.4s, v18.4s, v16.4s\n" - "prfm pldl1keep, [x17, x23]\n" - "fmla v4.4s, v18.4s, v14.4s\n" - "prfm pldl1keep, [x26, x22]\n" - "fmla v1.4s, v18.4s, v17.4s\n" - "ldr q25, [x21, x9]\n" - "fmla v8.4s, v27.4s, v12.4s\n" - "prfm pldl1keep, [x21, x16]\n" - "fmla v5.4s, v27.4s, v15.4s\n" - "prfm pldl1keep, [x14, x23]\n" - "fmla v7.4s, v27.4s, v13.4s\n" - "prfm pldl1keep, [x26, x16]\n" - "fmla v4.4s, v27.4s, v16.4s\n" - "prfm pldl1keep, [x21, x23]\n" - "fmla v6.4s, v27.4s, v14.4s\n" - "prfm pldl1keep, [x26, x23]\n" - "fmla v3.4s, v27.4s, v17.4s\n" - "ldr q27, [x14, x15]\n" - "fmla v7.4s, v30.4s, v15.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v6.4s, v30.4s, v16.4s\n" - "ldr q26, [x17, x24]\n" - "fmla v2.4s, v31.4s, v11.4s\n" - "ldr q20, [x26, x9]\n" - "fmla v5.4s, v24.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v4.4s, v24.4s, v11.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v2.4s, v24.4s, v13.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "fmla v1.4s, v24.4s, v14.4s\n" - "ldr q18, [x21, x15]\n" - "fmla v8.4s, v29.4s, v9.4s\n" - "prfm pldl1keep, [%[inptr0], x28]\n" - "fmla v5.4s, v29.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], x22]\n" - "fmla v7.4s, v29.4s, v10.4s\n" - "add x17, x17, #16\n" - "fmla v2.4s, v29.4s, v15.4s\n" - "prfm pldl1keep, [x17, #64]\n" - "fmla v4.4s, v29.4s, v13.4s\n" - "prfm pldl1keep, [x17, x28]\n" - "fmla v6.4s, v29.4s, v11.4s\n" - "subs x20, x20, #1\n" - "fmla v1.4s, v29.4s, v16.4s\n" - "fmla v3.4s, v29.4s, v14.4s\n" - "fmla v0.4s, v29.4s, v17.4s\n" - "ldr q22, [x14, x24]\n" - "fmla v7.4s, v21.4s, v12.4s\n" - "ldr q23, [x26, x15]\n" - "fmla v4.4s, v21.4s, v15.4s\n" - "add x14, x14, #16\n" - "fmla v6.4s, v21.4s, v13.4s\n" - "prfm pldl1keep, [x14, #64]\n" - "fmla v3.4s, v21.4s, v16.4s\n" - "ldr q24, [x21, x24]\n" - "fmla v2.4s, v28.4s, v10.4s\n" - "prfm pldl1keep, [x14, x28]\n" - "fmla v6.4s, v19.4s, v15.4s\n" - "ldr q21, [x26, x24]\n" - "fmla v1.4s, v28.4s, v11.4s\n" - "ldr q19, [%[wbptr]]\n" - "fmla v5.4s, v25.4s, v9.4s\n" - "add x21, x21, #16\n" - "fmla v2.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [x21, #64]\n" - "fmla v4.4s, v25.4s, v10.4s\n" - "add x26, x26, #16\n" - "fmla v1.4s, v25.4s, v13.4s\n" - "fmla v3.4s, v25.4s, v11.4s\n" - "fmla v0.4s, v25.4s, v14.4s\n" - "ldr q17, [%[wbptr], #16]\n" - "fmla v7.4s, v27.4s, v9.4s\n" - "ldr q25, [%[inptr0]]\n" - "fmla v4.4s, v27.4s, v12.4s\n" - "fmla v6.4s, v27.4s, v10.4s\n" - "fmla v1.4s, v27.4s, v15.4s\n" - "fmla v3.4s, v27.4s, v13.4s\n" - "fmla v0.4s, v27.4s, v16.4s\n" - "ldr q14, [%[wbptr], #64]\n" - "fmla v6.4s, v26.4s, v12.4s\n" - "ldr q27, [x17]\n" - "fmla v3.4s, v26.4s, v15.4s\n" - "ldr q26, [%[inptr0], %[input_col_stride1]]\n" - "fmla v2.4s, v20.4s, v9.4s\n" - "fmla v1.4s, v20.4s, v10.4s\n" - "fmla v0.4s, v20.4s, v11.4s\n" - "ldr q16, [%[wbptr], #32]\n" - "fmla v4.4s, v18.4s, v9.4s\n" - "ldr q20, [x14]\n" - "fmla v1.4s, v18.4s, v12.4s\n" - "fmla v3.4s, v18.4s, v10.4s\n" - "fmla v0.4s, v18.4s, v13.4s\n" - "ldr q11, [%[wbptr], #112]\n" - "fmla v6.4s, v22.4s, v9.4s\n" - "movi v30.16b, #0\n" - "fmla v3.4s, v22.4s, v12.4s\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "fmla v0.4s, v22.4s, v15.4s\n" - "ldr q13, [%[wbptr], #80]\n" - "fmov v29.4s, #6.0\n" - "fmax v8.4s, v8.4s, v30.4s\n" - "fmla v3.4s, v24.4s, v9.4s\n" - "fmax v7.4s, v7.4s, v30.4s\n" - "fmla v0.4s, v23.4s, v10.4s\n" - "ldr q15, [%[wbptr], #48]\n" - "fmin v8.4s, v8.4s, v29.4s\n" - "ldr q22, [x17, %[input_col_stride1]]\n" - "fmin v7.4s, v7.4s, v29.4s\n" - "fmax v6.4s, v6.4s, v30.4s\n" - "str q8, [%[outptr0]]\n" - "fmla v0.4s, v24.4s, v12.4s\n" - "str q7, [%[outptr0], %[output_col_stride1]]\n" - "fmin v6.4s, v6.4s, v29.4s\n" - "fmax v5.4s, v5.4s, v30.4s\n" - "ldr q10, [%[wbptr], #128]\n" - "str q6, [%[outptr0], x27]\n" - "fmla v0.4s, v21.4s, v9.4s\n" - "fmin v5.4s, v5.4s, v29.4s\n" - "ldr q12, [%[wbptr], #96]\n" - "fmax v4.4s, v4.4s, v30.4s\n" - "ldr q28, [%[inptr0], x9]\n" - "str q5, [x25]\n" - "fmax v3.4s, v3.4s, v30.4s\n" - "fmin v4.4s, v4.4s, v29.4s\n" - "ldr q9, [%[wbptr], #144]\n" - "fmin v3.4s, v3.4s, v29.4s\n" - "ldr q23, [x21]\n" - "str q4, [x25, %[output_col_stride1]]\n" - "fmax v2.4s, v2.4s, v30.4s\n" - "str q3, [x25, x27]\n" - "fmax v1.4s, v1.4s, v30.4s\n" - "fmin v2.4s, v2.4s, v29.4s\n" - "ldr q18, [x14, %[input_col_stride1]]\n" - "fmin v1.4s, v1.4s, v29.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "str q2, [x13]\n" - "fmax v0.4s, v0.4s, v30.4s\n" - "str q1, [x13, %[output_col_stride1]]\n" - "mov v8.16b, v19.16b\n" - "fmin v0.4s, v0.4s, v29.4s\n" - "add x25, x25, #16\n" - "mov v5.16b, v19.16b\n" - "mov v7.16b, v19.16b\n" - "str q0, [x13, x27]\n" - "mov v2.16b, v19.16b\n" - "mov v4.16b, v19.16b\n" - "add x13, x13, #16\n" - "mov v6.16b, v19.16b\n" - "mov v1.16b, v19.16b\n" - "mov v3.16b, v19.16b\n" - "mov v0.16b, v19.16b\n" - "fmla v8.4s, v25.4s, v17.4s\n" - "fmla v8.4s, v27.4s, v14.4s\n" - "bne 2b\n" - "3:\n" - "fmla v5.4s, v27.4s, v17.4s\n" - "ldr q27, [x17, x9]\n" - "fmla v8.4s, v26.4s, v16.4s\n" - "ldr q30, [%[inptr0], x15]\n" - "fmla v7.4s, v26.4s, v17.4s\n" - "ldr q31, [x26]\n" - "fmla v5.4s, v20.4s, v14.4s\n" - "ldr q24, [x21, %[input_col_stride1]]\n" - "fmla v8.4s, v20.4s, v11.4s\n" - "prfm pldl1keep, [x17, x22]\n" - "fmla v2.4s, v20.4s, v17.4s\n" - "ldr q29, [x14, x9]\n" - "fmla v5.4s, v22.4s, v16.4s\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "fmla v8.4s, v22.4s, v13.4s\n" - "prfm pldl1keep, [x26, #64]\n" - "fmla v7.4s, v22.4s, v14.4s\n" - "prfm pldl1keep, [x21, x28]\n" - "fmla v4.4s, v22.4s, v17.4s\n" - "ldr q21, [x17, x15]\n" - "fmla v8.4s, v28.4s, v15.4s\n" - "prfm pldl1keep, [x14, x22]\n" - "fmla v7.4s, v28.4s, v16.4s\n" - "prfm pldl1keep, [x17, x16]\n" - "fmla v6.4s, v28.4s, v17.4s\n" - "ldr q19, [%[inptr0], x24]\n" - "fmla v5.4s, v23.4s, v11.4s\n" - "prfm pldl1keep, [%[inptr0], x23]\n" - "fmla v2.4s, v23.4s, v14.4s\n" - "ldr q28, [x26, %[input_col_stride1]]\n" - "fmla v8.4s, v18.4s, v10.4s\n" - "prfm pldl1keep, [x26, x28]\n" - "fmla v5.4s, v18.4s, v13.4s\n" - "prfm pldl1keep, [x21, x22]\n" - "fmla v7.4s, v18.4s, v11.4s\n" - "prfm pldl1keep, [x14, x16]\n" - "fmla v2.4s, v18.4s, v16.4s\n" - "prfm pldl1keep, [x17, x23]\n" - "fmla v4.4s, v18.4s, v14.4s\n" - "prfm pldl1keep, [x26, x22]\n" - "fmla v1.4s, v18.4s, v17.4s\n" - "ldr q25, [x21, x9]\n" - "fmla v8.4s, v27.4s, v12.4s\n" - "prfm pldl1keep, [x21, x16]\n" - "fmla v5.4s, v27.4s, v15.4s\n" - "prfm pldl1keep, [x14, x23]\n" - "fmla v7.4s, v27.4s, v13.4s\n" - "prfm pldl1keep, [x26, x16]\n" - "fmla v4.4s, v27.4s, v16.4s\n" - "prfm pldl1keep, [x21, x23]\n" - "fmla v6.4s, v27.4s, v14.4s\n" - "prfm pldl1keep, [x26, x23]\n" - "fmla v3.4s, v27.4s, v17.4s\n" - "ldr q27, [x14, x15]\n" - "fmla v7.4s, v30.4s, v15.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v6.4s, v30.4s, v16.4s\n" - "ldr q26, [x17, x24]\n" - "fmla v2.4s, v31.4s, v11.4s\n" - "ldr q20, [x26, x9]\n" - "fmla v5.4s, v24.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v4.4s, v24.4s, v11.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v2.4s, v24.4s, v13.4s\n" - "add x17, x17, #16\n" - "fmla v1.4s, v24.4s, v14.4s\n" - "ldr q18, [x21, x15]\n" - "fmla v8.4s, v29.4s, v9.4s\n" - "fmla v5.4s, v29.4s, v12.4s\n" - "fmla v7.4s, v29.4s, v10.4s\n" - "fmla v2.4s, v29.4s, v15.4s\n" - "fmla v4.4s, v29.4s, v13.4s\n" - "fmla v6.4s, v29.4s, v11.4s\n" - "fmla v1.4s, v29.4s, v16.4s\n" - "fmla v3.4s, v29.4s, v14.4s\n" - "fmla v0.4s, v29.4s, v17.4s\n" - "ldr q22, [x14, x24]\n" - "fmla v7.4s, v21.4s, v12.4s\n" - "ldr q23, [x26, x15]\n" - "fmla v4.4s, v21.4s, v15.4s\n" - "add x14, x14, #16\n" - "fmla v6.4s, v21.4s, v13.4s\n" - "fmla v3.4s, v21.4s, v16.4s\n" - "fmla v2.4s, v28.4s, v10.4s\n" - "ldr q24, [x21, x24]\n" - "fmla v1.4s, v28.4s, v11.4s\n" - "ldr q21, [x26, x24]\n" - "fmla v6.4s, v19.4s, v15.4s\n" - "add x21, x21, #16\n" - "fmla v5.4s, v25.4s, v9.4s\n" - "add x26, x26, #16\n" - "fmla v2.4s, v25.4s, v12.4s\n" - "fmla v4.4s, v25.4s, v10.4s\n" - "fmla v1.4s, v25.4s, v13.4s\n" - "fmla v3.4s, v25.4s, v11.4s\n" - "fmla v0.4s, v25.4s, v14.4s\n" - "fmla v7.4s, v27.4s, v9.4s\n" - "fmla v4.4s, v27.4s, v12.4s\n" - "fmla v6.4s, v27.4s, v10.4s\n" - "fmla v1.4s, v27.4s, v15.4s\n" - "fmla v3.4s, v27.4s, v13.4s\n" - "fmla v0.4s, v27.4s, v16.4s\n" - "fmla v2.4s, v20.4s, v9.4s\n" - "fmla v6.4s, v26.4s, v12.4s\n" - "fmla v4.4s, v18.4s, v9.4s\n" - "fmla v3.4s, v26.4s, v15.4s\n" - "fmla v1.4s, v20.4s, v10.4s\n" - "fmla v0.4s, v20.4s, v11.4s\n" - "movi v30.16b, #0\n" - "fmla v6.4s, v22.4s, v9.4s\n" - "fmov v29.4s, #6.0\n" - "fmla v1.4s, v18.4s, v12.4s\n" - "fmla v3.4s, v18.4s, v10.4s\n" - "fmla v0.4s, v18.4s, v13.4s\n" - "fmax v8.4s, v8.4s, v30.4s\n" - "fmax v7.4s, v7.4s, v30.4s\n" - "fmax v6.4s, v6.4s, v30.4s\n" - "fmla v3.4s, v22.4s, v12.4s\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "fmla v0.4s, v22.4s, v15.4s\n" - "fmin v8.4s, v8.4s, v29.4s\n" - "fmin v7.4s, v7.4s, v29.4s\n" - "fmin v6.4s, v6.4s, v29.4s\n" - "str q8, [%[outptr0]]\n" - "fmla v3.4s, v24.4s, v9.4s\n" - "str q7, [%[outptr0], %[output_col_stride1]]\n" - "fmla v0.4s, v23.4s, v10.4s\n" - "str q6, [%[outptr0], x27]\n" - "fmax v5.4s, v5.4s, v30.4s\n" - "fmax v4.4s, v4.4s, v30.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v0.4s, v24.4s, v12.4s\n" - "fmin v5.4s, v5.4s, v29.4s\n" - "fmin v4.4s, v4.4s, v29.4s\n" - "fmax v3.4s, v3.4s, v30.4s\n" - "str q5, [x25]\n" - "fmax v2.4s, v2.4s, v30.4s\n" - "str q4, [x25, %[output_col_stride1]]\n" - "fmla v0.4s, v21.4s, v9.4s\n" - "fmin v3.4s, v3.4s, v29.4s\n" - "fmin v2.4s, v2.4s, v29.4s\n" - "fmax v1.4s, v1.4s, v30.4s\n" - "str q3, [x25, x27]\n" - "str q2, [x13]\n" - "fmin v1.4s, v1.4s, v29.4s\n" - "fmax v0.4s, v0.4s, v30.4s\n" - "add x25, x25, #16\n" - "str q1, [x13, %[output_col_stride1]]\n" - "fmin v0.4s, v0.4s, v29.4s\n" - "str q0, [x13, x27]\n" - "add x13, x13, #16\n" - "4:\n" - "cbz x19, 7f\n" - "ldr s19, [%[wbptr]]\n" - "mov v8.16b, v19.16b\n" - "ldr s17, [%[wbptr], #4]\n" - "mov v5.16b, v19.16b\n" - "ldr s16, [%[wbptr], #8]\n" - "mov v7.16b, v19.16b\n" - "ldr s15, [%[wbptr], #12]\n" - "mov v2.16b, v19.16b\n" - "ldr s14, [%[wbptr], #16]\n" - "mov v4.16b, v19.16b\n" - "ldr s13, [%[wbptr], #20]\n" - "mov v6.16b, v19.16b\n" - "ldr s12, [%[wbptr], #24]\n" - "mov v1.16b, v19.16b\n" - "ldr s11, [%[wbptr], #28]\n" - "mov v3.16b, v19.16b\n" - "ldr s10, [%[wbptr], #32]\n" - "mov v0.16b, v19.16b\n" - "ldr s9, [%[wbptr], #36]\n" - "ldr s25, [%[inptr0]]\n" - "subs x19, x19, #1\n" - "fmla v8.4s, v25.4s, v17.4s\n" - "ldr s27, [x17]\n" - "ldr s26, [%[inptr0], %[input_col_stride1]]\n" - "ldr s20, [x14]\n" - "ldr s22, [x17, %[input_col_stride1]]\n" - "ldr s28, [%[inptr0], x9]\n" - "fmla v8.4s, v27.4s, v14.4s\n" - "ldr s23, [x21]\n" - "ldr s18, [x14, %[input_col_stride1]]\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "prfm pldl1keep, [x17, #64]\n" - "prfm pldl1keep, [%[inptr0], x28]\n" - "prfm pldl1keep, [x14, #64]\n" - "prfm pldl1keep, [x17, x28]\n" - "prfm pldl1keep, [%[inptr0], x22]\n" - "prfm pldl1keep, [x21, #64]\n" - "prfm pldl1keep, [x14, x28]\n" - "beq 6f\n" - "5:\n" - "fmla v5.4s, v27.4s, v17.4s\n" - "ldr s27, [x17, x9]\n" - "fmla v8.4s, v26.4s, v16.4s\n" - "ldr s30, [%[inptr0], x15]\n" - "fmla v7.4s, v26.4s, v17.4s\n" - "ldr s31, [x26]\n" - "fmla v5.4s, v20.4s, v14.4s\n" - "ldr s24, [x21, %[input_col_stride1]]\n" - "fmla v8.4s, v20.4s, v11.4s\n" - "prfm pldl1keep, [x17, x22]\n" - "fmla v2.4s, v20.4s, v17.4s\n" - "ldr s29, [x14, x9]\n" - "fmla v5.4s, v22.4s, v16.4s\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "fmla v8.4s, v22.4s, v13.4s\n" - "prfm pldl1keep, [x26, #64]\n" - "fmla v7.4s, v22.4s, v14.4s\n" - "prfm pldl1keep, [x21, x28]\n" - "fmla v4.4s, v22.4s, v17.4s\n" - "ldr s21, [x17, x15]\n" - "fmla v8.4s, v28.4s, v15.4s\n" - "prfm pldl1keep, [x14, x22]\n" - "fmla v7.4s, v28.4s, v16.4s\n" - "prfm pldl1keep, [x17, x16]\n" - "fmla v6.4s, v28.4s, v17.4s\n" - "ldr s19, [%[inptr0], x24]\n" - "fmla v5.4s, v23.4s, v11.4s\n" - "prfm pldl1keep, [%[inptr0], x23]\n" - "fmla v2.4s, v23.4s, v14.4s\n" - "ldr s28, [x26, %[input_col_stride1]]\n" - "fmla v8.4s, v18.4s, v10.4s\n" - "prfm pldl1keep, [x26, x28]\n" - "fmla v5.4s, v18.4s, v13.4s\n" - "prfm pldl1keep, [x21, x22]\n" - "fmla v7.4s, v18.4s, v11.4s\n" - "prfm pldl1keep, [x14, x16]\n" - "fmla v2.4s, v18.4s, v16.4s\n" - "prfm pldl1keep, [x17, x23]\n" - "fmla v4.4s, v18.4s, v14.4s\n" - "prfm pldl1keep, [x26, x22]\n" - "fmla v1.4s, v18.4s, v17.4s\n" - "ldr s25, [x21, x9]\n" - "fmla v8.4s, v27.4s, v12.4s\n" - "prfm pldl1keep, [x21, x16]\n" - "fmla v5.4s, v27.4s, v15.4s\n" - "prfm pldl1keep, [x14, x23]\n" - "fmla v7.4s, v27.4s, v13.4s\n" - "prfm pldl1keep, [x26, x16]\n" - "fmla v4.4s, v27.4s, v16.4s\n" - "prfm pldl1keep, [x21, x23]\n" - "fmla v6.4s, v27.4s, v14.4s\n" - "prfm pldl1keep, [x26, x23]\n" - "fmla v3.4s, v27.4s, v17.4s\n" - "ldr s27, [x14, x15]\n" - "fmla v7.4s, v30.4s, v15.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v6.4s, v30.4s, v16.4s\n" - "ldr s26, [x17, x24]\n" - "fmla v2.4s, v31.4s, v11.4s\n" - "ldr s20, [x26, x9]\n" - "fmla v5.4s, v24.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v4.4s, v24.4s, v11.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v2.4s, v24.4s, v13.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "fmla v1.4s, v24.4s, v14.4s\n" - "ldr s18, [x21, x15]\n" - "fmla v8.4s, v29.4s, v9.4s\n" - "prfm pldl1keep, [%[inptr0], x28]\n" - "fmla v5.4s, v29.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], x22]\n" - "fmla v7.4s, v29.4s, v10.4s\n" - "add x17, x17, #4\n" - "fmla v2.4s, v29.4s, v15.4s\n" - "prfm pldl1keep, [x17, #64]\n" - "fmla v4.4s, v29.4s, v13.4s\n" - "prfm pldl1keep, [x17, x28]\n" - "fmla v6.4s, v29.4s, v11.4s\n" - "subs x19, x19, #1\n" - "fmla v1.4s, v29.4s, v16.4s\n" - "fmla v3.4s, v29.4s, v14.4s\n" - "fmla v0.4s, v29.4s, v17.4s\n" - "ldr s22, [x14, x24]\n" - "fmla v7.4s, v21.4s, v12.4s\n" - "ldr s23, [x26, x15]\n" - "fmla v4.4s, v21.4s, v15.4s\n" - "add x14, x14, #4\n" - "fmla v6.4s, v21.4s, v13.4s\n" - "prfm pldl1keep, [x14, #64]\n" - "fmla v3.4s, v21.4s, v16.4s\n" - "ldr s24, [x21, x24]\n" - "fmla v2.4s, v28.4s, v10.4s\n" - "prfm pldl1keep, [x14, x28]\n" - "fmla v6.4s, v19.4s, v15.4s\n" - "ldr s21, [x26, x24]\n" - "fmla v1.4s, v28.4s, v11.4s\n" - "ldr s19, [%[wbptr]]\n" - "fmla v5.4s, v25.4s, v9.4s\n" - "add x21, x21, #4\n" - "fmla v2.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [x21, #64]\n" - "fmla v4.4s, v25.4s, v10.4s\n" - "add x26, x26, #4\n" - "fmla v1.4s, v25.4s, v13.4s\n" - "fmla v3.4s, v25.4s, v11.4s\n" - "fmla v0.4s, v25.4s, v14.4s\n" - "ldr s17, [%[wbptr], #4]\n" - "fmla v7.4s, v27.4s, v9.4s\n" - "ldr s25, [%[inptr0]]\n" - "fmla v4.4s, v27.4s, v12.4s\n" - "fmla v6.4s, v27.4s, v10.4s\n" - "fmla v1.4s, v27.4s, v15.4s\n" - "fmla v3.4s, v27.4s, v13.4s\n" - "fmla v0.4s, v27.4s, v16.4s\n" - "ldr s14, [%[wbptr], #16]\n" - "fmla v6.4s, v26.4s, v12.4s\n" - "ldr s27, [x17]\n" - "fmla v3.4s, v26.4s, v15.4s\n" - "ldr s26, [%[inptr0], %[input_col_stride1]]\n" - "fmla v2.4s, v20.4s, v9.4s\n" - "fmla v1.4s, v20.4s, v10.4s\n" - "fmla v0.4s, v20.4s, v11.4s\n" - "ldr s16, [%[wbptr], #8]\n" - "fmla v4.4s, v18.4s, v9.4s\n" - "ldr s20, [x14]\n" - "fmla v1.4s, v18.4s, v12.4s\n" - "fmla v3.4s, v18.4s, v10.4s\n" - "fmla v0.4s, v18.4s, v13.4s\n" - "ldr s11, [%[wbptr], #28]\n" - "fmla v6.4s, v22.4s, v9.4s\n" - "movi v30.16b, #0\n" - "fmla v3.4s, v22.4s, v12.4s\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "fmla v0.4s, v22.4s, v15.4s\n" - "ldr s13, [%[wbptr], #20]\n" - "fmov v29.4s, #6.0\n" - "fmax v8.4s, v8.4s, v30.4s\n" - "fmla v3.4s, v24.4s, v9.4s\n" - "fmax v7.4s, v7.4s, v30.4s\n" - "fmla v0.4s, v23.4s, v10.4s\n" - "ldr s15, [%[wbptr], #12]\n" - "fmin v8.4s, v8.4s, v29.4s\n" - "ldr s22, [x17, %[input_col_stride1]]\n" - "fmin v7.4s, v7.4s, v29.4s\n" - "fmax v6.4s, v6.4s, v30.4s\n" - "str s8, [%[outptr0]]\n" - "fmla v0.4s, v24.4s, v12.4s\n" - "str s7, [%[outptr0], %[output_col_stride1]]\n" - "fmin v6.4s, v6.4s, v29.4s\n" - "fmax v5.4s, v5.4s, v30.4s\n" - "ldr s10, [%[wbptr], #32]\n" - "str s6, [%[outptr0], x27]\n" - "fmla v0.4s, v21.4s, v9.4s\n" - "fmin v5.4s, v5.4s, v29.4s\n" - "ldr s12, [%[wbptr], #24]\n" - "fmax v4.4s, v4.4s, v30.4s\n" - "ldr s28, [%[inptr0], x9]\n" - "str s5, [x25]\n" - "fmax v3.4s, v3.4s, v30.4s\n" - "fmin v4.4s, v4.4s, v29.4s\n" - "ldr s9, [%[wbptr], #36]\n" - "fmin v3.4s, v3.4s, v29.4s\n" - "ldr s23, [x21]\n" - "str s4, [x25, %[output_col_stride1]]\n" - "fmax v2.4s, v2.4s, v30.4s\n" - "str s3, [x25, x27]\n" - "fmax v1.4s, v1.4s, v30.4s\n" - "fmin v2.4s, v2.4s, v29.4s\n" - "ldr s18, [x14, %[input_col_stride1]]\n" - "fmin v1.4s, v1.4s, v29.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "str s2, [x13]\n" - "fmax v0.4s, v0.4s, v30.4s\n" - "str s1, [x13, %[output_col_stride1]]\n" - "mov v8.16b, v19.16b\n" - "fmin v0.4s, v0.4s, v29.4s\n" - "add x25, x25, #4\n" - "mov v5.16b, v19.16b\n" - "mov v7.16b, v19.16b\n" - "str s0, [x13, x27]\n" - "mov v2.16b, v19.16b\n" - "mov v4.16b, v19.16b\n" - "add x13, x13, #4\n" - "mov v6.16b, v19.16b\n" - "mov v1.16b, v19.16b\n" - "mov v3.16b, v19.16b\n" - "mov v0.16b, v19.16b\n" - "fmla v8.4s, v25.4s, v17.4s\n" - "fmla v8.4s, v27.4s, v14.4s\n" - "bne 5b\n" - "6:\n" - "fmla v5.4s, v27.4s, v17.4s\n" - "ldr s27, [x17, x9]\n" - "fmla v8.4s, v26.4s, v16.4s\n" - "ldr s30, [%[inptr0], x15]\n" - "fmla v7.4s, v26.4s, v17.4s\n" - "ldr s31, [x26]\n" - "fmla v5.4s, v20.4s, v14.4s\n" - "ldr s24, [x21, %[input_col_stride1]]\n" - "fmla v8.4s, v20.4s, v11.4s\n" - "prfm pldl1keep, [x17, x22]\n" - "fmla v2.4s, v20.4s, v17.4s\n" - "ldr s29, [x14, x9]\n" - "fmla v5.4s, v22.4s, v16.4s\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "fmla v8.4s, v22.4s, v13.4s\n" - "prfm pldl1keep, [x26, #64]\n" - "fmla v7.4s, v22.4s, v14.4s\n" - "prfm pldl1keep, [x21, x28]\n" - "fmla v4.4s, v22.4s, v17.4s\n" - "ldr s21, [x17, x15]\n" - "fmla v8.4s, v28.4s, v15.4s\n" - "prfm pldl1keep, [x14, x22]\n" - "fmla v7.4s, v28.4s, v16.4s\n" - "prfm pldl1keep, [x17, x16]\n" - "fmla v6.4s, v28.4s, v17.4s\n" - "ldr s19, [%[inptr0], x24]\n" - "fmla v5.4s, v23.4s, v11.4s\n" - "prfm pldl1keep, [%[inptr0], x23]\n" - "fmla v2.4s, v23.4s, v14.4s\n" - "ldr s28, [x26, %[input_col_stride1]]\n" - "fmla v8.4s, v18.4s, v10.4s\n" - "prfm pldl1keep, [x26, x28]\n" - "fmla v5.4s, v18.4s, v13.4s\n" - "prfm pldl1keep, [x21, x22]\n" - "fmla v7.4s, v18.4s, v11.4s\n" - "prfm pldl1keep, [x14, x16]\n" - "fmla v2.4s, v18.4s, v16.4s\n" - "prfm pldl1keep, [x17, x23]\n" - "fmla v4.4s, v18.4s, v14.4s\n" - "prfm pldl1keep, [x26, x22]\n" - "fmla v1.4s, v18.4s, v17.4s\n" - "ldr s25, [x21, x9]\n" - "fmla v8.4s, v27.4s, v12.4s\n" - "prfm pldl1keep, [x21, x16]\n" - "fmla v5.4s, v27.4s, v15.4s\n" - "prfm pldl1keep, [x14, x23]\n" - "fmla v7.4s, v27.4s, v13.4s\n" - "prfm pldl1keep, [x26, x16]\n" - "fmla v4.4s, v27.4s, v16.4s\n" - "prfm pldl1keep, [x21, x23]\n" - "fmla v6.4s, v27.4s, v14.4s\n" - "prfm pldl1keep, [x26, x23]\n" - "fmla v3.4s, v27.4s, v17.4s\n" - "ldr s27, [x14, x15]\n" - "fmla v7.4s, v30.4s, v15.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v6.4s, v30.4s, v16.4s\n" - "ldr s26, [x17, x24]\n" - "fmla v2.4s, v31.4s, v11.4s\n" - "ldr s20, [x26, x9]\n" - "fmla v5.4s, v24.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v4.4s, v24.4s, v11.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v2.4s, v24.4s, v13.4s\n" - "add x17, x17, #4\n" - "fmla v1.4s, v24.4s, v14.4s\n" - "ldr s18, [x21, x15]\n" - "fmla v8.4s, v29.4s, v9.4s\n" - "fmla v5.4s, v29.4s, v12.4s\n" - "fmla v7.4s, v29.4s, v10.4s\n" - "fmla v2.4s, v29.4s, v15.4s\n" - "fmla v4.4s, v29.4s, v13.4s\n" - "fmla v6.4s, v29.4s, v11.4s\n" - "fmla v1.4s, v29.4s, v16.4s\n" - "fmla v3.4s, v29.4s, v14.4s\n" - "fmla v0.4s, v29.4s, v17.4s\n" - "ldr s22, [x14, x24]\n" - "fmla v7.4s, v21.4s, v12.4s\n" - "ldr s23, [x26, x15]\n" - "fmla v4.4s, v21.4s, v15.4s\n" - "add x14, x14, #4\n" - "fmla v6.4s, v21.4s, v13.4s\n" - "fmla v3.4s, v21.4s, v16.4s\n" - "fmla v2.4s, v28.4s, v10.4s\n" - "ldr s24, [x21, x24]\n" - "fmla v1.4s, v28.4s, v11.4s\n" - "ldr s21, [x26, x24]\n" - "fmla v6.4s, v19.4s, v15.4s\n" - "add x21, x21, #4\n" - "fmla v5.4s, v25.4s, v9.4s\n" - "add x26, x26, #4\n" - "fmla v2.4s, v25.4s, v12.4s\n" - "fmla v4.4s, v25.4s, v10.4s\n" - "fmla v1.4s, v25.4s, v13.4s\n" - "fmla v3.4s, v25.4s, v11.4s\n" - "fmla v0.4s, v25.4s, v14.4s\n" - "fmla v7.4s, v27.4s, v9.4s\n" - "fmla v4.4s, v27.4s, v12.4s\n" - "fmla v6.4s, v27.4s, v10.4s\n" - "fmla v1.4s, v27.4s, v15.4s\n" - "fmla v3.4s, v27.4s, v13.4s\n" - "fmla v0.4s, v27.4s, v16.4s\n" - "fmla v2.4s, v20.4s, v9.4s\n" - "fmla v6.4s, v26.4s, v12.4s\n" - "fmla v4.4s, v18.4s, v9.4s\n" - "fmla v3.4s, v26.4s, v15.4s\n" - "fmla v1.4s, v20.4s, v10.4s\n" - "fmla v0.4s, v20.4s, v11.4s\n" - "movi v30.16b, #0\n" - "fmla v6.4s, v22.4s, v9.4s\n" - "fmov v29.4s, #6.0\n" - "fmla v1.4s, v18.4s, v12.4s\n" - "fmla v3.4s, v18.4s, v10.4s\n" - "fmla v0.4s, v18.4s, v13.4s\n" - "fmax v8.4s, v8.4s, v30.4s\n" - "fmax v7.4s, v7.4s, v30.4s\n" - "fmax v6.4s, v6.4s, v30.4s\n" - "fmla v3.4s, v22.4s, v12.4s\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "fmla v0.4s, v22.4s, v15.4s\n" - "fmin v8.4s, v8.4s, v29.4s\n" - "fmin v7.4s, v7.4s, v29.4s\n" - "fmin v6.4s, v6.4s, v29.4s\n" - "str s8, [%[outptr0]]\n" - "fmla v3.4s, v24.4s, v9.4s\n" - "str s7, [%[outptr0], %[output_col_stride1]]\n" - "fmla v0.4s, v23.4s, v10.4s\n" - "str s6, [%[outptr0], x27]\n" - "fmax v5.4s, v5.4s, v30.4s\n" - "fmax v4.4s, v4.4s, v30.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v0.4s, v24.4s, v12.4s\n" - "fmin v5.4s, v5.4s, v29.4s\n" - "fmin v4.4s, v4.4s, v29.4s\n" - "fmax v3.4s, v3.4s, v30.4s\n" - "str s5, [x25]\n" - "fmax v2.4s, v2.4s, v30.4s\n" - "str s4, [x25, %[output_col_stride1]]\n" - "fmla v0.4s, v21.4s, v9.4s\n" - "fmin v3.4s, v3.4s, v29.4s\n" - "fmin v2.4s, v2.4s, v29.4s\n" - "fmax v1.4s, v1.4s, v30.4s\n" - "str s3, [x25, x27]\n" - "str s2, [x13]\n" - "fmin v1.4s, v1.4s, v29.4s\n" - "fmax v0.4s, v0.4s, v30.4s\n" - "add x25, x25, #4\n" - "str s1, [x13, %[output_col_stride1]]\n" - "fmin v0.4s, v0.4s, v29.4s\n" - "str s0, [x13, x27]\n" - "add x13, x13, #4\n" - "7:\n" - : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr) - : [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -#endif // __aarch64__ - -template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>; - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp deleted file mode 100644 index b798b8cdbe..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp +++ /dev/null @@ -1,769 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "impl_fp32_fp32.hpp" - -namespace depthwise -{ - -using namespace neon_convolution_kernels; -using Conv = DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>; - -#ifdef __aarch64__ -template <> -template <> -void Conv::execute_tile<ActivationFunction::None>( - int n_channels, - const void* weight_bias_ptr, - const float* input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float* output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x15, %[inptr0], %[input_row_stride]\n" - "add x26, %[input_col_stride1], %[input_col_stride1]\n" - "add x21, %[outptr0], %[output_row_stride]\n" - "add x16, x15, %[input_row_stride]\n" - "add x27, x26, %[input_col_stride1]\n" - "add x22, x21, %[output_row_stride]\n" - "add x17, x16, %[input_row_stride]\n" - "add x28, x27, %[input_col_stride1]\n" - "add x23, %[output_col_stride1], %[output_col_stride1]\n" - "add x9, x17, %[input_row_stride]\n" - "add x13, x28, %[input_col_stride1]\n" - "and x24, %[n_channels], #3\n" - "add x19, x9, %[input_row_stride]\n" - "add x14, x13, %[input_col_stride1]\n" - "lsr x25, %[n_channels], #2\n" - "add x20, x19, %[input_row_stride]\n" - "cbz x25, 4f\n" - "1:\n" - "ldr q27, [%[wbptr]]\n" - "subs x25, x25, #1\n" - "mov v17.16b, v27.16b\n" - "ldr q6, [%[wbptr], #16]\n" - "mov v16.16b, v27.16b\n" - "ldr q14, [%[wbptr], #32]\n" - "mov v15.16b, v27.16b\n" - "ldr q13, [%[wbptr], #48]\n" - "mov v2.16b, v27.16b\n" - "ldr q12, [%[wbptr], #64]\n" - "mov v4.16b, v27.16b\n" - "ldr q11, [%[wbptr], #80]\n" - "mov v5.16b, v27.16b\n" - "ldr q10, [%[wbptr], #96]\n" - "mov v1.16b, v27.16b\n" - "ldr q9, [%[wbptr], #112]\n" - "mov v3.16b, v27.16b\n" - "ldr q8, [%[wbptr], #128]\n" - "mov v0.16b, v27.16b\n" - "ldr q7, [%[wbptr], #144]\n" - "ldr q29, [%[inptr0]]\n" - "ldr q28, [x15]\n" - "ldr q26, [%[inptr0], %[input_col_stride1]]\n" - "ldr q22, [x16]\n" - "ldr q20, [x15, %[input_col_stride1]]\n" - "ldr q19, [%[inptr0], x26]\n" - "ldr q30, [x17]\n" - "ldr q18, [x16, %[input_col_stride1]]\n" - "beq 3f\n" - "2:\n" - "fmla v17.4s, v29.4s, v6.4s\n" - "ldr q21, [x15, x26]\n" - "fmla v16.4s, v22.4s, v6.4s\n" - "ldr q27, [%[inptr0], x27]\n" - "fmla v15.4s, v19.4s, v6.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v17.4s, v28.4s, v12.4s\n" - "ldr q25, [x9]\n" - "fmla v16.4s, v30.4s, v12.4s\n" - "ldr q24, [x17, %[input_col_stride1]]\n" - "fmla v15.4s, v21.4s, v12.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v17.4s, v26.4s, v14.4s\n" - "ldr q23, [x16, x26]\n" - "fmla v16.4s, v18.4s, v14.4s\n" - "subs x25, x25, #1\n" - "fmla v15.4s, v27.4s, v14.4s\n" - "ldr q26, [x15, x27]\n" - "fmla v17.4s, v22.4s, v9.4s\n" - "ldr q22, [%[inptr0], x28]\n" - "fmla v16.4s, v25.4s, v9.4s\n" - "fmla v2.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v23.4s, v9.4s\n" - "ldr q30, [x19]\n" - "fmla v17.4s, v20.4s, v11.4s\n" - "ldr q29, [x9, %[input_col_stride1]]\n" - "fmla v16.4s, v24.4s, v11.4s\n" - "ldr q28, [x17, x26]\n" - "fmla v4.4s, v23.4s, v6.4s\n" - "fmla v15.4s, v26.4s, v11.4s\n" - "fmla v17.4s, v19.4s, v13.4s\n" - "ldr q24, [x16, x27]\n" - "fmla v16.4s, v23.4s, v13.4s\n" - "ldr q25, [x15, x28]\n" - "fmla v15.4s, v22.4s, v13.4s\n" - "fmla v5.4s, v22.4s, v6.4s\n" - "fmla v17.4s, v18.4s, v8.4s\n" - "ldr q19, [%[inptr0], x13]\n" - "fmla v2.4s, v30.4s, v12.4s\n" - "ldr q18, [x20]\n" - "fmla v16.4s, v29.4s, v8.4s\n" - "ldr q22, [x19, %[input_col_stride1]]\n" - "fmla v17.4s, v21.4s, v10.4s\n" - "ldr q26, [x9, x26]\n" - "fmla v2.4s, v29.4s, v14.4s\n" - "ldr q20, [x17, x27]\n" - "fmla v16.4s, v28.4s, v10.4s\n" - "fmla v4.4s, v28.4s, v12.4s\n" - "fmla v17.4s, v23.4s, v7.4s\n" - "ldr q27, [x16, x28]\n" - "fmla v15.4s, v24.4s, v8.4s\n" - "ldr q30, [x15, x13]\n" - "fmla v4.4s, v24.4s, v14.4s\n" - "ldr q24, [%[inptr0], x14]\n" - "str q17, [%[outptr0]]\n" - "fmla v5.4s, v25.4s, v12.4s\n" - "fmla v15.4s, v25.4s, v10.4s\n" - "ldr q28, [x20, %[input_col_stride1]]\n" - "fmla v2.4s, v18.4s, v9.4s\n" - "ldr q17, [x19, x26]\n" - "fmla v5.4s, v19.4s, v14.4s\n" - "ldr q18, [x9, x27]\n" - "fmla v16.4s, v26.4s, v7.4s\n" - "ldr q25, [x17, x28]\n" - "fmla v2.4s, v22.4s, v11.4s\n" - "ldr q22, [x16, x13]\n" - "fmla v4.4s, v26.4s, v9.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "str q16, [x21]\n" - "fmla v1.4s, v26.4s, v6.4s\n" - "fmla v2.4s, v26.4s, v13.4s\n" - "ldr q21, [x15, x14]\n" - "fmla v4.4s, v20.4s, v11.4s\n" - "ldr q23, [x20, x26]\n" - "fmla v15.4s, v27.4s, v7.4s\n" - "ldr q19, [x19, x27]\n" - "fmla v5.4s, v27.4s, v9.4s\n" - "add x15, x15, #16\n" - "fmla v4.4s, v27.4s, v13.4s\n" - "fmla v3.4s, v27.4s, v6.4s\n" - "str q15, [%[outptr0], %[output_col_stride1]]\n" - "fmla v2.4s, v28.4s, v8.4s\n" - "fmla v5.4s, v30.4s, v11.4s\n" - "ldr q29, [x9, x28]\n" - "fmla v1.4s, v17.4s, v12.4s\n" - "ldr q27, [x17, x13]\n" - "fmla v2.4s, v17.4s, v10.4s\n" - "ldr q28, [x16, x14]\n" - "fmla v5.4s, v24.4s, v13.4s\n" - "ldr q26, [x20, x27]\n" - "fmla v4.4s, v18.4s, v8.4s\n" - "ldr q20, [x19, x28]\n" - "fmla v1.4s, v18.4s, v14.4s\n" - "ldr q17, [x9, x13]\n" - "fmla v3.4s, v25.4s, v12.4s\n" - "ldr q18, [x17, x14]\n" - "fmla v4.4s, v25.4s, v10.4s\n" - "ldr q16, [x20, x28]\n" - "fmla v5.4s, v22.4s, v8.4s\n" - "add x16, x16, #16\n" - "fmla v3.4s, v22.4s, v14.4s\n" - "ldr q15, [x19, x13]\n" - "fmla v2.4s, v23.4s, v7.4s\n" - "add x17, x17, #16\n" - "fmla v5.4s, v21.4s, v10.4s\n" - "ldr q21, [x9, x14]\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "ldr q23, [x20, x13]\n" - "str q2, [x22]\n" - "fmla v4.4s, v29.4s, v7.4s\n" - "fmla v3.4s, v29.4s, v9.4s\n" - "ldr q24, [x19, x14]\n" - "fmla v1.4s, v19.4s, v11.4s\n" - "ldr q25, [x20, x14]\n" - "str q4, [x21, %[output_col_stride1]]\n" - "fmla v0.4s, v29.4s, v6.4s\n" - "fmla v3.4s, v27.4s, v11.4s\n" - "ldr q27, [%[wbptr]]\n" - "fmla v1.4s, v29.4s, v13.4s\n" - "ldr q29, [%[inptr0]]\n" - "fmla v5.4s, v28.4s, v7.4s\n" - "ldr q6, [%[wbptr], #16]\n" - "fmla v3.4s, v28.4s, v13.4s\n" - "ldr q28, [x15]\n" - "fmla v1.4s, v26.4s, v8.4s\n" - "ldr q26, [%[inptr0], %[input_col_stride1]]\n" - "str q5, [%[outptr0], x23]\n" - "fmla v0.4s, v20.4s, v12.4s\n" - "fmla v3.4s, v17.4s, v8.4s\n" - "ldr q22, [x16]\n" - "fmla v1.4s, v20.4s, v10.4s\n" - "ldr q20, [x15, %[input_col_stride1]]\n" - "fmla v0.4s, v17.4s, v14.4s\n" - "ldr q12, [%[wbptr], #64]\n" - "fmla v3.4s, v18.4s, v10.4s\n" - "ldr q19, [%[inptr0], x26]\n" - "fmla v1.4s, v16.4s, v7.4s\n" - "ldr q30, [x17]\n" - "fmla v0.4s, v16.4s, v9.4s\n" - "ldr q14, [%[wbptr], #32]\n" - "fmla v3.4s, v21.4s, v7.4s\n" - "ldr q18, [x16, %[input_col_stride1]]\n" - "str q1, [x22, %[output_col_stride1]]\n" - "mov v17.16b, v27.16b\n" - "fmla v0.4s, v15.4s, v11.4s\n" - "ldr q9, [%[wbptr], #112]\n" - "str q3, [x21, x23]\n" - "mov v16.16b, v27.16b\n" - "mov v15.16b, v27.16b\n" - "add x9, x9, #16\n" - "fmla v0.4s, v21.4s, v13.4s\n" - "ldr q11, [%[wbptr], #80]\n" - "mov v2.16b, v27.16b\n" - "add x19, x19, #16\n" - "mov v4.16b, v27.16b\n" - "add x20, x20, #16\n" - "fmla v0.4s, v23.4s, v8.4s\n" - "ldr q13, [%[wbptr], #48]\n" - "mov v5.16b, v27.16b\n" - "add %[outptr0], %[outptr0], #16\n" - "mov v1.16b, v27.16b\n" - "add x21, x21, #16\n" - "fmla v0.4s, v24.4s, v10.4s\n" - "ldr q8, [%[wbptr], #128]\n" - "mov v3.16b, v27.16b\n" - "fmla v0.4s, v25.4s, v7.4s\n" - "ldr q10, [%[wbptr], #96]\n" - "str q0, [x22, x23]\n" - "mov v0.16b, v27.16b\n" - "ldr q7, [%[wbptr], #144]\n" - "add x22, x22, #16\n" - "bne 2b\n" - "3:\n" - "fmla v17.4s, v29.4s, v6.4s\n" - "ldr q21, [x15, x26]\n" - "fmla v16.4s, v22.4s, v6.4s\n" - "ldr q27, [%[inptr0], x27]\n" - "fmla v15.4s, v19.4s, v6.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v17.4s, v28.4s, v12.4s\n" - "ldr q25, [x9]\n" - "fmla v16.4s, v30.4s, v12.4s\n" - "ldr q24, [x17, %[input_col_stride1]]\n" - "fmla v15.4s, v21.4s, v12.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v17.4s, v26.4s, v14.4s\n" - "ldr q23, [x16, x26]\n" - "fmla v16.4s, v18.4s, v14.4s\n" - "fmla v2.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v27.4s, v14.4s\n" - "ldr q26, [x15, x27]\n" - "fmla v17.4s, v22.4s, v9.4s\n" - "ldr q22, [%[inptr0], x28]\n" - "fmla v16.4s, v25.4s, v9.4s\n" - "ldr q30, [x19]\n" - "fmla v15.4s, v23.4s, v9.4s\n" - "fmla v4.4s, v23.4s, v6.4s\n" - "fmla v17.4s, v20.4s, v11.4s\n" - "ldr q29, [x9, %[input_col_stride1]]\n" - "fmla v16.4s, v24.4s, v11.4s\n" - "ldr q28, [x17, x26]\n" - "fmla v15.4s, v26.4s, v11.4s\n" - "ldr q24, [x16, x27]\n" - "fmla v17.4s, v19.4s, v13.4s\n" - "ldr q25, [x15, x28]\n" - "fmla v16.4s, v23.4s, v13.4s\n" - "fmla v5.4s, v22.4s, v6.4s\n" - "fmla v15.4s, v22.4s, v13.4s\n" - "ldr q19, [%[inptr0], x13]\n" - "fmla v17.4s, v18.4s, v8.4s\n" - "ldr q18, [x20]\n" - "fmla v2.4s, v30.4s, v12.4s\n" - "ldr q22, [x19, %[input_col_stride1]]\n" - "fmla v16.4s, v29.4s, v8.4s\n" - "fmla v4.4s, v28.4s, v12.4s\n" - "fmla v17.4s, v21.4s, v10.4s\n" - "ldr q26, [x9, x26]\n" - "fmla v2.4s, v29.4s, v14.4s\n" - "ldr q20, [x17, x27]\n" - "fmla v16.4s, v28.4s, v10.4s\n" - "ldr q27, [x16, x28]\n" - "fmla v17.4s, v23.4s, v7.4s\n" - "ldr q30, [x15, x13]\n" - "fmla v15.4s, v24.4s, v8.4s\n" - "fmla v4.4s, v24.4s, v14.4s\n" - "fmla v5.4s, v25.4s, v12.4s\n" - "ldr q24, [%[inptr0], x14]\n" - "str q17, [%[outptr0]]\n" - "fmla v2.4s, v18.4s, v9.4s\n" - "fmla v15.4s, v25.4s, v10.4s\n" - "ldr q28, [x20, %[input_col_stride1]]\n" - "fmla v5.4s, v19.4s, v14.4s\n" - "ldr q17, [x19, x26]\n" - "fmla v2.4s, v22.4s, v11.4s\n" - "ldr q18, [x9, x27]\n" - "fmla v16.4s, v26.4s, v7.4s\n" - "ldr q25, [x17, x28]\n" - "fmla v4.4s, v26.4s, v9.4s\n" - "ldr q22, [x16, x13]\n" - "fmla v2.4s, v26.4s, v13.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "str q16, [x21]\n" - "fmla v1.4s, v26.4s, v6.4s\n" - "fmla v4.4s, v20.4s, v11.4s\n" - "ldr q21, [x15, x14]\n" - "fmla v15.4s, v27.4s, v7.4s\n" - "ldr q23, [x20, x26]\n" - "fmla v5.4s, v27.4s, v9.4s\n" - "ldr q19, [x19, x27]\n" - "fmla v4.4s, v27.4s, v13.4s\n" - "add x15, x15, #16\n" - "str q15, [%[outptr0], %[output_col_stride1]]\n" - "fmla v3.4s, v27.4s, v6.4s\n" - "fmla v5.4s, v30.4s, v11.4s\n" - "ldr q29, [x9, x28]\n" - "fmla v2.4s, v28.4s, v8.4s\n" - "ldr q27, [x17, x13]\n" - "fmla v1.4s, v17.4s, v12.4s\n" - "ldr q28, [x16, x14]\n" - "fmla v5.4s, v24.4s, v13.4s\n" - "ldr q26, [x20, x27]\n" - "fmla v2.4s, v17.4s, v10.4s\n" - "ldr q20, [x19, x28]\n" - "fmla v4.4s, v18.4s, v8.4s\n" - "ldr q17, [x9, x13]\n" - "fmla v1.4s, v18.4s, v14.4s\n" - "ldr q18, [x17, x14]\n" - "fmla v3.4s, v25.4s, v12.4s\n" - "add x16, x16, #16\n" - "fmla v4.4s, v25.4s, v10.4s\n" - "ldr q16, [x20, x28]\n" - "fmla v5.4s, v22.4s, v8.4s\n" - "add x17, x17, #16\n" - "fmla v3.4s, v22.4s, v14.4s\n" - "ldr q15, [x19, x13]\n" - "fmla v2.4s, v23.4s, v7.4s\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "fmla v5.4s, v21.4s, v10.4s\n" - "ldr q21, [x9, x14]\n" - "fmla v4.4s, v29.4s, v7.4s\n" - "ldr q23, [x20, x13]\n" - "str q2, [x22]\n" - "fmla v1.4s, v19.4s, v11.4s\n" - "fmla v3.4s, v29.4s, v9.4s\n" - "ldr q24, [x19, x14]\n" - "str q4, [x21, %[output_col_stride1]]\n" - "fmla v0.4s, v29.4s, v6.4s\n" - "fmla v1.4s, v29.4s, v13.4s\n" - "ldr q25, [x20, x14]\n" - "fmla v3.4s, v27.4s, v11.4s\n" - "add x9, x9, #16\n" - "fmla v5.4s, v28.4s, v7.4s\n" - "add x19, x19, #16\n" - "fmla v1.4s, v26.4s, v8.4s\n" - "add x20, x20, #16\n" - "fmla v3.4s, v28.4s, v13.4s\n" - "fmla v0.4s, v20.4s, v12.4s\n" - "str q5, [%[outptr0], x23]\n" - "fmla v1.4s, v20.4s, v10.4s\n" - "fmla v3.4s, v17.4s, v8.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v0.4s, v17.4s, v14.4s\n" - "fmla v1.4s, v16.4s, v7.4s\n" - "fmla v3.4s, v18.4s, v10.4s\n" - "fmla v0.4s, v16.4s, v9.4s\n" - "str q1, [x22, %[output_col_stride1]]\n" - "fmla v3.4s, v21.4s, v7.4s\n" - "fmla v0.4s, v15.4s, v11.4s\n" - "str q3, [x21, x23]\n" - "fmla v0.4s, v21.4s, v13.4s\n" - "add x21, x21, #16\n" - "fmla v0.4s, v23.4s, v8.4s\n" - "fmla v0.4s, v24.4s, v10.4s\n" - "fmla v0.4s, v25.4s, v7.4s\n" - "str q0, [x22, x23]\n" - "add x22, x22, #16\n" - "4:\n" - "cbz x24, 7f\n" - "ldr s27, [%[wbptr]]\n" - "mov v17.16b, v27.16b\n" - "ldr s6, [%[wbptr], #4]\n" - "mov v16.16b, v27.16b\n" - "ldr s14, [%[wbptr], #8]\n" - "mov v15.16b, v27.16b\n" - "ldr s13, [%[wbptr], #12]\n" - "mov v2.16b, v27.16b\n" - "ldr s12, [%[wbptr], #16]\n" - "mov v4.16b, v27.16b\n" - "ldr s11, [%[wbptr], #20]\n" - "mov v5.16b, v27.16b\n" - "ldr s10, [%[wbptr], #24]\n" - "mov v1.16b, v27.16b\n" - "ldr s9, [%[wbptr], #28]\n" - "mov v3.16b, v27.16b\n" - "ldr s8, [%[wbptr], #32]\n" - "mov v0.16b, v27.16b\n" - "ldr s7, [%[wbptr], #36]\n" - "ldr s29, [%[inptr0]]\n" - "subs x24, x24, #1\n" - "ldr s28, [x15]\n" - "ldr s26, [%[inptr0], %[input_col_stride1]]\n" - "ldr s22, [x16]\n" - "ldr s20, [x15, %[input_col_stride1]]\n" - "ldr s19, [%[inptr0], x26]\n" - "ldr s30, [x17]\n" - "ldr s18, [x16, %[input_col_stride1]]\n" - "beq 6f\n" - "5:\n" - "fmla v17.4s, v29.4s, v6.4s\n" - "ldr s21, [x15, x26]\n" - "fmla v16.4s, v22.4s, v6.4s\n" - "ldr s27, [%[inptr0], x27]\n" - "fmla v15.4s, v19.4s, v6.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v17.4s, v28.4s, v12.4s\n" - "ldr s25, [x9]\n" - "fmla v16.4s, v30.4s, v12.4s\n" - "ldr s24, [x17, %[input_col_stride1]]\n" - "fmla v15.4s, v21.4s, v12.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v17.4s, v26.4s, v14.4s\n" - "ldr s23, [x16, x26]\n" - "fmla v16.4s, v18.4s, v14.4s\n" - "subs x24, x24, #1\n" - "fmla v15.4s, v27.4s, v14.4s\n" - "ldr s26, [x15, x27]\n" - "fmla v17.4s, v22.4s, v9.4s\n" - "ldr s22, [%[inptr0], x28]\n" - "fmla v16.4s, v25.4s, v9.4s\n" - "fmla v2.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v23.4s, v9.4s\n" - "ldr s30, [x19]\n" - "fmla v17.4s, v20.4s, v11.4s\n" - "ldr s29, [x9, %[input_col_stride1]]\n" - "fmla v16.4s, v24.4s, v11.4s\n" - "ldr s28, [x17, x26]\n" - "fmla v4.4s, v23.4s, v6.4s\n" - "fmla v15.4s, v26.4s, v11.4s\n" - "fmla v17.4s, v19.4s, v13.4s\n" - "ldr s24, [x16, x27]\n" - "fmla v16.4s, v23.4s, v13.4s\n" - "ldr s25, [x15, x28]\n" - "fmla v15.4s, v22.4s, v13.4s\n" - "fmla v5.4s, v22.4s, v6.4s\n" - "fmla v17.4s, v18.4s, v8.4s\n" - "ldr s19, [%[inptr0], x13]\n" - "fmla v2.4s, v30.4s, v12.4s\n" - "ldr s18, [x20]\n" - "fmla v16.4s, v29.4s, v8.4s\n" - "ldr s22, [x19, %[input_col_stride1]]\n" - "fmla v17.4s, v21.4s, v10.4s\n" - "ldr s26, [x9, x26]\n" - "fmla v2.4s, v29.4s, v14.4s\n" - "ldr s20, [x17, x27]\n" - "fmla v16.4s, v28.4s, v10.4s\n" - "fmla v4.4s, v28.4s, v12.4s\n" - "fmla v17.4s, v23.4s, v7.4s\n" - "ldr s27, [x16, x28]\n" - "fmla v15.4s, v24.4s, v8.4s\n" - "ldr s30, [x15, x13]\n" - "fmla v4.4s, v24.4s, v14.4s\n" - "ldr s24, [%[inptr0], x14]\n" - "str s17, [%[outptr0]]\n" - "fmla v5.4s, v25.4s, v12.4s\n" - "fmla v15.4s, v25.4s, v10.4s\n" - "ldr s28, [x20, %[input_col_stride1]]\n" - "fmla v2.4s, v18.4s, v9.4s\n" - "ldr s17, [x19, x26]\n" - "fmla v5.4s, v19.4s, v14.4s\n" - "ldr s18, [x9, x27]\n" - "fmla v16.4s, v26.4s, v7.4s\n" - "ldr s25, [x17, x28]\n" - "fmla v2.4s, v22.4s, v11.4s\n" - "ldr s22, [x16, x13]\n" - "fmla v4.4s, v26.4s, v9.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "str s16, [x21]\n" - "fmla v1.4s, v26.4s, v6.4s\n" - "fmla v2.4s, v26.4s, v13.4s\n" - "ldr s21, [x15, x14]\n" - "fmla v4.4s, v20.4s, v11.4s\n" - "ldr s23, [x20, x26]\n" - "fmla v15.4s, v27.4s, v7.4s\n" - "ldr s19, [x19, x27]\n" - "fmla v5.4s, v27.4s, v9.4s\n" - "add x15, x15, #4\n" - "fmla v4.4s, v27.4s, v13.4s\n" - "fmla v3.4s, v27.4s, v6.4s\n" - "str s15, [%[outptr0], %[output_col_stride1]]\n" - "fmla v2.4s, v28.4s, v8.4s\n" - "fmla v5.4s, v30.4s, v11.4s\n" - "ldr s29, [x9, x28]\n" - "fmla v1.4s, v17.4s, v12.4s\n" - "ldr s27, [x17, x13]\n" - "fmla v2.4s, v17.4s, v10.4s\n" - "ldr s28, [x16, x14]\n" - "fmla v5.4s, v24.4s, v13.4s\n" - "ldr s26, [x20, x27]\n" - "fmla v4.4s, v18.4s, v8.4s\n" - "ldr s20, [x19, x28]\n" - "fmla v1.4s, v18.4s, v14.4s\n" - "ldr s17, [x9, x13]\n" - "fmla v3.4s, v25.4s, v12.4s\n" - "ldr s18, [x17, x14]\n" - "fmla v4.4s, v25.4s, v10.4s\n" - "ldr s16, [x20, x28]\n" - "fmla v5.4s, v22.4s, v8.4s\n" - "add x16, x16, #4\n" - "fmla v3.4s, v22.4s, v14.4s\n" - "ldr s15, [x19, x13]\n" - "fmla v2.4s, v23.4s, v7.4s\n" - "add x17, x17, #4\n" - "fmla v5.4s, v21.4s, v10.4s\n" - "ldr s21, [x9, x14]\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "ldr s23, [x20, x13]\n" - "str s2, [x22]\n" - "fmla v4.4s, v29.4s, v7.4s\n" - "fmla v3.4s, v29.4s, v9.4s\n" - "ldr s24, [x19, x14]\n" - "fmla v1.4s, v19.4s, v11.4s\n" - "ldr s25, [x20, x14]\n" - "str s4, [x21, %[output_col_stride1]]\n" - "fmla v0.4s, v29.4s, v6.4s\n" - "fmla v3.4s, v27.4s, v11.4s\n" - "ldr s27, [%[wbptr]]\n" - "fmla v1.4s, v29.4s, v13.4s\n" - "ldr s29, [%[inptr0]]\n" - "fmla v5.4s, v28.4s, v7.4s\n" - "ldr s6, [%[wbptr], #4]\n" - "fmla v3.4s, v28.4s, v13.4s\n" - "ldr s28, [x15]\n" - "fmla v1.4s, v26.4s, v8.4s\n" - "ldr s26, [%[inptr0], %[input_col_stride1]]\n" - "str s5, [%[outptr0], x23]\n" - "fmla v0.4s, v20.4s, v12.4s\n" - "fmla v3.4s, v17.4s, v8.4s\n" - "ldr s22, [x16]\n" - "fmla v1.4s, v20.4s, v10.4s\n" - "ldr s20, [x15, %[input_col_stride1]]\n" - "fmla v0.4s, v17.4s, v14.4s\n" - "ldr s12, [%[wbptr], #16]\n" - "fmla v3.4s, v18.4s, v10.4s\n" - "ldr s19, [%[inptr0], x26]\n" - "fmla v1.4s, v16.4s, v7.4s\n" - "ldr s30, [x17]\n" - "fmla v0.4s, v16.4s, v9.4s\n" - "ldr s14, [%[wbptr], #8]\n" - "fmla v3.4s, v21.4s, v7.4s\n" - "ldr s18, [x16, %[input_col_stride1]]\n" - "str s1, [x22, %[output_col_stride1]]\n" - "mov v17.16b, v27.16b\n" - "fmla v0.4s, v15.4s, v11.4s\n" - "ldr s9, [%[wbptr], #28]\n" - "str s3, [x21, x23]\n" - "mov v16.16b, v27.16b\n" - "mov v15.16b, v27.16b\n" - "add x9, x9, #4\n" - "fmla v0.4s, v21.4s, v13.4s\n" - "ldr s11, [%[wbptr], #20]\n" - "mov v2.16b, v27.16b\n" - "add x19, x19, #4\n" - "mov v4.16b, v27.16b\n" - "add x20, x20, #4\n" - "fmla v0.4s, v23.4s, v8.4s\n" - "ldr s13, [%[wbptr], #12]\n" - "mov v5.16b, v27.16b\n" - "add %[outptr0], %[outptr0], #4\n" - "mov v1.16b, v27.16b\n" - "add x21, x21, #4\n" - "fmla v0.4s, v24.4s, v10.4s\n" - "ldr s8, [%[wbptr], #32]\n" - "mov v3.16b, v27.16b\n" - "fmla v0.4s, v25.4s, v7.4s\n" - "ldr s10, [%[wbptr], #24]\n" - "str s0, [x22, x23]\n" - "mov v0.16b, v27.16b\n" - "ldr s7, [%[wbptr], #36]\n" - "add x22, x22, #4\n" - "bne 5b\n" - "6:\n" - "fmla v17.4s, v29.4s, v6.4s\n" - "ldr s21, [x15, x26]\n" - "fmla v16.4s, v22.4s, v6.4s\n" - "ldr s27, [%[inptr0], x27]\n" - "fmla v15.4s, v19.4s, v6.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v17.4s, v28.4s, v12.4s\n" - "ldr s25, [x9]\n" - "fmla v16.4s, v30.4s, v12.4s\n" - "ldr s24, [x17, %[input_col_stride1]]\n" - "fmla v15.4s, v21.4s, v12.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v17.4s, v26.4s, v14.4s\n" - "ldr s23, [x16, x26]\n" - "fmla v16.4s, v18.4s, v14.4s\n" - "fmla v2.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v27.4s, v14.4s\n" - "ldr s26, [x15, x27]\n" - "fmla v17.4s, v22.4s, v9.4s\n" - "ldr s22, [%[inptr0], x28]\n" - "fmla v16.4s, v25.4s, v9.4s\n" - "ldr s30, [x19]\n" - "fmla v15.4s, v23.4s, v9.4s\n" - "fmla v4.4s, v23.4s, v6.4s\n" - "fmla v17.4s, v20.4s, v11.4s\n" - "ldr s29, [x9, %[input_col_stride1]]\n" - "fmla v16.4s, v24.4s, v11.4s\n" - "ldr s28, [x17, x26]\n" - "fmla v15.4s, v26.4s, v11.4s\n" - "ldr s24, [x16, x27]\n" - "fmla v17.4s, v19.4s, v13.4s\n" - "ldr s25, [x15, x28]\n" - "fmla v16.4s, v23.4s, v13.4s\n" - "fmla v5.4s, v22.4s, v6.4s\n" - "fmla v15.4s, v22.4s, v13.4s\n" - "ldr s19, [%[inptr0], x13]\n" - "fmla v17.4s, v18.4s, v8.4s\n" - "ldr s18, [x20]\n" - "fmla v2.4s, v30.4s, v12.4s\n" - "ldr s22, [x19, %[input_col_stride1]]\n" - "fmla v16.4s, v29.4s, v8.4s\n" - "fmla v4.4s, v28.4s, v12.4s\n" - "fmla v17.4s, v21.4s, v10.4s\n" - "ldr s26, [x9, x26]\n" - "fmla v2.4s, v29.4s, v14.4s\n" - "ldr s20, [x17, x27]\n" - "fmla v16.4s, v28.4s, v10.4s\n" - "ldr s27, [x16, x28]\n" - "fmla v17.4s, v23.4s, v7.4s\n" - "ldr s30, [x15, x13]\n" - "fmla v15.4s, v24.4s, v8.4s\n" - "fmla v4.4s, v24.4s, v14.4s\n" - "fmla v5.4s, v25.4s, v12.4s\n" - "ldr s24, [%[inptr0], x14]\n" - "str s17, [%[outptr0]]\n" - "fmla v2.4s, v18.4s, v9.4s\n" - "fmla v15.4s, v25.4s, v10.4s\n" - "ldr s28, [x20, %[input_col_stride1]]\n" - "fmla v5.4s, v19.4s, v14.4s\n" - "ldr s17, [x19, x26]\n" - "fmla v2.4s, v22.4s, v11.4s\n" - "ldr s18, [x9, x27]\n" - "fmla v16.4s, v26.4s, v7.4s\n" - "ldr s25, [x17, x28]\n" - "fmla v4.4s, v26.4s, v9.4s\n" - "ldr s22, [x16, x13]\n" - "fmla v2.4s, v26.4s, v13.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "str s16, [x21]\n" - "fmla v1.4s, v26.4s, v6.4s\n" - "fmla v4.4s, v20.4s, v11.4s\n" - "ldr s21, [x15, x14]\n" - "fmla v15.4s, v27.4s, v7.4s\n" - "ldr s23, [x20, x26]\n" - "fmla v5.4s, v27.4s, v9.4s\n" - "ldr s19, [x19, x27]\n" - "fmla v4.4s, v27.4s, v13.4s\n" - "add x15, x15, #4\n" - "str s15, [%[outptr0], %[output_col_stride1]]\n" - "fmla v3.4s, v27.4s, v6.4s\n" - "fmla v5.4s, v30.4s, v11.4s\n" - "ldr s29, [x9, x28]\n" - "fmla v2.4s, v28.4s, v8.4s\n" - "ldr s27, [x17, x13]\n" - "fmla v1.4s, v17.4s, v12.4s\n" - "ldr s28, [x16, x14]\n" - "fmla v5.4s, v24.4s, v13.4s\n" - "ldr s26, [x20, x27]\n" - "fmla v2.4s, v17.4s, v10.4s\n" - "ldr s20, [x19, x28]\n" - "fmla v4.4s, v18.4s, v8.4s\n" - "ldr s17, [x9, x13]\n" - "fmla v1.4s, v18.4s, v14.4s\n" - "ldr s18, [x17, x14]\n" - "fmla v3.4s, v25.4s, v12.4s\n" - "add x16, x16, #4\n" - "fmla v4.4s, v25.4s, v10.4s\n" - "ldr s16, [x20, x28]\n" - "fmla v5.4s, v22.4s, v8.4s\n" - "add x17, x17, #4\n" - "fmla v3.4s, v22.4s, v14.4s\n" - "ldr s15, [x19, x13]\n" - "fmla v2.4s, v23.4s, v7.4s\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "fmla v5.4s, v21.4s, v10.4s\n" - "ldr s21, [x9, x14]\n" - "fmla v4.4s, v29.4s, v7.4s\n" - "ldr s23, [x20, x13]\n" - "str s2, [x22]\n" - "fmla v1.4s, v19.4s, v11.4s\n" - "fmla v3.4s, v29.4s, v9.4s\n" - "ldr s24, [x19, x14]\n" - "str s4, [x21, %[output_col_stride1]]\n" - "fmla v0.4s, v29.4s, v6.4s\n" - "fmla v1.4s, v29.4s, v13.4s\n" - "ldr s25, [x20, x14]\n" - "fmla v3.4s, v27.4s, v11.4s\n" - "add x9, x9, #4\n" - "fmla v5.4s, v28.4s, v7.4s\n" - "add x19, x19, #4\n" - "fmla v1.4s, v26.4s, v8.4s\n" - "add x20, x20, #4\n" - "fmla v3.4s, v28.4s, v13.4s\n" - "fmla v0.4s, v20.4s, v12.4s\n" - "str s5, [%[outptr0], x23]\n" - "fmla v1.4s, v20.4s, v10.4s\n" - "fmla v3.4s, v17.4s, v8.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v0.4s, v17.4s, v14.4s\n" - "fmla v1.4s, v16.4s, v7.4s\n" - "fmla v3.4s, v18.4s, v10.4s\n" - "fmla v0.4s, v16.4s, v9.4s\n" - "str s1, [x22, %[output_col_stride1]]\n" - "fmla v3.4s, v21.4s, v7.4s\n" - "fmla v0.4s, v15.4s, v11.4s\n" - "str s3, [x21, x23]\n" - "fmla v0.4s, v21.4s, v13.4s\n" - "add x21, x21, #4\n" - "fmla v0.4s, v23.4s, v8.4s\n" - "fmla v0.4s, v24.4s, v10.4s\n" - "fmla v0.4s, v25.4s, v7.4s\n" - "str s0, [x22, x23]\n" - "add x22, x22, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr), [inptr0] "+r" (input), [outptr0] "+r" (output) - : [n_channels] "r" ((long long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x13", "x14", "memory" - ); -} -#endif // __aarch64__ - -template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>; - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp deleted file mode 100644 index 89d1f2238b..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp +++ /dev/null @@ -1,6018 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "impl_fp32_fp32.hpp" - -namespace depthwise -{ - -using namespace neon_convolution_kernels; -using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>; - -#ifdef __aarch64__ -template <> -template <> -void Conv::execute_tile<ActivationFunction::None>( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x8, %[inptr0], %[input_row_stride]\n" - "add x15, %[input_col_stride1], %[input_col_stride1]\n" - "add x23, %[outptr0], %[output_row_stride]\n" - "add x9, x8, %[input_row_stride]\n" - "add x16, x15, #64\n" - "add x17, x15, %[input_col_stride1]\n" - "add x10, x9, %[input_row_stride]\n" - "add x7, x17, #64\n" - "add x19, x17, %[input_col_stride1]\n" - "add x11, x10, %[input_row_stride]\n" - "add x20, x19, #64\n" - "add x21, x19, %[input_col_stride1]\n" - "add x12, x11, %[input_row_stride]\n" - "add x22, x21, #64\n" - "add x24, x23, %[output_row_stride]\n" - "add x25, x24, %[output_row_stride]\n" - "add x26, %[output_col_stride1], %[output_col_stride1]\n" - "and x13, %[n_channels], #3\n" - "add x27, x26, %[output_col_stride1]\n" - "lsr x14, %[n_channels], #2\n" - "cbz x14, 4f\n" - "1:\n" - "ldr q14, [%[wbptr]]\n" - "subs x14, x14, #1\n" - "mov v17.16b, v14.16b\n" - "ldr q12, [%[wbptr], #16]\n" - "mov v23.16b, v14.16b\n" - "ldr q11, [%[wbptr], #32]\n" - "mov v24.16b, v14.16b\n" - "ldr q10, [%[wbptr], #48]\n" - "mov v20.16b, v14.16b\n" - "ldr q9, [%[wbptr], #64]\n" - "mov v16.16b, v14.16b\n" - "ldr q8, [%[wbptr], #80]\n" - "mov v13.16b, v14.16b\n" - "ldr q7, [%[wbptr], #96]\n" - "mov v0.16b, v14.16b\n" - "ldr q6, [%[wbptr], #112]\n" - "mov v1.16b, v14.16b\n" - "ldr q5, [%[wbptr], #128]\n" - "mov v2.16b, v14.16b\n" - "ldr q4, [%[wbptr], #144]\n" - "mov v3.16b, v14.16b\n" - "ldr q29, [%[inptr0]]\n" - "fmla v17.4s, v29.4s, v12.4s\n" - "ldr q28, [x8]\n" - "ldr q30, [%[inptr0], %[input_col_stride1]]\n" - "ldr q25, [x9]\n" - "ldr q26, [x8, %[input_col_stride1]]\n" - "ldr q27, [%[inptr0], x15]\n" - "ldr q15, [x10]\n" - "ldr q18, [x9, %[input_col_stride1]]\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "prfm pldl1keep, [x8, #64]\n" - "prfm pldl1keep, [%[inptr0], x28]\n" - "prfm pldl1keep, [x9, #64]\n" - "prfm pldl1keep, [x8, x28]\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "prfm pldl1keep, [x10, #64]\n" - "prfm pldl1keep, [x9, x28]\n" - "beq 3f\n" - "2:\n" - "fmla v17.4s, v28.4s, v9.4s\n" - "prfm pldl1keep, [x8, x16]\n" - "fmla v23.4s, v28.4s, v12.4s\n" - "ldr q22, [x8, x15]\n" - "fmla v24.4s, v30.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], x7]\n" - "fmla v17.4s, v30.4s, v11.4s\n" - "ldr q29, [%[inptr0], x17]\n" - "fmla v23.4s, v25.4s, v9.4s\n" - "prfm pldl1keep, [x11, #64]\n" - "fmla v20.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [x10, x28]\n" - "fmla v17.4s, v25.4s, v6.4s\n" - "ldr q25, [x11]\n" - "fmla v23.4s, v26.4s, v11.4s\n" - "prfm pldl1keep, [x9, x16]\n" - "fmla v24.4s, v26.4s, v9.4s\n" - "prfm pldl1keep, [x8, x7]\n" - "fmla v17.4s, v26.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x20]\n" - "fmla v16.4s, v26.4s, v12.4s\n" - "ldr q28, [x10, %[input_col_stride1]]\n" - "fmla v24.4s, v27.4s, v11.4s\n" - "prfm pldl1keep, [x12, #64]\n" - "fmla v17.4s, v27.4s, v10.4s\n" - "prfm pldl1keep, [x11, x28]\n" - "fmla v13.4s, v27.4s, v12.4s\n" - "ldr q19, [x9, x15]\n" - "fmla v23.4s, v15.4s, v6.4s\n" - "prfm pldl1keep, [x10, x16]\n" - "fmla v20.4s, v15.4s, v9.4s\n" - "prfm pldl1keep, [x9, x7]\n" - "fmla v0.4s, v15.4s, v12.4s\n" - "ldr q21, [x8, x17]\n" - "fmla v17.4s, v18.4s, v5.4s\n" - "prfm pldl1keep, [x8, x20]\n" - "fmla v23.4s, v18.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x22]\n" - "fmla v24.4s, v18.4s, v6.4s\n" - "prfm pldl1keep, [x12, x28]\n" - "fmla v20.4s, v18.4s, v11.4s\n" - "prfm pldl1keep, [x11, x16]\n" - "fmla v16.4s, v18.4s, v9.4s\n" - "prfm pldl1keep, [x10, x7]\n" - "fmla v1.4s, v18.4s, v12.4s\n" - "ldr q27, [%[inptr0], x19]\n" - "fmla v17.4s, v22.4s, v7.4s\n" - "prfm pldl1keep, [x9, x20]\n" - "fmla v23.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [x8, x22]\n" - "fmla v24.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [x12, x16]\n" - "fmla v16.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x11, x7]\n" - "fmla v13.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [x10, x20]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "ldr q18, [x12]\n" - "fmla v24.4s, v29.4s, v10.4s\n" - "prfm pldl1keep, [x9, x22]\n" - "fmla v13.4s, v29.4s, v11.4s\n" - "prfm pldl1keep, [x12, x7]\n" - "fmla v3.4s, v29.4s, v12.4s\n" - "ldr q22, [x11, %[input_col_stride1]]\n" - "fmla v20.4s, v25.4s, v6.4s\n" - "prfm pldl1keep, [x11, x20]\n" - "fmla v0.4s, v25.4s, v9.4s\n" - "ldr q25, [x10, x15]\n" - "fmla v23.4s, v28.4s, v5.4s\n" - "prfm pldl1keep, [x10, x22]\n" - "fmla v20.4s, v28.4s, v8.4s\n" - "prfm pldl1keep, [x12, x20]\n" - "fmla v16.4s, v28.4s, v6.4s\n" - "prfm pldl1keep, [x11, x22]\n" - "fmla v0.4s, v28.4s, v11.4s\n" - "prfm pldl1keep, [x12, x22]\n" - "fmla v1.4s, v28.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v17.4s, v19.4s, v4.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v23.4s, v19.4s, v7.4s\n" - "subs x14, x14, #1\n" - "fmla v24.4s, v19.4s, v5.4s\n" - "fmla v20.4s, v19.4s, v10.4s\n" - "str q17, [%[outptr0]]\n" - "mov v15.16b, v14.16b\n" - "fmla v16.4s, v19.4s, v8.4s\n" - "fmla v13.4s, v19.4s, v6.4s\n" - "fmla v15.4s, v28.4s, v12.4s\n" - "ldr q29, [x9, x17]\n" - "fmla v1.4s, v19.4s, v11.4s\n" - "fmla v2.4s, v19.4s, v9.4s\n" - "fmla v24.4s, v21.4s, v7.4s\n" - "fmla v16.4s, v21.4s, v10.4s\n" - "fmla v13.4s, v21.4s, v8.4s\n" - "fmla v3.4s, v21.4s, v9.4s\n" - "fmla v2.4s, v21.4s, v11.4s\n" - "fmla v0.4s, v18.4s, v6.4s\n" - "mov v18.16b, v14.16b\n" - "fmla v20.4s, v22.4s, v5.4s\n" - "fmla v13.4s, v27.4s, v10.4s\n" - "fmla v3.4s, v27.4s, v11.4s\n" - "mov v17.16b, v14.16b\n" - "fmla v18.4s, v19.4s, v12.4s\n" - "mov v19.16b, v14.16b\n" - "fmla v0.4s, v22.4s, v8.4s\n" - "fmla v17.4s, v21.4s, v12.4s\n" - "ldr q26, [x8, x19]\n" - "fmla v1.4s, v22.4s, v6.4s\n" - "fmla v15.4s, v22.4s, v9.4s\n" - "mov v22.16b, v14.16b\n" - "mov v21.16b, v14.16b\n" - "fmla v23.4s, v25.4s, v4.4s\n" - "fmla v20.4s, v25.4s, v7.4s\n" - "fmla v16.4s, v25.4s, v5.4s\n" - "fmla v0.4s, v25.4s, v10.4s\n" - "fmla v1.4s, v25.4s, v8.4s\n" - "fmla v2.4s, v25.4s, v6.4s\n" - "str q23, [x23]\n" - "fmla v15.4s, v25.4s, v11.4s\n" - "fmla v18.4s, v25.4s, v9.4s\n" - "ldr q28, [%[inptr0], x21]\n" - "fmla v19.4s, v25.4s, v12.4s\n" - "ldr q30, [x12, %[input_col_stride1]]\n" - "fmla v24.4s, v29.4s, v4.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v16.4s, v29.4s, v7.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "fmla v13.4s, v29.4s, v5.4s\n" - "prfm pldl1keep, [%[inptr0], x28]\n" - "str q24, [%[outptr0], %[output_col_stride1]]\n" - "fmla v1.4s, v29.4s, v10.4s\n" - "fmla v2.4s, v29.4s, v8.4s\n" - "ldr q27, [x11, x15]\n" - "fmla v3.4s, v29.4s, v6.4s\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "fmla v18.4s, v29.4s, v11.4s\n" - "fmla v17.4s, v29.4s, v9.4s\n" - "fmla v22.4s, v29.4s, v12.4s\n" - "ldr q23, [x10, x17]\n" - "fmla v13.4s, v26.4s, v7.4s\n" - "fmla v2.4s, v26.4s, v10.4s\n" - "fmla v3.4s, v26.4s, v8.4s\n" - "fmla v17.4s, v26.4s, v11.4s\n" - "fmla v0.4s, v30.4s, v5.4s\n" - "ldr q24, [x9, x19]\n" - "fmla v15.4s, v30.4s, v6.4s\n" - "ldr q29, [x8, x21]\n" - "fmla v3.4s, v28.4s, v10.4s\n" - "ldr q14, [x12, x15]\n" - "fmla v20.4s, v27.4s, v4.4s\n" - "add x8, x8, #16\n" - "fmla v0.4s, v27.4s, v7.4s\n" - "prfm pldl1keep, [x8, #64]\n" - "fmla v1.4s, v27.4s, v5.4s\n" - "prfm pldl1keep, [x8, x28]\n" - "str q20, [x24]\n" - "fmla v15.4s, v27.4s, v8.4s\n" - "fmla v18.4s, v27.4s, v6.4s\n" - "ldr q25, [x11, x17]\n" - "fmla v19.4s, v27.4s, v9.4s\n" - "ldr q30, [x10, x19]\n" - "fmla v16.4s, v23.4s, v4.4s\n" - "fmla v1.4s, v23.4s, v7.4s\n" - "fmla v2.4s, v23.4s, v5.4s\n" - "fmla v15.4s, v23.4s, v10.4s\n" - "fmla v18.4s, v23.4s, v8.4s\n" - "fmla v17.4s, v23.4s, v6.4s\n" - "str q16, [x23, %[output_col_stride1]]\n" - "fmla v19.4s, v23.4s, v11.4s\n" - "fmla v22.4s, v23.4s, v9.4s\n" - "ldr q26, [x9, x21]\n" - "fmla v21.4s, v23.4s, v12.4s\n" - "ldr q27, [x12, x17]\n" - "fmla v13.4s, v24.4s, v4.4s\n" - "ldr q20, [x11, x19]\n" - "fmla v2.4s, v24.4s, v7.4s\n" - "add x9, x9, #16\n" - "fmla v3.4s, v24.4s, v5.4s\n" - "prfm pldl1keep, [x9, #64]\n" - "str q13, [%[outptr0], x26]\n" - "fmla v18.4s, v24.4s, v10.4s\n" - "fmla v17.4s, v24.4s, v8.4s\n" - "ldr q23, [x10, x21]\n" - "fmla v22.4s, v24.4s, v11.4s\n" - "ldr q24, [x12, x19]\n" - "fmla v3.4s, v29.4s, v7.4s\n" - "prfm pldl1keep, [x9, x28]\n" - "fmla v17.4s, v29.4s, v10.4s\n" - "ldr q16, [x11, x21]\n" - "fmla v0.4s, v14.4s, v4.4s\n" - "add x10, x10, #16\n" - "fmla v15.4s, v14.4s, v5.4s\n" - "prfm pldl1keep, [x10, #64]\n" - "fmla v19.4s, v14.4s, v6.4s\n" - "ldr q13, [x12, x21]\n" - "str q0, [x25]\n" - "fmla v1.4s, v25.4s, v4.4s\n" - "fmla v15.4s, v25.4s, v7.4s\n" - "ldr q14, [%[wbptr]]\n" - "fmla v18.4s, v25.4s, v5.4s\n" - "add x11, x11, #16\n" - "str q1, [x24, %[output_col_stride1]]\n" - "fmla v19.4s, v25.4s, v8.4s\n" - "fmla v22.4s, v25.4s, v6.4s\n" - "ldr q12, [%[wbptr], #16]\n" - "fmla v21.4s, v25.4s, v9.4s\n" - "ldr q29, [%[inptr0]]\n" - "fmla v2.4s, v30.4s, v4.4s\n" - "ldr q28, [x8]\n" - "fmla v18.4s, v30.4s, v7.4s\n" - "add x12, x12, #16\n" - "fmla v17.4s, v30.4s, v5.4s\n" - "fmla v19.4s, v30.4s, v10.4s\n" - "str q2, [x23, x26]\n" - "fmla v22.4s, v30.4s, v8.4s\n" - "fmla v21.4s, v30.4s, v11.4s\n" - "ldr q9, [%[wbptr], #64]\n" - "fmla v3.4s, v26.4s, v4.4s\n" - "ldr q30, [%[inptr0], %[input_col_stride1]]\n" - "fmla v17.4s, v26.4s, v7.4s\n" - "ldr q25, [x9]\n" - "fmla v22.4s, v26.4s, v10.4s\n" - "ldr q11, [%[wbptr], #32]\n" - "str q3, [%[outptr0], x27]\n" - "fmla v15.4s, v27.4s, v4.4s\n" - "fmla v19.4s, v27.4s, v5.4s\n" - "ldr q26, [x8, %[input_col_stride1]]\n" - "fmla v21.4s, v27.4s, v6.4s\n" - "ldr q27, [%[inptr0], x15]\n" - "str q15, [x25, %[output_col_stride1]]\n" - "fmla v18.4s, v20.4s, v4.4s\n" - "fmla v19.4s, v20.4s, v7.4s\n" - "ldr q15, [x10]\n" - "fmla v22.4s, v20.4s, v5.4s\n" - "ldr q6, [%[wbptr], #112]\n" - "str q18, [x24, x26]\n" - "fmla v21.4s, v20.4s, v8.4s\n" - "fmla v17.4s, v23.4s, v4.4s\n" - "ldr q18, [x9, %[input_col_stride1]]\n" - "fmla v22.4s, v23.4s, v7.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v21.4s, v23.4s, v10.4s\n" - "ldr q8, [%[wbptr], #80]\n" - "str q17, [x23, x27]\n" - "fmla v19.4s, v24.4s, v4.4s\n" - "fmla v22.4s, v16.4s, v4.4s\n" - "add x23, x23, #16\n" - "fmla v21.4s, v24.4s, v5.4s\n" - "ldr q10, [%[wbptr], #48]\n" - "str q19, [x25, x26]\n" - "mov v17.16b, v14.16b\n" - "str q22, [x24, x27]\n" - "mov v23.16b, v14.16b\n" - "fmla v21.4s, v16.4s, v7.4s\n" - "ldr q5, [%[wbptr], #128]\n" - "mov v24.16b, v14.16b\n" - "add x24, x24, #16\n" - "mov v20.16b, v14.16b\n" - "mov v16.16b, v14.16b\n" - "fmla v21.4s, v13.4s, v4.4s\n" - "ldr q7, [%[wbptr], #96]\n" - "mov v13.16b, v14.16b\n" - "mov v0.16b, v14.16b\n" - "mov v1.16b, v14.16b\n" - "mov v2.16b, v14.16b\n" - "str q21, [x25, x27]\n" - "mov v3.16b, v14.16b\n" - "ldr q4, [%[wbptr], #144]\n" - "add x25, x25, #16\n" - "fmla v17.4s, v29.4s, v12.4s\n" - "bne 2b\n" - "3:\n" - "fmla v17.4s, v28.4s, v9.4s\n" - "prfm pldl1keep, [x8, x16]\n" - "fmla v23.4s, v28.4s, v12.4s\n" - "ldr q22, [x8, x15]\n" - "fmla v24.4s, v30.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], x7]\n" - "fmla v17.4s, v30.4s, v11.4s\n" - "ldr q29, [%[inptr0], x17]\n" - "fmla v23.4s, v25.4s, v9.4s\n" - "prfm pldl1keep, [x11, #64]\n" - "fmla v20.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [x10, x28]\n" - "fmla v17.4s, v25.4s, v6.4s\n" - "ldr q25, [x11]\n" - "fmla v23.4s, v26.4s, v11.4s\n" - "prfm pldl1keep, [x9, x16]\n" - "fmla v24.4s, v26.4s, v9.4s\n" - "prfm pldl1keep, [x8, x7]\n" - "fmla v17.4s, v26.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x20]\n" - "fmla v16.4s, v26.4s, v12.4s\n" - "ldr q28, [x10, %[input_col_stride1]]\n" - "fmla v24.4s, v27.4s, v11.4s\n" - "prfm pldl1keep, [x12, #64]\n" - "fmla v17.4s, v27.4s, v10.4s\n" - "prfm pldl1keep, [x11, x28]\n" - "fmla v13.4s, v27.4s, v12.4s\n" - "ldr q19, [x9, x15]\n" - "fmla v23.4s, v15.4s, v6.4s\n" - "prfm pldl1keep, [x10, x16]\n" - "fmla v20.4s, v15.4s, v9.4s\n" - "prfm pldl1keep, [x9, x7]\n" - "fmla v0.4s, v15.4s, v12.4s\n" - "ldr q21, [x8, x17]\n" - "fmla v17.4s, v18.4s, v5.4s\n" - "prfm pldl1keep, [x8, x20]\n" - "fmla v23.4s, v18.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x22]\n" - "fmla v24.4s, v18.4s, v6.4s\n" - "prfm pldl1keep, [x12, x28]\n" - "fmla v20.4s, v18.4s, v11.4s\n" - "prfm pldl1keep, [x11, x16]\n" - "fmla v16.4s, v18.4s, v9.4s\n" - "prfm pldl1keep, [x10, x7]\n" - "fmla v1.4s, v18.4s, v12.4s\n" - "ldr q27, [%[inptr0], x19]\n" - "fmla v17.4s, v22.4s, v7.4s\n" - "prfm pldl1keep, [x9, x20]\n" - "fmla v23.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [x8, x22]\n" - "fmla v24.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [x12, x16]\n" - "fmla v16.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x11, x7]\n" - "fmla v13.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [x10, x20]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "ldr q18, [x12]\n" - "fmla v24.4s, v29.4s, v10.4s\n" - "prfm pldl1keep, [x9, x22]\n" - "fmla v13.4s, v29.4s, v11.4s\n" - "prfm pldl1keep, [x12, x7]\n" - "fmla v3.4s, v29.4s, v12.4s\n" - "ldr q22, [x11, %[input_col_stride1]]\n" - "fmla v20.4s, v25.4s, v6.4s\n" - "prfm pldl1keep, [x11, x20]\n" - "fmla v0.4s, v25.4s, v9.4s\n" - "ldr q25, [x10, x15]\n" - "fmla v23.4s, v28.4s, v5.4s\n" - "prfm pldl1keep, [x10, x22]\n" - "fmla v20.4s, v28.4s, v8.4s\n" - "prfm pldl1keep, [x12, x20]\n" - "fmla v16.4s, v28.4s, v6.4s\n" - "prfm pldl1keep, [x11, x22]\n" - "fmla v0.4s, v28.4s, v11.4s\n" - "prfm pldl1keep, [x12, x22]\n" - "fmla v1.4s, v28.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v17.4s, v19.4s, v4.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v23.4s, v19.4s, v7.4s\n" - "fmla v24.4s, v19.4s, v5.4s\n" - "fmla v20.4s, v19.4s, v10.4s\n" - "fmla v16.4s, v19.4s, v8.4s\n" - "str q17, [%[outptr0]]\n" - "mov v15.16b, v14.16b\n" - "fmla v13.4s, v19.4s, v6.4s\n" - "fmla v1.4s, v19.4s, v11.4s\n" - "fmla v15.4s, v28.4s, v12.4s\n" - "ldr q29, [x9, x17]\n" - "fmla v2.4s, v19.4s, v9.4s\n" - "fmla v24.4s, v21.4s, v7.4s\n" - "fmla v16.4s, v21.4s, v10.4s\n" - "fmla v13.4s, v21.4s, v8.4s\n" - "fmla v3.4s, v21.4s, v9.4s\n" - "fmla v0.4s, v18.4s, v6.4s\n" - "mov v18.16b, v14.16b\n" - "fmla v2.4s, v21.4s, v11.4s\n" - "fmla v13.4s, v27.4s, v10.4s\n" - "fmla v20.4s, v22.4s, v5.4s\n" - "fmla v18.4s, v19.4s, v12.4s\n" - "ldr q26, [x8, x19]\n" - "fmla v3.4s, v27.4s, v11.4s\n" - "ldr q28, [%[inptr0], x21]\n" - "fmla v0.4s, v22.4s, v8.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v1.4s, v22.4s, v6.4s\n" - "fmla v15.4s, v22.4s, v9.4s\n" - "mov v17.16b, v14.16b\n" - "fmla v23.4s, v25.4s, v4.4s\n" - "fmla v20.4s, v25.4s, v7.4s\n" - "fmla v16.4s, v25.4s, v5.4s\n" - "fmla v17.4s, v21.4s, v12.4s\n" - "ldr q30, [x12, %[input_col_stride1]]\n" - "str q23, [x23]\n" - "mov v19.16b, v14.16b\n" - "fmla v0.4s, v25.4s, v10.4s\n" - "fmla v1.4s, v25.4s, v8.4s\n" - "fmla v2.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v25.4s, v11.4s\n" - "fmla v18.4s, v25.4s, v9.4s\n" - "fmla v19.4s, v25.4s, v12.4s\n" - "mov v22.16b, v14.16b\n" - "mov v21.16b, v14.16b\n" - "fmla v24.4s, v29.4s, v4.4s\n" - "fmla v16.4s, v29.4s, v7.4s\n" - "fmla v13.4s, v29.4s, v5.4s\n" - "fmla v1.4s, v29.4s, v10.4s\n" - "fmla v2.4s, v29.4s, v8.4s\n" - "fmla v3.4s, v29.4s, v6.4s\n" - "str q24, [%[outptr0], %[output_col_stride1]]\n" - "fmla v18.4s, v29.4s, v11.4s\n" - "fmla v17.4s, v29.4s, v9.4s\n" - "ldr q27, [x11, x15]\n" - "fmla v22.4s, v29.4s, v12.4s\n" - "ldr q23, [x10, x17]\n" - "fmla v13.4s, v26.4s, v7.4s\n" - "fmla v2.4s, v26.4s, v10.4s\n" - "fmla v3.4s, v26.4s, v8.4s\n" - "fmla v17.4s, v26.4s, v11.4s\n" - "fmla v0.4s, v30.4s, v5.4s\n" - "ldr q24, [x9, x19]\n" - "fmla v15.4s, v30.4s, v6.4s\n" - "ldr q29, [x8, x21]\n" - "fmla v3.4s, v28.4s, v10.4s\n" - "ldr q14, [x12, x15]\n" - "fmla v20.4s, v27.4s, v4.4s\n" - "add x8, x8, #16\n" - "fmla v0.4s, v27.4s, v7.4s\n" - "fmla v1.4s, v27.4s, v5.4s\n" - "fmla v15.4s, v27.4s, v8.4s\n" - "fmla v18.4s, v27.4s, v6.4s\n" - "str q20, [x24]\n" - "fmla v19.4s, v27.4s, v9.4s\n" - "fmla v16.4s, v23.4s, v4.4s\n" - "ldr q25, [x11, x17]\n" - "fmla v1.4s, v23.4s, v7.4s\n" - "ldr q30, [x10, x19]\n" - "fmla v2.4s, v23.4s, v5.4s\n" - "fmla v15.4s, v23.4s, v10.4s\n" - "str q16, [x23, %[output_col_stride1]]\n" - "fmla v18.4s, v23.4s, v8.4s\n" - "fmla v17.4s, v23.4s, v6.4s\n" - "ldr q26, [x9, x21]\n" - "fmla v19.4s, v23.4s, v11.4s\n" - "add x9, x9, #16\n" - "fmla v22.4s, v23.4s, v9.4s\n" - "fmla v21.4s, v23.4s, v12.4s\n" - "fmla v13.4s, v24.4s, v4.4s\n" - "ldr q27, [x12, x17]\n" - "fmla v2.4s, v24.4s, v7.4s\n" - "ldr q20, [x11, x19]\n" - "fmla v3.4s, v24.4s, v5.4s\n" - "fmla v18.4s, v24.4s, v10.4s\n" - "str q13, [%[outptr0], x26]\n" - "fmla v17.4s, v24.4s, v8.4s\n" - "fmla v22.4s, v24.4s, v11.4s\n" - "ldr q23, [x10, x21]\n" - "fmla v3.4s, v29.4s, v7.4s\n" - "ldr q24, [x12, x19]\n" - "fmla v17.4s, v29.4s, v10.4s\n" - "ldr q16, [x11, x21]\n" - "fmla v0.4s, v14.4s, v4.4s\n" - "add x10, x10, #16\n" - "fmla v15.4s, v14.4s, v5.4s\n" - "add x11, x11, #16\n" - "fmla v19.4s, v14.4s, v6.4s\n" - "ldr q13, [x12, x21]\n" - "str q0, [x25]\n" - "fmla v1.4s, v25.4s, v4.4s\n" - "fmla v15.4s, v25.4s, v7.4s\n" - "add x12, x12, #16\n" - "fmla v18.4s, v25.4s, v5.4s\n" - "fmla v19.4s, v25.4s, v8.4s\n" - "str q1, [x24, %[output_col_stride1]]\n" - "fmla v22.4s, v25.4s, v6.4s\n" - "fmla v21.4s, v25.4s, v9.4s\n" - "fmla v2.4s, v30.4s, v4.4s\n" - "fmla v18.4s, v30.4s, v7.4s\n" - "fmla v17.4s, v30.4s, v5.4s\n" - "fmla v19.4s, v30.4s, v10.4s\n" - "fmla v22.4s, v30.4s, v8.4s\n" - "str q2, [x23, x26]\n" - "fmla v21.4s, v30.4s, v11.4s\n" - "fmla v3.4s, v26.4s, v4.4s\n" - "fmla v17.4s, v26.4s, v7.4s\n" - "fmla v22.4s, v26.4s, v10.4s\n" - "fmla v15.4s, v27.4s, v4.4s\n" - "fmla v19.4s, v27.4s, v5.4s\n" - "fmla v21.4s, v27.4s, v6.4s\n" - "str q3, [%[outptr0], x27]\n" - "fmla v18.4s, v20.4s, v4.4s\n" - "str q15, [x25, %[output_col_stride1]]\n" - "fmla v22.4s, v20.4s, v5.4s\n" - "fmla v19.4s, v20.4s, v7.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "str q18, [x24, x26]\n" - "fmla v21.4s, v20.4s, v8.4s\n" - "fmla v17.4s, v23.4s, v4.4s\n" - "fmla v22.4s, v23.4s, v7.4s\n" - "fmla v19.4s, v24.4s, v4.4s\n" - "fmla v21.4s, v23.4s, v10.4s\n" - "str q17, [x23, x27]\n" - "fmla v22.4s, v16.4s, v4.4s\n" - "str q19, [x25, x26]\n" - "add x23, x23, #16\n" - "fmla v21.4s, v24.4s, v5.4s\n" - "str q22, [x24, x27]\n" - "add x24, x24, #16\n" - "fmla v21.4s, v16.4s, v7.4s\n" - "fmla v21.4s, v13.4s, v4.4s\n" - "str q21, [x25, x27]\n" - "add x25, x25, #16\n" - "4:\n" - "cbz x13, 7f\n" - "ldr s14, [%[wbptr]]\n" - "mov v17.16b, v14.16b\n" - "ldr s12, [%[wbptr], #4]\n" - "mov v23.16b, v14.16b\n" - "ldr s11, [%[wbptr], #8]\n" - "mov v24.16b, v14.16b\n" - "ldr s10, [%[wbptr], #12]\n" - "mov v20.16b, v14.16b\n" - "ldr s9, [%[wbptr], #16]\n" - "mov v16.16b, v14.16b\n" - "ldr s8, [%[wbptr], #20]\n" - "mov v13.16b, v14.16b\n" - "ldr s7, [%[wbptr], #24]\n" - "mov v0.16b, v14.16b\n" - "ldr s6, [%[wbptr], #28]\n" - "mov v1.16b, v14.16b\n" - "ldr s5, [%[wbptr], #32]\n" - "mov v2.16b, v14.16b\n" - "ldr s4, [%[wbptr], #36]\n" - "mov v3.16b, v14.16b\n" - "ldr s29, [%[inptr0]]\n" - "fmla v17.4s, v29.4s, v12.4s\n" - "ldr s28, [x8]\n" - "ldr s30, [%[inptr0], %[input_col_stride1]]\n" - "subs x13, x13, #1\n" - "ldr s25, [x9]\n" - "ldr s26, [x8, %[input_col_stride1]]\n" - "ldr s27, [%[inptr0], x15]\n" - "ldr s15, [x10]\n" - "ldr s18, [x9, %[input_col_stride1]]\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "prfm pldl1keep, [x8, #64]\n" - "prfm pldl1keep, [%[inptr0], x28]\n" - "prfm pldl1keep, [x9, #64]\n" - "prfm pldl1keep, [x8, x28]\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "prfm pldl1keep, [x10, #64]\n" - "prfm pldl1keep, [x9, x28]\n" - "beq 6f\n" - "5:\n" - "fmla v17.4s, v28.4s, v9.4s\n" - "prfm pldl1keep, [x8, x16]\n" - "fmla v23.4s, v28.4s, v12.4s\n" - "ldr s22, [x8, x15]\n" - "fmla v24.4s, v30.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], x7]\n" - "fmla v17.4s, v30.4s, v11.4s\n" - "ldr s29, [%[inptr0], x17]\n" - "fmla v23.4s, v25.4s, v9.4s\n" - "prfm pldl1keep, [x11, #64]\n" - "fmla v20.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [x10, x28]\n" - "fmla v17.4s, v25.4s, v6.4s\n" - "ldr s25, [x11]\n" - "fmla v23.4s, v26.4s, v11.4s\n" - "prfm pldl1keep, [x9, x16]\n" - "fmla v24.4s, v26.4s, v9.4s\n" - "prfm pldl1keep, [x8, x7]\n" - "fmla v17.4s, v26.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x20]\n" - "fmla v16.4s, v26.4s, v12.4s\n" - "ldr s28, [x10, %[input_col_stride1]]\n" - "fmla v24.4s, v27.4s, v11.4s\n" - "prfm pldl1keep, [x12, #64]\n" - "fmla v17.4s, v27.4s, v10.4s\n" - "prfm pldl1keep, [x11, x28]\n" - "fmla v13.4s, v27.4s, v12.4s\n" - "ldr s19, [x9, x15]\n" - "fmla v23.4s, v15.4s, v6.4s\n" - "prfm pldl1keep, [x10, x16]\n" - "fmla v20.4s, v15.4s, v9.4s\n" - "prfm pldl1keep, [x9, x7]\n" - "fmla v0.4s, v15.4s, v12.4s\n" - "ldr s21, [x8, x17]\n" - "fmla v17.4s, v18.4s, v5.4s\n" - "prfm pldl1keep, [x8, x20]\n" - "fmla v23.4s, v18.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x22]\n" - "fmla v24.4s, v18.4s, v6.4s\n" - "prfm pldl1keep, [x12, x28]\n" - "fmla v20.4s, v18.4s, v11.4s\n" - "prfm pldl1keep, [x11, x16]\n" - "fmla v16.4s, v18.4s, v9.4s\n" - "prfm pldl1keep, [x10, x7]\n" - "fmla v1.4s, v18.4s, v12.4s\n" - "ldr s27, [%[inptr0], x19]\n" - "fmla v17.4s, v22.4s, v7.4s\n" - "prfm pldl1keep, [x9, x20]\n" - "fmla v23.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [x8, x22]\n" - "fmla v24.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [x12, x16]\n" - "fmla v16.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x11, x7]\n" - "fmla v13.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [x10, x20]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "ldr s18, [x12]\n" - "fmla v24.4s, v29.4s, v10.4s\n" - "prfm pldl1keep, [x9, x22]\n" - "fmla v13.4s, v29.4s, v11.4s\n" - "prfm pldl1keep, [x12, x7]\n" - "fmla v3.4s, v29.4s, v12.4s\n" - "ldr s22, [x11, %[input_col_stride1]]\n" - "fmla v20.4s, v25.4s, v6.4s\n" - "prfm pldl1keep, [x11, x20]\n" - "fmla v0.4s, v25.4s, v9.4s\n" - "ldr s25, [x10, x15]\n" - "fmla v23.4s, v28.4s, v5.4s\n" - "prfm pldl1keep, [x10, x22]\n" - "fmla v20.4s, v28.4s, v8.4s\n" - "prfm pldl1keep, [x12, x20]\n" - "fmla v16.4s, v28.4s, v6.4s\n" - "prfm pldl1keep, [x11, x22]\n" - "fmla v0.4s, v28.4s, v11.4s\n" - "prfm pldl1keep, [x12, x22]\n" - "fmla v1.4s, v28.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v17.4s, v19.4s, v4.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v23.4s, v19.4s, v7.4s\n" - "subs x13, x13, #1\n" - "fmla v24.4s, v19.4s, v5.4s\n" - "fmla v20.4s, v19.4s, v10.4s\n" - "str s17, [%[outptr0]]\n" - "mov v15.16b, v14.16b\n" - "fmla v16.4s, v19.4s, v8.4s\n" - "fmla v13.4s, v19.4s, v6.4s\n" - "fmla v15.4s, v28.4s, v12.4s\n" - "ldr s29, [x9, x17]\n" - "fmla v1.4s, v19.4s, v11.4s\n" - "fmla v2.4s, v19.4s, v9.4s\n" - "fmla v24.4s, v21.4s, v7.4s\n" - "fmla v16.4s, v21.4s, v10.4s\n" - "fmla v13.4s, v21.4s, v8.4s\n" - "fmla v3.4s, v21.4s, v9.4s\n" - "fmla v2.4s, v21.4s, v11.4s\n" - "fmla v0.4s, v18.4s, v6.4s\n" - "mov v18.16b, v14.16b\n" - "fmla v20.4s, v22.4s, v5.4s\n" - "fmla v13.4s, v27.4s, v10.4s\n" - "fmla v3.4s, v27.4s, v11.4s\n" - "mov v17.16b, v14.16b\n" - "fmla v18.4s, v19.4s, v12.4s\n" - "mov v19.16b, v14.16b\n" - "fmla v0.4s, v22.4s, v8.4s\n" - "fmla v17.4s, v21.4s, v12.4s\n" - "ldr s26, [x8, x19]\n" - "fmla v1.4s, v22.4s, v6.4s\n" - "fmla v15.4s, v22.4s, v9.4s\n" - "mov v22.16b, v14.16b\n" - "mov v21.16b, v14.16b\n" - "fmla v23.4s, v25.4s, v4.4s\n" - "fmla v20.4s, v25.4s, v7.4s\n" - "fmla v16.4s, v25.4s, v5.4s\n" - "fmla v0.4s, v25.4s, v10.4s\n" - "fmla v1.4s, v25.4s, v8.4s\n" - "fmla v2.4s, v25.4s, v6.4s\n" - "str s23, [x23]\n" - "fmla v15.4s, v25.4s, v11.4s\n" - "fmla v18.4s, v25.4s, v9.4s\n" - "ldr s28, [%[inptr0], x21]\n" - "fmla v19.4s, v25.4s, v12.4s\n" - "ldr s30, [x12, %[input_col_stride1]]\n" - "fmla v24.4s, v29.4s, v4.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v16.4s, v29.4s, v7.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "fmla v13.4s, v29.4s, v5.4s\n" - "prfm pldl1keep, [%[inptr0], x28]\n" - "str s24, [%[outptr0], %[output_col_stride1]]\n" - "fmla v1.4s, v29.4s, v10.4s\n" - "fmla v2.4s, v29.4s, v8.4s\n" - "ldr s27, [x11, x15]\n" - "fmla v3.4s, v29.4s, v6.4s\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "fmla v18.4s, v29.4s, v11.4s\n" - "fmla v17.4s, v29.4s, v9.4s\n" - "fmla v22.4s, v29.4s, v12.4s\n" - "ldr s23, [x10, x17]\n" - "fmla v13.4s, v26.4s, v7.4s\n" - "fmla v2.4s, v26.4s, v10.4s\n" - "fmla v3.4s, v26.4s, v8.4s\n" - "fmla v17.4s, v26.4s, v11.4s\n" - "fmla v0.4s, v30.4s, v5.4s\n" - "ldr s24, [x9, x19]\n" - "fmla v15.4s, v30.4s, v6.4s\n" - "ldr s29, [x8, x21]\n" - "fmla v3.4s, v28.4s, v10.4s\n" - "ldr s14, [x12, x15]\n" - "fmla v20.4s, v27.4s, v4.4s\n" - "add x8, x8, #4\n" - "fmla v0.4s, v27.4s, v7.4s\n" - "prfm pldl1keep, [x8, #64]\n" - "fmla v1.4s, v27.4s, v5.4s\n" - "prfm pldl1keep, [x8, x28]\n" - "str s20, [x24]\n" - "fmla v15.4s, v27.4s, v8.4s\n" - "fmla v18.4s, v27.4s, v6.4s\n" - "ldr s25, [x11, x17]\n" - "fmla v19.4s, v27.4s, v9.4s\n" - "ldr s30, [x10, x19]\n" - "fmla v16.4s, v23.4s, v4.4s\n" - "fmla v1.4s, v23.4s, v7.4s\n" - "fmla v2.4s, v23.4s, v5.4s\n" - "fmla v15.4s, v23.4s, v10.4s\n" - "fmla v18.4s, v23.4s, v8.4s\n" - "fmla v17.4s, v23.4s, v6.4s\n" - "str s16, [x23, %[output_col_stride1]]\n" - "fmla v19.4s, v23.4s, v11.4s\n" - "fmla v22.4s, v23.4s, v9.4s\n" - "ldr s26, [x9, x21]\n" - "fmla v21.4s, v23.4s, v12.4s\n" - "ldr s27, [x12, x17]\n" - "fmla v13.4s, v24.4s, v4.4s\n" - "ldr s20, [x11, x19]\n" - "fmla v2.4s, v24.4s, v7.4s\n" - "add x9, x9, #4\n" - "fmla v3.4s, v24.4s, v5.4s\n" - "prfm pldl1keep, [x9, #64]\n" - "str s13, [%[outptr0], x26]\n" - "fmla v18.4s, v24.4s, v10.4s\n" - "fmla v17.4s, v24.4s, v8.4s\n" - "ldr s23, [x10, x21]\n" - "fmla v22.4s, v24.4s, v11.4s\n" - "ldr s24, [x12, x19]\n" - "fmla v3.4s, v29.4s, v7.4s\n" - "prfm pldl1keep, [x9, x28]\n" - "fmla v17.4s, v29.4s, v10.4s\n" - "ldr s16, [x11, x21]\n" - "fmla v0.4s, v14.4s, v4.4s\n" - "add x10, x10, #4\n" - "fmla v15.4s, v14.4s, v5.4s\n" - "prfm pldl1keep, [x10, #64]\n" - "fmla v19.4s, v14.4s, v6.4s\n" - "ldr s13, [x12, x21]\n" - "str s0, [x25]\n" - "fmla v1.4s, v25.4s, v4.4s\n" - "fmla v15.4s, v25.4s, v7.4s\n" - "ldr s14, [%[wbptr]]\n" - "fmla v18.4s, v25.4s, v5.4s\n" - "add x11, x11, #4\n" - "str s1, [x24, %[output_col_stride1]]\n" - "fmla v19.4s, v25.4s, v8.4s\n" - "fmla v22.4s, v25.4s, v6.4s\n" - "ldr s12, [%[wbptr], #4]\n" - "fmla v21.4s, v25.4s, v9.4s\n" - "ldr s29, [%[inptr0]]\n" - "fmla v2.4s, v30.4s, v4.4s\n" - "ldr s28, [x8]\n" - "fmla v18.4s, v30.4s, v7.4s\n" - "add x12, x12, #4\n" - "fmla v17.4s, v30.4s, v5.4s\n" - "fmla v19.4s, v30.4s, v10.4s\n" - "str s2, [x23, x26]\n" - "fmla v22.4s, v30.4s, v8.4s\n" - "fmla v21.4s, v30.4s, v11.4s\n" - "ldr s9, [%[wbptr], #16]\n" - "fmla v3.4s, v26.4s, v4.4s\n" - "ldr s30, [%[inptr0], %[input_col_stride1]]\n" - "fmla v17.4s, v26.4s, v7.4s\n" - "ldr s25, [x9]\n" - "fmla v22.4s, v26.4s, v10.4s\n" - "ldr s11, [%[wbptr], #8]\n" - "str s3, [%[outptr0], x27]\n" - "fmla v15.4s, v27.4s, v4.4s\n" - "fmla v19.4s, v27.4s, v5.4s\n" - "ldr s26, [x8, %[input_col_stride1]]\n" - "fmla v21.4s, v27.4s, v6.4s\n" - "ldr s27, [%[inptr0], x15]\n" - "str s15, [x25, %[output_col_stride1]]\n" - "fmla v18.4s, v20.4s, v4.4s\n" - "fmla v19.4s, v20.4s, v7.4s\n" - "ldr s15, [x10]\n" - "fmla v22.4s, v20.4s, v5.4s\n" - "ldr s6, [%[wbptr], #28]\n" - "str s18, [x24, x26]\n" - "fmla v21.4s, v20.4s, v8.4s\n" - "fmla v17.4s, v23.4s, v4.4s\n" - "ldr s18, [x9, %[input_col_stride1]]\n" - "fmla v22.4s, v23.4s, v7.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v21.4s, v23.4s, v10.4s\n" - "ldr s8, [%[wbptr], #20]\n" - "str s17, [x23, x27]\n" - "fmla v19.4s, v24.4s, v4.4s\n" - "fmla v22.4s, v16.4s, v4.4s\n" - "add x23, x23, #4\n" - "fmla v21.4s, v24.4s, v5.4s\n" - "ldr s10, [%[wbptr], #12]\n" - "str s19, [x25, x26]\n" - "mov v17.16b, v14.16b\n" - "str s22, [x24, x27]\n" - "mov v23.16b, v14.16b\n" - "fmla v21.4s, v16.4s, v7.4s\n" - "ldr s5, [%[wbptr], #32]\n" - "mov v24.16b, v14.16b\n" - "add x24, x24, #4\n" - "mov v20.16b, v14.16b\n" - "mov v16.16b, v14.16b\n" - "fmla v21.4s, v13.4s, v4.4s\n" - "ldr s7, [%[wbptr], #24]\n" - "mov v13.16b, v14.16b\n" - "mov v0.16b, v14.16b\n" - "mov v1.16b, v14.16b\n" - "mov v2.16b, v14.16b\n" - "str s21, [x25, x27]\n" - "mov v3.16b, v14.16b\n" - "ldr s4, [%[wbptr], #36]\n" - "add x25, x25, #4\n" - "fmla v17.4s, v29.4s, v12.4s\n" - "bne 5b\n" - "6:\n" - "fmla v17.4s, v28.4s, v9.4s\n" - "prfm pldl1keep, [x8, x16]\n" - "fmla v23.4s, v28.4s, v12.4s\n" - "ldr s22, [x8, x15]\n" - "fmla v24.4s, v30.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], x7]\n" - "fmla v17.4s, v30.4s, v11.4s\n" - "ldr s29, [%[inptr0], x17]\n" - "fmla v23.4s, v25.4s, v9.4s\n" - "prfm pldl1keep, [x11, #64]\n" - "fmla v20.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [x10, x28]\n" - "fmla v17.4s, v25.4s, v6.4s\n" - "ldr s25, [x11]\n" - "fmla v23.4s, v26.4s, v11.4s\n" - "prfm pldl1keep, [x9, x16]\n" - "fmla v24.4s, v26.4s, v9.4s\n" - "prfm pldl1keep, [x8, x7]\n" - "fmla v17.4s, v26.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x20]\n" - "fmla v16.4s, v26.4s, v12.4s\n" - "ldr s28, [x10, %[input_col_stride1]]\n" - "fmla v24.4s, v27.4s, v11.4s\n" - "prfm pldl1keep, [x12, #64]\n" - "fmla v17.4s, v27.4s, v10.4s\n" - "prfm pldl1keep, [x11, x28]\n" - "fmla v13.4s, v27.4s, v12.4s\n" - "ldr s19, [x9, x15]\n" - "fmla v23.4s, v15.4s, v6.4s\n" - "prfm pldl1keep, [x10, x16]\n" - "fmla v20.4s, v15.4s, v9.4s\n" - "prfm pldl1keep, [x9, x7]\n" - "fmla v0.4s, v15.4s, v12.4s\n" - "ldr s21, [x8, x17]\n" - "fmla v17.4s, v18.4s, v5.4s\n" - "prfm pldl1keep, [x8, x20]\n" - "fmla v23.4s, v18.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x22]\n" - "fmla v24.4s, v18.4s, v6.4s\n" - "prfm pldl1keep, [x12, x28]\n" - "fmla v20.4s, v18.4s, v11.4s\n" - "prfm pldl1keep, [x11, x16]\n" - "fmla v16.4s, v18.4s, v9.4s\n" - "prfm pldl1keep, [x10, x7]\n" - "fmla v1.4s, v18.4s, v12.4s\n" - "ldr s27, [%[inptr0], x19]\n" - "fmla v17.4s, v22.4s, v7.4s\n" - "prfm pldl1keep, [x9, x20]\n" - "fmla v23.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [x8, x22]\n" - "fmla v24.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [x12, x16]\n" - "fmla v16.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x11, x7]\n" - "fmla v13.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [x10, x20]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "ldr s18, [x12]\n" - "fmla v24.4s, v29.4s, v10.4s\n" - "prfm pldl1keep, [x9, x22]\n" - "fmla v13.4s, v29.4s, v11.4s\n" - "prfm pldl1keep, [x12, x7]\n" - "fmla v3.4s, v29.4s, v12.4s\n" - "ldr s22, [x11, %[input_col_stride1]]\n" - "fmla v20.4s, v25.4s, v6.4s\n" - "prfm pldl1keep, [x11, x20]\n" - "fmla v0.4s, v25.4s, v9.4s\n" - "ldr s25, [x10, x15]\n" - "fmla v23.4s, v28.4s, v5.4s\n" - "prfm pldl1keep, [x10, x22]\n" - "fmla v20.4s, v28.4s, v8.4s\n" - "prfm pldl1keep, [x12, x20]\n" - "fmla v16.4s, v28.4s, v6.4s\n" - "prfm pldl1keep, [x11, x22]\n" - "fmla v0.4s, v28.4s, v11.4s\n" - "prfm pldl1keep, [x12, x22]\n" - "fmla v1.4s, v28.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v17.4s, v19.4s, v4.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v23.4s, v19.4s, v7.4s\n" - "fmla v24.4s, v19.4s, v5.4s\n" - "fmla v20.4s, v19.4s, v10.4s\n" - "fmla v16.4s, v19.4s, v8.4s\n" - "str s17, [%[outptr0]]\n" - "mov v15.16b, v14.16b\n" - "fmla v13.4s, v19.4s, v6.4s\n" - "fmla v1.4s, v19.4s, v11.4s\n" - "fmla v15.4s, v28.4s, v12.4s\n" - "ldr s29, [x9, x17]\n" - "fmla v2.4s, v19.4s, v9.4s\n" - "fmla v24.4s, v21.4s, v7.4s\n" - "fmla v16.4s, v21.4s, v10.4s\n" - "fmla v13.4s, v21.4s, v8.4s\n" - "fmla v3.4s, v21.4s, v9.4s\n" - "fmla v0.4s, v18.4s, v6.4s\n" - "mov v18.16b, v14.16b\n" - "fmla v2.4s, v21.4s, v11.4s\n" - "fmla v13.4s, v27.4s, v10.4s\n" - "fmla v20.4s, v22.4s, v5.4s\n" - "fmla v18.4s, v19.4s, v12.4s\n" - "ldr s26, [x8, x19]\n" - "fmla v3.4s, v27.4s, v11.4s\n" - "ldr s28, [%[inptr0], x21]\n" - "fmla v0.4s, v22.4s, v8.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v1.4s, v22.4s, v6.4s\n" - "fmla v15.4s, v22.4s, v9.4s\n" - "mov v17.16b, v14.16b\n" - "fmla v23.4s, v25.4s, v4.4s\n" - "fmla v20.4s, v25.4s, v7.4s\n" - "fmla v16.4s, v25.4s, v5.4s\n" - "fmla v17.4s, v21.4s, v12.4s\n" - "ldr s30, [x12, %[input_col_stride1]]\n" - "str s23, [x23]\n" - "mov v19.16b, v14.16b\n" - "fmla v0.4s, v25.4s, v10.4s\n" - "fmla v1.4s, v25.4s, v8.4s\n" - "fmla v2.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v25.4s, v11.4s\n" - "fmla v18.4s, v25.4s, v9.4s\n" - "fmla v19.4s, v25.4s, v12.4s\n" - "mov v22.16b, v14.16b\n" - "mov v21.16b, v14.16b\n" - "fmla v24.4s, v29.4s, v4.4s\n" - "fmla v16.4s, v29.4s, v7.4s\n" - "fmla v13.4s, v29.4s, v5.4s\n" - "fmla v1.4s, v29.4s, v10.4s\n" - "fmla v2.4s, v29.4s, v8.4s\n" - "fmla v3.4s, v29.4s, v6.4s\n" - "str s24, [%[outptr0], %[output_col_stride1]]\n" - "fmla v18.4s, v29.4s, v11.4s\n" - "fmla v17.4s, v29.4s, v9.4s\n" - "ldr s27, [x11, x15]\n" - "fmla v22.4s, v29.4s, v12.4s\n" - "ldr s23, [x10, x17]\n" - "fmla v13.4s, v26.4s, v7.4s\n" - "fmla v2.4s, v26.4s, v10.4s\n" - "fmla v3.4s, v26.4s, v8.4s\n" - "fmla v17.4s, v26.4s, v11.4s\n" - "fmla v0.4s, v30.4s, v5.4s\n" - "ldr s24, [x9, x19]\n" - "fmla v15.4s, v30.4s, v6.4s\n" - "ldr s29, [x8, x21]\n" - "fmla v3.4s, v28.4s, v10.4s\n" - "ldr s14, [x12, x15]\n" - "fmla v20.4s, v27.4s, v4.4s\n" - "add x8, x8, #4\n" - "fmla v0.4s, v27.4s, v7.4s\n" - "fmla v1.4s, v27.4s, v5.4s\n" - "fmla v15.4s, v27.4s, v8.4s\n" - "fmla v18.4s, v27.4s, v6.4s\n" - "str s20, [x24]\n" - "fmla v19.4s, v27.4s, v9.4s\n" - "fmla v16.4s, v23.4s, v4.4s\n" - "ldr s25, [x11, x17]\n" - "fmla v1.4s, v23.4s, v7.4s\n" - "ldr s30, [x10, x19]\n" - "fmla v2.4s, v23.4s, v5.4s\n" - "fmla v15.4s, v23.4s, v10.4s\n" - "str s16, [x23, %[output_col_stride1]]\n" - "fmla v18.4s, v23.4s, v8.4s\n" - "fmla v17.4s, v23.4s, v6.4s\n" - "ldr s26, [x9, x21]\n" - "fmla v19.4s, v23.4s, v11.4s\n" - "add x9, x9, #4\n" - "fmla v22.4s, v23.4s, v9.4s\n" - "fmla v21.4s, v23.4s, v12.4s\n" - "fmla v13.4s, v24.4s, v4.4s\n" - "ldr s27, [x12, x17]\n" - "fmla v2.4s, v24.4s, v7.4s\n" - "ldr s20, [x11, x19]\n" - "fmla v3.4s, v24.4s, v5.4s\n" - "fmla v18.4s, v24.4s, v10.4s\n" - "str s13, [%[outptr0], x26]\n" - "fmla v17.4s, v24.4s, v8.4s\n" - "fmla v22.4s, v24.4s, v11.4s\n" - "ldr s23, [x10, x21]\n" - "fmla v3.4s, v29.4s, v7.4s\n" - "ldr s24, [x12, x19]\n" - "fmla v17.4s, v29.4s, v10.4s\n" - "ldr s16, [x11, x21]\n" - "fmla v0.4s, v14.4s, v4.4s\n" - "add x10, x10, #4\n" - "fmla v15.4s, v14.4s, v5.4s\n" - "add x11, x11, #4\n" - "fmla v19.4s, v14.4s, v6.4s\n" - "ldr s13, [x12, x21]\n" - "str s0, [x25]\n" - "fmla v1.4s, v25.4s, v4.4s\n" - "fmla v15.4s, v25.4s, v7.4s\n" - "add x12, x12, #4\n" - "fmla v18.4s, v25.4s, v5.4s\n" - "fmla v19.4s, v25.4s, v8.4s\n" - "str s1, [x24, %[output_col_stride1]]\n" - "fmla v22.4s, v25.4s, v6.4s\n" - "fmla v21.4s, v25.4s, v9.4s\n" - "fmla v2.4s, v30.4s, v4.4s\n" - "fmla v18.4s, v30.4s, v7.4s\n" - "fmla v17.4s, v30.4s, v5.4s\n" - "fmla v19.4s, v30.4s, v10.4s\n" - "fmla v22.4s, v30.4s, v8.4s\n" - "str s2, [x23, x26]\n" - "fmla v21.4s, v30.4s, v11.4s\n" - "fmla v3.4s, v26.4s, v4.4s\n" - "fmla v17.4s, v26.4s, v7.4s\n" - "fmla v22.4s, v26.4s, v10.4s\n" - "fmla v15.4s, v27.4s, v4.4s\n" - "fmla v19.4s, v27.4s, v5.4s\n" - "fmla v21.4s, v27.4s, v6.4s\n" - "str s3, [%[outptr0], x27]\n" - "fmla v18.4s, v20.4s, v4.4s\n" - "str s15, [x25, %[output_col_stride1]]\n" - "fmla v22.4s, v20.4s, v5.4s\n" - "fmla v19.4s, v20.4s, v7.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "str s18, [x24, x26]\n" - "fmla v21.4s, v20.4s, v8.4s\n" - "fmla v17.4s, v23.4s, v4.4s\n" - "fmla v22.4s, v23.4s, v7.4s\n" - "fmla v19.4s, v24.4s, v4.4s\n" - "fmla v21.4s, v23.4s, v10.4s\n" - "str s17, [x23, x27]\n" - "fmla v22.4s, v16.4s, v4.4s\n" - "str s19, [x25, x26]\n" - "add x23, x23, #4\n" - "fmla v21.4s, v24.4s, v5.4s\n" - "str s22, [x24, x27]\n" - "add x24, x24, #4\n" - "fmla v21.4s, v16.4s, v7.4s\n" - "fmla v21.4s, v13.4s, v4.4s\n" - "str s21, [x25, x27]\n" - "add x25, x25, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input) - : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile<ActivationFunction::None>( - int n_channels, - const void *weight_bias_ptr, - const float *inptrs[6][6], - float *outptrs[4][4] -) -{ - __asm __volatile( - "mov x27, xzr\n" - "mov x28, xzr\n" - "and x15, %[n_channels], #3\n" - "lsr x16, %[n_channels], #2\n" - "cbz x16, 4f\n" - "1:\n" - "ldr q13, [%[wbptr]]\n" - "ldr x17, [%[inptrs], 0]\n" - "mov v18.16b, v13.16b\n" - "ldr q12, [%[wbptr], #16]\n" - "mov v22.16b, v13.16b\n" - "ldr q11, [%[wbptr], #32]\n" - "mov v23.16b, v13.16b\n" - "ldr q10, [%[wbptr], #48]\n" - "mov v19.16b, v13.16b\n" - "ldr q9, [%[wbptr], #64]\n" - "mov v17.16b, v13.16b\n" - "ldr q8, [%[wbptr], #80]\n" - "mov v14.16b, v13.16b\n" - "ldr q7, [%[wbptr], #96]\n" - "mov v0.16b, v13.16b\n" - "ldr q6, [%[wbptr], #112]\n" - "mov v1.16b, v13.16b\n" - "ldr q5, [%[wbptr], #128]\n" - "mov v2.16b, v13.16b\n" - "ldr q4, [%[wbptr], #144]\n" - "ldr q29, [x17, x27]\n" - "ldr x7, [%[inptrs], 48]\n" - "fmla v18.4s, v29.4s, v12.4s\n" - "ldr x17, [%[inptrs], 8]\n" - "ldr q27, [x7, x27]\n" - "ldr x19, [%[inptrs], 96]\n" - "ldr q28, [x17, x27]\n" - "ldr x7, [%[inptrs], 56]\n" - "ldr q25, [x19, x27]\n" - "ldr x17, [%[inptrs], 16]\n" - "ldr q16, [x7, x27]\n" - "ldr x20, [%[inptrs], 144]\n" - "ldr q15, [x17, x27]\n" - "ldr x19, [%[inptrs], 104]\n" - "ldr q21, [x20, x27]\n" - "subs x16, x16, #1\n" - "ldr q29, [x19, x27]\n" - "beq 3f\n" - "2:\n" - "mov v3.16b, v13.16b\n" - "ldr x7, [%[inptrs], 64]\n" - "fmla v18.4s, v27.4s, v9.4s\n" - "ldr x17, [%[inptrs], 24]\n" - "fmla v22.4s, v27.4s, v12.4s\n" - "ldr q30, [x7, x27]\n" - "fmla v23.4s, v28.4s, v12.4s\n" - "ldr x21, [%[inptrs], 192]\n" - "fmla v19.4s, v25.4s, v12.4s\n" - "ldr x20, [%[inptrs], 152]\n" - "fmla v18.4s, v28.4s, v11.4s\n" - "ldr q24, [x17, x27]\n" - "fmla v22.4s, v25.4s, v9.4s\n" - "ldr x19, [%[inptrs], 112]\n" - "fmla v23.4s, v16.4s, v9.4s\n" - "ldr x7, [%[inptrs], 72]\n" - "fmla v17.4s, v16.4s, v12.4s\n" - "ldr x17, [%[inptrs], 32]\n" - "fmla v18.4s, v25.4s, v6.4s\n" - "ldr q31, [x21, x27]\n" - "fmla v22.4s, v16.4s, v11.4s\n" - "ldr x22, [%[inptrs], 240]\n" - "fmla v23.4s, v15.4s, v11.4s\n" - "ldr x21, [%[inptrs], 200]\n" - "fmla v14.4s, v15.4s, v12.4s\n" - "ldr x23, [%[outptrs], 0]\n" - "fmla v18.4s, v16.4s, v8.4s\n" - "ldr q25, [x20, x27]\n" - "fmla v22.4s, v21.4s, v6.4s\n" - "ldr x20, [%[inptrs], 160]\n" - "fmla v19.4s, v21.4s, v9.4s\n" - "ldr x24, [%[outptrs], 32]\n" - "fmla v0.4s, v21.4s, v12.4s\n" - "ldr q21, [x19, x27]\n" - "fmla v18.4s, v15.4s, v10.4s\n" - "ldr q20, [x7, x27]\n" - "fmla v22.4s, v29.4s, v8.4s\n" - "ldr x19, [%[inptrs], 120]\n" - "fmla v23.4s, v29.4s, v6.4s\n" - "ldr x7, [%[inptrs], 80]\n" - "fmla v19.4s, v29.4s, v11.4s\n" - "ldr x25, [%[outptrs], 64]\n" - "fmla v18.4s, v29.4s, v5.4s\n" - "ldr x26, [%[outptrs], 96]\n" - "fmla v17.4s, v29.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v1.4s, v29.4s, v12.4s\n" - "ldr q26, [x17, x27]\n" - "fmla v22.4s, v30.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v18.4s, v30.4s, v7.4s\n" - "ldr x17, [%[inptrs], 40]\n" - "fmla v23.4s, v30.4s, v8.4s\n" - "subs x16, x16, #1\n" - "fmla v17.4s, v30.4s, v11.4s\n" - "fmla v14.4s, v30.4s, v9.4s\n" - "fmla v2.4s, v30.4s, v12.4s\n" - "ldr q27, [x22, x27]\n" - "fmla v3.4s, v24.4s, v12.4s\n" - "ldr x22, [%[inptrs], 248]\n" - "fmla v23.4s, v24.4s, v10.4s\n" - "fmla v19.4s, v31.4s, v6.4s\n" - "fmla v14.4s, v24.4s, v11.4s\n" - "ldr q30, [x21, x27]\n" - "fmla v0.4s, v31.4s, v9.4s\n" - "ldr q24, [x20, x27]\n" - "fmla v22.4s, v25.4s, v5.4s\n" - "ldr x21, [%[inptrs], 208]\n" - "fmla v19.4s, v25.4s, v8.4s\n" - "ldr x20, [%[inptrs], 168]\n" - "fmla v17.4s, v25.4s, v6.4s\n" - "fmla v1.4s, v25.4s, v9.4s\n" - "fmla v0.4s, v25.4s, v11.4s\n" - "fmla v18.4s, v21.4s, v4.4s\n" - "fmla v22.4s, v21.4s, v7.4s\n" - "fmla v23.4s, v21.4s, v5.4s\n" - "fmla v19.4s, v21.4s, v10.4s\n" - "fmla v14.4s, v21.4s, v6.4s\n" - "fmla v17.4s, v21.4s, v8.4s\n" - "fmla v1.4s, v21.4s, v11.4s\n" - "str q18, [x23, x28]\n" - "mov v16.16b, v13.16b\n" - "fmla v2.4s, v21.4s, v9.4s\n" - "ldr x23, [%[outptrs], 8]\n" - "fmla v23.4s, v20.4s, v7.4s\n" - "fmla v14.4s, v20.4s, v8.4s\n" - "fmla v16.4s, v25.4s, v12.4s\n" - "ldr q25, [x19, x27]\n" - "fmla v17.4s, v20.4s, v10.4s\n" - "ldr x19, [%[inptrs], 128]\n" - "fmla v2.4s, v20.4s, v11.4s\n" - "fmla v3.4s, v20.4s, v9.4s\n" - "fmla v14.4s, v26.4s, v10.4s\n" - "fmla v0.4s, v27.4s, v6.4s\n" - "mov v15.16b, v13.16b\n" - "fmla v19.4s, v30.4s, v5.4s\n" - "fmla v1.4s, v30.4s, v6.4s\n" - "fmla v16.4s, v30.4s, v9.4s\n" - "fmla v3.4s, v26.4s, v11.4s\n" - "ldr q29, [x7, x27]\n" - "fmla v15.4s, v21.4s, v12.4s\n" - "ldr q27, [x17, x27]\n" - "fmla v0.4s, v30.4s, v8.4s\n" - "ldr q28, [x22, x27]\n" - "fmla v22.4s, v24.4s, v4.4s\n" - "ldr x7, [%[inptrs], 88]\n" - "fmla v19.4s, v24.4s, v7.4s\n" - "ldr x22, [%[inptrs], 256]\n" - "fmla v17.4s, v24.4s, v5.4s\n" - "ldr x17, [%[inptrs], 0]\n" - "fmla v0.4s, v24.4s, v10.4s\n" - "fmla v1.4s, v24.4s, v8.4s\n" - "str q22, [x24, x28]\n" - "mov v18.16b, v13.16b\n" - "fmla v2.4s, v24.4s, v6.4s\n" - "ldr x24, [%[outptrs], 40]\n" - "fmla v16.4s, v24.4s, v11.4s\n" - "fmla v15.4s, v24.4s, v9.4s\n" - "fmla v18.4s, v20.4s, v12.4s\n" - "ldr q22, [x21, x27]\n" - "fmla v23.4s, v25.4s, v4.4s\n" - "ldr x21, [%[inptrs], 216]\n" - "fmla v17.4s, v25.4s, v7.4s\n" - "fmla v14.4s, v25.4s, v5.4s\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "fmla v2.4s, v25.4s, v8.4s\n" - "fmla v3.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v25.4s, v11.4s\n" - "str q23, [x23, x28]\n" - "mov v21.16b, v13.16b\n" - "fmla v18.4s, v25.4s, v9.4s\n" - "ldr x23, [%[outptrs], 16]\n" - "fmla v14.4s, v29.4s, v7.4s\n" - "fmla v2.4s, v29.4s, v10.4s\n" - "fmla v21.4s, v24.4s, v12.4s\n" - "ldr q30, [x20, x27]\n" - "fmla v3.4s, v29.4s, v8.4s\n" - "ldr x20, [%[inptrs], 176]\n" - "fmla v18.4s, v29.4s, v11.4s\n" - "ldr q31, [x19, x27]\n" - "fmla v0.4s, v28.4s, v5.4s\n" - "ldr x19, [%[inptrs], 136]\n" - "fmla v16.4s, v28.4s, v6.4s\n" - "ldr q26, [x7, x27]\n" - "fmla v3.4s, v27.4s, v10.4s\n" - "ldr q23, [x22, x27]\n" - "fmla v19.4s, v22.4s, v4.4s\n" - "ldr x22, [%[inptrs], 264]\n" - "fmla v0.4s, v22.4s, v7.4s\n" - "ldr x7, [%[inptrs], 48]\n" - "fmla v1.4s, v22.4s, v5.4s\n" - "fmla v16.4s, v22.4s, v8.4s\n" - "fmla v15.4s, v22.4s, v6.4s\n" - "fmla v21.4s, v22.4s, v9.4s\n" - "str q19, [x25, x28]\n" - "mov v24.16b, v13.16b\n" - "mov v20.16b, v13.16b\n" - "ldr q27, [x21, x27]\n" - "fmla v17.4s, v30.4s, v4.4s\n" - "ldr x21, [%[inptrs], 224]\n" - "fmla v24.4s, v25.4s, v12.4s\n" - "ldr q28, [x20, x27]\n" - "fmla v1.4s, v30.4s, v7.4s\n" - "ldr x20, [%[inptrs], 184]\n" - "fmla v2.4s, v30.4s, v5.4s\n" - "ldr x25, [%[outptrs], 72]\n" - "str q17, [x24, x28]\n" - "fmla v16.4s, v30.4s, v10.4s\n" - "fmla v15.4s, v30.4s, v8.4s\n" - "ldr q22, [x19, x27]\n" - "fmla v18.4s, v30.4s, v6.4s\n" - "ldr x24, [%[outptrs], 48]\n" - "fmla v21.4s, v30.4s, v11.4s\n" - "ldr x19, [%[inptrs], 96]\n" - "fmla v24.4s, v30.4s, v9.4s\n" - "fmla v20.4s, v30.4s, v12.4s\n" - "fmla v14.4s, v31.4s, v4.4s\n" - "ldr q30, [x22, x27]\n" - "fmla v2.4s, v31.4s, v7.4s\n" - "ldr q19, [x21, x27]\n" - "fmla v3.4s, v31.4s, v5.4s\n" - "ldr x22, [%[inptrs], 272]\n" - "fmla v15.4s, v31.4s, v10.4s\n" - "ldr x21, [%[inptrs], 232]\n" - "str q14, [x23, x28]\n" - "fmla v18.4s, v31.4s, v8.4s\n" - "fmla v24.4s, v31.4s, v11.4s\n" - "ldr q31, [x20, x27]\n" - "fmla v3.4s, v26.4s, v7.4s\n" - "ldr q17, [x22, x27]\n" - "fmla v0.4s, v23.4s, v4.4s\n" - "ldr x22, [%[inptrs], 280]\n" - "fmla v18.4s, v26.4s, v10.4s\n" - "ldr q14, [x21, x27]\n" - "fmla v16.4s, v23.4s, v5.4s\n" - "ldr x23, [%[outptrs], 24]\n" - "fmla v21.4s, v23.4s, v6.4s\n" - "ldr q26, [x22, x27]\n" - "str q0, [x26, x28]\n" - "fmla v1.4s, v27.4s, v4.4s\n" - "fmla v15.4s, v27.4s, v5.4s\n" - "ldr q13, [%[wbptr]]\n" - "fmla v16.4s, v27.4s, v7.4s\n" - "ldr x26, [%[outptrs], 104]\n" - "fmla v21.4s, v27.4s, v8.4s\n" - "add x27, x27, #16\n" - "str q1, [x25, x28]\n" - "fmla v24.4s, v27.4s, v6.4s\n" - "fmla v20.4s, v27.4s, v9.4s\n" - "ldr q12, [%[wbptr], #16]\n" - "fmla v2.4s, v28.4s, v4.4s\n" - "ldr q29, [x17, x27]\n" - "fmla v15.4s, v28.4s, v7.4s\n" - "ldr q27, [x7, x27]\n" - "fmla v18.4s, v28.4s, v5.4s\n" - "ldr x25, [%[outptrs], 80]\n" - "fmla v21.4s, v28.4s, v10.4s\n" - "ldr x17, [%[inptrs], 8]\n" - "str q2, [x24, x28]\n" - "fmla v24.4s, v28.4s, v8.4s\n" - "fmla v20.4s, v28.4s, v11.4s\n" - "ldr q9, [%[wbptr], #64]\n" - "fmla v3.4s, v22.4s, v4.4s\n" - "ldr q28, [x17, x27]\n" - "fmla v18.4s, v22.4s, v7.4s\n" - "ldr q25, [x19, x27]\n" - "fmla v24.4s, v22.4s, v10.4s\n" - "ldr x24, [%[outptrs], 56]\n" - "fmla v16.4s, v30.4s, v4.4s\n" - "ldr q11, [%[wbptr], #32]\n" - "str q3, [x23, x28]\n" - "fmla v21.4s, v30.4s, v5.4s\n" - "fmla v20.4s, v30.4s, v6.4s\n" - "ldr x7, [%[inptrs], 56]\n" - "fmla v15.4s, v19.4s, v4.4s\n" - "ldr x17, [%[inptrs], 16]\n" - "str q16, [x26, x28]\n" - "fmla v24.4s, v19.4s, v5.4s\n" - "fmla v21.4s, v19.4s, v7.4s\n" - "ldr q16, [x7, x27]\n" - "fmla v20.4s, v19.4s, v8.4s\n" - "ldr q6, [%[wbptr], #112]\n" - "str q15, [x25, x28]\n" - "fmla v18.4s, v31.4s, v4.4s\n" - "fmla v24.4s, v31.4s, v7.4s\n" - "ldr q15, [x17, x27]\n" - "fmla v21.4s, v17.4s, v4.4s\n" - "ldr x25, [%[outptrs], 88]\n" - "fmla v20.4s, v31.4s, v10.4s\n" - "ldr q8, [%[wbptr], #80]\n" - "str q18, [x24, x28]\n" - "mov v18.16b, v13.16b\n" - "fmla v24.4s, v14.4s, v4.4s\n" - "ldr x26, [%[outptrs], 112]\n" - "mov v22.16b, v13.16b\n" - "ldr x20, [%[inptrs], 144]\n" - "str q21, [x26, x28]\n" - "fmla v20.4s, v17.4s, v5.4s\n" - "mov v23.16b, v13.16b\n" - "ldr q10, [%[wbptr], #48]\n" - "str q24, [x25, x28]\n" - "mov v19.16b, v13.16b\n" - "mov v17.16b, v13.16b\n" - "ldr q21, [x20, x27]\n" - "fmla v20.4s, v14.4s, v7.4s\n" - "ldr q5, [%[wbptr], #128]\n" - "mov v14.16b, v13.16b\n" - "ldr x26, [%[outptrs], 120]\n" - "mov v0.16b, v13.16b\n" - "ldr x19, [%[inptrs], 104]\n" - "mov v1.16b, v13.16b\n" - "mov v2.16b, v13.16b\n" - "fmla v20.4s, v26.4s, v4.4s\n" - "ldr q7, [%[wbptr], #96]\n" - "fmla v18.4s, v29.4s, v12.4s\n" - "ldr q29, [x19, x27]\n" - "str q20, [x26, x28]\n" - "ldr q4, [%[wbptr], #144]\n" - "add x28, x28, #16\n" - "bne 2b\n" - "3:\n" - "mov v3.16b, v13.16b\n" - "ldr x7, [%[inptrs], 64]\n" - "fmla v18.4s, v27.4s, v9.4s\n" - "ldr x17, [%[inptrs], 24]\n" - "fmla v22.4s, v27.4s, v12.4s\n" - "ldr q30, [x7, x27]\n" - "fmla v23.4s, v28.4s, v12.4s\n" - "ldr x21, [%[inptrs], 192]\n" - "fmla v19.4s, v25.4s, v12.4s\n" - "ldr x20, [%[inptrs], 152]\n" - "fmla v18.4s, v28.4s, v11.4s\n" - "ldr q24, [x17, x27]\n" - "fmla v22.4s, v25.4s, v9.4s\n" - "ldr x19, [%[inptrs], 112]\n" - "fmla v23.4s, v16.4s, v9.4s\n" - "ldr x7, [%[inptrs], 72]\n" - "fmla v17.4s, v16.4s, v12.4s\n" - "ldr x17, [%[inptrs], 32]\n" - "fmla v18.4s, v25.4s, v6.4s\n" - "ldr q31, [x21, x27]\n" - "fmla v22.4s, v16.4s, v11.4s\n" - "ldr x22, [%[inptrs], 240]\n" - "fmla v23.4s, v15.4s, v11.4s\n" - "ldr x21, [%[inptrs], 200]\n" - "fmla v14.4s, v15.4s, v12.4s\n" - "ldr x23, [%[outptrs], 0]\n" - "fmla v18.4s, v16.4s, v8.4s\n" - "ldr q25, [x20, x27]\n" - "fmla v22.4s, v21.4s, v6.4s\n" - "ldr x20, [%[inptrs], 160]\n" - "fmla v19.4s, v21.4s, v9.4s\n" - "ldr x24, [%[outptrs], 32]\n" - "fmla v0.4s, v21.4s, v12.4s\n" - "ldr q21, [x19, x27]\n" - "fmla v18.4s, v15.4s, v10.4s\n" - "ldr q20, [x7, x27]\n" - "fmla v22.4s, v29.4s, v8.4s\n" - "ldr x19, [%[inptrs], 120]\n" - "fmla v23.4s, v29.4s, v6.4s\n" - "ldr x7, [%[inptrs], 80]\n" - "fmla v19.4s, v29.4s, v11.4s\n" - "ldr x25, [%[outptrs], 64]\n" - "fmla v18.4s, v29.4s, v5.4s\n" - "ldr x26, [%[outptrs], 96]\n" - "fmla v17.4s, v29.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v1.4s, v29.4s, v12.4s\n" - "ldr q26, [x17, x27]\n" - "fmla v22.4s, v30.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v18.4s, v30.4s, v7.4s\n" - "ldr x17, [%[inptrs], 40]\n" - "fmla v23.4s, v30.4s, v8.4s\n" - "fmla v17.4s, v30.4s, v11.4s\n" - "fmla v14.4s, v30.4s, v9.4s\n" - "fmla v2.4s, v30.4s, v12.4s\n" - "mov v16.16b, v13.16b\n" - "fmla v3.4s, v24.4s, v12.4s\n" - "fmla v19.4s, v31.4s, v6.4s\n" - "fmla v0.4s, v31.4s, v9.4s\n" - "mov v15.16b, v13.16b\n" - "fmla v23.4s, v24.4s, v10.4s\n" - "fmla v14.4s, v24.4s, v11.4s\n" - "ldr q27, [x22, x27]\n" - "fmla v22.4s, v25.4s, v5.4s\n" - "ldr x22, [%[inptrs], 248]\n" - "fmla v19.4s, v25.4s, v8.4s\n" - "fmla v17.4s, v25.4s, v6.4s\n" - "fmla v0.4s, v25.4s, v11.4s\n" - "fmla v1.4s, v25.4s, v9.4s\n" - "fmla v16.4s, v25.4s, v12.4s\n" - "ldr q30, [x21, x27]\n" - "fmla v18.4s, v21.4s, v4.4s\n" - "ldr x21, [%[inptrs], 208]\n" - "fmla v22.4s, v21.4s, v7.4s\n" - "fmla v23.4s, v21.4s, v5.4s\n" - "fmla v19.4s, v21.4s, v10.4s\n" - "fmla v17.4s, v21.4s, v8.4s\n" - "fmla v14.4s, v21.4s, v6.4s\n" - "fmla v1.4s, v21.4s, v11.4s\n" - "str q18, [x23, x28]\n" - "mov v18.16b, v13.16b\n" - "fmla v2.4s, v21.4s, v9.4s\n" - "ldr x23, [%[outptrs], 8]\n" - "fmla v15.4s, v21.4s, v12.4s\n" - "ldr q24, [x20, x27]\n" - "fmla v23.4s, v20.4s, v7.4s\n" - "ldr x20, [%[inptrs], 168]\n" - "fmla v17.4s, v20.4s, v10.4s\n" - "fmla v14.4s, v20.4s, v8.4s\n" - "fmla v2.4s, v20.4s, v11.4s\n" - "fmla v3.4s, v20.4s, v9.4s\n" - "fmla v18.4s, v20.4s, v12.4s\n" - "ldr q25, [x19, x27]\n" - "fmla v0.4s, v27.4s, v6.4s\n" - "ldr q29, [x7, x27]\n" - "fmla v14.4s, v26.4s, v10.4s\n" - "ldr x19, [%[inptrs], 128]\n" - "fmla v3.4s, v26.4s, v11.4s\n" - "ldr q27, [x17, x27]\n" - "fmla v19.4s, v30.4s, v5.4s\n" - "ldr x7, [%[inptrs], 88]\n" - "fmla v0.4s, v30.4s, v8.4s\n" - "fmla v1.4s, v30.4s, v6.4s\n" - "fmla v16.4s, v30.4s, v9.4s\n" - "ldr q28, [x22, x27]\n" - "fmla v22.4s, v24.4s, v4.4s\n" - "ldr x22, [%[inptrs], 256]\n" - "fmla v19.4s, v24.4s, v7.4s\n" - "fmla v17.4s, v24.4s, v5.4s\n" - "fmla v0.4s, v24.4s, v10.4s\n" - "fmla v1.4s, v24.4s, v8.4s\n" - "fmla v2.4s, v24.4s, v6.4s\n" - "fmla v16.4s, v24.4s, v11.4s\n" - "str q22, [x24, x28]\n" - "mov v21.16b, v13.16b\n" - "fmla v15.4s, v24.4s, v9.4s\n" - "ldr x24, [%[outptrs], 40]\n" - "fmla v23.4s, v25.4s, v4.4s\n" - "fmla v17.4s, v25.4s, v7.4s\n" - "fmla v21.4s, v24.4s, v12.4s\n" - "ldr q22, [x21, x27]\n" - "fmla v14.4s, v25.4s, v5.4s\n" - "ldr x21, [%[inptrs], 216]\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "fmla v2.4s, v25.4s, v8.4s\n" - "str q23, [x23, x28]\n" - "mov v24.16b, v13.16b\n" - "mov v20.16b, v13.16b\n" - "ldr x23, [%[outptrs], 16]\n" - "fmla v3.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v25.4s, v11.4s\n" - "fmla v18.4s, v25.4s, v9.4s\n" - "fmla v24.4s, v25.4s, v12.4s\n" - "fmla v14.4s, v29.4s, v7.4s\n" - "ldr q30, [x20, x27]\n" - "fmla v2.4s, v29.4s, v10.4s\n" - "ldr x20, [%[inptrs], 176]\n" - "fmla v3.4s, v29.4s, v8.4s\n" - "fmla v0.4s, v28.4s, v5.4s\n" - "fmla v18.4s, v29.4s, v11.4s\n" - "ldr q31, [x19, x27]\n" - "fmla v16.4s, v28.4s, v6.4s\n" - "ldr q26, [x7, x27]\n" - "fmla v19.4s, v22.4s, v4.4s\n" - "ldr x19, [%[inptrs], 136]\n" - "fmla v3.4s, v27.4s, v10.4s\n" - "ldr q23, [x22, x27]\n" - "fmla v0.4s, v22.4s, v7.4s\n" - "ldr x22, [%[inptrs], 264]\n" - "fmla v1.4s, v22.4s, v5.4s\n" - "fmla v16.4s, v22.4s, v8.4s\n" - "str q19, [x25, x28]\n" - "fmla v15.4s, v22.4s, v6.4s\n" - "fmla v21.4s, v22.4s, v9.4s\n" - "ldr q27, [x21, x27]\n" - "fmla v17.4s, v30.4s, v4.4s\n" - "ldr q28, [x20, x27]\n" - "fmla v1.4s, v30.4s, v7.4s\n" - "ldr x21, [%[inptrs], 224]\n" - "fmla v2.4s, v30.4s, v5.4s\n" - "ldr x20, [%[inptrs], 184]\n" - "fmla v16.4s, v30.4s, v10.4s\n" - "ldr x25, [%[outptrs], 72]\n" - "str q17, [x24, x28]\n" - "fmla v15.4s, v30.4s, v8.4s\n" - "fmla v18.4s, v30.4s, v6.4s\n" - "ldr q22, [x19, x27]\n" - "fmla v21.4s, v30.4s, v11.4s\n" - "ldr x24, [%[outptrs], 48]\n" - "fmla v24.4s, v30.4s, v9.4s\n" - "fmla v20.4s, v30.4s, v12.4s\n" - "fmla v14.4s, v31.4s, v4.4s\n" - "ldr q30, [x22, x27]\n" - "fmla v2.4s, v31.4s, v7.4s\n" - "ldr q19, [x21, x27]\n" - "fmla v3.4s, v31.4s, v5.4s\n" - "ldr x22, [%[inptrs], 272]\n" - "fmla v15.4s, v31.4s, v10.4s\n" - "ldr x21, [%[inptrs], 232]\n" - "str q14, [x23, x28]\n" - "fmla v18.4s, v31.4s, v8.4s\n" - "fmla v24.4s, v31.4s, v11.4s\n" - "ldr q31, [x20, x27]\n" - "fmla v3.4s, v26.4s, v7.4s\n" - "ldr q17, [x22, x27]\n" - "fmla v0.4s, v23.4s, v4.4s\n" - "ldr x22, [%[inptrs], 280]\n" - "fmla v18.4s, v26.4s, v10.4s\n" - "ldr q14, [x21, x27]\n" - "fmla v16.4s, v23.4s, v5.4s\n" - "ldr x23, [%[outptrs], 24]\n" - "fmla v21.4s, v23.4s, v6.4s\n" - "ldr q26, [x22, x27]\n" - "str q0, [x26, x28]\n" - "fmla v1.4s, v27.4s, v4.4s\n" - "fmla v15.4s, v27.4s, v5.4s\n" - "ldr x26, [%[outptrs], 104]\n" - "fmla v16.4s, v27.4s, v7.4s\n" - "add x27, x27, #16\n" - "fmla v21.4s, v27.4s, v8.4s\n" - "fmla v24.4s, v27.4s, v6.4s\n" - "str q1, [x25, x28]\n" - "fmla v20.4s, v27.4s, v9.4s\n" - "fmla v2.4s, v28.4s, v4.4s\n" - "ldr x25, [%[outptrs], 80]\n" - "fmla v15.4s, v28.4s, v7.4s\n" - "fmla v18.4s, v28.4s, v5.4s\n" - "fmla v21.4s, v28.4s, v10.4s\n" - "fmla v24.4s, v28.4s, v8.4s\n" - "fmla v20.4s, v28.4s, v11.4s\n" - "fmla v3.4s, v22.4s, v4.4s\n" - "str q2, [x24, x28]\n" - "fmla v16.4s, v30.4s, v4.4s\n" - "fmla v18.4s, v22.4s, v7.4s\n" - "ldr x24, [%[outptrs], 56]\n" - "fmla v24.4s, v22.4s, v10.4s\n" - "fmla v21.4s, v30.4s, v5.4s\n" - "str q3, [x23, x28]\n" - "fmla v20.4s, v30.4s, v6.4s\n" - "str q16, [x26, x28]\n" - "fmla v15.4s, v19.4s, v4.4s\n" - "fmla v18.4s, v31.4s, v4.4s\n" - "ldr x26, [%[outptrs], 112]\n" - "fmla v21.4s, v19.4s, v7.4s\n" - "fmla v24.4s, v19.4s, v5.4s\n" - "fmla v20.4s, v19.4s, v8.4s\n" - "str q15, [x25, x28]\n" - "str q18, [x24, x28]\n" - "ldr x25, [%[outptrs], 88]\n" - "fmla v24.4s, v31.4s, v7.4s\n" - "fmla v21.4s, v17.4s, v4.4s\n" - "fmla v20.4s, v31.4s, v10.4s\n" - "str q21, [x26, x28]\n" - "fmla v20.4s, v17.4s, v5.4s\n" - "ldr x26, [%[outptrs], 120]\n" - "fmla v24.4s, v14.4s, v4.4s\n" - "fmla v20.4s, v14.4s, v7.4s\n" - "str q24, [x25, x28]\n" - "fmla v20.4s, v26.4s, v4.4s\n" - "str q20, [x26, x28]\n" - "add x28, x28, #16\n" - "4:\n" - "cbz x15, 7f\n" - "ldr s13, [%[wbptr]]\n" - "mov v18.16b, v13.16b\n" - "ldr s12, [%[wbptr], #4]\n" - "mov v22.16b, v13.16b\n" - "ldr s11, [%[wbptr], #8]\n" - "mov v23.16b, v13.16b\n" - "ldr s10, [%[wbptr], #12]\n" - "mov v19.16b, v13.16b\n" - "ldr s9, [%[wbptr], #16]\n" - "mov v17.16b, v13.16b\n" - "ldr s8, [%[wbptr], #20]\n" - "mov v14.16b, v13.16b\n" - "ldr s7, [%[wbptr], #24]\n" - "mov v0.16b, v13.16b\n" - "ldr s6, [%[wbptr], #28]\n" - "mov v1.16b, v13.16b\n" - "ldr s5, [%[wbptr], #32]\n" - "mov v2.16b, v13.16b\n" - "ldr s4, [%[wbptr], #36]\n" - "ldr x17, [%[inptrs], 0]\n" - "ldr x7, [%[inptrs], 48]\n" - "ldr x19, [%[inptrs], 96]\n" - "ldr x20, [%[inptrs], 144]\n" - "subs x15, x15, #1\n" - "ldr s29, [x17, x27]\n" - "fmla v18.4s, v29.4s, v12.4s\n" - "ldr s27, [x7, x27]\n" - "ldr s25, [x19, x27]\n" - "ldr x17, [%[inptrs], 8]\n" - "ldr s21, [x20, x27]\n" - "ldr x7, [%[inptrs], 56]\n" - "ldr s28, [x17, x27]\n" - "ldr x19, [%[inptrs], 104]\n" - "ldr s16, [x7, x27]\n" - "ldr x17, [%[inptrs], 16]\n" - "ldr s29, [x19, x27]\n" - "ldr s15, [x17, x27]\n" - "beq 6f\n" - "5:\n" - "mov v3.16b, v13.16b\n" - "ldr x7, [%[inptrs], 64]\n" - "fmla v18.4s, v27.4s, v9.4s\n" - "ldr x17, [%[inptrs], 24]\n" - "fmla v22.4s, v27.4s, v12.4s\n" - "ldr s30, [x7, x27]\n" - "fmla v23.4s, v28.4s, v12.4s\n" - "ldr x21, [%[inptrs], 192]\n" - "fmla v19.4s, v25.4s, v12.4s\n" - "ldr x20, [%[inptrs], 152]\n" - "fmla v18.4s, v28.4s, v11.4s\n" - "ldr s24, [x17, x27]\n" - "fmla v22.4s, v25.4s, v9.4s\n" - "ldr x19, [%[inptrs], 112]\n" - "fmla v23.4s, v16.4s, v9.4s\n" - "ldr x7, [%[inptrs], 72]\n" - "fmla v17.4s, v16.4s, v12.4s\n" - "ldr x17, [%[inptrs], 32]\n" - "fmla v18.4s, v25.4s, v6.4s\n" - "ldr s31, [x21, x27]\n" - "fmla v22.4s, v16.4s, v11.4s\n" - "ldr x22, [%[inptrs], 240]\n" - "fmla v23.4s, v15.4s, v11.4s\n" - "ldr x21, [%[inptrs], 200]\n" - "fmla v14.4s, v15.4s, v12.4s\n" - "ldr x23, [%[outptrs], 0]\n" - "fmla v18.4s, v16.4s, v8.4s\n" - "ldr s25, [x20, x27]\n" - "fmla v22.4s, v21.4s, v6.4s\n" - "ldr x20, [%[inptrs], 160]\n" - "fmla v19.4s, v21.4s, v9.4s\n" - "ldr x24, [%[outptrs], 32]\n" - "fmla v0.4s, v21.4s, v12.4s\n" - "ldr s21, [x19, x27]\n" - "fmla v18.4s, v15.4s, v10.4s\n" - "ldr s20, [x7, x27]\n" - "fmla v22.4s, v29.4s, v8.4s\n" - "ldr x19, [%[inptrs], 120]\n" - "fmla v23.4s, v29.4s, v6.4s\n" - "ldr x7, [%[inptrs], 80]\n" - "fmla v19.4s, v29.4s, v11.4s\n" - "ldr x25, [%[outptrs], 64]\n" - "fmla v18.4s, v29.4s, v5.4s\n" - "ldr x26, [%[outptrs], 96]\n" - "fmla v17.4s, v29.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v1.4s, v29.4s, v12.4s\n" - "ldr s26, [x17, x27]\n" - "fmla v22.4s, v30.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v18.4s, v30.4s, v7.4s\n" - "ldr x17, [%[inptrs], 40]\n" - "fmla v23.4s, v30.4s, v8.4s\n" - "subs x15, x15, #1\n" - "fmla v17.4s, v30.4s, v11.4s\n" - "fmla v14.4s, v30.4s, v9.4s\n" - "fmla v2.4s, v30.4s, v12.4s\n" - "ldr s27, [x22, x27]\n" - "fmla v3.4s, v24.4s, v12.4s\n" - "ldr x22, [%[inptrs], 248]\n" - "fmla v23.4s, v24.4s, v10.4s\n" - "fmla v19.4s, v31.4s, v6.4s\n" - "fmla v14.4s, v24.4s, v11.4s\n" - "ldr s30, [x21, x27]\n" - "fmla v0.4s, v31.4s, v9.4s\n" - "ldr s24, [x20, x27]\n" - "fmla v22.4s, v25.4s, v5.4s\n" - "ldr x21, [%[inptrs], 208]\n" - "fmla v19.4s, v25.4s, v8.4s\n" - "ldr x20, [%[inptrs], 168]\n" - "fmla v17.4s, v25.4s, v6.4s\n" - "fmla v1.4s, v25.4s, v9.4s\n" - "fmla v0.4s, v25.4s, v11.4s\n" - "fmla v18.4s, v21.4s, v4.4s\n" - "fmla v22.4s, v21.4s, v7.4s\n" - "fmla v23.4s, v21.4s, v5.4s\n" - "fmla v19.4s, v21.4s, v10.4s\n" - "fmla v14.4s, v21.4s, v6.4s\n" - "fmla v17.4s, v21.4s, v8.4s\n" - "fmla v1.4s, v21.4s, v11.4s\n" - "str s18, [x23, x28]\n" - "mov v16.16b, v13.16b\n" - "fmla v2.4s, v21.4s, v9.4s\n" - "ldr x23, [%[outptrs], 8]\n" - "fmla v23.4s, v20.4s, v7.4s\n" - "fmla v14.4s, v20.4s, v8.4s\n" - "fmla v16.4s, v25.4s, v12.4s\n" - "ldr s25, [x19, x27]\n" - "fmla v17.4s, v20.4s, v10.4s\n" - "ldr x19, [%[inptrs], 128]\n" - "fmla v2.4s, v20.4s, v11.4s\n" - "fmla v3.4s, v20.4s, v9.4s\n" - "fmla v14.4s, v26.4s, v10.4s\n" - "fmla v0.4s, v27.4s, v6.4s\n" - "mov v15.16b, v13.16b\n" - "fmla v19.4s, v30.4s, v5.4s\n" - "fmla v1.4s, v30.4s, v6.4s\n" - "fmla v16.4s, v30.4s, v9.4s\n" - "fmla v3.4s, v26.4s, v11.4s\n" - "ldr s29, [x7, x27]\n" - "fmla v15.4s, v21.4s, v12.4s\n" - "ldr s27, [x17, x27]\n" - "fmla v0.4s, v30.4s, v8.4s\n" - "ldr s28, [x22, x27]\n" - "fmla v22.4s, v24.4s, v4.4s\n" - "ldr x7, [%[inptrs], 88]\n" - "fmla v19.4s, v24.4s, v7.4s\n" - "ldr x22, [%[inptrs], 256]\n" - "fmla v17.4s, v24.4s, v5.4s\n" - "ldr x17, [%[inptrs], 0]\n" - "fmla v0.4s, v24.4s, v10.4s\n" - "fmla v1.4s, v24.4s, v8.4s\n" - "str s22, [x24, x28]\n" - "mov v18.16b, v13.16b\n" - "fmla v2.4s, v24.4s, v6.4s\n" - "ldr x24, [%[outptrs], 40]\n" - "fmla v16.4s, v24.4s, v11.4s\n" - "fmla v15.4s, v24.4s, v9.4s\n" - "fmla v18.4s, v20.4s, v12.4s\n" - "ldr s22, [x21, x27]\n" - "fmla v23.4s, v25.4s, v4.4s\n" - "ldr x21, [%[inptrs], 216]\n" - "fmla v17.4s, v25.4s, v7.4s\n" - "fmla v14.4s, v25.4s, v5.4s\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "fmla v2.4s, v25.4s, v8.4s\n" - "fmla v3.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v25.4s, v11.4s\n" - "str s23, [x23, x28]\n" - "mov v21.16b, v13.16b\n" - "fmla v18.4s, v25.4s, v9.4s\n" - "ldr x23, [%[outptrs], 16]\n" - "fmla v14.4s, v29.4s, v7.4s\n" - "fmla v2.4s, v29.4s, v10.4s\n" - "fmla v21.4s, v24.4s, v12.4s\n" - "ldr s30, [x20, x27]\n" - "fmla v3.4s, v29.4s, v8.4s\n" - "ldr x20, [%[inptrs], 176]\n" - "fmla v18.4s, v29.4s, v11.4s\n" - "ldr s31, [x19, x27]\n" - "fmla v0.4s, v28.4s, v5.4s\n" - "ldr x19, [%[inptrs], 136]\n" - "fmla v16.4s, v28.4s, v6.4s\n" - "ldr s26, [x7, x27]\n" - "fmla v3.4s, v27.4s, v10.4s\n" - "ldr s23, [x22, x27]\n" - "fmla v19.4s, v22.4s, v4.4s\n" - "ldr x22, [%[inptrs], 264]\n" - "fmla v0.4s, v22.4s, v7.4s\n" - "ldr x7, [%[inptrs], 48]\n" - "fmla v1.4s, v22.4s, v5.4s\n" - "fmla v16.4s, v22.4s, v8.4s\n" - "fmla v15.4s, v22.4s, v6.4s\n" - "fmla v21.4s, v22.4s, v9.4s\n" - "str s19, [x25, x28]\n" - "mov v24.16b, v13.16b\n" - "mov v20.16b, v13.16b\n" - "ldr s27, [x21, x27]\n" - "fmla v17.4s, v30.4s, v4.4s\n" - "ldr x21, [%[inptrs], 224]\n" - "fmla v24.4s, v25.4s, v12.4s\n" - "ldr s28, [x20, x27]\n" - "fmla v1.4s, v30.4s, v7.4s\n" - "ldr x20, [%[inptrs], 184]\n" - "fmla v2.4s, v30.4s, v5.4s\n" - "ldr x25, [%[outptrs], 72]\n" - "str s17, [x24, x28]\n" - "fmla v16.4s, v30.4s, v10.4s\n" - "fmla v15.4s, v30.4s, v8.4s\n" - "ldr s22, [x19, x27]\n" - "fmla v18.4s, v30.4s, v6.4s\n" - "ldr x24, [%[outptrs], 48]\n" - "fmla v21.4s, v30.4s, v11.4s\n" - "ldr x19, [%[inptrs], 96]\n" - "fmla v24.4s, v30.4s, v9.4s\n" - "fmla v20.4s, v30.4s, v12.4s\n" - "fmla v14.4s, v31.4s, v4.4s\n" - "ldr s30, [x22, x27]\n" - "fmla v2.4s, v31.4s, v7.4s\n" - "ldr s19, [x21, x27]\n" - "fmla v3.4s, v31.4s, v5.4s\n" - "ldr x22, [%[inptrs], 272]\n" - "fmla v15.4s, v31.4s, v10.4s\n" - "ldr x21, [%[inptrs], 232]\n" - "str s14, [x23, x28]\n" - "fmla v18.4s, v31.4s, v8.4s\n" - "fmla v24.4s, v31.4s, v11.4s\n" - "ldr s31, [x20, x27]\n" - "fmla v3.4s, v26.4s, v7.4s\n" - "ldr s17, [x22, x27]\n" - "fmla v0.4s, v23.4s, v4.4s\n" - "ldr x22, [%[inptrs], 280]\n" - "fmla v18.4s, v26.4s, v10.4s\n" - "ldr s14, [x21, x27]\n" - "fmla v16.4s, v23.4s, v5.4s\n" - "ldr x23, [%[outptrs], 24]\n" - "fmla v21.4s, v23.4s, v6.4s\n" - "ldr s26, [x22, x27]\n" - "str s0, [x26, x28]\n" - "fmla v1.4s, v27.4s, v4.4s\n" - "fmla v15.4s, v27.4s, v5.4s\n" - "ldr s13, [%[wbptr]]\n" - "fmla v16.4s, v27.4s, v7.4s\n" - "ldr x26, [%[outptrs], 104]\n" - "fmla v21.4s, v27.4s, v8.4s\n" - "add x27, x27, #4\n" - "str s1, [x25, x28]\n" - "fmla v24.4s, v27.4s, v6.4s\n" - "fmla v20.4s, v27.4s, v9.4s\n" - "ldr s12, [%[wbptr], #4]\n" - "fmla v2.4s, v28.4s, v4.4s\n" - "ldr s29, [x17, x27]\n" - "fmla v15.4s, v28.4s, v7.4s\n" - "ldr s27, [x7, x27]\n" - "fmla v18.4s, v28.4s, v5.4s\n" - "ldr x25, [%[outptrs], 80]\n" - "fmla v21.4s, v28.4s, v10.4s\n" - "ldr x17, [%[inptrs], 8]\n" - "str s2, [x24, x28]\n" - "fmla v24.4s, v28.4s, v8.4s\n" - "fmla v20.4s, v28.4s, v11.4s\n" - "ldr s9, [%[wbptr], #16]\n" - "fmla v3.4s, v22.4s, v4.4s\n" - "ldr s28, [x17, x27]\n" - "fmla v18.4s, v22.4s, v7.4s\n" - "ldr s25, [x19, x27]\n" - "fmla v24.4s, v22.4s, v10.4s\n" - "ldr x24, [%[outptrs], 56]\n" - "fmla v16.4s, v30.4s, v4.4s\n" - "ldr s11, [%[wbptr], #8]\n" - "str s3, [x23, x28]\n" - "fmla v21.4s, v30.4s, v5.4s\n" - "fmla v20.4s, v30.4s, v6.4s\n" - "ldr x7, [%[inptrs], 56]\n" - "fmla v15.4s, v19.4s, v4.4s\n" - "ldr x17, [%[inptrs], 16]\n" - "str s16, [x26, x28]\n" - "fmla v24.4s, v19.4s, v5.4s\n" - "fmla v21.4s, v19.4s, v7.4s\n" - "ldr s16, [x7, x27]\n" - "fmla v20.4s, v19.4s, v8.4s\n" - "ldr s6, [%[wbptr], #28]\n" - "str s15, [x25, x28]\n" - "fmla v18.4s, v31.4s, v4.4s\n" - "fmla v24.4s, v31.4s, v7.4s\n" - "ldr s15, [x17, x27]\n" - "fmla v21.4s, v17.4s, v4.4s\n" - "ldr x25, [%[outptrs], 88]\n" - "fmla v20.4s, v31.4s, v10.4s\n" - "ldr s8, [%[wbptr], #20]\n" - "str s18, [x24, x28]\n" - "mov v18.16b, v13.16b\n" - "fmla v24.4s, v14.4s, v4.4s\n" - "ldr x26, [%[outptrs], 112]\n" - "mov v22.16b, v13.16b\n" - "ldr x20, [%[inptrs], 144]\n" - "str s21, [x26, x28]\n" - "fmla v20.4s, v17.4s, v5.4s\n" - "mov v23.16b, v13.16b\n" - "ldr s10, [%[wbptr], #12]\n" - "str s24, [x25, x28]\n" - "mov v19.16b, v13.16b\n" - "mov v17.16b, v13.16b\n" - "ldr s21, [x20, x27]\n" - "fmla v20.4s, v14.4s, v7.4s\n" - "ldr s5, [%[wbptr], #32]\n" - "mov v14.16b, v13.16b\n" - "ldr x26, [%[outptrs], 120]\n" - "mov v0.16b, v13.16b\n" - "ldr x19, [%[inptrs], 104]\n" - "mov v1.16b, v13.16b\n" - "mov v2.16b, v13.16b\n" - "fmla v20.4s, v26.4s, v4.4s\n" - "ldr s7, [%[wbptr], #24]\n" - "fmla v18.4s, v29.4s, v12.4s\n" - "ldr s29, [x19, x27]\n" - "str s20, [x26, x28]\n" - "ldr s4, [%[wbptr], #36]\n" - "add x28, x28, #4\n" - "bne 5b\n" - "6:\n" - "mov v3.16b, v13.16b\n" - "ldr x7, [%[inptrs], 64]\n" - "fmla v18.4s, v27.4s, v9.4s\n" - "ldr x17, [%[inptrs], 24]\n" - "fmla v22.4s, v27.4s, v12.4s\n" - "ldr s30, [x7, x27]\n" - "fmla v23.4s, v28.4s, v12.4s\n" - "ldr x21, [%[inptrs], 192]\n" - "fmla v19.4s, v25.4s, v12.4s\n" - "ldr x20, [%[inptrs], 152]\n" - "fmla v18.4s, v28.4s, v11.4s\n" - "ldr s24, [x17, x27]\n" - "fmla v22.4s, v25.4s, v9.4s\n" - "ldr x19, [%[inptrs], 112]\n" - "fmla v23.4s, v16.4s, v9.4s\n" - "ldr x7, [%[inptrs], 72]\n" - "fmla v17.4s, v16.4s, v12.4s\n" - "ldr x17, [%[inptrs], 32]\n" - "fmla v18.4s, v25.4s, v6.4s\n" - "ldr s31, [x21, x27]\n" - "fmla v22.4s, v16.4s, v11.4s\n" - "ldr x22, [%[inptrs], 240]\n" - "fmla v23.4s, v15.4s, v11.4s\n" - "ldr x21, [%[inptrs], 200]\n" - "fmla v14.4s, v15.4s, v12.4s\n" - "ldr x23, [%[outptrs], 0]\n" - "fmla v18.4s, v16.4s, v8.4s\n" - "ldr s25, [x20, x27]\n" - "fmla v22.4s, v21.4s, v6.4s\n" - "ldr x20, [%[inptrs], 160]\n" - "fmla v19.4s, v21.4s, v9.4s\n" - "ldr x24, [%[outptrs], 32]\n" - "fmla v0.4s, v21.4s, v12.4s\n" - "ldr s21, [x19, x27]\n" - "fmla v18.4s, v15.4s, v10.4s\n" - "ldr s20, [x7, x27]\n" - "fmla v22.4s, v29.4s, v8.4s\n" - "ldr x19, [%[inptrs], 120]\n" - "fmla v23.4s, v29.4s, v6.4s\n" - "ldr x7, [%[inptrs], 80]\n" - "fmla v19.4s, v29.4s, v11.4s\n" - "ldr x25, [%[outptrs], 64]\n" - "fmla v18.4s, v29.4s, v5.4s\n" - "ldr x26, [%[outptrs], 96]\n" - "fmla v17.4s, v29.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v1.4s, v29.4s, v12.4s\n" - "ldr s26, [x17, x27]\n" - "fmla v22.4s, v30.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v18.4s, v30.4s, v7.4s\n" - "ldr x17, [%[inptrs], 40]\n" - "fmla v23.4s, v30.4s, v8.4s\n" - "fmla v17.4s, v30.4s, v11.4s\n" - "fmla v14.4s, v30.4s, v9.4s\n" - "fmla v2.4s, v30.4s, v12.4s\n" - "mov v16.16b, v13.16b\n" - "fmla v3.4s, v24.4s, v12.4s\n" - "fmla v19.4s, v31.4s, v6.4s\n" - "fmla v0.4s, v31.4s, v9.4s\n" - "mov v15.16b, v13.16b\n" - "fmla v23.4s, v24.4s, v10.4s\n" - "fmla v14.4s, v24.4s, v11.4s\n" - "ldr s27, [x22, x27]\n" - "fmla v22.4s, v25.4s, v5.4s\n" - "ldr x22, [%[inptrs], 248]\n" - "fmla v19.4s, v25.4s, v8.4s\n" - "fmla v17.4s, v25.4s, v6.4s\n" - "fmla v0.4s, v25.4s, v11.4s\n" - "fmla v1.4s, v25.4s, v9.4s\n" - "fmla v16.4s, v25.4s, v12.4s\n" - "ldr s30, [x21, x27]\n" - "fmla v18.4s, v21.4s, v4.4s\n" - "ldr x21, [%[inptrs], 208]\n" - "fmla v22.4s, v21.4s, v7.4s\n" - "fmla v23.4s, v21.4s, v5.4s\n" - "fmla v19.4s, v21.4s, v10.4s\n" - "fmla v17.4s, v21.4s, v8.4s\n" - "fmla v14.4s, v21.4s, v6.4s\n" - "fmla v1.4s, v21.4s, v11.4s\n" - "str s18, [x23, x28]\n" - "mov v18.16b, v13.16b\n" - "fmla v2.4s, v21.4s, v9.4s\n" - "ldr x23, [%[outptrs], 8]\n" - "fmla v15.4s, v21.4s, v12.4s\n" - "ldr s24, [x20, x27]\n" - "fmla v23.4s, v20.4s, v7.4s\n" - "ldr x20, [%[inptrs], 168]\n" - "fmla v17.4s, v20.4s, v10.4s\n" - "fmla v14.4s, v20.4s, v8.4s\n" - "fmla v2.4s, v20.4s, v11.4s\n" - "fmla v3.4s, v20.4s, v9.4s\n" - "fmla v18.4s, v20.4s, v12.4s\n" - "ldr s25, [x19, x27]\n" - "fmla v0.4s, v27.4s, v6.4s\n" - "ldr s29, [x7, x27]\n" - "fmla v14.4s, v26.4s, v10.4s\n" - "ldr x19, [%[inptrs], 128]\n" - "fmla v3.4s, v26.4s, v11.4s\n" - "ldr s27, [x17, x27]\n" - "fmla v19.4s, v30.4s, v5.4s\n" - "ldr x7, [%[inptrs], 88]\n" - "fmla v0.4s, v30.4s, v8.4s\n" - "fmla v1.4s, v30.4s, v6.4s\n" - "fmla v16.4s, v30.4s, v9.4s\n" - "ldr s28, [x22, x27]\n" - "fmla v22.4s, v24.4s, v4.4s\n" - "ldr x22, [%[inptrs], 256]\n" - "fmla v19.4s, v24.4s, v7.4s\n" - "fmla v17.4s, v24.4s, v5.4s\n" - "fmla v0.4s, v24.4s, v10.4s\n" - "fmla v1.4s, v24.4s, v8.4s\n" - "fmla v2.4s, v24.4s, v6.4s\n" - "fmla v16.4s, v24.4s, v11.4s\n" - "str s22, [x24, x28]\n" - "mov v21.16b, v13.16b\n" - "fmla v15.4s, v24.4s, v9.4s\n" - "ldr x24, [%[outptrs], 40]\n" - "fmla v23.4s, v25.4s, v4.4s\n" - "fmla v17.4s, v25.4s, v7.4s\n" - "fmla v21.4s, v24.4s, v12.4s\n" - "ldr s22, [x21, x27]\n" - "fmla v14.4s, v25.4s, v5.4s\n" - "ldr x21, [%[inptrs], 216]\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "fmla v2.4s, v25.4s, v8.4s\n" - "str s23, [x23, x28]\n" - "mov v24.16b, v13.16b\n" - "mov v20.16b, v13.16b\n" - "ldr x23, [%[outptrs], 16]\n" - "fmla v3.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v25.4s, v11.4s\n" - "fmla v18.4s, v25.4s, v9.4s\n" - "fmla v24.4s, v25.4s, v12.4s\n" - "fmla v14.4s, v29.4s, v7.4s\n" - "ldr s30, [x20, x27]\n" - "fmla v2.4s, v29.4s, v10.4s\n" - "ldr x20, [%[inptrs], 176]\n" - "fmla v3.4s, v29.4s, v8.4s\n" - "fmla v0.4s, v28.4s, v5.4s\n" - "fmla v18.4s, v29.4s, v11.4s\n" - "ldr s31, [x19, x27]\n" - "fmla v16.4s, v28.4s, v6.4s\n" - "ldr s26, [x7, x27]\n" - "fmla v19.4s, v22.4s, v4.4s\n" - "ldr x19, [%[inptrs], 136]\n" - "fmla v3.4s, v27.4s, v10.4s\n" - "ldr s23, [x22, x27]\n" - "fmla v0.4s, v22.4s, v7.4s\n" - "ldr x22, [%[inptrs], 264]\n" - "fmla v1.4s, v22.4s, v5.4s\n" - "fmla v16.4s, v22.4s, v8.4s\n" - "str s19, [x25, x28]\n" - "fmla v15.4s, v22.4s, v6.4s\n" - "fmla v21.4s, v22.4s, v9.4s\n" - "ldr s27, [x21, x27]\n" - "fmla v17.4s, v30.4s, v4.4s\n" - "ldr s28, [x20, x27]\n" - "fmla v1.4s, v30.4s, v7.4s\n" - "ldr x21, [%[inptrs], 224]\n" - "fmla v2.4s, v30.4s, v5.4s\n" - "ldr x20, [%[inptrs], 184]\n" - "fmla v16.4s, v30.4s, v10.4s\n" - "ldr x25, [%[outptrs], 72]\n" - "str s17, [x24, x28]\n" - "fmla v15.4s, v30.4s, v8.4s\n" - "fmla v18.4s, v30.4s, v6.4s\n" - "ldr s22, [x19, x27]\n" - "fmla v21.4s, v30.4s, v11.4s\n" - "ldr x24, [%[outptrs], 48]\n" - "fmla v24.4s, v30.4s, v9.4s\n" - "fmla v20.4s, v30.4s, v12.4s\n" - "fmla v14.4s, v31.4s, v4.4s\n" - "ldr s30, [x22, x27]\n" - "fmla v2.4s, v31.4s, v7.4s\n" - "ldr s19, [x21, x27]\n" - "fmla v3.4s, v31.4s, v5.4s\n" - "ldr x22, [%[inptrs], 272]\n" - "fmla v15.4s, v31.4s, v10.4s\n" - "ldr x21, [%[inptrs], 232]\n" - "str s14, [x23, x28]\n" - "fmla v18.4s, v31.4s, v8.4s\n" - "fmla v24.4s, v31.4s, v11.4s\n" - "ldr s31, [x20, x27]\n" - "fmla v3.4s, v26.4s, v7.4s\n" - "ldr s17, [x22, x27]\n" - "fmla v0.4s, v23.4s, v4.4s\n" - "ldr x22, [%[inptrs], 280]\n" - "fmla v18.4s, v26.4s, v10.4s\n" - "ldr s14, [x21, x27]\n" - "fmla v16.4s, v23.4s, v5.4s\n" - "ldr x23, [%[outptrs], 24]\n" - "fmla v21.4s, v23.4s, v6.4s\n" - "ldr s26, [x22, x27]\n" - "str s0, [x26, x28]\n" - "fmla v1.4s, v27.4s, v4.4s\n" - "fmla v15.4s, v27.4s, v5.4s\n" - "ldr x26, [%[outptrs], 104]\n" - "fmla v16.4s, v27.4s, v7.4s\n" - "add x27, x27, #4\n" - "fmla v21.4s, v27.4s, v8.4s\n" - "fmla v24.4s, v27.4s, v6.4s\n" - "str s1, [x25, x28]\n" - "fmla v20.4s, v27.4s, v9.4s\n" - "fmla v2.4s, v28.4s, v4.4s\n" - "ldr x25, [%[outptrs], 80]\n" - "fmla v15.4s, v28.4s, v7.4s\n" - "fmla v18.4s, v28.4s, v5.4s\n" - "fmla v21.4s, v28.4s, v10.4s\n" - "fmla v24.4s, v28.4s, v8.4s\n" - "fmla v20.4s, v28.4s, v11.4s\n" - "fmla v3.4s, v22.4s, v4.4s\n" - "str s2, [x24, x28]\n" - "fmla v16.4s, v30.4s, v4.4s\n" - "fmla v18.4s, v22.4s, v7.4s\n" - "ldr x24, [%[outptrs], 56]\n" - "fmla v24.4s, v22.4s, v10.4s\n" - "fmla v21.4s, v30.4s, v5.4s\n" - "str s3, [x23, x28]\n" - "fmla v20.4s, v30.4s, v6.4s\n" - "str s16, [x26, x28]\n" - "fmla v15.4s, v19.4s, v4.4s\n" - "fmla v18.4s, v31.4s, v4.4s\n" - "ldr x26, [%[outptrs], 112]\n" - "fmla v21.4s, v19.4s, v7.4s\n" - "fmla v24.4s, v19.4s, v5.4s\n" - "fmla v20.4s, v19.4s, v8.4s\n" - "str s15, [x25, x28]\n" - "str s18, [x24, x28]\n" - "ldr x25, [%[outptrs], 88]\n" - "fmla v24.4s, v31.4s, v7.4s\n" - "fmla v21.4s, v17.4s, v4.4s\n" - "fmla v20.4s, v31.4s, v10.4s\n" - "str s21, [x26, x28]\n" - "fmla v20.4s, v17.4s, v5.4s\n" - "ldr x26, [%[outptrs], 120]\n" - "fmla v24.4s, v14.4s, v4.4s\n" - "fmla v20.4s, v14.4s, v7.4s\n" - "str s24, [x25, x28]\n" - "fmla v20.4s, v26.4s, v4.4s\n" - "str s20, [x26, x28]\n" - "add x28, x28, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr) - : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile<ActivationFunction::ReLU>( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x9, %[inptr0], %[input_row_stride]\n" - "add x28, %[input_col_stride1], %[input_col_stride1]\n" - "add x16, %[outptr0], %[output_row_stride]\n" - "add x24, x9, %[input_row_stride]\n" - "add x25, x28, #64\n" - "add x23, x28, %[input_col_stride1]\n" - "add x26, x24, %[input_row_stride]\n" - "add x11, x23, #64\n" - "add x12, x23, %[input_col_stride1]\n" - "add x10, x26, %[input_row_stride]\n" - "add x13, x12, #64\n" - "add x14, x12, %[input_col_stride1]\n" - "add x27, x10, %[input_row_stride]\n" - "add x15, x14, #64\n" - "add x17, x16, %[output_row_stride]\n" - "add x7, x17, %[output_row_stride]\n" - "add x19, %[output_col_stride1], %[output_col_stride1]\n" - "and x21, %[n_channels], #3\n" - "add x20, x19, %[output_col_stride1]\n" - "lsr x22, %[n_channels], #2\n" - "cbz x22, 4f\n" - "1:\n" - "ldr q21, [%[wbptr]]\n" - "subs x22, x22, #1\n" - "mov v7.16b, v21.16b\n" - "ldr q20, [%[wbptr], #16]\n" - "mov v3.16b, v21.16b\n" - "ldr q14, [%[wbptr], #32]\n" - "mov v6.16b, v21.16b\n" - "ldr q13, [%[wbptr], #48]\n" - "mov v15.16b, v21.16b\n" - "ldr q17, [%[wbptr], #64]\n" - "mov v2.16b, v21.16b\n" - "ldr q12, [%[wbptr], #80]\n" - "mov v5.16b, v21.16b\n" - "ldr q11, [%[wbptr], #96]\n" - "mov v0.16b, v21.16b\n" - "ldr q10, [%[wbptr], #112]\n" - "mov v16.16b, v21.16b\n" - "ldr q9, [%[wbptr], #128]\n" - "mov v1.16b, v21.16b\n" - "ldr q8, [%[wbptr], #144]\n" - "mov v4.16b, v21.16b\n" - "ldr q22, [%[inptr0]]\n" - "fmla v7.4s, v22.4s, v20.4s\n" - "ldr q19, [x9]\n" - "fmla v3.4s, v19.4s, v20.4s\n" - "ldr q23, [%[inptr0], %[input_col_stride1]]\n" - "fmla v6.4s, v23.4s, v20.4s\n" - "ldr q18, [x24]\n" - "fmla v7.4s, v19.4s, v17.4s\n" - "ldr q27, [x9, %[input_col_stride1]]\n" - "fmla v3.4s, v18.4s, v17.4s\n" - "ldr q28, [%[inptr0], x28]\n" - "fmla v15.4s, v18.4s, v20.4s\n" - "ldr q25, [x26]\n" - "fmla v7.4s, v23.4s, v14.4s\n" - "ldr q22, [x24, %[input_col_stride1]]\n" - "fmla v3.4s, v27.4s, v14.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "prfm pldl1keep, [x9, #64]\n" - "prfm pldl1keep, [%[inptr0], x8]\n" - "fmla v7.4s, v18.4s, v10.4s\n" - "prfm pldl1keep, [x24, #64]\n" - "prfm pldl1keep, [x9, x8]\n" - "prfm pldl1keep, [%[inptr0], x25]\n" - "prfm pldl1keep, [x26, #64]\n" - "prfm pldl1keep, [x24, x8]\n" - "fmla v7.4s, v27.4s, v12.4s\n" - "beq 3f\n" - "2:\n" - "mov v18.16b, v21.16b\n" - "ldr q23, [x9, x28]\n" - "mov v19.16b, v21.16b\n" - "prfm pldl1keep, [x9, x25]\n" - "fmla v6.4s, v27.4s, v17.4s\n" - "prfm pldl1keep, [%[inptr0], x11]\n" - "fmla v2.4s, v27.4s, v20.4s\n" - "ldr q24, [%[inptr0], x23]\n" - "fmla v7.4s, v28.4s, v13.4s\n" - "prfm pldl1keep, [x10, #64]\n" - "fmla v6.4s, v28.4s, v14.4s\n" - "prfm pldl1keep, [x26, x8]\n" - "fmla v5.4s, v28.4s, v20.4s\n" - "ldr q26, [x10]\n" - "fmla v3.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [x24, x25]\n" - "fmla v15.4s, v25.4s, v17.4s\n" - "prfm pldl1keep, [x9, x11]\n" - "fmla v0.4s, v25.4s, v20.4s\n" - "ldr q25, [x26, %[input_col_stride1]]\n" - "fmla v7.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [%[inptr0], x13]\n" - "fmla v3.4s, v22.4s, v12.4s\n" - "prfm pldl1keep, [x27, #64]\n" - "fmla v6.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [x10, x8]\n" - "fmla v15.4s, v22.4s, v14.4s\n" - "prfm pldl1keep, [x26, x25]\n" - "fmla v2.4s, v22.4s, v17.4s\n" - "prfm pldl1keep, [x24, x11]\n" - "fmla v16.4s, v22.4s, v20.4s\n" - "ldr q22, [x24, x28]\n" - "fmla v7.4s, v23.4s, v11.4s\n" - "prfm pldl1keep, [x9, x13]\n" - "fmla v3.4s, v23.4s, v13.4s\n" - "prfm pldl1keep, [%[inptr0], x15]\n" - "fmla v6.4s, v23.4s, v12.4s\n" - "prfm pldl1keep, [x27, x8]\n" - "fmla v2.4s, v23.4s, v14.4s\n" - "prfm pldl1keep, [x10, x25]\n" - "fmla v5.4s, v23.4s, v17.4s\n" - "prfm pldl1keep, [x26, x11]\n" - "fmla v1.4s, v23.4s, v20.4s\n" - "ldr q23, [x9, x23]\n" - "fmla v6.4s, v24.4s, v13.4s\n" - "prfm pldl1keep, [x24, x13]\n" - "fmla v5.4s, v24.4s, v14.4s\n" - "prfm pldl1keep, [x9, x15]\n" - "fmla v4.4s, v24.4s, v20.4s\n" - "ldr q24, [%[inptr0], x12]\n" - "fmla v15.4s, v26.4s, v10.4s\n" - "prfm pldl1keep, [x27, x25]\n" - "fmla v0.4s, v26.4s, v17.4s\n" - "ldr q29, [x27]\n" - "fmla v3.4s, v25.4s, v9.4s\n" - "prfm pldl1keep, [x10, x11]\n" - "fmla v15.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [x26, x13]\n" - "fmla v2.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [x24, x15]\n" - "fmla v0.4s, v25.4s, v14.4s\n" - "prfm pldl1keep, [x27, x11]\n" - "fmla v16.4s, v25.4s, v17.4s\n" - "prfm pldl1keep, [x10, x13]\n" - "fmla v18.4s, v25.4s, v20.4s\n" - "ldr q26, [x10, %[input_col_stride1]]\n" - "fmla v7.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [x26, x15]\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x27, x13]\n" - "fmla v6.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [x10, x15]\n" - "fmla v15.4s, v22.4s, v13.4s\n" - "prfm pldl1keep, [x27, x15]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v5.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v16.4s, v22.4s, v14.4s\n" - "subs x22, x22, #1\n" - "fmla v1.4s, v22.4s, v17.4s\n" - "fmla v19.4s, v22.4s, v20.4s\n" - "mov v22.16b, v21.16b\n" - "fmla v6.4s, v23.4s, v11.4s\n" - "fmla v2.4s, v23.4s, v13.4s\n" - "fmla v5.4s, v23.4s, v12.4s\n" - "fmla v1.4s, v23.4s, v14.4s\n" - "fmla v4.4s, v23.4s, v17.4s\n" - "fmla v22.4s, v23.4s, v20.4s\n" - "ldr q27, [x26, x28]\n" - "fmla v5.4s, v24.4s, v13.4s\n" - "fmla v0.4s, v29.4s, v10.4s\n" - "mov v23.16b, v21.16b\n" - "fmla v4.4s, v24.4s, v14.4s\n" - "mov v25.16b, v21.16b\n" - "mov v24.16b, v21.16b\n" - "fmla v15.4s, v26.4s, v9.4s\n" - "fmla v0.4s, v26.4s, v12.4s\n" - "fmla v16.4s, v26.4s, v10.4s\n" - "fmla v18.4s, v26.4s, v17.4s\n" - "fmla v3.4s, v27.4s, v8.4s\n" - "ldr q29, [x24, x23]\n" - "fmla v15.4s, v27.4s, v11.4s\n" - "fmla v2.4s, v27.4s, v9.4s\n" - "fmla v0.4s, v27.4s, v13.4s\n" - "fmla v16.4s, v27.4s, v12.4s\n" - "fmla v1.4s, v27.4s, v10.4s\n" - "fmla v18.4s, v27.4s, v14.4s\n" - "fmla v19.4s, v27.4s, v17.4s\n" - "fmla v23.4s, v27.4s, v20.4s\n" - "fmla v6.4s, v29.4s, v8.4s\n" - "ldr q28, [x9, x12]\n" - "fmla v2.4s, v29.4s, v11.4s\n" - "fmla v5.4s, v29.4s, v9.4s\n" - "fmla v16.4s, v29.4s, v13.4s\n" - "fmla v1.4s, v29.4s, v12.4s\n" - "fmla v4.4s, v29.4s, v10.4s\n" - "fmla v19.4s, v29.4s, v14.4s\n" - "fmla v22.4s, v29.4s, v17.4s\n" - "fmla v25.4s, v29.4s, v20.4s\n" - "fmla v5.4s, v28.4s, v11.4s\n" - "ldr q21, [%[inptr0], x14]\n" - "fmla v1.4s, v28.4s, v13.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v4.4s, v28.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "fmla v22.4s, v28.4s, v14.4s\n" - "ldr q26, [x27, %[input_col_stride1]]\n" - "fmla v0.4s, v26.4s, v9.4s\n" - "prfm pldl1keep, [%[inptr0], x8]\n" - "fmla v4.4s, v21.4s, v13.4s\n" - "ldr q21, [x10, x28]\n" - "fmla v18.4s, v26.4s, v10.4s\n" - "ldr q29, [x26, x23]\n" - "fmla v15.4s, v21.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x25]\n" - "fmla v0.4s, v21.4s, v11.4s\n" - "fmla v16.4s, v21.4s, v9.4s\n" - "fmla v18.4s, v21.4s, v12.4s\n" - "fmla v19.4s, v21.4s, v10.4s\n" - "fmla v23.4s, v21.4s, v17.4s\n" - "ldr q21, [x24, x12]\n" - "fmla v2.4s, v29.4s, v8.4s\n" - "fmla v16.4s, v29.4s, v11.4s\n" - "fmla v1.4s, v29.4s, v9.4s\n" - "fmla v18.4s, v29.4s, v13.4s\n" - "fmla v19.4s, v29.4s, v12.4s\n" - "fmla v22.4s, v29.4s, v10.4s\n" - "fmla v23.4s, v29.4s, v14.4s\n" - "fmla v25.4s, v29.4s, v17.4s\n" - "fmla v24.4s, v29.4s, v20.4s\n" - "ldr q28, [x9, x14]\n" - "fmla v5.4s, v21.4s, v8.4s\n" - "ldr q27, [x27, x28]\n" - "fmla v1.4s, v21.4s, v11.4s\n" - "add x9, x9, #16\n" - "fmla v4.4s, v21.4s, v9.4s\n" - "prfm pldl1keep, [x9, #64]\n" - "fmla v19.4s, v21.4s, v13.4s\n" - "prfm pldl1keep, [x9, x8]\n" - "fmla v22.4s, v21.4s, v12.4s\n" - "fmla v25.4s, v21.4s, v14.4s\n" - "fmla v4.4s, v28.4s, v11.4s\n" - "ldr q20, [x10, x23]\n" - "fmla v0.4s, v27.4s, v8.4s\n" - "fmla v18.4s, v27.4s, v9.4s\n" - "fmla v22.4s, v28.4s, v13.4s\n" - "ldr q26, [x26, x12]\n" - "fmla v23.4s, v27.4s, v10.4s\n" - "ldr q21, [x24, x14]\n" - "fmla v16.4s, v20.4s, v8.4s\n" - "add x24, x24, #16\n" - "fmla v18.4s, v20.4s, v11.4s\n" - "prfm pldl1keep, [x24, #64]\n" - "fmla v19.4s, v20.4s, v9.4s\n" - "prfm pldl1keep, [x24, x8]\n" - "fmla v23.4s, v20.4s, v12.4s\n" - "fmla v25.4s, v20.4s, v10.4s\n" - "fmla v24.4s, v20.4s, v17.4s\n" - "ldr q28, [x27, x23]\n" - "fmla v1.4s, v26.4s, v8.4s\n" - "ldr q20, [x10, x12]\n" - "fmla v19.4s, v26.4s, v11.4s\n" - "fmla v22.4s, v26.4s, v9.4s\n" - "fmla v23.4s, v26.4s, v13.4s\n" - "fmla v25.4s, v26.4s, v12.4s\n" - "fmla v24.4s, v26.4s, v14.4s\n" - "ldr q17, [x26, x14]\n" - "fmla v4.4s, v21.4s, v8.4s\n" - "ldr q26, [x27, x12]\n" - "fmla v22.4s, v21.4s, v11.4s\n" - "add x26, x26, #16\n" - "fmla v25.4s, v21.4s, v13.4s\n" - "ldr q27, [x10, x14]\n" - "fmla v18.4s, v28.4s, v8.4s\n" - "prfm pldl1keep, [x26, #64]\n" - "fmla v23.4s, v28.4s, v9.4s\n" - "add x10, x10, #16\n" - "fmla v24.4s, v28.4s, v10.4s\n" - "ldr q28, [x27, x14]\n" - "fmla v19.4s, v20.4s, v8.4s\n" - "ldr q21, [%[wbptr]]\n" - "fmla v23.4s, v20.4s, v11.4s\n" - "add x27, x27, #16\n" - "fmla v25.4s, v20.4s, v9.4s\n" - "fmla v24.4s, v20.4s, v12.4s\n" - "fmla v22.4s, v17.4s, v8.4s\n" - "ldr q20, [%[wbptr], #16]\n" - "fmla v23.4s, v26.4s, v8.4s\n" - "ldr q14, [%[wbptr], #32]\n" - "fmla v24.4s, v17.4s, v13.4s\n" - "movi v29.16b, #0\n" - "fmla v25.4s, v17.4s, v11.4s\n" - "ldr q17, [%[wbptr], #64]\n" - "fmax v7.4s, v7.4s, v29.4s\n" - "fmax v6.4s, v6.4s, v29.4s\n" - "fmla v24.4s, v26.4s, v9.4s\n" - "ldr q13, [%[wbptr], #48]\n" - "str q7, [%[outptr0]]\n" - "fmla v25.4s, v27.4s, v8.4s\n" - "str q6, [%[outptr0], %[output_col_stride1]]\n" - "fmax v5.4s, v5.4s, v29.4s\n" - "fmla v24.4s, v27.4s, v11.4s\n" - "ldr q12, [%[wbptr], #80]\n" - "str q5, [%[outptr0], x19]\n" - "fmax v4.4s, v4.4s, v29.4s\n" - "fmax v3.4s, v3.4s, v29.4s\n" - "ldr q10, [%[wbptr], #112]\n" - "str q4, [%[outptr0], x20]\n" - "fmla v24.4s, v28.4s, v8.4s\n" - "str q3, [x16]\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "ldr q11, [%[wbptr], #96]\n" - "str q2, [x16, %[output_col_stride1]]\n" - "fmax v22.4s, v22.4s, v29.4s\n" - "str q1, [x16, x19]\n" - "fmax v15.4s, v15.4s, v29.4s\n" - "str q22, [x16, x20]\n" - "fmax v16.4s, v16.4s, v29.4s\n" - "str q15, [x17]\n" - "fmax v19.4s, v19.4s, v29.4s\n" - "str q16, [x17, %[output_col_stride1]]\n" - "fmax v25.4s, v25.4s, v29.4s\n" - "str q19, [x17, x19]\n" - "fmax v0.4s, v0.4s, v29.4s\n" - "str q25, [x17, x20]\n" - "fmax v18.4s, v18.4s, v29.4s\n" - "str q0, [x7]\n" - "fmax v23.4s, v23.4s, v29.4s\n" - "str q18, [x7, %[output_col_stride1]]\n" - "fmax v24.4s, v24.4s, v29.4s\n" - "str q23, [x7, x19]\n" - "mov v7.16b, v21.16b\n" - "str q24, [x7, x20]\n" - "mov v3.16b, v21.16b\n" - "mov v6.16b, v21.16b\n" - "ldr q9, [%[wbptr], #128]\n" - "mov v15.16b, v21.16b\n" - "ldr q8, [%[wbptr], #144]\n" - "mov v2.16b, v21.16b\n" - "ldr q22, [%[inptr0]]\n" - "mov v5.16b, v21.16b\n" - "ldr q19, [x9]\n" - "mov v0.16b, v21.16b\n" - "ldr q23, [%[inptr0], %[input_col_stride1]]\n" - "mov v16.16b, v21.16b\n" - "ldr q18, [x24]\n" - "mov v1.16b, v21.16b\n" - "ldr q27, [x9, %[input_col_stride1]]\n" - "mov v4.16b, v21.16b\n" - "ldr q28, [%[inptr0], x28]\n" - "fmla v7.4s, v22.4s, v20.4s\n" - "ldr q25, [x26]\n" - "fmla v3.4s, v19.4s, v20.4s\n" - "ldr q22, [x24, %[input_col_stride1]]\n" - "fmla v6.4s, v23.4s, v20.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v7.4s, v19.4s, v17.4s\n" - "add x16, x16, #16\n" - "fmla v3.4s, v18.4s, v17.4s\n" - "add x17, x17, #16\n" - "fmla v15.4s, v18.4s, v20.4s\n" - "add x7, x7, #16\n" - "fmla v7.4s, v23.4s, v14.4s\n" - "fmla v3.4s, v27.4s, v14.4s\n" - "fmla v7.4s, v18.4s, v10.4s\n" - "fmla v7.4s, v27.4s, v12.4s\n" - "bne 2b\n" - "3:\n" - "mov v18.16b, v21.16b\n" - "ldr q23, [x9, x28]\n" - "mov v19.16b, v21.16b\n" - "prfm pldl1keep, [x9, x25]\n" - "fmla v6.4s, v27.4s, v17.4s\n" - "prfm pldl1keep, [%[inptr0], x11]\n" - "fmla v2.4s, v27.4s, v20.4s\n" - "ldr q24, [%[inptr0], x23]\n" - "fmla v7.4s, v28.4s, v13.4s\n" - "prfm pldl1keep, [x10, #64]\n" - "fmla v6.4s, v28.4s, v14.4s\n" - "prfm pldl1keep, [x26, x8]\n" - "fmla v5.4s, v28.4s, v20.4s\n" - "ldr q26, [x10]\n" - "fmla v3.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [x24, x25]\n" - "fmla v15.4s, v25.4s, v17.4s\n" - "prfm pldl1keep, [x9, x11]\n" - "fmla v0.4s, v25.4s, v20.4s\n" - "ldr q25, [x26, %[input_col_stride1]]\n" - "fmla v7.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [%[inptr0], x13]\n" - "fmla v3.4s, v22.4s, v12.4s\n" - "prfm pldl1keep, [x27, #64]\n" - "fmla v6.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [x10, x8]\n" - "fmla v15.4s, v22.4s, v14.4s\n" - "prfm pldl1keep, [x26, x25]\n" - "fmla v2.4s, v22.4s, v17.4s\n" - "prfm pldl1keep, [x24, x11]\n" - "fmla v16.4s, v22.4s, v20.4s\n" - "ldr q22, [x24, x28]\n" - "fmla v7.4s, v23.4s, v11.4s\n" - "prfm pldl1keep, [x9, x13]\n" - "fmla v3.4s, v23.4s, v13.4s\n" - "prfm pldl1keep, [%[inptr0], x15]\n" - "fmla v6.4s, v23.4s, v12.4s\n" - "prfm pldl1keep, [x27, x8]\n" - "fmla v2.4s, v23.4s, v14.4s\n" - "prfm pldl1keep, [x10, x25]\n" - "fmla v5.4s, v23.4s, v17.4s\n" - "prfm pldl1keep, [x26, x11]\n" - "fmla v1.4s, v23.4s, v20.4s\n" - "ldr q23, [x9, x23]\n" - "fmla v6.4s, v24.4s, v13.4s\n" - "prfm pldl1keep, [x24, x13]\n" - "fmla v5.4s, v24.4s, v14.4s\n" - "prfm pldl1keep, [x9, x15]\n" - "fmla v4.4s, v24.4s, v20.4s\n" - "ldr q24, [%[inptr0], x12]\n" - "fmla v15.4s, v26.4s, v10.4s\n" - "prfm pldl1keep, [x27, x25]\n" - "fmla v0.4s, v26.4s, v17.4s\n" - "ldr q29, [x27]\n" - "fmla v3.4s, v25.4s, v9.4s\n" - "prfm pldl1keep, [x10, x11]\n" - "fmla v15.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [x26, x13]\n" - "fmla v2.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [x24, x15]\n" - "fmla v0.4s, v25.4s, v14.4s\n" - "prfm pldl1keep, [x27, x11]\n" - "fmla v16.4s, v25.4s, v17.4s\n" - "prfm pldl1keep, [x10, x13]\n" - "fmla v18.4s, v25.4s, v20.4s\n" - "ldr q26, [x10, %[input_col_stride1]]\n" - "fmla v7.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [x26, x15]\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x27, x13]\n" - "fmla v6.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [x10, x15]\n" - "fmla v15.4s, v22.4s, v13.4s\n" - "prfm pldl1keep, [x27, x15]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v5.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v16.4s, v22.4s, v14.4s\n" - "fmla v1.4s, v22.4s, v17.4s\n" - "fmla v19.4s, v22.4s, v20.4s\n" - "ldr q27, [x26, x28]\n" - "fmla v6.4s, v23.4s, v11.4s\n" - "fmla v2.4s, v23.4s, v13.4s\n" - "fmla v5.4s, v23.4s, v12.4s\n" - "fmla v1.4s, v23.4s, v14.4s\n" - "fmla v4.4s, v23.4s, v17.4s\n" - "fmla v0.4s, v29.4s, v10.4s\n" - "mov v22.16b, v21.16b\n" - "fmla v15.4s, v26.4s, v9.4s\n" - "fmla v5.4s, v24.4s, v13.4s\n" - "fmla v16.4s, v26.4s, v10.4s\n" - "fmla v22.4s, v23.4s, v20.4s\n" - "ldr q29, [x24, x23]\n" - "fmla v4.4s, v24.4s, v14.4s\n" - "ldr q28, [x9, x12]\n" - "fmla v0.4s, v26.4s, v12.4s\n" - "fmla v18.4s, v26.4s, v17.4s\n" - "mov v23.16b, v21.16b\n" - "fmla v3.4s, v27.4s, v8.4s\n" - "fmla v15.4s, v27.4s, v11.4s\n" - "fmla v2.4s, v27.4s, v9.4s\n" - "fmla v0.4s, v27.4s, v13.4s\n" - "fmla v16.4s, v27.4s, v12.4s\n" - "fmla v1.4s, v27.4s, v10.4s\n" - "fmla v18.4s, v27.4s, v14.4s\n" - "fmla v19.4s, v27.4s, v17.4s\n" - "fmla v23.4s, v27.4s, v20.4s\n" - "mov v25.16b, v21.16b\n" - "mov v24.16b, v21.16b\n" - "fmla v6.4s, v29.4s, v8.4s\n" - "fmla v2.4s, v29.4s, v11.4s\n" - "fmla v5.4s, v29.4s, v9.4s\n" - "fmla v16.4s, v29.4s, v13.4s\n" - "fmla v1.4s, v29.4s, v12.4s\n" - "fmla v4.4s, v29.4s, v10.4s\n" - "fmla v19.4s, v29.4s, v14.4s\n" - "fmla v22.4s, v29.4s, v17.4s\n" - "fmla v25.4s, v29.4s, v20.4s\n" - "ldr q21, [%[inptr0], x14]\n" - "fmla v5.4s, v28.4s, v11.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v1.4s, v28.4s, v13.4s\n" - "fmla v4.4s, v28.4s, v12.4s\n" - "fmla v22.4s, v28.4s, v14.4s\n" - "ldr q26, [x27, %[input_col_stride1]]\n" - "fmla v0.4s, v26.4s, v9.4s\n" - "fmla v18.4s, v26.4s, v10.4s\n" - "fmla v4.4s, v21.4s, v13.4s\n" - "ldr q21, [x10, x28]\n" - "fmla v15.4s, v21.4s, v8.4s\n" - "ldr q29, [x26, x23]\n" - "fmla v0.4s, v21.4s, v11.4s\n" - "fmla v16.4s, v21.4s, v9.4s\n" - "fmla v18.4s, v21.4s, v12.4s\n" - "fmla v19.4s, v21.4s, v10.4s\n" - "fmla v23.4s, v21.4s, v17.4s\n" - "ldr q21, [x24, x12]\n" - "fmla v2.4s, v29.4s, v8.4s\n" - "fmla v16.4s, v29.4s, v11.4s\n" - "fmla v1.4s, v29.4s, v9.4s\n" - "fmla v18.4s, v29.4s, v13.4s\n" - "fmla v19.4s, v29.4s, v12.4s\n" - "fmla v22.4s, v29.4s, v10.4s\n" - "fmla v23.4s, v29.4s, v14.4s\n" - "fmla v25.4s, v29.4s, v17.4s\n" - "fmla v24.4s, v29.4s, v20.4s\n" - "ldr q28, [x9, x14]\n" - "fmla v5.4s, v21.4s, v8.4s\n" - "ldr q27, [x27, x28]\n" - "fmla v1.4s, v21.4s, v11.4s\n" - "add x9, x9, #16\n" - "fmla v4.4s, v21.4s, v9.4s\n" - "fmla v19.4s, v21.4s, v13.4s\n" - "fmla v22.4s, v21.4s, v12.4s\n" - "fmla v25.4s, v21.4s, v14.4s\n" - "fmla v0.4s, v27.4s, v8.4s\n" - "ldr q20, [x10, x23]\n" - "fmla v4.4s, v28.4s, v11.4s\n" - "fmla v18.4s, v27.4s, v9.4s\n" - "fmla v22.4s, v28.4s, v13.4s\n" - "ldr q26, [x26, x12]\n" - "fmla v23.4s, v27.4s, v10.4s\n" - "ldr q21, [x24, x14]\n" - "fmla v16.4s, v20.4s, v8.4s\n" - "add x24, x24, #16\n" - "fmla v18.4s, v20.4s, v11.4s\n" - "fmla v19.4s, v20.4s, v9.4s\n" - "fmla v23.4s, v20.4s, v12.4s\n" - "fmla v25.4s, v20.4s, v10.4s\n" - "fmla v24.4s, v20.4s, v17.4s\n" - "ldr q28, [x27, x23]\n" - "fmla v1.4s, v26.4s, v8.4s\n" - "ldr q20, [x10, x12]\n" - "fmla v19.4s, v26.4s, v11.4s\n" - "fmla v22.4s, v26.4s, v9.4s\n" - "fmla v23.4s, v26.4s, v13.4s\n" - "fmla v25.4s, v26.4s, v12.4s\n" - "fmla v24.4s, v26.4s, v14.4s\n" - "ldr q17, [x26, x14]\n" - "fmla v4.4s, v21.4s, v8.4s\n" - "ldr q26, [x27, x12]\n" - "fmla v22.4s, v21.4s, v11.4s\n" - "add x26, x26, #16\n" - "fmla v25.4s, v21.4s, v13.4s\n" - "ldr q27, [x10, x14]\n" - "fmla v18.4s, v28.4s, v8.4s\n" - "add x10, x10, #16\n" - "fmla v23.4s, v28.4s, v9.4s\n" - "fmla v24.4s, v28.4s, v10.4s\n" - "fmla v19.4s, v20.4s, v8.4s\n" - "ldr q28, [x27, x14]\n" - "fmla v25.4s, v20.4s, v9.4s\n" - "add x27, x27, #16\n" - "fmla v23.4s, v20.4s, v11.4s\n" - "fmla v24.4s, v20.4s, v12.4s\n" - "fmla v22.4s, v17.4s, v8.4s\n" - "movi v29.16b, #0\n" - "fmla v25.4s, v17.4s, v11.4s\n" - "fmla v24.4s, v17.4s, v13.4s\n" - "fmla v23.4s, v26.4s, v8.4s\n" - "fmax v7.4s, v7.4s, v29.4s\n" - "fmla v25.4s, v27.4s, v8.4s\n" - "fmax v6.4s, v6.4s, v29.4s\n" - "str q7, [%[outptr0]]\n" - "fmla v24.4s, v26.4s, v9.4s\n" - "str q6, [%[outptr0], %[output_col_stride1]]\n" - "fmax v5.4s, v5.4s, v29.4s\n" - "fmax v4.4s, v4.4s, v29.4s\n" - "fmax v3.4s, v3.4s, v29.4s\n" - "str q5, [%[outptr0], x19]\n" - "fmla v24.4s, v27.4s, v11.4s\n" - "str q4, [%[outptr0], x20]\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "str q3, [x16]\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "str q2, [x16, %[output_col_stride1]]\n" - "fmla v24.4s, v28.4s, v8.4s\n" - "str q1, [x16, x19]\n" - "fmax v22.4s, v22.4s, v29.4s\n" - "fmax v15.4s, v15.4s, v29.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "str q22, [x16, x20]\n" - "fmax v16.4s, v16.4s, v29.4s\n" - "str q15, [x17]\n" - "fmax v19.4s, v19.4s, v29.4s\n" - "str q16, [x17, %[output_col_stride1]]\n" - "fmax v25.4s, v25.4s, v29.4s\n" - "str q19, [x17, x19]\n" - "fmax v0.4s, v0.4s, v29.4s\n" - "str q25, [x17, x20]\n" - "fmax v18.4s, v18.4s, v29.4s\n" - "str q0, [x7]\n" - "fmax v23.4s, v23.4s, v29.4s\n" - "str q18, [x7, %[output_col_stride1]]\n" - "fmax v24.4s, v24.4s, v29.4s\n" - "str q23, [x7, x19]\n" - "add x16, x16, #16\n" - "str q24, [x7, x20]\n" - "add x17, x17, #16\n" - "add x7, x7, #16\n" - "4:\n" - "cbz x21, 7f\n" - "ldr s21, [%[wbptr]]\n" - "mov v7.16b, v21.16b\n" - "ldr s20, [%[wbptr], #4]\n" - "mov v3.16b, v21.16b\n" - "ldr s14, [%[wbptr], #8]\n" - "mov v6.16b, v21.16b\n" - "ldr s13, [%[wbptr], #12]\n" - "mov v15.16b, v21.16b\n" - "ldr s17, [%[wbptr], #16]\n" - "mov v2.16b, v21.16b\n" - "ldr s12, [%[wbptr], #20]\n" - "mov v5.16b, v21.16b\n" - "ldr s11, [%[wbptr], #24]\n" - "mov v0.16b, v21.16b\n" - "ldr s10, [%[wbptr], #28]\n" - "mov v16.16b, v21.16b\n" - "ldr s9, [%[wbptr], #32]\n" - "mov v1.16b, v21.16b\n" - "ldr s8, [%[wbptr], #36]\n" - "mov v4.16b, v21.16b\n" - "ldr s22, [%[inptr0]]\n" - "fmla v7.4s, v22.4s, v20.4s\n" - "ldr s19, [x9]\n" - "fmla v3.4s, v19.4s, v20.4s\n" - "ldr s23, [%[inptr0], %[input_col_stride1]]\n" - "fmla v6.4s, v23.4s, v20.4s\n" - "ldr s18, [x24]\n" - "fmla v7.4s, v19.4s, v17.4s\n" - "ldr s27, [x9, %[input_col_stride1]]\n" - "fmla v3.4s, v18.4s, v17.4s\n" - "ldr s28, [%[inptr0], x28]\n" - "fmla v15.4s, v18.4s, v20.4s\n" - "ldr s25, [x26]\n" - "fmla v7.4s, v23.4s, v14.4s\n" - "ldr s22, [x24, %[input_col_stride1]]\n" - "fmla v3.4s, v27.4s, v14.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "prfm pldl1keep, [x9, #64]\n" - "subs x21, x21, #1\n" - "prfm pldl1keep, [%[inptr0], x8]\n" - "prfm pldl1keep, [x24, #64]\n" - "fmla v7.4s, v18.4s, v10.4s\n" - "prfm pldl1keep, [x9, x8]\n" - "prfm pldl1keep, [%[inptr0], x25]\n" - "prfm pldl1keep, [x26, #64]\n" - "prfm pldl1keep, [x24, x8]\n" - "fmla v7.4s, v27.4s, v12.4s\n" - "beq 6f\n" - "5:\n" - "mov v18.16b, v21.16b\n" - "ldr s23, [x9, x28]\n" - "mov v19.16b, v21.16b\n" - "prfm pldl1keep, [x9, x25]\n" - "fmla v6.4s, v27.4s, v17.4s\n" - "prfm pldl1keep, [%[inptr0], x11]\n" - "fmla v2.4s, v27.4s, v20.4s\n" - "ldr s24, [%[inptr0], x23]\n" - "fmla v7.4s, v28.4s, v13.4s\n" - "prfm pldl1keep, [x10, #64]\n" - "fmla v6.4s, v28.4s, v14.4s\n" - "prfm pldl1keep, [x26, x8]\n" - "fmla v5.4s, v28.4s, v20.4s\n" - "ldr s26, [x10]\n" - "fmla v3.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [x24, x25]\n" - "fmla v15.4s, v25.4s, v17.4s\n" - "prfm pldl1keep, [x9, x11]\n" - "fmla v0.4s, v25.4s, v20.4s\n" - "ldr s25, [x26, %[input_col_stride1]]\n" - "fmla v7.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [%[inptr0], x13]\n" - "fmla v3.4s, v22.4s, v12.4s\n" - "prfm pldl1keep, [x27, #64]\n" - "fmla v6.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [x10, x8]\n" - "fmla v15.4s, v22.4s, v14.4s\n" - "prfm pldl1keep, [x26, x25]\n" - "fmla v2.4s, v22.4s, v17.4s\n" - "prfm pldl1keep, [x24, x11]\n" - "fmla v16.4s, v22.4s, v20.4s\n" - "ldr s22, [x24, x28]\n" - "fmla v7.4s, v23.4s, v11.4s\n" - "prfm pldl1keep, [x9, x13]\n" - "fmla v3.4s, v23.4s, v13.4s\n" - "prfm pldl1keep, [%[inptr0], x15]\n" - "fmla v6.4s, v23.4s, v12.4s\n" - "prfm pldl1keep, [x27, x8]\n" - "fmla v2.4s, v23.4s, v14.4s\n" - "prfm pldl1keep, [x10, x25]\n" - "fmla v5.4s, v23.4s, v17.4s\n" - "prfm pldl1keep, [x26, x11]\n" - "fmla v1.4s, v23.4s, v20.4s\n" - "ldr s23, [x9, x23]\n" - "fmla v6.4s, v24.4s, v13.4s\n" - "prfm pldl1keep, [x24, x13]\n" - "fmla v5.4s, v24.4s, v14.4s\n" - "prfm pldl1keep, [x9, x15]\n" - "fmla v4.4s, v24.4s, v20.4s\n" - "ldr s24, [%[inptr0], x12]\n" - "fmla v15.4s, v26.4s, v10.4s\n" - "prfm pldl1keep, [x27, x25]\n" - "fmla v0.4s, v26.4s, v17.4s\n" - "ldr s29, [x27]\n" - "fmla v3.4s, v25.4s, v9.4s\n" - "prfm pldl1keep, [x10, x11]\n" - "fmla v15.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [x26, x13]\n" - "fmla v2.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [x24, x15]\n" - "fmla v0.4s, v25.4s, v14.4s\n" - "prfm pldl1keep, [x27, x11]\n" - "fmla v16.4s, v25.4s, v17.4s\n" - "prfm pldl1keep, [x10, x13]\n" - "fmla v18.4s, v25.4s, v20.4s\n" - "ldr s26, [x10, %[input_col_stride1]]\n" - "fmla v7.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [x26, x15]\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x27, x13]\n" - "fmla v6.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [x10, x15]\n" - "fmla v15.4s, v22.4s, v13.4s\n" - "prfm pldl1keep, [x27, x15]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v5.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v16.4s, v22.4s, v14.4s\n" - "subs x21, x21, #1\n" - "fmla v1.4s, v22.4s, v17.4s\n" - "fmla v19.4s, v22.4s, v20.4s\n" - "mov v22.16b, v21.16b\n" - "fmla v6.4s, v23.4s, v11.4s\n" - "fmla v2.4s, v23.4s, v13.4s\n" - "fmla v5.4s, v23.4s, v12.4s\n" - "fmla v1.4s, v23.4s, v14.4s\n" - "fmla v4.4s, v23.4s, v17.4s\n" - "fmla v22.4s, v23.4s, v20.4s\n" - "ldr s27, [x26, x28]\n" - "fmla v5.4s, v24.4s, v13.4s\n" - "fmla v0.4s, v29.4s, v10.4s\n" - "mov v23.16b, v21.16b\n" - "fmla v4.4s, v24.4s, v14.4s\n" - "mov v25.16b, v21.16b\n" - "mov v24.16b, v21.16b\n" - "fmla v15.4s, v26.4s, v9.4s\n" - "fmla v0.4s, v26.4s, v12.4s\n" - "fmla v16.4s, v26.4s, v10.4s\n" - "fmla v18.4s, v26.4s, v17.4s\n" - "fmla v3.4s, v27.4s, v8.4s\n" - "ldr s29, [x24, x23]\n" - "fmla v15.4s, v27.4s, v11.4s\n" - "fmla v2.4s, v27.4s, v9.4s\n" - "fmla v0.4s, v27.4s, v13.4s\n" - "fmla v16.4s, v27.4s, v12.4s\n" - "fmla v1.4s, v27.4s, v10.4s\n" - "fmla v18.4s, v27.4s, v14.4s\n" - "fmla v19.4s, v27.4s, v17.4s\n" - "fmla v23.4s, v27.4s, v20.4s\n" - "fmla v6.4s, v29.4s, v8.4s\n" - "ldr s28, [x9, x12]\n" - "fmla v2.4s, v29.4s, v11.4s\n" - "fmla v5.4s, v29.4s, v9.4s\n" - "fmla v16.4s, v29.4s, v13.4s\n" - "fmla v1.4s, v29.4s, v12.4s\n" - "fmla v4.4s, v29.4s, v10.4s\n" - "fmla v19.4s, v29.4s, v14.4s\n" - "fmla v22.4s, v29.4s, v17.4s\n" - "fmla v25.4s, v29.4s, v20.4s\n" - "fmla v5.4s, v28.4s, v11.4s\n" - "ldr s21, [%[inptr0], x14]\n" - "fmla v1.4s, v28.4s, v13.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v4.4s, v28.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "fmla v22.4s, v28.4s, v14.4s\n" - "ldr s26, [x27, %[input_col_stride1]]\n" - "fmla v0.4s, v26.4s, v9.4s\n" - "prfm pldl1keep, [%[inptr0], x8]\n" - "fmla v4.4s, v21.4s, v13.4s\n" - "ldr s21, [x10, x28]\n" - "fmla v18.4s, v26.4s, v10.4s\n" - "ldr s29, [x26, x23]\n" - "fmla v15.4s, v21.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x25]\n" - "fmla v0.4s, v21.4s, v11.4s\n" - "fmla v16.4s, v21.4s, v9.4s\n" - "fmla v18.4s, v21.4s, v12.4s\n" - "fmla v19.4s, v21.4s, v10.4s\n" - "fmla v23.4s, v21.4s, v17.4s\n" - "ldr s21, [x24, x12]\n" - "fmla v2.4s, v29.4s, v8.4s\n" - "fmla v16.4s, v29.4s, v11.4s\n" - "fmla v1.4s, v29.4s, v9.4s\n" - "fmla v18.4s, v29.4s, v13.4s\n" - "fmla v19.4s, v29.4s, v12.4s\n" - "fmla v22.4s, v29.4s, v10.4s\n" - "fmla v23.4s, v29.4s, v14.4s\n" - "fmla v25.4s, v29.4s, v17.4s\n" - "fmla v24.4s, v29.4s, v20.4s\n" - "ldr s28, [x9, x14]\n" - "fmla v5.4s, v21.4s, v8.4s\n" - "ldr s27, [x27, x28]\n" - "fmla v1.4s, v21.4s, v11.4s\n" - "add x9, x9, #4\n" - "fmla v4.4s, v21.4s, v9.4s\n" - "prfm pldl1keep, [x9, #64]\n" - "fmla v19.4s, v21.4s, v13.4s\n" - "prfm pldl1keep, [x9, x8]\n" - "fmla v22.4s, v21.4s, v12.4s\n" - "fmla v25.4s, v21.4s, v14.4s\n" - "fmla v4.4s, v28.4s, v11.4s\n" - "ldr s20, [x10, x23]\n" - "fmla v0.4s, v27.4s, v8.4s\n" - "fmla v18.4s, v27.4s, v9.4s\n" - "fmla v22.4s, v28.4s, v13.4s\n" - "ldr s26, [x26, x12]\n" - "fmla v23.4s, v27.4s, v10.4s\n" - "ldr s21, [x24, x14]\n" - "fmla v16.4s, v20.4s, v8.4s\n" - "add x24, x24, #4\n" - "fmla v18.4s, v20.4s, v11.4s\n" - "prfm pldl1keep, [x24, #64]\n" - "fmla v19.4s, v20.4s, v9.4s\n" - "prfm pldl1keep, [x24, x8]\n" - "fmla v23.4s, v20.4s, v12.4s\n" - "fmla v25.4s, v20.4s, v10.4s\n" - "fmla v24.4s, v20.4s, v17.4s\n" - "ldr s28, [x27, x23]\n" - "fmla v1.4s, v26.4s, v8.4s\n" - "ldr s20, [x10, x12]\n" - "fmla v19.4s, v26.4s, v11.4s\n" - "fmla v22.4s, v26.4s, v9.4s\n" - "fmla v23.4s, v26.4s, v13.4s\n" - "fmla v25.4s, v26.4s, v12.4s\n" - "fmla v24.4s, v26.4s, v14.4s\n" - "ldr s17, [x26, x14]\n" - "fmla v4.4s, v21.4s, v8.4s\n" - "ldr s26, [x27, x12]\n" - "fmla v22.4s, v21.4s, v11.4s\n" - "add x26, x26, #4\n" - "fmla v25.4s, v21.4s, v13.4s\n" - "ldr s27, [x10, x14]\n" - "fmla v18.4s, v28.4s, v8.4s\n" - "prfm pldl1keep, [x26, #64]\n" - "fmla v23.4s, v28.4s, v9.4s\n" - "add x10, x10, #4\n" - "fmla v24.4s, v28.4s, v10.4s\n" - "ldr s28, [x27, x14]\n" - "fmla v19.4s, v20.4s, v8.4s\n" - "ldr s21, [%[wbptr]]\n" - "fmla v23.4s, v20.4s, v11.4s\n" - "add x27, x27, #4\n" - "fmla v25.4s, v20.4s, v9.4s\n" - "fmla v24.4s, v20.4s, v12.4s\n" - "fmla v22.4s, v17.4s, v8.4s\n" - "ldr s20, [%[wbptr], #4]\n" - "fmla v23.4s, v26.4s, v8.4s\n" - "ldr s14, [%[wbptr], #8]\n" - "fmla v24.4s, v17.4s, v13.4s\n" - "movi v29.16b, #0\n" - "fmla v25.4s, v17.4s, v11.4s\n" - "ldr s17, [%[wbptr], #16]\n" - "fmax v7.4s, v7.4s, v29.4s\n" - "fmax v6.4s, v6.4s, v29.4s\n" - "fmla v24.4s, v26.4s, v9.4s\n" - "ldr s13, [%[wbptr], #12]\n" - "str s7, [%[outptr0]]\n" - "fmla v25.4s, v27.4s, v8.4s\n" - "str s6, [%[outptr0], %[output_col_stride1]]\n" - "fmax v5.4s, v5.4s, v29.4s\n" - "fmla v24.4s, v27.4s, v11.4s\n" - "ldr s12, [%[wbptr], #20]\n" - "str s5, [%[outptr0], x19]\n" - "fmax v4.4s, v4.4s, v29.4s\n" - "fmax v3.4s, v3.4s, v29.4s\n" - "ldr s10, [%[wbptr], #28]\n" - "str s4, [%[outptr0], x20]\n" - "fmla v24.4s, v28.4s, v8.4s\n" - "str s3, [x16]\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "ldr s11, [%[wbptr], #24]\n" - "str s2, [x16, %[output_col_stride1]]\n" - "fmax v22.4s, v22.4s, v29.4s\n" - "str s1, [x16, x19]\n" - "fmax v15.4s, v15.4s, v29.4s\n" - "str s22, [x16, x20]\n" - "fmax v16.4s, v16.4s, v29.4s\n" - "str s15, [x17]\n" - "fmax v19.4s, v19.4s, v29.4s\n" - "str s16, [x17, %[output_col_stride1]]\n" - "fmax v25.4s, v25.4s, v29.4s\n" - "str s19, [x17, x19]\n" - "fmax v0.4s, v0.4s, v29.4s\n" - "str s25, [x17, x20]\n" - "fmax v18.4s, v18.4s, v29.4s\n" - "str s0, [x7]\n" - "fmax v23.4s, v23.4s, v29.4s\n" - "str s18, [x7, %[output_col_stride1]]\n" - "fmax v24.4s, v24.4s, v29.4s\n" - "str s23, [x7, x19]\n" - "mov v7.16b, v21.16b\n" - "str s24, [x7, x20]\n" - "mov v3.16b, v21.16b\n" - "mov v6.16b, v21.16b\n" - "ldr s9, [%[wbptr], #32]\n" - "mov v15.16b, v21.16b\n" - "ldr s8, [%[wbptr], #36]\n" - "mov v2.16b, v21.16b\n" - "ldr s22, [%[inptr0]]\n" - "mov v5.16b, v21.16b\n" - "ldr s19, [x9]\n" - "mov v0.16b, v21.16b\n" - "ldr s23, [%[inptr0], %[input_col_stride1]]\n" - "mov v16.16b, v21.16b\n" - "ldr s18, [x24]\n" - "mov v1.16b, v21.16b\n" - "ldr s27, [x9, %[input_col_stride1]]\n" - "mov v4.16b, v21.16b\n" - "ldr s28, [%[inptr0], x28]\n" - "fmla v7.4s, v22.4s, v20.4s\n" - "ldr s25, [x26]\n" - "fmla v3.4s, v19.4s, v20.4s\n" - "ldr s22, [x24, %[input_col_stride1]]\n" - "fmla v6.4s, v23.4s, v20.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v7.4s, v19.4s, v17.4s\n" - "add x16, x16, #4\n" - "fmla v3.4s, v18.4s, v17.4s\n" - "add x17, x17, #4\n" - "fmla v15.4s, v18.4s, v20.4s\n" - "add x7, x7, #4\n" - "fmla v7.4s, v23.4s, v14.4s\n" - "fmla v3.4s, v27.4s, v14.4s\n" - "fmla v7.4s, v18.4s, v10.4s\n" - "fmla v7.4s, v27.4s, v12.4s\n" - "bne 5b\n" - "6:\n" - "mov v18.16b, v21.16b\n" - "ldr s23, [x9, x28]\n" - "mov v19.16b, v21.16b\n" - "prfm pldl1keep, [x9, x25]\n" - "fmla v6.4s, v27.4s, v17.4s\n" - "prfm pldl1keep, [%[inptr0], x11]\n" - "fmla v2.4s, v27.4s, v20.4s\n" - "ldr s24, [%[inptr0], x23]\n" - "fmla v7.4s, v28.4s, v13.4s\n" - "prfm pldl1keep, [x10, #64]\n" - "fmla v6.4s, v28.4s, v14.4s\n" - "prfm pldl1keep, [x26, x8]\n" - "fmla v5.4s, v28.4s, v20.4s\n" - "ldr s26, [x10]\n" - "fmla v3.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [x24, x25]\n" - "fmla v15.4s, v25.4s, v17.4s\n" - "prfm pldl1keep, [x9, x11]\n" - "fmla v0.4s, v25.4s, v20.4s\n" - "ldr s25, [x26, %[input_col_stride1]]\n" - "fmla v7.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [%[inptr0], x13]\n" - "fmla v3.4s, v22.4s, v12.4s\n" - "prfm pldl1keep, [x27, #64]\n" - "fmla v6.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [x10, x8]\n" - "fmla v15.4s, v22.4s, v14.4s\n" - "prfm pldl1keep, [x26, x25]\n" - "fmla v2.4s, v22.4s, v17.4s\n" - "prfm pldl1keep, [x24, x11]\n" - "fmla v16.4s, v22.4s, v20.4s\n" - "ldr s22, [x24, x28]\n" - "fmla v7.4s, v23.4s, v11.4s\n" - "prfm pldl1keep, [x9, x13]\n" - "fmla v3.4s, v23.4s, v13.4s\n" - "prfm pldl1keep, [%[inptr0], x15]\n" - "fmla v6.4s, v23.4s, v12.4s\n" - "prfm pldl1keep, [x27, x8]\n" - "fmla v2.4s, v23.4s, v14.4s\n" - "prfm pldl1keep, [x10, x25]\n" - "fmla v5.4s, v23.4s, v17.4s\n" - "prfm pldl1keep, [x26, x11]\n" - "fmla v1.4s, v23.4s, v20.4s\n" - "ldr s23, [x9, x23]\n" - "fmla v6.4s, v24.4s, v13.4s\n" - "prfm pldl1keep, [x24, x13]\n" - "fmla v5.4s, v24.4s, v14.4s\n" - "prfm pldl1keep, [x9, x15]\n" - "fmla v4.4s, v24.4s, v20.4s\n" - "ldr s24, [%[inptr0], x12]\n" - "fmla v15.4s, v26.4s, v10.4s\n" - "prfm pldl1keep, [x27, x25]\n" - "fmla v0.4s, v26.4s, v17.4s\n" - "ldr s29, [x27]\n" - "fmla v3.4s, v25.4s, v9.4s\n" - "prfm pldl1keep, [x10, x11]\n" - "fmla v15.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [x26, x13]\n" - "fmla v2.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [x24, x15]\n" - "fmla v0.4s, v25.4s, v14.4s\n" - "prfm pldl1keep, [x27, x11]\n" - "fmla v16.4s, v25.4s, v17.4s\n" - "prfm pldl1keep, [x10, x13]\n" - "fmla v18.4s, v25.4s, v20.4s\n" - "ldr s26, [x10, %[input_col_stride1]]\n" - "fmla v7.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [x26, x15]\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x27, x13]\n" - "fmla v6.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [x10, x15]\n" - "fmla v15.4s, v22.4s, v13.4s\n" - "prfm pldl1keep, [x27, x15]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v5.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v16.4s, v22.4s, v14.4s\n" - "fmla v1.4s, v22.4s, v17.4s\n" - "fmla v19.4s, v22.4s, v20.4s\n" - "ldr s27, [x26, x28]\n" - "fmla v6.4s, v23.4s, v11.4s\n" - "fmla v2.4s, v23.4s, v13.4s\n" - "fmla v5.4s, v23.4s, v12.4s\n" - "fmla v1.4s, v23.4s, v14.4s\n" - "fmla v4.4s, v23.4s, v17.4s\n" - "fmla v0.4s, v29.4s, v10.4s\n" - "mov v22.16b, v21.16b\n" - "fmla v15.4s, v26.4s, v9.4s\n" - "fmla v5.4s, v24.4s, v13.4s\n" - "fmla v16.4s, v26.4s, v10.4s\n" - "fmla v22.4s, v23.4s, v20.4s\n" - "ldr s29, [x24, x23]\n" - "fmla v4.4s, v24.4s, v14.4s\n" - "ldr s28, [x9, x12]\n" - "fmla v0.4s, v26.4s, v12.4s\n" - "fmla v18.4s, v26.4s, v17.4s\n" - "mov v23.16b, v21.16b\n" - "fmla v3.4s, v27.4s, v8.4s\n" - "fmla v15.4s, v27.4s, v11.4s\n" - "fmla v2.4s, v27.4s, v9.4s\n" - "fmla v0.4s, v27.4s, v13.4s\n" - "fmla v16.4s, v27.4s, v12.4s\n" - "fmla v1.4s, v27.4s, v10.4s\n" - "fmla v18.4s, v27.4s, v14.4s\n" - "fmla v19.4s, v27.4s, v17.4s\n" - "fmla v23.4s, v27.4s, v20.4s\n" - "mov v25.16b, v21.16b\n" - "mov v24.16b, v21.16b\n" - "fmla v6.4s, v29.4s, v8.4s\n" - "fmla v2.4s, v29.4s, v11.4s\n" - "fmla v5.4s, v29.4s, v9.4s\n" - "fmla v16.4s, v29.4s, v13.4s\n" - "fmla v1.4s, v29.4s, v12.4s\n" - "fmla v4.4s, v29.4s, v10.4s\n" - "fmla v19.4s, v29.4s, v14.4s\n" - "fmla v22.4s, v29.4s, v17.4s\n" - "fmla v25.4s, v29.4s, v20.4s\n" - "ldr s21, [%[inptr0], x14]\n" - "fmla v5.4s, v28.4s, v11.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v1.4s, v28.4s, v13.4s\n" - "fmla v4.4s, v28.4s, v12.4s\n" - "fmla v22.4s, v28.4s, v14.4s\n" - "ldr s26, [x27, %[input_col_stride1]]\n" - "fmla v0.4s, v26.4s, v9.4s\n" - "fmla v18.4s, v26.4s, v10.4s\n" - "fmla v4.4s, v21.4s, v13.4s\n" - "ldr s21, [x10, x28]\n" - "fmla v15.4s, v21.4s, v8.4s\n" - "ldr s29, [x26, x23]\n" - "fmla v0.4s, v21.4s, v11.4s\n" - "fmla v16.4s, v21.4s, v9.4s\n" - "fmla v18.4s, v21.4s, v12.4s\n" - "fmla v19.4s, v21.4s, v10.4s\n" - "fmla v23.4s, v21.4s, v17.4s\n" - "ldr s21, [x24, x12]\n" - "fmla v2.4s, v29.4s, v8.4s\n" - "fmla v16.4s, v29.4s, v11.4s\n" - "fmla v1.4s, v29.4s, v9.4s\n" - "fmla v18.4s, v29.4s, v13.4s\n" - "fmla v19.4s, v29.4s, v12.4s\n" - "fmla v22.4s, v29.4s, v10.4s\n" - "fmla v23.4s, v29.4s, v14.4s\n" - "fmla v25.4s, v29.4s, v17.4s\n" - "fmla v24.4s, v29.4s, v20.4s\n" - "ldr s28, [x9, x14]\n" - "fmla v5.4s, v21.4s, v8.4s\n" - "ldr s27, [x27, x28]\n" - "fmla v1.4s, v21.4s, v11.4s\n" - "add x9, x9, #4\n" - "fmla v4.4s, v21.4s, v9.4s\n" - "fmla v19.4s, v21.4s, v13.4s\n" - "fmla v22.4s, v21.4s, v12.4s\n" - "fmla v25.4s, v21.4s, v14.4s\n" - "fmla v0.4s, v27.4s, v8.4s\n" - "ldr s20, [x10, x23]\n" - "fmla v4.4s, v28.4s, v11.4s\n" - "fmla v18.4s, v27.4s, v9.4s\n" - "fmla v22.4s, v28.4s, v13.4s\n" - "ldr s26, [x26, x12]\n" - "fmla v23.4s, v27.4s, v10.4s\n" - "ldr s21, [x24, x14]\n" - "fmla v16.4s, v20.4s, v8.4s\n" - "add x24, x24, #4\n" - "fmla v18.4s, v20.4s, v11.4s\n" - "fmla v19.4s, v20.4s, v9.4s\n" - "fmla v23.4s, v20.4s, v12.4s\n" - "fmla v25.4s, v20.4s, v10.4s\n" - "fmla v24.4s, v20.4s, v17.4s\n" - "ldr s28, [x27, x23]\n" - "fmla v1.4s, v26.4s, v8.4s\n" - "ldr s20, [x10, x12]\n" - "fmla v19.4s, v26.4s, v11.4s\n" - "fmla v22.4s, v26.4s, v9.4s\n" - "fmla v23.4s, v26.4s, v13.4s\n" - "fmla v25.4s, v26.4s, v12.4s\n" - "fmla v24.4s, v26.4s, v14.4s\n" - "ldr s17, [x26, x14]\n" - "fmla v4.4s, v21.4s, v8.4s\n" - "ldr s26, [x27, x12]\n" - "fmla v22.4s, v21.4s, v11.4s\n" - "add x26, x26, #4\n" - "fmla v25.4s, v21.4s, v13.4s\n" - "ldr s27, [x10, x14]\n" - "fmla v18.4s, v28.4s, v8.4s\n" - "add x10, x10, #4\n" - "fmla v23.4s, v28.4s, v9.4s\n" - "fmla v24.4s, v28.4s, v10.4s\n" - "fmla v19.4s, v20.4s, v8.4s\n" - "ldr s28, [x27, x14]\n" - "fmla v25.4s, v20.4s, v9.4s\n" - "add x27, x27, #4\n" - "fmla v23.4s, v20.4s, v11.4s\n" - "fmla v24.4s, v20.4s, v12.4s\n" - "fmla v22.4s, v17.4s, v8.4s\n" - "movi v29.16b, #0\n" - "fmla v25.4s, v17.4s, v11.4s\n" - "fmla v24.4s, v17.4s, v13.4s\n" - "fmla v23.4s, v26.4s, v8.4s\n" - "fmax v7.4s, v7.4s, v29.4s\n" - "fmla v25.4s, v27.4s, v8.4s\n" - "fmax v6.4s, v6.4s, v29.4s\n" - "str s7, [%[outptr0]]\n" - "fmla v24.4s, v26.4s, v9.4s\n" - "str s6, [%[outptr0], %[output_col_stride1]]\n" - "fmax v5.4s, v5.4s, v29.4s\n" - "fmax v4.4s, v4.4s, v29.4s\n" - "fmax v3.4s, v3.4s, v29.4s\n" - "str s5, [%[outptr0], x19]\n" - "fmla v24.4s, v27.4s, v11.4s\n" - "str s4, [%[outptr0], x20]\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "str s3, [x16]\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "str s2, [x16, %[output_col_stride1]]\n" - "fmla v24.4s, v28.4s, v8.4s\n" - "str s1, [x16, x19]\n" - "fmax v22.4s, v22.4s, v29.4s\n" - "fmax v15.4s, v15.4s, v29.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "str s22, [x16, x20]\n" - "fmax v16.4s, v16.4s, v29.4s\n" - "str s15, [x17]\n" - "fmax v19.4s, v19.4s, v29.4s\n" - "str s16, [x17, %[output_col_stride1]]\n" - "fmax v25.4s, v25.4s, v29.4s\n" - "str s19, [x17, x19]\n" - "fmax v0.4s, v0.4s, v29.4s\n" - "str s25, [x17, x20]\n" - "fmax v18.4s, v18.4s, v29.4s\n" - "str s0, [x7]\n" - "fmax v23.4s, v23.4s, v29.4s\n" - "str s18, [x7, %[output_col_stride1]]\n" - "fmax v24.4s, v24.4s, v29.4s\n" - "str s23, [x7, x19]\n" - "add x16, x16, #4\n" - "str s24, [x7, x20]\n" - "add x17, x17, #4\n" - "add x7, x7, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input) - : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile<ActivationFunction::ReLU>( - int n_channels, - const void *weight_bias_ptr, - const float *inptrs[6][6], - float *outptrs[4][4] -) -{ - __asm __volatile( - "mov x27, xzr\n" - "mov x28, xzr\n" - "and x19, %[n_channels], #3\n" - "lsr x26, %[n_channels], #2\n" - "cbz x26, 4f\n" - "1:\n" - "ldr q25, [%[wbptr]]\n" - "ldr x25, [%[inptrs], 0]\n" - "mov v2.16b, v25.16b\n" - "ldr q22, [%[wbptr], #16]\n" - "mov v16.16b, v25.16b\n" - "ldr q9, [%[wbptr], #32]\n" - "mov v18.16b, v25.16b\n" - "ldr q8, [%[wbptr], #48]\n" - "mov v13.16b, v25.16b\n" - "ldr q19, [%[wbptr], #64]\n" - "mov v0.16b, v25.16b\n" - "ldr q7, [%[wbptr], #80]\n" - "mov v17.16b, v25.16b\n" - "ldr q6, [%[wbptr], #96]\n" - "mov v14.16b, v25.16b\n" - "ldr q5, [%[wbptr], #112]\n" - "mov v12.16b, v25.16b\n" - "ldr q4, [%[wbptr], #128]\n" - "mov v15.16b, v25.16b\n" - "ldr q3, [%[wbptr], #144]\n" - "ldr q27, [x25, x27]\n" - "ldr x17, [%[inptrs], 48]\n" - "fmla v2.4s, v27.4s, v22.4s\n" - "ldr x25, [%[inptrs], 8]\n" - "ldr q26, [x17, x27]\n" - "ldr x24, [%[inptrs], 96]\n" - "fmla v16.4s, v26.4s, v22.4s\n" - "ldr q31, [x25, x27]\n" - "ldr q28, [x24, x27]\n" - "ldr x17, [%[inptrs], 56]\n" - "fmla v2.4s, v26.4s, v19.4s\n" - "ldr x25, [%[inptrs], 16]\n" - "ldr q29, [x17, x27]\n" - "ldr x7, [%[inptrs], 144]\n" - "ldr x24, [%[inptrs], 104]\n" - "subs x26, x26, #1\n" - "ldr q30, [x25, x27]\n" - "ldr q27, [x7, x27]\n" - "ldr q21, [x24, x27]\n" - "fmla v2.4s, v31.4s, v9.4s\n" - "beq 3f\n" - "2:\n" - "mov v1.16b, v25.16b\n" - "ldr x17, [%[inptrs], 64]\n" - "mov v10.16b, v25.16b\n" - "ldr x25, [%[inptrs], 24]\n" - "fmla v18.4s, v31.4s, v22.4s\n" - "ldr q23, [x17, x27]\n" - "fmla v2.4s, v28.4s, v5.4s\n" - "ldr x15, [%[inptrs], 192]\n" - "fmla v16.4s, v28.4s, v19.4s\n" - "ldr x7, [%[inptrs], 152]\n" - "fmla v13.4s, v28.4s, v22.4s\n" - "ldr q26, [x25, x27]\n" - "fmla v18.4s, v29.4s, v19.4s\n" - "ldr x24, [%[inptrs], 112]\n" - "fmla v2.4s, v29.4s, v7.4s\n" - "ldr x17, [%[inptrs], 72]\n" - "fmla v16.4s, v29.4s, v9.4s\n" - "ldr x25, [%[inptrs], 32]\n" - "fmla v0.4s, v29.4s, v22.4s\n" - "ldr q28, [x15, x27]\n" - "fmla v18.4s, v30.4s, v9.4s\n" - "ldr x16, [%[inptrs], 240]\n" - "fmla v2.4s, v30.4s, v8.4s\n" - "ldr x15, [%[inptrs], 200]\n" - "fmla v17.4s, v30.4s, v22.4s\n" - "ldr q29, [x7, x27]\n" - "fmla v16.4s, v27.4s, v5.4s\n" - "ldr x7, [%[inptrs], 160]\n" - "fmla v13.4s, v27.4s, v19.4s\n" - "ldr x20, [%[outptrs], 0]\n" - "fmla v14.4s, v27.4s, v22.4s\n" - "ldr q20, [x24, x27]\n" - "fmla v2.4s, v21.4s, v4.4s\n" - "ldr x24, [%[inptrs], 120]\n" - "fmla v16.4s, v21.4s, v7.4s\n" - "ldr x21, [%[outptrs], 32]\n" - "fmla v18.4s, v21.4s, v5.4s\n" - "ldr x22, [%[outptrs], 64]\n" - "fmla v13.4s, v21.4s, v9.4s\n" - "ldr x23, [%[outptrs], 96]\n" - "fmla v0.4s, v21.4s, v19.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v12.4s, v21.4s, v22.4s\n" - "ldr q24, [x17, x27]\n" - "fmla v2.4s, v23.4s, v6.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v16.4s, v23.4s, v8.4s\n" - "ldr x17, [%[inptrs], 80]\n" - "fmla v18.4s, v23.4s, v7.4s\n" - "subs x26, x26, #1\n" - "fmla v0.4s, v23.4s, v9.4s\n" - "fmla v17.4s, v23.4s, v19.4s\n" - "fmla v15.4s, v23.4s, v22.4s\n" - "ldr q23, [x25, x27]\n" - "fmla v1.4s, v26.4s, v22.4s\n" - "ldr x25, [%[inptrs], 40]\n" - "fmla v18.4s, v26.4s, v8.4s\n" - "fmla v13.4s, v28.4s, v5.4s\n" - "fmla v17.4s, v26.4s, v9.4s\n" - "ldr q30, [x16, x27]\n" - "fmla v14.4s, v28.4s, v19.4s\n" - "ldr q26, [x15, x27]\n" - "fmla v16.4s, v29.4s, v4.4s\n" - "ldr x16, [%[inptrs], 248]\n" - "fmla v13.4s, v29.4s, v7.4s\n" - "ldr x15, [%[inptrs], 208]\n" - "fmla v0.4s, v29.4s, v5.4s\n" - "fmla v12.4s, v29.4s, v19.4s\n" - "fmla v14.4s, v29.4s, v9.4s\n" - "fmla v10.4s, v29.4s, v22.4s\n" - "mov v11.16b, v25.16b\n" - "fmla v2.4s, v20.4s, v3.4s\n" - "fmla v16.4s, v20.4s, v6.4s\n" - "fmla v18.4s, v20.4s, v4.4s\n" - "fmla v13.4s, v20.4s, v8.4s\n" - "fmla v0.4s, v20.4s, v7.4s\n" - "fmla v17.4s, v20.4s, v5.4s\n" - "fmla v12.4s, v20.4s, v9.4s\n" - "fmla v15.4s, v20.4s, v19.4s\n" - "fmla v11.4s, v20.4s, v22.4s\n" - "mov v21.16b, v25.16b\n" - "fmla v18.4s, v24.4s, v6.4s\n" - "fmla v0.4s, v24.4s, v8.4s\n" - "fmla v1.4s, v24.4s, v19.4s\n" - "fmla v17.4s, v24.4s, v7.4s\n" - "fmla v14.4s, v30.4s, v5.4s\n" - "mov v20.16b, v25.16b\n" - "fmla v15.4s, v24.4s, v9.4s\n" - "fmla v21.4s, v24.4s, v22.4s\n" - "ldr q27, [x7, x27]\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "ldr x7, [%[inptrs], 168]\n" - "fmla v17.4s, v23.4s, v8.4s\n" - "ldr q30, [x24, x27]\n" - "fmla v13.4s, v26.4s, v4.4s\n" - "ldr x24, [%[inptrs], 128]\n" - "fmla v14.4s, v26.4s, v7.4s\n" - "fmla v12.4s, v26.4s, v5.4s\n" - "fmla v10.4s, v26.4s, v19.4s\n" - "ldr q31, [x17, x27]\n" - "fmla v16.4s, v27.4s, v3.4s\n" - "ldr x17, [%[inptrs], 88]\n" - "fmla v13.4s, v27.4s, v6.4s\n" - "fmla v0.4s, v27.4s, v4.4s\n" - "fmla v14.4s, v27.4s, v8.4s\n" - "fmla v12.4s, v27.4s, v7.4s\n" - "fmla v15.4s, v27.4s, v5.4s\n" - "fmla v10.4s, v27.4s, v9.4s\n" - "fmla v11.4s, v27.4s, v19.4s\n" - "fmla v20.4s, v27.4s, v22.4s\n" - "mov v24.16b, v25.16b\n" - "mov v23.16b, v25.16b\n" - "fmla v18.4s, v30.4s, v3.4s\n" - "fmla v0.4s, v30.4s, v6.4s\n" - "fmla v17.4s, v30.4s, v4.4s\n" - "fmla v12.4s, v30.4s, v8.4s\n" - "fmla v15.4s, v30.4s, v7.4s\n" - "fmla v1.4s, v30.4s, v5.4s\n" - "fmla v11.4s, v30.4s, v9.4s\n" - "fmla v21.4s, v30.4s, v19.4s\n" - "fmla v24.4s, v30.4s, v22.4s\n" - "ldr q25, [x25, x27]\n" - "fmla v17.4s, v31.4s, v6.4s\n" - "ldr x25, [%[inptrs], 0]\n" - "fmla v15.4s, v31.4s, v8.4s\n" - "fmla v1.4s, v31.4s, v7.4s\n" - "fmla v21.4s, v31.4s, v9.4s\n" - "ldr q26, [x16, x27]\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "ldr x16, [%[inptrs], 256]\n" - "fmla v10.4s, v26.4s, v5.4s\n" - "ldr q31, [x15, x27]\n" - "fmla v1.4s, v25.4s, v8.4s\n" - "ldr q29, [x7, x27]\n" - "fmla v13.4s, v31.4s, v3.4s\n" - "ldr x15, [%[inptrs], 216]\n" - "fmla v14.4s, v31.4s, v6.4s\n" - "ldr x7, [%[inptrs], 176]\n" - "fmla v12.4s, v31.4s, v4.4s\n" - "fmla v10.4s, v31.4s, v7.4s\n" - "fmla v11.4s, v31.4s, v5.4s\n" - "fmla v20.4s, v31.4s, v19.4s\n" - "fmla v0.4s, v29.4s, v3.4s\n" - "ldr q28, [x24, x27]\n" - "fmla v15.4s, v29.4s, v4.4s\n" - "ldr x24, [%[inptrs], 136]\n" - "fmla v12.4s, v29.4s, v6.4s\n" - "fmla v10.4s, v29.4s, v8.4s\n" - "fmla v11.4s, v29.4s, v7.4s\n" - "fmla v21.4s, v29.4s, v5.4s\n" - "fmla v20.4s, v29.4s, v9.4s\n" - "fmla v24.4s, v29.4s, v19.4s\n" - "fmla v23.4s, v29.4s, v22.4s\n" - "ldr q25, [x17, x27]\n" - "fmla v17.4s, v28.4s, v3.4s\n" - "ldr q29, [x16, x27]\n" - "fmla v15.4s, v28.4s, v6.4s\n" - "ldr x16, [%[inptrs], 264]\n" - "fmla v1.4s, v28.4s, v4.4s\n" - "ldr x17, [%[inptrs], 48]\n" - "fmla v11.4s, v28.4s, v8.4s\n" - "fmla v21.4s, v28.4s, v7.4s\n" - "fmla v24.4s, v28.4s, v9.4s\n" - "ldr q22, [x15, x27]\n" - "fmla v14.4s, v29.4s, v3.4s\n" - "ldr x15, [%[inptrs], 224]\n" - "fmla v1.4s, v25.4s, v6.4s\n" - "fmla v10.4s, v29.4s, v4.4s\n" - "fmla v21.4s, v25.4s, v8.4s\n" - "ldr q27, [x7, x27]\n" - "fmla v20.4s, v29.4s, v5.4s\n" - "ldr q26, [x24, x27]\n" - "fmla v12.4s, v22.4s, v3.4s\n" - "ldr x7, [%[inptrs], 184]\n" - "fmla v10.4s, v22.4s, v6.4s\n" - "ldr x24, [%[inptrs], 96]\n" - "fmla v11.4s, v22.4s, v4.4s\n" - "fmla v24.4s, v22.4s, v5.4s\n" - "fmla v20.4s, v22.4s, v7.4s\n" - "fmla v23.4s, v22.4s, v19.4s\n" - "fmla v15.4s, v27.4s, v3.4s\n" - "ldr q25, [x16, x27]\n" - "fmla v21.4s, v27.4s, v4.4s\n" - "ldr q31, [x15, x27]\n" - "fmla v11.4s, v27.4s, v6.4s\n" - "ldr x16, [%[inptrs], 272]\n" - "fmla v20.4s, v27.4s, v8.4s\n" - "ldr x15, [%[inptrs], 232]\n" - "fmla v24.4s, v27.4s, v7.4s\n" - "fmla v23.4s, v27.4s, v9.4s\n" - "fmla v1.4s, v26.4s, v3.4s\n" - "ldr q22, [x7, x27]\n" - "fmla v21.4s, v26.4s, v6.4s\n" - "ldr q19, [x16, x27]\n" - "fmla v10.4s, v25.4s, v3.4s\n" - "ldr x16, [%[inptrs], 280]\n" - "fmla v24.4s, v26.4s, v8.4s\n" - "ldr q28, [x15, x27]\n" - "fmla v20.4s, v25.4s, v4.4s\n" - "ldr x7, [%[inptrs], 144]\n" - "fmla v23.4s, v25.4s, v5.4s\n" - "ldr q30, [x16, x27]\n" - "fmla v11.4s, v31.4s, v3.4s\n" - "add x27, x27, #16\n" - "fmla v24.4s, v31.4s, v4.4s\n" - "ldr q27, [x25, x27]\n" - "fmla v20.4s, v31.4s, v6.4s\n" - "ldr x25, [%[inptrs], 8]\n" - "fmla v23.4s, v31.4s, v7.4s\n" - "movi v29.16b, #0\n" - "fmla v21.4s, v22.4s, v3.4s\n" - "ldr q26, [x17, x27]\n" - "fmla v24.4s, v22.4s, v6.4s\n" - "ldr x17, [%[inptrs], 56]\n" - "fmla v20.4s, v19.4s, v3.4s\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "fmla v23.4s, v22.4s, v8.4s\n" - "ldr q25, [%[wbptr]]\n" - "fmax v18.4s, v18.4s, v29.4s\n" - "ldr q22, [%[wbptr], #16]\n" - "str q2, [x20, x28]\n" - "fmla v24.4s, v28.4s, v3.4s\n" - "fmax v17.4s, v17.4s, v29.4s\n" - "ldr q9, [%[wbptr], #32]\n" - "fmla v23.4s, v19.4s, v4.4s\n" - "ldr q8, [%[wbptr], #48]\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "ldr q19, [%[wbptr], #64]\n" - "fmax v16.4s, v16.4s, v29.4s\n" - "ldr x20, [%[outptrs], 8]\n" - "fmax v0.4s, v0.4s, v29.4s\n" - "fmax v15.4s, v15.4s, v29.4s\n" - "str q18, [x20, x28]\n" - "fmla v23.4s, v28.4s, v6.4s\n" - "str q16, [x21, x28]\n" - "fmax v21.4s, v21.4s, v29.4s\n" - "fmax v13.4s, v13.4s, v29.4s\n" - "ldr q7, [%[wbptr], #80]\n" - "fmax v12.4s, v12.4s, v29.4s\n" - "ldr q5, [%[wbptr], #112]\n" - "fmla v23.4s, v30.4s, v3.4s\n" - "ldr q6, [%[wbptr], #96]\n" - "str q13, [x22, x28]\n" - "fmax v11.4s, v11.4s, v29.4s\n" - "fmax v24.4s, v24.4s, v29.4s\n" - "ldr q4, [%[wbptr], #128]\n" - "fmax v14.4s, v14.4s, v29.4s\n" - "ldr q31, [x25, x27]\n" - "fmax v10.4s, v10.4s, v29.4s\n" - "ldr q3, [%[wbptr], #144]\n" - "fmax v20.4s, v20.4s, v29.4s\n" - "ldr q28, [x24, x27]\n" - "str q14, [x23, x28]\n" - "fmax v23.4s, v23.4s, v29.4s\n" - "mov v2.16b, v25.16b\n" - "ldr q29, [x17, x27]\n" - "ldr x20, [%[outptrs], 16]\n" - "ldr x21, [%[outptrs], 40]\n" - "ldr x22, [%[outptrs], 72]\n" - "ldr x23, [%[outptrs], 104]\n" - "ldr x25, [%[inptrs], 16]\n" - "ldr x24, [%[inptrs], 104]\n" - "str q17, [x20, x28]\n" - "mov v16.16b, v25.16b\n" - "str q0, [x21, x28]\n" - "mov v18.16b, v25.16b\n" - "str q12, [x22, x28]\n" - "mov v13.16b, v25.16b\n" - "str q10, [x23, x28]\n" - "mov v0.16b, v25.16b\n" - "fmla v2.4s, v27.4s, v22.4s\n" - "ldr q30, [x25, x27]\n" - "fmla v16.4s, v26.4s, v22.4s\n" - "ldr x20, [%[outptrs], 24]\n" - "mov v17.16b, v25.16b\n" - "ldr x21, [%[outptrs], 48]\n" - "str q1, [x20, x28]\n" - "mov v14.16b, v25.16b\n" - "str q15, [x21, x28]\n" - "mov v12.16b, v25.16b\n" - "mov v15.16b, v25.16b\n" - "ldr x21, [%[outptrs], 56]\n" - "fmla v2.4s, v26.4s, v19.4s\n" - "ldr q27, [x7, x27]\n" - "str q21, [x21, x28]\n" - "ldr x22, [%[outptrs], 80]\n" - "ldr q21, [x24, x27]\n" - "ldr x23, [%[outptrs], 112]\n" - "str q11, [x22, x28]\n" - "fmla v2.4s, v31.4s, v9.4s\n" - "str q20, [x23, x28]\n" - "ldr x22, [%[outptrs], 88]\n" - "ldr x23, [%[outptrs], 120]\n" - "str q24, [x22, x28]\n" - "str q23, [x23, x28]\n" - "add x28, x28, #16\n" - "bne 2b\n" - "3:\n" - "mov v1.16b, v25.16b\n" - "ldr x17, [%[inptrs], 64]\n" - "mov v10.16b, v25.16b\n" - "ldr x25, [%[inptrs], 24]\n" - "mov v11.16b, v25.16b\n" - "ldr x15, [%[inptrs], 192]\n" - "fmla v18.4s, v31.4s, v22.4s\n" - "ldr q23, [x17, x27]\n" - "fmla v2.4s, v28.4s, v5.4s\n" - "ldr x7, [%[inptrs], 152]\n" - "fmla v16.4s, v28.4s, v19.4s\n" - "ldr x24, [%[inptrs], 112]\n" - "fmla v13.4s, v28.4s, v22.4s\n" - "ldr q26, [x25, x27]\n" - "fmla v18.4s, v29.4s, v19.4s\n" - "ldr x17, [%[inptrs], 72]\n" - "fmla v2.4s, v29.4s, v7.4s\n" - "ldr x25, [%[inptrs], 32]\n" - "fmla v16.4s, v29.4s, v9.4s\n" - "ldr x16, [%[inptrs], 240]\n" - "fmla v0.4s, v29.4s, v22.4s\n" - "ldr q28, [x15, x27]\n" - "fmla v18.4s, v30.4s, v9.4s\n" - "ldr x15, [%[inptrs], 200]\n" - "fmla v2.4s, v30.4s, v8.4s\n" - "ldr x20, [%[outptrs], 0]\n" - "fmla v17.4s, v30.4s, v22.4s\n" - "ldr q29, [x7, x27]\n" - "fmla v16.4s, v27.4s, v5.4s\n" - "ldr x7, [%[inptrs], 160]\n" - "fmla v13.4s, v27.4s, v19.4s\n" - "ldr x21, [%[outptrs], 32]\n" - "fmla v14.4s, v27.4s, v22.4s\n" - "ldr q20, [x24, x27]\n" - "fmla v2.4s, v21.4s, v4.4s\n" - "ldr x24, [%[inptrs], 120]\n" - "fmla v16.4s, v21.4s, v7.4s\n" - "ldr x22, [%[outptrs], 64]\n" - "fmla v18.4s, v21.4s, v5.4s\n" - "ldr x23, [%[outptrs], 96]\n" - "fmla v13.4s, v21.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v0.4s, v21.4s, v19.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v12.4s, v21.4s, v22.4s\n" - "ldr q24, [x17, x27]\n" - "fmla v2.4s, v23.4s, v6.4s\n" - "ldr x17, [%[inptrs], 80]\n" - "fmla v16.4s, v23.4s, v8.4s\n" - "fmla v18.4s, v23.4s, v7.4s\n" - "fmla v0.4s, v23.4s, v9.4s\n" - "fmla v17.4s, v23.4s, v19.4s\n" - "fmla v15.4s, v23.4s, v22.4s\n" - "ldr q23, [x25, x27]\n" - "fmla v1.4s, v26.4s, v22.4s\n" - "ldr x25, [%[inptrs], 40]\n" - "fmla v18.4s, v26.4s, v8.4s\n" - "fmla v13.4s, v28.4s, v5.4s\n" - "fmla v17.4s, v26.4s, v9.4s\n" - "ldr q30, [x16, x27]\n" - "fmla v14.4s, v28.4s, v19.4s\n" - "ldr q26, [x15, x27]\n" - "fmla v16.4s, v29.4s, v4.4s\n" - "ldr x16, [%[inptrs], 248]\n" - "fmla v13.4s, v29.4s, v7.4s\n" - "ldr x15, [%[inptrs], 208]\n" - "fmla v0.4s, v29.4s, v5.4s\n" - "fmla v12.4s, v29.4s, v19.4s\n" - "fmla v14.4s, v29.4s, v9.4s\n" - "fmla v10.4s, v29.4s, v22.4s\n" - "mov v21.16b, v25.16b\n" - "fmla v2.4s, v20.4s, v3.4s\n" - "fmla v16.4s, v20.4s, v6.4s\n" - "fmla v18.4s, v20.4s, v4.4s\n" - "fmla v13.4s, v20.4s, v8.4s\n" - "fmla v0.4s, v20.4s, v7.4s\n" - "fmla v17.4s, v20.4s, v5.4s\n" - "fmla v12.4s, v20.4s, v9.4s\n" - "fmla v15.4s, v20.4s, v19.4s\n" - "fmla v11.4s, v20.4s, v22.4s\n" - "mov v20.16b, v25.16b\n" - "fmla v18.4s, v24.4s, v6.4s\n" - "fmla v0.4s, v24.4s, v8.4s\n" - "fmla v1.4s, v24.4s, v19.4s\n" - "fmla v17.4s, v24.4s, v7.4s\n" - "fmla v21.4s, v24.4s, v22.4s\n" - "fmla v15.4s, v24.4s, v9.4s\n" - "ldr q27, [x7, x27]\n" - "fmla v14.4s, v30.4s, v5.4s\n" - "ldr q30, [x24, x27]\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "ldr x7, [%[inptrs], 168]\n" - "fmla v17.4s, v23.4s, v8.4s\n" - "ldr q31, [x17, x27]\n" - "fmla v13.4s, v26.4s, v4.4s\n" - "ldr x24, [%[inptrs], 128]\n" - "fmla v14.4s, v26.4s, v7.4s\n" - "ldr x17, [%[inptrs], 88]\n" - "fmla v12.4s, v26.4s, v5.4s\n" - "fmla v10.4s, v26.4s, v19.4s\n" - "mov v24.16b, v25.16b\n" - "mov v23.16b, v25.16b\n" - "fmla v16.4s, v27.4s, v3.4s\n" - "fmla v13.4s, v27.4s, v6.4s\n" - "fmla v0.4s, v27.4s, v4.4s\n" - "fmla v14.4s, v27.4s, v8.4s\n" - "fmla v12.4s, v27.4s, v7.4s\n" - "fmla v15.4s, v27.4s, v5.4s\n" - "fmla v10.4s, v27.4s, v9.4s\n" - "fmla v11.4s, v27.4s, v19.4s\n" - "fmla v20.4s, v27.4s, v22.4s\n" - "ldr q25, [x25, x27]\n" - "fmla v18.4s, v30.4s, v3.4s\n" - "fmla v0.4s, v30.4s, v6.4s\n" - "fmla v17.4s, v30.4s, v4.4s\n" - "fmla v12.4s, v30.4s, v8.4s\n" - "fmla v15.4s, v30.4s, v7.4s\n" - "fmla v1.4s, v30.4s, v5.4s\n" - "fmla v11.4s, v30.4s, v9.4s\n" - "fmla v21.4s, v30.4s, v19.4s\n" - "fmla v24.4s, v30.4s, v22.4s\n" - "ldr q26, [x16, x27]\n" - "fmla v17.4s, v31.4s, v6.4s\n" - "ldr x16, [%[inptrs], 256]\n" - "fmla v15.4s, v31.4s, v8.4s\n" - "fmla v1.4s, v31.4s, v7.4s\n" - "fmla v21.4s, v31.4s, v9.4s\n" - "ldr q31, [x15, x27]\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "ldr x15, [%[inptrs], 216]\n" - "fmla v10.4s, v26.4s, v5.4s\n" - "ldr q29, [x7, x27]\n" - "fmla v1.4s, v25.4s, v8.4s\n" - "ldr q28, [x24, x27]\n" - "fmla v13.4s, v31.4s, v3.4s\n" - "ldr x7, [%[inptrs], 176]\n" - "fmla v14.4s, v31.4s, v6.4s\n" - "ldr x24, [%[inptrs], 136]\n" - "fmla v12.4s, v31.4s, v4.4s\n" - "fmla v10.4s, v31.4s, v7.4s\n" - "fmla v11.4s, v31.4s, v5.4s\n" - "fmla v20.4s, v31.4s, v19.4s\n" - "fmla v0.4s, v29.4s, v3.4s\n" - "ldr q25, [x17, x27]\n" - "fmla v15.4s, v29.4s, v4.4s\n" - "fmla v21.4s, v29.4s, v5.4s\n" - "fmla v12.4s, v29.4s, v6.4s\n" - "fmla v10.4s, v29.4s, v8.4s\n" - "fmla v11.4s, v29.4s, v7.4s\n" - "fmla v20.4s, v29.4s, v9.4s\n" - "fmla v24.4s, v29.4s, v19.4s\n" - "fmla v23.4s, v29.4s, v22.4s\n" - "fmla v17.4s, v28.4s, v3.4s\n" - "ldr q29, [x16, x27]\n" - "fmla v15.4s, v28.4s, v6.4s\n" - "ldr q22, [x15, x27]\n" - "fmla v1.4s, v28.4s, v4.4s\n" - "ldr x16, [%[inptrs], 264]\n" - "fmla v11.4s, v28.4s, v8.4s\n" - "ldr x15, [%[inptrs], 224]\n" - "fmla v21.4s, v28.4s, v7.4s\n" - "fmla v24.4s, v28.4s, v9.4s\n" - "fmla v14.4s, v29.4s, v3.4s\n" - "ldr q27, [x7, x27]\n" - "fmla v1.4s, v25.4s, v6.4s\n" - "ldr x7, [%[inptrs], 184]\n" - "fmla v10.4s, v29.4s, v4.4s\n" - "fmla v20.4s, v29.4s, v5.4s\n" - "fmla v21.4s, v25.4s, v8.4s\n" - "ldr q26, [x24, x27]\n" - "fmla v12.4s, v22.4s, v3.4s\n" - "ldr q25, [x16, x27]\n" - "fmla v11.4s, v22.4s, v4.4s\n" - "ldr x16, [%[inptrs], 272]\n" - "fmla v10.4s, v22.4s, v6.4s\n" - "fmla v20.4s, v22.4s, v7.4s\n" - "fmla v24.4s, v22.4s, v5.4s\n" - "fmla v23.4s, v22.4s, v19.4s\n" - "fmla v15.4s, v27.4s, v3.4s\n" - "ldr q31, [x15, x27]\n" - "fmla v11.4s, v27.4s, v6.4s\n" - "ldr q22, [x7, x27]\n" - "fmla v21.4s, v27.4s, v4.4s\n" - "ldr x15, [%[inptrs], 232]\n" - "fmla v20.4s, v27.4s, v8.4s\n" - "fmla v24.4s, v27.4s, v7.4s\n" - "fmla v23.4s, v27.4s, v9.4s\n" - "ldr q19, [x16, x27]\n" - "fmla v1.4s, v26.4s, v3.4s\n" - "ldr q28, [x15, x27]\n" - "fmla v21.4s, v26.4s, v6.4s\n" - "ldr x16, [%[inptrs], 280]\n" - "fmla v24.4s, v26.4s, v8.4s\n" - "fmla v10.4s, v25.4s, v3.4s\n" - "fmla v20.4s, v25.4s, v4.4s\n" - "ldr q30, [x16, x27]\n" - "fmla v23.4s, v25.4s, v5.4s\n" - "add x27, x27, #16\n" - "fmla v11.4s, v31.4s, v3.4s\n" - "fmla v21.4s, v22.4s, v3.4s\n" - "fmla v24.4s, v31.4s, v4.4s\n" - "movi v29.16b, #0\n" - "fmla v20.4s, v31.4s, v6.4s\n" - "fmla v23.4s, v31.4s, v7.4s\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "fmax v18.4s, v18.4s, v29.4s\n" - "fmla v24.4s, v22.4s, v6.4s\n" - "fmax v17.4s, v17.4s, v29.4s\n" - "fmla v20.4s, v19.4s, v3.4s\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "str q2, [x20, x28]\n" - "fmla v23.4s, v22.4s, v8.4s\n" - "fmax v16.4s, v16.4s, v29.4s\n" - "ldr x20, [%[outptrs], 8]\n" - "fmla v24.4s, v28.4s, v3.4s\n" - "fmax v0.4s, v0.4s, v29.4s\n" - "str q18, [x20, x28]\n" - "fmax v15.4s, v15.4s, v29.4s\n" - "str q16, [x21, x28]\n" - "fmla v23.4s, v19.4s, v4.4s\n" - "fmax v21.4s, v21.4s, v29.4s\n" - "ldr x20, [%[outptrs], 16]\n" - "fmax v13.4s, v13.4s, v29.4s\n" - "ldr x21, [%[outptrs], 40]\n" - "str q17, [x20, x28]\n" - "fmax v12.4s, v12.4s, v29.4s\n" - "str q0, [x21, x28]\n" - "fmla v23.4s, v28.4s, v6.4s\n" - "str q13, [x22, x28]\n" - "fmax v11.4s, v11.4s, v29.4s\n" - "fmax v24.4s, v24.4s, v29.4s\n" - "ldr x20, [%[outptrs], 24]\n" - "fmax v14.4s, v14.4s, v29.4s\n" - "ldr x21, [%[outptrs], 48]\n" - "str q1, [x20, x28]\n" - "fmla v23.4s, v30.4s, v3.4s\n" - "str q15, [x21, x28]\n" - "fmax v10.4s, v10.4s, v29.4s\n" - "str q14, [x23, x28]\n" - "fmax v20.4s, v20.4s, v29.4s\n" - "ldr x21, [%[outptrs], 56]\n" - "ldr x22, [%[outptrs], 72]\n" - "ldr x23, [%[outptrs], 104]\n" - "fmax v23.4s, v23.4s, v29.4s\n" - "str q21, [x21, x28]\n" - "str q12, [x22, x28]\n" - "str q10, [x23, x28]\n" - "ldr x22, [%[outptrs], 80]\n" - "ldr x23, [%[outptrs], 112]\n" - "str q11, [x22, x28]\n" - "str q20, [x23, x28]\n" - "ldr x22, [%[outptrs], 88]\n" - "ldr x23, [%[outptrs], 120]\n" - "str q24, [x22, x28]\n" - "str q23, [x23, x28]\n" - "add x28, x28, #16\n" - "4:\n" - "cbz x19, 7f\n" - "ldr s25, [%[wbptr]]\n" - "mov v2.16b, v25.16b\n" - "ldr s22, [%[wbptr], #4]\n" - "mov v16.16b, v25.16b\n" - "ldr s9, [%[wbptr], #8]\n" - "mov v18.16b, v25.16b\n" - "ldr s8, [%[wbptr], #12]\n" - "mov v13.16b, v25.16b\n" - "ldr s19, [%[wbptr], #16]\n" - "mov v0.16b, v25.16b\n" - "ldr s7, [%[wbptr], #20]\n" - "mov v17.16b, v25.16b\n" - "ldr s6, [%[wbptr], #24]\n" - "mov v14.16b, v25.16b\n" - "ldr s5, [%[wbptr], #28]\n" - "mov v12.16b, v25.16b\n" - "ldr s4, [%[wbptr], #32]\n" - "mov v15.16b, v25.16b\n" - "ldr s3, [%[wbptr], #36]\n" - "ldr x25, [%[inptrs], 0]\n" - "ldr x17, [%[inptrs], 48]\n" - "ldr x24, [%[inptrs], 96]\n" - "ldr x7, [%[inptrs], 144]\n" - "subs x19, x19, #1\n" - "ldr s27, [x25, x27]\n" - "fmla v2.4s, v27.4s, v22.4s\n" - "ldr s26, [x17, x27]\n" - "fmla v16.4s, v26.4s, v22.4s\n" - "ldr s28, [x24, x27]\n" - "ldr s27, [x7, x27]\n" - "ldr x25, [%[inptrs], 8]\n" - "ldr x17, [%[inptrs], 56]\n" - "ldr x24, [%[inptrs], 104]\n" - "ldr s31, [x25, x27]\n" - "fmla v2.4s, v26.4s, v19.4s\n" - "ldr s29, [x17, x27]\n" - "ldr s21, [x24, x27]\n" - "ldr x25, [%[inptrs], 16]\n" - "ldr s30, [x25, x27]\n" - "fmla v2.4s, v31.4s, v9.4s\n" - "beq 6f\n" - "5:\n" - "mov v1.16b, v25.16b\n" - "ldr x17, [%[inptrs], 64]\n" - "mov v10.16b, v25.16b\n" - "ldr x25, [%[inptrs], 24]\n" - "fmla v18.4s, v31.4s, v22.4s\n" - "ldr s23, [x17, x27]\n" - "fmla v2.4s, v28.4s, v5.4s\n" - "ldr x15, [%[inptrs], 192]\n" - "fmla v16.4s, v28.4s, v19.4s\n" - "ldr x7, [%[inptrs], 152]\n" - "fmla v13.4s, v28.4s, v22.4s\n" - "ldr s26, [x25, x27]\n" - "fmla v18.4s, v29.4s, v19.4s\n" - "ldr x24, [%[inptrs], 112]\n" - "fmla v2.4s, v29.4s, v7.4s\n" - "ldr x17, [%[inptrs], 72]\n" - "fmla v16.4s, v29.4s, v9.4s\n" - "ldr x25, [%[inptrs], 32]\n" - "fmla v0.4s, v29.4s, v22.4s\n" - "ldr s28, [x15, x27]\n" - "fmla v18.4s, v30.4s, v9.4s\n" - "ldr x16, [%[inptrs], 240]\n" - "fmla v2.4s, v30.4s, v8.4s\n" - "ldr x15, [%[inptrs], 200]\n" - "fmla v17.4s, v30.4s, v22.4s\n" - "ldr s29, [x7, x27]\n" - "fmla v16.4s, v27.4s, v5.4s\n" - "ldr x7, [%[inptrs], 160]\n" - "fmla v13.4s, v27.4s, v19.4s\n" - "ldr x20, [%[outptrs], 0]\n" - "fmla v14.4s, v27.4s, v22.4s\n" - "ldr s20, [x24, x27]\n" - "fmla v2.4s, v21.4s, v4.4s\n" - "ldr x24, [%[inptrs], 120]\n" - "fmla v16.4s, v21.4s, v7.4s\n" - "ldr x21, [%[outptrs], 32]\n" - "fmla v18.4s, v21.4s, v5.4s\n" - "ldr x22, [%[outptrs], 64]\n" - "fmla v13.4s, v21.4s, v9.4s\n" - "ldr x23, [%[outptrs], 96]\n" - "fmla v0.4s, v21.4s, v19.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v12.4s, v21.4s, v22.4s\n" - "ldr s24, [x17, x27]\n" - "fmla v2.4s, v23.4s, v6.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v16.4s, v23.4s, v8.4s\n" - "ldr x17, [%[inptrs], 80]\n" - "fmla v18.4s, v23.4s, v7.4s\n" - "subs x19, x19, #1\n" - "fmla v0.4s, v23.4s, v9.4s\n" - "fmla v17.4s, v23.4s, v19.4s\n" - "fmla v15.4s, v23.4s, v22.4s\n" - "ldr s23, [x25, x27]\n" - "fmla v1.4s, v26.4s, v22.4s\n" - "ldr x25, [%[inptrs], 40]\n" - "fmla v18.4s, v26.4s, v8.4s\n" - "fmla v13.4s, v28.4s, v5.4s\n" - "fmla v17.4s, v26.4s, v9.4s\n" - "ldr s30, [x16, x27]\n" - "fmla v14.4s, v28.4s, v19.4s\n" - "ldr s26, [x15, x27]\n" - "fmla v16.4s, v29.4s, v4.4s\n" - "ldr x16, [%[inptrs], 248]\n" - "fmla v13.4s, v29.4s, v7.4s\n" - "ldr x15, [%[inptrs], 208]\n" - "fmla v0.4s, v29.4s, v5.4s\n" - "fmla v12.4s, v29.4s, v19.4s\n" - "fmla v14.4s, v29.4s, v9.4s\n" - "fmla v10.4s, v29.4s, v22.4s\n" - "mov v11.16b, v25.16b\n" - "fmla v2.4s, v20.4s, v3.4s\n" - "fmla v16.4s, v20.4s, v6.4s\n" - "fmla v18.4s, v20.4s, v4.4s\n" - "fmla v13.4s, v20.4s, v8.4s\n" - "fmla v0.4s, v20.4s, v7.4s\n" - "fmla v17.4s, v20.4s, v5.4s\n" - "fmla v12.4s, v20.4s, v9.4s\n" - "fmla v15.4s, v20.4s, v19.4s\n" - "fmla v11.4s, v20.4s, v22.4s\n" - "mov v21.16b, v25.16b\n" - "fmla v18.4s, v24.4s, v6.4s\n" - "fmla v0.4s, v24.4s, v8.4s\n" - "fmla v1.4s, v24.4s, v19.4s\n" - "fmla v17.4s, v24.4s, v7.4s\n" - "fmla v14.4s, v30.4s, v5.4s\n" - "mov v20.16b, v25.16b\n" - "fmla v15.4s, v24.4s, v9.4s\n" - "fmla v21.4s, v24.4s, v22.4s\n" - "ldr s27, [x7, x27]\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "ldr x7, [%[inptrs], 168]\n" - "fmla v17.4s, v23.4s, v8.4s\n" - "ldr s30, [x24, x27]\n" - "fmla v13.4s, v26.4s, v4.4s\n" - "ldr x24, [%[inptrs], 128]\n" - "fmla v14.4s, v26.4s, v7.4s\n" - "fmla v12.4s, v26.4s, v5.4s\n" - "fmla v10.4s, v26.4s, v19.4s\n" - "ldr s31, [x17, x27]\n" - "fmla v16.4s, v27.4s, v3.4s\n" - "ldr x17, [%[inptrs], 88]\n" - "fmla v13.4s, v27.4s, v6.4s\n" - "fmla v0.4s, v27.4s, v4.4s\n" - "fmla v14.4s, v27.4s, v8.4s\n" - "fmla v12.4s, v27.4s, v7.4s\n" - "fmla v15.4s, v27.4s, v5.4s\n" - "fmla v10.4s, v27.4s, v9.4s\n" - "fmla v11.4s, v27.4s, v19.4s\n" - "fmla v20.4s, v27.4s, v22.4s\n" - "mov v24.16b, v25.16b\n" - "mov v23.16b, v25.16b\n" - "fmla v18.4s, v30.4s, v3.4s\n" - "fmla v0.4s, v30.4s, v6.4s\n" - "fmla v17.4s, v30.4s, v4.4s\n" - "fmla v12.4s, v30.4s, v8.4s\n" - "fmla v15.4s, v30.4s, v7.4s\n" - "fmla v1.4s, v30.4s, v5.4s\n" - "fmla v11.4s, v30.4s, v9.4s\n" - "fmla v21.4s, v30.4s, v19.4s\n" - "fmla v24.4s, v30.4s, v22.4s\n" - "ldr s25, [x25, x27]\n" - "fmla v17.4s, v31.4s, v6.4s\n" - "ldr x25, [%[inptrs], 0]\n" - "fmla v15.4s, v31.4s, v8.4s\n" - "fmla v1.4s, v31.4s, v7.4s\n" - "fmla v21.4s, v31.4s, v9.4s\n" - "ldr s26, [x16, x27]\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "ldr x16, [%[inptrs], 256]\n" - "fmla v10.4s, v26.4s, v5.4s\n" - "ldr s31, [x15, x27]\n" - "fmla v1.4s, v25.4s, v8.4s\n" - "ldr s29, [x7, x27]\n" - "fmla v13.4s, v31.4s, v3.4s\n" - "ldr x15, [%[inptrs], 216]\n" - "fmla v14.4s, v31.4s, v6.4s\n" - "ldr x7, [%[inptrs], 176]\n" - "fmla v12.4s, v31.4s, v4.4s\n" - "fmla v10.4s, v31.4s, v7.4s\n" - "fmla v11.4s, v31.4s, v5.4s\n" - "fmla v20.4s, v31.4s, v19.4s\n" - "fmla v0.4s, v29.4s, v3.4s\n" - "ldr s28, [x24, x27]\n" - "fmla v15.4s, v29.4s, v4.4s\n" - "ldr x24, [%[inptrs], 136]\n" - "fmla v12.4s, v29.4s, v6.4s\n" - "fmla v10.4s, v29.4s, v8.4s\n" - "fmla v11.4s, v29.4s, v7.4s\n" - "fmla v21.4s, v29.4s, v5.4s\n" - "fmla v20.4s, v29.4s, v9.4s\n" - "fmla v24.4s, v29.4s, v19.4s\n" - "fmla v23.4s, v29.4s, v22.4s\n" - "ldr s25, [x17, x27]\n" - "fmla v17.4s, v28.4s, v3.4s\n" - "ldr s29, [x16, x27]\n" - "fmla v15.4s, v28.4s, v6.4s\n" - "ldr x16, [%[inptrs], 264]\n" - "fmla v1.4s, v28.4s, v4.4s\n" - "ldr x17, [%[inptrs], 48]\n" - "fmla v11.4s, v28.4s, v8.4s\n" - "fmla v21.4s, v28.4s, v7.4s\n" - "fmla v24.4s, v28.4s, v9.4s\n" - "ldr s22, [x15, x27]\n" - "fmla v14.4s, v29.4s, v3.4s\n" - "ldr x15, [%[inptrs], 224]\n" - "fmla v1.4s, v25.4s, v6.4s\n" - "fmla v10.4s, v29.4s, v4.4s\n" - "fmla v21.4s, v25.4s, v8.4s\n" - "ldr s27, [x7, x27]\n" - "fmla v20.4s, v29.4s, v5.4s\n" - "ldr s26, [x24, x27]\n" - "fmla v12.4s, v22.4s, v3.4s\n" - "ldr x7, [%[inptrs], 184]\n" - "fmla v10.4s, v22.4s, v6.4s\n" - "ldr x24, [%[inptrs], 96]\n" - "fmla v11.4s, v22.4s, v4.4s\n" - "fmla v24.4s, v22.4s, v5.4s\n" - "fmla v20.4s, v22.4s, v7.4s\n" - "fmla v23.4s, v22.4s, v19.4s\n" - "fmla v15.4s, v27.4s, v3.4s\n" - "ldr s25, [x16, x27]\n" - "fmla v21.4s, v27.4s, v4.4s\n" - "ldr s31, [x15, x27]\n" - "fmla v11.4s, v27.4s, v6.4s\n" - "ldr x16, [%[inptrs], 272]\n" - "fmla v20.4s, v27.4s, v8.4s\n" - "ldr x15, [%[inptrs], 232]\n" - "fmla v24.4s, v27.4s, v7.4s\n" - "fmla v23.4s, v27.4s, v9.4s\n" - "fmla v1.4s, v26.4s, v3.4s\n" - "ldr s22, [x7, x27]\n" - "fmla v21.4s, v26.4s, v6.4s\n" - "ldr s19, [x16, x27]\n" - "fmla v10.4s, v25.4s, v3.4s\n" - "ldr x16, [%[inptrs], 280]\n" - "fmla v24.4s, v26.4s, v8.4s\n" - "ldr s28, [x15, x27]\n" - "fmla v20.4s, v25.4s, v4.4s\n" - "ldr x7, [%[inptrs], 144]\n" - "fmla v23.4s, v25.4s, v5.4s\n" - "ldr s30, [x16, x27]\n" - "fmla v11.4s, v31.4s, v3.4s\n" - "add x27, x27, #4\n" - "fmla v24.4s, v31.4s, v4.4s\n" - "ldr s27, [x25, x27]\n" - "fmla v20.4s, v31.4s, v6.4s\n" - "ldr x25, [%[inptrs], 8]\n" - "fmla v23.4s, v31.4s, v7.4s\n" - "movi v29.16b, #0\n" - "fmla v21.4s, v22.4s, v3.4s\n" - "ldr s26, [x17, x27]\n" - "fmla v24.4s, v22.4s, v6.4s\n" - "ldr x17, [%[inptrs], 56]\n" - "fmla v20.4s, v19.4s, v3.4s\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "fmla v23.4s, v22.4s, v8.4s\n" - "ldr s25, [%[wbptr]]\n" - "fmax v18.4s, v18.4s, v29.4s\n" - "ldr s22, [%[wbptr], #4]\n" - "str s2, [x20, x28]\n" - "fmla v24.4s, v28.4s, v3.4s\n" - "fmax v17.4s, v17.4s, v29.4s\n" - "ldr s9, [%[wbptr], #8]\n" - "fmla v23.4s, v19.4s, v4.4s\n" - "ldr s8, [%[wbptr], #12]\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "ldr s19, [%[wbptr], #16]\n" - "fmax v16.4s, v16.4s, v29.4s\n" - "ldr x20, [%[outptrs], 8]\n" - "fmax v0.4s, v0.4s, v29.4s\n" - "fmax v15.4s, v15.4s, v29.4s\n" - "str s18, [x20, x28]\n" - "fmla v23.4s, v28.4s, v6.4s\n" - "str s16, [x21, x28]\n" - "fmax v21.4s, v21.4s, v29.4s\n" - "fmax v13.4s, v13.4s, v29.4s\n" - "ldr s7, [%[wbptr], #20]\n" - "fmax v12.4s, v12.4s, v29.4s\n" - "ldr s5, [%[wbptr], #28]\n" - "fmla v23.4s, v30.4s, v3.4s\n" - "ldr s6, [%[wbptr], #24]\n" - "str s13, [x22, x28]\n" - "fmax v11.4s, v11.4s, v29.4s\n" - "fmax v24.4s, v24.4s, v29.4s\n" - "ldr s4, [%[wbptr], #32]\n" - "fmax v14.4s, v14.4s, v29.4s\n" - "ldr s31, [x25, x27]\n" - "fmax v10.4s, v10.4s, v29.4s\n" - "ldr s3, [%[wbptr], #36]\n" - "fmax v20.4s, v20.4s, v29.4s\n" - "ldr s28, [x24, x27]\n" - "str s14, [x23, x28]\n" - "fmax v23.4s, v23.4s, v29.4s\n" - "mov v2.16b, v25.16b\n" - "ldr s29, [x17, x27]\n" - "ldr x20, [%[outptrs], 16]\n" - "ldr x21, [%[outptrs], 40]\n" - "ldr x22, [%[outptrs], 72]\n" - "ldr x23, [%[outptrs], 104]\n" - "ldr x25, [%[inptrs], 16]\n" - "ldr x24, [%[inptrs], 104]\n" - "str s17, [x20, x28]\n" - "mov v16.16b, v25.16b\n" - "str s0, [x21, x28]\n" - "mov v18.16b, v25.16b\n" - "str s12, [x22, x28]\n" - "mov v13.16b, v25.16b\n" - "str s10, [x23, x28]\n" - "mov v0.16b, v25.16b\n" - "fmla v2.4s, v27.4s, v22.4s\n" - "ldr s30, [x25, x27]\n" - "fmla v16.4s, v26.4s, v22.4s\n" - "ldr x20, [%[outptrs], 24]\n" - "mov v17.16b, v25.16b\n" - "ldr x21, [%[outptrs], 48]\n" - "str s1, [x20, x28]\n" - "mov v14.16b, v25.16b\n" - "str s15, [x21, x28]\n" - "mov v12.16b, v25.16b\n" - "mov v15.16b, v25.16b\n" - "ldr x21, [%[outptrs], 56]\n" - "fmla v2.4s, v26.4s, v19.4s\n" - "ldr s27, [x7, x27]\n" - "str s21, [x21, x28]\n" - "ldr x22, [%[outptrs], 80]\n" - "ldr s21, [x24, x27]\n" - "ldr x23, [%[outptrs], 112]\n" - "str s11, [x22, x28]\n" - "fmla v2.4s, v31.4s, v9.4s\n" - "str s20, [x23, x28]\n" - "ldr x22, [%[outptrs], 88]\n" - "ldr x23, [%[outptrs], 120]\n" - "str s24, [x22, x28]\n" - "str s23, [x23, x28]\n" - "add x28, x28, #4\n" - "bne 5b\n" - "6:\n" - "mov v1.16b, v25.16b\n" - "ldr x17, [%[inptrs], 64]\n" - "mov v10.16b, v25.16b\n" - "ldr x25, [%[inptrs], 24]\n" - "mov v11.16b, v25.16b\n" - "ldr x15, [%[inptrs], 192]\n" - "fmla v18.4s, v31.4s, v22.4s\n" - "ldr s23, [x17, x27]\n" - "fmla v2.4s, v28.4s, v5.4s\n" - "ldr x7, [%[inptrs], 152]\n" - "fmla v16.4s, v28.4s, v19.4s\n" - "ldr x24, [%[inptrs], 112]\n" - "fmla v13.4s, v28.4s, v22.4s\n" - "ldr s26, [x25, x27]\n" - "fmla v18.4s, v29.4s, v19.4s\n" - "ldr x17, [%[inptrs], 72]\n" - "fmla v2.4s, v29.4s, v7.4s\n" - "ldr x25, [%[inptrs], 32]\n" - "fmla v16.4s, v29.4s, v9.4s\n" - "ldr x16, [%[inptrs], 240]\n" - "fmla v0.4s, v29.4s, v22.4s\n" - "ldr s28, [x15, x27]\n" - "fmla v18.4s, v30.4s, v9.4s\n" - "ldr x15, [%[inptrs], 200]\n" - "fmla v2.4s, v30.4s, v8.4s\n" - "ldr x20, [%[outptrs], 0]\n" - "fmla v17.4s, v30.4s, v22.4s\n" - "ldr s29, [x7, x27]\n" - "fmla v16.4s, v27.4s, v5.4s\n" - "ldr x7, [%[inptrs], 160]\n" - "fmla v13.4s, v27.4s, v19.4s\n" - "ldr x21, [%[outptrs], 32]\n" - "fmla v14.4s, v27.4s, v22.4s\n" - "ldr s20, [x24, x27]\n" - "fmla v2.4s, v21.4s, v4.4s\n" - "ldr x24, [%[inptrs], 120]\n" - "fmla v16.4s, v21.4s, v7.4s\n" - "ldr x22, [%[outptrs], 64]\n" - "fmla v18.4s, v21.4s, v5.4s\n" - "ldr x23, [%[outptrs], 96]\n" - "fmla v13.4s, v21.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v0.4s, v21.4s, v19.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v12.4s, v21.4s, v22.4s\n" - "ldr s24, [x17, x27]\n" - "fmla v2.4s, v23.4s, v6.4s\n" - "ldr x17, [%[inptrs], 80]\n" - "fmla v16.4s, v23.4s, v8.4s\n" - "fmla v18.4s, v23.4s, v7.4s\n" - "fmla v0.4s, v23.4s, v9.4s\n" - "fmla v17.4s, v23.4s, v19.4s\n" - "fmla v15.4s, v23.4s, v22.4s\n" - "ldr s23, [x25, x27]\n" - "fmla v1.4s, v26.4s, v22.4s\n" - "ldr x25, [%[inptrs], 40]\n" - "fmla v18.4s, v26.4s, v8.4s\n" - "fmla v13.4s, v28.4s, v5.4s\n" - "fmla v17.4s, v26.4s, v9.4s\n" - "ldr s30, [x16, x27]\n" - "fmla v14.4s, v28.4s, v19.4s\n" - "ldr s26, [x15, x27]\n" - "fmla v16.4s, v29.4s, v4.4s\n" - "ldr x16, [%[inptrs], 248]\n" - "fmla v13.4s, v29.4s, v7.4s\n" - "ldr x15, [%[inptrs], 208]\n" - "fmla v0.4s, v29.4s, v5.4s\n" - "fmla v12.4s, v29.4s, v19.4s\n" - "fmla v14.4s, v29.4s, v9.4s\n" - "fmla v10.4s, v29.4s, v22.4s\n" - "mov v21.16b, v25.16b\n" - "fmla v2.4s, v20.4s, v3.4s\n" - "fmla v16.4s, v20.4s, v6.4s\n" - "fmla v18.4s, v20.4s, v4.4s\n" - "fmla v13.4s, v20.4s, v8.4s\n" - "fmla v0.4s, v20.4s, v7.4s\n" - "fmla v17.4s, v20.4s, v5.4s\n" - "fmla v12.4s, v20.4s, v9.4s\n" - "fmla v15.4s, v20.4s, v19.4s\n" - "fmla v11.4s, v20.4s, v22.4s\n" - "mov v20.16b, v25.16b\n" - "fmla v18.4s, v24.4s, v6.4s\n" - "fmla v0.4s, v24.4s, v8.4s\n" - "fmla v1.4s, v24.4s, v19.4s\n" - "fmla v17.4s, v24.4s, v7.4s\n" - "fmla v21.4s, v24.4s, v22.4s\n" - "fmla v15.4s, v24.4s, v9.4s\n" - "ldr s27, [x7, x27]\n" - "fmla v14.4s, v30.4s, v5.4s\n" - "ldr s30, [x24, x27]\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "ldr x7, [%[inptrs], 168]\n" - "fmla v17.4s, v23.4s, v8.4s\n" - "ldr s31, [x17, x27]\n" - "fmla v13.4s, v26.4s, v4.4s\n" - "ldr x24, [%[inptrs], 128]\n" - "fmla v14.4s, v26.4s, v7.4s\n" - "ldr x17, [%[inptrs], 88]\n" - "fmla v12.4s, v26.4s, v5.4s\n" - "fmla v10.4s, v26.4s, v19.4s\n" - "mov v24.16b, v25.16b\n" - "mov v23.16b, v25.16b\n" - "fmla v16.4s, v27.4s, v3.4s\n" - "fmla v13.4s, v27.4s, v6.4s\n" - "fmla v0.4s, v27.4s, v4.4s\n" - "fmla v14.4s, v27.4s, v8.4s\n" - "fmla v12.4s, v27.4s, v7.4s\n" - "fmla v15.4s, v27.4s, v5.4s\n" - "fmla v10.4s, v27.4s, v9.4s\n" - "fmla v11.4s, v27.4s, v19.4s\n" - "fmla v20.4s, v27.4s, v22.4s\n" - "ldr s25, [x25, x27]\n" - "fmla v18.4s, v30.4s, v3.4s\n" - "fmla v0.4s, v30.4s, v6.4s\n" - "fmla v17.4s, v30.4s, v4.4s\n" - "fmla v12.4s, v30.4s, v8.4s\n" - "fmla v15.4s, v30.4s, v7.4s\n" - "fmla v1.4s, v30.4s, v5.4s\n" - "fmla v11.4s, v30.4s, v9.4s\n" - "fmla v21.4s, v30.4s, v19.4s\n" - "fmla v24.4s, v30.4s, v22.4s\n" - "ldr s26, [x16, x27]\n" - "fmla v17.4s, v31.4s, v6.4s\n" - "ldr x16, [%[inptrs], 256]\n" - "fmla v15.4s, v31.4s, v8.4s\n" - "fmla v1.4s, v31.4s, v7.4s\n" - "fmla v21.4s, v31.4s, v9.4s\n" - "ldr s31, [x15, x27]\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "ldr x15, [%[inptrs], 216]\n" - "fmla v10.4s, v26.4s, v5.4s\n" - "ldr s29, [x7, x27]\n" - "fmla v1.4s, v25.4s, v8.4s\n" - "ldr s28, [x24, x27]\n" - "fmla v13.4s, v31.4s, v3.4s\n" - "ldr x7, [%[inptrs], 176]\n" - "fmla v14.4s, v31.4s, v6.4s\n" - "ldr x24, [%[inptrs], 136]\n" - "fmla v12.4s, v31.4s, v4.4s\n" - "fmla v10.4s, v31.4s, v7.4s\n" - "fmla v11.4s, v31.4s, v5.4s\n" - "fmla v20.4s, v31.4s, v19.4s\n" - "fmla v0.4s, v29.4s, v3.4s\n" - "ldr s25, [x17, x27]\n" - "fmla v15.4s, v29.4s, v4.4s\n" - "fmla v21.4s, v29.4s, v5.4s\n" - "fmla v12.4s, v29.4s, v6.4s\n" - "fmla v10.4s, v29.4s, v8.4s\n" - "fmla v11.4s, v29.4s, v7.4s\n" - "fmla v20.4s, v29.4s, v9.4s\n" - "fmla v24.4s, v29.4s, v19.4s\n" - "fmla v23.4s, v29.4s, v22.4s\n" - "fmla v17.4s, v28.4s, v3.4s\n" - "ldr s29, [x16, x27]\n" - "fmla v15.4s, v28.4s, v6.4s\n" - "ldr s22, [x15, x27]\n" - "fmla v1.4s, v28.4s, v4.4s\n" - "ldr x16, [%[inptrs], 264]\n" - "fmla v11.4s, v28.4s, v8.4s\n" - "ldr x15, [%[inptrs], 224]\n" - "fmla v21.4s, v28.4s, v7.4s\n" - "fmla v24.4s, v28.4s, v9.4s\n" - "fmla v14.4s, v29.4s, v3.4s\n" - "ldr s27, [x7, x27]\n" - "fmla v1.4s, v25.4s, v6.4s\n" - "ldr x7, [%[inptrs], 184]\n" - "fmla v10.4s, v29.4s, v4.4s\n" - "fmla v20.4s, v29.4s, v5.4s\n" - "fmla v21.4s, v25.4s, v8.4s\n" - "ldr s26, [x24, x27]\n" - "fmla v12.4s, v22.4s, v3.4s\n" - "ldr s25, [x16, x27]\n" - "fmla v11.4s, v22.4s, v4.4s\n" - "ldr x16, [%[inptrs], 272]\n" - "fmla v10.4s, v22.4s, v6.4s\n" - "fmla v20.4s, v22.4s, v7.4s\n" - "fmla v24.4s, v22.4s, v5.4s\n" - "fmla v23.4s, v22.4s, v19.4s\n" - "fmla v15.4s, v27.4s, v3.4s\n" - "ldr s31, [x15, x27]\n" - "fmla v11.4s, v27.4s, v6.4s\n" - "ldr s22, [x7, x27]\n" - "fmla v21.4s, v27.4s, v4.4s\n" - "ldr x15, [%[inptrs], 232]\n" - "fmla v20.4s, v27.4s, v8.4s\n" - "fmla v24.4s, v27.4s, v7.4s\n" - "fmla v23.4s, v27.4s, v9.4s\n" - "ldr s19, [x16, x27]\n" - "fmla v1.4s, v26.4s, v3.4s\n" - "ldr s28, [x15, x27]\n" - "fmla v21.4s, v26.4s, v6.4s\n" - "ldr x16, [%[inptrs], 280]\n" - "fmla v24.4s, v26.4s, v8.4s\n" - "fmla v10.4s, v25.4s, v3.4s\n" - "fmla v20.4s, v25.4s, v4.4s\n" - "ldr s30, [x16, x27]\n" - "fmla v23.4s, v25.4s, v5.4s\n" - "add x27, x27, #4\n" - "fmla v11.4s, v31.4s, v3.4s\n" - "fmla v21.4s, v22.4s, v3.4s\n" - "fmla v24.4s, v31.4s, v4.4s\n" - "movi v29.16b, #0\n" - "fmla v20.4s, v31.4s, v6.4s\n" - "fmla v23.4s, v31.4s, v7.4s\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "fmax v18.4s, v18.4s, v29.4s\n" - "fmla v24.4s, v22.4s, v6.4s\n" - "fmax v17.4s, v17.4s, v29.4s\n" - "fmla v20.4s, v19.4s, v3.4s\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "str s2, [x20, x28]\n" - "fmla v23.4s, v22.4s, v8.4s\n" - "fmax v16.4s, v16.4s, v29.4s\n" - "ldr x20, [%[outptrs], 8]\n" - "fmla v24.4s, v28.4s, v3.4s\n" - "fmax v0.4s, v0.4s, v29.4s\n" - "str s18, [x20, x28]\n" - "fmax v15.4s, v15.4s, v29.4s\n" - "str s16, [x21, x28]\n" - "fmla v23.4s, v19.4s, v4.4s\n" - "fmax v21.4s, v21.4s, v29.4s\n" - "ldr x20, [%[outptrs], 16]\n" - "fmax v13.4s, v13.4s, v29.4s\n" - "ldr x21, [%[outptrs], 40]\n" - "str s17, [x20, x28]\n" - "fmax v12.4s, v12.4s, v29.4s\n" - "str s0, [x21, x28]\n" - "fmla v23.4s, v28.4s, v6.4s\n" - "str s13, [x22, x28]\n" - "fmax v11.4s, v11.4s, v29.4s\n" - "fmax v24.4s, v24.4s, v29.4s\n" - "ldr x20, [%[outptrs], 24]\n" - "fmax v14.4s, v14.4s, v29.4s\n" - "ldr x21, [%[outptrs], 48]\n" - "str s1, [x20, x28]\n" - "fmla v23.4s, v30.4s, v3.4s\n" - "str s15, [x21, x28]\n" - "fmax v10.4s, v10.4s, v29.4s\n" - "str s14, [x23, x28]\n" - "fmax v20.4s, v20.4s, v29.4s\n" - "ldr x21, [%[outptrs], 56]\n" - "ldr x22, [%[outptrs], 72]\n" - "ldr x23, [%[outptrs], 104]\n" - "fmax v23.4s, v23.4s, v29.4s\n" - "str s21, [x21, x28]\n" - "str s12, [x22, x28]\n" - "str s10, [x23, x28]\n" - "ldr x22, [%[outptrs], 80]\n" - "ldr x23, [%[outptrs], 112]\n" - "str s11, [x22, x28]\n" - "str s20, [x23, x28]\n" - "ldr x22, [%[outptrs], 88]\n" - "ldr x23, [%[outptrs], 120]\n" - "str s24, [x22, x28]\n" - "str s23, [x23, x28]\n" - "add x28, x28, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr) - : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile<ActivationFunction::ReLU6>( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x24, %[inptr0], %[input_row_stride]\n" - "add x13, %[input_col_stride1], %[input_col_stride1]\n" - "add x8, %[outptr0], %[output_row_stride]\n" - "add x9, x24, %[input_row_stride]\n" - "add x10, x13, #64\n" - "add x19, x13, %[input_col_stride1]\n" - "add x20, x9, %[input_row_stride]\n" - "add x21, x19, #64\n" - "add x17, x19, %[input_col_stride1]\n" - "add x22, x20, %[input_row_stride]\n" - "add x7, x17, #64\n" - "add x11, x17, %[input_col_stride1]\n" - "add x23, x22, %[input_row_stride]\n" - "add x12, x11, #64\n" - "add x25, x8, %[output_row_stride]\n" - "add x26, x25, %[output_row_stride]\n" - "add x27, %[output_col_stride1], %[output_col_stride1]\n" - "and x14, %[n_channels], #3\n" - "add x28, x27, %[output_col_stride1]\n" - "lsr x15, %[n_channels], #2\n" - "cbz x15, 4f\n" - "1:\n" - "ldr q23, [%[wbptr]]\n" - "subs x15, x15, #1\n" - "mov v12.16b, v23.16b\n" - "ldr q20, [%[wbptr], #16]\n" - "mov v8.16b, v23.16b\n" - "ldr q6, [%[wbptr], #32]\n" - "mov v11.16b, v23.16b\n" - "ldr q5, [%[wbptr], #48]\n" - "mov v16.16b, v23.16b\n" - "ldr q19, [%[wbptr], #64]\n" - "mov v7.16b, v23.16b\n" - "ldr q4, [%[wbptr], #80]\n" - "mov v10.16b, v23.16b\n" - "ldr q3, [%[wbptr], #96]\n" - "mov v14.16b, v23.16b\n" - "ldr q2, [%[wbptr], #112]\n" - "mov v15.16b, v23.16b\n" - "ldr q1, [%[wbptr], #128]\n" - "mov v17.16b, v23.16b\n" - "ldr q0, [%[wbptr], #144]\n" - "mov v9.16b, v23.16b\n" - "ldr q28, [%[inptr0]]\n" - "fmla v12.4s, v28.4s, v20.4s\n" - "ldr q25, [x24]\n" - "fmla v8.4s, v25.4s, v20.4s\n" - "ldr q18, [%[inptr0], %[input_col_stride1]]\n" - "fmla v11.4s, v18.4s, v20.4s\n" - "ldr q30, [x9]\n" - "fmla v12.4s, v25.4s, v19.4s\n" - "ldr q29, [x24, %[input_col_stride1]]\n" - "fmla v8.4s, v30.4s, v19.4s\n" - "ldr q24, [%[inptr0], x13]\n" - "fmla v16.4s, v30.4s, v20.4s\n" - "ldr q27, [x20]\n" - "fmla v12.4s, v18.4s, v6.4s\n" - "ldr q22, [x9, %[input_col_stride1]]\n" - "fmla v8.4s, v29.4s, v6.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "prfm pldl1keep, [x24, #64]\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "fmla v12.4s, v30.4s, v2.4s\n" - "prfm pldl1keep, [x9, #64]\n" - "prfm pldl1keep, [x24, x16]\n" - "prfm pldl1keep, [%[inptr0], x10]\n" - "prfm pldl1keep, [x20, #64]\n" - "prfm pldl1keep, [x9, x16]\n" - "fmla v12.4s, v29.4s, v4.4s\n" - "beq 3f\n" - "2:\n" - "mov v13.16b, v23.16b\n" - "ldr q21, [x24, x13]\n" - "mov v18.16b, v23.16b\n" - "prfm pldl1keep, [x24, x10]\n" - "fmla v11.4s, v29.4s, v19.4s\n" - "prfm pldl1keep, [%[inptr0], x21]\n" - "fmla v7.4s, v29.4s, v20.4s\n" - "ldr q25, [%[inptr0], x19]\n" - "fmla v12.4s, v24.4s, v5.4s\n" - "prfm pldl1keep, [x22, #64]\n" - "fmla v11.4s, v24.4s, v6.4s\n" - "prfm pldl1keep, [x20, x16]\n" - "fmla v10.4s, v24.4s, v20.4s\n" - "ldr q24, [x22]\n" - "fmla v8.4s, v27.4s, v2.4s\n" - "prfm pldl1keep, [x9, x10]\n" - "fmla v16.4s, v27.4s, v19.4s\n" - "prfm pldl1keep, [x24, x21]\n" - "fmla v14.4s, v27.4s, v20.4s\n" - "ldr q26, [x20, %[input_col_stride1]]\n" - "fmla v12.4s, v22.4s, v1.4s\n" - "prfm pldl1keep, [%[inptr0], x7]\n" - "fmla v8.4s, v22.4s, v4.4s\n" - "prfm pldl1keep, [x23, #64]\n" - "fmla v11.4s, v22.4s, v2.4s\n" - "prfm pldl1keep, [x22, x16]\n" - "fmla v16.4s, v22.4s, v6.4s\n" - "prfm pldl1keep, [x20, x10]\n" - "fmla v7.4s, v22.4s, v19.4s\n" - "prfm pldl1keep, [x9, x21]\n" - "fmla v15.4s, v22.4s, v20.4s\n" - "ldr q30, [x9, x13]\n" - "fmla v12.4s, v21.4s, v3.4s\n" - "prfm pldl1keep, [x24, x7]\n" - "fmla v8.4s, v21.4s, v5.4s\n" - "prfm pldl1keep, [%[inptr0], x12]\n" - "fmla v11.4s, v21.4s, v4.4s\n" - "prfm pldl1keep, [x23, x16]\n" - "fmla v7.4s, v21.4s, v6.4s\n" - "prfm pldl1keep, [x22, x10]\n" - "fmla v10.4s, v21.4s, v19.4s\n" - "prfm pldl1keep, [x20, x21]\n" - "fmla v17.4s, v21.4s, v20.4s\n" - "ldr q22, [x24, x19]\n" - "fmla v11.4s, v25.4s, v5.4s\n" - "prfm pldl1keep, [x9, x7]\n" - "fmla v10.4s, v25.4s, v6.4s\n" - "prfm pldl1keep, [x24, x12]\n" - "fmla v9.4s, v25.4s, v20.4s\n" - "ldr q21, [%[inptr0], x17]\n" - "fmla v16.4s, v24.4s, v2.4s\n" - "prfm pldl1keep, [x23, x10]\n" - "fmla v14.4s, v24.4s, v19.4s\n" - "ldr q24, [x23]\n" - "fmla v8.4s, v26.4s, v1.4s\n" - "prfm pldl1keep, [x22, x21]\n" - "fmla v16.4s, v26.4s, v4.4s\n" - "prfm pldl1keep, [x20, x7]\n" - "fmla v7.4s, v26.4s, v2.4s\n" - "prfm pldl1keep, [x9, x12]\n" - "fmla v14.4s, v26.4s, v6.4s\n" - "prfm pldl1keep, [x23, x21]\n" - "fmla v15.4s, v26.4s, v19.4s\n" - "prfm pldl1keep, [x22, x7]\n" - "fmla v13.4s, v26.4s, v20.4s\n" - "ldr q26, [x22, %[input_col_stride1]]\n" - "fmla v12.4s, v30.4s, v0.4s\n" - "prfm pldl1keep, [x20, x12]\n" - "fmla v8.4s, v30.4s, v3.4s\n" - "prfm pldl1keep, [x23, x7]\n" - "fmla v11.4s, v30.4s, v1.4s\n" - "prfm pldl1keep, [x22, x12]\n" - "fmla v16.4s, v30.4s, v5.4s\n" - "prfm pldl1keep, [x23, x12]\n" - "fmla v7.4s, v30.4s, v4.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v10.4s, v30.4s, v2.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v15.4s, v30.4s, v6.4s\n" - "subs x15, x15, #1\n" - "fmla v17.4s, v30.4s, v19.4s\n" - "fmla v18.4s, v30.4s, v20.4s\n" - "mov v25.16b, v23.16b\n" - "fmla v11.4s, v22.4s, v3.4s\n" - "fmla v7.4s, v22.4s, v5.4s\n" - "fmla v10.4s, v22.4s, v4.4s\n" - "fmla v17.4s, v22.4s, v6.4s\n" - "fmla v9.4s, v22.4s, v19.4s\n" - "fmla v25.4s, v22.4s, v20.4s\n" - "ldr q27, [x20, x13]\n" - "fmla v10.4s, v21.4s, v5.4s\n" - "fmla v14.4s, v24.4s, v2.4s\n" - "mov v22.16b, v23.16b\n" - "fmla v9.4s, v21.4s, v6.4s\n" - "mov v24.16b, v23.16b\n" - "mov v21.16b, v23.16b\n" - "fmla v16.4s, v26.4s, v1.4s\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "fmla v15.4s, v26.4s, v2.4s\n" - "fmla v13.4s, v26.4s, v19.4s\n" - "fmla v8.4s, v27.4s, v0.4s\n" - "ldr q28, [x9, x19]\n" - "fmla v16.4s, v27.4s, v3.4s\n" - "fmla v7.4s, v27.4s, v1.4s\n" - "fmla v14.4s, v27.4s, v5.4s\n" - "fmla v15.4s, v27.4s, v4.4s\n" - "fmla v17.4s, v27.4s, v2.4s\n" - "fmla v13.4s, v27.4s, v6.4s\n" - "fmla v18.4s, v27.4s, v19.4s\n" - "fmla v22.4s, v27.4s, v20.4s\n" - "fmla v11.4s, v28.4s, v0.4s\n" - "ldr q29, [x24, x17]\n" - "fmla v7.4s, v28.4s, v3.4s\n" - "fmla v10.4s, v28.4s, v1.4s\n" - "fmla v15.4s, v28.4s, v5.4s\n" - "fmla v17.4s, v28.4s, v4.4s\n" - "fmla v9.4s, v28.4s, v2.4s\n" - "fmla v18.4s, v28.4s, v6.4s\n" - "fmla v25.4s, v28.4s, v19.4s\n" - "fmla v24.4s, v28.4s, v20.4s\n" - "fmla v10.4s, v29.4s, v3.4s\n" - "ldr q23, [%[inptr0], x11]\n" - "fmla v17.4s, v29.4s, v5.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v9.4s, v29.4s, v4.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "fmla v25.4s, v29.4s, v6.4s\n" - "ldr q30, [x23, %[input_col_stride1]]\n" - "fmla v14.4s, v30.4s, v1.4s\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "fmla v9.4s, v23.4s, v5.4s\n" - "ldr q23, [x22, x13]\n" - "fmla v13.4s, v30.4s, v2.4s\n" - "ldr q29, [x20, x19]\n" - "fmla v16.4s, v23.4s, v0.4s\n" - "prfm pldl1keep, [%[inptr0], x10]\n" - "fmla v14.4s, v23.4s, v3.4s\n" - "fmla v15.4s, v23.4s, v1.4s\n" - "fmla v13.4s, v23.4s, v4.4s\n" - "fmla v18.4s, v23.4s, v2.4s\n" - "fmla v22.4s, v23.4s, v19.4s\n" - "ldr q23, [x9, x17]\n" - "fmla v7.4s, v29.4s, v0.4s\n" - "fmla v15.4s, v29.4s, v3.4s\n" - "fmla v17.4s, v29.4s, v1.4s\n" - "fmla v13.4s, v29.4s, v5.4s\n" - "fmla v18.4s, v29.4s, v4.4s\n" - "fmla v25.4s, v29.4s, v2.4s\n" - "fmla v22.4s, v29.4s, v6.4s\n" - "fmla v24.4s, v29.4s, v19.4s\n" - "fmla v21.4s, v29.4s, v20.4s\n" - "ldr q26, [x24, x11]\n" - "fmla v10.4s, v23.4s, v0.4s\n" - "ldr q28, [x23, x13]\n" - "fmla v17.4s, v23.4s, v3.4s\n" - "add x24, x24, #16\n" - "fmla v9.4s, v23.4s, v1.4s\n" - "prfm pldl1keep, [x24, #64]\n" - "fmla v18.4s, v23.4s, v5.4s\n" - "prfm pldl1keep, [x24, x16]\n" - "fmla v25.4s, v23.4s, v4.4s\n" - "fmla v24.4s, v23.4s, v6.4s\n" - "fmla v9.4s, v26.4s, v3.4s\n" - "ldr q20, [x22, x19]\n" - "fmla v14.4s, v28.4s, v0.4s\n" - "fmla v13.4s, v28.4s, v1.4s\n" - "fmla v25.4s, v26.4s, v5.4s\n" - "ldr q26, [x20, x17]\n" - "fmla v22.4s, v28.4s, v2.4s\n" - "ldr q23, [x9, x11]\n" - "fmla v15.4s, v20.4s, v0.4s\n" - "add x9, x9, #16\n" - "fmla v13.4s, v20.4s, v3.4s\n" - "prfm pldl1keep, [x9, #64]\n" - "fmla v18.4s, v20.4s, v1.4s\n" - "prfm pldl1keep, [x9, x16]\n" - "fmla v22.4s, v20.4s, v4.4s\n" - "fmla v24.4s, v20.4s, v2.4s\n" - "fmla v21.4s, v20.4s, v19.4s\n" - "ldr q27, [x23, x19]\n" - "fmla v17.4s, v26.4s, v0.4s\n" - "ldr q20, [x22, x17]\n" - "fmla v18.4s, v26.4s, v3.4s\n" - "fmla v25.4s, v26.4s, v1.4s\n" - "fmla v22.4s, v26.4s, v5.4s\n" - "fmla v24.4s, v26.4s, v4.4s\n" - "fmla v21.4s, v26.4s, v6.4s\n" - "ldr q19, [x20, x11]\n" - "fmla v9.4s, v23.4s, v0.4s\n" - "ldr q28, [x23, x17]\n" - "fmla v25.4s, v23.4s, v3.4s\n" - "add x20, x20, #16\n" - "fmla v24.4s, v23.4s, v5.4s\n" - "ldr q29, [x22, x11]\n" - "fmla v13.4s, v27.4s, v0.4s\n" - "prfm pldl1keep, [x20, #64]\n" - "fmla v22.4s, v27.4s, v1.4s\n" - "add x22, x22, #16\n" - "fmla v21.4s, v27.4s, v2.4s\n" - "ldr q30, [x23, x11]\n" - "fmla v18.4s, v20.4s, v0.4s\n" - "ldr q23, [%[wbptr]]\n" - "fmla v22.4s, v20.4s, v3.4s\n" - "add x23, x23, #16\n" - "fmla v24.4s, v20.4s, v1.4s\n" - "fmla v21.4s, v20.4s, v4.4s\n" - "fmla v25.4s, v19.4s, v0.4s\n" - "ldr q20, [%[wbptr], #16]\n" - "fmla v22.4s, v28.4s, v0.4s\n" - "ldr q6, [%[wbptr], #32]\n" - "fmla v21.4s, v19.4s, v5.4s\n" - "movi v26.16b, #0\n" - "fmla v24.4s, v19.4s, v3.4s\n" - "ldr q19, [%[wbptr], #64]\n" - "fmax v12.4s, v12.4s, v26.4s\n" - "fmax v11.4s, v11.4s, v26.4s\n" - "fmla v21.4s, v28.4s, v1.4s\n" - "ldr q5, [%[wbptr], #48]\n" - "fmla v24.4s, v29.4s, v0.4s\n" - "ldr q4, [%[wbptr], #80]\n" - "fmax v10.4s, v10.4s, v26.4s\n" - "fmax v9.4s, v9.4s, v26.4s\n" - "fmla v21.4s, v29.4s, v3.4s\n" - "ldr q2, [%[wbptr], #112]\n" - "fmov v27.4s, #6.0\n" - "fmax v8.4s, v8.4s, v26.4s\n" - "fmax v7.4s, v7.4s, v26.4s\n" - "fmax v17.4s, v17.4s, v26.4s\n" - "fmla v21.4s, v30.4s, v0.4s\n" - "ldr q3, [%[wbptr], #96]\n" - "fmin v12.4s, v12.4s, v27.4s\n" - "ldr q1, [%[wbptr], #128]\n" - "fmin v11.4s, v11.4s, v27.4s\n" - "fmin v10.4s, v10.4s, v27.4s\n" - "str q12, [%[outptr0]]\n" - "fmin v9.4s, v9.4s, v27.4s\n" - "str q11, [%[outptr0], %[output_col_stride1]]\n" - "fmin v8.4s, v8.4s, v27.4s\n" - "str q10, [%[outptr0], x27]\n" - "fmin v7.4s, v7.4s, v27.4s\n" - "str q9, [%[outptr0], x28]\n" - "fmin v17.4s, v17.4s, v27.4s\n" - "str q8, [x8]\n" - "fmax v25.4s, v25.4s, v26.4s\n" - "str q7, [x8, %[output_col_stride1]]\n" - "fmax v16.4s, v16.4s, v26.4s\n" - "str q17, [x8, x27]\n" - "fmin v25.4s, v25.4s, v27.4s\n" - "fmin v16.4s, v16.4s, v27.4s\n" - "ldr q0, [%[wbptr], #144]\n" - "str q25, [x8, x28]\n" - "fmax v15.4s, v15.4s, v26.4s\n" - "str q16, [x25]\n" - "fmax v18.4s, v18.4s, v26.4s\n" - "fmin v15.4s, v15.4s, v27.4s\n" - "ldr q28, [%[inptr0]]\n" - "fmin v18.4s, v18.4s, v27.4s\n" - "ldr q25, [x24]\n" - "str q15, [x25, %[output_col_stride1]]\n" - "fmax v24.4s, v24.4s, v26.4s\n" - "str q18, [x25, x27]\n" - "fmax v14.4s, v14.4s, v26.4s\n" - "fmin v24.4s, v24.4s, v27.4s\n" - "ldr q18, [%[inptr0], %[input_col_stride1]]\n" - "fmin v14.4s, v14.4s, v27.4s\n" - "ldr q30, [x9]\n" - "str q24, [x25, x28]\n" - "fmax v13.4s, v13.4s, v26.4s\n" - "str q14, [x26]\n" - "fmax v22.4s, v22.4s, v26.4s\n" - "fmin v13.4s, v13.4s, v27.4s\n" - "ldr q29, [x24, %[input_col_stride1]]\n" - "fmin v22.4s, v22.4s, v27.4s\n" - "ldr q24, [%[inptr0], x13]\n" - "str q13, [x26, %[output_col_stride1]]\n" - "fmax v21.4s, v21.4s, v26.4s\n" - "str q22, [x26, x27]\n" - "mov v12.16b, v23.16b\n" - "fmin v21.4s, v21.4s, v27.4s\n" - "ldr q27, [x20]\n" - "mov v8.16b, v23.16b\n" - "ldr q22, [x9, %[input_col_stride1]]\n" - "str q21, [x26, x28]\n" - "mov v11.16b, v23.16b\n" - "mov v16.16b, v23.16b\n" - "add %[outptr0], %[outptr0], #16\n" - "mov v7.16b, v23.16b\n" - "add x8, x8, #16\n" - "mov v10.16b, v23.16b\n" - "add x25, x25, #16\n" - "mov v14.16b, v23.16b\n" - "add x26, x26, #16\n" - "mov v15.16b, v23.16b\n" - "mov v17.16b, v23.16b\n" - "mov v9.16b, v23.16b\n" - "fmla v12.4s, v28.4s, v20.4s\n" - "fmla v8.4s, v25.4s, v20.4s\n" - "fmla v11.4s, v18.4s, v20.4s\n" - "fmla v16.4s, v30.4s, v20.4s\n" - "fmla v12.4s, v25.4s, v19.4s\n" - "fmla v8.4s, v30.4s, v19.4s\n" - "fmla v12.4s, v18.4s, v6.4s\n" - "fmla v8.4s, v29.4s, v6.4s\n" - "fmla v12.4s, v30.4s, v2.4s\n" - "fmla v12.4s, v29.4s, v4.4s\n" - "bne 2b\n" - "3:\n" - "mov v13.16b, v23.16b\n" - "ldr q21, [x24, x13]\n" - "mov v18.16b, v23.16b\n" - "prfm pldl1keep, [x24, x10]\n" - "fmla v11.4s, v29.4s, v19.4s\n" - "prfm pldl1keep, [%[inptr0], x21]\n" - "fmla v7.4s, v29.4s, v20.4s\n" - "ldr q25, [%[inptr0], x19]\n" - "fmla v12.4s, v24.4s, v5.4s\n" - "prfm pldl1keep, [x22, #64]\n" - "fmla v11.4s, v24.4s, v6.4s\n" - "prfm pldl1keep, [x20, x16]\n" - "fmla v10.4s, v24.4s, v20.4s\n" - "ldr q24, [x22]\n" - "fmla v8.4s, v27.4s, v2.4s\n" - "prfm pldl1keep, [x9, x10]\n" - "fmla v16.4s, v27.4s, v19.4s\n" - "prfm pldl1keep, [x24, x21]\n" - "fmla v14.4s, v27.4s, v20.4s\n" - "ldr q26, [x20, %[input_col_stride1]]\n" - "fmla v12.4s, v22.4s, v1.4s\n" - "prfm pldl1keep, [%[inptr0], x7]\n" - "fmla v8.4s, v22.4s, v4.4s\n" - "prfm pldl1keep, [x23, #64]\n" - "fmla v11.4s, v22.4s, v2.4s\n" - "prfm pldl1keep, [x22, x16]\n" - "fmla v16.4s, v22.4s, v6.4s\n" - "prfm pldl1keep, [x20, x10]\n" - "fmla v7.4s, v22.4s, v19.4s\n" - "prfm pldl1keep, [x9, x21]\n" - "fmla v15.4s, v22.4s, v20.4s\n" - "ldr q30, [x9, x13]\n" - "fmla v12.4s, v21.4s, v3.4s\n" - "prfm pldl1keep, [x24, x7]\n" - "fmla v8.4s, v21.4s, v5.4s\n" - "prfm pldl1keep, [%[inptr0], x12]\n" - "fmla v11.4s, v21.4s, v4.4s\n" - "prfm pldl1keep, [x23, x16]\n" - "fmla v7.4s, v21.4s, v6.4s\n" - "prfm pldl1keep, [x22, x10]\n" - "fmla v10.4s, v21.4s, v19.4s\n" - "prfm pldl1keep, [x20, x21]\n" - "fmla v17.4s, v21.4s, v20.4s\n" - "ldr q22, [x24, x19]\n" - "fmla v11.4s, v25.4s, v5.4s\n" - "prfm pldl1keep, [x9, x7]\n" - "fmla v10.4s, v25.4s, v6.4s\n" - "prfm pldl1keep, [x24, x12]\n" - "fmla v9.4s, v25.4s, v20.4s\n" - "ldr q21, [%[inptr0], x17]\n" - "fmla v16.4s, v24.4s, v2.4s\n" - "prfm pldl1keep, [x23, x10]\n" - "fmla v14.4s, v24.4s, v19.4s\n" - "ldr q24, [x23]\n" - "fmla v8.4s, v26.4s, v1.4s\n" - "prfm pldl1keep, [x22, x21]\n" - "fmla v16.4s, v26.4s, v4.4s\n" - "prfm pldl1keep, [x20, x7]\n" - "fmla v7.4s, v26.4s, v2.4s\n" - "prfm pldl1keep, [x9, x12]\n" - "fmla v14.4s, v26.4s, v6.4s\n" - "prfm pldl1keep, [x23, x21]\n" - "fmla v15.4s, v26.4s, v19.4s\n" - "prfm pldl1keep, [x22, x7]\n" - "fmla v13.4s, v26.4s, v20.4s\n" - "ldr q26, [x22, %[input_col_stride1]]\n" - "fmla v12.4s, v30.4s, v0.4s\n" - "prfm pldl1keep, [x20, x12]\n" - "fmla v8.4s, v30.4s, v3.4s\n" - "prfm pldl1keep, [x23, x7]\n" - "fmla v11.4s, v30.4s, v1.4s\n" - "prfm pldl1keep, [x22, x12]\n" - "fmla v16.4s, v30.4s, v5.4s\n" - "prfm pldl1keep, [x23, x12]\n" - "fmla v7.4s, v30.4s, v4.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v10.4s, v30.4s, v2.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v15.4s, v30.4s, v6.4s\n" - "fmla v17.4s, v30.4s, v19.4s\n" - "fmla v18.4s, v30.4s, v20.4s\n" - "ldr q27, [x20, x13]\n" - "fmla v11.4s, v22.4s, v3.4s\n" - "fmla v7.4s, v22.4s, v5.4s\n" - "fmla v10.4s, v22.4s, v4.4s\n" - "fmla v17.4s, v22.4s, v6.4s\n" - "fmla v9.4s, v22.4s, v19.4s\n" - "fmla v14.4s, v24.4s, v2.4s\n" - "mov v25.16b, v23.16b\n" - "fmla v16.4s, v26.4s, v1.4s\n" - "fmla v10.4s, v21.4s, v5.4s\n" - "fmla v15.4s, v26.4s, v2.4s\n" - "fmla v25.4s, v22.4s, v20.4s\n" - "ldr q28, [x9, x19]\n" - "fmla v9.4s, v21.4s, v6.4s\n" - "ldr q29, [x24, x17]\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "fmla v13.4s, v26.4s, v19.4s\n" - "mov v22.16b, v23.16b\n" - "fmla v8.4s, v27.4s, v0.4s\n" - "fmla v16.4s, v27.4s, v3.4s\n" - "fmla v7.4s, v27.4s, v1.4s\n" - "fmla v14.4s, v27.4s, v5.4s\n" - "fmla v15.4s, v27.4s, v4.4s\n" - "fmla v17.4s, v27.4s, v2.4s\n" - "fmla v13.4s, v27.4s, v6.4s\n" - "fmla v18.4s, v27.4s, v19.4s\n" - "fmla v22.4s, v27.4s, v20.4s\n" - "mov v24.16b, v23.16b\n" - "mov v21.16b, v23.16b\n" - "fmla v11.4s, v28.4s, v0.4s\n" - "fmla v7.4s, v28.4s, v3.4s\n" - "fmla v10.4s, v28.4s, v1.4s\n" - "fmla v15.4s, v28.4s, v5.4s\n" - "fmla v17.4s, v28.4s, v4.4s\n" - "fmla v9.4s, v28.4s, v2.4s\n" - "fmla v18.4s, v28.4s, v6.4s\n" - "fmla v25.4s, v28.4s, v19.4s\n" - "fmla v24.4s, v28.4s, v20.4s\n" - "ldr q23, [%[inptr0], x11]\n" - "fmla v10.4s, v29.4s, v3.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v17.4s, v29.4s, v5.4s\n" - "fmla v9.4s, v29.4s, v4.4s\n" - "fmla v25.4s, v29.4s, v6.4s\n" - "ldr q30, [x23, %[input_col_stride1]]\n" - "fmla v14.4s, v30.4s, v1.4s\n" - "fmla v13.4s, v30.4s, v2.4s\n" - "fmla v9.4s, v23.4s, v5.4s\n" - "ldr q23, [x22, x13]\n" - "fmla v16.4s, v23.4s, v0.4s\n" - "ldr q29, [x20, x19]\n" - "fmla v14.4s, v23.4s, v3.4s\n" - "fmla v15.4s, v23.4s, v1.4s\n" - "fmla v13.4s, v23.4s, v4.4s\n" - "fmla v18.4s, v23.4s, v2.4s\n" - "fmla v22.4s, v23.4s, v19.4s\n" - "ldr q23, [x9, x17]\n" - "fmla v7.4s, v29.4s, v0.4s\n" - "fmla v15.4s, v29.4s, v3.4s\n" - "fmla v17.4s, v29.4s, v1.4s\n" - "fmla v13.4s, v29.4s, v5.4s\n" - "fmla v18.4s, v29.4s, v4.4s\n" - "fmla v25.4s, v29.4s, v2.4s\n" - "fmla v22.4s, v29.4s, v6.4s\n" - "fmla v24.4s, v29.4s, v19.4s\n" - "fmla v21.4s, v29.4s, v20.4s\n" - "ldr q26, [x24, x11]\n" - "fmla v10.4s, v23.4s, v0.4s\n" - "ldr q28, [x23, x13]\n" - "fmla v17.4s, v23.4s, v3.4s\n" - "add x24, x24, #16\n" - "fmla v9.4s, v23.4s, v1.4s\n" - "fmla v18.4s, v23.4s, v5.4s\n" - "fmla v25.4s, v23.4s, v4.4s\n" - "fmla v24.4s, v23.4s, v6.4s\n" - "fmla v14.4s, v28.4s, v0.4s\n" - "ldr q20, [x22, x19]\n" - "fmla v9.4s, v26.4s, v3.4s\n" - "fmla v13.4s, v28.4s, v1.4s\n" - "fmla v25.4s, v26.4s, v5.4s\n" - "ldr q26, [x20, x17]\n" - "fmla v22.4s, v28.4s, v2.4s\n" - "ldr q23, [x9, x11]\n" - "fmla v15.4s, v20.4s, v0.4s\n" - "add x9, x9, #16\n" - "fmla v13.4s, v20.4s, v3.4s\n" - "fmla v18.4s, v20.4s, v1.4s\n" - "fmla v22.4s, v20.4s, v4.4s\n" - "fmla v24.4s, v20.4s, v2.4s\n" - "fmla v21.4s, v20.4s, v19.4s\n" - "ldr q27, [x23, x19]\n" - "fmla v17.4s, v26.4s, v0.4s\n" - "ldr q20, [x22, x17]\n" - "fmla v18.4s, v26.4s, v3.4s\n" - "fmla v25.4s, v26.4s, v1.4s\n" - "fmla v22.4s, v26.4s, v5.4s\n" - "fmla v24.4s, v26.4s, v4.4s\n" - "fmla v21.4s, v26.4s, v6.4s\n" - "ldr q19, [x20, x11]\n" - "fmla v9.4s, v23.4s, v0.4s\n" - "ldr q28, [x23, x17]\n" - "fmla v25.4s, v23.4s, v3.4s\n" - "add x20, x20, #16\n" - "fmla v24.4s, v23.4s, v5.4s\n" - "ldr q29, [x22, x11]\n" - "fmla v13.4s, v27.4s, v0.4s\n" - "add x22, x22, #16\n" - "fmla v22.4s, v27.4s, v1.4s\n" - "fmla v21.4s, v27.4s, v2.4s\n" - "fmla v18.4s, v20.4s, v0.4s\n" - "ldr q30, [x23, x11]\n" - "fmla v24.4s, v20.4s, v1.4s\n" - "add x23, x23, #16\n" - "fmla v22.4s, v20.4s, v3.4s\n" - "fmla v21.4s, v20.4s, v4.4s\n" - "fmla v25.4s, v19.4s, v0.4s\n" - "movi v26.16b, #0\n" - "fmla v24.4s, v19.4s, v3.4s\n" - "fmov v27.4s, #6.0\n" - "fmla v21.4s, v19.4s, v5.4s\n" - "fmla v22.4s, v28.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v26.4s\n" - "fmax v11.4s, v11.4s, v26.4s\n" - "fmla v24.4s, v29.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v26.4s\n" - "fmla v21.4s, v28.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v27.4s\n" - "fmin v11.4s, v11.4s, v27.4s\n" - "fmin v10.4s, v10.4s, v27.4s\n" - "str q12, [%[outptr0]]\n" - "fmax v9.4s, v9.4s, v26.4s\n" - "str q11, [%[outptr0], %[output_col_stride1]]\n" - "fmla v21.4s, v29.4s, v3.4s\n" - "str q10, [%[outptr0], x27]\n" - "fmin v9.4s, v9.4s, v27.4s\n" - "fmax v8.4s, v8.4s, v26.4s\n" - "fmax v7.4s, v7.4s, v26.4s\n" - "str q9, [%[outptr0], x28]\n" - "fmla v21.4s, v30.4s, v0.4s\n" - "fmin v8.4s, v8.4s, v27.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmin v7.4s, v7.4s, v27.4s\n" - "fmax v17.4s, v17.4s, v26.4s\n" - "str q8, [x8]\n" - "fmax v25.4s, v25.4s, v26.4s\n" - "str q7, [x8, %[output_col_stride1]]\n" - "fmin v17.4s, v17.4s, v27.4s\n" - "fmin v25.4s, v25.4s, v27.4s\n" - "fmax v16.4s, v16.4s, v26.4s\n" - "str q17, [x8, x27]\n" - "fmax v15.4s, v15.4s, v26.4s\n" - "str q25, [x8, x28]\n" - "fmin v16.4s, v16.4s, v27.4s\n" - "fmin v15.4s, v15.4s, v27.4s\n" - "add x8, x8, #16\n" - "str q16, [x25]\n" - "fmax v18.4s, v18.4s, v26.4s\n" - "str q15, [x25, %[output_col_stride1]]\n" - "fmax v24.4s, v24.4s, v26.4s\n" - "fmin v18.4s, v18.4s, v27.4s\n" - "fmax v14.4s, v14.4s, v26.4s\n" - "fmin v24.4s, v24.4s, v27.4s\n" - "fmax v13.4s, v13.4s, v26.4s\n" - "str q18, [x25, x27]\n" - "fmin v14.4s, v14.4s, v27.4s\n" - "str q24, [x25, x28]\n" - "fmin v13.4s, v13.4s, v27.4s\n" - "str q14, [x26]\n" - "fmax v22.4s, v22.4s, v26.4s\n" - "str q13, [x26, %[output_col_stride1]]\n" - "fmax v21.4s, v21.4s, v26.4s\n" - "fmin v22.4s, v22.4s, v27.4s\n" - "add x25, x25, #16\n" - "fmin v21.4s, v21.4s, v27.4s\n" - "str q22, [x26, x27]\n" - "str q21, [x26, x28]\n" - "add x26, x26, #16\n" - "4:\n" - "cbz x14, 7f\n" - "ldr s23, [%[wbptr]]\n" - "mov v12.16b, v23.16b\n" - "ldr s20, [%[wbptr], #4]\n" - "mov v8.16b, v23.16b\n" - "ldr s6, [%[wbptr], #8]\n" - "mov v11.16b, v23.16b\n" - "ldr s5, [%[wbptr], #12]\n" - "mov v16.16b, v23.16b\n" - "ldr s19, [%[wbptr], #16]\n" - "mov v7.16b, v23.16b\n" - "ldr s4, [%[wbptr], #20]\n" - "mov v10.16b, v23.16b\n" - "ldr s3, [%[wbptr], #24]\n" - "mov v14.16b, v23.16b\n" - "ldr s2, [%[wbptr], #28]\n" - "mov v15.16b, v23.16b\n" - "ldr s1, [%[wbptr], #32]\n" - "mov v17.16b, v23.16b\n" - "ldr s0, [%[wbptr], #36]\n" - "mov v9.16b, v23.16b\n" - "ldr s28, [%[inptr0]]\n" - "fmla v12.4s, v28.4s, v20.4s\n" - "ldr s25, [x24]\n" - "fmla v8.4s, v25.4s, v20.4s\n" - "ldr s18, [%[inptr0], %[input_col_stride1]]\n" - "fmla v11.4s, v18.4s, v20.4s\n" - "ldr s30, [x9]\n" - "fmla v12.4s, v25.4s, v19.4s\n" - "ldr s29, [x24, %[input_col_stride1]]\n" - "fmla v8.4s, v30.4s, v19.4s\n" - "ldr s24, [%[inptr0], x13]\n" - "fmla v16.4s, v30.4s, v20.4s\n" - "ldr s27, [x20]\n" - "fmla v12.4s, v18.4s, v6.4s\n" - "ldr s22, [x9, %[input_col_stride1]]\n" - "fmla v8.4s, v29.4s, v6.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "prfm pldl1keep, [x24, #64]\n" - "subs x14, x14, #1\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "prfm pldl1keep, [x9, #64]\n" - "fmla v12.4s, v30.4s, v2.4s\n" - "prfm pldl1keep, [x24, x16]\n" - "prfm pldl1keep, [%[inptr0], x10]\n" - "prfm pldl1keep, [x20, #64]\n" - "prfm pldl1keep, [x9, x16]\n" - "fmla v12.4s, v29.4s, v4.4s\n" - "beq 6f\n" - "5:\n" - "mov v13.16b, v23.16b\n" - "ldr s21, [x24, x13]\n" - "mov v18.16b, v23.16b\n" - "prfm pldl1keep, [x24, x10]\n" - "fmla v11.4s, v29.4s, v19.4s\n" - "prfm pldl1keep, [%[inptr0], x21]\n" - "fmla v7.4s, v29.4s, v20.4s\n" - "ldr s25, [%[inptr0], x19]\n" - "fmla v12.4s, v24.4s, v5.4s\n" - "prfm pldl1keep, [x22, #64]\n" - "fmla v11.4s, v24.4s, v6.4s\n" - "prfm pldl1keep, [x20, x16]\n" - "fmla v10.4s, v24.4s, v20.4s\n" - "ldr s24, [x22]\n" - "fmla v8.4s, v27.4s, v2.4s\n" - "prfm pldl1keep, [x9, x10]\n" - "fmla v16.4s, v27.4s, v19.4s\n" - "prfm pldl1keep, [x24, x21]\n" - "fmla v14.4s, v27.4s, v20.4s\n" - "ldr s26, [x20, %[input_col_stride1]]\n" - "fmla v12.4s, v22.4s, v1.4s\n" - "prfm pldl1keep, [%[inptr0], x7]\n" - "fmla v8.4s, v22.4s, v4.4s\n" - "prfm pldl1keep, [x23, #64]\n" - "fmla v11.4s, v22.4s, v2.4s\n" - "prfm pldl1keep, [x22, x16]\n" - "fmla v16.4s, v22.4s, v6.4s\n" - "prfm pldl1keep, [x20, x10]\n" - "fmla v7.4s, v22.4s, v19.4s\n" - "prfm pldl1keep, [x9, x21]\n" - "fmla v15.4s, v22.4s, v20.4s\n" - "ldr s30, [x9, x13]\n" - "fmla v12.4s, v21.4s, v3.4s\n" - "prfm pldl1keep, [x24, x7]\n" - "fmla v8.4s, v21.4s, v5.4s\n" - "prfm pldl1keep, [%[inptr0], x12]\n" - "fmla v11.4s, v21.4s, v4.4s\n" - "prfm pldl1keep, [x23, x16]\n" - "fmla v7.4s, v21.4s, v6.4s\n" - "prfm pldl1keep, [x22, x10]\n" - "fmla v10.4s, v21.4s, v19.4s\n" - "prfm pldl1keep, [x20, x21]\n" - "fmla v17.4s, v21.4s, v20.4s\n" - "ldr s22, [x24, x19]\n" - "fmla v11.4s, v25.4s, v5.4s\n" - "prfm pldl1keep, [x9, x7]\n" - "fmla v10.4s, v25.4s, v6.4s\n" - "prfm pldl1keep, [x24, x12]\n" - "fmla v9.4s, v25.4s, v20.4s\n" - "ldr s21, [%[inptr0], x17]\n" - "fmla v16.4s, v24.4s, v2.4s\n" - "prfm pldl1keep, [x23, x10]\n" - "fmla v14.4s, v24.4s, v19.4s\n" - "ldr s24, [x23]\n" - "fmla v8.4s, v26.4s, v1.4s\n" - "prfm pldl1keep, [x22, x21]\n" - "fmla v16.4s, v26.4s, v4.4s\n" - "prfm pldl1keep, [x20, x7]\n" - "fmla v7.4s, v26.4s, v2.4s\n" - "prfm pldl1keep, [x9, x12]\n" - "fmla v14.4s, v26.4s, v6.4s\n" - "prfm pldl1keep, [x23, x21]\n" - "fmla v15.4s, v26.4s, v19.4s\n" - "prfm pldl1keep, [x22, x7]\n" - "fmla v13.4s, v26.4s, v20.4s\n" - "ldr s26, [x22, %[input_col_stride1]]\n" - "fmla v12.4s, v30.4s, v0.4s\n" - "prfm pldl1keep, [x20, x12]\n" - "fmla v8.4s, v30.4s, v3.4s\n" - "prfm pldl1keep, [x23, x7]\n" - "fmla v11.4s, v30.4s, v1.4s\n" - "prfm pldl1keep, [x22, x12]\n" - "fmla v16.4s, v30.4s, v5.4s\n" - "prfm pldl1keep, [x23, x12]\n" - "fmla v7.4s, v30.4s, v4.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v10.4s, v30.4s, v2.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v15.4s, v30.4s, v6.4s\n" - "subs x14, x14, #1\n" - "fmla v17.4s, v30.4s, v19.4s\n" - "fmla v18.4s, v30.4s, v20.4s\n" - "mov v25.16b, v23.16b\n" - "fmla v11.4s, v22.4s, v3.4s\n" - "fmla v7.4s, v22.4s, v5.4s\n" - "fmla v10.4s, v22.4s, v4.4s\n" - "fmla v17.4s, v22.4s, v6.4s\n" - "fmla v9.4s, v22.4s, v19.4s\n" - "fmla v25.4s, v22.4s, v20.4s\n" - "ldr s27, [x20, x13]\n" - "fmla v10.4s, v21.4s, v5.4s\n" - "fmla v14.4s, v24.4s, v2.4s\n" - "mov v22.16b, v23.16b\n" - "fmla v9.4s, v21.4s, v6.4s\n" - "mov v24.16b, v23.16b\n" - "mov v21.16b, v23.16b\n" - "fmla v16.4s, v26.4s, v1.4s\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "fmla v15.4s, v26.4s, v2.4s\n" - "fmla v13.4s, v26.4s, v19.4s\n" - "fmla v8.4s, v27.4s, v0.4s\n" - "ldr s28, [x9, x19]\n" - "fmla v16.4s, v27.4s, v3.4s\n" - "fmla v7.4s, v27.4s, v1.4s\n" - "fmla v14.4s, v27.4s, v5.4s\n" - "fmla v15.4s, v27.4s, v4.4s\n" - "fmla v17.4s, v27.4s, v2.4s\n" - "fmla v13.4s, v27.4s, v6.4s\n" - "fmla v18.4s, v27.4s, v19.4s\n" - "fmla v22.4s, v27.4s, v20.4s\n" - "fmla v11.4s, v28.4s, v0.4s\n" - "ldr s29, [x24, x17]\n" - "fmla v7.4s, v28.4s, v3.4s\n" - "fmla v10.4s, v28.4s, v1.4s\n" - "fmla v15.4s, v28.4s, v5.4s\n" - "fmla v17.4s, v28.4s, v4.4s\n" - "fmla v9.4s, v28.4s, v2.4s\n" - "fmla v18.4s, v28.4s, v6.4s\n" - "fmla v25.4s, v28.4s, v19.4s\n" - "fmla v24.4s, v28.4s, v20.4s\n" - "fmla v10.4s, v29.4s, v3.4s\n" - "ldr s23, [%[inptr0], x11]\n" - "fmla v17.4s, v29.4s, v5.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v9.4s, v29.4s, v4.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "fmla v25.4s, v29.4s, v6.4s\n" - "ldr s30, [x23, %[input_col_stride1]]\n" - "fmla v14.4s, v30.4s, v1.4s\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "fmla v9.4s, v23.4s, v5.4s\n" - "ldr s23, [x22, x13]\n" - "fmla v13.4s, v30.4s, v2.4s\n" - "ldr s29, [x20, x19]\n" - "fmla v16.4s, v23.4s, v0.4s\n" - "prfm pldl1keep, [%[inptr0], x10]\n" - "fmla v14.4s, v23.4s, v3.4s\n" - "fmla v15.4s, v23.4s, v1.4s\n" - "fmla v13.4s, v23.4s, v4.4s\n" - "fmla v18.4s, v23.4s, v2.4s\n" - "fmla v22.4s, v23.4s, v19.4s\n" - "ldr s23, [x9, x17]\n" - "fmla v7.4s, v29.4s, v0.4s\n" - "fmla v15.4s, v29.4s, v3.4s\n" - "fmla v17.4s, v29.4s, v1.4s\n" - "fmla v13.4s, v29.4s, v5.4s\n" - "fmla v18.4s, v29.4s, v4.4s\n" - "fmla v25.4s, v29.4s, v2.4s\n" - "fmla v22.4s, v29.4s, v6.4s\n" - "fmla v24.4s, v29.4s, v19.4s\n" - "fmla v21.4s, v29.4s, v20.4s\n" - "ldr s26, [x24, x11]\n" - "fmla v10.4s, v23.4s, v0.4s\n" - "ldr s28, [x23, x13]\n" - "fmla v17.4s, v23.4s, v3.4s\n" - "add x24, x24, #4\n" - "fmla v9.4s, v23.4s, v1.4s\n" - "prfm pldl1keep, [x24, #64]\n" - "fmla v18.4s, v23.4s, v5.4s\n" - "prfm pldl1keep, [x24, x16]\n" - "fmla v25.4s, v23.4s, v4.4s\n" - "fmla v24.4s, v23.4s, v6.4s\n" - "fmla v9.4s, v26.4s, v3.4s\n" - "ldr s20, [x22, x19]\n" - "fmla v14.4s, v28.4s, v0.4s\n" - "fmla v13.4s, v28.4s, v1.4s\n" - "fmla v25.4s, v26.4s, v5.4s\n" - "ldr s26, [x20, x17]\n" - "fmla v22.4s, v28.4s, v2.4s\n" - "ldr s23, [x9, x11]\n" - "fmla v15.4s, v20.4s, v0.4s\n" - "add x9, x9, #4\n" - "fmla v13.4s, v20.4s, v3.4s\n" - "prfm pldl1keep, [x9, #64]\n" - "fmla v18.4s, v20.4s, v1.4s\n" - "prfm pldl1keep, [x9, x16]\n" - "fmla v22.4s, v20.4s, v4.4s\n" - "fmla v24.4s, v20.4s, v2.4s\n" - "fmla v21.4s, v20.4s, v19.4s\n" - "ldr s27, [x23, x19]\n" - "fmla v17.4s, v26.4s, v0.4s\n" - "ldr s20, [x22, x17]\n" - "fmla v18.4s, v26.4s, v3.4s\n" - "fmla v25.4s, v26.4s, v1.4s\n" - "fmla v22.4s, v26.4s, v5.4s\n" - "fmla v24.4s, v26.4s, v4.4s\n" - "fmla v21.4s, v26.4s, v6.4s\n" - "ldr s19, [x20, x11]\n" - "fmla v9.4s, v23.4s, v0.4s\n" - "ldr s28, [x23, x17]\n" - "fmla v25.4s, v23.4s, v3.4s\n" - "add x20, x20, #4\n" - "fmla v24.4s, v23.4s, v5.4s\n" - "ldr s29, [x22, x11]\n" - "fmla v13.4s, v27.4s, v0.4s\n" - "prfm pldl1keep, [x20, #64]\n" - "fmla v22.4s, v27.4s, v1.4s\n" - "add x22, x22, #4\n" - "fmla v21.4s, v27.4s, v2.4s\n" - "ldr s30, [x23, x11]\n" - "fmla v18.4s, v20.4s, v0.4s\n" - "ldr s23, [%[wbptr]]\n" - "fmla v22.4s, v20.4s, v3.4s\n" - "add x23, x23, #4\n" - "fmla v24.4s, v20.4s, v1.4s\n" - "fmla v21.4s, v20.4s, v4.4s\n" - "fmla v25.4s, v19.4s, v0.4s\n" - "ldr s20, [%[wbptr], #4]\n" - "fmla v22.4s, v28.4s, v0.4s\n" - "ldr s6, [%[wbptr], #8]\n" - "fmla v21.4s, v19.4s, v5.4s\n" - "movi v26.16b, #0\n" - "fmla v24.4s, v19.4s, v3.4s\n" - "ldr s19, [%[wbptr], #16]\n" - "fmax v12.4s, v12.4s, v26.4s\n" - "fmax v11.4s, v11.4s, v26.4s\n" - "fmla v21.4s, v28.4s, v1.4s\n" - "ldr s5, [%[wbptr], #12]\n" - "fmla v24.4s, v29.4s, v0.4s\n" - "ldr s4, [%[wbptr], #20]\n" - "fmax v10.4s, v10.4s, v26.4s\n" - "fmax v9.4s, v9.4s, v26.4s\n" - "fmla v21.4s, v29.4s, v3.4s\n" - "ldr s2, [%[wbptr], #28]\n" - "fmov v27.4s, #6.0\n" - "fmax v8.4s, v8.4s, v26.4s\n" - "fmax v7.4s, v7.4s, v26.4s\n" - "fmax v17.4s, v17.4s, v26.4s\n" - "fmla v21.4s, v30.4s, v0.4s\n" - "ldr s3, [%[wbptr], #24]\n" - "fmin v12.4s, v12.4s, v27.4s\n" - "ldr s1, [%[wbptr], #32]\n" - "fmin v11.4s, v11.4s, v27.4s\n" - "fmin v10.4s, v10.4s, v27.4s\n" - "str s12, [%[outptr0]]\n" - "fmin v9.4s, v9.4s, v27.4s\n" - "str s11, [%[outptr0], %[output_col_stride1]]\n" - "fmin v8.4s, v8.4s, v27.4s\n" - "str s10, [%[outptr0], x27]\n" - "fmin v7.4s, v7.4s, v27.4s\n" - "str s9, [%[outptr0], x28]\n" - "fmin v17.4s, v17.4s, v27.4s\n" - "str s8, [x8]\n" - "fmax v25.4s, v25.4s, v26.4s\n" - "str s7, [x8, %[output_col_stride1]]\n" - "fmax v16.4s, v16.4s, v26.4s\n" - "str s17, [x8, x27]\n" - "fmin v25.4s, v25.4s, v27.4s\n" - "fmin v16.4s, v16.4s, v27.4s\n" - "ldr s0, [%[wbptr], #36]\n" - "str s25, [x8, x28]\n" - "fmax v15.4s, v15.4s, v26.4s\n" - "str s16, [x25]\n" - "fmax v18.4s, v18.4s, v26.4s\n" - "fmin v15.4s, v15.4s, v27.4s\n" - "ldr s28, [%[inptr0]]\n" - "fmin v18.4s, v18.4s, v27.4s\n" - "ldr s25, [x24]\n" - "str s15, [x25, %[output_col_stride1]]\n" - "fmax v24.4s, v24.4s, v26.4s\n" - "str s18, [x25, x27]\n" - "fmax v14.4s, v14.4s, v26.4s\n" - "fmin v24.4s, v24.4s, v27.4s\n" - "ldr s18, [%[inptr0], %[input_col_stride1]]\n" - "fmin v14.4s, v14.4s, v27.4s\n" - "ldr s30, [x9]\n" - "str s24, [x25, x28]\n" - "fmax v13.4s, v13.4s, v26.4s\n" - "str s14, [x26]\n" - "fmax v22.4s, v22.4s, v26.4s\n" - "fmin v13.4s, v13.4s, v27.4s\n" - "ldr s29, [x24, %[input_col_stride1]]\n" - "fmin v22.4s, v22.4s, v27.4s\n" - "ldr s24, [%[inptr0], x13]\n" - "str s13, [x26, %[output_col_stride1]]\n" - "fmax v21.4s, v21.4s, v26.4s\n" - "str s22, [x26, x27]\n" - "mov v12.16b, v23.16b\n" - "fmin v21.4s, v21.4s, v27.4s\n" - "ldr s27, [x20]\n" - "mov v8.16b, v23.16b\n" - "ldr s22, [x9, %[input_col_stride1]]\n" - "str s21, [x26, x28]\n" - "mov v11.16b, v23.16b\n" - "mov v16.16b, v23.16b\n" - "add %[outptr0], %[outptr0], #4\n" - "mov v7.16b, v23.16b\n" - "add x8, x8, #4\n" - "mov v10.16b, v23.16b\n" - "add x25, x25, #4\n" - "mov v14.16b, v23.16b\n" - "add x26, x26, #4\n" - "mov v15.16b, v23.16b\n" - "mov v17.16b, v23.16b\n" - "mov v9.16b, v23.16b\n" - "fmla v12.4s, v28.4s, v20.4s\n" - "fmla v8.4s, v25.4s, v20.4s\n" - "fmla v11.4s, v18.4s, v20.4s\n" - "fmla v16.4s, v30.4s, v20.4s\n" - "fmla v12.4s, v25.4s, v19.4s\n" - "fmla v8.4s, v30.4s, v19.4s\n" - "fmla v12.4s, v18.4s, v6.4s\n" - "fmla v8.4s, v29.4s, v6.4s\n" - "fmla v12.4s, v30.4s, v2.4s\n" - "fmla v12.4s, v29.4s, v4.4s\n" - "bne 5b\n" - "6:\n" - "mov v13.16b, v23.16b\n" - "ldr s21, [x24, x13]\n" - "mov v18.16b, v23.16b\n" - "prfm pldl1keep, [x24, x10]\n" - "fmla v11.4s, v29.4s, v19.4s\n" - "prfm pldl1keep, [%[inptr0], x21]\n" - "fmla v7.4s, v29.4s, v20.4s\n" - "ldr s25, [%[inptr0], x19]\n" - "fmla v12.4s, v24.4s, v5.4s\n" - "prfm pldl1keep, [x22, #64]\n" - "fmla v11.4s, v24.4s, v6.4s\n" - "prfm pldl1keep, [x20, x16]\n" - "fmla v10.4s, v24.4s, v20.4s\n" - "ldr s24, [x22]\n" - "fmla v8.4s, v27.4s, v2.4s\n" - "prfm pldl1keep, [x9, x10]\n" - "fmla v16.4s, v27.4s, v19.4s\n" - "prfm pldl1keep, [x24, x21]\n" - "fmla v14.4s, v27.4s, v20.4s\n" - "ldr s26, [x20, %[input_col_stride1]]\n" - "fmla v12.4s, v22.4s, v1.4s\n" - "prfm pldl1keep, [%[inptr0], x7]\n" - "fmla v8.4s, v22.4s, v4.4s\n" - "prfm pldl1keep, [x23, #64]\n" - "fmla v11.4s, v22.4s, v2.4s\n" - "prfm pldl1keep, [x22, x16]\n" - "fmla v16.4s, v22.4s, v6.4s\n" - "prfm pldl1keep, [x20, x10]\n" - "fmla v7.4s, v22.4s, v19.4s\n" - "prfm pldl1keep, [x9, x21]\n" - "fmla v15.4s, v22.4s, v20.4s\n" - "ldr s30, [x9, x13]\n" - "fmla v12.4s, v21.4s, v3.4s\n" - "prfm pldl1keep, [x24, x7]\n" - "fmla v8.4s, v21.4s, v5.4s\n" - "prfm pldl1keep, [%[inptr0], x12]\n" - "fmla v11.4s, v21.4s, v4.4s\n" - "prfm pldl1keep, [x23, x16]\n" - "fmla v7.4s, v21.4s, v6.4s\n" - "prfm pldl1keep, [x22, x10]\n" - "fmla v10.4s, v21.4s, v19.4s\n" - "prfm pldl1keep, [x20, x21]\n" - "fmla v17.4s, v21.4s, v20.4s\n" - "ldr s22, [x24, x19]\n" - "fmla v11.4s, v25.4s, v5.4s\n" - "prfm pldl1keep, [x9, x7]\n" - "fmla v10.4s, v25.4s, v6.4s\n" - "prfm pldl1keep, [x24, x12]\n" - "fmla v9.4s, v25.4s, v20.4s\n" - "ldr s21, [%[inptr0], x17]\n" - "fmla v16.4s, v24.4s, v2.4s\n" - "prfm pldl1keep, [x23, x10]\n" - "fmla v14.4s, v24.4s, v19.4s\n" - "ldr s24, [x23]\n" - "fmla v8.4s, v26.4s, v1.4s\n" - "prfm pldl1keep, [x22, x21]\n" - "fmla v16.4s, v26.4s, v4.4s\n" - "prfm pldl1keep, [x20, x7]\n" - "fmla v7.4s, v26.4s, v2.4s\n" - "prfm pldl1keep, [x9, x12]\n" - "fmla v14.4s, v26.4s, v6.4s\n" - "prfm pldl1keep, [x23, x21]\n" - "fmla v15.4s, v26.4s, v19.4s\n" - "prfm pldl1keep, [x22, x7]\n" - "fmla v13.4s, v26.4s, v20.4s\n" - "ldr s26, [x22, %[input_col_stride1]]\n" - "fmla v12.4s, v30.4s, v0.4s\n" - "prfm pldl1keep, [x20, x12]\n" - "fmla v8.4s, v30.4s, v3.4s\n" - "prfm pldl1keep, [x23, x7]\n" - "fmla v11.4s, v30.4s, v1.4s\n" - "prfm pldl1keep, [x22, x12]\n" - "fmla v16.4s, v30.4s, v5.4s\n" - "prfm pldl1keep, [x23, x12]\n" - "fmla v7.4s, v30.4s, v4.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v10.4s, v30.4s, v2.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v15.4s, v30.4s, v6.4s\n" - "fmla v17.4s, v30.4s, v19.4s\n" - "fmla v18.4s, v30.4s, v20.4s\n" - "ldr s27, [x20, x13]\n" - "fmla v11.4s, v22.4s, v3.4s\n" - "fmla v7.4s, v22.4s, v5.4s\n" - "fmla v10.4s, v22.4s, v4.4s\n" - "fmla v17.4s, v22.4s, v6.4s\n" - "fmla v9.4s, v22.4s, v19.4s\n" - "fmla v14.4s, v24.4s, v2.4s\n" - "mov v25.16b, v23.16b\n" - "fmla v16.4s, v26.4s, v1.4s\n" - "fmla v10.4s, v21.4s, v5.4s\n" - "fmla v15.4s, v26.4s, v2.4s\n" - "fmla v25.4s, v22.4s, v20.4s\n" - "ldr s28, [x9, x19]\n" - "fmla v9.4s, v21.4s, v6.4s\n" - "ldr s29, [x24, x17]\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "fmla v13.4s, v26.4s, v19.4s\n" - "mov v22.16b, v23.16b\n" - "fmla v8.4s, v27.4s, v0.4s\n" - "fmla v16.4s, v27.4s, v3.4s\n" - "fmla v7.4s, v27.4s, v1.4s\n" - "fmla v14.4s, v27.4s, v5.4s\n" - "fmla v15.4s, v27.4s, v4.4s\n" - "fmla v17.4s, v27.4s, v2.4s\n" - "fmla v13.4s, v27.4s, v6.4s\n" - "fmla v18.4s, v27.4s, v19.4s\n" - "fmla v22.4s, v27.4s, v20.4s\n" - "mov v24.16b, v23.16b\n" - "mov v21.16b, v23.16b\n" - "fmla v11.4s, v28.4s, v0.4s\n" - "fmla v7.4s, v28.4s, v3.4s\n" - "fmla v10.4s, v28.4s, v1.4s\n" - "fmla v15.4s, v28.4s, v5.4s\n" - "fmla v17.4s, v28.4s, v4.4s\n" - "fmla v9.4s, v28.4s, v2.4s\n" - "fmla v18.4s, v28.4s, v6.4s\n" - "fmla v25.4s, v28.4s, v19.4s\n" - "fmla v24.4s, v28.4s, v20.4s\n" - "ldr s23, [%[inptr0], x11]\n" - "fmla v10.4s, v29.4s, v3.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v17.4s, v29.4s, v5.4s\n" - "fmla v9.4s, v29.4s, v4.4s\n" - "fmla v25.4s, v29.4s, v6.4s\n" - "ldr s30, [x23, %[input_col_stride1]]\n" - "fmla v14.4s, v30.4s, v1.4s\n" - "fmla v13.4s, v30.4s, v2.4s\n" - "fmla v9.4s, v23.4s, v5.4s\n" - "ldr s23, [x22, x13]\n" - "fmla v16.4s, v23.4s, v0.4s\n" - "ldr s29, [x20, x19]\n" - "fmla v14.4s, v23.4s, v3.4s\n" - "fmla v15.4s, v23.4s, v1.4s\n" - "fmla v13.4s, v23.4s, v4.4s\n" - "fmla v18.4s, v23.4s, v2.4s\n" - "fmla v22.4s, v23.4s, v19.4s\n" - "ldr s23, [x9, x17]\n" - "fmla v7.4s, v29.4s, v0.4s\n" - "fmla v15.4s, v29.4s, v3.4s\n" - "fmla v17.4s, v29.4s, v1.4s\n" - "fmla v13.4s, v29.4s, v5.4s\n" - "fmla v18.4s, v29.4s, v4.4s\n" - "fmla v25.4s, v29.4s, v2.4s\n" - "fmla v22.4s, v29.4s, v6.4s\n" - "fmla v24.4s, v29.4s, v19.4s\n" - "fmla v21.4s, v29.4s, v20.4s\n" - "ldr s26, [x24, x11]\n" - "fmla v10.4s, v23.4s, v0.4s\n" - "ldr s28, [x23, x13]\n" - "fmla v17.4s, v23.4s, v3.4s\n" - "add x24, x24, #4\n" - "fmla v9.4s, v23.4s, v1.4s\n" - "fmla v18.4s, v23.4s, v5.4s\n" - "fmla v25.4s, v23.4s, v4.4s\n" - "fmla v24.4s, v23.4s, v6.4s\n" - "fmla v14.4s, v28.4s, v0.4s\n" - "ldr s20, [x22, x19]\n" - "fmla v9.4s, v26.4s, v3.4s\n" - "fmla v13.4s, v28.4s, v1.4s\n" - "fmla v25.4s, v26.4s, v5.4s\n" - "ldr s26, [x20, x17]\n" - "fmla v22.4s, v28.4s, v2.4s\n" - "ldr s23, [x9, x11]\n" - "fmla v15.4s, v20.4s, v0.4s\n" - "add x9, x9, #4\n" - "fmla v13.4s, v20.4s, v3.4s\n" - "fmla v18.4s, v20.4s, v1.4s\n" - "fmla v22.4s, v20.4s, v4.4s\n" - "fmla v24.4s, v20.4s, v2.4s\n" - "fmla v21.4s, v20.4s, v19.4s\n" - "ldr s27, [x23, x19]\n" - "fmla v17.4s, v26.4s, v0.4s\n" - "ldr s20, [x22, x17]\n" - "fmla v18.4s, v26.4s, v3.4s\n" - "fmla v25.4s, v26.4s, v1.4s\n" - "fmla v22.4s, v26.4s, v5.4s\n" - "fmla v24.4s, v26.4s, v4.4s\n" - "fmla v21.4s, v26.4s, v6.4s\n" - "ldr s19, [x20, x11]\n" - "fmla v9.4s, v23.4s, v0.4s\n" - "ldr s28, [x23, x17]\n" - "fmla v25.4s, v23.4s, v3.4s\n" - "add x20, x20, #4\n" - "fmla v24.4s, v23.4s, v5.4s\n" - "ldr s29, [x22, x11]\n" - "fmla v13.4s, v27.4s, v0.4s\n" - "add x22, x22, #4\n" - "fmla v22.4s, v27.4s, v1.4s\n" - "fmla v21.4s, v27.4s, v2.4s\n" - "fmla v18.4s, v20.4s, v0.4s\n" - "ldr s30, [x23, x11]\n" - "fmla v24.4s, v20.4s, v1.4s\n" - "add x23, x23, #4\n" - "fmla v22.4s, v20.4s, v3.4s\n" - "fmla v21.4s, v20.4s, v4.4s\n" - "fmla v25.4s, v19.4s, v0.4s\n" - "movi v26.16b, #0\n" - "fmla v24.4s, v19.4s, v3.4s\n" - "fmov v27.4s, #6.0\n" - "fmla v21.4s, v19.4s, v5.4s\n" - "fmla v22.4s, v28.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v26.4s\n" - "fmax v11.4s, v11.4s, v26.4s\n" - "fmla v24.4s, v29.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v26.4s\n" - "fmla v21.4s, v28.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v27.4s\n" - "fmin v11.4s, v11.4s, v27.4s\n" - "fmin v10.4s, v10.4s, v27.4s\n" - "str s12, [%[outptr0]]\n" - "fmax v9.4s, v9.4s, v26.4s\n" - "str s11, [%[outptr0], %[output_col_stride1]]\n" - "fmla v21.4s, v29.4s, v3.4s\n" - "str s10, [%[outptr0], x27]\n" - "fmin v9.4s, v9.4s, v27.4s\n" - "fmax v8.4s, v8.4s, v26.4s\n" - "fmax v7.4s, v7.4s, v26.4s\n" - "str s9, [%[outptr0], x28]\n" - "fmla v21.4s, v30.4s, v0.4s\n" - "fmin v8.4s, v8.4s, v27.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmin v7.4s, v7.4s, v27.4s\n" - "fmax v17.4s, v17.4s, v26.4s\n" - "str s8, [x8]\n" - "fmax v25.4s, v25.4s, v26.4s\n" - "str s7, [x8, %[output_col_stride1]]\n" - "fmin v17.4s, v17.4s, v27.4s\n" - "fmin v25.4s, v25.4s, v27.4s\n" - "fmax v16.4s, v16.4s, v26.4s\n" - "str s17, [x8, x27]\n" - "fmax v15.4s, v15.4s, v26.4s\n" - "str s25, [x8, x28]\n" - "fmin v16.4s, v16.4s, v27.4s\n" - "fmin v15.4s, v15.4s, v27.4s\n" - "add x8, x8, #4\n" - "str s16, [x25]\n" - "fmax v18.4s, v18.4s, v26.4s\n" - "str s15, [x25, %[output_col_stride1]]\n" - "fmax v24.4s, v24.4s, v26.4s\n" - "fmin v18.4s, v18.4s, v27.4s\n" - "fmax v14.4s, v14.4s, v26.4s\n" - "fmin v24.4s, v24.4s, v27.4s\n" - "fmax v13.4s, v13.4s, v26.4s\n" - "str s18, [x25, x27]\n" - "fmin v14.4s, v14.4s, v27.4s\n" - "str s24, [x25, x28]\n" - "fmin v13.4s, v13.4s, v27.4s\n" - "str s14, [x26]\n" - "fmax v22.4s, v22.4s, v26.4s\n" - "str s13, [x26, %[output_col_stride1]]\n" - "fmax v21.4s, v21.4s, v26.4s\n" - "fmin v22.4s, v22.4s, v27.4s\n" - "add x25, x25, #4\n" - "fmin v21.4s, v21.4s, v27.4s\n" - "str s22, [x26, x27]\n" - "str s21, [x26, x28]\n" - "add x26, x26, #4\n" - "7:\n" - : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr) - : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory" - ); -} - -#endif // __aarch64__ - -template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>; - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp deleted file mode 100644 index 27bfb843f6..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "impl_dilated.hpp" - -template class depthwise::DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>; -template class depthwise::DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>; -template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>; -template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>; -template class depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>; -template class depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float, float>; -template class depthwise::DilatedDepthwiseConvolution<4, 4, 5, 5, 1, 1, float, float, float>; -template class depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float, float, float>; - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>; -template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>; -template class depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 1, 1, float16_t, float16_t, float16_t>; -template class depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float16_t, float16_t, float16_t>; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp deleted file mode 100644 index 1bae815613..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once - -#include <deque> -#include <functional> -#include <memory> - -#include "depthwise.hpp" - -namespace depthwise -{ - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols, - typename TIn, typename TBias, typename TOut -> -class DilatedDepthwiseConvolution : public IDepthwiseConvolution -{ - public: - /** Create a new dilated depthwise convolution engine. - */ - DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, - nck::ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - /** Create a new dilated depthwise convolution engine. - */ - DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - // Cannot copy or move a DilatedDepthwiseConvolution. - DilatedDepthwiseConvolution(DilatedDepthwiseConvolution&) = delete; - DilatedDepthwiseConvolution operator=(DilatedDepthwiseConvolution&) = delete; - - /* Set input tensor and stride. */ - void set_input(const void *inptr) override; - void set_input(const void *inptr, int column_stride) override; - void set_input(const void *inptr, int row_stride, int column_stride) override; - void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override; - - /* Set output tensor and stride. */ - void set_output(void *outptr) override; - void set_output(void *outptr, int column_stride) override; - void set_output(void *outptr, int row_stride, int column_stride) override; - void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override; - - static int get_output_size( - int dim_size, - unsigned int padding_before, - unsigned int padding_after, - int dilation_factor - ); - - int output_size( - int dim_size, unsigned int padding_before, unsigned int padding_after - ) const override; - - /* Weights and biases are re-ordered to improve memory access patterns. Use - * these methods to determine the size of the re-pack buffer and to set the - * address (and implicitly reorder the weights and biases into) the buffer. - */ - size_t get_packed_params_size(void) const override; - void set_packed_params_buffer(void *) override; - - void pack_params(const void *weights, const void *biases=nullptr) const override; - void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const override; - void pack_params( - void *buffer, - const void* weights, - unsigned int weight_row_stride, - unsigned int weight_col_stride, - const void *biases=nullptr - ) const override; - - /* Working space is used to pad tensors on the fly. Before running any - * inference check the amount of space required, allocate and provide a - * pointer to the convolution engine. - */ - size_t get_working_space_size(unsigned int nthreads=1) const override; - void set_working_space(void *) override; - - unsigned int get_window(void) const override; - void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override; - - protected: - /** Protected constructor which also accepts a function to construct a new - * subconvolution - */ - DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right, - std::function<IDepthwiseConvolution *(int, int, int, int, int, int, nck::ActivationFunction, unsigned int, unsigned int, unsigned int, unsigned int)> subconvfn - ); - - const int _dilation_factor; - const int _n_input_rows, _n_input_cols, _n_channels; - const int _padding_top, _padding_left; - const int _n_output_rows, _n_output_cols; - - /* Dilated depthwise convolution is performed through repeated calls to - * non-dilated convolutions. If the dilation factor is $n$, then we perform - * $(n + 1)^2$ depthwise convolutions. - */ - using BaseDepthwise = DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - TIn, TBias, TOut - >; - std::deque<std::deque<std::unique_ptr<IDepthwiseConvolution>>> _convs; -}; - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp deleted file mode 100644 index e56583d6b3..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "depthwise_quantized_dilated.hpp" -#include "impl_dilated.hpp" - -namespace depthwise { - -template <unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols> -QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, - KernelCols, StrideRows, StrideCols>:: - QAsymm8DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, nck::ActivationFunction activation, - const qasymm8::QAsymm8Params &weight_quantisation, - const qasymm8::QAsymm8Params &input_quantisation, - const qasymm8::QAsymm8Params &output_quantisation, - unsigned int padding_top, unsigned int padding_left, - unsigned int padding_bottom, unsigned int padding_right) - : QAsymm8DilatedDepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor, - QAsymm8DilatedDepthwiseConvolution::get_output_size( - n_input_rows, padding_top, padding_bottom, dilation_factor), - QAsymm8DilatedDepthwiseConvolution::get_output_size( - n_input_cols, padding_left, padding_right, dilation_factor), - activation, weight_quantisation, input_quantisation, - output_quantisation, padding_top, padding_left, padding_bottom, - padding_right) {} - -template <unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols> -QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, - KernelCols, StrideRows, StrideCols>:: - QAsymm8DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - const qasymm8::QAsymm8Params &weight_quantisation, - const qasymm8::QAsymm8Params &input_quantisation, - const qasymm8::QAsymm8Params &output_quantisation, - unsigned int padding_top, unsigned int padding_left, - unsigned int padding_bottom, unsigned int padding_right) - : QAsymm8DilatedDepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor, - n_output_rows, n_output_cols, activation, weight_quantisation, - input_quantisation, output_quantisation, - qasymm8::QAsymm8RescaleParams::make_rescale_params( - weight_quantisation, input_quantisation, output_quantisation), - padding_top, padding_left, padding_bottom, padding_right) {} - -template <unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols> -QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, - KernelCols, StrideRows, StrideCols>:: - QAsymm8DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, nck::ActivationFunction activation, - const qasymm8::QAsymm8Params &weight_quantisation, - const qasymm8::QAsymm8Params &input_quantisation, - const qasymm8::QAsymm8Params &output_quantisation, - const qasymm8::QAsymm8RescaleParams &rescale_parameters, - unsigned int padding_top, unsigned int padding_left, - unsigned int padding_bottom, unsigned int padding_right) - : QAsymm8DilatedDepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor, - QAsymm8DilatedDepthwiseConvolution::get_output_size( - n_input_rows, padding_top, padding_bottom, dilation_factor), - QAsymm8DilatedDepthwiseConvolution::get_output_size( - n_input_cols, padding_left, padding_right, dilation_factor), - activation, weight_quantisation, input_quantisation, - output_quantisation, rescale_parameters, padding_top, padding_left, - padding_bottom, padding_right) {} - -template <unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols> -QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, - KernelCols, StrideRows, StrideCols>:: - QAsymm8DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - const qasymm8::QAsymm8Params &weight_quantisation, - const qasymm8::QAsymm8Params &input_quantisation, - const qasymm8::QAsymm8Params &output_quantisation, - const qasymm8::QAsymm8RescaleParams &rescale_parameters, - unsigned int padding_top, unsigned int padding_left, - unsigned int padding_bottom, unsigned int padding_right) - : DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, - KernelCols, StrideRows, StrideCols, uint8_t, - int32_t, uint8_t>( - n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor, - n_output_rows, n_output_cols, activation, padding_top, padding_left, - padding_bottom, padding_right, - [weight_quantisation, input_quantisation, output_quantisation, - rescale_parameters]( - const int n_batches, const int n_input_rows, - const int n_input_cols, const int n_channels, - const int n_output_rows, const int n_output_cols, - const nck::ActivationFunction activation, - const unsigned int padding_top, const unsigned int padding_left, - const unsigned int padding_bottom, - const unsigned int padding_right) -> IDepthwiseConvolution * { - return new QAsymm8DepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, - StrideRows, StrideCols>( - n_batches, n_input_rows, n_input_cols, n_channels, - n_output_rows, n_output_cols, activation, weight_quantisation, - input_quantisation, output_quantisation, rescale_parameters, - padding_top, padding_left, padding_bottom, padding_right); - }) {} - -} // namespace depthwise - -template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1>; -template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2>; -template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 1, 1>; -template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 2, 2>; diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp deleted file mode 100644 index 99f0f53792..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "impl_fp16_fp16.hpp" - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -namespace depthwise -{ -template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>; -template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>; -template class DepthwiseConvolution<3, 3, 5, 5, 1, 1, float16_t, float16_t, float16_t>; -template class DepthwiseConvolution<3, 3, 5, 5, 2, 2, float16_t, float16_t, float16_t>; -} // namespace depthwise -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp deleted file mode 100644 index c13dd70a61..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "impl_fp32_fp32.hpp" - -namespace depthwise -{ -template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float, float>; -template class DepthwiseConvolution<4, 4, 5, 5, 1, 1, float, float, float>; -template class DepthwiseConvolution<3, 3, 5, 5, 2, 2, float, float, float>; -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp deleted file mode 100644 index bddae51135..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "impl_base.hpp" - -// TODO Move to common utilities somewhere -template <size_t Size> struct DType { }; -template <> struct DType<1> { using scalar_type = uint8_t; }; -template <> struct DType<2> { using scalar_type = uint16_t; }; -template <> struct DType<4> { using scalar_type = uint32_t; }; - -namespace depthwise -{ - -template <unsigned int KernelRows, unsigned int KernelColumns, size_t WeightSize, size_t BiasSize> -void PackParameters<KernelRows, KernelColumns, WeightSize, BiasSize>::execute( - unsigned int n_channels, - void *buffer, - const void *weights, - const unsigned int weight_row_stride, - const unsigned int weight_col_stride, - const void *biases -) -{ - using TWeight = typename DType<WeightSize>::scalar_type; - using TBias = typename DType<BiasSize>::scalar_type; - - auto buffer_ptr = static_cast<uint8_t *>(buffer); - auto weights_ptr = static_cast<const TWeight *>(weights); - auto biases_ptr = static_cast<const TBias *>(biases); - - const unsigned int veclen = 16 / WeightSize; - for (; n_channels >= veclen; n_channels -= veclen) - { - // Copy biases - for (unsigned int i = 0; i < veclen; i++) - { - auto ptr = reinterpret_cast<TBias *>(buffer_ptr); - *ptr = (biases_ptr == nullptr) ? 0x0 : *(biases_ptr++); - buffer_ptr += BiasSize; - } - - // Copy weights - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelColumns; j++) - { - for (unsigned int c = 0; c < veclen; c++) - { - *(reinterpret_cast<TWeight *>(buffer_ptr)) = weights_ptr[i*weight_row_stride + j*weight_col_stride + c]; - buffer_ptr += WeightSize; - } - } - } - weights_ptr += veclen; - } - for (; n_channels; n_channels--) - { - // Copy bias - auto ptr = reinterpret_cast<TBias *>(buffer_ptr); - *ptr = (biases_ptr == nullptr) ? 0x0 : *(biases_ptr++); - buffer_ptr += BiasSize; - - // Copy weights - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelColumns; j++) - { - *(reinterpret_cast<TWeight *>(buffer_ptr)) = weights_ptr[i*weight_row_stride + j*weight_col_stride]; - buffer_ptr += WeightSize; - } - } - weights_ptr++; - } -} - -template struct PackParameters<3, 3, 2ul, 2ul>; -template struct PackParameters<3, 3, 4ul, 4ul>; -template struct PackParameters<5, 5, 2ul, 2ul>; -template struct PackParameters<5, 5, 4ul, 4ul>; -} // namespace diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp deleted file mode 100644 index b09f620475..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "impl_qa8_qa8.hpp" - -namespace depthwise -{ -template class QAsymm8DepthwiseConvolution<2, 2, 3, 3, 1, 1>; -template class QAsymm8DepthwiseConvolution<2, 2, 3, 3, 2, 2>; -template class QAsymm8DepthwiseConvolution<2, 2, 5, 5, 1, 1>; -template class QAsymm8DepthwiseConvolution<2, 2, 5, 5, 2, 2>; -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp deleted file mode 100644 index 1ae48b9417..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "impl_qa8_qs8_per_channel.hpp" - -namespace depthwise { -template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 1, 1>; -template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 2, 2>; -template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 1, 1>; -template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 2, 2>; -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp deleted file mode 100644 index 4343f6ad45..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once -#include "depthwise.hpp" -#include "qasymm8.hpp" -#include "qsymm8.hpp" -#pragma once - -using namespace neon_convolution_kernels; -using namespace qasymm8; - -inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32x4_t& b) -{ - return vqrdmulhq_s32(a, b); -} - -inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b) -{ - return vqrdmulhq_n_s32(a, b); -} - -inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b) -{ - return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0); -} - -inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int32x4_t shift) -{ - const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31); - const int32x4_t fixed = vqaddq_s32(x, fixup); - return vrshlq_s32(fixed, shift); -} - -inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent) -{ - const int32x4_t shift = vdupq_n_s32(-exponent); - const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31); - const int32x4_t fixed = vqaddq_s32(x, fixup); - return vrshlq_s32(fixed, shift); -} - -inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent) -{ - const int32x2_t shift = vdup_n_s32(-exponent); - const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31); - const int32x2_t fixed = vqadd_s32(x, fixup); - return vrshl_s32(fixed, shift); -} - -inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent) -{ - const int32x2_t xs = vdup_n_s32(x); - return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0); -} - -namespace depthwise -{ - -namespace nck = neon_convolution_kernels; - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - uint8_t, int32_t, uint8_t, - QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols> -> -{ - using Base = DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - uint8_t, int32_t, uint8_t, - QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols> - >; - friend Base; - using InputType = typename Base::InputType; - using OutputType = typename Base::OutputType; - - public: - QAsymm8DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - nck::ActivationFunction activation, - const qasymm8::QAsymm8Params& weight_quantisation, - const qasymm8::QAsymm8Params& input_quantisation, - const qasymm8::QAsymm8Params& output_quantisation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - QAsymm8DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - const qasymm8::QAsymm8Params& weight_quantisation, - const qasymm8::QAsymm8Params& input_quantisation, - const qasymm8::QAsymm8Params& output_quantisation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - QAsymm8DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - nck::ActivationFunction activation, - const qasymm8::QAsymm8Params& weight_quantisation, - const qasymm8::QAsymm8Params& input_quantisation, - const qasymm8::QAsymm8Params& output_quantisation, - const qasymm8::QAsymm8RescaleParams& rescale_parameters, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - QAsymm8DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - const qasymm8::QAsymm8Params& weight_quantisation, - const qasymm8::QAsymm8Params& input_quantisation, - const qasymm8::QAsymm8Params& output_quantisation, - const qasymm8::QAsymm8RescaleParams& rescale_parameters, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - protected: - uint8_t _input_padding_value(void) const; - - void _pack_params( - void *buffer, - const void *weights, - unsigned int weight_row_stride, - unsigned int weight_col_stride, - const void *biases=nullptr - ) const; - - template <nck::ActivationFunction Activation> - void execute_tile( - int n_channels, - const void* packed_params, - const uint8_t* inptr, - unsigned int in_row_stride, - unsigned int in_col_stride, - uint8_t* outptr, - unsigned int out_row_stride, - unsigned int out_col_stride - ); - - template <nck::ActivationFunction Activation> - void execute_tile( - int n_channels, - const void* packed_params, - const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols] - ); - - private: - // Quantization parameters - const qasymm8::QAsymm8Params _weights_quant, _inputs_quant, _output_quant; - const qasymm8::QAsymm8RescaleParams rescale_parameters; -}; - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -class QSymm8HybridPerChannelDepthwiseConvolution : public DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - uint8_t, int32_t, uint8_t, - QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols> -> -{ - using Base = DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - uint8_t, int32_t, uint8_t, - QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols> - >; - friend Base; - using InputType = typename Base::InputType; - using OutputType = typename Base::OutputType; - - public: - QSymm8HybridPerChannelDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - nck::ActivationFunction activation, - const qsymm8::QSymm8PerChannelParams& weight_quantisation, - const qasymm8::QAsymm8Params& input_quantisation, - const qasymm8::QAsymm8Params& output_quantisation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - QSymm8HybridPerChannelDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - nck::ActivationFunction activation, - const qsymm8::QSymm8PerChannelParams& weight_quantisation, - const qasymm8::QAsymm8Params& input_quantisation, - const qasymm8::QAsymm8Params& output_quantisation, - const qsymm8::QSymm8PerChannelRescaleParams& rescale_parameters, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - size_t get_packed_params_size(void) const override - { - return this->n_channels() * (sizeof(int8_t)*KernelRows*KernelCols + 3*sizeof(int32_t)); - - } - - protected: - uint8_t _input_padding_value(void) const; - - void _pack_params( - void *buffer, - const void *weights, - unsigned int weight_row_stride, - unsigned int weight_col_stride, - const void *biases=nullptr - ) const; - - template <nck::ActivationFunction Activation> - void execute_tile( - int n_channels, - const void* packed_params, - const uint8_t* inptr, - unsigned int in_row_stride, - unsigned int in_col_stride, - uint8_t* outptr, - unsigned int out_row_stride, - unsigned int out_col_stride - ); - - template <nck::ActivationFunction Activation> - void execute_tile( - int n_channels, - const void* packed_params, - const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols] - ); - - private: - // Quantization parameters - const qsymm8::QSymm8PerChannelParams _weights_quant; - const qasymm8::QAsymm8Params _input_quant, _output_quant; - const qsymm8::QSymm8PerChannelRescaleParams _rescale_parameters; -}; - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp deleted file mode 100644 index a11b0981c9..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once -#include "depthwise_dilated.hpp" -#include "depthwise_quantized.hpp" - -namespace depthwise { - -template <unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols> -class QAsymm8DilatedDepthwiseConvolution - : public DilatedDepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, - StrideCols, uint8_t, int32_t, uint8_t> { -public: - /** Create a new dilated depthwise convolution engine. - */ - QAsymm8DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, nck::ActivationFunction activation, - const qasymm8::QAsymm8Params &weight_quantisation, - const qasymm8::QAsymm8Params &input_quantisation, - const qasymm8::QAsymm8Params &output_quantisation, - unsigned int padding_top, unsigned int padding_left, - unsigned int padding_bottom, unsigned int padding_right); - - /** Create a new dilated depthwise convolution engine. - */ - QAsymm8DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - const qasymm8::QAsymm8Params &weight_quantisation, - const qasymm8::QAsymm8Params &input_quantisation, - const qasymm8::QAsymm8Params &output_quantisation, - unsigned int padding_top, unsigned int padding_left, - unsigned int padding_bottom, unsigned int padding_right); - - /** Create a new dilated depthwise convolution engine. - */ - QAsymm8DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, nck::ActivationFunction activation, - const qasymm8::QAsymm8Params &weight_quantisation, - const qasymm8::QAsymm8Params &input_quantisation, - const qasymm8::QAsymm8Params &output_quantisation, - const qasymm8::QAsymm8RescaleParams &rescale_parameters, - unsigned int padding_top, unsigned int padding_left, - unsigned int padding_bottom, unsigned int padding_right); - - /** Create a new dilated depthwise convolution engine. - */ - QAsymm8DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - const qasymm8::QAsymm8Params &weight_quantisation, - const qasymm8::QAsymm8Params &input_quantisation, - const qasymm8::QAsymm8Params &output_quantisation, - const qasymm8::QAsymm8RescaleParams& rescale_parameters, - unsigned int padding_top, unsigned int padding_left, - unsigned int padding_bottom, unsigned int padding_right); -}; - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp deleted file mode 100644 index 266d13d6fc..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp +++ /dev/null @@ -1,505 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - * - * NOTE: Header to be included by implementation files only. - * - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - */ - -#include <algorithm> -#include <cstdint> -#include "depthwise.hpp" -#include "padding.hpp" -#include "utils.hpp" - -#pragma once - -#define MEMBERFN(TOUT) template <\ - unsigned int OutputTileRows, unsigned int OutputTileColumns,\ - unsigned int KernelRows, unsigned int KernelColumns,\ - unsigned int StrideRows, unsigned int StrideColumns,\ - typename TIn, typename TBias, typename TOut,\ - typename Derived\ -> TOUT DepthwiseConvolutionBase<\ - OutputTileRows, OutputTileColumns,\ - KernelRows, KernelColumns,\ - StrideRows, StrideColumns,\ - TIn, TBias, TOut, Derived\ -> - -using namespace neon_convolution_kernels; - -namespace depthwise -{ - -template <unsigned int KernelRows, unsigned int KernelColumns, size_t WeightSize, size_t BiasSize> -struct PackParameters -{ - static void execute( - unsigned int n_channels, - void *buffer, - const void *weights, - unsigned int weight_row_stride, - unsigned int weight_col_stride, - const void *biases - ); -}; - -const unsigned int CHANNEL_BLOCK = 16; - -MEMBERFN(int)::get_output_size( - const int dim_size, const unsigned int padding_before, const unsigned int padding_after -) -{ - return iceildiv(dim_size + padding_before + padding_after - KernelRows + 1, StrideRows); -} - -MEMBERFN(int)::output_size( - const int dim_size, const unsigned int padding_before, const unsigned int padding_after -) const -{ - return get_output_size(dim_size, padding_before, padding_after); -} - -MEMBERFN()::DepthwiseConvolutionBase( - const int n_batches, - const int n_input_rows, - const int n_input_cols, - const int n_channels, - ActivationFunction activation, - const unsigned int padding_top, - const unsigned int padding_left, - const unsigned int padding_bottom, - const unsigned int padding_right -) : DepthwiseConvolutionBase( - n_batches, n_input_rows, n_input_cols, n_channels, - get_output_size(n_input_rows, padding_top, padding_bottom), - get_output_size(n_input_cols, padding_left, padding_right), - activation, - padding_top, padding_left, padding_bottom, padding_right - ) -{ -} - -MEMBERFN()::DepthwiseConvolutionBase( - const int n_batches, - const int n_input_rows, - const int n_input_cols, - const int n_channels, - const int n_output_rows, - const int n_output_cols, - ActivationFunction activation, - const unsigned int padding_top, - const unsigned int padding_left, - const unsigned int padding_bottom, - const unsigned int padding_right -) : _input(nullptr), _output(nullptr), - _packed_parameters(nullptr), - _working_space(nullptr), - _n_batches(n_batches), - _n_input_rows(n_input_rows), - _n_input_cols(n_input_cols), - _n_channels(n_channels), - _n_output_rows(n_output_rows), - _n_output_cols(n_output_cols), - _n_tile_rows(iceildiv(_n_output_rows, output_tile_rows)), - _n_tile_cols(iceildiv(_n_output_cols, output_tile_cols)), - _padding_top(padding_top), - _padding_left(padding_left), - _padding_bottom(padding_bottom), - _padding_right(padding_right), - _activation(activation), - _input_col_stride(0), _input_row_stride(0), _input_batch_stride(0), - _output_col_stride(0), _output_row_stride(0), _output_batch_stride(0) -{ -} - -MEMBERFN(void)::set_input(const void* const inptr) -{ - set_input(inptr, _n_channels); -} - -MEMBERFN(void)::set_input(const void* const inptr, const int ld_col) -{ - set_input(inptr, _n_input_cols * ld_col, ld_col); -} - -MEMBERFN(void)::set_input(const void* const inptr, const int ld_row, const int ld_col) -{ - set_input(inptr, _n_input_rows * ld_row, ld_row, ld_col); -} - -MEMBERFN(void)::set_input(const void* const inptr, const int ld_batch, const int ld_row, const int ld_col) -{ - _input = static_cast<const TIn *>(inptr); - _input_batch_stride = ld_batch; - _input_row_stride = ld_row; - _input_col_stride = ld_col; -} - -MEMBERFN(void)::set_output(void* const outptr) -{ - set_output(outptr, _n_channels); -} - -MEMBERFN(void)::set_output(void* const outptr, const int ld_col) -{ - set_output(outptr, _n_output_cols * ld_col, ld_col); -} - -MEMBERFN(void)::set_output(void* const outptr, const int ld_row, const int ld_col) -{ - set_output(outptr, _n_output_rows * ld_row, ld_row, ld_col); -} - -MEMBERFN(void)::set_output(void* const outptr, const int ld_batch, const int ld_row, const int ld_col) -{ - _output = static_cast<TOut *>(outptr); - _output_batch_stride = ld_batch; - _output_row_stride = ld_row; - _output_col_stride = ld_col; -} - -MEMBERFN(size_t)::get_packed_params_size(void) const -{ - return _n_channels * (sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias)); -} - -MEMBERFN(void)::set_packed_params_buffer(void *buffer) -{ - _packed_parameters = buffer; -} - -MEMBERFN(void)::pack_params(const void *weights, const void *biases) const -{ - static_cast<const Derived *>(this)->pack_params(_packed_parameters, weights, biases); -} - -MEMBERFN(void)::pack_params(void *buffer, const void *weights, const void *biases) const -{ - const unsigned int weight_col_stride = _n_channels; - const unsigned int weight_row_stride = KernelColumns * weight_col_stride; - static_cast<const Derived *>(this)->pack_params( - buffer, weights, weight_row_stride, weight_col_stride, biases - ); -} - -MEMBERFN(void)::pack_params( - void * const buffer, - const void * const weights, - const unsigned int weight_row_stride, - const unsigned int weight_col_stride, - const void * const biases -) const -{ - static_cast<const Derived *>(this)->_pack_params( - buffer, weights, weight_row_stride, weight_col_stride, biases - ); -} - -MEMBERFN(void)::_pack_params( - void * const buffer, - const void * const weights, - const unsigned int weight_row_stride, - const unsigned int weight_col_stride, - const void * const biases -) const -{ - // Default implementation - PackParameters<KernelRows, KernelColumns, sizeof(TIn), sizeof(TOut)>::execute( - _n_channels, buffer, weights, weight_row_stride, weight_col_stride, biases - ); -} - -MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const -{ - return nthreads * ( - _get_input_working_space_size() + _get_output_working_space_size() - ); -} - -MEMBERFN(void)::set_working_space(void *buffer) -{ - _working_space = buffer; -} - -MEMBERFN(size_t)::_get_input_working_space_size(void) const -{ - return sizeof(TIn) * _n_channels; -} - -MEMBERFN(size_t)::_get_output_working_space_size(void) const -{ - return sizeof(TOut) * _n_channels; -} - -MEMBERFN(void *)::_get_input_working_space(const unsigned int threadid) const -{ - return static_cast<uint8_t*>(_working_space) + threadid * ( - _get_input_working_space_size() + _get_output_working_space_size() - ); -} - -MEMBERFN(void *)::_get_output_working_space(const unsigned int threadid) const -{ - return static_cast<uint8_t*>(_get_input_working_space(threadid)) + _get_input_working_space_size(); -} - -MEMBERFN(unsigned int)::get_window() const -{ - // Parallelise over blocks of channels. - return iceildiv(_n_channels, CHANNEL_BLOCK); -} - -MEMBERFN(void)::run( - const unsigned int start, - const unsigned int stop, - const unsigned int threadid -) -{ - // Clear the input padding buffer - TIn *buf = static_cast<TIn *>(_get_input_working_space(threadid)); - const TIn pad_value = static_cast<Derived *>(this)->_input_padding_value(); - for (int n = 0; n < _n_channels; n++) - { - buf[n] = pad_value; - } - - // Parallelise over blocks of channels - const auto start_channel = CHANNEL_BLOCK * start; - const auto stop_channel = std::min<unsigned int>(_n_channels, CHANNEL_BLOCK * stop); - const auto params_size_per_channel = this->get_packed_params_size()/_n_channels; - - // Compute top and bottom padding for input and output - const int input_pad_top = _padding_top; - const int input_pad_left = _padding_left; - constexpr int tile_overlap = kernel_rows - stride_rows; - - // Perform the convolution by calling `process_tile_row` for each tile row in - // each batch. - for (int batch = 0; batch < _n_batches; batch++) - { - const TIn* const inptr_batch = _input + batch*_input_batch_stride; - TOut* const outptr_batch = _output + batch*_output_batch_stride; - - // Loop over rows of tiles - for (int tile_i = 0; tile_i < _n_tile_rows; tile_i++) - { - // Pointer to the row - const int input_row_offset = (tile_i == 0) ? 0 : input_pad_top; - const TIn* const inptr_row = (inptr_batch + ((inner_tile_rows - tile_overlap)*tile_i - input_row_offset)*_input_row_stride); - TOut* const outptr_row = outptr_batch + output_tile_rows * tile_i * _output_row_stride; - - // Input padding (top + bottom) for the row - const int input_row_top = tile_i*(inner_tile_rows - tile_overlap) - input_pad_top; - const int input_row_bottom = input_row_top + inner_tile_rows; - const int input_row_pad_top = (tile_i == 0) ? input_pad_top : 0; - const int input_row_pad_bottom = std::max(0, input_row_bottom - _n_input_rows); - - // Output padding (bottom) for the row - const int output_row_bottom = (tile_i + 1)*output_tile_rows; - const int output_row_pad_bottom = std::max(0, output_row_bottom - _n_output_rows); - - // Get the offset into the packed parameters - const auto params_ptr = static_cast<const uint8_t*>(_packed_parameters) + - start_channel*params_size_per_channel; - - // Process the row - process_tile_row( - threadid, - stop_channel - start_channel, - params_ptr, - inptr_row + start_channel, - outptr_row + start_channel, - input_row_pad_top, input_pad_left, input_row_pad_bottom, - output_row_pad_bottom, - _n_tile_cols, _n_input_cols, _n_output_cols - ); - } - } -} - -MEMBERFN(void)::process_tile_row( - const unsigned int threadid, - const int n_channels, - const void* const packed_params, - const TIn* const inptr, - TOut* const outptr, - const int row_pad_in_top, - const int row_pad_in_left, - const int row_pad_in_bottom, - const int row_pad_out_bottom, - const int n_tiles, - const int n_input_cols, - const int n_output_cols -) -{ - constexpr int tile_overlap = kernel_cols - stride_cols; - - // Loop over columns of tiles - for (int tile_j = 0; tile_j < n_tiles; tile_j++) - { - // Input padding (left + right) for the tile - const int t_pad_in_left = (tile_j == 0) ? row_pad_in_left : 0; - const int t_in_start = tile_j*(inner_tile_cols - tile_overlap) - row_pad_in_left; - const int t_in_end = t_in_start + inner_tile_cols; - const int t_pad_in_right = std::max(0, t_in_end - n_input_cols); - - // Output padding (right) for the tile - const int t_out_end = (tile_j + 1) * output_tile_cols; - const int t_pad_out_right = std::max(0, t_out_end - n_output_cols); - - // Get pointers into the inputs and outputs - const int col_offset = (tile_j == 0) ? 0 : row_pad_in_left; - const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*_input_col_stride); - TOut* const outptr_col = outptr + tile_j * output_tile_cols * _output_col_stride; - - // Process just this tile - process_tile( - threadid, n_channels, packed_params, inptr_col, outptr_col, - row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right, // Input paddings - row_pad_out_bottom, t_pad_out_right // Output paddings - ); - } -} - -MEMBERFN(TIn)::_input_padding_value(void) const -{ - return static_cast<TIn>(0); -} - -MEMBERFN(void)::process_tile( - const unsigned int threadid, - const int n_channels, - const void* const packed_params, - const TIn* const inptr, - TOut* const outptr, - const int pad_in_top, - const int pad_in_left, - const int pad_in_bottom, - const int pad_in_right, - const int pad_out_bottom, - const int pad_out_right -) -{ - Derived * dthis = static_cast<Derived *>(this); - const bool pad_input = pad_in_top || pad_in_left || pad_in_bottom || pad_in_right; - const bool pad_output = pad_out_bottom || pad_out_right; - - if (!pad_input && !pad_output) - { - switch(_activation) - { - case ActivationFunction::ReLU: - dthis->template execute_tile<ActivationFunction::ReLU>( - n_channels, packed_params, - inptr, _input_row_stride, _input_col_stride, - outptr, _output_row_stride, _output_col_stride - ); - break; - case ActivationFunction::ReLU6: - dthis->template execute_tile<ActivationFunction::ReLU6>( - n_channels, packed_params, - inptr, _input_row_stride, _input_col_stride, - outptr, _output_row_stride, _output_col_stride - ); - break; - default: - dthis->template execute_tile<ActivationFunction::None>( - n_channels, packed_params, - inptr, _input_row_stride, _input_col_stride, - outptr, _output_row_stride, _output_col_stride - ); - break; - } - } - else - { - // Create arrays of input and output pointers, pointing padded elements to - // the working space padding buffers provided. - const TIn *inptrs[inner_tile_rows][inner_tile_cols]; - for (int i = 0; i < inner_tile_rows; i++) - { - for (int j = 0; j < inner_tile_cols; j++) - { - if (i < pad_in_top || (inner_tile_rows - pad_in_bottom) <= i || - j < pad_in_left || (inner_tile_cols - pad_in_right) <= j) - { - // Padded input - inptrs[i][j] = static_cast<const TIn *>(_get_input_working_space(threadid)); - } - else - { - inptrs[i][j] = inptr + (i - pad_in_top)*_input_row_stride + (j - pad_in_left)*_input_col_stride; - } - } - } - - TOut *outptrs[output_tile_rows][output_tile_cols]; - for (int i = 0; i < output_tile_rows; i++) - { - for (int j = 0; j < output_tile_cols; j++) - { - if (i < (output_tile_rows - pad_out_bottom) && - j < (output_tile_cols - pad_out_right)) - { - outptrs[i][j] = outptr + i*_output_row_stride + j*_output_col_stride; - } - else - { - outptrs[i][j] = static_cast<TOut *>(_get_output_working_space(threadid)); - } - } - } - - switch(_activation) - { - case ActivationFunction::ReLU: - dthis->template execute_tile<ActivationFunction::ReLU>( - n_channels, packed_params, inptrs, outptrs - ); - break; - case ActivationFunction::ReLU6: - dthis->template execute_tile<ActivationFunction::ReLU6>( - n_channels, packed_params, inptrs, outptrs - ); - break; - default: - dthis->template execute_tile<ActivationFunction::None>( - n_channels, packed_params, inptrs, outptrs - ); - break; - } - } -} - -MEMBERFN(int)::n_channels(void) const -{ - return _n_channels; -} - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp deleted file mode 100644 index 4130188187..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "depthwise_dilated.hpp" -#include "utils.hpp" - -#define MEMBERFN(TOUT) \ - template <unsigned int OutputTileRows, unsigned int OutputTileColumns, \ - unsigned int KernelRows, unsigned int KernelColumns, \ - unsigned int StrideRows, unsigned int StrideColumns, typename TIn, \ - typename TBias, typename TOut> \ - TOUT DilatedDepthwiseConvolution<OutputTileRows, OutputTileColumns, \ - KernelRows, KernelColumns, StrideRows, \ - StrideColumns, TIn, TBias, TOut> - -namespace depthwise { - -MEMBERFN() -::DilatedDepthwiseConvolution(const int n_batches, const int n_input_rows, - const int n_input_cols, const int n_channels, - const int dilation_factor, - nck::ActivationFunction activation, - const unsigned int padding_top, - const unsigned int padding_left, - const unsigned int padding_bottom, - const unsigned int padding_right) - : DilatedDepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor, - DilatedDepthwiseConvolution::get_output_size( - n_input_rows, padding_top, padding_bottom, dilation_factor), - DilatedDepthwiseConvolution::get_output_size( - n_input_cols, padding_left, padding_right, dilation_factor), - activation, padding_top, padding_left, padding_bottom, - padding_right) {} - -MEMBERFN() -::DilatedDepthwiseConvolution(const int n_batches, const int n_input_rows, - const int n_input_cols, const int n_channels, - const int dilation_factor, - const int n_output_rows, const int n_output_cols, - nck::ActivationFunction activation, - const unsigned int padding_top, - const unsigned int padding_left, - const unsigned int, // padding_bottom - const unsigned int // padding_right - ) - : DilatedDepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor, - n_output_rows, n_output_cols, activation, padding_top, padding_left, - 0, 0, - // Function which creates a new (standard) depthwise convolution - [](const int n_batches, const int n_input_rows, - const int n_input_cols, const int n_channels, - const int n_output_rows, const int n_output_cols, - const nck::ActivationFunction activation, - const unsigned int padding_top, const unsigned int padding_left, - const unsigned int padding_bottom, - const unsigned int padding_right) -> IDepthwiseConvolution * { - return new DepthwiseConvolution< - OutputTileRows, OutputTileColumns, KernelRows, KernelColumns, - StrideRows, StrideColumns, TIn, TBias, TOut>( - n_batches, n_input_rows, n_input_cols, n_channels, - n_output_rows, n_output_cols, activation, padding_top, - padding_left, padding_bottom, padding_right); - }) {} - -MEMBERFN() -::DilatedDepthwiseConvolution( - const int n_batches, const int n_input_rows, const int n_input_cols, - const int n_channels, const int dilation_factor, const int n_output_rows, - const int n_output_cols, nck::ActivationFunction activation, - const unsigned int padding_top, const unsigned int padding_left, - const unsigned int, // padding_bottom - const unsigned int, // padding_right - std::function<IDepthwiseConvolution *( - int, int, int, int, int, int, nck::ActivationFunction, unsigned int, - unsigned int, unsigned int, unsigned int)> - subconvfn // Function to create a new convolution - ) - : _dilation_factor(dilation_factor), _n_input_rows(n_input_rows), - _n_input_cols(n_input_cols), _n_channels(n_channels), - _padding_top(static_cast<int>(padding_top)), - _padding_left(static_cast<int>(padding_left)), - _n_output_rows(n_output_rows), _n_output_cols(n_output_cols), - _convs(_dilation_factor) { - // Instantiate the base convolutions - for (uint32_t i = 0; i < static_cast<uint32_t>(_dilation_factor); i++) { - // Compute properties of this row of base convolutions - const int row_top = - i * StrideRows - _padding_top; // -ve values are in the padding - const int row_pad_top = - row_top < 0 ? iceildiv(-row_top, dilation_factor) : 0; - - const int _n_input_rows = iceildiv(n_input_rows - i, dilation_factor); - const int _n_output_rows = iceildiv(n_output_rows - i, dilation_factor); - - for (uint32_t j = 0; j < static_cast<uint32_t>(_dilation_factor); j++) { - // Compute properties of the base convolution - const int col_left = - j * StrideColumns - padding_left; // -ve values are in the padding - const int col_pad_left = - col_left < 0 ? iceildiv(-col_left, dilation_factor) : 0; - - const int _n_input_cols = iceildiv(n_input_cols - j, dilation_factor); - const int _n_output_cols = iceildiv(n_output_cols - j, dilation_factor); - - // Create new depthwise convolution engine and include it in the vector - // of engines. The new depthwise convolution engine is created by calling - // the delegate function we received as an argument. - _convs[i].emplace_back(subconvfn( - n_batches, _n_input_rows, _n_input_cols, n_channels, _n_output_rows, - _n_output_cols, activation, - // Note: since we have computed the output tensor size we don't need - // to explicitly provide bottom and right padding values to the - // depthwise convolution. - row_pad_top, col_pad_left, 0, 0)); - } - } -} - -MEMBERFN(void)::set_input(const void *const inptr) { - set_input(inptr, _n_channels); -} - -MEMBERFN(void)::set_input(const void *const inptr, const int ldcol) { - set_input(inptr, _n_input_cols * ldcol, ldcol); -} - -MEMBERFN(void) -::set_input(const void *const inptr, const int ldrow, const int ldcol) { - set_input(inptr, _n_input_rows * ldrow, ldrow, ldcol); -} - -MEMBERFN(void) -::set_input(const void *const inptr, const int ldbatch, const int ldrow, - const int ldcol) { - // Compute dilated strides - const int ldrow_dilated = ldrow * _dilation_factor; - const int ldcol_dilated = ldcol * _dilation_factor; - - // Pass input parameters on to base convolutions - for (uint32_t i = 0; i < static_cast<uint32_t>(_dilation_factor); i++) { - const int top_pos = - i * StrideRows - _padding_top + - ((static_cast<int>(i * StrideRows) < _padding_top) - ? iceildiv(_padding_top - i * StrideRows, _dilation_factor) * - _dilation_factor - : 0); - const TIn *const inptr_i = - static_cast<const TIn *>(inptr) + top_pos * ldrow; - - for (uint32_t j = 0; j < static_cast<uint32_t>(_dilation_factor); j++) { - int left_pos = j * StrideColumns - _padding_left; - while (left_pos < 0) - left_pos += _dilation_factor; - - // Modify the pointer to point to the first element of the dilated input - // tensor, then set the input for this convolution engine. - const void *const inptr_ij = inptr_i + left_pos * ldcol; - _convs[i][j]->set_input(inptr_ij, ldbatch, ldrow_dilated, ldcol_dilated); - } - } -} - -MEMBERFN(void)::set_output(void *const outptr) { - set_output(outptr, _n_channels); -} - -MEMBERFN(void)::set_output(void *const outptr, const int ldcol) { - set_output(outptr, _n_output_cols * ldcol, ldcol); -} - -MEMBERFN(void) -::set_output(void *const outptr, const int ldrow, const int ldcol) { - set_output(outptr, _n_output_rows * ldrow, ldrow, ldcol); -} - -MEMBERFN(void) -::set_output(void *const outptr, const int ldbatch, const int ldrow, - const int ldcol) { - // Compute dilated strides - const int ldrow_dilated = ldrow * _dilation_factor; - const int ldcol_dilated = ldcol * _dilation_factor; - - // Pass input parameters on to base convolutions - for (uint32_t i = 0; i < static_cast<uint32_t>(_dilation_factor); i++) { - for (uint32_t j = 0; j < static_cast<uint32_t>(_dilation_factor); j++) { - // Modify the pointer to point to the first element of the dilated input - // tensor, then set the input for this convolution engine. - void *const outptr_ij = - static_cast<TOut *>(outptr) + i * ldrow + j * ldcol; - _convs[i][j]->set_output(outptr_ij, ldbatch, ldrow_dilated, - ldcol_dilated); - } - } -} - -MEMBERFN(int) -::get_output_size(const int dim_size, const unsigned int padding_before, - const unsigned int padding_after, const int dilation_factor) { - const int input_size = - dim_size + static_cast<int>(padding_before + padding_after); - const int window_size = (KernelRows - 1) * dilation_factor + 1; - return iceildiv(input_size - window_size + 1, StrideRows); -} - -MEMBERFN(int) -::output_size(const int dim_size, const unsigned int padding_before, - const unsigned int padding_after) const { - return get_output_size(dim_size, padding_before, padding_after, - _dilation_factor); -} - -MEMBERFN(size_t)::get_packed_params_size(void) const { - return _convs[0][0]->get_packed_params_size(); -} - -MEMBERFN(void)::set_packed_params_buffer(void *buffer) { - // Set the buffer for all convolution engines - for (auto &&row : _convs) { - for (auto &&conv : row) { - conv->set_packed_params_buffer(buffer); - } - } -} - -MEMBERFN(void) -::pack_params(const void *const weights, const void *const biases) const { - _convs[0][0]->pack_params(weights, biases); -} - -MEMBERFN(void) -::pack_params(void *const buffer, const void *const weights, - const void *const biases) const { - _convs[0][0]->pack_params(buffer, weights, biases); -} - -MEMBERFN(void) -::pack_params(void *const buffer, const void *const weights, - const unsigned int ldrow, const unsigned int ldcol, - const void *const biases) const { - _convs[0][0]->pack_params(buffer, weights, ldrow, ldcol, biases); -} - -MEMBERFN(size_t)::get_working_space_size(unsigned int nthreads) const { - return _convs[0][0]->get_working_space_size(nthreads); -} - -MEMBERFN(void)::set_working_space(void *const ws) { - // Use the same working space set for all contained depthwise engines. - for (auto &&row : _convs) { - for (auto &&conv : row) { - conv->set_working_space(ws); - } - } -} - -MEMBERFN(unsigned int)::get_window(void) const { - return _convs[0][0]->get_window(); -} - -MEMBERFN(void) -::run(const unsigned int start, const unsigned int stop, - const unsigned int threadid) { - // Run each contained convolution in turn - for (auto &&row : _convs) { - for (auto &&conv : row) { - conv->run(start, stop, threadid); - } - } -} - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp deleted file mode 100644 index a00a1ef04a..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp +++ /dev/null @@ -1,439 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - * - * NOTE: Header to be included by implementation files only. - * - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#include "arm.hpp" -#include "impl_base.hpp" - -#pragma once - -using namespace neon_convolution_kernels; - -namespace depthwise -{ - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, StrideRows, StrideCols, - float16_t, float16_t, float16_t ->::DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right -) : Base( - n_batches, n_input_rows, n_input_cols, n_channels, activation, - padding_top, padding_left, padding_bottom, padding_right - ) -{ -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, StrideRows, StrideCols, - float16_t, float16_t, float16_t ->::DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int n_output_rows, int n_output_cols, - ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right -) : Base( - n_batches, n_input_rows, n_input_cols, n_channels, - n_output_rows, n_output_cols, activation, - padding_top, padding_left, padding_bottom, padding_right - ) -{ -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -template <ActivationFunction Activation> -void DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, StrideRows, StrideCols, - float16_t, float16_t, float16_t ->::execute_tile( - int n_channels, - const void *weights_biases_ptr, - const float16_t *input, - const unsigned int in_row_stride, - const unsigned int in_col_stride, - float16_t *output, - const unsigned int out_row_stride, - const unsigned int out_col_stride -) -{ - // Instantiate pointers - const float16_t* __restrict__ inptr_base = input; - float16_t* __restrict__ outptr_base = output; - const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr); - - // Perform the depthwise convolution - int channels_remaining = n_channels; - for (; channels_remaining >= 8; channels_remaining -= 8) - { - // Load input tile - float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols]; - for (int i = 0; i < Base::inner_tile_rows; i++) - { - const float16_t* const inptr_row = inptr_base + i*in_row_stride; - for (int j = 0; j < Base::inner_tile_cols; j++) - { - u[i][j] = vld1q_f16(inptr_row + j*in_col_stride); - } - } - inptr_base += 8; - - // Load weights tile - float16x8_t vbias = vld1q_f16(params); - params += 8; - - float16x8_t w[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - w[i][j] = vld1q_f16(params); - params += 8; - } - } - - // Perform the convolution - float16x8_t v[OutputTileRows][OutputTileCols]; - for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) - { - for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) - { - v[out_i][out_j] = vbias; - - // Base co-ordinate - const int base_i = out_i * StrideRows; - const int base_j = out_j * StrideCols; - - // Fill the accumulator - for (unsigned int in_i = 0; in_i < KernelRows; in_i++) - { - const unsigned int i = base_i + in_i; - for (unsigned int in_j = 0; in_j < KernelCols; in_j++) - { - const unsigned int j = base_j + in_j; - - // v[out_i][out_j] += w[in_i][in_j] * u[i][j]; - v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j])); - } - } - - // Apply the activation function - if (Activation == ActivationFunction::ReLU || - Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f)); - } - if (Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f)); - } - } - } - - // Store the output tile - for (unsigned int i = 0; i < OutputTileRows; i++) - { - float16_t* const outptr_row = outptr_base + i*out_row_stride; - for (unsigned int j = 0; j < OutputTileCols; j++) - { - vst1q_f16(outptr_row + j*out_col_stride, v[i][j]); - } - } - outptr_base += 8; - } - for (; channels_remaining; channels_remaining--) - { - // Load input tile - float16_t u[Base::inner_tile_rows][Base::inner_tile_cols]; - for (int i = 0; i < Base::inner_tile_rows; i++) - { - const float16_t* const inptr_row = inptr_base + i*in_row_stride; - for (int j = 0; j < Base::inner_tile_cols; j++) - { - u[i][j] = *(inptr_row + j*in_col_stride); - } - } - inptr_base++; - - // Load weights tile - float16_t bias = *(params++); - float16_t w[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - w[i][j] = *(params++); - } - } - - // Perform the convolution - float16_t v[OutputTileRows][OutputTileCols]; - for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) - { - for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) - { - // Clear the accumulator - v[out_i][out_j] = bias; - - // Base co-ordinate - const int base_i = out_i * StrideRows; - const int base_j = out_j * StrideCols; - - // Fill the accumulator - for (unsigned int in_i = 0; in_i < KernelRows; in_i++) - { - const unsigned int i = base_i + in_i; - for (unsigned int in_j = 0; in_j < KernelCols; in_j++) - { - const int j = base_j + in_j; - v[out_i][out_j] += w[in_i][in_j] * u[i][j]; - } - } - - // Apply the activation function - if (Activation == ActivationFunction::ReLU || - Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]); - } - if (Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]); - } - } - } - - // Store the output tile - for (unsigned int i = 0; i < OutputTileRows; i++) - { - float16_t* const outptr_row = outptr_base + i*out_row_stride; - for (unsigned int j = 0; j < OutputTileCols; j++) - { - *(outptr_row + j*out_col_stride) = v[i][j]; - } - } - outptr_base++; - } -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -template <ActivationFunction Activation> -void DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, StrideRows, StrideCols, - float16_t, float16_t, float16_t ->::execute_tile( - int n_channels, - const void *weights_biases_ptr, - const float16_t * inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - float16_t *outptrs[Base::output_tile_rows][Base::output_tile_cols] -) -{ - // Instantiate pointers - const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr); - int n = 0; - - // Perform the depthwise convolution - int channels_remaining = n_channels; - for (; channels_remaining >= 8; channels_remaining -= 8, n += 8) - { - // Load input tile - float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols]; - for (int i = 0; i < Base::inner_tile_rows; i++) - { - for (int j = 0; j < Base::inner_tile_cols; j++) - { - u[i][j] = vld1q_f16(inptrs[i][j] + n); - } - } - - // Load weights tile - float16x8_t vbias = vld1q_f16(params); - params += 8; - - float16x8_t w[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - w[i][j] = vld1q_f16(params); - params += 8; - } - } - - // Perform the convolution - float16x8_t v[OutputTileRows][OutputTileCols]; - for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) - { - for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) - { - v[out_i][out_j] = vbias; - - // Base co-ordinate - const int base_i = out_i * StrideRows; - const int base_j = out_j * StrideCols; - - // Fill the accumulator - for (unsigned int in_i = 0; in_i < KernelRows; in_i++) - { - const unsigned int i = base_i + in_i; - for (unsigned int in_j = 0; in_j < KernelCols; in_j++) - { - const unsigned int j = base_j + in_j; - - // v[out_i][out_j] += w[in_i][in_j] * u[i][j]; - v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j])); - } - } - - // Apply the activation function - if (Activation == ActivationFunction::ReLU || - Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f)); - } - if (Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f)); - } - } - } - - // Store the output tile - for (unsigned int i = 0; i < OutputTileRows; i++) - { - for (unsigned int j = 0; j < OutputTileCols; j++) - { - vst1q_f16(outptrs[i][j] + n, v[i][j]); - } - } - } - for (; channels_remaining; channels_remaining--, n++) - { - // Load input tile - float16_t u[Base::inner_tile_rows][Base::inner_tile_cols]; - for (int i = 0; i < Base::inner_tile_rows; i++) - { - for (int j = 0; j < Base::inner_tile_cols; j++) - { - u[i][j] = *(inptrs[i][j] + n); - } - } - - // Load weights tile - float16_t bias = *(params++); - float16_t w[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - w[i][j] = *(params++); - } - } - - // Perform the convolution - float16_t v[OutputTileRows][OutputTileCols]; - for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) - { - for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) - { - // Clear the accumulator - v[out_i][out_j] = bias; - - // Base co-ordinate - const int base_i = out_i * StrideRows; - const int base_j = out_j * StrideCols; - - // Fill the accumulator - for (unsigned int in_i = 0; in_i < KernelRows; in_i++) - { - const unsigned int i = base_i + in_i; - for (unsigned int in_j = 0; in_j < KernelCols; in_j++) - { - const int j = base_j + in_j; - v[out_i][out_j] += w[in_i][in_j] * u[i][j]; - } - } - - // Apply the activation function - if (Activation == ActivationFunction::ReLU || - Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]); - } - if (Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]); - } - } - } - - // Store the output tile - for (unsigned int i = 0; i < OutputTileRows; i++) - { - for (unsigned int j = 0; j < OutputTileCols; j++) - { - *(outptrs[i][j] + n) = v[i][j]; - } - } - } -} - -} // namespace depthwise -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp deleted file mode 100644 index b0d8126a40..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp +++ /dev/null @@ -1,438 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - * - * NOTE: Header to be included by implementation files only. - * - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - */ - -#include "arm.hpp" -#include "impl_base.hpp" - -#pragma once - -using namespace neon_convolution_kernels; - -namespace depthwise -{ - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, StrideRows, StrideCols, - float, float, float ->::DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right -) : Base( - n_batches, n_input_rows, n_input_cols, n_channels, activation, - padding_top, padding_left, padding_bottom, padding_right - ) -{ -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, StrideRows, StrideCols, - float, float, float ->::DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int n_output_rows, int n_output_cols, - ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right -) : Base( - n_batches, n_input_rows, n_input_cols, n_channels, - n_output_rows, n_output_cols, activation, - padding_top, padding_left, padding_bottom, padding_right - ) -{ -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -template <ActivationFunction Activation> -void DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, StrideRows, StrideCols, - float, float, float ->::execute_tile( - int n_channels, - const void *weights_biases_ptr, - const float *input, - const unsigned int in_row_stride, - const unsigned int in_col_stride, - float *output, - const unsigned int out_row_stride, - const unsigned int out_col_stride -) -{ - // Instantiate pointers - const float* __restrict__ inptr_base = input; - float* __restrict__ outptr_base = output; - const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr); - - // Perform the depthwise convolution - int channels_remaining = n_channels; - for (; channels_remaining >= 4; channels_remaining -= 4) - { - // Load input tile - float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols]; - for (int i = 0; i < Base::inner_tile_rows; i++) - { - const float* const inptr_row = inptr_base + i*in_row_stride; - for (int j = 0; j < Base::inner_tile_cols; j++) - { - u[i][j] = vld1q_f32(inptr_row + j*in_col_stride); - } - } - inptr_base += 4; - - // Load weights tile - float32x4_t vbias = vld1q_f32(params); - params += 4; - - float32x4_t w[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - w[i][j] = vld1q_f32(params); - params += 4; - } - } - - // Perform the convolution - float32x4_t v[OutputTileRows][OutputTileCols]; - for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) - { - for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) - { - v[out_i][out_j] = vbias; - - // Base co-ordinate - const int base_i = out_i * StrideRows; - const int base_j = out_j * StrideCols; - - // Fill the accumulator - for (unsigned int in_i = 0; in_i < KernelRows; in_i++) - { - const unsigned int i = base_i + in_i; - for (unsigned int in_j = 0; in_j < KernelCols; in_j++) - { - const unsigned int j = base_j + in_j; - - // v[out_i][out_j] += w[in_i][in_j] * u[i][j]; - v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]); - } - } - - // Apply the activation function - if (Activation == ActivationFunction::ReLU || - Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f)); - } - if (Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f)); - } - } - } - - // Store the output tile - for (unsigned int i = 0; i < OutputTileRows; i++) - { - float* const outptr_row = outptr_base + i*out_row_stride; - for (unsigned int j = 0; j < OutputTileCols; j++) - { - vst1q_f32(outptr_row + j*out_col_stride, v[i][j]); - } - } - outptr_base += 4; - } - for (; channels_remaining; channels_remaining--) - { - // Load input tile - float u[Base::inner_tile_rows][Base::inner_tile_cols]; - for (int i = 0; i < Base::inner_tile_rows; i++) - { - const float* const inptr_row = inptr_base + i*in_row_stride; - for (int j = 0; j < Base::inner_tile_cols; j++) - { - u[i][j] = *(inptr_row + j*in_col_stride); - } - } - inptr_base++; - - // Load weights tile - float bias = *(params++); - float w[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - w[i][j] = *(params++); - } - } - - // Perform the convolution - float v[OutputTileRows][OutputTileCols]; - for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) - { - for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) - { - // Clear the accumulator - v[out_i][out_j] = bias; - - // Base co-ordinate - const int base_i = out_i * StrideRows; - const int base_j = out_j * StrideCols; - - // Fill the accumulator - for (unsigned int in_i = 0; in_i < KernelRows; in_i++) - { - const unsigned int i = base_i + in_i; - for (unsigned int in_j = 0; in_j < KernelCols; in_j++) - { - const int j = base_j + in_j; - v[out_i][out_j] += w[in_i][in_j] * u[i][j]; - } - } - - // Apply the activation function - if (Activation == ActivationFunction::ReLU || - Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]); - } - if (Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]); - } - } - } - - // Store the output tile - for (unsigned int i = 0; i < OutputTileRows; i++) - { - float* const outptr_row = outptr_base + i*out_row_stride; - for (unsigned int j = 0; j < OutputTileCols; j++) - { - *(outptr_row + j*out_col_stride) = v[i][j]; - } - } - outptr_base++; - } -} - - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -template <ActivationFunction Activation> -void DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, StrideRows, StrideCols, - float, float, float ->::execute_tile( - int n_channels, - const void *weights_biases_ptr, - const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - float *outptrs[Base::output_tile_rows][Base::output_tile_cols] -) -{ - const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr); - - // Perform the depthwise convolution - int channels_remaining = n_channels; - int n = 0; - for (; channels_remaining >= 4; channels_remaining -= 4, n += 4) - { - // Load input tile - float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols]; - for (int i = 0; i < Base::inner_tile_rows; i++) - { - for (int j = 0; j < Base::inner_tile_cols; j++) - { - u[i][j] = vld1q_f32(inptrs[i][j] + n); - } - } - - // Load weights tile - float32x4_t vbias = vld1q_f32(params); - params += 4; - - float32x4_t w[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - w[i][j] = vld1q_f32(params); - params += 4; - } - } - - // Perform the convolution - float32x4_t v[OutputTileRows][OutputTileCols]; - for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) - { - for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) - { - v[out_i][out_j] = vbias; - - // Base co-ordinate - const int base_i = out_i * StrideRows; - const int base_j = out_j * StrideCols; - - // Fill the accumulator - for (unsigned int in_i = 0; in_i < KernelRows; in_i++) - { - const unsigned int i = base_i + in_i; - for (unsigned int in_j = 0; in_j < KernelCols; in_j++) - { - const unsigned int j = base_j + in_j; - - // v[out_i][out_j] += w[in_i][in_j] * u[i][j]; - v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]); - } - } - - // Apply the activation function - if (Activation == ActivationFunction::ReLU || - Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f)); - } - if (Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f)); - } - } - } - - // Store the output tile - for (unsigned int i = 0; i < OutputTileRows; i++) - { - for (unsigned int j = 0; j < OutputTileCols; j++) - { - vst1q_f32(outptrs[i][j] + n, v[i][j]); - } - } - } - for (; channels_remaining; channels_remaining--, n++) - { - // Load input tile - float u[Base::inner_tile_rows][Base::inner_tile_cols]; - for (int i = 0; i < Base::inner_tile_rows; i++) - { - for (int j = 0; j < Base::inner_tile_cols; j++) - { - u[i][j] = *(inptrs[i][j] + n); - } - } - - // Load weights tile - float bias = *(params++); - float w[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - w[i][j] = *(params++); - } - } - - // Perform the convolution - float v[OutputTileRows][OutputTileCols]; - for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) - { - for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) - { - // Clear the accumulator - v[out_i][out_j] = bias; - - // Base co-ordinate - const int base_i = out_i * StrideRows; - const int base_j = out_j * StrideCols; - - // Fill the accumulator - for (unsigned int in_i = 0; in_i < KernelRows; in_i++) - { - const unsigned int i = base_i + in_i; - for (unsigned int in_j = 0; in_j < KernelCols; in_j++) - { - const int j = base_j + in_j; - v[out_i][out_j] += w[in_i][in_j] * u[i][j]; - } - } - - // Apply the activation function - if (Activation == ActivationFunction::ReLU || - Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]); - } - if (Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]); - } - } - } - - // Store the output tile - for (unsigned int i = 0; i < OutputTileRows; i++) - { - for (unsigned int j = 0; j < OutputTileCols; j++) - { - *(outptrs[i][j] + n) = v[i][j]; - } - } - } -} - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp deleted file mode 100644 index e8b4c7bc0f..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp +++ /dev/null @@ -1,511 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - * - * NOTE: Header to be included by implementation files only. - * - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - */ - -#include <limits> - -#include "arm.hpp" -#include "impl_base.hpp" -#include "depthwise_quantized.hpp" - -namespace depthwise -{ -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -QAsymm8DepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::QAsymm8DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - const ActivationFunction activation, - const QAsymm8Params& weight_quantisation, - const QAsymm8Params& input_quantisation, - const QAsymm8Params& output_quantisation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right -) : QAsymm8DepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, - activation, weight_quantisation, input_quantisation, output_quantisation, - QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation), - padding_top, padding_left, padding_bottom, padding_right - ) -{ -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -QAsymm8DepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::QAsymm8DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int n_output_rows, int n_output_cols, - const ActivationFunction activation, - const QAsymm8Params& weight_quantisation, - const QAsymm8Params& input_quantisation, - const QAsymm8Params& output_quantisation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right -) : QAsymm8DepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, - n_output_rows, n_output_cols, - activation, weight_quantisation, input_quantisation, output_quantisation, - QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation), - padding_top, padding_left, padding_bottom, padding_right - ) -{ -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -QAsymm8DepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::QAsymm8DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - const ActivationFunction activation, - const QAsymm8Params& weight_quantisation, - const QAsymm8Params& input_quantisation, - const QAsymm8Params& output_quantisation, - const QAsymm8RescaleParams& rescale_params, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right -) : Base( - n_batches, n_input_rows, n_input_cols, n_channels, activation, - padding_top, padding_left, padding_bottom, padding_right - ), - _weights_quant(weight_quantisation), - _inputs_quant(input_quantisation), - _output_quant(output_quantisation), - rescale_parameters(rescale_params) -{ -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -QAsymm8DepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::QAsymm8DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int n_output_rows, int n_output_cols, - const ActivationFunction activation, - const QAsymm8Params& weight_quantisation, - const QAsymm8Params& input_quantisation, - const QAsymm8Params& output_quantisation, - const QAsymm8RescaleParams& rescale_params, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right -) : Base( - n_batches, n_input_rows, n_input_cols, n_channels, - n_output_rows, n_output_cols, activation, - padding_top, padding_left, padding_bottom, padding_right - ), - _weights_quant(weight_quantisation), - _inputs_quant(input_quantisation), - _output_quant(output_quantisation), - rescale_parameters(rescale_params) -{ -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -uint8_t QAsymm8DepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::_input_padding_value(void) const -{ - return _inputs_quant.offset; -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -void QAsymm8DepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::_pack_params( - void * const buffer, - const void * const weights, - const unsigned int weight_row_stride, - const unsigned int weight_col_stride, - const void * const biases -) const -{ - const uint8_t *wptr = static_cast<const uint8_t *>(weights); - const int32_t *bptr = static_cast<const int32_t *>(biases); - uint8_t *outptr = static_cast<uint8_t *>(buffer); - - // We set the vector length to use doubles on both Aarch64 and Aarch32. NOTE - // For SVE set this to half the vector length. - unsigned int veclen = 8; - - // While there are channels left to process, pack a vector length of them at - // a time and reduce the size of vector used as the size of the tensor - // decreases. - for ( - unsigned int n_channels = this->n_channels(); n_channels; - n_channels -= veclen, - outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols) - ) - { - // NOTE Ignore this section if using SVE, the vector length remains the - // same and we just don't fill a full register for the tail. - while (n_channels < veclen) - { - // Reduce the vector length to either 8 or 1 (scalar) - // TODO Support more vector lengths in `execute_tile`. - veclen = (veclen == 16) ? 8 : 1; - } - - // Get pointers to bias and weight portions of the output structure. - int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr); - uint8_t *out_wptr = outptr + veclen*sizeof(int32_t); - - // Copy a vector length of elements - for (unsigned int n = 0; n < veclen && n < n_channels; n++) - { - const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0; - out_bptr[n] = bias; - - for (unsigned int i = 0; i < KernelRows; i++) - { - uint8_t *row_outptr = out_wptr + i*KernelCols*veclen; - for (unsigned int j = 0; j < KernelCols; j++) - { - uint8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride); - row_outptr[j*veclen + n] = w; - } - } - wptr++; - } - } -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols, - typename FInput, typename FOutput -> -static inline void tilefn( - int n_channels, - const void* packed_params, - FInput &get_input_ptr, - FOutput &get_output_ptr, - const int32_t clamp_max, - const int32_t clamp_min, - const uint8_t input_offset, - const uint8_t weight_offset, - const uint8_t output_offset, - const int32_t requant_multiplier, - const int32_t requant_shift -) -{ - constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows; - constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols; - - // Offset into channels - int channel = 0; - - // Byte type pointer to weights and biases - const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params); - - for (; n_channels >= 8; n_channels -= 8, channel += 8) - { - const int32x4_t biases[2] = { - vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)), - vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4), - }; - wbptr += 8*sizeof(int32_t); - - int16x8_t weights[KernelRows][KernelCols]; - const uint8x8_t woffset = vdup_n_u8(weight_offset); - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - const uint8x8_t w = vld1_u8(wbptr); - weights[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(w, woffset)); - wbptr += 8; - } - } - - int16x8_t inputs[InnerTileRows][InnerTileCols]; - const uint8x8_t ioffset = vdup_n_u8(input_offset); - for (unsigned int i = 0; i < InnerTileRows; i++) - { - for (unsigned int j = 0; j < InnerTileCols; j++) - { - const auto x = vld1_u8(get_input_ptr(i, j, channel)); - inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset)); - } - } - - for (unsigned int oi = 0; oi < OutputTileRows; oi++) - { - for (unsigned int oj = 0; oj < OutputTileCols; oj++) - { - int32x4_t acc_a = biases[0], acc_b = biases[1]; - - for (unsigned int wi = 0; wi < KernelRows; wi++) - { - for (unsigned int wj = 0; wj < KernelCols; wj++) - { - const auto w = weights[wi][wj]; - const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj]; -#ifndef __aarch64__ - acc_a = vmlal_s16(acc_a, vget_low_s16(w), vget_low_s16(x)); - acc_b = vmlal_s16(acc_b, vget_high_s16(w), vget_high_s16(x)); -#else - asm("smlal %[acc_a].4s, %[w].4h, %[x].4h\n" - "smlal2 %[acc_b].4s, %[w].8h, %[x].8h\n" - : [acc_a] "+w"(acc_a), [acc_b] "+w"(acc_b) - : [w] "w"(w), [x] "w"(x)); -#endif // __aarch64__ - } - } - - int32x4_t final_accs[2]; - for (unsigned int i = 0; i < 2; i++) - { - const int32x4_t y = rounding_divide_by_exp2( - saturating_doubling_high_mul((i == 0 ? acc_a : acc_b), requant_multiplier), - requant_shift); - const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset)); - final_accs[i] = vaddq_s32(y, offset); - final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min)); - final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max)); - } - -#ifndef __aarch64__ - const int16x8x2_t zelems = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]), - vreinterpretq_s16_s32(final_accs[1])); - const int8x16_t elems = vreinterpretq_s8_s16(zelems.val[0]); - - const int8x16x2_t zoutput = vuzpq_s8(elems, elems); - const uint8x8_t output = - vget_low_u8(vreinterpretq_u8_s8(zoutput.val[0])); - vst1_u8(get_output_ptr(oi, oj, channel), output); -#else - const int8x16_t elems = vreinterpretq_s8_s16( - vuzp1q_s16(vreinterpretq_s16_s32(final_accs[0]), - vreinterpretq_s16_s32(final_accs[1]))); - const uint8x8_t output = - vget_low_u8(vreinterpretq_u8_s8(vuzp1q_s8(elems, elems))); - vst1_u8(get_output_ptr(oi, oj, channel), output); -#endif // __aarch64__ - } - } - } - for (; n_channels; n_channels--, channel++) - { - // Load bias - const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr); - wbptr += sizeof(int32_t); - - // Load weights - int16_t weights[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - weights[i][j] = *(wbptr++) - weight_offset; - } - } - - // Load the input activations - int16_t inputs[InnerTileRows][InnerTileCols]; - for (unsigned int i = 0; i < InnerTileRows; i++) - { - for (unsigned int j = 0; j < InnerTileCols; j++) - { - inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset; - } - } - - // Perform the convolution - for (unsigned int oi = 0; oi < OutputTileRows; oi++) - { - for (unsigned int oj = 0; oj < OutputTileCols; oj++) - { - int32_t acc = bias; - - for (unsigned int wi = 0; wi < KernelRows; wi++) - { - for (unsigned int wj = 0; wj < KernelCols; wj++) - { - const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj]; - acc += w * x; - } - } - - // Requantize - acc = rounding_divide_by_exp2( - saturating_doubling_high_mul(acc, requant_multiplier), - requant_shift); - acc += output_offset; - acc = std::max(acc, clamp_min); - acc = std::min(acc, clamp_max); - uint8_t output = static_cast<uint8_t>(acc); - *(get_output_ptr(oi, oj, channel)) = output; - } - } - } -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols, - typename FInput, typename FOutput -> -static inline void execute_tilefn( - int n_channels, - const void* packed_params, - const nck::ActivationFunction actfn, - FInput &get_input_ptr, - FOutput &get_output_ptr, - const QAsymm8Params &input_quant, - const QAsymm8Params &weight_quant, - const QAsymm8Params &output_quant, - const QAsymm8RescaleParams &requant -) { - // Compute min/max clamp values - int32_t clamp_min = std::numeric_limits<uint8_t>::min(); - int32_t clamp_max = std::numeric_limits<uint8_t>::max(); - - if (actfn == nck::ActivationFunction::ReLU || - actfn == nck::ActivationFunction::ReLU6) { - const int32_t bottom_rail = output_quant.offset; - clamp_min = std::max(clamp_min, bottom_rail); - } - - if (actfn == nck::ActivationFunction::ReLU6) { - const int32_t top_rail = output_quant.quantize(6.0f); - clamp_max = std::min(clamp_max, top_rail); - } - - // Call the tile execution method - tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, - StrideCols>(n_channels, packed_params, get_input_ptr, get_output_ptr, - clamp_max, clamp_min, input_quant.offset, - weight_quant.offset, output_quant.offset, - requant.multiplier, requant.shift); -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -template <nck::ActivationFunction Activation> -void QAsymm8DepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::execute_tile( - int n_channels, - const void* packed_params, - const uint8_t* inptr, - unsigned int in_row_stride, - unsigned int in_col_stride, - uint8_t* outptr, - unsigned int out_row_stride, - unsigned int out_col_stride -) { - // Construct methods to get pointers - const auto get_input_ptr = [inptr, in_row_stride, in_col_stride]( - const int i, const int j, const int channel) { - return inptr + i * in_row_stride + j * in_col_stride + channel; - }; - - const auto get_output_ptr = [outptr, out_row_stride, out_col_stride]( - const int i, const int j, const int channel) { - return outptr + i * out_row_stride + j * out_col_stride + channel; - }; - - execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols, - StrideRows, StrideCols>( - n_channels, packed_params, Activation, get_input_ptr, get_output_ptr, - _inputs_quant, _weights_quant, _output_quant, rescale_parameters); -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -template <nck::ActivationFunction Activation> -void QAsymm8DepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::execute_tile( - int n_channels, - const void* packed_params, - const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols] -) { - // Construct methods to get pointers - const auto get_input_ptr = [inptrs](const int i, const int j, - const int channel) { - return inptrs[i][j] + channel; - }; - - const auto get_output_ptr = [outptrs](const int i, const int j, - const int channel) { - return outptrs[i][j] + channel; - }; - - // Call the tile execution method - execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols, - StrideRows, StrideCols>( - n_channels, packed_params, Activation, get_input_ptr, get_output_ptr, - _inputs_quant, _weights_quant, _output_quant, rescale_parameters); -} - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp deleted file mode 100644 index 68e20d98a9..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp +++ /dev/null @@ -1,457 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - * - * NOTE: Header to be included by implementation files only. - * - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - */ - -#include <limits> - -#include "arm.hpp" -#include "impl_base.hpp" -#include "depthwise_quantized.hpp" - -#pragma once - -namespace { - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols, - typename FInput, typename FOutput -> -static inline void tilefn_hybrid( - int n_channels, - const void* packed_params, - FInput &get_input_ptr, - FOutput &get_output_ptr, - int32_t clamp_min, - int32_t clamp_max, - uint8_t input_offset, - uint8_t output_offset -) -{ - constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows; - constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols; - - // Offset into channels - int channel = 0; - - // Byte type pointer to weights and biases - const int8_t *wbptr = static_cast<const int8_t *>(packed_params); - - for (; n_channels >= 8; n_channels -= 8, channel += 8) - { - const int32x4_t biases[2] = { - vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)), - vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4), - }; - const int32x4_t multipliers[2] = { - vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8), - vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12), - }; - const int32x4_t shifts[2] = { - vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 16), - vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 20), - }; - wbptr += 24*sizeof(int32_t); - - int16x8_t weights[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - const auto w = vld1_s8(wbptr); - weights[i][j] = reinterpret_cast<int16x8_t>(vmovl_s8(w)); - wbptr += 8; - } - } - - int16x8_t inputs[InnerTileRows][InnerTileCols]; - const uint8x8_t ioffset = vdup_n_u8(input_offset); - for (unsigned int i = 0; i < InnerTileRows; i++) - { - for (unsigned int j = 0; j < InnerTileCols; j++) - { - const auto x = vld1_u8(get_input_ptr(i, j, channel)); - inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset)); - } - } - - for (unsigned int oi = 0; oi < OutputTileRows; oi++) - { - for (unsigned int oj = 0; oj < OutputTileCols; oj++) - { - int32x4_t accs[2]; - for (unsigned int i = 0; i < 2; i++) - { - accs[i] = biases[i]; - } - - for (unsigned int wi = 0; wi < KernelRows; wi++) - { - for (unsigned int wj = 0; wj < KernelCols; wj++) - { - const auto w = weights[wi][wj]; - const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj]; - accs[0] = vmlal_s16(accs[0], vget_low_s16(w), vget_low_s16(x)); - accs[1] = vmlal_s16(accs[1], vget_high_s16(w), vget_high_s16(x)); - } - } - - int32x4_t final_accs[2]; - for (unsigned int i = 0; i < 2; i++) - { - const int32x4_t y = rounding_divide_by_exp2( - saturating_doubling_high_mul(accs[i], multipliers[i]), - shifts[i]); - const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset)); - final_accs[i] = vaddq_s32(y, offset); - final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min)); - final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max)); - } - - const auto elems_s16 = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]), - vreinterpretq_s16_s32(final_accs[1])); - const int8x16_t elems = vreinterpretq_s8_s16(elems_s16.val[0]); - const uint8x8_t output = - vget_low_u8(vreinterpretq_u8_s8(vuzpq_s8(elems, elems).val[0])); - - vst1_u8(get_output_ptr(oi, oj, channel), output); - } - } - } - - for (; n_channels; n_channels--, channel++) - { - // Load bias - const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr); - const int32_t multiplier = *reinterpret_cast<const int32_t *>(wbptr + sizeof(int32_t)); - const int32_t shift = *reinterpret_cast<const int32_t *>(wbptr + 2*sizeof(int32_t)); - - wbptr += 3*sizeof(int32_t); - - // Load weights - int16_t weights[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - weights[i][j] = *(wbptr++); - } - } - - // Load the input activations - int16_t inputs[InnerTileRows][InnerTileCols]; - for (unsigned int i = 0; i < InnerTileRows; i++) - { - for (unsigned int j = 0; j < InnerTileCols; j++) - { - inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset; - } - } - - // Perform the convolution - for (unsigned int oi = 0; oi < OutputTileRows; oi++) - { - for (unsigned int oj = 0; oj < OutputTileCols; oj++) - { - int32_t acc = bias; - - for (unsigned int wi = 0; wi < KernelRows; wi++) - { - for (unsigned int wj = 0; wj < KernelCols; wj++) - { - const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj]; - acc += w * x; - } - } - - // Requantize - acc = rounding_divide_by_exp2( - saturating_doubling_high_mul(acc, multiplier), - -shift); - acc += output_offset; - acc = std::max(acc, clamp_min); - acc = std::min(acc, clamp_max); - uint8_t output = static_cast<uint8_t>(acc); - *(get_output_ptr(oi, oj, channel)) = output; - } - } - } -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols, - typename FInput, typename FOutput -> -static inline void execute_tilefn_hybrid( - int n_channels, - const void* packed_params, - const ActivationFunction actfn, - const qasymm8::QAsymm8Params &input_quant, - const qasymm8::QAsymm8Params &output_quant, - FInput &get_input_ptr, - FOutput &get_output_ptr) { - - // Compute min/max clamp values - int32_t clamp_min = std::numeric_limits<uint8_t>::min(); - int32_t clamp_max = std::numeric_limits<uint8_t>::max(); - - if (actfn == ActivationFunction::ReLU) { - clamp_min = output_quant.offset; - } - - // Disabling Relu6 for now - if (actfn == ActivationFunction::ReLU6) { - const int32_t top_rail = output_quant.quantize(6.0f); - clamp_max = std::min(clamp_max, top_rail); - } - - // Call the tile execution method - tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, - StrideCols>(n_channels, packed_params, get_input_ptr, get_output_ptr, clamp_min, clamp_max, input_quant.offset, output_quant.offset); -} -} - - - -namespace depthwise { -using namespace qsymm8; -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -QSymm8HybridPerChannelDepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::QSymm8HybridPerChannelDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - const ActivationFunction activation, - const QSymm8PerChannelParams& weight_quantisation, - const qasymm8::QAsymm8Params& input_quantisation, - const qasymm8::QAsymm8Params& output_quantisation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right -) : QSymm8HybridPerChannelDepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, - activation, weight_quantisation, input_quantisation, output_quantisation, - QSymm8PerChannelRescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation), - padding_top, padding_left, padding_bottom, padding_right - ) -{ -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -QSymm8HybridPerChannelDepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::QSymm8HybridPerChannelDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - const ActivationFunction activation, - const QSymm8PerChannelParams& weight_quantisation, - const qasymm8::QAsymm8Params& input_quantisation, - const qasymm8::QAsymm8Params& output_quantisation, - const QSymm8PerChannelRescaleParams& rescale_params, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right -) : Base( - n_batches, n_input_rows, n_input_cols, n_channels, activation, - padding_top, padding_left, padding_bottom, padding_right - ), - _weights_quant(weight_quantisation), - _input_quant(input_quantisation), - _output_quant(output_quantisation), - _rescale_parameters(rescale_params) -{ -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -uint8_t QSymm8HybridPerChannelDepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::_input_padding_value(void) const -{ - return _input_quant.offset; -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -void QSymm8HybridPerChannelDepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::_pack_params( - void * const buffer, - const void * const weights, - const unsigned int weight_row_stride, - const unsigned int weight_col_stride, - const void * const biases -) const -{ - const int8_t *wptr = static_cast<const int8_t *>(weights); - const int32_t *bptr = static_cast<const int32_t *>(biases); - const int32_t *mptr = static_cast<const int32_t *>(_rescale_parameters.multipliers.data()); - const int32_t *sptr = static_cast<const int32_t *>(_rescale_parameters.shifts.data()); - int8_t *outptr = static_cast<int8_t *>(buffer); - - // We set the vector length to use doubles on both Aarch64 and Aarch32. NOTE - // For SVE set this to half the vector length. - unsigned int veclen = 8; - - // While there are channels left to process, pack a vector length of them at - // a time and reduce the size of vector used as the size of the tensor - // decreases. - for ( - unsigned int n_channels = this->n_channels(); n_channels; - n_channels -= veclen, - outptr += veclen*(3*sizeof(int32_t) + this->kernel_rows*this->kernel_cols) - ) - { - // NOTE Ignore this section if using SVE, the vector length remains the - // same and we just don't fill a full register for the tail. - while (n_channels < veclen) - { - // Reduce the vector length to either 8 or 1 (scalar) - // TODO Support more vector lengths in `execute_tile`. - veclen = (veclen == 16) ? 8 : 1; - } - - // Get pointers to bias and weight portions of the output structure. - int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr); - int32_t *out_mptr = reinterpret_cast<int32_t *>(outptr + veclen*sizeof(int32_t)); - int32_t *out_sptr = reinterpret_cast<int32_t *>(outptr + 2*veclen*sizeof(int32_t)); - int8_t *out_wptr = outptr + 3*veclen*sizeof(int32_t); - - // Copy a vector length of elements - for (unsigned int n = 0; n < veclen && n < n_channels; n++) - { - const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0; - const int32_t multiplier = (mptr != nullptr) ? *(mptr++) : 0; - const int32_t shift = (sptr != nullptr) ? *(sptr++) : 0; - - out_bptr[n] = bias; - out_mptr[n] = multiplier; - out_sptr[n] = -shift; - - for (unsigned int i = 0; i < KernelRows; i++) - { - int8_t *row_outptr = out_wptr + i*KernelCols*veclen; - for (unsigned int j = 0; j < KernelCols; j++) - { - int8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride); - row_outptr[j*veclen + n] = w; - } - } - wptr++; - } - } -} - - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -template <ActivationFunction Activation> -void QSymm8HybridPerChannelDepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::execute_tile( - int n_channels, - const void* packed_params, - const uint8_t* inptr, - unsigned int in_row_stride, - unsigned int in_col_stride, - uint8_t* outptr, - unsigned int out_row_stride, - unsigned int out_col_stride -) { - - // Construct methods to get pointers - const auto get_input_ptr = [inptr, in_row_stride, in_col_stride]( - const int i, const int j, const int channel) { - return inptr + i * in_row_stride + j * in_col_stride + channel; - }; - - const auto get_output_ptr = [outptr, out_row_stride, out_col_stride]( - const int i, const int j, const int channel) { - return outptr + i * out_row_stride + j * out_col_stride + channel; - }; - - execute_tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols, - StrideRows, StrideCols>( - n_channels, packed_params, Activation, _input_quant, _output_quant, get_input_ptr, get_output_ptr); -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -template <ActivationFunction Activation> -void QSymm8HybridPerChannelDepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::execute_tile( - int n_channels, - const void* packed_params, - const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols] -) { - // Construct methods to get pointers - const auto get_input_ptr = [inptrs](const int i, const int j, - const int channel) { - return inptrs[i][j] + channel; - }; - - const auto get_output_ptr = [outptrs](const int i, const int j, - const int channel) { - return outptrs[i][j] + channel; - }; - - // Call the tile execution method - execute_tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols, - StrideRows, StrideCols>( - n_channels, packed_params, Activation, _input_quant, _output_quant, get_input_ptr, get_output_ptr); -} - -} // namespace depthwise |