diff options
author | Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> | 2023-09-27 17:46:17 +0100 |
---|---|---|
committer | felixjohnny.thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> | 2023-09-28 12:08:05 +0000 |
commit | afd38f0c617d6f89b2b4532c6c44f116617e2b6f (patch) | |
tree | 03bc7d5a762099989b16a656fa8d397b490ed70e /src/core/NEON/kernels/assembly | |
parent | bdcb4c148ee2fdeaaddf4cf1e57bbb0de02bb894 (diff) | |
download | ComputeLibrary-afd38f0c617d6f89b2b4532c6c44f116617e2b6f.tar.gz |
Apply clang-format on repository
Code is formatted as per a revised clang-format configuration
file (not part of this delivery). Version 14.0.6 is used.
Exclusion List:
- files with .cl extension
- files that are not strictly C/C++ (e.g. Android.bp, SConscript ...)
And the following directories:
- compute_kernel_writer/validation/
- tests/
- include/
- src/core/NEON/kernels/convolution/
- src/core/NEON/kernels/arm_gemm/
- src/core/NEON/kernels/arm_conv/
- data/
There will be a follow-up for formatting of .cl files and the
files under tests/ and compute_kernel_writer/validation/.
Signed-off-by: Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>
Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/assembly')
-rw-r--r-- | src/core/NEON/kernels/assembly/depthwise.hpp | 270 | ||||
-rw-r--r-- | src/core/NEON/kernels/assembly/depthwise_common.hpp | 106 | ||||
-rw-r--r-- | src/core/NEON/kernels/assembly/pool_common.hpp | 71 | ||||
-rw-r--r-- | src/core/NEON/kernels/assembly/pooling.hpp | 210 | ||||
-rw-r--r-- | src/core/NEON/kernels/assembly/premultiply.hpp | 17 | ||||
-rw-r--r-- | src/core/NEON/kernels/assembly/winograd.hpp | 181 |
6 files changed, 436 insertions, 419 deletions
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp index dbd47ccfa9..13c2d314e4 100644 --- a/src/core/NEON/kernels/assembly/depthwise.hpp +++ b/src/core/NEON/kernels/assembly/depthwise.hpp @@ -38,9 +38,8 @@ struct DepthwiseConfig DepthwiseMethod method = DepthwiseMethod::DEFAULT; std::string filter = ""; - DepthwiseConfig(DepthwiseMethod method) - : method(method) {}; - DepthwiseConfig() {}; + DepthwiseConfig(DepthwiseMethod method) : method(method){}; + DepthwiseConfig(){}; }; struct DepthwiseArgs @@ -63,18 +62,24 @@ struct DepthwiseArgs bool fast_mode = false; - DepthwiseArgs( - const CPUInfo *cpu_info, - unsigned int kernel_rows, unsigned int kernel_cols, - unsigned int stride_rows, unsigned int stride_cols, - unsigned int dilation_rows, unsigned int dilation_cols, - unsigned int n_batches, unsigned int input_rows, unsigned int input_cols, - unsigned int input_channels, - unsigned int output_rows, unsigned int output_cols, - unsigned int channel_multiplier, - PaddingValues padding, arm_gemm::Activation activation, - - const DepthwiseConfig *config) + DepthwiseArgs(const CPUInfo *cpu_info, + unsigned int kernel_rows, + unsigned int kernel_cols, + unsigned int stride_rows, + unsigned int stride_cols, + unsigned int dilation_rows, + unsigned int dilation_cols, + unsigned int n_batches, + unsigned int input_rows, + unsigned int input_cols, + unsigned int input_channels, + unsigned int output_rows, + unsigned int output_cols, + unsigned int channel_multiplier, + PaddingValues padding, + arm_gemm::Activation activation, + + const DepthwiseConfig *config) : cpu_info(cpu_info), kernel_rows(kernel_rows), kernel_cols(kernel_cols), @@ -95,20 +100,38 @@ struct DepthwiseArgs { } - DepthwiseArgs( - const CPUInfo *cpu_info, - unsigned int kernel_rows, unsigned int kernel_cols, - unsigned int stride_rows, unsigned int stride_cols, - unsigned int n_batches, unsigned int input_rows, unsigned int input_cols, - unsigned int 
input_channels, - unsigned int output_rows, unsigned int output_cols, - unsigned int channel_multiplier, - PaddingValues padding, arm_gemm::Activation activation, - const DepthwiseConfig *config) - : DepthwiseArgs(cpu_info, kernel_rows, kernel_cols, stride_rows, - stride_cols, 1, 1, n_batches, input_rows, input_cols, - input_channels, output_rows, output_cols, - channel_multiplier, padding, activation, config) + DepthwiseArgs(const CPUInfo *cpu_info, + unsigned int kernel_rows, + unsigned int kernel_cols, + unsigned int stride_rows, + unsigned int stride_cols, + unsigned int n_batches, + unsigned int input_rows, + unsigned int input_cols, + unsigned int input_channels, + unsigned int output_rows, + unsigned int output_cols, + unsigned int channel_multiplier, + PaddingValues padding, + arm_gemm::Activation activation, + const DepthwiseConfig *config) + : DepthwiseArgs(cpu_info, + kernel_rows, + kernel_cols, + stride_rows, + stride_cols, + 1, + 1, + n_batches, + input_rows, + input_cols, + input_channels, + output_rows, + output_cols, + channel_multiplier, + padding, + activation, + config) { } }; @@ -127,17 +150,18 @@ struct Tile { } - Tile() - : Tile(nullptr, 0, 0, 0) + Tile() : Tile(nullptr, 0, 0, 0) { } - void load_from( - const TInput *input, - const unsigned int ld_row, const unsigned int ld_col, - const unsigned int n_rows, const unsigned int n_cols, - const int input_i, const int input_j, - const unsigned int channel_multiplier) const + void load_from(const TInput *input, + const unsigned int ld_row, + const unsigned int ld_col, + const unsigned int n_rows, + const unsigned int n_cols, + const int input_i, + const int input_j, + const unsigned int channel_multiplier) const { const auto pad_top = input_i < 0 ? -input_i : 0; const auto pad_left = input_j < 0 ? 
-input_j : 0; @@ -145,18 +169,15 @@ struct Tile const auto padded_rows = std::min(n_rows - input_i, tile_rows) - pad_top; const auto padded_cols = std::min(n_cols - input_j, tile_cols) - pad_left; - if(padded_rows < tile_rows || padded_cols < tile_cols) + if (padded_rows < tile_rows || padded_cols < tile_cols) { memset(array, 0, tile_rows * tile_cols * tile_channels * sizeof(TInput)); } - do_premultiply<TInput>( - (TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, - ld_row, ld_col, - array + pad_top * tile_cols * tile_channels + pad_left * tile_channels, - tile_cols * tile_channels, tile_channels, - padded_rows, padded_cols, tile_channels / channel_multiplier, - channel_multiplier); + do_premultiply<TInput>((TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, ld_row, + ld_col, array + pad_top * tile_cols * tile_channels + pad_left * tile_channels, + tile_cols * tile_channels, tile_channels, padded_rows, padded_cols, + tile_channels / channel_multiplier, channel_multiplier); } }; @@ -168,9 +189,8 @@ protected: std::string m_name{}; public: - DepthwiseCommon(const DepthwiseArgs &args) - : m_args(args) {}; - DepthwiseCommon(DepthwiseCommon &) = delete; + DepthwiseCommon(const DepthwiseArgs &args) : m_args(args){}; + DepthwiseCommon(DepthwiseCommon &) = delete; DepthwiseCommon &operator=(DepthwiseCommon &) = delete; std::string name() const override @@ -181,19 +201,18 @@ public: void set_name(std::string name) { // Only allow the name to be set once - if(m_name.empty()) + if (m_name.empty()) { m_name = name; } } - void execute( - const void *const input, - const void *const parameters, - void *const output, - void *const working_space, - const unsigned int thread_id, - const unsigned int n_threads) const override final + void execute(const void *const input, + const void *const parameters, + void *const output, + void *const working_space, + const unsigned int thread_id, + const unsigned int n_threads) const 
override final { const size_t ld_input_col = m_args.input_channels; const size_t ld_input_row = ld_input_col * m_args.input_cols; @@ -202,56 +221,47 @@ public: const size_t ld_output_row = ld_output_col * m_args.output_cols; const size_t ld_output_batch = ld_output_row * m_args.output_rows; - execute( - input, ld_input_col, ld_input_row, ld_input_batch, - parameters, output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, n_threads); + execute(input, ld_input_col, ld_input_row, ld_input_batch, parameters, output, ld_output_col, ld_output_row, + ld_output_batch, working_space, thread_id, n_threads); } - void execute( - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *const parameters, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *const working_space, - const unsigned int thread_id, - const unsigned int n_threads) const override final + void execute(const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *const parameters, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *const working_space, + const unsigned int thread_id, + const unsigned int n_threads) const override final { - execute( - m_args.n_batches, m_args.input_rows, m_args.input_cols, - m_args.input_channels, m_args.padding, - input, ld_input_col, ld_input_row, ld_input_batch, - parameters, - m_args.output_rows, m_args.output_cols, - output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, n_threads); + execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.input_channels, m_args.padding, input, + ld_input_col, ld_input_row, ld_input_batch, parameters, m_args.output_rows, m_args.output_cols, output, + ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, n_threads); } - void execute( - unsigned int 
batches, - unsigned int input_height, - unsigned int input_width, - unsigned int channels, - const PaddingValues &padding, - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - unsigned int output_height, - unsigned int output_width, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const override final + void execute(unsigned int batches, + unsigned int input_height, + unsigned int input_width, + unsigned int channels, + const PaddingValues &padding, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + unsigned int output_height, + unsigned int output_width, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const override final { // Construct a new set of arguments to reflect that we might have been // passed different input/output tensors. 
Dilation is handled at this @@ -271,38 +281,33 @@ public: auto ld_output_col_d = ld_output_col * m_args.dilation_cols; auto ld_output_row_d = ld_output_row * m_args.dilation_rows; - for(size_t drow = 0; drow < m_args.dilation_rows; drow++) + for (size_t drow = 0; drow < m_args.dilation_rows; drow++) { size_t start_i; - std::tie(args.output_rows, args.input_rows, start_i, - args.padding.top, args.padding.bottom) = - get_reduced_view_for_dilation( - output_height, input_height, drow, m_args.dilation_rows, - m_args.kernel_rows, m_args.stride_rows, padding.top); + std::tie(args.output_rows, args.input_rows, start_i, args.padding.top, args.padding.bottom) = + get_reduced_view_for_dilation(output_height, input_height, drow, m_args.dilation_rows, + m_args.kernel_rows, m_args.stride_rows, padding.top); auto input_row = static_cast<const TInput *>(input) + start_i * ld_input_row; auto output_row = static_cast<TOutput *>(output) + drow * ld_output_row; - if(args.output_rows) + if (args.output_rows) { - for(size_t dcol = 0; dcol < m_args.dilation_cols; dcol++) + for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++) { size_t start_j; - std::tie(args.output_cols, args.input_cols, start_j, - args.padding.left, args.padding.right) = - get_reduced_view_for_dilation( - output_width, input_width, dcol, m_args.dilation_cols, - m_args.kernel_cols, m_args.stride_cols, padding.left); + std::tie(args.output_cols, args.input_cols, start_j, args.padding.left, args.padding.right) = + get_reduced_view_for_dilation(output_width, input_width, dcol, m_args.dilation_cols, + m_args.kernel_cols, m_args.stride_cols, padding.left); const TInput *input_col = input_row + start_j * ld_input_col; TOutput *output_col = output_row + dcol * ld_output_col; - if(args.output_cols) + if (args.output_cols) { - this->execute_internal( - args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, parameters, - output_col, ld_output_col_d, ld_output_row_d, ld_output_batch, - working_space, thread_id, 
n_threads); + this->execute_internal(args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, + parameters, output_col, ld_output_col_d, ld_output_row_d, + ld_output_batch, working_space, thread_id, n_threads); } } } @@ -310,20 +315,19 @@ public: } protected: - virtual void execute_internal( - const DepthwiseArgs &instance_args, - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; + virtual void execute_internal(const DepthwiseArgs &instance_args, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; virtual bool uses_premultiply() const { diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp index a5db793b3d..5ff848e281 100644 --- a/src/core/NEON/kernels/assembly/depthwise_common.hpp +++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp @@ -49,11 +49,7 @@ struct KernelDescription bool is_default = false; uint64_t cycle_estimate = 0; - KernelDescription( - DepthwiseMethod method, - std::string name, - bool is_default, - uint64_t cycle_estimate) + KernelDescription(DepthwiseMethod method, std::string name, bool is_default, uint64_t cycle_estimate) : method(method), name(name), is_default(is_default), cycle_estimate(cycle_estimate) { } @@ -78,58 +74,51 @@ public: // pointer the bias vector (which may be nullptr in the case of no bias) and // a pointer to the array of weights (stored in HWIO order). 
virtual void pack_parameters( - void *buffer, - const void *biases, - const void *weights, - size_t ld_weight_col = 0, - size_t ld_weight_row = 0) = 0; + void *buffer, const void *biases, const void *weights, size_t ld_weight_col = 0, size_t ld_weight_row = 0) = 0; // Determine the amount of working space required virtual size_t get_working_size(unsigned int n_threads) const = 0; // Execute the convolution over the specified area of memory. - virtual void execute( - const void *input, // Pointer to input tensor - const void *parameters, // Packed parameters buffer - void *output, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; - - virtual void execute( - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; - - virtual void execute( - unsigned int batches, - unsigned int input_height, - unsigned int input_width, - unsigned int channels, - const PaddingValues &, - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - unsigned int output_height, - unsigned int output_width, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; + virtual void execute(const void *input, // Pointer to input tensor + const void *parameters, // Packed parameters buffer + void *output, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; + + virtual void execute(const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int 
thread_id, + unsigned int n_threads) const = 0; + + virtual void execute(unsigned int batches, + unsigned int input_height, + unsigned int input_width, + unsigned int channels, + const PaddingValues &, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + unsigned int output_height, + unsigned int output_width, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; // To handle a dilation factor of D execute the kernel once for each d in @@ -145,12 +134,13 @@ public: // - Number of valid input pixels corresponding to `d` // - Offset of the first pixel corresponding to `d` // - Amount of padding in the view for `d` -std::tuple<size_t, size_t, size_t, size_t, size_t> -get_reduced_view_for_dilation( - size_t out_size, size_t in_size, - size_t d, size_t dilation_factor, - size_t kernel_size, size_t stride, - size_t pad_before); +std::tuple<size_t, size_t, size_t, size_t, size_t> get_reduced_view_for_dilation(size_t out_size, + size_t in_size, + size_t d, + size_t dilation_factor, + size_t kernel_size, + size_t stride, + size_t pad_before); } // namespace depthwise } // namespace arm_conv diff --git a/src/core/NEON/kernels/assembly/pool_common.hpp b/src/core/NEON/kernels/assembly/pool_common.hpp index f1f70cf1d6..045f9f95d3 100644 --- a/src/core/NEON/kernels/assembly/pool_common.hpp +++ b/src/core/NEON/kernels/assembly/pool_common.hpp @@ -68,45 +68,42 @@ public: virtual size_t get_working_size(unsigned int num_threads) const = 0; // Execute pooling over the specified area of memory. 
- virtual void execute( - const void *const input, - void *const output, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute(const void *const input, + void *const output, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; - virtual void execute( - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute(const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; - virtual void execute( - unsigned int batches, - unsigned int height, - unsigned int width, - unsigned int channels, - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const PaddingValues &, - unsigned int output_height, - unsigned int output_width, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute(unsigned int batches, + unsigned int height, + unsigned int width, + unsigned int channels, + const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const PaddingValues &, + unsigned int output_height, + unsigned int output_width, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; }; } // namespace pooling diff --git a/src/core/NEON/kernels/assembly/pooling.hpp 
b/src/core/NEON/kernels/assembly/pooling.hpp index e8db35c593..89d594298e 100644 --- a/src/core/NEON/kernels/assembly/pooling.hpp +++ b/src/core/NEON/kernels/assembly/pooling.hpp @@ -36,9 +36,8 @@ struct PoolingConfig PoolingMethod method = PoolingMethod::DEFAULT; std::string filter = ""; - PoolingConfig(PoolingMethod method) - : method(method) {}; - PoolingConfig() {}; + PoolingConfig(PoolingMethod method) : method(method){}; + PoolingConfig(){}; }; struct PoolingArgs @@ -57,30 +56,40 @@ struct PoolingArgs const PoolingConfig *config; - PoolingArgs( - const CPUInfo *cpu_info, - PoolingType pool_type, - const PoolingWindow &window, - const PoolingStride &stride, - bool exclude_padding, - unsigned int n_batches, - unsigned int input_rows, - unsigned int input_cols, - unsigned int n_channels, - unsigned int output_rows, - unsigned int output_cols, - const PaddingValues &padding, - const PoolingConfig *cfg) - : cpu_info(cpu_info), pool_type(pool_type), pool_window(window), pool_stride(stride), exclude_padding(exclude_padding), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols), - n_channels(n_channels), output_rows(output_rows), output_cols(output_cols), padding(padding), config(cfg) + PoolingArgs(const CPUInfo *cpu_info, + PoolingType pool_type, + const PoolingWindow &window, + const PoolingStride &stride, + bool exclude_padding, + unsigned int n_batches, + unsigned int input_rows, + unsigned int input_cols, + unsigned int n_channels, + unsigned int output_rows, + unsigned int output_cols, + const PaddingValues &padding, + const PoolingConfig *cfg) + : cpu_info(cpu_info), + pool_type(pool_type), + pool_window(window), + pool_stride(stride), + exclude_padding(exclude_padding), + n_batches(n_batches), + input_rows(input_rows), + input_cols(input_cols), + n_channels(n_channels), + output_rows(output_rows), + output_cols(output_cols), + padding(padding), + config(cfg) { // If either of the pooling window dimensions are set to zero, meaning // "pool 
everything", then replace with the corresponding input dimension. - if(pool_window.rows == 0) + if (pool_window.rows == 0) { pool_window.rows = input_rows; } - if(pool_window.cols == 0) + if (pool_window.cols == 0) { pool_window.cols = input_cols; } @@ -100,10 +109,16 @@ struct Requantize32 int32_t per_layer_right_shift = 0; int32_t per_layer_mul = 0; - Requantize32(int32_t input_offset, int32_t output_offset, - int32_t per_layer_left_shift, int32_t per_layer_right_shift, + Requantize32(int32_t input_offset, + int32_t output_offset, + int32_t per_layer_left_shift, + int32_t per_layer_right_shift, int32_t per_layer_mul) - : input_offset(input_offset), output_offset(output_offset), per_layer_left_shift(per_layer_left_shift), per_layer_right_shift(per_layer_right_shift), per_layer_mul(per_layer_mul) + : input_offset(input_offset), + output_offset(output_offset), + per_layer_left_shift(per_layer_left_shift), + per_layer_right_shift(per_layer_right_shift), + per_layer_mul(per_layer_mul) { } }; @@ -115,105 +130,88 @@ protected: const PoolingArgs m_args; public: - PoolingCommon(const PoolingArgs &args) - : m_args(args) + PoolingCommon(const PoolingArgs &args) : m_args(args) { } - PoolingCommon(PoolingCommon &) = delete; + PoolingCommon(PoolingCommon &) = delete; PoolingCommon &operator=(PoolingCommon &) = delete; size_t get_working_size(unsigned int) const override = 0; // Execute pooling over the specified area of memory. 
- void execute( - const void *const input, - void *const output, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const override + void execute(const void *const input, + void *const output, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const override { - this->execute( - input, - m_args.n_channels, - m_args.n_channels * m_args.input_cols, - m_args.n_channels * m_args.input_cols * m_args.input_rows, - output, - m_args.n_channels, - m_args.n_channels * m_args.output_cols, - m_args.n_channels * m_args.output_cols * m_args.output_rows, - working_space, - thread_id, num_threads); + this->execute(input, m_args.n_channels, m_args.n_channels * m_args.input_cols, + m_args.n_channels * m_args.input_cols * m_args.input_rows, output, m_args.n_channels, + m_args.n_channels * m_args.output_cols, + m_args.n_channels * m_args.output_cols * m_args.output_rows, working_space, thread_id, + num_threads); } - void execute( - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const override + void execute(const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const override { - this->execute( - m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, - input, ld_input_col, ld_input_row, ld_input_batch, - m_args.padding, m_args.output_rows, m_args.output_cols, - output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, num_threads); + this->execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, input, ld_input_col, + ld_input_row, 
ld_input_batch, m_args.padding, m_args.output_rows, m_args.output_cols, output, + ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, num_threads); } - void execute( - unsigned int batches, - unsigned int height, - unsigned int width, - unsigned int channels, - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const PaddingValues &padding, - unsigned int output_height, - unsigned int output_width, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const override + void execute(unsigned int batches, + unsigned int height, + unsigned int width, + unsigned int channels, + const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const PaddingValues &padding, + unsigned int output_height, + unsigned int output_width, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const override { - this->execute_internal( - batches, height, width, channels, padding, - input, ld_input_col, ld_input_row, ld_input_batch, - output_height, output_width, - output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, num_threads); + this->execute_internal(batches, height, width, channels, padding, input, ld_input_col, ld_input_row, + ld_input_batch, output_height, output_width, output, ld_output_col, ld_output_row, + ld_output_batch, working_space, thread_id, num_threads); } protected: - virtual void execute_internal( - unsigned int batches, - unsigned int height, - unsigned int width, - unsigned int channels, - const PaddingValues &, - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - unsigned int output_height, - unsigned int output_width, - void *const output, - size_t 
ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute_internal(unsigned int batches, + unsigned int height, + unsigned int width, + unsigned int channels, + const PaddingValues &, + const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + unsigned int output_height, + unsigned int output_width, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; }; template <typename TInput, typename TOutput> diff --git a/src/core/NEON/kernels/assembly/premultiply.hpp b/src/core/NEON/kernels/assembly/premultiply.hpp index 16f26de38a..fb97cf8baf 100644 --- a/src/core/NEON/kernels/assembly/premultiply.hpp +++ b/src/core/NEON/kernels/assembly/premultiply.hpp @@ -44,30 +44,27 @@ void do_premultiply(const T *in_ptr, const unsigned input_channels, const unsigned int channel_multiplier) { - if(sizeof(T) == 4 && channel_multiplier == 6) + if (sizeof(T) == 4 && channel_multiplier == 6) { - do_premultiply_float_6( - (const float *)in_ptr, ld_row, ld_col, - (float *)out_ptr, out_ld_row, out_ld_col, - tile_rows, tile_cols, - input_channels); + do_premultiply_float_6((const float *)in_ptr, ld_row, ld_col, (float *)out_ptr, out_ld_row, out_ld_col, + tile_rows, tile_cols, input_channels); } else { - for(unsigned int i = 0; i < tile_rows; i++) + for (unsigned int i = 0; i < tile_rows; i++) { const T *ip2 = in_ptr + i * ld_row; T *op2 = out_ptr + i * out_ld_row; - for(unsigned int j = 0; j < tile_cols; j++) + for (unsigned int j = 0; j < tile_cols; j++) { const T *ip = ip2; T *op = op2; - for(unsigned int c = 0; c < input_channels; c++) + for (unsigned int c = 0; c < input_channels; c++) { T val = *ip; ip++; - for(unsigned int r = 0; r < channel_multiplier; r++) + for (unsigned int r = 0; r < 
channel_multiplier; r++) { op[r] = val; } diff --git a/src/core/NEON/kernels/assembly/winograd.hpp b/src/core/NEON/kernels/assembly/winograd.hpp index 50290757ec..dbf95d23cd 100644 --- a/src/core/NEON/kernels/assembly/winograd.hpp +++ b/src/core/NEON/kernels/assembly/winograd.hpp @@ -45,17 +45,24 @@ struct ConvolutionArgs Shape2D kernel_shape; arm_gemm::Activation activation; - ConvolutionArgs( - unsigned int n_batches, - const Shape2D &input_shape, - unsigned int n_input_channels, - unsigned int pad_top, unsigned int pad_left, - const Shape2D &output_shape, - unsigned int n_output_channels, - const Shape2D kernel_shape, - const arm_gemm::Activation &activation = {}) - : n_batches(n_batches), input_shape(input_shape), n_input_channels(n_input_channels), pad_top(pad_top), pad_left(pad_left), output_shape(output_shape), n_output_channels(n_output_channels), - kernel_shape(kernel_shape), activation(activation) + ConvolutionArgs(unsigned int n_batches, + const Shape2D &input_shape, + unsigned int n_input_channels, + unsigned int pad_top, + unsigned int pad_left, + const Shape2D &output_shape, + unsigned int n_output_channels, + const Shape2D kernel_shape, + const arm_gemm::Activation &activation = {}) + : n_batches(n_batches), + input_shape(input_shape), + n_input_channels(n_input_channels), + pad_top(pad_top), + pad_left(pad_left), + output_shape(output_shape), + n_output_channels(n_output_channels), + kernel_shape(kernel_shape), + activation(activation) { } }; @@ -105,23 +112,30 @@ public: virtual unsigned int get_transformed_tile_rows(void) const = 0; virtual unsigned int get_transformed_tile_cols(void) const = 0; - void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel, - void *outptr, const WinogradDomainSpec &wds, - unsigned int thread_id, unsigned int n_threads) const + void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_row, + size_t ld_in_col, + size_t 
ld_input_channel, + void *outptr, + const WinogradDomainSpec &wds, + unsigned int thread_id, + unsigned int n_threads) const { - this->execute( - args, inptr, ld_in_row, ld_in_col, ld_input_channel, - outptr, wds.weight_ld_matrix, wds.weight_ld_row, - thread_id, n_threads); + this->execute(args, inptr, ld_in_row, ld_in_col, ld_input_channel, outptr, wds.weight_ld_matrix, + wds.weight_ld_row, thread_id, n_threads); } - virtual void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel, - void *outptr, size_t ld_out_matrix, size_t ld_out_row, - unsigned int thread_id, unsigned int n_threads) const = 0; + virtual void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_row, + size_t ld_in_col, + size_t ld_input_channel, + void *outptr, + size_t ld_out_matrix, + size_t ld_out_row, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; } // namespace weight_transform @@ -136,27 +150,35 @@ public: virtual unsigned int get_input_rows(void) const = 0; virtual unsigned int get_input_cols(void) const = 0; - virtual size_t get_working_space_size( - const ConvolutionArgs &args, - unsigned int n_threads) const = 0; - - void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col, - void *outptr, const WinogradDomainSpec &wds, - void *working_space, unsigned int thread_id, unsigned int n_threads) const + virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0; + + void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_batch, + size_t ld_in_row, + size_t ld_in_col, + void *outptr, + const WinogradDomainSpec &wds, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const { - this->execute( - args, inptr, ld_in_batch, ld_in_row, ld_in_col, - outptr, wds.input_ld_batch, wds.input_ld_matrix, wds.input_ld_row, - working_space, thread_id, 
n_threads); + this->execute(args, inptr, ld_in_batch, ld_in_row, ld_in_col, outptr, wds.input_ld_batch, wds.input_ld_matrix, + wds.input_ld_row, working_space, thread_id, n_threads); } - virtual void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col, - void *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row, - void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0; + virtual void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_batch, + size_t ld_in_row, + size_t ld_in_col, + void *outptr, + size_t ld_out_batch, + size_t ld_out_matrix, + size_t ld_out_row, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; } // namespace input_transform @@ -177,31 +199,37 @@ public: virtual unsigned int get_kernel_rows(void) const = 0; virtual unsigned int get_kernel_cols(void) const = 0; - virtual size_t get_working_space_size( - const ConvolutionArgs &args, - unsigned int n_threads) const = 0; - - void execute( - const ConvolutionArgs &args, - const void *inptr, const WinogradDomainSpec &wds, - const void *bias, - void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col, - void *working_space, unsigned int thread_id, unsigned int n_threads) const + virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0; + + void execute(const ConvolutionArgs &args, + const void *inptr, + const WinogradDomainSpec &wds, + const void *bias, + void *outptr, + size_t ld_out_batch, + size_t ld_out_row, + size_t ld_out_col, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const { - this->execute( - args, - inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, - bias, - outptr, ld_out_batch, ld_out_row, ld_out_col, - working_space, thread_id, n_threads); + this->execute(args, inptr, wds.output_ld_batch, wds.output_ld_matrix, 
wds.output_ld_row, bias, outptr, + ld_out_batch, ld_out_row, ld_out_col, working_space, thread_id, n_threads); } - virtual void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row, - const void *bias, - void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col, - void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0; + virtual void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_batch, + size_t ld_in_matrix, + size_t ld_in_row, + const void *bias, + void *outptr, + size_t ld_out_batch, + size_t ld_out_row, + size_t ld_out_col, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; } // namespace output_transform @@ -210,7 +238,7 @@ struct WinogradImpl { const output_transform::ITransform *output_transform = nullptr; const weight_transform::ITransform *weight_transform = nullptr; - const input_transform::ITransform *input_transform = nullptr; + const input_transform::ITransform *input_transform = nullptr; std::unique_ptr<arm_gemm::GemmArgs> gemm_args; WinogradDomainSpec winograd_spec; }; @@ -220,15 +248,18 @@ struct WinogradImpl * Assigns to the pointers in the `dest` struct and returns true or false to * indicate whether the given problem can be executed or not. 
*/ -template <typename TIn, typename TWeight = TIn, typename TOut = TIn, typename TWinogradIn = TIn, typename TWinogradOut = TOut> -bool get_implementation( - WinogradImpl &dest, // Destination for the selected implementation - const CPUInfo *, - const ConvolutionArgs &, - int max_threads, - bool fast_mode, - const WinogradConfig *, - const arm_gemm::GemmConfig *); +template <typename TIn, + typename TWeight = TIn, + typename TOut = TIn, + typename TWinogradIn = TIn, + typename TWinogradOut = TOut> +bool get_implementation(WinogradImpl &dest, // Destination for the selected implementation + const CPUInfo *, + const ConvolutionArgs &, + int max_threads, + bool fast_mode, + const WinogradConfig *, + const arm_gemm::GemmConfig *); } // namespace winograd } // namespace arm_conv |