diff options
Diffstat (limited to 'src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp')
-rw-r--r-- | src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp | 450 |
1 files changed, 211 insertions, 239 deletions
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp index ad95207fb3..556ae2a67a 100644 --- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,288 +24,260 @@ #pragma once -#include "pool_common.hpp" +#include "depthfirst_driver.hpp" +#include "src/core/NEON/kernels/arm_conv/addressing.hpp" #include "utils.hpp" -#include "arm_compute/core/Types.h" +#include <alloca.h> #include <limits> namespace arm_conv { namespace pooling { -template <class strategy> -class PoolingDepthfirst : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type> +template <typename TInput, typename TOutput> +class DepthfirstStrategy : public IDepthfirstStrategy { - using TInput = typename strategy::operand_type; - using TOutput = typename strategy::return_type; + unsigned int input_rows, input_cols, output_rows, output_cols; - const PoolingArgs m_args; // Copy of arguments - - constexpr static unsigned int input_rows(void) + public: + DepthfirstStrategy(unsigned int window_rows, unsigned int window_cols, + unsigned int stride_rows, unsigned int stride_cols, + unsigned int output_rows, unsigned int output_cols) + : input_rows(output_rows + (window_rows - 1) * stride_rows), + input_cols(output_cols + (window_cols - 1) * stride_cols), + output_rows(output_rows), output_cols(output_cols) { - return (strategy::out_rows() - 1)*strategy::stride_rows() + strategy::pool_rows(); } - constexpr static unsigned int input_cols(void) - { - return (strategy::out_cols() - 1)*strategy::stride_cols() + strategy::pool_cols(); - } + unsigned int get_input_rows() const override { return input_rows; } + unsigned int get_input_cols() const override { return input_cols; } + unsigned int get_output_rows() const override { return output_rows; } + unsigned int get_output_cols() const override { return output_cols; } + + typedef void (*KernelType)( + unsigned int n_channels, + const TInput *const *, + TOutput *const *, + bool exclude_padding, + unsigned int pad_left, + unsigned int pad_top, + unsigned int pad_right, + unsigned int pad_bottom + ); + virtual KernelType get_kernel(void) const = 0; +}; + +struct WorkingSpace +{ + void *input_buffer; + void *output_buffer; +}; + + +template <typename TInput, typename TOutput=TInput, class OutputStage=Nothing> +class PoolingDepthfirst : public DepthfirstDriver<TInput, TOutput> +{ size_t sizeof_input_buffer(void) const { - return sizeof(TInput) * m_args.n_channels; + return sizeof(TInput) * this->m_args.n_channels; } size_t sizeof_output_buffer(void) const { - return sizeof(TOutput) * m_args.n_channels; + return sizeof(TOutput) * this->m_args.n_channels; } - public: - PoolingDepthfirst(const PoolingArgs &args) : m_args(args) + protected: + /* Compute the amount of working space required for a single thread. */ + size_t get_working_size_per_thread(unsigned int n_channels) const override { + return sizeof(WorkingSpace) + n_channels * (sizeof(TInput) + sizeof(TOutput)); } - PoolingDepthfirst(PoolingDepthfirst &) = delete; - PoolingDepthfirst &operator=(PoolingDepthfirst &) = delete; - - size_t get_working_size(unsigned int num_threads) const override + /* Initialise the working space for a thread. */ + void initialise_working_space(void *raw_ws, unsigned int n_channels) const override { - // We require a channel-length vector of input padding values - // (to be shared amongst all threads) and (for each thread) a - // channel-length vector in which to dump surplus output. - return sizeof_input_buffer() + num_threads * sizeof_output_buffer(); + auto ws = reinterpret_cast<WorkingSpace *>(raw_ws); + ws->input_buffer = ws + 1; + ws->output_buffer = reinterpret_cast<TInput *>(ws + 1) + n_channels; + + // Fill the input buffer with an appropriate value + TInput fill_val = 0; + if (this->m_args.pool_type == PoolingType::MAX) + { + using limits = std::numeric_limits<TInput>; + if (limits::has_infinity) + { + fill_val = -limits::infinity(); + } + else + { + fill_val = limits::min(); + } + } + + auto ptr = reinterpret_cast<TInput *>(ws->input_buffer); + for (; n_channels; n_channels--) + { + *(ptr++) = fill_val; + } } - void execute( - const void *const input, - void *const output, - void *const working_space, - unsigned int thread_id, - unsigned int num_threads + /* Compute a portion of the output tensor with padding. */ + void compute_tile_padded( + unsigned int output_i, unsigned int output_j, + unsigned int channel_start, unsigned int channel_end, + const TensorSpec<const TInput *> &input, + const TensorSpec<TOutput *> &output, + void *working_space ) const override { - const size_t ld_input_col = m_args.n_channels; - const size_t ld_input_row = ld_input_col * m_args.input_cols; - const size_t ld_input_batch = ld_input_row * m_args.input_rows; - const size_t ld_output_col = ld_input_col; - const size_t ld_output_row = ld_output_col * m_args.output_cols; - const size_t ld_output_batch = ld_output_row * m_args.output_rows; - - execute( - input, ld_input_col, ld_input_row, ld_input_batch, - output, ld_output_col, ld_output_row, ld_output_batch, - working_space, - thread_id, num_threads + const auto kern = reinterpret_cast<const DepthfirstStrategy<TInput, TOutput> *>( + this->m_strat.get())->get_kernel(); + + // Get the working space, and some space on the stack for pointer arrays + auto ws = reinterpret_cast<WorkingSpace *>(working_space); + auto inptr_array = reinterpret_cast<const TInput **>(alloca( + sizeof(TInput *) * this->m_strat->get_input_rows() * this->m_strat->get_input_cols())); + auto outptr_array = reinterpret_cast<TOutput **>(alloca( + sizeof(TOutput *) * this->m_strat->get_output_rows() * this->m_strat->get_output_cols())); + + // Prepare the input pointers + const int ii = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top; + const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0); + const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii); + + const unsigned int end_ii = ii + this->m_strat->get_input_rows(); + const auto input_pad_bottom = end_ii < this->m_args.input_rows ? 0 : end_ii - this->m_args.input_rows; + + const int ij = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left; + const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0); + const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij); + + const unsigned int end_ij = ij + this->m_strat->get_input_cols(); + const auto input_pad_right = end_ij < this->m_args.input_cols ? 0 : end_ij - this->m_args.input_cols; + + fill_pointer_array<const TInput>( + inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(), + input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start, + input.ld_row, input.ld_col, + reinterpret_cast<const TInput *>(ws->input_buffer), + input_pad_top, this->m_args.input_rows - input_i, + input_pad_left, this->m_args.input_cols - input_j ); - } - void execute( - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *const working_space, - unsigned int thread_id, - unsigned int num_threads - ) const override - { - execute( - m_args.n_batches, m_args.input_rows, m_args.input_cols, - m_args.n_channels, - input, ld_input_col, ld_input_row, ld_input_batch, - m_args.padding, - m_args.output_rows, m_args.output_cols, - output, ld_output_col, ld_output_row, ld_output_batch, - working_space, - thread_id, num_threads + // Prepare the output pointers + fill_pointer_array( + outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(), + output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start, + output.ld_row, output.ld_col, + reinterpret_cast<TOutput *>(ws->output_buffer), + 0, this->m_args.output_rows - output_i, // Top padding, # valid rows + 0, this->m_args.output_cols - output_j // Left padding, # valid columns + ); + + // Call the kernel + kern( + channel_end - channel_start, inptr_array, outptr_array, + this->m_args.exclude_padding, + input_pad_left, input_pad_top, + input_pad_right, input_pad_bottom ); } - void execute( - unsigned int batches, - unsigned int height, - unsigned int width, - unsigned int channels, - const void *const _input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const PaddingValues &padding, - unsigned int output_height, - unsigned int output_width, - void *const _output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *const _working_space, - unsigned int thread_id, - unsigned int num_threads + // Compute a portion of the work with only top/bottom padding. + void compute_row_padded_tile_row( + const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols, + const unsigned int channel_start, const unsigned int channel_end, + const TensorSpec<const TInput *> &input, + const TensorSpec<TOutput *> &output, + void *working_space ) const override { - ARM_COMPUTE_UNUSED(batches, ld_input_batch, ld_output_batch); - strategy strat(m_args.cpu_info); -#ifdef CYCLE_PROFILING - arm_gemm::profiler prof; -#endif // CYCLE_PROFILING - - // Cast input and output pointers into the right types - const TInput *const inptr = static_cast<const TInput *>(_input); - TOutput *const outptr = static_cast<TOutput *>(_output); - - const unsigned int roundup_output_rows = roundup(output_height, num_threads); - const unsigned int rows_per_thread = roundup_output_rows / num_threads; - const int start_out_height = static_cast<int>(thread_id * rows_per_thread); - const int end_out_height = std::min<int>(output_height, static_cast<int>((thread_id + 1) * rows_per_thread)); - - // Create an array for the input pointers - const TInput * _inptr_array[input_rows() * input_cols()]; - const TInput **const inptr_array = _inptr_array; - - // Create an array for the output pointers - TOutput * _outptr_array[strategy::out_rows() * strategy::out_cols()]; - TOutput **const outptr_array = _outptr_array; - - // Allocate portions of the working space - uint8_t *const working_space = static_cast<uint8_t *>(_working_space); - TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space + thread_id * sizeof_output_buffer()); - TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + num_threads * sizeof_output_buffer()); - - // Initialise the input buffer - for (unsigned int c = 0; c < channels; c++) - { - TInput &val = input_buffer[c]; + const auto kern = reinterpret_cast<const DepthfirstStrategy<TInput, TOutput> *>( + this->m_strat.get())->get_kernel(); + + // Get the working space, and some space on the stack for pointer arrays + auto ws = reinterpret_cast<WorkingSpace *>(working_space); + auto inptr_array = reinterpret_cast<const TInput **>(alloca( + sizeof(TInput *) * this->m_strat->get_input_rows() * this->m_strat->get_input_cols())); + auto outptr_array = reinterpret_cast<TOutput **>(alloca( + sizeof(TOutput *) * this->m_strat->get_output_rows() * this->m_strat->get_output_cols())); + + // Prepare the initial input pointers + const int ii = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top; + const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0); + const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii); + + const unsigned int end_ii = ii + this->m_strat->get_input_rows(); + const auto input_pad_bottom = end_ii < this->m_args.input_rows ? 0 : end_ii - this->m_args.input_rows; + + const int ij = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left; + const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij); + + const auto end_oi = output_i + this->m_strat->get_output_cols(); + const auto output_pad_bottom = end_oi < this->m_args.output_rows ? 0 : end_oi - this->m_args.output_rows; + + fill_pointer_array<const TInput>( + inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(), + input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start, + input.ld_row, input.ld_col, + reinterpret_cast<const TInput *>(ws->input_buffer), + input_pad_top, this->m_args.input_rows - input_i, + 0, this->m_args.input_cols - input_j + ); - if (strategy::pooling_type() == PoolingType::AVERAGE) - { - val = static_cast<TInput>(0); - } - else if (strategy::pooling_type() == PoolingType::MAX) - { -#if defined(__aarch64__) - using InputType = typename std::conditional<std::is_same<TInput, __fp16>::value, arm_compute::half, TInput>::type; - using limits = std::numeric_limits<InputType>; -#else // defined(__aarch64__) - using limits = std::numeric_limits<TInput>; -#endif // defined(__aarch64__) - if (limits::has_infinity) - { - val = -limits::infinity(); - } - else - { - val = limits::min(); - } - } - } + // Prepare the initial output pointers + fill_pointer_array( + outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(), + output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start, + output.ld_row, output.ld_col, + reinterpret_cast<TOutput *>(ws->output_buffer), + 0, this->m_args.output_rows - output_i, // Top padding, # valid rows + 0, this->m_args.output_cols - output_j // Left padding, # valid columns + ); - // For each output tile, construct the requisite set of pointers and call - // into the kernel. - for (unsigned int batch = 0; batch < batches; batch++) + // Call the kernel + for (; n_tile_cols; n_tile_cols--) { - // Get batch pointers - const auto inptr_batch = inptr + batch * ld_input_batch; - const auto outptr_batch = outptr + batch * ld_output_batch; + kern( + channel_end - channel_start, inptr_array, outptr_array, + this->m_args.exclude_padding, + 0, input_pad_top, + 0, input_pad_bottom + ); + + // Progress the input and output pointer arrays + const auto input_col_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.pool_stride.cols; + for ( + auto n = input_pad_top * this->m_strat->get_input_cols(); + n < (this->m_strat->get_input_rows() - input_pad_bottom) * this->m_strat->get_input_cols(); + n++ + ) + { + inptr_array[n] += input_col_stride; + } - for (int start_out_i = start_out_height; - start_out_i < end_out_height; - start_out_i += static_cast<int>(strategy::out_rows())) + const auto output_col_stride = output.ld_col * this->m_strat->get_output_cols(); + for ( + auto n = 0u; + n < (this->m_strat->get_output_rows() - output_pad_bottom) * this->m_strat->get_output_cols(); + n++ + ) { - const int end_out_i = start_out_i + strategy::out_rows(); - const int start_in_i = start_out_i * strategy::stride_rows() - padding.top; - const int end_in_i = start_in_i + input_rows(); - - // Compute top/bottom padding - TODO Is this right for average pooling? - const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0)); - const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(height) - end_in_i, 0)); - const unsigned int valid_output_rows = std::min( - end_out_i - start_out_i, - static_cast<int>(end_out_height) - start_out_i - ); - - // Fill the input pointer array with padding values - for (auto index = 0u; index < input_rows() * input_cols(); index++) - { - inptr_array[index] = input_buffer; - } - - for (int start_out_j = 0, start_in_j = -padding.left; - start_out_j < static_cast<int>(output_width); - start_out_j += static_cast<int>(strategy::out_cols()), - start_in_j += static_cast<int>(strategy::out_cols()) * strategy::stride_cols()) - { - const int end_out_j = start_out_j + strategy::out_cols(); - const int end_in_j = start_in_j + input_cols(); - - // Compute left/right padding - TODO Is this right for average pooling? - const auto pad_left = static_cast<unsigned int>(-std::min(start_in_j, 0)); - const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(width) - end_in_j, 0)); - - const unsigned int valid_output_cols = std::min( - end_out_j - start_out_j, - static_cast<int>(output_width) - start_out_j - ); - - // Construct the input pointer array - fill the array with pointers to - // the input buffer and then fill in the required values. - for (auto i = pad_top; i < input_rows() - pad_bottom; i++) - { - // Can skip over the left padding because we will have either the - // same or less than the previous tile. - unsigned int j = pad_left; - const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col; - const TInput **ptrs = inptr_array + i * input_cols() + j; - for (; j < input_cols() - pad_right; j++) - { - *(ptrs++) = colptr; - colptr += ld_input_col; - } - for (; j < input_cols(); j++) - { - *(ptrs++) = input_buffer; - } - } - - // Construct the output pointer array. - TOutput **outptr_pos = outptr_array; - for (auto i = 0u; i < valid_output_rows; i++) - { - unsigned int j = 0u; - TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col; - for (; j < valid_output_cols; j++) - { - *(outptr_pos++) = colptr; - colptr += ld_output_col; - } - for (; j < strategy::out_cols(); j++) - { - *(outptr_pos++) = output_buffer; - } - } - for (auto i = valid_output_rows; i < strategy::out_rows(); i++) - { - for (auto j = 0u; j < strategy::out_cols(); j++) - { - *(outptr_pos++) = output_buffer; - } - } - -#ifdef CYCLE_PROFILING - // TODO Work number - auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::out_rows() * strategy::out_cols() * strategy::pool_rows() * strategy::pool_cols())); -#endif - strat.kernel( - channels, inptr_array, outptr_array, - m_args.exclude_padding, pad_left, pad_top, pad_right, pad_bottom - ); - } + outptr_array[n] += output_col_stride; } } } + + public: + PoolingDepthfirst(const DepthfirstStrategy<TInput, TOutput> *strat, + const PoolingArgs &args, const OutputStage &os = {}) + : DepthfirstDriver<TInput, TOutput>(strat, args) + { + ARM_COMPUTE_UNUSED(os); + } }; } // namespace pooling |