diff options
Diffstat (limited to 'src/core/NEON/kernels/assembly')
-rw-r--r-- | src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h | 88 | ||||
-rw-r--r-- | src/core/NEON/kernels/assembly/common.hpp | 34 | ||||
-rw-r--r-- | src/core/NEON/kernels/assembly/depthwise.hpp | 351 | ||||
-rw-r--r-- | src/core/NEON/kernels/assembly/depthwise_common.hpp | 146 | ||||
-rw-r--r-- | src/core/NEON/kernels/assembly/pool_common.hpp | 85 | ||||
-rw-r--r-- | src/core/NEON/kernels/assembly/pooling.hpp | 168 | ||||
-rw-r--r-- | src/core/NEON/kernels/assembly/premultiply.hpp | 78 | ||||
-rw-r--r-- | src/core/NEON/kernels/assembly/winograd.hpp | 265 |
8 files changed, 1048 insertions, 167 deletions
diff --git a/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h b/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h deleted file mode 100644 index a956898403..0000000000 --- a/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H -#define SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H - -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/NEON/INEKernel.h" - -#include "src/core/NEON/kernels/convolution/depthwise/depthwise.hpp" - -namespace arm_compute -{ -// Forward declarations -class ITensor; - -/** This class is a wrapper for the depthwise convolution assembly kernels. 
*/ -class NEDepthwiseConvolutionAssemblyKernelWrapper final : public INEKernel -{ -public: - const char *name() const override - { - return "NEDepthwiseConvolutionAssemblyKernelWrapper"; - } - - /** Default constructor */ - NEDepthwiseConvolutionAssemblyKernelWrapper() - : _kernel(nullptr) - { - } - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthwiseConvolutionAssemblyKernelWrapper(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete; - /** Default Move Constructor. */ - NEDepthwiseConvolutionAssemblyKernelWrapper(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default; - /** Default move assignment operator */ - NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default; - - /** Initialise the kernel's input and output. - * - * @param[in] kernel Pointer to an assembly kernel implementation. 
- */ - void configure(depthwise::IDepthwiseConvolution *kernel) - { - ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel))); - _kernel = kernel; - Window win; - win.set(Window::DimX, Window::Dimension(0, _kernel->get_window(), 1)); - INEKernel::configure(win); - } - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override - { - ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel))); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - auto first = window.x().start(); - auto last = window.x().end(); - _kernel->run(first, last, info.thread_id); - } - -private: - depthwise::IDepthwiseConvolution *_kernel; -}; -} // namespace arm_compute -#endif /* SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H */ diff --git a/src/core/NEON/kernels/assembly/common.hpp b/src/core/NEON/kernels/assembly/common.hpp new file mode 100644 index 0000000000..d82d11cae0 --- /dev/null +++ b/src/core/NEON/kernels/assembly/common.hpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +namespace arm_conv +{ +struct PaddingValues +{ + unsigned int left, top, right, bottom; +}; + +} // namespace arm_conv diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp new file mode 100644 index 0000000000..13c2d314e4 --- /dev/null +++ b/src/core/NEON/kernels/assembly/depthwise.hpp @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#pragma once + +#include "arm_gemm.hpp" +#include "arm_gemm_local.hpp" +#include "depthwise_common.hpp" +#include "premultiply.hpp" + +namespace arm_conv +{ +namespace depthwise +{ +struct DepthwiseConfig +{ + DepthwiseMethod method = DepthwiseMethod::DEFAULT; + std::string filter = ""; + + DepthwiseConfig(DepthwiseMethod method) : method(method){}; + DepthwiseConfig(){}; +}; + +struct DepthwiseArgs +{ + const CPUInfo *cpu_info; + + unsigned int kernel_rows, kernel_cols; + unsigned int stride_rows, stride_cols; + unsigned int dilation_rows, dilation_cols; + + unsigned int n_batches, input_rows, input_cols, input_channels; + unsigned int output_rows, output_cols; + unsigned int channel_multiplier; + + PaddingValues padding; + + arm_gemm::Activation activation; + + const DepthwiseConfig *config; + + bool fast_mode = false; + + DepthwiseArgs(const CPUInfo *cpu_info, + unsigned int kernel_rows, + unsigned int kernel_cols, + unsigned int stride_rows, + unsigned int stride_cols, + unsigned int dilation_rows, + unsigned int dilation_cols, + unsigned int n_batches, + unsigned int input_rows, + unsigned int input_cols, + unsigned int input_channels, + unsigned int output_rows, + unsigned int output_cols, + unsigned int channel_multiplier, + PaddingValues padding, + arm_gemm::Activation activation, + + const DepthwiseConfig *config) + : cpu_info(cpu_info), + kernel_rows(kernel_rows), + kernel_cols(kernel_cols), + stride_rows(stride_rows), + stride_cols(stride_cols), + dilation_rows(dilation_rows), + dilation_cols(dilation_cols), + n_batches(n_batches), + input_rows(input_rows), + input_cols(input_cols), + input_channels(input_channels), + output_rows(output_rows), + output_cols(output_cols), + channel_multiplier(channel_multiplier), + padding(padding), + activation(activation), + config(config) + { + } + + DepthwiseArgs(const CPUInfo *cpu_info, + unsigned int kernel_rows, + unsigned int kernel_cols, + unsigned int stride_rows, + unsigned int stride_cols, + unsigned 
int n_batches, + unsigned int input_rows, + unsigned int input_cols, + unsigned int input_channels, + unsigned int output_rows, + unsigned int output_cols, + unsigned int channel_multiplier, + PaddingValues padding, + arm_gemm::Activation activation, + const DepthwiseConfig *config) + : DepthwiseArgs(cpu_info, + kernel_rows, + kernel_cols, + stride_rows, + stride_cols, + 1, + 1, + n_batches, + input_rows, + input_cols, + input_channels, + output_rows, + output_cols, + channel_multiplier, + padding, + activation, + config) + { + } +}; + +template <typename TInput> +struct Tile +{ + TInput *array; + + unsigned int tile_rows = 0; + unsigned int tile_cols = 0; + unsigned int tile_channels = 0; + + Tile(TInput *array, unsigned int tile_rows, unsigned int tile_cols, unsigned int tile_channels) + : array(array), tile_rows(tile_rows), tile_cols(tile_cols), tile_channels(tile_channels) + { + } + + Tile() : Tile(nullptr, 0, 0, 0) + { + } + + void load_from(const TInput *input, + const unsigned int ld_row, + const unsigned int ld_col, + const unsigned int n_rows, + const unsigned int n_cols, + const int input_i, + const int input_j, + const unsigned int channel_multiplier) const + { + const auto pad_top = input_i < 0 ? -input_i : 0; + const auto pad_left = input_j < 0 ? 
-input_j : 0; + + const auto padded_rows = std::min(n_rows - input_i, tile_rows) - pad_top; + const auto padded_cols = std::min(n_cols - input_j, tile_cols) - pad_left; + + if (padded_rows < tile_rows || padded_cols < tile_cols) + { + memset(array, 0, tile_rows * tile_cols * tile_channels * sizeof(TInput)); + } + + do_premultiply<TInput>((TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, ld_row, + ld_col, array + pad_top * tile_cols * tile_channels + pad_left * tile_channels, + tile_cols * tile_channels, tile_channels, padded_rows, padded_cols, + tile_channels / channel_multiplier, channel_multiplier); + } +}; + +template <typename TInput, typename TWeight, typename TOutput> +class DepthwiseCommon : public IDepthwiseCommon +{ +protected: + const DepthwiseArgs m_args; // Copy of arguments + std::string m_name{}; + +public: + DepthwiseCommon(const DepthwiseArgs &args) : m_args(args){}; + DepthwiseCommon(DepthwiseCommon &) = delete; + DepthwiseCommon &operator=(DepthwiseCommon &) = delete; + + std::string name() const override + { + return m_name; + } + + void set_name(std::string name) + { + // Only allow the name to be set once + if (m_name.empty()) + { + m_name = name; + } + } + + void execute(const void *const input, + const void *const parameters, + void *const output, + void *const working_space, + const unsigned int thread_id, + const unsigned int n_threads) const override final + { + const size_t ld_input_col = m_args.input_channels; + const size_t ld_input_row = ld_input_col * m_args.input_cols; + const size_t ld_input_batch = ld_input_row * m_args.input_rows; + const size_t ld_output_col = m_args.input_channels * m_args.channel_multiplier; + const size_t ld_output_row = ld_output_col * m_args.output_cols; + const size_t ld_output_batch = ld_output_row * m_args.output_rows; + + execute(input, ld_input_col, ld_input_row, ld_input_batch, parameters, output, ld_output_col, ld_output_row, + ld_output_batch, working_space, thread_id, 
n_threads); + } + + void execute(const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *const parameters, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *const working_space, + const unsigned int thread_id, + const unsigned int n_threads) const override final + { + execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.input_channels, m_args.padding, input, + ld_input_col, ld_input_row, ld_input_batch, parameters, m_args.output_rows, m_args.output_cols, output, + ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, n_threads); + } + + void execute(unsigned int batches, + unsigned int input_height, + unsigned int input_width, + unsigned int channels, + const PaddingValues &padding, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + unsigned int output_height, + unsigned int output_width, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const override final + { + // Construct a new set of arguments to reflect that we might have been + // passed different input/output tensors. Dilation is handled at this + // level; so we set the dilation in the arguments to one (no dilation). 
+ DepthwiseArgs args(this->m_args); + args.n_batches = batches; + args.input_rows = input_height; + args.input_cols = input_width; + args.input_channels = channels; + args.output_rows = output_height; + args.output_cols = output_width; + args.padding = padding; + args.dilation_rows = args.dilation_cols = 1; + + auto ld_input_col_d = ld_input_col * m_args.dilation_cols; + auto ld_input_row_d = ld_input_row * m_args.dilation_rows; + auto ld_output_col_d = ld_output_col * m_args.dilation_cols; + auto ld_output_row_d = ld_output_row * m_args.dilation_rows; + + for (size_t drow = 0; drow < m_args.dilation_rows; drow++) + { + size_t start_i; + std::tie(args.output_rows, args.input_rows, start_i, args.padding.top, args.padding.bottom) = + get_reduced_view_for_dilation(output_height, input_height, drow, m_args.dilation_rows, + m_args.kernel_rows, m_args.stride_rows, padding.top); + + auto input_row = static_cast<const TInput *>(input) + start_i * ld_input_row; + auto output_row = static_cast<TOutput *>(output) + drow * ld_output_row; + + if (args.output_rows) + { + for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++) + { + size_t start_j; + std::tie(args.output_cols, args.input_cols, start_j, args.padding.left, args.padding.right) = + get_reduced_view_for_dilation(output_width, input_width, dcol, m_args.dilation_cols, + m_args.kernel_cols, m_args.stride_cols, padding.left); + + const TInput *input_col = input_row + start_j * ld_input_col; + TOutput *output_col = output_row + dcol * ld_output_col; + + if (args.output_cols) + { + this->execute_internal(args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, + parameters, output_col, ld_output_col_d, ld_output_row_d, + ld_output_batch, working_space, thread_id, n_threads); + } + } + } + } + } + +protected: + virtual void execute_internal(const DepthwiseArgs &instance_args, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + void *output, + 
size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; + + virtual bool uses_premultiply() const + { + return true; + } +}; + +template <typename TInput, typename TWeight = TInput, typename TOutput = TInput> +using UniqueDepthwiseCommon = std::unique_ptr<DepthwiseCommon<TInput, TWeight, TOutput>>; + +template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing> +KernelDescription get_depthwise_method(const DepthwiseArgs &, const OutputStage & = {}); + +template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing> +UniqueDepthwiseCommon<TInput, TWeight, TOutput> depthwise(const DepthwiseArgs &, const OutputStage & = {}); + +template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing> +std::vector<KernelDescription> get_compatible_kernels(const DepthwiseArgs &, const OutputStage & = {}); + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp new file mode 100644 index 0000000000..5ff848e281 --- /dev/null +++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#pragma once + +#include "arm_gemm.hpp" +#include "common.hpp" +#include <cstddef> +#include <tuple> + +namespace arm_conv +{ +namespace depthwise +{ +using arm_gemm::Nothing; + +enum class DepthwiseMethod +{ + DEFAULT, + DEPTHFIRST, + PLANAR, +}; + +struct KernelDescription +{ + DepthwiseMethod method = DepthwiseMethod::DEFAULT; + std::string name = ""; + bool is_default = false; + uint64_t cycle_estimate = 0; + + KernelDescription(DepthwiseMethod method, std::string name, bool is_default, uint64_t cycle_estimate) + : method(method), name(name), is_default(is_default), cycle_estimate(cycle_estimate) + { + } + + KernelDescription() noexcept {}; +}; + +class IDepthwiseCommon +{ +public: + virtual ~IDepthwiseCommon() = default; + + // Get the name of the depthwise implementation + virtual std::string name() const = 0; + + // Determine the amount of storage space required for the rearranged weights + // and bias. + virtual size_t get_storage_size(void) const = 0; + + // Rearrange the weights and biases into a storage buffer. + // Accepts a pointer to a buffer into which to store the packed parameters, a + // pointer to the bias vector (which may be nullptr in the case of no bias) and + // a pointer to the array of weights (stored in HWIO order). + virtual void pack_parameters( + void *buffer, const void *biases, const void *weights, size_t ld_weight_col = 0, size_t ld_weight_row = 0) = 0; + + // Determine the amount of working space required + virtual size_t get_working_size(unsigned int n_threads) const = 0; + + // Execute the convolution over the specified area of memory. 
+ virtual void execute(const void *input, // Pointer to input tensor + const void *parameters, // Packed parameters buffer + void *output, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; + + virtual void execute(const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; + + virtual void execute(unsigned int batches, + unsigned int input_height, + unsigned int input_width, + unsigned int channels, + const PaddingValues &, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + unsigned int output_height, + unsigned int output_width, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; +}; + +// To handle a dilation factor of D execute the kernel once for each d in +// [0..D). Each `d` corresponds to a portion or "view" of the input and output +// tensors. The output view corresponds to every Dth pixel starting from `d`; +// this function computes how many pixels are covered. The input view consists +// of an amount of before padding, every Dth pixel starting from an offset, and +// some after padding. This function computes the start padding, input offset, +// number of valid input pixels, and the after padding. 
+// +// Returns +// - Number of valid output pixels corresponding to `d` +// - Number of valid input pixels corresponding to `d` +// - Offset of the first pixel corresponding to `d` +// - Amounts of before and after padding in the view for `d` (two values) +std::tuple<size_t, size_t, size_t, size_t, size_t> get_reduced_view_for_dilation(size_t out_size, + size_t in_size, + size_t d, + size_t dilation_factor, + size_t kernel_size, + size_t stride, + size_t pad_before); + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/assembly/pool_common.hpp b/src/core/NEON/kernels/assembly/pool_common.hpp index fdc18aef39..045f9f95d3 100644 --- a/src/core/NEON/kernels/assembly/pool_common.hpp +++ b/src/core/NEON/kernels/assembly/pool_common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,10 +23,9 @@ */ #pragma once - #ifdef CYCLE_PROFILING #include "profiler.hpp" -#endif // CYCLE_PROFILING +#endif namespace arm_conv { @@ -69,54 +68,42 @@ public: virtual size_t get_working_size(unsigned int num_threads) const = 0; // Execute pooling over the specified area of memory. 
- virtual void execute( - const void *const input, - void *const output, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; - - virtual void execute( - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute(const void *const input, + void *const output, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; - virtual void execute( - unsigned int batches, - unsigned int height, - unsigned int width, - unsigned int channels, - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const PaddingValues &, - unsigned int output_height, - unsigned int output_width, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; -}; + virtual void execute(const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; -struct Nothing -{ -}; - -template <typename TInput, typename TOutput, class OutputStage = Nothing> -class PoolingCommon : public IPoolingCommon -{ + virtual void execute(unsigned int batches, + unsigned int height, + unsigned int width, + unsigned int channels, + const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const PaddingValues &, + unsigned int output_height, + unsigned int output_width, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int 
thread_id, + unsigned int num_threads) const = 0; }; } // namespace pooling diff --git a/src/core/NEON/kernels/assembly/pooling.hpp b/src/core/NEON/kernels/assembly/pooling.hpp index 2325bd08ca..89d594298e 100644 --- a/src/core/NEON/kernels/assembly/pooling.hpp +++ b/src/core/NEON/kernels/assembly/pooling.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,8 +27,6 @@ #include "arm_gemm_local.hpp" #include "pool_common.hpp" -#include <memory> - namespace arm_conv { namespace pooling @@ -38,9 +36,8 @@ struct PoolingConfig PoolingMethod method = PoolingMethod::DEFAULT; std::string filter = ""; - PoolingConfig(PoolingMethod method) - : method(method) {}; - PoolingConfig() {}; + PoolingConfig(PoolingMethod method) : method(method){}; + PoolingConfig(){}; }; struct PoolingArgs @@ -59,36 +56,50 @@ struct PoolingArgs const PoolingConfig *config; - PoolingArgs( - const CPUInfo *cpu_info, - PoolingType pool_type, - const PoolingWindow &window, - const PoolingStride &stride, - bool exclude_padding, - unsigned int n_batches, - unsigned int input_rows, - unsigned int input_cols, - unsigned int n_channels, - unsigned int output_rows, - unsigned int output_cols, - const PaddingValues &padding, - const PoolingConfig *cfg) - : cpu_info(cpu_info), pool_type(pool_type), pool_window(window), pool_stride(stride), exclude_padding(exclude_padding), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols), - n_channels(n_channels), output_rows(output_rows), output_cols(output_cols), padding(padding), config(cfg) + PoolingArgs(const CPUInfo *cpu_info, + PoolingType pool_type, + const PoolingWindow &window, + const PoolingStride &stride, + bool exclude_padding, + unsigned int n_batches, + unsigned int input_rows, + unsigned int input_cols, + unsigned int n_channels, + unsigned int output_rows, + unsigned int output_cols, + const PaddingValues &padding, + const PoolingConfig *cfg) + : 
cpu_info(cpu_info), + pool_type(pool_type), + pool_window(window), + pool_stride(stride), + exclude_padding(exclude_padding), + n_batches(n_batches), + input_rows(input_rows), + input_cols(input_cols), + n_channels(n_channels), + output_rows(output_rows), + output_cols(output_cols), + padding(padding), + config(cfg) { // If either of the pooling window dimensions are set to zero, meaning // "pool everything", then replace with the corresponding input dimension. - if(pool_window.rows == 0) + if (pool_window.rows == 0) { pool_window.rows = input_rows; } - if(pool_window.cols == 0) + if (pool_window.cols == 0) { pool_window.cols = input_cols; } } }; +struct Nothing +{ +}; + struct Requantize32 { int32_t input_offset = 0; @@ -98,20 +109,117 @@ struct Requantize32 int32_t per_layer_right_shift = 0; int32_t per_layer_mul = 0; - Requantize32(int32_t input_offset, int32_t output_offset, - int32_t per_layer_left_shift, int32_t per_layer_right_shift, + Requantize32(int32_t input_offset, + int32_t output_offset, + int32_t per_layer_left_shift, + int32_t per_layer_right_shift, int32_t per_layer_mul) - : input_offset(input_offset), output_offset(output_offset), per_layer_left_shift(per_layer_left_shift), per_layer_right_shift(per_layer_right_shift), per_layer_mul(per_layer_mul) + : input_offset(input_offset), + output_offset(output_offset), + per_layer_left_shift(per_layer_left_shift), + per_layer_right_shift(per_layer_right_shift), + per_layer_mul(per_layer_mul) { } }; -template <typename TInput, typename TOutput, class OutputStage = Nothing> -using UniquePoolingCommon = std::unique_ptr<PoolingCommon<TInput, TOutput, OutputStage>>; +template <typename TInput, typename TOutput> +class PoolingCommon : public IPoolingCommon +{ +protected: + const PoolingArgs m_args; + +public: + PoolingCommon(const PoolingArgs &args) : m_args(args) + { + } + PoolingCommon(PoolingCommon &) = delete; + PoolingCommon &operator=(PoolingCommon &) = delete; + + size_t get_working_size(unsigned int) 
const override = 0; + + // Execute pooling over the specified area of memory. + void execute(const void *const input, + void *const output, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const override + { + this->execute(input, m_args.n_channels, m_args.n_channels * m_args.input_cols, + m_args.n_channels * m_args.input_cols * m_args.input_rows, output, m_args.n_channels, + m_args.n_channels * m_args.output_cols, + m_args.n_channels * m_args.output_cols * m_args.output_rows, working_space, thread_id, + num_threads); + } + + void execute(const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const override + { + this->execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, input, ld_input_col, + ld_input_row, ld_input_batch, m_args.padding, m_args.output_rows, m_args.output_cols, output, + ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, num_threads); + } + + void execute(unsigned int batches, + unsigned int height, + unsigned int width, + unsigned int channels, + const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const PaddingValues &padding, + unsigned int output_height, + unsigned int output_width, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const override + { + this->execute_internal(batches, height, width, channels, padding, input, ld_input_col, ld_input_row, + ld_input_batch, output_height, output_width, output, ld_output_col, ld_output_row, + ld_output_batch, working_space, thread_id, num_threads); + } + +protected: + virtual void execute_internal(unsigned int batches, + unsigned int height, + 
unsigned int width, + unsigned int channels, + const PaddingValues &, + const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + unsigned int output_height, + unsigned int output_width, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; +}; + +template <typename TInput, typename TOutput> +using UniquePoolingCommon = std::unique_ptr<PoolingCommon<TInput, TOutput>>; // Get a pooling engine template <typename TInput, typename TOutput = TInput, class OutputStage = Nothing> -UniquePoolingCommon<TInput, TOutput, OutputStage> pooling(const PoolingArgs &, const OutputStage & = {}); +UniquePoolingCommon<TInput, TOutput> pooling(const PoolingArgs &, const OutputStage & = {}); } // namespace pooling } // namespace arm_conv diff --git a/src/core/NEON/kernels/assembly/premultiply.hpp b/src/core/NEON/kernels/assembly/premultiply.hpp new file mode 100644 index 0000000000..fb97cf8baf --- /dev/null +++ b/src/core/NEON/kernels/assembly/premultiply.hpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
/*
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

// This header defines a function template, so including it twice in one
// translation unit would be a redefinition error without a guard.
#pragma once

/** Replication routine used by do_premultiply() for the 4-byte element,
 *  channel-multiplier-of-6 case.
 *
 * Only declared here; the definition is not part of this header. The
 * parameters mirror those of do_premultiply() with the channel multiplier
 * fixed by the function name.
 * NOTE(review): presumably an optimised kernel that produces the same result
 * as the generic loop below - confirm against its definition.
 */
void do_premultiply_float_6(const float       *in_ptr,
                            const unsigned int ld_row,
                            const unsigned int ld_col,
                            float             *out_ptr,
                            const unsigned int out_ld_row,
                            const unsigned int out_ld_col,
                            const unsigned int tile_rows,
                            const unsigned int tile_cols,
                            const unsigned     input_channels);

/** Replicate every input element `channel_multiplier` times into the output.
 *
 * For each tile position (i, j) the function reads `input_channels`
 * consecutive elements starting at `in_ptr + i * ld_row + j * ld_col` and
 * writes each of them `channel_multiplier` times, back to back, starting at
 * `out_ptr + i * out_ld_row + j * out_ld_col`. All strides are applied through
 * pointer arithmetic on `T *`, i.e. they are counted in elements of T.
 *
 * @param[in]  in_ptr             First element of the input tile.
 * @param[in]  ld_row             Input stride between tile rows.
 * @param[in]  ld_col             Input stride between tile columns.
 * @param[out] out_ptr            First element of the output tile.
 * @param[in]  out_ld_row         Output stride between tile rows.
 * @param[in]  out_ld_col         Output stride between tile columns.
 * @param[in]  tile_rows          Number of rows to process.
 * @param[in]  tile_cols          Number of columns to process.
 * @param[in]  input_channels     Number of elements read per tile position.
 * @param[in]  channel_multiplier Number of copies written per element read.
 */
template <typename T>
void do_premultiply(const T           *in_ptr,
                    const unsigned int ld_row,
                    const unsigned int ld_col,
                    T                 *out_ptr,
                    const unsigned int out_ld_row,
                    const unsigned int out_ld_col,
                    const unsigned int tile_rows,
                    const unsigned int tile_cols,
                    const unsigned     input_channels,
                    const unsigned int channel_multiplier)
{
    // Any 4-byte element type is handed to the float routine when the
    // multiplier is 6; the pointers are only reinterpreted, never converted.
    if (sizeof(T) == 4 && channel_multiplier == 6)
    {
        do_premultiply_float_6(reinterpret_cast<const float *>(in_ptr), ld_row, ld_col,
                               reinterpret_cast<float *>(out_ptr), out_ld_row, out_ld_col, tile_rows, tile_cols,
                               input_channels);
        return;
    }

    // Generic path: plain element-wise replication.
    for (unsigned int i = 0; i < tile_rows; i++)
    {
        const T *in_pos  = in_ptr + i * ld_row;
        T       *out_pos = out_ptr + i * out_ld_row;

        for (unsigned int j = 0; j < tile_cols; j++)
        {
            const T *ip = in_pos;
            T       *op = out_pos;

            for (unsigned int c = 0; c < input_channels; c++)
            {
                const T val = *ip++;

                for (unsigned int r = 0; r < channel_multiplier; r++)
                {
                    *op++ = val;
                }
            }

            in_pos += ld_col;
            out_pos += out_ld_col;
        }
    }
}

// Patch-page residue that shares this source line, preserved verbatim:
// diff --git a/src/core/NEON/kernels/assembly/winograd.hpp b/src/core/NEON/kernels/assembly/winograd.hpp new file mode 100644 index 0000000000..dbf95d23cd --- /dev/null +++
b/src/core/NEON/kernels/assembly/winograd.hpp @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#pragma once + +#include "arm_gemm.hpp" +#include <cstddef> + +namespace arm_conv +{ +struct Shape2D +{ + unsigned int rows, cols; +}; + +struct ConvolutionArgs +{ + unsigned int n_batches; + Shape2D input_shape; + unsigned int n_input_channels; + unsigned int pad_top, pad_left; + Shape2D output_shape; + unsigned int n_output_channels; + Shape2D kernel_shape; + arm_gemm::Activation activation; + + ConvolutionArgs(unsigned int n_batches, + const Shape2D &input_shape, + unsigned int n_input_channels, + unsigned int pad_top, + unsigned int pad_left, + const Shape2D &output_shape, + unsigned int n_output_channels, + const Shape2D kernel_shape, + const arm_gemm::Activation &activation = {}) + : n_batches(n_batches), + input_shape(input_shape), + n_input_channels(n_input_channels), + pad_top(pad_top), + pad_left(pad_left), + output_shape(output_shape), + n_output_channels(n_output_channels), + kernel_shape(kernel_shape), + activation(activation) + { + } +}; + +namespace winograd +{ +/* Constrain the selected Winograd implementation. + */ +struct WinogradConfig +{ + unsigned int output_rows = 0, output_cols = 0; + std::string input_transform_filter = ""; + std::string output_transform_filter = ""; + std::string weight_transform_filter = ""; +}; + +/* Struct describing (suggested) memory layout within the Winograd domain. 
+ */ +struct WinogradDomainSpec +{ + size_t weight_matrix_size_bytes, input_matrix_size_bytes, output_matrix_size_bytes; + + size_t weight_ld_matrix, weight_ld_row; + size_t input_ld_batch, input_ld_matrix, input_ld_row; + size_t output_ld_batch, output_ld_matrix, output_ld_row; +}; + +class ITransformCommon +{ +public: + virtual ~ITransformCommon() = default; + + // Get the name of the transform + virtual const std::string &get_name(void) const = 0; +}; + +namespace weight_transform +{ +class ITransform : public ITransformCommon +{ +public: + ~ITransform() = default; + + virtual unsigned int get_kernel_rows(void) const = 0; + virtual unsigned int get_kernel_cols(void) const = 0; + + virtual unsigned int get_transformed_tile_rows(void) const = 0; + virtual unsigned int get_transformed_tile_cols(void) const = 0; + + void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_row, + size_t ld_in_col, + size_t ld_input_channel, + void *outptr, + const WinogradDomainSpec &wds, + unsigned int thread_id, + unsigned int n_threads) const + { + this->execute(args, inptr, ld_in_row, ld_in_col, ld_input_channel, outptr, wds.weight_ld_matrix, + wds.weight_ld_row, thread_id, n_threads); + } + + virtual void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_row, + size_t ld_in_col, + size_t ld_input_channel, + void *outptr, + size_t ld_out_matrix, + size_t ld_out_row, + unsigned int thread_id, + unsigned int n_threads) const = 0; +}; + +} // namespace weight_transform + +namespace input_transform +{ +class ITransform : public ITransformCommon +{ +public: + ~ITransform() = default; + + virtual unsigned int get_input_rows(void) const = 0; + virtual unsigned int get_input_cols(void) const = 0; + + virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0; + + void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_batch, + size_t ld_in_row, + size_t ld_in_col, + void *outptr, + 
const WinogradDomainSpec &wds, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const + { + this->execute(args, inptr, ld_in_batch, ld_in_row, ld_in_col, outptr, wds.input_ld_batch, wds.input_ld_matrix, + wds.input_ld_row, working_space, thread_id, n_threads); + } + + virtual void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_batch, + size_t ld_in_row, + size_t ld_in_col, + void *outptr, + size_t ld_out_batch, + size_t ld_out_matrix, + size_t ld_out_row, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; +}; + +} // namespace input_transform + +namespace output_transform +{ +class ITransform : public ITransformCommon +{ +public: + ~ITransform() = default; + + virtual unsigned int get_input_rows(void) const = 0; + virtual unsigned int get_input_cols(void) const = 0; + + virtual unsigned int get_output_rows(void) const = 0; + virtual unsigned int get_output_cols(void) const = 0; + + virtual unsigned int get_kernel_rows(void) const = 0; + virtual unsigned int get_kernel_cols(void) const = 0; + + virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0; + + void execute(const ConvolutionArgs &args, + const void *inptr, + const WinogradDomainSpec &wds, + const void *bias, + void *outptr, + size_t ld_out_batch, + size_t ld_out_row, + size_t ld_out_col, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const + { + this->execute(args, inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, bias, outptr, + ld_out_batch, ld_out_row, ld_out_col, working_space, thread_id, n_threads); + } + + virtual void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_batch, + size_t ld_in_matrix, + size_t ld_in_row, + const void *bias, + void *outptr, + size_t ld_out_batch, + size_t ld_out_row, + size_t ld_out_col, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; 
+}; + +} // namespace output_transform + +struct WinogradImpl +{ + const output_transform::ITransform *output_transform = nullptr; + const weight_transform::ITransform *weight_transform = nullptr; + const input_transform::ITransform *input_transform = nullptr; + std::unique_ptr<arm_gemm::GemmArgs> gemm_args; + WinogradDomainSpec winograd_spec; +}; + +/* Get pointers to Winograd transforms for the given convolution problem. + * + * Assigns to the pointers in the `dest` struct and returns true or false to + * indicate whether the given problem can be executed or not. + */ +template <typename TIn, + typename TWeight = TIn, + typename TOut = TIn, + typename TWinogradIn = TIn, + typename TWinogradOut = TOut> +bool get_implementation(WinogradImpl &dest, // Destination for the selected implementation + const CPUInfo *, + const ConvolutionArgs &, + int max_threads, + bool fast_mode, + const WinogradConfig *, + const arm_gemm::GemmConfig *); + +} // namespace winograd +} // namespace arm_conv |