Diffstat (limited to 'src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp')
 src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp | 342 +++
 1 file changed, 342 insertions(+), 0 deletions(-)
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp b/src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp
new file mode 100644
index 0000000000..af0dd04298
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "winograd.hpp"
+#include <memory>
+#include <string>
+
+namespace arm_conv {
+namespace winograd {
+
+enum class MethodConstraints
+{
+ None = 0x0,
+ RequiresSVE = 0x1,
+ RequiresSVE2 = 0x2,
+ RequiresSME = 0x4,
+ RequiresSME2 = 0x8,
+ LargerShape = 0x10, // Input tensor shape is strictly larger than the output transform tile shape in both dimensions.
+};
+
+constexpr inline bool operator!(const MethodConstraints &c)
+{
+ return c == MethodConstraints::None;
+}
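+
+// Note: !c is true only when no constraint bits are set at all; individual
+// bits are tested as !(c & Flag), as in constraints_met() below.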
+
+constexpr inline MethodConstraints operator|(const MethodConstraints &a, const MethodConstraints &b)
+{
+ return static_cast<MethodConstraints>(static_cast<unsigned int>(a) | static_cast<unsigned int>(b));
+}
+
+constexpr inline MethodConstraints operator&(const MethodConstraints &a, const MethodConstraints &b)
+{
+ return static_cast<MethodConstraints>(static_cast<unsigned int>(a) & static_cast<unsigned int>(b));
+}
+
+inline bool constraints_met(const MethodConstraints &c, const CPUInfo *ci, const ConvolutionArgs &, const WinogradConfig *)
+{
+ return (
+ (!(c & MethodConstraints::RequiresSVE) || (ci->has_sve())) &&
+ (!(c & MethodConstraints::RequiresSVE2) || (ci->has_sve2())) &&
+ (!(c & MethodConstraints::RequiresSME) || (ci->has_sme())) &&
+ (!(c & MethodConstraints::RequiresSME2) || (ci->has_sme2()))
+ // Add further constraints here
+ );
+}
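+
+// Illustrative example only: an implementation registered with
+//   MethodConstraints::RequiresSVE2 | MethodConstraints::LargerShape
+// is rejected by constraints_met() on cores where ci->has_sve2() is false;
+// the LargerShape bit is handled separately by
+// output_transform_constraints_met() below.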
+
+inline bool output_transform_constraints_met(const output_transform::ITransform *transform, const MethodConstraints &c, const CPUInfo *ci, const ConvolutionArgs &conv_args, const WinogradConfig *cfg)
+{
+ return (
+ constraints_met(c, ci, conv_args, cfg) &&
+ (!(c & MethodConstraints::LargerShape) || (conv_args.input_shape.rows > transform->get_output_rows() && conv_args.input_shape.cols > transform->get_output_cols()))
+ );
+}
+
+namespace weight_transform {
+
+template <typename TIn, typename TOut=TIn>
+struct TransformImplementation
+{
+ std::unique_ptr<const ITransform> transform;
+ MethodConstraints constraints;
+
+ TransformImplementation(const ITransform *transform, const MethodConstraints &constraints = MethodConstraints::None)
+ : transform(transform), constraints(constraints)
+ {
+ }
+};
+
+template <typename TIn, typename TOut=TIn>
+const TransformImplementation<TIn, TOut> *implementation_list(void);
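+
+// Each specialisation of implementation_list() (here and in the
+// input_transform and output_transform namespaces below) returns a
+// sentinel-terminated array: iteration stops at the first entry whose
+// transform is nullptr.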
+
+} // namespace weight_transform
+
+namespace input_transform
+{
+
+template <typename TIn, typename TOut=TIn>
+struct TransformImplementation
+{
+ std::unique_ptr<const ITransform> transform;
+ MethodConstraints constraints;
+
+ TransformImplementation(const ITransform *transform, const MethodConstraints &constraints = MethodConstraints::None)
+ : transform(transform), constraints(constraints)
+ {
+ }
+};
+
+template <typename TIn, typename TOut=TIn>
+const TransformImplementation<TIn, TOut> *implementation_list(void);
+
+} // namespace input_transform
+
+namespace output_transform
+{
+
+template <typename TIn, typename TOut=TIn>
+struct TransformImplementation
+{
+ std::unique_ptr<const ITransform> transform;
+ MethodConstraints constraints;
+
+ TransformImplementation(const ITransform *transform, const MethodConstraints &constraints = MethodConstraints::None)
+ : transform(transform), constraints(constraints)
+ {
+ }
+};
+
+template <typename TIn, typename TOut=TIn>
+const TransformImplementation<TIn, TOut> *implementation_list(void);
+
+} // namespace output_transform
+
+namespace {
+
+template <typename T>
+constexpr T iceildiv(T num, T den)
+{
+ return (num + den - 1) / den;
+}
+
+template <typename T>
+constexpr T iroundup(T num, T den)
+{
+ return den * iceildiv(num, den);
+}
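+
+// Worked examples: iceildiv(7u, 4u) == 2u and iroundup(7u, 4u) == 8u.
+// iceildiv() counts the tiles needed to cover a dimension; iroundup() pads
+// channel and patch counts to multiples of four in the layout hints below.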
+
+} // anonymous namespace
+
+template <typename TWeight, typename TWinogradIn>
+inline std::vector<const weight_transform::ITransform *> get_weight_transforms(
+ const CPUInfo *ci, const ConvolutionArgs &conv_args, const WinogradConfig *cfg
+)
+{
+ // Get target inner tile size
+ const auto target_inner_tile_rows = cfg->output_rows == 0 ? 0 : (conv_args.kernel_shape.rows + cfg->output_rows - 1);
+ const auto target_inner_tile_cols = cfg->output_cols == 0 ? 0 : (conv_args.kernel_shape.cols + cfg->output_cols - 1);
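+ // For example, a 3x3 kernel with a requested 2x2 output tile implies a 4x4
+ // inner tile, i.e. the F(2x2, 3x3) Winograd transform.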
+
+ std::vector<const weight_transform::ITransform *> weight_transforms;
+ for (auto impl = weight_transform::implementation_list<TWeight, TWinogradIn>();
+ impl->transform.get() != nullptr; impl++)
+ {
+ // If this transform supports the requested kernel size, then add it to the
+ // list of weight transforms.
+ if (
+ constraints_met(impl->constraints, ci, conv_args, cfg) &&
+ impl->transform->get_kernel_rows() == conv_args.kernel_shape.rows &&
+ impl->transform->get_kernel_cols() == conv_args.kernel_shape.cols &&
+ (target_inner_tile_rows == 0 || target_inner_tile_rows == impl->transform->get_transformed_tile_rows()) &&
+ (target_inner_tile_cols == 0 || target_inner_tile_cols == impl->transform->get_transformed_tile_cols()) &&
+ (cfg->weight_transform_filter == "" || std::strstr(impl->transform->get_name().c_str(), cfg->weight_transform_filter.c_str()))
+ )
+ {
+ weight_transforms.push_back(impl->transform.get());
+ }
+ }
+
+ return weight_transforms;
+}
+
+template <typename TIn, typename TWinogradIn>
+inline std::vector<const input_transform::ITransform *> get_input_transforms(
+ const CPUInfo *ci, const ConvolutionArgs &conv_args, const WinogradConfig *cfg
+)
+{
+ // Get target inner tile size
+ const auto target_inner_tile_rows = cfg->output_rows == 0 ? 0 : (conv_args.kernel_shape.rows + cfg->output_rows - 1);
+ const auto target_inner_tile_cols = cfg->output_cols == 0 ? 0 : (conv_args.kernel_shape.cols + cfg->output_cols - 1);
+
+ std::vector<const input_transform::ITransform *> input_transforms;
+ for (auto impl = input_transform::implementation_list<TIn, TWinogradIn>();
+ impl->transform.get() != nullptr; impl++)
+ {
+ if (
+ constraints_met(impl->constraints, ci, conv_args, cfg) &&
+ (target_inner_tile_rows == 0 || target_inner_tile_rows == impl->transform->get_input_rows()) &&
+ (target_inner_tile_cols == 0 || target_inner_tile_cols == impl->transform->get_input_cols()) &&
+ (cfg->input_transform_filter == "" || std::strstr(impl->transform->get_name().c_str(), cfg->input_transform_filter.c_str()))
+ )
+ {
+ input_transforms.push_back(impl->transform.get());
+ }
+ }
+
+ return input_transforms;
+}
+
+template <typename TWinogradOut, typename TOut>
+inline std::vector<const output_transform::ITransform *> get_output_transforms(
+ const CPUInfo *ci, const ConvolutionArgs &conv_args, const WinogradConfig *cfg
+)
+{
+ std::vector<const output_transform::ITransform *> output_transforms;
+ for (auto impl = output_transform::implementation_list<TWinogradOut, TOut>();
+ impl->transform.get() != nullptr; impl++)
+ {
+ if (
+ output_transform_constraints_met(impl->transform.get(), impl->constraints, ci, conv_args, cfg) &&
+ impl->transform->get_kernel_rows() == conv_args.kernel_shape.rows &&
+ impl->transform->get_kernel_cols() == conv_args.kernel_shape.cols &&
+ (cfg->output_rows == 0 || cfg->output_rows == impl->transform->get_output_rows()) &&
+ (cfg->output_cols == 0 || cfg->output_cols == impl->transform->get_output_cols()) &&
+ (cfg->output_transform_filter == "" || std::strstr(impl->transform->get_name().c_str(), cfg->output_transform_filter.c_str()))
+ )
+ {
+ output_transforms.push_back(impl->transform.get());
+ }
+ }
+
+ return output_transforms;
+}
+
+template <typename TIn, typename TWeight, typename TOut, typename TWinogradIn, typename TWinogradOut>
+bool get_implementation(
+ WinogradImpl &dest, // Destination for the selected implementation
+ const CPUInfo *ci,
+ const ConvolutionArgs &conv_args,
+ int max_threads,
+ bool fast_mode,
+ const WinogradConfig *cfg,
+ const arm_gemm::GemmConfig *gemm_cfg
+)
+{
+ // Get vectors of valid weight, input and output transforms, then select the
+ // first valid combination; the implementation lists are expected to be
+ // ordered so that larger output tiles are preferred.
+ const auto weight_transforms = get_weight_transforms<TWeight, TWinogradIn>(ci, conv_args, cfg);
+ const auto input_transforms = get_input_transforms<TIn, TWinogradIn>(ci, conv_args, cfg);
+ const auto output_transforms = get_output_transforms<TWinogradOut, TOut>(ci, conv_args, cfg);
+
+ // Now attempt to select a complete set of Winograd transformations which can
+ // solve the problem. Work backwards from the output transform to find
+ // matching input implementations.
+ bool success = false;
+ for (auto output_transform = output_transforms.cbegin();
+ !success && output_transform != output_transforms.cend();
+ output_transform++)
+ {
+ // Look for matching weight transforms, if we find one then we look for
+ // matching input transforms.
+ for (auto weight_transform = weight_transforms.cbegin();
+ !success && weight_transform != weight_transforms.cend();
+ weight_transform++)
+ {
+ // If this weight transform is compatible, then look for a matching input
+ // transform
+ if ((*output_transform)->get_input_rows() == (*weight_transform)->get_transformed_tile_rows() &&
+ (*output_transform)->get_input_cols() == (*weight_transform)->get_transformed_tile_cols())
+ {
+ for (auto input_transform = input_transforms.cbegin();
+ !success && input_transform != input_transforms.cend();
+ input_transform++)
+ {
+ // If the input transform is suitable, then set the configuration and
+ // indicate success.
+ if ((*input_transform)->get_input_rows() == (*output_transform)->get_input_rows() &&
+ (*input_transform)->get_input_cols() == (*output_transform)->get_input_cols())
+ {
+ dest.output_transform = *output_transform;
+ dest.input_transform = *input_transform;
+ dest.weight_transform = *weight_transform;
+ success = true;
+ }
+ }
+ }
+ }
+ }
+
+ if (!success)
+ {
+ return false;
+ }
+
+ // If we're able to construct the Winograd elements, then specify the GEMM
+ // arguments required to perform the multiply-accumulate step of the
+ // convolution.
+ const auto n_output_row_tiles = iceildiv(conv_args.output_shape.rows, dest.output_transform->get_output_rows());
+ const auto n_output_col_tiles = iceildiv(conv_args.output_shape.cols, dest.output_transform->get_output_cols());
+ const auto n_output_patches = n_output_row_tiles * n_output_col_tiles;
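+ // Hypothetical shapes: a 112x112 output covered by a 4x4 output transform
+ // gives 28x28 = 784 output patches, which becomes the GEMM M dimension.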
+
+ const int n_multis = dest.input_transform->get_input_rows() *
+ dest.input_transform->get_input_cols();
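+ // e.g. a 6x6 inner tile yields n_multis == 36: one GEMM per element of the
+ // transformed tile.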
+
+ dest.gemm_args.reset(new arm_gemm::GemmArgs(
+ ci,
+ n_output_patches, // M
+ conv_args.n_output_channels, // N
+ conv_args.n_input_channels, // K
+ 1, // K-sections
+ conv_args.n_batches, // # Batches
+ n_multis,
+ false, // Indirect input
+ {}, // No activation
+ max_threads,
+ false, // Not fixed format
+ fast_mode,
+ gemm_cfg
+ ));
+
+ // Also provide hints for the Winograd memory layout
+ auto &ws = dest.winograd_spec;
+ ws.weight_ld_row = iroundup(conv_args.n_output_channels, 4u);
+ ws.weight_ld_matrix = conv_args.n_input_channels * ws.weight_ld_row;
+ ws.weight_matrix_size_bytes = n_multis * ws.weight_ld_matrix * sizeof(TWinogradIn);
+
+ ws.input_ld_row = iroundup(conv_args.n_input_channels, 4u);
+ ws.input_ld_matrix = iroundup(n_output_patches, 4u) * ws.input_ld_row;
+ ws.input_ld_batch = n_multis * ws.input_ld_matrix;
+ ws.input_matrix_size_bytes = conv_args.n_batches * ws.input_ld_batch * sizeof(TWinogradIn);
+
+ ws.output_ld_row = ws.weight_ld_row;
+ ws.output_ld_matrix = n_output_patches * ws.output_ld_row;
+ ws.output_ld_batch = n_multis * ws.output_ld_matrix;
+ ws.output_matrix_size_bytes = conv_args.n_batches * ws.output_ld_batch * sizeof(TWinogradOut);
+
+ return true;
+}
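+
+// A minimal usage sketch (illustrative only; ci, conv_args and cfg are
+// assumed to be populated elsewhere, and fp32 is chosen for all five type
+// parameters):
+//
+//   WinogradImpl impl;
+//   const bool ok = get_implementation<float, float, float, float, float>(
+//       impl, ci, conv_args, /* max_threads= */ 1, /* fast_mode= */ false,
+//       &cfg, /* gemm_cfg= */ nullptr);
+//   if (ok)
+//   {
+//     // impl.input_transform, impl.weight_transform, impl.output_transform
+//     // and impl.gemm_args now describe a complete Winograd schedule.
+//   }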
+
+} // namespace winograd
+} // namespace arm_conv