aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/convolution/winograd/weight_transform.hpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/convolution/winograd/weight_transform.hpp')
-rw-r--r-- src/core/NEON/kernels/convolution/winograd/weight_transform.hpp | 145
1 files changed, 145 insertions, 0 deletions
diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transform.hpp b/src/core/NEON/kernels/convolution/winograd/weight_transform.hpp
new file mode 100644
index 0000000000..5569bc1b89
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/weight_transform.hpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "winograd.hpp"
+#include <algorithm>
+#include <functional>
+
+namespace arm_conv {
+namespace winograd {
+namespace weight_transform {
+
+/* Driver class for the Winograd weight transforms: wraps a kernel which is
+ * run once per input channel and divides the input channels of a convolution
+ * between threads. */
+template <typename TIn, typename TOut=TIn>
+class Transform : public ITransform
+{
+  // Kernel invoked once per input channel; the leading dimensions it receives
+  // are passed through from `execute` untouched.
+  using Kernel = std::function<void(
+    unsigned int n_channels,  // Number of channels to transform
+    const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
+    TOut *outptr, size_t ld_out_matrix
+  )>;
+
+  const std::string m_name;
+  const unsigned int m_kernel_rows, m_kernel_cols;
+  const unsigned int m_transformed_tile_rows, m_transformed_tile_cols;
+  const Kernel m_kernel;
+
+  /* Typed implementation of `execute`: run the kernel for every input channel
+   * owned by `thread_id`. Channels are dealt out in blocks of 16, thread t
+   * taking blocks t, t + n_threads, t + 2 * n_threads, ... `n_threads` must be
+   * non-zero, otherwise the block loop cannot advance. */
+  void execute_internal(
+    const ConvolutionArgs &args,
+    const TIn *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
+    TOut *outptr, size_t ld_out_matrix, size_t ld_out_row,
+    unsigned int thread_id, unsigned int n_threads
+  ) const
+  {
+    // Stripe groups of input channels over threads, this should reduce false
+    // sharing of the output matrix.
+    constexpr auto n_input_channels_per_thread = 16u;
+
+    for (auto start_ic = thread_id * n_input_channels_per_thread;
+         start_ic < args.n_input_channels;
+         start_ic += n_threads * n_input_channels_per_thread)
+    {
+      // The final block may hold fewer than 16 channels.
+      const auto end_ic = std::min(args.n_input_channels,
+                                   start_ic + n_input_channels_per_thread);
+      for (auto ic = start_ic; ic < end_ic; ic++)
+      {
+        // Address each channel from the base pointers: running pointers had to
+        // be stepped past other threads' channels and could leave the buffers.
+        m_kernel(args.n_output_channels, inptr + ic * ld_input_channel, ld_in_row,
+                 ld_in_col, outptr + ic * ld_out_row, ld_out_matrix);
+      }
+    }
+  }
+
+  public:
+  // `name` and the four dimensions are reported back by the getters below; a
+  // copy of `kernel` is stored and invoked from `execute`.
+  Transform(
+    const std::string &name,
+    unsigned int kernel_rows, unsigned int kernel_cols,
+    unsigned int transformed_tile_rows, unsigned int transformed_tile_cols,
+    const Kernel &kernel  // By reference: only the copy into m_kernel is made
+  )
+  : m_name(name),
+    m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
+    m_transformed_tile_rows(transformed_tile_rows), m_transformed_tile_cols(transformed_tile_cols),
+    m_kernel(kernel)
+  {
+  }
+
+  const std::string &get_name(void) const override { return m_name; }
+
+  unsigned int get_kernel_rows(void) const override { return m_kernel_rows; }
+  unsigned int get_kernel_cols(void) const override { return m_kernel_cols; }
+
+  unsigned int get_transformed_tile_rows(void) const override { return m_transformed_tile_rows; }
+  unsigned int get_transformed_tile_cols(void) const override { return m_transformed_tile_cols; }
+
+  // Type-erased entry point: reinterprets the buffers as TIn/TOut and runs the
+  // share of the transform belonging to `thread_id` (out of `n_threads`).
+  void execute(
+    const ConvolutionArgs &args,
+    const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
+    void *outptr, size_t ld_out_matrix, size_t ld_out_row,
+    unsigned int thread_id, unsigned int n_threads
+  ) const override
+  {
+    execute_internal(
+      args,
+      reinterpret_cast<const TIn *>(inptr), ld_in_row, ld_in_col, ld_input_channel,
+      reinterpret_cast<TOut *>(outptr), ld_out_matrix, ld_out_row,
+      thread_id, n_threads
+    );
+  }
+
+  /* Utility method to get a transposed variant of a kernel, this transposed
+   * version simply calls the original kernel with the input row and column
+   * strides swapped. Not `constexpr`: the std::function result is not a
+   * literal type, so the call could never be a constant expression. */
+  static Kernel get_transposed_kernel(const Kernel &kernel)
+  {
+    return [kernel] (
+      const unsigned int n_channels,
+      const TIn *const inptr, const size_t ld_in_row, const size_t ld_in_col,
+      TOut *const outptr, const size_t ld_out
+    ) {
+      kernel(n_channels, inptr, ld_in_col, ld_in_row, outptr, ld_out);
+    };
+  }
+};
+
+} // namespace weight_transform
+} // namespace winograd
+} // namespace arm_conv