path: root/src/core/NEON/kernels/convolution/winograd/weight_transform.hpp
/*
 * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "winograd.hpp"
#include <algorithm>
#include <functional>

namespace arm_conv {
namespace winograd {
namespace weight_transform {

/* Driver class for the Winograd weight transforms.
 */
template <typename TIn, typename TOut=TIn>
class Transform : public ITransform
{
  using Kernel = std::function<void(
    unsigned int n_channels,  // Number of output channels to transform
    const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
    TOut *outptr, size_t ld_out_matrix
  )>;
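
  // As used by execute_internal() below, the kernel is invoked once per input
  // channel with args.n_output_channels passed as n_channels, so a single call
  // transforms the weights of one input channel for every output channel.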

  const std::string m_name;
  const unsigned int m_kernel_rows, m_kernel_cols;
  const unsigned int m_transformed_tile_rows, m_transformed_tile_cols;
  const Kernel m_kernel;

  void execute_internal(
    const ConvolutionArgs &args,
    const TIn *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
    TOut *outptr, size_t ld_out_matrix, size_t ld_out_row,
    unsigned int thread_id, unsigned int n_threads
  ) const
  {
    // Stripe groups of input channels over threads; this should reduce false
    // sharing of the output matrix.
    constexpr auto n_input_channels_per_thread = 16u;
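
    // For example, with n_threads == 2: thread 0 transforms input channels
    // [0, 16), [32, 48), ... while thread 1 transforms channels [16, 32),
    // [48, 64), ..., each thread skipping past the other thread's stripe at
    // the end of every group.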

    // Get the initial offset for the input and output pointers
    const auto offset = thread_id * n_input_channels_per_thread;
    inptr += offset * ld_input_channel;
    outptr += offset * ld_out_row;

    for (auto start_ic = thread_id * n_input_channels_per_thread;
         start_ic < args.n_input_channels;
         start_ic += n_threads * n_input_channels_per_thread)
    {
      // Now iterate over the input channels assigned to this thread.
      const auto end_ic = std::min(args.n_input_channels,
                                   start_ic + n_input_channels_per_thread);
      for (auto ic = start_ic; ic < end_ic; ic++)
      {
        m_kernel(args.n_output_channels, inptr, ld_in_row, ld_in_col,
                 outptr, ld_out_matrix);
        inptr += ld_input_channel;
        outptr += ld_out_row;
      }

      // Progress the pointers to account for the work not performed by
      // this thread.
      const auto skip = (n_threads - 1) * n_input_channels_per_thread;
      inptr += skip * ld_input_channel;
      outptr += skip * ld_out_row;
    }
  }

  public:
  Transform(
    const std::string &name,
    unsigned int kernel_rows, unsigned int kernel_cols,
    unsigned int transformed_tile_rows, unsigned int transformed_tile_cols,
    const Kernel kernel
  )
  : m_name(name),
    m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
    m_transformed_tile_rows(transformed_tile_rows), m_transformed_tile_cols(transformed_tile_cols),
    m_kernel(kernel)
  {
  }

  const std::string &get_name(void) const override { return m_name; }

  unsigned int get_kernel_rows(void) const override { return m_kernel_rows; }
  unsigned int get_kernel_cols(void) const override { return m_kernel_cols; }

  unsigned int get_transformed_tile_rows(void) const override { return m_transformed_tile_rows; }
  unsigned int get_transformed_tile_cols(void) const override { return m_transformed_tile_cols; }

  void execute(
    const ConvolutionArgs &args,
    const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
    void *outptr, size_t ld_out_matrix, size_t ld_out_row,
    unsigned int thread_id, unsigned int n_threads
  ) const override
  {
    execute_internal(
      args,
      reinterpret_cast<const TIn *>(inptr), ld_in_row, ld_in_col, ld_input_channel,
      reinterpret_cast<TOut *>(outptr), ld_out_matrix, ld_out_row,
      thread_id, n_threads
    );
  }

  /* Utility method to get a transposed variant of a kernel; the transposed
   * version simply calls the original kernel with the input row and column
   * strides swapped.
   */
  static constexpr Kernel get_transposed_kernel(const Kernel &kernel)
  {
    return [kernel] (
      const unsigned int n_channels,
      const TIn *const inptr, const size_t ld_in_row, const size_t ld_in_col,
      TOut *const outptr, const size_t ld_out
    ) {
      kernel(n_channels, inptr, ld_in_col, ld_in_row, outptr, ld_out);
    };
  }
};
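
/* Illustrative usage sketch (not part of the library itself; the tile sizes
 * and the kernel function name below are assumptions):
 *
 *   // Weight transform for a 3x3 kernel producing 6x6 transformed tiles,
 *   // wrapping a hypothetical function `my_weight_transform` that matches
 *   // the Kernel signature declared above:
 *   //   Transform<float> transform(
 *   //     "my_weight_transform", 3, 3, 6, 6, my_weight_transform);
 *   //
 *   // A kernel operating on weights stored with rows and columns swapped
 *   // can be derived from the same function:
 *   //   auto transposed = Transform<float>::get_transposed_kernel(my_weight_transform);
 */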

}  // namespace weight_transform
}  // namespace winograd
}  // namespace arm_conv