aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/assembly
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/assembly')
-rw-r--r--src/core/NEON/kernels/assembly/depthwise.hpp270
-rw-r--r--src/core/NEON/kernels/assembly/depthwise_common.hpp106
-rw-r--r--src/core/NEON/kernels/assembly/pool_common.hpp71
-rw-r--r--src/core/NEON/kernels/assembly/pooling.hpp210
-rw-r--r--src/core/NEON/kernels/assembly/premultiply.hpp17
-rw-r--r--src/core/NEON/kernels/assembly/winograd.hpp181
6 files changed, 436 insertions, 419 deletions
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
index dbd47ccfa9..13c2d314e4 100644
--- a/src/core/NEON/kernels/assembly/depthwise.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -38,9 +38,8 @@ struct DepthwiseConfig
DepthwiseMethod method = DepthwiseMethod::DEFAULT;
std::string filter = "";
- DepthwiseConfig(DepthwiseMethod method)
- : method(method) {};
- DepthwiseConfig() {};
+ DepthwiseConfig(DepthwiseMethod method) : method(method){};
+ DepthwiseConfig(){};
};
struct DepthwiseArgs
@@ -63,18 +62,24 @@ struct DepthwiseArgs
bool fast_mode = false;
- DepthwiseArgs(
- const CPUInfo *cpu_info,
- unsigned int kernel_rows, unsigned int kernel_cols,
- unsigned int stride_rows, unsigned int stride_cols,
- unsigned int dilation_rows, unsigned int dilation_cols,
- unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
- unsigned int input_channels,
- unsigned int output_rows, unsigned int output_cols,
- unsigned int channel_multiplier,
- PaddingValues padding, arm_gemm::Activation activation,
-
- const DepthwiseConfig *config)
+ DepthwiseArgs(const CPUInfo *cpu_info,
+ unsigned int kernel_rows,
+ unsigned int kernel_cols,
+ unsigned int stride_rows,
+ unsigned int stride_cols,
+ unsigned int dilation_rows,
+ unsigned int dilation_cols,
+ unsigned int n_batches,
+ unsigned int input_rows,
+ unsigned int input_cols,
+ unsigned int input_channels,
+ unsigned int output_rows,
+ unsigned int output_cols,
+ unsigned int channel_multiplier,
+ PaddingValues padding,
+ arm_gemm::Activation activation,
+
+ const DepthwiseConfig *config)
: cpu_info(cpu_info),
kernel_rows(kernel_rows),
kernel_cols(kernel_cols),
@@ -95,20 +100,38 @@ struct DepthwiseArgs
{
}
- DepthwiseArgs(
- const CPUInfo *cpu_info,
- unsigned int kernel_rows, unsigned int kernel_cols,
- unsigned int stride_rows, unsigned int stride_cols,
- unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
- unsigned int input_channels,
- unsigned int output_rows, unsigned int output_cols,
- unsigned int channel_multiplier,
- PaddingValues padding, arm_gemm::Activation activation,
- const DepthwiseConfig *config)
- : DepthwiseArgs(cpu_info, kernel_rows, kernel_cols, stride_rows,
- stride_cols, 1, 1, n_batches, input_rows, input_cols,
- input_channels, output_rows, output_cols,
- channel_multiplier, padding, activation, config)
+ DepthwiseArgs(const CPUInfo *cpu_info,
+ unsigned int kernel_rows,
+ unsigned int kernel_cols,
+ unsigned int stride_rows,
+ unsigned int stride_cols,
+ unsigned int n_batches,
+ unsigned int input_rows,
+ unsigned int input_cols,
+ unsigned int input_channels,
+ unsigned int output_rows,
+ unsigned int output_cols,
+ unsigned int channel_multiplier,
+ PaddingValues padding,
+ arm_gemm::Activation activation,
+ const DepthwiseConfig *config)
+ : DepthwiseArgs(cpu_info,
+ kernel_rows,
+ kernel_cols,
+ stride_rows,
+ stride_cols,
+ 1,
+ 1,
+ n_batches,
+ input_rows,
+ input_cols,
+ input_channels,
+ output_rows,
+ output_cols,
+ channel_multiplier,
+ padding,
+ activation,
+ config)
{
}
};
@@ -127,17 +150,18 @@ struct Tile
{
}
- Tile()
- : Tile(nullptr, 0, 0, 0)
+ Tile() : Tile(nullptr, 0, 0, 0)
{
}
- void load_from(
- const TInput *input,
- const unsigned int ld_row, const unsigned int ld_col,
- const unsigned int n_rows, const unsigned int n_cols,
- const int input_i, const int input_j,
- const unsigned int channel_multiplier) const
+ void load_from(const TInput *input,
+ const unsigned int ld_row,
+ const unsigned int ld_col,
+ const unsigned int n_rows,
+ const unsigned int n_cols,
+ const int input_i,
+ const int input_j,
+ const unsigned int channel_multiplier) const
{
const auto pad_top = input_i < 0 ? -input_i : 0;
const auto pad_left = input_j < 0 ? -input_j : 0;
@@ -145,18 +169,15 @@ struct Tile
const auto padded_rows = std::min(n_rows - input_i, tile_rows) - pad_top;
const auto padded_cols = std::min(n_cols - input_j, tile_cols) - pad_left;
- if(padded_rows < tile_rows || padded_cols < tile_cols)
+ if (padded_rows < tile_rows || padded_cols < tile_cols)
{
memset(array, 0, tile_rows * tile_cols * tile_channels * sizeof(TInput));
}
- do_premultiply<TInput>(
- (TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col,
- ld_row, ld_col,
- array + pad_top * tile_cols * tile_channels + pad_left * tile_channels,
- tile_cols * tile_channels, tile_channels,
- padded_rows, padded_cols, tile_channels / channel_multiplier,
- channel_multiplier);
+ do_premultiply<TInput>((TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, ld_row,
+ ld_col, array + pad_top * tile_cols * tile_channels + pad_left * tile_channels,
+ tile_cols * tile_channels, tile_channels, padded_rows, padded_cols,
+ tile_channels / channel_multiplier, channel_multiplier);
}
};
@@ -168,9 +189,8 @@ protected:
std::string m_name{};
public:
- DepthwiseCommon(const DepthwiseArgs &args)
- : m_args(args) {};
- DepthwiseCommon(DepthwiseCommon &) = delete;
+ DepthwiseCommon(const DepthwiseArgs &args) : m_args(args){};
+ DepthwiseCommon(DepthwiseCommon &) = delete;
DepthwiseCommon &operator=(DepthwiseCommon &) = delete;
std::string name() const override
@@ -181,19 +201,18 @@ public:
void set_name(std::string name)
{
// Only allow the name to be set once
- if(m_name.empty())
+ if (m_name.empty())
{
m_name = name;
}
}
- void execute(
- const void *const input,
- const void *const parameters,
- void *const output,
- void *const working_space,
- const unsigned int thread_id,
- const unsigned int n_threads) const override final
+ void execute(const void *const input,
+ const void *const parameters,
+ void *const output,
+ void *const working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads) const override final
{
const size_t ld_input_col = m_args.input_channels;
const size_t ld_input_row = ld_input_col * m_args.input_cols;
@@ -202,56 +221,47 @@ public:
const size_t ld_output_row = ld_output_col * m_args.output_cols;
const size_t ld_output_batch = ld_output_row * m_args.output_rows;
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- parameters, output, ld_output_col, ld_output_row, ld_output_batch,
- working_space, thread_id, n_threads);
+ execute(input, ld_input_col, ld_input_row, ld_input_batch, parameters, output, ld_output_col, ld_output_row,
+ ld_output_batch, working_space, thread_id, n_threads);
}
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *const parameters,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space,
- const unsigned int thread_id,
- const unsigned int n_threads) const override final
+ void execute(const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *const parameters,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *const working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads) const override final
{
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.input_channels, m_args.padding,
- input, ld_input_col, ld_input_row, ld_input_batch,
- parameters,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space, thread_id, n_threads);
+ execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.input_channels, m_args.padding, input,
+ ld_input_col, ld_input_row, ld_input_batch, parameters, m_args.output_rows, m_args.output_cols, output,
+ ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, n_threads);
}
- void execute(
- unsigned int batches,
- unsigned int input_height,
- unsigned int input_width,
- unsigned int channels,
- const PaddingValues &padding,
- const void *input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *parameters,
- unsigned int output_height,
- unsigned int output_width,
- void *output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const override final
+ void execute(unsigned int batches,
+ unsigned int input_height,
+ unsigned int input_width,
+ unsigned int channels,
+ const PaddingValues &padding,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const override final
{
// Construct a new set of arguments to reflect that we might have been
// passed different input/output tensors. Dilation is handled at this
@@ -271,38 +281,33 @@ public:
auto ld_output_col_d = ld_output_col * m_args.dilation_cols;
auto ld_output_row_d = ld_output_row * m_args.dilation_rows;
- for(size_t drow = 0; drow < m_args.dilation_rows; drow++)
+ for (size_t drow = 0; drow < m_args.dilation_rows; drow++)
{
size_t start_i;
- std::tie(args.output_rows, args.input_rows, start_i,
- args.padding.top, args.padding.bottom) =
- get_reduced_view_for_dilation(
- output_height, input_height, drow, m_args.dilation_rows,
- m_args.kernel_rows, m_args.stride_rows, padding.top);
+ std::tie(args.output_rows, args.input_rows, start_i, args.padding.top, args.padding.bottom) =
+ get_reduced_view_for_dilation(output_height, input_height, drow, m_args.dilation_rows,
+ m_args.kernel_rows, m_args.stride_rows, padding.top);
auto input_row = static_cast<const TInput *>(input) + start_i * ld_input_row;
auto output_row = static_cast<TOutput *>(output) + drow * ld_output_row;
- if(args.output_rows)
+ if (args.output_rows)
{
- for(size_t dcol = 0; dcol < m_args.dilation_cols; dcol++)
+ for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++)
{
size_t start_j;
- std::tie(args.output_cols, args.input_cols, start_j,
- args.padding.left, args.padding.right) =
- get_reduced_view_for_dilation(
- output_width, input_width, dcol, m_args.dilation_cols,
- m_args.kernel_cols, m_args.stride_cols, padding.left);
+ std::tie(args.output_cols, args.input_cols, start_j, args.padding.left, args.padding.right) =
+ get_reduced_view_for_dilation(output_width, input_width, dcol, m_args.dilation_cols,
+ m_args.kernel_cols, m_args.stride_cols, padding.left);
const TInput *input_col = input_row + start_j * ld_input_col;
TOutput *output_col = output_row + dcol * ld_output_col;
- if(args.output_cols)
+ if (args.output_cols)
{
- this->execute_internal(
- args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, parameters,
- output_col, ld_output_col_d, ld_output_row_d, ld_output_batch,
- working_space, thread_id, n_threads);
+ this->execute_internal(args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch,
+ parameters, output_col, ld_output_col_d, ld_output_row_d,
+ ld_output_batch, working_space, thread_id, n_threads);
}
}
}
@@ -310,20 +315,19 @@ public:
}
protected:
- virtual void execute_internal(
- const DepthwiseArgs &instance_args,
- const void *input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *parameters,
- void *output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const = 0;
+ virtual void execute_internal(const DepthwiseArgs &instance_args,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
virtual bool uses_premultiply() const
{
diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp
index a5db793b3d..5ff848e281 100644
--- a/src/core/NEON/kernels/assembly/depthwise_common.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp
@@ -49,11 +49,7 @@ struct KernelDescription
bool is_default = false;
uint64_t cycle_estimate = 0;
- KernelDescription(
- DepthwiseMethod method,
- std::string name,
- bool is_default,
- uint64_t cycle_estimate)
+ KernelDescription(DepthwiseMethod method, std::string name, bool is_default, uint64_t cycle_estimate)
: method(method), name(name), is_default(is_default), cycle_estimate(cycle_estimate)
{
}
@@ -78,58 +74,51 @@ public:
// pointer the bias vector (which may be nullptr in the case of no bias) and
// a pointer to the array of weights (stored in HWIO order).
virtual void pack_parameters(
- void *buffer,
- const void *biases,
- const void *weights,
- size_t ld_weight_col = 0,
- size_t ld_weight_row = 0) = 0;
+ void *buffer, const void *biases, const void *weights, size_t ld_weight_col = 0, size_t ld_weight_row = 0) = 0;
// Determine the amount of working space required
virtual size_t get_working_size(unsigned int n_threads) const = 0;
// Execute the convolution over the specified area of memory.
- virtual void execute(
- const void *input, // Pointer to input tensor
- const void *parameters, // Packed parameters buffer
- void *output,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const = 0;
-
- virtual void execute(
- const void *input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *parameters,
- void *output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const = 0;
-
- virtual void execute(
- unsigned int batches,
- unsigned int input_height,
- unsigned int input_width,
- unsigned int channels,
- const PaddingValues &,
- const void *input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *parameters,
- unsigned int output_height,
- unsigned int output_width,
- void *output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const = 0;
+ virtual void execute(const void *input, // Pointer to input tensor
+ const void *parameters, // Packed parameters buffer
+ void *output,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+
+ virtual void execute(const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+
+ virtual void execute(unsigned int batches,
+ unsigned int input_height,
+ unsigned int input_width,
+ unsigned int channels,
+ const PaddingValues &,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
};
// To handle a dilation factor of D execute the kernel once for each d in
@@ -145,12 +134,13 @@ public:
// - Number of valid input pixels corresponding to `d`
// - Offset of the first pixel corresponding to `d`
// - Amount of padding in the view for `d`
-std::tuple<size_t, size_t, size_t, size_t, size_t>
-get_reduced_view_for_dilation(
- size_t out_size, size_t in_size,
- size_t d, size_t dilation_factor,
- size_t kernel_size, size_t stride,
- size_t pad_before);
+std::tuple<size_t, size_t, size_t, size_t, size_t> get_reduced_view_for_dilation(size_t out_size,
+ size_t in_size,
+ size_t d,
+ size_t dilation_factor,
+ size_t kernel_size,
+ size_t stride,
+ size_t pad_before);
} // namespace depthwise
} // namespace arm_conv
diff --git a/src/core/NEON/kernels/assembly/pool_common.hpp b/src/core/NEON/kernels/assembly/pool_common.hpp
index f1f70cf1d6..045f9f95d3 100644
--- a/src/core/NEON/kernels/assembly/pool_common.hpp
+++ b/src/core/NEON/kernels/assembly/pool_common.hpp
@@ -68,45 +68,42 @@ public:
virtual size_t get_working_size(unsigned int num_threads) const = 0;
// Execute pooling over the specified area of memory.
- virtual void execute(
- const void *const input,
- void *const output,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
+ virtual void execute(const void *const input,
+ void *const output,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
- virtual void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
+ virtual void execute(const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
- virtual void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &,
- unsigned int output_height,
- unsigned int output_width,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
+ virtual void execute(unsigned int batches,
+ unsigned int height,
+ unsigned int width,
+ unsigned int channels,
+ const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const PaddingValues &,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/assembly/pooling.hpp b/src/core/NEON/kernels/assembly/pooling.hpp
index e8db35c593..89d594298e 100644
--- a/src/core/NEON/kernels/assembly/pooling.hpp
+++ b/src/core/NEON/kernels/assembly/pooling.hpp
@@ -36,9 +36,8 @@ struct PoolingConfig
PoolingMethod method = PoolingMethod::DEFAULT;
std::string filter = "";
- PoolingConfig(PoolingMethod method)
- : method(method) {};
- PoolingConfig() {};
+ PoolingConfig(PoolingMethod method) : method(method){};
+ PoolingConfig(){};
};
struct PoolingArgs
@@ -57,30 +56,40 @@ struct PoolingArgs
const PoolingConfig *config;
- PoolingArgs(
- const CPUInfo *cpu_info,
- PoolingType pool_type,
- const PoolingWindow &window,
- const PoolingStride &stride,
- bool exclude_padding,
- unsigned int n_batches,
- unsigned int input_rows,
- unsigned int input_cols,
- unsigned int n_channels,
- unsigned int output_rows,
- unsigned int output_cols,
- const PaddingValues &padding,
- const PoolingConfig *cfg)
- : cpu_info(cpu_info), pool_type(pool_type), pool_window(window), pool_stride(stride), exclude_padding(exclude_padding), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols),
- n_channels(n_channels), output_rows(output_rows), output_cols(output_cols), padding(padding), config(cfg)
+ PoolingArgs(const CPUInfo *cpu_info,
+ PoolingType pool_type,
+ const PoolingWindow &window,
+ const PoolingStride &stride,
+ bool exclude_padding,
+ unsigned int n_batches,
+ unsigned int input_rows,
+ unsigned int input_cols,
+ unsigned int n_channels,
+ unsigned int output_rows,
+ unsigned int output_cols,
+ const PaddingValues &padding,
+ const PoolingConfig *cfg)
+ : cpu_info(cpu_info),
+ pool_type(pool_type),
+ pool_window(window),
+ pool_stride(stride),
+ exclude_padding(exclude_padding),
+ n_batches(n_batches),
+ input_rows(input_rows),
+ input_cols(input_cols),
+ n_channels(n_channels),
+ output_rows(output_rows),
+ output_cols(output_cols),
+ padding(padding),
+ config(cfg)
{
// If either of the pooling window dimensions are set to zero, meaning
// "pool everything", then replace with the corresponding input dimension.
- if(pool_window.rows == 0)
+ if (pool_window.rows == 0)
{
pool_window.rows = input_rows;
}
- if(pool_window.cols == 0)
+ if (pool_window.cols == 0)
{
pool_window.cols = input_cols;
}
@@ -100,10 +109,16 @@ struct Requantize32
int32_t per_layer_right_shift = 0;
int32_t per_layer_mul = 0;
- Requantize32(int32_t input_offset, int32_t output_offset,
- int32_t per_layer_left_shift, int32_t per_layer_right_shift,
+ Requantize32(int32_t input_offset,
+ int32_t output_offset,
+ int32_t per_layer_left_shift,
+ int32_t per_layer_right_shift,
int32_t per_layer_mul)
- : input_offset(input_offset), output_offset(output_offset), per_layer_left_shift(per_layer_left_shift), per_layer_right_shift(per_layer_right_shift), per_layer_mul(per_layer_mul)
+ : input_offset(input_offset),
+ output_offset(output_offset),
+ per_layer_left_shift(per_layer_left_shift),
+ per_layer_right_shift(per_layer_right_shift),
+ per_layer_mul(per_layer_mul)
{
}
};
@@ -115,105 +130,88 @@ protected:
const PoolingArgs m_args;
public:
- PoolingCommon(const PoolingArgs &args)
- : m_args(args)
+ PoolingCommon(const PoolingArgs &args) : m_args(args)
{
}
- PoolingCommon(PoolingCommon &) = delete;
+ PoolingCommon(PoolingCommon &) = delete;
PoolingCommon &operator=(PoolingCommon &) = delete;
size_t get_working_size(unsigned int) const override = 0;
// Execute pooling over the specified area of memory.
- void execute(
- const void *const input,
- void *const output,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const override
+ void execute(const void *const input,
+ void *const output,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const override
{
- this->execute(
- input,
- m_args.n_channels,
- m_args.n_channels * m_args.input_cols,
- m_args.n_channels * m_args.input_cols * m_args.input_rows,
- output,
- m_args.n_channels,
- m_args.n_channels * m_args.output_cols,
- m_args.n_channels * m_args.output_cols * m_args.output_rows,
- working_space,
- thread_id, num_threads);
+ this->execute(input, m_args.n_channels, m_args.n_channels * m_args.input_cols,
+ m_args.n_channels * m_args.input_cols * m_args.input_rows, output, m_args.n_channels,
+ m_args.n_channels * m_args.output_cols,
+ m_args.n_channels * m_args.output_cols * m_args.output_rows, working_space, thread_id,
+ num_threads);
}
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const override
+ void execute(const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const override
{
- this->execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding, m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space, thread_id, num_threads);
+ this->execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, input, ld_input_col,
+ ld_input_row, ld_input_batch, m_args.padding, m_args.output_rows, m_args.output_cols, output,
+ ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, num_threads);
}
- void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const override
+ void execute(unsigned int batches,
+ unsigned int height,
+ unsigned int width,
+ unsigned int channels,
+ const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const PaddingValues &padding,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const override
{
- this->execute_internal(
- batches, height, width, channels, padding,
- input, ld_input_col, ld_input_row, ld_input_batch,
- output_height, output_width,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space, thread_id, num_threads);
+ this->execute_internal(batches, height, width, channels, padding, input, ld_input_col, ld_input_row,
+ ld_input_batch, output_height, output_width, output, ld_output_col, ld_output_row,
+ ld_output_batch, working_space, thread_id, num_threads);
}
protected:
- virtual void execute_internal(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const PaddingValues &,
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- unsigned int output_height,
- unsigned int output_width,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
+ virtual void execute_internal(unsigned int batches,
+ unsigned int height,
+ unsigned int width,
+ unsigned int channels,
+ const PaddingValues &,
+ const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
};
template <typename TInput, typename TOutput>
diff --git a/src/core/NEON/kernels/assembly/premultiply.hpp b/src/core/NEON/kernels/assembly/premultiply.hpp
index 16f26de38a..fb97cf8baf 100644
--- a/src/core/NEON/kernels/assembly/premultiply.hpp
+++ b/src/core/NEON/kernels/assembly/premultiply.hpp
@@ -44,30 +44,27 @@ void do_premultiply(const T *in_ptr,
const unsigned input_channels,
const unsigned int channel_multiplier)
{
- if(sizeof(T) == 4 && channel_multiplier == 6)
+ if (sizeof(T) == 4 && channel_multiplier == 6)
{
- do_premultiply_float_6(
- (const float *)in_ptr, ld_row, ld_col,
- (float *)out_ptr, out_ld_row, out_ld_col,
- tile_rows, tile_cols,
- input_channels);
+ do_premultiply_float_6((const float *)in_ptr, ld_row, ld_col, (float *)out_ptr, out_ld_row, out_ld_col,
+ tile_rows, tile_cols, input_channels);
}
else
{
- for(unsigned int i = 0; i < tile_rows; i++)
+ for (unsigned int i = 0; i < tile_rows; i++)
{
const T *ip2 = in_ptr + i * ld_row;
T *op2 = out_ptr + i * out_ld_row;
- for(unsigned int j = 0; j < tile_cols; j++)
+ for (unsigned int j = 0; j < tile_cols; j++)
{
const T *ip = ip2;
T *op = op2;
- for(unsigned int c = 0; c < input_channels; c++)
+ for (unsigned int c = 0; c < input_channels; c++)
{
T val = *ip;
ip++;
- for(unsigned int r = 0; r < channel_multiplier; r++)
+ for (unsigned int r = 0; r < channel_multiplier; r++)
{
op[r] = val;
}
diff --git a/src/core/NEON/kernels/assembly/winograd.hpp b/src/core/NEON/kernels/assembly/winograd.hpp
index 50290757ec..dbf95d23cd 100644
--- a/src/core/NEON/kernels/assembly/winograd.hpp
+++ b/src/core/NEON/kernels/assembly/winograd.hpp
@@ -45,17 +45,24 @@ struct ConvolutionArgs
Shape2D kernel_shape;
arm_gemm::Activation activation;
- ConvolutionArgs(
- unsigned int n_batches,
- const Shape2D &input_shape,
- unsigned int n_input_channels,
- unsigned int pad_top, unsigned int pad_left,
- const Shape2D &output_shape,
- unsigned int n_output_channels,
- const Shape2D kernel_shape,
- const arm_gemm::Activation &activation = {})
- : n_batches(n_batches), input_shape(input_shape), n_input_channels(n_input_channels), pad_top(pad_top), pad_left(pad_left), output_shape(output_shape), n_output_channels(n_output_channels),
- kernel_shape(kernel_shape), activation(activation)
+ ConvolutionArgs(unsigned int n_batches,
+ const Shape2D &input_shape,
+ unsigned int n_input_channels,
+ unsigned int pad_top,
+ unsigned int pad_left,
+ const Shape2D &output_shape,
+ unsigned int n_output_channels,
+ const Shape2D kernel_shape,
+ const arm_gemm::Activation &activation = {})
+ : n_batches(n_batches),
+ input_shape(input_shape),
+ n_input_channels(n_input_channels),
+ pad_top(pad_top),
+ pad_left(pad_left),
+ output_shape(output_shape),
+ n_output_channels(n_output_channels),
+ kernel_shape(kernel_shape),
+ activation(activation)
{
}
};
@@ -105,23 +112,30 @@ public:
virtual unsigned int get_transformed_tile_rows(void) const = 0;
virtual unsigned int get_transformed_tile_cols(void) const = 0;
- void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
- void *outptr, const WinogradDomainSpec &wds,
- unsigned int thread_id, unsigned int n_threads) const
+ void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_input_channel,
+ void *outptr,
+ const WinogradDomainSpec &wds,
+ unsigned int thread_id,
+ unsigned int n_threads) const
{
- this->execute(
- args, inptr, ld_in_row, ld_in_col, ld_input_channel,
- outptr, wds.weight_ld_matrix, wds.weight_ld_row,
- thread_id, n_threads);
+ this->execute(args, inptr, ld_in_row, ld_in_col, ld_input_channel, outptr, wds.weight_ld_matrix,
+ wds.weight_ld_row, thread_id, n_threads);
}
- virtual void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
- void *outptr, size_t ld_out_matrix, size_t ld_out_row,
- unsigned int thread_id, unsigned int n_threads) const = 0;
+ virtual void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_input_channel,
+ void *outptr,
+ size_t ld_out_matrix,
+ size_t ld_out_row,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
};
} // namespace weight_transform
@@ -136,27 +150,35 @@ public:
virtual unsigned int get_input_rows(void) const = 0;
virtual unsigned int get_input_cols(void) const = 0;
- virtual size_t get_working_space_size(
- const ConvolutionArgs &args,
- unsigned int n_threads) const = 0;
-
- void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
- void *outptr, const WinogradDomainSpec &wds,
- void *working_space, unsigned int thread_id, unsigned int n_threads) const
+ virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0;
+
+ void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_batch,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ void *outptr,
+ const WinogradDomainSpec &wds,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const
{
- this->execute(
- args, inptr, ld_in_batch, ld_in_row, ld_in_col,
- outptr, wds.input_ld_batch, wds.input_ld_matrix, wds.input_ld_row,
- working_space, thread_id, n_threads);
+ this->execute(args, inptr, ld_in_batch, ld_in_row, ld_in_col, outptr, wds.input_ld_batch, wds.input_ld_matrix,
+ wds.input_ld_row, working_space, thread_id, n_threads);
}
- virtual void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
- void *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row,
- void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0;
+ virtual void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_batch,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ void *outptr,
+ size_t ld_out_batch,
+ size_t ld_out_matrix,
+ size_t ld_out_row,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
};
} // namespace input_transform
@@ -177,31 +199,37 @@ public:
virtual unsigned int get_kernel_rows(void) const = 0;
virtual unsigned int get_kernel_cols(void) const = 0;
- virtual size_t get_working_space_size(
- const ConvolutionArgs &args,
- unsigned int n_threads) const = 0;
-
- void execute(
- const ConvolutionArgs &args,
- const void *inptr, const WinogradDomainSpec &wds,
- const void *bias,
- void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
- void *working_space, unsigned int thread_id, unsigned int n_threads) const
+ virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0;
+
+ void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ const WinogradDomainSpec &wds,
+ const void *bias,
+ void *outptr,
+ size_t ld_out_batch,
+ size_t ld_out_row,
+ size_t ld_out_col,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const
{
- this->execute(
- args,
- inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row,
- bias,
- outptr, ld_out_batch, ld_out_row, ld_out_col,
- working_space, thread_id, n_threads);
+ this->execute(args, inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, bias, outptr,
+ ld_out_batch, ld_out_row, ld_out_col, working_space, thread_id, n_threads);
}
- virtual void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
- const void *bias,
- void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
- void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0;
+ virtual void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_batch,
+ size_t ld_in_matrix,
+ size_t ld_in_row,
+ const void *bias,
+ void *outptr,
+ size_t ld_out_batch,
+ size_t ld_out_row,
+ size_t ld_out_col,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
};
} // namespace output_transform
@@ -210,7 +238,7 @@ struct WinogradImpl
{
const output_transform::ITransform *output_transform = nullptr;
const weight_transform::ITransform *weight_transform = nullptr;
- const input_transform::ITransform *input_transform = nullptr;
+ const input_transform::ITransform *input_transform = nullptr;
std::unique_ptr<arm_gemm::GemmArgs> gemm_args;
WinogradDomainSpec winograd_spec;
};
@@ -220,15 +248,18 @@ struct WinogradImpl
* Assigns to the pointers in the `dest` struct and returns true or false to
* indicate whether the given problem can be executed or not.
*/
-template <typename TIn, typename TWeight = TIn, typename TOut = TIn, typename TWinogradIn = TIn, typename TWinogradOut = TOut>
-bool get_implementation(
- WinogradImpl &dest, // Destination for the selected implementation
- const CPUInfo *,
- const ConvolutionArgs &,
- int max_threads,
- bool fast_mode,
- const WinogradConfig *,
- const arm_gemm::GemmConfig *);
+template <typename TIn,
+ typename TWeight = TIn,
+ typename TOut = TIn,
+ typename TWinogradIn = TIn,
+ typename TWinogradOut = TOut>
+bool get_implementation(WinogradImpl &dest, // Destination for the selected implementation
+ const CPUInfo *,
+ const ConvolutionArgs &,
+ int max_threads,
+ bool fast_mode,
+ const WinogradConfig *,
+ const arm_gemm::GemmConfig *);
} // namespace winograd
} // namespace arm_conv