From 4074c995d2a88684fd4a9d1aa36d51de56bb8dab Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Tue, 30 Jan 2018 18:13:46 +0000 Subject: COMPMID-873: Integrate RSH NEON Depthwise Convolution routine Change-Id: Ida1e9a836bc518bfe5563e16bf7f92bde5fc13f7 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/118472 Tested-by: Jenkins Reviewed-by: Pablo Tello --- .../NEDepthwiseConvolutionLayer3x3Kernel.cpp | 195 +- .../kernels/NEDirectConvolutionLayerKernel.cpp | 2 +- src/core/NEON/kernels/convolution/common/utils.cpp | 50 + .../depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp | 439 ++ .../depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp | 1095 ++++ .../depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp | 1175 +++++ .../depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp | 3443 +++++++++++++ .../depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp | 2695 ++++++++++ .../depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp | 5207 ++++++++++++++++++++ .../convolution/winograd/batched_blocked_gemm.cpp | 82 + .../winograd/transforms/input_2x2_3x3_fp32.cpp | 409 ++ .../winograd/transforms/input_2x2_5x5_fp32.cpp | 458 ++ .../winograd/transforms/input_4x4_3x3_fp32.cpp | 486 ++ .../winograd/transforms/output_2x2_3x3_fp32.cpp | 251 + .../winograd/transforms/output_2x2_5x5_fp32.cpp | 242 + .../winograd/transforms/output_4x4_3x3_fp32.cpp | 306 ++ .../winograd/transforms/weights_2x2_3x3_fp32.cpp | 228 + .../winograd/transforms/weights_2x2_5x5_fp32.cpp | 408 ++ .../winograd/transforms/weights_4x4_3x3_fp32.cpp | 266 + .../kernels/convolution/winograd/winograd_gemm.cpp | 569 +++ .../NEON/kernels/winograd/batched_blocked_gemm.cpp | 81 - .../winograd/transforms/input_2x2_3x3_fp32.cpp | 409 -- .../winograd/transforms/input_2x2_5x5_fp32.cpp | 458 -- .../winograd/transforms/input_4x4_3x3_fp32.cpp | 486 -- .../winograd/transforms/output_2x2_3x3_fp32.cpp | 251 - .../winograd/transforms/output_2x2_5x5_fp32.cpp | 242 - .../winograd/transforms/output_4x4_3x3_fp32.cpp | 306 -- .../winograd/transforms/weights_2x2_3x3_fp32.cpp | 
228 - .../winograd/transforms/weights_2x2_5x5_fp32.cpp | 408 -- .../winograd/transforms/weights_4x4_3x3_fp32.cpp | 266 - src/core/NEON/kernels/winograd/utils.cpp | 50 - src/core/NEON/kernels/winograd/winograd_gemm.cpp | 568 --- src/core/Utils.cpp | 15 + src/graph/operations/NESimpleOperations.cpp | 20 +- .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 82 +- src/runtime/NEON/functions/NEWinogradLayer.cpp | 2 +- 36 files changed, 18079 insertions(+), 3799 deletions(-) create mode 100644 src/core/NEON/kernels/convolution/common/utils.cpp create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.cpp create mode 100644 src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_5x5_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/winograd/transforms/input_4x4_3x3_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_3x3_fp32.cpp create mode 100644 
src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/winograd/transforms/weights_4x4_3x3_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp delete mode 100644 src/core/NEON/kernels/winograd/batched_blocked_gemm.cpp delete mode 100644 src/core/NEON/kernels/winograd/transforms/input_2x2_3x3_fp32.cpp delete mode 100644 src/core/NEON/kernels/winograd/transforms/input_2x2_5x5_fp32.cpp delete mode 100644 src/core/NEON/kernels/winograd/transforms/input_4x4_3x3_fp32.cpp delete mode 100644 src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp delete mode 100644 src/core/NEON/kernels/winograd/transforms/output_2x2_5x5_fp32.cpp delete mode 100644 src/core/NEON/kernels/winograd/transforms/output_4x4_3x3_fp32.cpp delete mode 100644 src/core/NEON/kernels/winograd/transforms/weights_2x2_3x3_fp32.cpp delete mode 100644 src/core/NEON/kernels/winograd/transforms/weights_2x2_5x5_fp32.cpp delete mode 100644 src/core/NEON/kernels/winograd/transforms/weights_4x4_3x3_fp32.cpp delete mode 100644 src/core/NEON/kernels/winograd/utils.cpp delete mode 100644 src/core/NEON/kernels/winograd/winograd_gemm.cpp (limited to 'src') diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp index bc2f1ed266..92383d9f15 100644 --- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp @@ -22,7 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h" -#include "arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h" +#include "arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" #include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/AccessWindowTranspose.h" @@ -34,13 +34,16 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "support/ToolchainSupport.h" using namespace arm_compute; using namespace arm_compute::detail; using namespace arm_compute::misc::shape_calculator; +using namespace depthwise; namespace { @@ -143,7 +146,7 @@ inline void convolve_3x3(const Window &window, unsigned int num_elems_written_pe } // namespace NEDepthwiseConvolutionLayer3x3Kernel::NEDepthwiseConvolutionLayer3x3Kernel() - : _border_size(0), _input(), _output(), _weights(), _conv_info(), _num_elems_written_per_iteration(0) + : _border_size(0), _input(), _output(), _weights(), _conv_info(), _convolver(nullptr), _num_elems_written_per_iteration(0), _run_optimized(false) { } @@ -152,35 +155,98 @@ BorderSize NEDepthwiseConvolutionLayer3x3Kernel::border_size() const return _border_size; } -void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) +void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, DataLayout data_layout) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3); + + _input = 
input; + _output = output; + _weights = weights; + _conv_info = conv_info; + _convolver = nullptr; + + _run_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(), + conv_info, + input->info()->data_type(), + data_layout); + + (_run_optimized) ? configure_optimized() : configure_generic(); +} + +void NEDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_UNUSED(info); + + (_run_optimized) ? run_optimized(window, info) : run_generic(window, info); +} + +bool NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(TensorShape input_shape, PadStrideInfo conv_info, DataType dt, DataLayout data_layout) +{ + // Reshape input shape if in NHWC format + TensorShape in_shape{ input_shape }; + if(data_layout == DataLayout::NHWC) + { + in_shape.set(Window::DimX, input_shape.y()); + in_shape.set(Window::DimY, input_shape.z()); + in_shape.set(Window::DimZ, input_shape.x()); + } + + // Check supported data type + bool supported_datatype = (dt == DataType::F32); + + // Check for supported strides + const auto &strides = conv_info.stride(); + bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2)); + + // Check for supported padding + const auto pad_top = conv_info.pad_top(); + const auto pad_right = conv_info.pad_right(); + const auto pad_bottom = conv_info.pad_bottom(); + const auto pad_left = conv_info.pad_left(); + PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(3U, 3U), conv_info); + bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left()); + bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0); + bool supported_padding = is_same_padding || is_valid_padding; + + return 
supported_datatype && supported_strides && supported_padding; +} + +void NEDepthwiseConvolutionLayer3x3Kernel::generate_convolver() +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(_input, _weights); + ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3); + + _convolver = create_convolver_object(_input->info()->tensor_shape(), _conv_info, + _weights->buffer(), _input->buffer(), _output->buffer()); +} + +void NEDepthwiseConvolutionLayer3x3Kernel::configure_generic() +{ + ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(0) != 3 || _weights->info()->dimension(1) != 3); // Get convolved dimensions - const TensorShape output_shape = compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info); - const DataType output_dt = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type(); + const TensorShape output_shape = compute_depthwise_convolution_shape(*_input->info(), *_weights->info(), _conv_info); + const DataType output_dt = (_input->info()->data_type() == DataType::QASYMM8) ? 
DataType::S32 : _input->info()->data_type(); // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), - input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt)); + auto_init_if_empty(*_output->info(), + _input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt)); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(_output->info()->tensor_shape(), output_shape); - _input = input; - _output = output; - _weights = weights; - _conv_info = conv_info; - const unsigned int conv_stride_x = conv_info.stride().first; - const unsigned int conv_stride_y = conv_info.stride().second; - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); + const unsigned int conv_stride_x = _conv_info.stride().first; + const unsigned int conv_pad_top = _conv_info.pad_top(); + const unsigned int conv_pad_right = _conv_info.pad_right(); + const unsigned int conv_pad_bottom = _conv_info.pad_bottom(); + const unsigned int conv_pad_left = _conv_info.pad_left(); ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 3); unsigned int num_elems_read_per_iteration = 0; - switch(input->info()->data_type()) + switch(_input->info()->data_type()) { case DataType::QASYMM8: num_elems_read_per_iteration = 16; @@ -193,31 +259,56 @@ void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const default: ARM_COMPUTE_ERROR("Data type not supported."); } - _border_size = BorderSize(conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), conv_pad_left); + _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left); // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration)); + Window win = 
calculate_max_window(*_output->info(), Steps(_num_elems_written_per_iteration)); const unsigned int num_x_steps = (output_shape.x() + _num_elems_written_per_iteration - 1) / _num_elems_written_per_iteration; const int input_num_elems_processed = get_input_num_elems_processed(_num_elems_written_per_iteration, conv_stride_x); - AccessWindowStatic input_access(input->info(), + AccessWindowStatic input_access(_input->info(), -conv_pad_left, -conv_pad_top, (num_x_steps - 1) * input_num_elems_processed + num_elems_read_per_iteration, - conv_stride_y * (output_shape.y() - 1) + 2); - AccessWindowStatic weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1)); - AccessWindowStatic output_access(output->info(), 0, 0, num_x_steps * _num_elems_written_per_iteration, output_shape.y()); + _input->info()->tensor_shape().y() + conv_pad_bottom); + AccessWindowStatic weights_access(_weights->info(), 0, 0, _weights->info()->dimension(0), _weights->info()->dimension(1)); + AccessWindowStatic output_access(_output->info(), 0, 0, num_x_steps * _num_elems_written_per_iteration, output_shape.y()); update_window_and_padding(win, input_access, weights_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + output_access.set_valid_region(win, ValidRegion(Coordinates(), _output->info()->tensor_shape())); INEKernel::configure(win); } -void NEDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, const ThreadInfo &info) +void NEDepthwiseConvolutionLayer3x3Kernel::configure_optimized() +{ + ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3); + + _border_size = BorderSize(0, 0); + _convolver = create_convolver_object(_input->info()->tensor_shape(), _conv_info, + _weights->buffer(), _input->buffer(), _output->buffer()); + + // Auto-configure output + bool same_padding = _conv_info.has_padding(); + TensorShape output_shape{ 
_input->info()->tensor_shape() }; + + output_shape.set(1, _convolver->output_size(output_shape.y(), same_padding)); // Set width + output_shape.set(2, _convolver->output_size(output_shape.z(), same_padding)); // Set height + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*_output->info(), + _input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape)); + + // Configure window + Window win; + auto win_last = _convolver->get_window(); + win.set(Window::DimX, Window::Dimension(0, win_last, 1)); + INEKernel::configure(win); +} + +void NEDepthwiseConvolutionLayer3x3Kernel::run_generic(const Window &window, const ThreadInfo &info) { - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_UNUSED(info); switch(_input->info()->data_type()) @@ -232,3 +323,53 @@ void NEDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, const Threa ARM_COMPUTE_ERROR("Not implemented"); } } + +void NEDepthwiseConvolutionLayer3x3Kernel::run_optimized(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON(!_convolver); + + const size_t start = window.x().start(); + const size_t end = window.x().end(); + _convolver->run(start, end); +} + +std::unique_ptr NEDepthwiseConvolutionLayer3x3Kernel::create_convolver_object(TensorShape shape, + PadStrideInfo conv_info, + const uint8_t *w_ptr, + uint8_t *in_ptr, + uint8_t *out_ptr) +{ + const int in_rows = shape.z(); + const int in_cols = shape.y(); + const int n_batches = shape[3]; + const int n_channels = shape.x(); + const bool padding_same = conv_info.has_padding(); + + const auto stride_x = conv_info.stride().first; + switch(stride_x) + { + case 1: + return arm_compute::support::cpp14::make_unique>( + n_batches, + in_rows, + in_cols, + n_channels, + padding_same, + reinterpret_cast(w_ptr), + reinterpret_cast(in_ptr), + reinterpret_cast(out_ptr)); + case 2: + return arm_compute::support::cpp14::make_unique>( + n_batches, + 
in_rows, + in_cols, + n_channels, + padding_same, + reinterpret_cast(w_ptr), + reinterpret_cast(in_ptr), + reinterpret_cast(out_ptr)); + default: + return nullptr; + } +} \ No newline at end of file diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp index cb8246d09e..c7534c59a6 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp @@ -22,7 +22,7 @@ * SOFTWARE. */ #include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h" -#include "arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h" +#include "arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" #include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" diff --git a/src/core/NEON/kernels/convolution/common/utils.cpp b/src/core/NEON/kernels/convolution/common/utils.cpp new file mode 100644 index 0000000000..24d0386c76 --- /dev/null +++ b/src/core/NEON/kernels/convolution/common/utils.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstdio> +#include <ctime> + +double TimeInUs(void) +{ +#ifdef CYCLE_PROFILING + timespec t; + clock_gettime(CLOCK_REALTIME, &t); + return 1e6*t.tv_sec + 1e-3*t.tv_nsec; +#else + return 0; +#endif +} + +void PrintMatrix(const float* const m, const int M, const int N, const int row_stride) +{ + for (int i = 0; i < M; i++) + { + for (int j = 0; j < N; j++) + { + printf("%.3f ", m[i*row_stride + j]); + } + printf("\n"); + } + printf("\n"); +} diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp new file mode 100644 index 0000000000..fa50f79bc5 --- /dev/null +++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp" + +namespace depthwise +{ +using Conv = DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float>; +using ConvImpl = DepthwiseConvolutionImpl<2, 2, 3, 3, 1, 1, float, float>; + +template <> +const Conv::TileFn Conv::tile_fns + [max_in_pad_top] + [max_in_pad_left] + [max_in_pad_bottom] + [max_in_pad_right] + [max_out_pad_bottom] + [max_out_pad_right] = { + { // Input pad top = 0 + { // Input pad left = 0 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 0, 0, 0, 0>, + ConvImpl::template process_tile<0, 0, 0, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 0, 0, 1, 0>, + ConvImpl::template process_tile<0, 0, 0, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 0, 1, 0, 0>, + ConvImpl::template process_tile<0, 0, 0, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 0, 1, 1, 0>, + ConvImpl::template process_tile<0, 0, 0, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 0, 2, 0, 0>, + ConvImpl::template process_tile<0, 0, 0, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 0, 2, 1, 0>, + ConvImpl::template process_tile<0, 0, 0, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + }, // Input pad bottom = 0 + { // Input pad 
bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 1, 0, 0, 0>, + ConvImpl::template process_tile<0, 0, 1, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 1, 0, 1, 0>, + ConvImpl::template process_tile<0, 0, 1, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 1, 1, 0, 0>, + ConvImpl::template process_tile<0, 0, 1, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 1, 1, 1, 0>, + ConvImpl::template process_tile<0, 0, 1, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 1, 2, 0, 0>, + ConvImpl::template process_tile<0, 0, 1, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 1, 2, 1, 0>, + ConvImpl::template process_tile<0, 0, 1, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 2, 0, 0, 0>, + ConvImpl::template process_tile<0, 0, 2, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 2, 0, 1, 0>, + ConvImpl::template process_tile<0, 0, 2, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 2, 1, 0, 0>, + ConvImpl::template process_tile<0, 0, 2, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 2, 1, 1, 0>, + ConvImpl::template process_tile<0, 0, 2, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // 
Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 2, 2, 0, 0>, + ConvImpl::template process_tile<0, 0, 2, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 2, 2, 1, 0>, + ConvImpl::template process_tile<0, 0, 2, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + }, // Input pad bottom = 2 + }, // Input pad left = 0 + { // Input pad left = 1 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 0, 0, 0, 0>, + ConvImpl::template process_tile<0, 1, 0, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 0, 0, 1, 0>, + ConvImpl::template process_tile<0, 1, 0, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 0, 1, 0, 0>, + ConvImpl::template process_tile<0, 1, 0, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 0, 1, 1, 0>, + ConvImpl::template process_tile<0, 1, 0, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 0, 2, 0, 0>, + ConvImpl::template process_tile<0, 1, 0, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 0, 2, 1, 0>, + ConvImpl::template process_tile<0, 1, 0, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 1, 0, 0, 0>, + ConvImpl::template process_tile<0, 1, 1, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 1, 0, 1, 0>, + ConvImpl::template 
process_tile<0, 1, 1, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 1, 1, 0, 0>, + ConvImpl::template process_tile<0, 1, 1, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 1, 1, 1, 0>, + ConvImpl::template process_tile<0, 1, 1, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 1, 2, 0, 0>, + ConvImpl::template process_tile<0, 1, 1, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 1, 2, 1, 0>, + ConvImpl::template process_tile<0, 1, 1, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 2, 0, 0, 0>, + ConvImpl::template process_tile<0, 1, 2, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 2, 0, 1, 0>, + ConvImpl::template process_tile<0, 1, 2, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 2, 1, 0, 0>, + ConvImpl::template process_tile<0, 1, 2, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 2, 1, 1, 0>, + ConvImpl::template process_tile<0, 1, 2, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 2, 2, 0, 0>, + ConvImpl::template process_tile<0, 1, 2, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 2, 2, 1, 0>, + ConvImpl::template process_tile<0, 
1, 2, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + }, // Input pad bottom = 2 + }, // Input pad left = 1 + }, // Input pad top = 0 + { // Input pad top = 1 + { // Input pad left = 0 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 0, 0, 0, 0>, + ConvImpl::template process_tile<1, 0, 0, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 0, 0, 1, 0>, + ConvImpl::template process_tile<1, 0, 0, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 0, 1, 0, 0>, + ConvImpl::template process_tile<1, 0, 0, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 0, 1, 1, 0>, + ConvImpl::template process_tile<1, 0, 0, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 0, 2, 0, 0>, + ConvImpl::template process_tile<1, 0, 0, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 0, 2, 1, 0>, + ConvImpl::template process_tile<1, 0, 0, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 1, 0, 0, 0>, + ConvImpl::template process_tile<1, 0, 1, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 1, 0, 1, 0>, + ConvImpl::template process_tile<1, 0, 1, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 1, 1, 0, 0>, + ConvImpl::template process_tile<1, 0, 1, 1, 0, 1>, + }, 
// Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 1, 1, 1, 0>, + ConvImpl::template process_tile<1, 0, 1, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 1, 2, 0, 0>, + ConvImpl::template process_tile<1, 0, 1, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 1, 2, 1, 0>, + ConvImpl::template process_tile<1, 0, 1, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 2, 0, 0, 0>, + ConvImpl::template process_tile<1, 0, 2, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 2, 0, 1, 0>, + ConvImpl::template process_tile<1, 0, 2, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 2, 1, 0, 0>, + ConvImpl::template process_tile<1, 0, 2, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 2, 1, 1, 0>, + ConvImpl::template process_tile<1, 0, 2, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 2, 2, 0, 0>, + ConvImpl::template process_tile<1, 0, 2, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 2, 2, 1, 0>, + ConvImpl::template process_tile<1, 0, 2, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + }, // Input pad bottom = 2 + }, // Input pad left = 0 + { // Input pad left = 1 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + 
ConvImpl::template process_tile<1, 1, 0, 0, 0, 0>, + ConvImpl::template process_tile<1, 1, 0, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 0, 0, 1, 0>, + ConvImpl::template process_tile<1, 1, 0, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 0, 1, 0, 0>, + ConvImpl::template process_tile<1, 1, 0, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 0, 1, 1, 0>, + ConvImpl::template process_tile<1, 1, 0, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 0, 2, 0, 0>, + ConvImpl::template process_tile<1, 1, 0, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 0, 2, 1, 0>, + ConvImpl::template process_tile<1, 1, 0, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 1, 0, 0, 0>, + ConvImpl::template process_tile<1, 1, 1, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 1, 0, 1, 0>, + ConvImpl::template process_tile<1, 1, 1, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 1, 1, 0, 0>, + ConvImpl::template process_tile<1, 1, 1, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 1, 1, 1, 0>, + ConvImpl::template process_tile<1, 1, 1, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + 
ConvImpl::template process_tile<1, 1, 1, 2, 0, 0>, + ConvImpl::template process_tile<1, 1, 1, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 1, 2, 1, 0>, + ConvImpl::template process_tile<1, 1, 1, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 2, 0, 0, 0>, + ConvImpl::template process_tile<1, 1, 2, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 2, 0, 1, 0>, + ConvImpl::template process_tile<1, 1, 2, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 2, 1, 0, 0>, + ConvImpl::template process_tile<1, 1, 2, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 2, 1, 1, 0>, + ConvImpl::template process_tile<1, 1, 2, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 2, 2, 0, 0>, + ConvImpl::template process_tile<1, 1, 2, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 2, 2, 1, 0>, + ConvImpl::template process_tile<1, 1, 2, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + }, // Input pad bottom = 2 + }, // Input pad left = 1 + }, // Input pad top = 1 +}; + + +template class DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float>; +} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp new file mode 100644 index 0000000000..0ec5a77475 --- /dev/null +++ 
b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp @@ -0,0 +1,1095 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp" + +namespace depthwise +{ +using Conv = DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float>; +using ConvImpl = DepthwiseConvolutionImpl<2, 2, 3, 3, 2, 2, float, float>; + +template <> +const Conv::TileFn Conv::tile_fns + [max_in_pad_top] + [max_in_pad_left] + [max_in_pad_bottom] + [max_in_pad_right] + [max_out_pad_bottom] + [max_out_pad_right] = { + { // Input pad top = 0 + { // Input pad left = 0 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 0, 0, 0>, + Conv::template process_tile<0, 0, 0, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 0, 1, 0>, + Conv::template process_tile<0, 0, 0, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 1, 0, 0>, + Conv::template process_tile<0, 0, 0, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 1, 1, 0>, + Conv::template process_tile<0, 0, 0, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 2, 0, 0>, + Conv::template process_tile<0, 0, 0, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 2, 1, 0>, + Conv::template process_tile<0, 0, 0, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 3, 0, 0>, + Conv::template process_tile<0, 0, 0, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 3, 1, 0>, + Conv::template process_tile<0, 0, 0, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { 
// Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 4, 0, 0>, + Conv::template process_tile<0, 0, 0, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 4, 1, 0>, + Conv::template process_tile<0, 0, 0, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 0, 0, 0>, + Conv::template process_tile<0, 0, 1, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 0, 1, 0>, + Conv::template process_tile<0, 0, 1, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 1, 0, 0>, + Conv::template process_tile<0, 0, 1, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 1, 1, 0>, + Conv::template process_tile<0, 0, 1, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 2, 0, 0>, + Conv::template process_tile<0, 0, 1, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 2, 1, 0>, + Conv::template process_tile<0, 0, 1, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 3, 0, 0>, + Conv::template process_tile<0, 0, 1, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 3, 1, 0>, + Conv::template process_tile<0, 0, 1, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 4, 0, 
0>, + Conv::template process_tile<0, 0, 1, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 4, 1, 0>, + Conv::template process_tile<0, 0, 1, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 0, 0, 0>, + Conv::template process_tile<0, 0, 2, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 0, 1, 0>, + Conv::template process_tile<0, 0, 2, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 1, 0, 0>, + Conv::template process_tile<0, 0, 2, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 1, 1, 0>, + Conv::template process_tile<0, 0, 2, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 2, 0, 0>, + Conv::template process_tile<0, 0, 2, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 2, 1, 0>, + Conv::template process_tile<0, 0, 2, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 3, 0, 0>, + Conv::template process_tile<0, 0, 2, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 3, 1, 0>, + Conv::template process_tile<0, 0, 2, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 4, 0, 0>, + Conv::template process_tile<0, 0, 2, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output 
pad bottom = 1 + Conv::template process_tile<0, 0, 2, 4, 1, 0>, + Conv::template process_tile<0, 0, 2, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 0, 0, 0>, + Conv::template process_tile<0, 0, 3, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 0, 1, 0>, + Conv::template process_tile<0, 0, 3, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 1, 0, 0>, + Conv::template process_tile<0, 0, 3, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 1, 1, 0>, + Conv::template process_tile<0, 0, 3, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 2, 0, 0>, + Conv::template process_tile<0, 0, 3, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 2, 1, 0>, + Conv::template process_tile<0, 0, 3, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 3, 0, 0>, + Conv::template process_tile<0, 0, 3, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 3, 1, 0>, + Conv::template process_tile<0, 0, 3, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 4, 0, 0>, + Conv::template process_tile<0, 0, 3, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 4, 1, 0>, + Conv::template process_tile<0, 
0, 3, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 3 + { // Input pad bottom = 4 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 4, 0, 0, 0>, + Conv::template process_tile<0, 0, 4, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 0, 1, 0>, + Conv::template process_tile<0, 0, 4, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 4, 1, 0, 0>, + Conv::template process_tile<0, 0, 4, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 1, 1, 0>, + Conv::template process_tile<0, 0, 4, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 4, 2, 0, 0>, + Conv::template process_tile<0, 0, 4, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 2, 1, 0>, + Conv::template process_tile<0, 0, 4, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 4, 3, 0, 0>, + Conv::template process_tile<0, 0, 4, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 3, 1, 0>, + Conv::template process_tile<0, 0, 4, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 4, 4, 0, 0>, + Conv::template process_tile<0, 0, 4, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 4, 1, 0>, + Conv::template process_tile<0, 0, 4, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad 
bottom = 4 + }, // Input pad left = 0 + { // Input pad left = 1 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 0, 0, 0>, + Conv::template process_tile<0, 1, 0, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 0, 1, 0>, + Conv::template process_tile<0, 1, 0, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 1, 0, 0>, + Conv::template process_tile<0, 1, 0, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 1, 1, 0>, + Conv::template process_tile<0, 1, 0, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 2, 0, 0>, + Conv::template process_tile<0, 1, 0, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 2, 1, 0>, + Conv::template process_tile<0, 1, 0, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 3, 0, 0>, + Conv::template process_tile<0, 1, 0, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 3, 1, 0>, + Conv::template process_tile<0, 1, 0, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 4, 0, 0>, + Conv::template process_tile<0, 1, 0, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 4, 1, 0>, + Conv::template process_tile<0, 1, 0, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { 
// Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 0, 0, 0>, + Conv::template process_tile<0, 1, 1, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 0, 1, 0>, + Conv::template process_tile<0, 1, 1, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 1, 0, 0>, + Conv::template process_tile<0, 1, 1, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 1, 1, 0>, + Conv::template process_tile<0, 1, 1, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 2, 0, 0>, + Conv::template process_tile<0, 1, 1, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 2, 1, 0>, + Conv::template process_tile<0, 1, 1, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 3, 0, 0>, + Conv::template process_tile<0, 1, 1, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 3, 1, 0>, + Conv::template process_tile<0, 1, 1, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 4, 0, 0>, + Conv::template process_tile<0, 1, 1, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 4, 1, 0>, + Conv::template process_tile<0, 1, 1, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 0, 0, 
0>, + Conv::template process_tile<0, 1, 2, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 0, 1, 0>, + Conv::template process_tile<0, 1, 2, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 1, 0, 0>, + Conv::template process_tile<0, 1, 2, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 1, 1, 0>, + Conv::template process_tile<0, 1, 2, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 2, 0, 0>, + Conv::template process_tile<0, 1, 2, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 2, 1, 0>, + Conv::template process_tile<0, 1, 2, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 3, 0, 0>, + Conv::template process_tile<0, 1, 2, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 3, 1, 0>, + Conv::template process_tile<0, 1, 2, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 4, 0, 0>, + Conv::template process_tile<0, 1, 2, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 4, 1, 0>, + Conv::template process_tile<0, 1, 2, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 0, 0, 0>, + Conv::template process_tile<0, 1, 3, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output 
pad bottom = 1 + Conv::template process_tile<0, 1, 3, 0, 1, 0>, + Conv::template process_tile<0, 1, 3, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 1, 0, 0>, + Conv::template process_tile<0, 1, 3, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 1, 1, 0>, + Conv::template process_tile<0, 1, 3, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 2, 0, 0>, + Conv::template process_tile<0, 1, 3, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 2, 1, 0>, + Conv::template process_tile<0, 1, 3, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 3, 0, 0>, + Conv::template process_tile<0, 1, 3, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 3, 1, 0>, + Conv::template process_tile<0, 1, 3, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 4, 0, 0>, + Conv::template process_tile<0, 1, 3, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 4, 1, 0>, + Conv::template process_tile<0, 1, 3, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 3 + { // Input pad bottom = 4 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 0, 0, 0>, + Conv::template process_tile<0, 1, 4, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 0, 1, 0>, + Conv::template process_tile<0, 
1, 4, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 1, 0, 0>, + Conv::template process_tile<0, 1, 4, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 1, 1, 0>, + Conv::template process_tile<0, 1, 4, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 2, 0, 0>, + Conv::template process_tile<0, 1, 4, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 2, 1, 0>, + Conv::template process_tile<0, 1, 4, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 3, 0, 0>, + Conv::template process_tile<0, 1, 4, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 3, 1, 0>, + Conv::template process_tile<0, 1, 4, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 4, 0, 0>, + Conv::template process_tile<0, 1, 4, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 4, 1, 0>, + Conv::template process_tile<0, 1, 4, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 4 + }, // Input pad left = 1 + }, // Input pad top = 0 + { // Input pad top = 1 + { // Input pad left = 0 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 0, 0, 0>, + Conv::template process_tile<1, 0, 0, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 0, 1, 0>, + Conv::template 
process_tile<1, 0, 0, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 1, 0, 0>, + Conv::template process_tile<1, 0, 0, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 1, 1, 0>, + Conv::template process_tile<1, 0, 0, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 2, 0, 0>, + Conv::template process_tile<1, 0, 0, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 2, 1, 0>, + Conv::template process_tile<1, 0, 0, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 3, 0, 0>, + Conv::template process_tile<1, 0, 0, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 3, 1, 0>, + Conv::template process_tile<1, 0, 0, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 4, 0, 0>, + Conv::template process_tile<1, 0, 0, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 4, 1, 0>, + Conv::template process_tile<1, 0, 0, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 0, 0, 0>, + Conv::template process_tile<1, 0, 1, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 0, 1, 0>, + Conv::template process_tile<1, 0, 1, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // 
Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 1, 0, 0>, + Conv::template process_tile<1, 0, 1, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 1, 1, 0>, + Conv::template process_tile<1, 0, 1, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 2, 0, 0>, + Conv::template process_tile<1, 0, 1, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 2, 1, 0>, + Conv::template process_tile<1, 0, 1, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 3, 0, 0>, + Conv::template process_tile<1, 0, 1, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 3, 1, 0>, + Conv::template process_tile<1, 0, 1, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 4, 0, 0>, + Conv::template process_tile<1, 0, 1, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 4, 1, 0>, + Conv::template process_tile<1, 0, 1, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 0, 0, 0>, + Conv::template process_tile<1, 0, 2, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 0, 1, 0>, + Conv::template process_tile<1, 0, 2, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 1, 0, 0>, 
+ Conv::template process_tile<1, 0, 2, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 1, 1, 0>, + Conv::template process_tile<1, 0, 2, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 2, 0, 0>, + Conv::template process_tile<1, 0, 2, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 2, 1, 0>, + Conv::template process_tile<1, 0, 2, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 3, 0, 0>, + Conv::template process_tile<1, 0, 2, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 3, 1, 0>, + Conv::template process_tile<1, 0, 2, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 4, 0, 0>, + Conv::template process_tile<1, 0, 2, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 4, 1, 0>, + Conv::template process_tile<1, 0, 2, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 0, 0, 0>, + Conv::template process_tile<1, 0, 3, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 0, 1, 0>, + Conv::template process_tile<1, 0, 3, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 1, 0, 0>, + Conv::template process_tile<1, 0, 3, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad 
bottom = 1 + Conv::template process_tile<1, 0, 3, 1, 1, 0>, + Conv::template process_tile<1, 0, 3, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 2, 0, 0>, + Conv::template process_tile<1, 0, 3, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 2, 1, 0>, + Conv::template process_tile<1, 0, 3, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 3, 0, 0>, + Conv::template process_tile<1, 0, 3, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 3, 1, 0>, + Conv::template process_tile<1, 0, 3, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 4, 0, 0>, + Conv::template process_tile<1, 0, 3, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 4, 1, 0>, + Conv::template process_tile<1, 0, 3, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 3 + { // Input pad bottom = 4 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 0, 0, 0>, + Conv::template process_tile<1, 0, 4, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 0, 1, 0>, + Conv::template process_tile<1, 0, 4, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 1, 0, 0>, + Conv::template process_tile<1, 0, 4, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 1, 1, 0>, + Conv::template process_tile<1, 0, 
4, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 2, 0, 0>, + Conv::template process_tile<1, 0, 4, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 2, 1, 0>, + Conv::template process_tile<1, 0, 4, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 3, 0, 0>, + Conv::template process_tile<1, 0, 4, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 3, 1, 0>, + Conv::template process_tile<1, 0, 4, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 4, 0, 0>, + Conv::template process_tile<1, 0, 4, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 4, 1, 0>, + Conv::template process_tile<1, 0, 4, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 4 + }, // Input pad left = 0 + { // Input pad left = 1 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 0, 0, 0>, + Conv::template process_tile<1, 1, 0, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 0, 1, 0>, + Conv::template process_tile<1, 1, 0, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 1, 0, 0>, + Conv::template process_tile<1, 1, 0, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 1, 1, 0>, + Conv::template process_tile<1, 1, 0, 1, 1, 1>, + }, // Output pad bottom = 1 + 
}, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 2, 0, 0>, + Conv::template process_tile<1, 1, 0, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 2, 1, 0>, + Conv::template process_tile<1, 1, 0, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 3, 0, 0>, + Conv::template process_tile<1, 1, 0, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 3, 1, 0>, + Conv::template process_tile<1, 1, 0, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 4, 0, 0>, + Conv::template process_tile<1, 1, 0, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 4, 1, 0>, + Conv::template process_tile<1, 1, 0, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 0, 0, 0>, + Conv::template process_tile<1, 1, 1, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 0, 1, 0>, + Conv::template process_tile<1, 1, 1, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 1, 0, 0>, + Conv::template process_tile<1, 1, 1, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 1, 1, 0>, + Conv::template process_tile<1, 1, 1, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + 
Conv::template process_tile<1, 1, 1, 2, 0, 0>, + Conv::template process_tile<1, 1, 1, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 2, 1, 0>, + Conv::template process_tile<1, 1, 1, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 3, 0, 0>, + Conv::template process_tile<1, 1, 1, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 3, 1, 0>, + Conv::template process_tile<1, 1, 1, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 4, 0, 0>, + Conv::template process_tile<1, 1, 1, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 4, 1, 0>, + Conv::template process_tile<1, 1, 1, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 0, 0, 0>, + Conv::template process_tile<1, 1, 2, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 0, 1, 0>, + Conv::template process_tile<1, 1, 2, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 1, 0, 0>, + Conv::template process_tile<1, 1, 2, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 1, 1, 0>, + Conv::template process_tile<1, 1, 2, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 2, 0, 0>, + Conv::template process_tile<1, 1, 2, 2, 0, 1>, + 
}, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 2, 1, 0>, + Conv::template process_tile<1, 1, 2, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 3, 0, 0>, + Conv::template process_tile<1, 1, 2, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 3, 1, 0>, + Conv::template process_tile<1, 1, 2, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 4, 0, 0>, + Conv::template process_tile<1, 1, 2, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 4, 1, 0>, + Conv::template process_tile<1, 1, 2, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 0, 0, 0>, + Conv::template process_tile<1, 1, 3, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 0, 1, 0>, + Conv::template process_tile<1, 1, 3, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 1, 0, 0>, + Conv::template process_tile<1, 1, 3, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 1, 1, 0>, + Conv::template process_tile<1, 1, 3, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 2, 0, 0>, + Conv::template process_tile<1, 1, 3, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 
2, 1, 0>, + Conv::template process_tile<1, 1, 3, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 3, 0, 0>, + Conv::template process_tile<1, 1, 3, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 3, 1, 0>, + Conv::template process_tile<1, 1, 3, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 4, 0, 0>, + Conv::template process_tile<1, 1, 3, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 4, 1, 0>, + Conv::template process_tile<1, 1, 3, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 3 + { // Input pad bottom = 4 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 0, 0, 0>, + Conv::template process_tile<1, 1, 4, 0, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 4, 0, 1, 0>, + Conv::template process_tile<1, 1, 4, 0, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 1, 0, 0>, + Conv::template process_tile<1, 1, 4, 1, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 4, 1, 1, 0>, + Conv::template process_tile<1, 1, 4, 1, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 2, 0, 0>, + Conv::template process_tile<1, 1, 4, 2, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 4, 2, 1, 0>, + Conv::template process_tile<1, 1, 4, 2, 1, 1>, + }, // Output pad bottom = 1 + }, // 
Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 3, 0, 0>, + Conv::template process_tile<1, 1, 4, 3, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 4, 3, 1, 0>, + Conv::template process_tile<1, 1, 4, 3, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 4, 0, 0>, + Conv::template process_tile<1, 1, 4, 4, 0, 1>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 4, 4, 1, 0>, + Conv::template process_tile<1, 1, 4, 4, 1, 1>, + }, // Output pad bottom = 1 + }, // Input pad right = 4 + }, // Input pad bottom = 4 + }, // Input pad left = 1 + }, // Input pad top = 1 +}; + + +template class DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float>; +} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp new file mode 100644 index 0000000000..dc3c383f99 --- /dev/null +++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp @@ -0,0 +1,1175 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp" + +namespace depthwise +{ +using Conv = DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float>; +using ConvImpl = DepthwiseConvolutionImpl<3, 3, 3, 3, 1, 1, float, float>; + +template <> +const Conv::TileFn Conv::tile_fns + [max_in_pad_top] + [max_in_pad_left] + [max_in_pad_bottom] + [max_in_pad_right] + [max_out_pad_bottom] + [max_out_pad_right] = { + { // Input pad top = 0 + { // Input pad left = 0 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 0, 0, 0>, + Conv::template process_tile<0, 0, 0, 0, 0, 1>, + Conv::template process_tile<0, 0, 0, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 0, 1, 0>, + Conv::template process_tile<0, 0, 0, 0, 1, 1>, + Conv::template process_tile<0, 0, 0, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 0, 0, 2, 0>, + Conv::template process_tile<0, 0, 0, 0, 2, 1>, + Conv::template process_tile<0, 0, 0, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 1, 0, 0>, + Conv::template process_tile<0, 0, 0, 1, 0, 1>, + Conv::template process_tile<0, 0, 0, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 1, 1, 0>, + 
Conv::template process_tile<0, 0, 0, 1, 1, 1>, + Conv::template process_tile<0, 0, 0, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 0, 1, 2, 0>, + Conv::template process_tile<0, 0, 0, 1, 2, 1>, + Conv::template process_tile<0, 0, 0, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 2, 0, 0>, + Conv::template process_tile<0, 0, 0, 2, 0, 1>, + Conv::template process_tile<0, 0, 0, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 2, 1, 0>, + Conv::template process_tile<0, 0, 0, 2, 1, 1>, + Conv::template process_tile<0, 0, 0, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 0, 2, 2, 0>, + Conv::template process_tile<0, 0, 0, 2, 2, 1>, + Conv::template process_tile<0, 0, 0, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 3, 0, 0>, + Conv::template process_tile<0, 0, 0, 3, 0, 1>, + Conv::template process_tile<0, 0, 0, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 3, 1, 0>, + Conv::template process_tile<0, 0, 0, 3, 1, 1>, + Conv::template process_tile<0, 0, 0, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 0, 3, 2, 0>, + Conv::template process_tile<0, 0, 0, 3, 2, 1>, + Conv::template process_tile<0, 0, 0, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 0, 0, 0>, + Conv::template process_tile<0, 0, 1, 0, 0, 1>, + Conv::template process_tile<0, 0, 1, 0, 0, 2>, + }, // Output pad bottom = 0 + { 
// Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 0, 1, 0>, + Conv::template process_tile<0, 0, 1, 0, 1, 1>, + Conv::template process_tile<0, 0, 1, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 1, 0, 2, 0>, + Conv::template process_tile<0, 0, 1, 0, 2, 1>, + Conv::template process_tile<0, 0, 1, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 1, 0, 0>, + Conv::template process_tile<0, 0, 1, 1, 0, 1>, + Conv::template process_tile<0, 0, 1, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 1, 1, 0>, + Conv::template process_tile<0, 0, 1, 1, 1, 1>, + Conv::template process_tile<0, 0, 1, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 1, 1, 2, 0>, + Conv::template process_tile<0, 0, 1, 1, 2, 1>, + Conv::template process_tile<0, 0, 1, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 2, 0, 0>, + Conv::template process_tile<0, 0, 1, 2, 0, 1>, + Conv::template process_tile<0, 0, 1, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 2, 1, 0>, + Conv::template process_tile<0, 0, 1, 2, 1, 1>, + Conv::template process_tile<0, 0, 1, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 1, 2, 2, 0>, + Conv::template process_tile<0, 0, 1, 2, 2, 1>, + Conv::template process_tile<0, 0, 1, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 3, 0, 0>, + Conv::template process_tile<0, 0, 1, 3, 0, 1>, + Conv::template process_tile<0, 0, 1, 3, 0, 2>, + }, // Output 
pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 3, 1, 0>, + Conv::template process_tile<0, 0, 1, 3, 1, 1>, + Conv::template process_tile<0, 0, 1, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 1, 3, 2, 0>, + Conv::template process_tile<0, 0, 1, 3, 2, 1>, + Conv::template process_tile<0, 0, 1, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 0, 0, 0>, + Conv::template process_tile<0, 0, 2, 0, 0, 1>, + Conv::template process_tile<0, 0, 2, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 0, 1, 0>, + Conv::template process_tile<0, 0, 2, 0, 1, 1>, + Conv::template process_tile<0, 0, 2, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 2, 0, 2, 0>, + Conv::template process_tile<0, 0, 2, 0, 2, 1>, + Conv::template process_tile<0, 0, 2, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 1, 0, 0>, + Conv::template process_tile<0, 0, 2, 1, 0, 1>, + Conv::template process_tile<0, 0, 2, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 1, 1, 0>, + Conv::template process_tile<0, 0, 2, 1, 1, 1>, + Conv::template process_tile<0, 0, 2, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 2, 1, 2, 0>, + Conv::template process_tile<0, 0, 2, 1, 2, 1>, + Conv::template process_tile<0, 0, 2, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 2, 0, 0>, + Conv::template process_tile<0, 0, 
2, 2, 0, 1>, + Conv::template process_tile<0, 0, 2, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 2, 1, 0>, + Conv::template process_tile<0, 0, 2, 2, 1, 1>, + Conv::template process_tile<0, 0, 2, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 2, 2, 2, 0>, + Conv::template process_tile<0, 0, 2, 2, 2, 1>, + Conv::template process_tile<0, 0, 2, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 3, 0, 0>, + Conv::template process_tile<0, 0, 2, 3, 0, 1>, + Conv::template process_tile<0, 0, 2, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 3, 1, 0>, + Conv::template process_tile<0, 0, 2, 3, 1, 1>, + Conv::template process_tile<0, 0, 2, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 2, 3, 2, 0>, + Conv::template process_tile<0, 0, 2, 3, 2, 1>, + Conv::template process_tile<0, 0, 2, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 0, 0, 0>, + Conv::template process_tile<0, 0, 3, 0, 0, 1>, + Conv::template process_tile<0, 0, 3, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 0, 1, 0>, + Conv::template process_tile<0, 0, 3, 0, 1, 1>, + Conv::template process_tile<0, 0, 3, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 3, 0, 2, 0>, + Conv::template process_tile<0, 0, 3, 0, 2, 1>, + Conv::template process_tile<0, 0, 3, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + 
Conv::template process_tile<0, 0, 3, 1, 0, 0>, + Conv::template process_tile<0, 0, 3, 1, 0, 1>, + Conv::template process_tile<0, 0, 3, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 1, 1, 0>, + Conv::template process_tile<0, 0, 3, 1, 1, 1>, + Conv::template process_tile<0, 0, 3, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 3, 1, 2, 0>, + Conv::template process_tile<0, 0, 3, 1, 2, 1>, + Conv::template process_tile<0, 0, 3, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 2, 0, 0>, + Conv::template process_tile<0, 0, 3, 2, 0, 1>, + Conv::template process_tile<0, 0, 3, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 2, 1, 0>, + Conv::template process_tile<0, 0, 3, 2, 1, 1>, + Conv::template process_tile<0, 0, 3, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 3, 2, 2, 0>, + Conv::template process_tile<0, 0, 3, 2, 2, 1>, + Conv::template process_tile<0, 0, 3, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 3, 0, 0>, + Conv::template process_tile<0, 0, 3, 3, 0, 1>, + Conv::template process_tile<0, 0, 3, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 3, 1, 0>, + Conv::template process_tile<0, 0, 3, 3, 1, 1>, + Conv::template process_tile<0, 0, 3, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 3, 3, 2, 0>, + Conv::template process_tile<0, 0, 3, 3, 2, 1>, + Conv::template process_tile<0, 0, 3, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + }, // Input pad bottom = 3 + }, // 
Input pad left = 0 + { // Input pad left = 1 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 0, 0, 0>, + Conv::template process_tile<0, 1, 0, 0, 0, 1>, + Conv::template process_tile<0, 1, 0, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 0, 1, 0>, + Conv::template process_tile<0, 1, 0, 0, 1, 1>, + Conv::template process_tile<0, 1, 0, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 0, 0, 2, 0>, + Conv::template process_tile<0, 1, 0, 0, 2, 1>, + Conv::template process_tile<0, 1, 0, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 1, 0, 0>, + Conv::template process_tile<0, 1, 0, 1, 0, 1>, + Conv::template process_tile<0, 1, 0, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 1, 1, 0>, + Conv::template process_tile<0, 1, 0, 1, 1, 1>, + Conv::template process_tile<0, 1, 0, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 0, 1, 2, 0>, + Conv::template process_tile<0, 1, 0, 1, 2, 1>, + Conv::template process_tile<0, 1, 0, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 2, 0, 0>, + Conv::template process_tile<0, 1, 0, 2, 0, 1>, + Conv::template process_tile<0, 1, 0, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 2, 1, 0>, + Conv::template process_tile<0, 1, 0, 2, 1, 1>, + Conv::template process_tile<0, 1, 0, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 0, 2, 2, 0>, + Conv::template process_tile<0, 1, 0, 2, 2, 1>, + 
Conv::template process_tile<0, 1, 0, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 3, 0, 0>, + Conv::template process_tile<0, 1, 0, 3, 0, 1>, + Conv::template process_tile<0, 1, 0, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 3, 1, 0>, + Conv::template process_tile<0, 1, 0, 3, 1, 1>, + Conv::template process_tile<0, 1, 0, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 0, 3, 2, 0>, + Conv::template process_tile<0, 1, 0, 3, 2, 1>, + Conv::template process_tile<0, 1, 0, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 0, 0, 0>, + Conv::template process_tile<0, 1, 1, 0, 0, 1>, + Conv::template process_tile<0, 1, 1, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 0, 1, 0>, + Conv::template process_tile<0, 1, 1, 0, 1, 1>, + Conv::template process_tile<0, 1, 1, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 1, 0, 2, 0>, + Conv::template process_tile<0, 1, 1, 0, 2, 1>, + Conv::template process_tile<0, 1, 1, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 1, 0, 0>, + Conv::template process_tile<0, 1, 1, 1, 0, 1>, + Conv::template process_tile<0, 1, 1, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 1, 1, 0>, + Conv::template process_tile<0, 1, 1, 1, 1, 1>, + Conv::template process_tile<0, 1, 1, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template 
process_tile<0, 1, 1, 1, 2, 0>, + Conv::template process_tile<0, 1, 1, 1, 2, 1>, + Conv::template process_tile<0, 1, 1, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 2, 0, 0>, + Conv::template process_tile<0, 1, 1, 2, 0, 1>, + Conv::template process_tile<0, 1, 1, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 2, 1, 0>, + Conv::template process_tile<0, 1, 1, 2, 1, 1>, + Conv::template process_tile<0, 1, 1, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 1, 2, 2, 0>, + Conv::template process_tile<0, 1, 1, 2, 2, 1>, + Conv::template process_tile<0, 1, 1, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 3, 0, 0>, + Conv::template process_tile<0, 1, 1, 3, 0, 1>, + Conv::template process_tile<0, 1, 1, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 3, 1, 0>, + Conv::template process_tile<0, 1, 1, 3, 1, 1>, + Conv::template process_tile<0, 1, 1, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 1, 3, 2, 0>, + Conv::template process_tile<0, 1, 1, 3, 2, 1>, + Conv::template process_tile<0, 1, 1, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 0, 0, 0>, + Conv::template process_tile<0, 1, 2, 0, 0, 1>, + Conv::template process_tile<0, 1, 2, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 0, 1, 0>, + Conv::template process_tile<0, 1, 2, 0, 1, 1>, + Conv::template process_tile<0, 1, 2, 0, 1, 2>, 
+ }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 2, 0, 2, 0>, + Conv::template process_tile<0, 1, 2, 0, 2, 1>, + Conv::template process_tile<0, 1, 2, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 1, 0, 0>, + Conv::template process_tile<0, 1, 2, 1, 0, 1>, + Conv::template process_tile<0, 1, 2, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 1, 1, 0>, + Conv::template process_tile<0, 1, 2, 1, 1, 1>, + Conv::template process_tile<0, 1, 2, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 2, 1, 2, 0>, + Conv::template process_tile<0, 1, 2, 1, 2, 1>, + Conv::template process_tile<0, 1, 2, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 2, 0, 0>, + Conv::template process_tile<0, 1, 2, 2, 0, 1>, + Conv::template process_tile<0, 1, 2, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 2, 1, 0>, + Conv::template process_tile<0, 1, 2, 2, 1, 1>, + Conv::template process_tile<0, 1, 2, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 2, 2, 2, 0>, + Conv::template process_tile<0, 1, 2, 2, 2, 1>, + Conv::template process_tile<0, 1, 2, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 3, 0, 0>, + Conv::template process_tile<0, 1, 2, 3, 0, 1>, + Conv::template process_tile<0, 1, 2, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 3, 1, 0>, + Conv::template process_tile<0, 1, 2, 3, 1, 1>, + Conv::template 
process_tile<0, 1, 2, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 2, 3, 2, 0>, + Conv::template process_tile<0, 1, 2, 3, 2, 1>, + Conv::template process_tile<0, 1, 2, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 0, 0, 0>, + Conv::template process_tile<0, 1, 3, 0, 0, 1>, + Conv::template process_tile<0, 1, 3, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 0, 1, 0>, + Conv::template process_tile<0, 1, 3, 0, 1, 1>, + Conv::template process_tile<0, 1, 3, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 3, 0, 2, 0>, + Conv::template process_tile<0, 1, 3, 0, 2, 1>, + Conv::template process_tile<0, 1, 3, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 1, 0, 0>, + Conv::template process_tile<0, 1, 3, 1, 0, 1>, + Conv::template process_tile<0, 1, 3, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 1, 1, 0>, + Conv::template process_tile<0, 1, 3, 1, 1, 1>, + Conv::template process_tile<0, 1, 3, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 3, 1, 2, 0>, + Conv::template process_tile<0, 1, 3, 1, 2, 1>, + Conv::template process_tile<0, 1, 3, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 2, 0, 0>, + Conv::template process_tile<0, 1, 3, 2, 0, 1>, + Conv::template process_tile<0, 1, 3, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 
2, 1, 0>, + Conv::template process_tile<0, 1, 3, 2, 1, 1>, + Conv::template process_tile<0, 1, 3, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 3, 2, 2, 0>, + Conv::template process_tile<0, 1, 3, 2, 2, 1>, + Conv::template process_tile<0, 1, 3, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 3, 0, 0>, + Conv::template process_tile<0, 1, 3, 3, 0, 1>, + Conv::template process_tile<0, 1, 3, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 3, 1, 0>, + Conv::template process_tile<0, 1, 3, 3, 1, 1>, + Conv::template process_tile<0, 1, 3, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 3, 3, 2, 0>, + Conv::template process_tile<0, 1, 3, 3, 2, 1>, + Conv::template process_tile<0, 1, 3, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + }, // Input pad bottom = 3 + }, // Input pad left = 1 + }, // Input pad top = 0 + { // Input pad top = 1 + { // Input pad left = 0 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 0, 0, 0>, + Conv::template process_tile<1, 0, 0, 0, 0, 1>, + Conv::template process_tile<1, 0, 0, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 0, 1, 0>, + Conv::template process_tile<1, 0, 0, 0, 1, 1>, + Conv::template process_tile<1, 0, 0, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 0, 0, 2, 0>, + Conv::template process_tile<1, 0, 0, 0, 2, 1>, + Conv::template process_tile<1, 0, 0, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 1, 0, 0>, + 
Conv::template process_tile<1, 0, 0, 1, 0, 1>, + Conv::template process_tile<1, 0, 0, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 1, 1, 0>, + Conv::template process_tile<1, 0, 0, 1, 1, 1>, + Conv::template process_tile<1, 0, 0, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 0, 1, 2, 0>, + Conv::template process_tile<1, 0, 0, 1, 2, 1>, + Conv::template process_tile<1, 0, 0, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 2, 0, 0>, + Conv::template process_tile<1, 0, 0, 2, 0, 1>, + Conv::template process_tile<1, 0, 0, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 2, 1, 0>, + Conv::template process_tile<1, 0, 0, 2, 1, 1>, + Conv::template process_tile<1, 0, 0, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 0, 2, 2, 0>, + Conv::template process_tile<1, 0, 0, 2, 2, 1>, + Conv::template process_tile<1, 0, 0, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 3, 0, 0>, + Conv::template process_tile<1, 0, 0, 3, 0, 1>, + Conv::template process_tile<1, 0, 0, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 3, 1, 0>, + Conv::template process_tile<1, 0, 0, 3, 1, 1>, + Conv::template process_tile<1, 0, 0, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 0, 3, 2, 0>, + Conv::template process_tile<1, 0, 0, 3, 2, 1>, + Conv::template process_tile<1, 0, 0, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { 
// Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 0, 0, 0>, + Conv::template process_tile<1, 0, 1, 0, 0, 1>, + Conv::template process_tile<1, 0, 1, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 0, 1, 0>, + Conv::template process_tile<1, 0, 1, 0, 1, 1>, + Conv::template process_tile<1, 0, 1, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 1, 0, 2, 0>, + Conv::template process_tile<1, 0, 1, 0, 2, 1>, + Conv::template process_tile<1, 0, 1, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 1, 0, 0>, + Conv::template process_tile<1, 0, 1, 1, 0, 1>, + Conv::template process_tile<1, 0, 1, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 1, 1, 0>, + Conv::template process_tile<1, 0, 1, 1, 1, 1>, + Conv::template process_tile<1, 0, 1, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 1, 1, 2, 0>, + Conv::template process_tile<1, 0, 1, 1, 2, 1>, + Conv::template process_tile<1, 0, 1, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 2, 0, 0>, + Conv::template process_tile<1, 0, 1, 2, 0, 1>, + Conv::template process_tile<1, 0, 1, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 2, 1, 0>, + Conv::template process_tile<1, 0, 1, 2, 1, 1>, + Conv::template process_tile<1, 0, 1, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 1, 2, 2, 0>, + Conv::template process_tile<1, 0, 1, 2, 2, 1>, + Conv::template process_tile<1, 0, 1, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // 
Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 3, 0, 0>, + Conv::template process_tile<1, 0, 1, 3, 0, 1>, + Conv::template process_tile<1, 0, 1, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 3, 1, 0>, + Conv::template process_tile<1, 0, 1, 3, 1, 1>, + Conv::template process_tile<1, 0, 1, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 1, 3, 2, 0>, + Conv::template process_tile<1, 0, 1, 3, 2, 1>, + Conv::template process_tile<1, 0, 1, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 0, 0, 0>, + Conv::template process_tile<1, 0, 2, 0, 0, 1>, + Conv::template process_tile<1, 0, 2, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 0, 1, 0>, + Conv::template process_tile<1, 0, 2, 0, 1, 1>, + Conv::template process_tile<1, 0, 2, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 2, 0, 2, 0>, + Conv::template process_tile<1, 0, 2, 0, 2, 1>, + Conv::template process_tile<1, 0, 2, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 1, 0, 0>, + Conv::template process_tile<1, 0, 2, 1, 0, 1>, + Conv::template process_tile<1, 0, 2, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 1, 1, 0>, + Conv::template process_tile<1, 0, 2, 1, 1, 1>, + Conv::template process_tile<1, 0, 2, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 2, 1, 2, 0>, + Conv::template process_tile<1, 0, 2, 1, 2, 1>, + Conv::template process_tile<1, 0, 
2, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 2, 0, 0>, + Conv::template process_tile<1, 0, 2, 2, 0, 1>, + Conv::template process_tile<1, 0, 2, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 2, 1, 0>, + Conv::template process_tile<1, 0, 2, 2, 1, 1>, + Conv::template process_tile<1, 0, 2, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 2, 2, 2, 0>, + Conv::template process_tile<1, 0, 2, 2, 2, 1>, + Conv::template process_tile<1, 0, 2, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 3, 0, 0>, + Conv::template process_tile<1, 0, 2, 3, 0, 1>, + Conv::template process_tile<1, 0, 2, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 3, 1, 0>, + Conv::template process_tile<1, 0, 2, 3, 1, 1>, + Conv::template process_tile<1, 0, 2, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 2, 3, 2, 0>, + Conv::template process_tile<1, 0, 2, 3, 2, 1>, + Conv::template process_tile<1, 0, 2, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 0, 0, 0>, + Conv::template process_tile<1, 0, 3, 0, 0, 1>, + Conv::template process_tile<1, 0, 3, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 0, 1, 0>, + Conv::template process_tile<1, 0, 3, 0, 1, 1>, + Conv::template process_tile<1, 0, 3, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 3, 0, 2, 0>, + 
Conv::template process_tile<1, 0, 3, 0, 2, 1>, + Conv::template process_tile<1, 0, 3, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 1, 0, 0>, + Conv::template process_tile<1, 0, 3, 1, 0, 1>, + Conv::template process_tile<1, 0, 3, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 1, 1, 0>, + Conv::template process_tile<1, 0, 3, 1, 1, 1>, + Conv::template process_tile<1, 0, 3, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 3, 1, 2, 0>, + Conv::template process_tile<1, 0, 3, 1, 2, 1>, + Conv::template process_tile<1, 0, 3, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 2, 0, 0>, + Conv::template process_tile<1, 0, 3, 2, 0, 1>, + Conv::template process_tile<1, 0, 3, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 2, 1, 0>, + Conv::template process_tile<1, 0, 3, 2, 1, 1>, + Conv::template process_tile<1, 0, 3, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 3, 2, 2, 0>, + Conv::template process_tile<1, 0, 3, 2, 2, 1>, + Conv::template process_tile<1, 0, 3, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 3, 0, 0>, + Conv::template process_tile<1, 0, 3, 3, 0, 1>, + Conv::template process_tile<1, 0, 3, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 3, 1, 0>, + Conv::template process_tile<1, 0, 3, 3, 1, 1>, + Conv::template process_tile<1, 0, 3, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template 
process_tile<1, 0, 3, 3, 2, 0>, + Conv::template process_tile<1, 0, 3, 3, 2, 1>, + Conv::template process_tile<1, 0, 3, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + }, // Input pad bottom = 3 + }, // Input pad left = 0 + { // Input pad left = 1 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 0, 0, 0>, + Conv::template process_tile<1, 1, 0, 0, 0, 1>, + Conv::template process_tile<1, 1, 0, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 0, 1, 0>, + Conv::template process_tile<1, 1, 0, 0, 1, 1>, + Conv::template process_tile<1, 1, 0, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 0, 0, 2, 0>, + Conv::template process_tile<1, 1, 0, 0, 2, 1>, + Conv::template process_tile<1, 1, 0, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 1, 0, 0>, + Conv::template process_tile<1, 1, 0, 1, 0, 1>, + Conv::template process_tile<1, 1, 0, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 1, 1, 0>, + Conv::template process_tile<1, 1, 0, 1, 1, 1>, + Conv::template process_tile<1, 1, 0, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 0, 1, 2, 0>, + Conv::template process_tile<1, 1, 0, 1, 2, 1>, + Conv::template process_tile<1, 1, 0, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 2, 0, 0>, + Conv::template process_tile<1, 1, 0, 2, 0, 1>, + Conv::template process_tile<1, 1, 0, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 2, 1, 0>, + Conv::template process_tile<1, 1, 0, 2, 1, 
1>, + Conv::template process_tile<1, 1, 0, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 0, 2, 2, 0>, + Conv::template process_tile<1, 1, 0, 2, 2, 1>, + Conv::template process_tile<1, 1, 0, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 3, 0, 0>, + Conv::template process_tile<1, 1, 0, 3, 0, 1>, + Conv::template process_tile<1, 1, 0, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 3, 1, 0>, + Conv::template process_tile<1, 1, 0, 3, 1, 1>, + Conv::template process_tile<1, 1, 0, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 0, 3, 2, 0>, + Conv::template process_tile<1, 1, 0, 3, 2, 1>, + Conv::template process_tile<1, 1, 0, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 0, 0, 0>, + Conv::template process_tile<1, 1, 1, 0, 0, 1>, + Conv::template process_tile<1, 1, 1, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 0, 1, 0>, + Conv::template process_tile<1, 1, 1, 0, 1, 1>, + Conv::template process_tile<1, 1, 1, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 1, 0, 2, 0>, + Conv::template process_tile<1, 1, 1, 0, 2, 1>, + Conv::template process_tile<1, 1, 1, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 1, 0, 0>, + Conv::template process_tile<1, 1, 1, 1, 0, 1>, + Conv::template process_tile<1, 1, 1, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template 
process_tile<1, 1, 1, 1, 1, 0>, + Conv::template process_tile<1, 1, 1, 1, 1, 1>, + Conv::template process_tile<1, 1, 1, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 1, 1, 2, 0>, + Conv::template process_tile<1, 1, 1, 1, 2, 1>, + Conv::template process_tile<1, 1, 1, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 2, 0, 0>, + Conv::template process_tile<1, 1, 1, 2, 0, 1>, + Conv::template process_tile<1, 1, 1, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 2, 1, 0>, + Conv::template process_tile<1, 1, 1, 2, 1, 1>, + Conv::template process_tile<1, 1, 1, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 1, 2, 2, 0>, + Conv::template process_tile<1, 1, 1, 2, 2, 1>, + Conv::template process_tile<1, 1, 1, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 3, 0, 0>, + Conv::template process_tile<1, 1, 1, 3, 0, 1>, + Conv::template process_tile<1, 1, 1, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 3, 1, 0>, + Conv::template process_tile<1, 1, 1, 3, 1, 1>, + Conv::template process_tile<1, 1, 1, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 1, 3, 2, 0>, + Conv::template process_tile<1, 1, 1, 3, 2, 1>, + Conv::template process_tile<1, 1, 1, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 0, 0, 0>, + Conv::template process_tile<1, 1, 2, 0, 0, 1>, + Conv::template process_tile<1, 1, 2, 0, 0, 2>, 
+ }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 0, 1, 0>, + Conv::template process_tile<1, 1, 2, 0, 1, 1>, + Conv::template process_tile<1, 1, 2, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 2, 0, 2, 0>, + Conv::template process_tile<1, 1, 2, 0, 2, 1>, + Conv::template process_tile<1, 1, 2, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 1, 0, 0>, + Conv::template process_tile<1, 1, 2, 1, 0, 1>, + Conv::template process_tile<1, 1, 2, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 1, 1, 0>, + Conv::template process_tile<1, 1, 2, 1, 1, 1>, + Conv::template process_tile<1, 1, 2, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 2, 1, 2, 0>, + Conv::template process_tile<1, 1, 2, 1, 2, 1>, + Conv::template process_tile<1, 1, 2, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 2, 0, 0>, + Conv::template process_tile<1, 1, 2, 2, 0, 1>, + Conv::template process_tile<1, 1, 2, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 2, 1, 0>, + Conv::template process_tile<1, 1, 2, 2, 1, 1>, + Conv::template process_tile<1, 1, 2, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 2, 2, 2, 0>, + Conv::template process_tile<1, 1, 2, 2, 2, 1>, + Conv::template process_tile<1, 1, 2, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 3, 0, 0>, + Conv::template process_tile<1, 1, 2, 3, 0, 1>, + Conv::template 
process_tile<1, 1, 2, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 3, 1, 0>, + Conv::template process_tile<1, 1, 2, 3, 1, 1>, + Conv::template process_tile<1, 1, 2, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 2, 3, 2, 0>, + Conv::template process_tile<1, 1, 2, 3, 2, 1>, + Conv::template process_tile<1, 1, 2, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 0, 0, 0>, + Conv::template process_tile<1, 1, 3, 0, 0, 1>, + Conv::template process_tile<1, 1, 3, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 0, 1, 0>, + Conv::template process_tile<1, 1, 3, 0, 1, 1>, + Conv::template process_tile<1, 1, 3, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 3, 0, 2, 0>, + Conv::template process_tile<1, 1, 3, 0, 2, 1>, + Conv::template process_tile<1, 1, 3, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 1, 0, 0>, + Conv::template process_tile<1, 1, 3, 1, 0, 1>, + Conv::template process_tile<1, 1, 3, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 1, 1, 0>, + Conv::template process_tile<1, 1, 3, 1, 1, 1>, + Conv::template process_tile<1, 1, 3, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 3, 1, 2, 0>, + Conv::template process_tile<1, 1, 3, 1, 2, 1>, + Conv::template process_tile<1, 1, 3, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 
2, 0, 0>, + Conv::template process_tile<1, 1, 3, 2, 0, 1>, + Conv::template process_tile<1, 1, 3, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 2, 1, 0>, + Conv::template process_tile<1, 1, 3, 2, 1, 1>, + Conv::template process_tile<1, 1, 3, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 3, 2, 2, 0>, + Conv::template process_tile<1, 1, 3, 2, 2, 1>, + Conv::template process_tile<1, 1, 3, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 3, 0, 0>, + Conv::template process_tile<1, 1, 3, 3, 0, 1>, + Conv::template process_tile<1, 1, 3, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 3, 1, 0>, + Conv::template process_tile<1, 1, 3, 3, 1, 1>, + Conv::template process_tile<1, 1, 3, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 3, 3, 2, 0>, + Conv::template process_tile<1, 1, 3, 3, 2, 1>, + Conv::template process_tile<1, 1, 3, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + }, // Input pad bottom = 3 + }, // Input pad left = 1 + }, // Input pad top = 1 +}; + + +template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float>; +} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp new file mode 100644 index 0000000000..8d511b1a6c --- /dev/null +++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp @@ -0,0 +1,3443 @@ +/* + * Copyright (c) 2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp" + +namespace depthwise +{ +using Conv = DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>; +using ConvImpl = DepthwiseConvolutionImpl<3, 3, 3, 3, 2, 2, float, float>; + +template <> +const Conv::TileFn Conv::tile_fns + [max_in_pad_top] + [max_in_pad_left] + [max_in_pad_bottom] + [max_in_pad_right] + [max_out_pad_bottom] + [max_out_pad_right] = { + { // Input pad top = 0 + { // Input pad left = 0 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 0, 0, 0>, + Conv::template process_tile<0, 0, 0, 0, 0, 1>, + Conv::template process_tile<0, 0, 0, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 0, 1, 0>, + Conv::template process_tile<0, 0, 0, 0, 1, 1>, + Conv::template process_tile<0, 0, 0, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 0, 0, 2, 0>, + Conv::template process_tile<0, 0, 0, 0, 2, 1>, + Conv::template process_tile<0, 0, 0, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 1, 0, 0>, + Conv::template process_tile<0, 0, 0, 1, 0, 1>, + Conv::template process_tile<0, 0, 0, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 1, 1, 0>, + Conv::template process_tile<0, 0, 0, 1, 1, 1>, + Conv::template process_tile<0, 0, 0, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 0, 1, 2, 0>, + Conv::template process_tile<0, 0, 0, 1, 2, 1>, + Conv::template process_tile<0, 0, 0, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 2, 0, 0>, + Conv::template 
process_tile<0, 0, 0, 2, 0, 1>, + Conv::template process_tile<0, 0, 0, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 2, 1, 0>, + Conv::template process_tile<0, 0, 0, 2, 1, 1>, + Conv::template process_tile<0, 0, 0, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 0, 2, 2, 0>, + Conv::template process_tile<0, 0, 0, 2, 2, 1>, + Conv::template process_tile<0, 0, 0, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 3, 0, 0>, + Conv::template process_tile<0, 0, 0, 3, 0, 1>, + Conv::template process_tile<0, 0, 0, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 3, 1, 0>, + Conv::template process_tile<0, 0, 0, 3, 1, 1>, + Conv::template process_tile<0, 0, 0, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 0, 3, 2, 0>, + Conv::template process_tile<0, 0, 0, 3, 2, 1>, + Conv::template process_tile<0, 0, 0, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 4, 0, 0>, + Conv::template process_tile<0, 0, 0, 4, 0, 1>, + Conv::template process_tile<0, 0, 0, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 4, 1, 0>, + Conv::template process_tile<0, 0, 0, 4, 1, 1>, + Conv::template process_tile<0, 0, 0, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 0, 4, 2, 0>, + Conv::template process_tile<0, 0, 0, 4, 2, 1>, + Conv::template process_tile<0, 0, 0, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 5, 0, 
0>, + Conv::template process_tile<0, 0, 0, 5, 0, 1>, + Conv::template process_tile<0, 0, 0, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 5, 1, 0>, + Conv::template process_tile<0, 0, 0, 5, 1, 1>, + Conv::template process_tile<0, 0, 0, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 0, 5, 2, 0>, + Conv::template process_tile<0, 0, 0, 5, 2, 1>, + Conv::template process_tile<0, 0, 0, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 6, 0, 0>, + Conv::template process_tile<0, 0, 0, 6, 0, 1>, + Conv::template process_tile<0, 0, 0, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 6, 1, 0>, + Conv::template process_tile<0, 0, 0, 6, 1, 1>, + Conv::template process_tile<0, 0, 0, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 0, 6, 2, 0>, + Conv::template process_tile<0, 0, 0, 6, 2, 1>, + Conv::template process_tile<0, 0, 0, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 0, 0, 0>, + Conv::template process_tile<0, 0, 1, 0, 0, 1>, + Conv::template process_tile<0, 0, 1, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 0, 1, 0>, + Conv::template process_tile<0, 0, 1, 0, 1, 1>, + Conv::template process_tile<0, 0, 1, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 1, 0, 2, 0>, + Conv::template process_tile<0, 0, 1, 0, 2, 1>, + Conv::template process_tile<0, 0, 1, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 
1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 1, 0, 0>, + Conv::template process_tile<0, 0, 1, 1, 0, 1>, + Conv::template process_tile<0, 0, 1, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 1, 1, 0>, + Conv::template process_tile<0, 0, 1, 1, 1, 1>, + Conv::template process_tile<0, 0, 1, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 1, 1, 2, 0>, + Conv::template process_tile<0, 0, 1, 1, 2, 1>, + Conv::template process_tile<0, 0, 1, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 2, 0, 0>, + Conv::template process_tile<0, 0, 1, 2, 0, 1>, + Conv::template process_tile<0, 0, 1, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 2, 1, 0>, + Conv::template process_tile<0, 0, 1, 2, 1, 1>, + Conv::template process_tile<0, 0, 1, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 1, 2, 2, 0>, + Conv::template process_tile<0, 0, 1, 2, 2, 1>, + Conv::template process_tile<0, 0, 1, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 3, 0, 0>, + Conv::template process_tile<0, 0, 1, 3, 0, 1>, + Conv::template process_tile<0, 0, 1, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 3, 1, 0>, + Conv::template process_tile<0, 0, 1, 3, 1, 1>, + Conv::template process_tile<0, 0, 1, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 1, 3, 2, 0>, + Conv::template process_tile<0, 0, 1, 3, 2, 1>, + Conv::template process_tile<0, 0, 1, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { 
// Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 4, 0, 0>, + Conv::template process_tile<0, 0, 1, 4, 0, 1>, + Conv::template process_tile<0, 0, 1, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 4, 1, 0>, + Conv::template process_tile<0, 0, 1, 4, 1, 1>, + Conv::template process_tile<0, 0, 1, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 1, 4, 2, 0>, + Conv::template process_tile<0, 0, 1, 4, 2, 1>, + Conv::template process_tile<0, 0, 1, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 5, 0, 0>, + Conv::template process_tile<0, 0, 1, 5, 0, 1>, + Conv::template process_tile<0, 0, 1, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 5, 1, 0>, + Conv::template process_tile<0, 0, 1, 5, 1, 1>, + Conv::template process_tile<0, 0, 1, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 1, 5, 2, 0>, + Conv::template process_tile<0, 0, 1, 5, 2, 1>, + Conv::template process_tile<0, 0, 1, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 6, 0, 0>, + Conv::template process_tile<0, 0, 1, 6, 0, 1>, + Conv::template process_tile<0, 0, 1, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 6, 1, 0>, + Conv::template process_tile<0, 0, 1, 6, 1, 1>, + Conv::template process_tile<0, 0, 1, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 1, 6, 2, 0>, + Conv::template process_tile<0, 0, 1, 6, 2, 1>, + Conv::template process_tile<0, 0, 1, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // 
Input pad right = 6 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 0, 0, 0>, + Conv::template process_tile<0, 0, 2, 0, 0, 1>, + Conv::template process_tile<0, 0, 2, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 0, 1, 0>, + Conv::template process_tile<0, 0, 2, 0, 1, 1>, + Conv::template process_tile<0, 0, 2, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 2, 0, 2, 0>, + Conv::template process_tile<0, 0, 2, 0, 2, 1>, + Conv::template process_tile<0, 0, 2, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 1, 0, 0>, + Conv::template process_tile<0, 0, 2, 1, 0, 1>, + Conv::template process_tile<0, 0, 2, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 1, 1, 0>, + Conv::template process_tile<0, 0, 2, 1, 1, 1>, + Conv::template process_tile<0, 0, 2, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 2, 1, 2, 0>, + Conv::template process_tile<0, 0, 2, 1, 2, 1>, + Conv::template process_tile<0, 0, 2, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 2, 0, 0>, + Conv::template process_tile<0, 0, 2, 2, 0, 1>, + Conv::template process_tile<0, 0, 2, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 2, 1, 0>, + Conv::template process_tile<0, 0, 2, 2, 1, 1>, + Conv::template process_tile<0, 0, 2, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 2, 2, 2, 0>, + Conv::template process_tile<0, 0, 2, 2, 2, 1>, + 
Conv::template process_tile<0, 0, 2, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 3, 0, 0>, + Conv::template process_tile<0, 0, 2, 3, 0, 1>, + Conv::template process_tile<0, 0, 2, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 3, 1, 0>, + Conv::template process_tile<0, 0, 2, 3, 1, 1>, + Conv::template process_tile<0, 0, 2, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 2, 3, 2, 0>, + Conv::template process_tile<0, 0, 2, 3, 2, 1>, + Conv::template process_tile<0, 0, 2, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 4, 0, 0>, + Conv::template process_tile<0, 0, 2, 4, 0, 1>, + Conv::template process_tile<0, 0, 2, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 4, 1, 0>, + Conv::template process_tile<0, 0, 2, 4, 1, 1>, + Conv::template process_tile<0, 0, 2, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 2, 4, 2, 0>, + Conv::template process_tile<0, 0, 2, 4, 2, 1>, + Conv::template process_tile<0, 0, 2, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 5, 0, 0>, + Conv::template process_tile<0, 0, 2, 5, 0, 1>, + Conv::template process_tile<0, 0, 2, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 5, 1, 0>, + Conv::template process_tile<0, 0, 2, 5, 1, 1>, + Conv::template process_tile<0, 0, 2, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 2, 5, 2, 0>, + Conv::template 
process_tile<0, 0, 2, 5, 2, 1>, + Conv::template process_tile<0, 0, 2, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 6, 0, 0>, + Conv::template process_tile<0, 0, 2, 6, 0, 1>, + Conv::template process_tile<0, 0, 2, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 6, 1, 0>, + Conv::template process_tile<0, 0, 2, 6, 1, 1>, + Conv::template process_tile<0, 0, 2, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 2, 6, 2, 0>, + Conv::template process_tile<0, 0, 2, 6, 2, 1>, + Conv::template process_tile<0, 0, 2, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 0, 0, 0>, + Conv::template process_tile<0, 0, 3, 0, 0, 1>, + Conv::template process_tile<0, 0, 3, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 0, 1, 0>, + Conv::template process_tile<0, 0, 3, 0, 1, 1>, + Conv::template process_tile<0, 0, 3, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 3, 0, 2, 0>, + Conv::template process_tile<0, 0, 3, 0, 2, 1>, + Conv::template process_tile<0, 0, 3, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 1, 0, 0>, + Conv::template process_tile<0, 0, 3, 1, 0, 1>, + Conv::template process_tile<0, 0, 3, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 1, 1, 0>, + Conv::template process_tile<0, 0, 3, 1, 1, 1>, + Conv::template process_tile<0, 0, 3, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad 
bottom = 2 + Conv::template process_tile<0, 0, 3, 1, 2, 0>, + Conv::template process_tile<0, 0, 3, 1, 2, 1>, + Conv::template process_tile<0, 0, 3, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 2, 0, 0>, + Conv::template process_tile<0, 0, 3, 2, 0, 1>, + Conv::template process_tile<0, 0, 3, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 2, 1, 0>, + Conv::template process_tile<0, 0, 3, 2, 1, 1>, + Conv::template process_tile<0, 0, 3, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 3, 2, 2, 0>, + Conv::template process_tile<0, 0, 3, 2, 2, 1>, + Conv::template process_tile<0, 0, 3, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 3, 0, 0>, + Conv::template process_tile<0, 0, 3, 3, 0, 1>, + Conv::template process_tile<0, 0, 3, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 3, 1, 0>, + Conv::template process_tile<0, 0, 3, 3, 1, 1>, + Conv::template process_tile<0, 0, 3, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 3, 3, 2, 0>, + Conv::template process_tile<0, 0, 3, 3, 2, 1>, + Conv::template process_tile<0, 0, 3, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 4, 0, 0>, + Conv::template process_tile<0, 0, 3, 4, 0, 1>, + Conv::template process_tile<0, 0, 3, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 4, 1, 0>, + Conv::template process_tile<0, 0, 3, 4, 1, 1>, + Conv::template process_tile<0, 0, 3, 4, 1, 2>, + }, // Output pad bottom = 
1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 3, 4, 2, 0>, + Conv::template process_tile<0, 0, 3, 4, 2, 1>, + Conv::template process_tile<0, 0, 3, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 5, 0, 0>, + Conv::template process_tile<0, 0, 3, 5, 0, 1>, + Conv::template process_tile<0, 0, 3, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 5, 1, 0>, + Conv::template process_tile<0, 0, 3, 5, 1, 1>, + Conv::template process_tile<0, 0, 3, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 3, 5, 2, 0>, + Conv::template process_tile<0, 0, 3, 5, 2, 1>, + Conv::template process_tile<0, 0, 3, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 6, 0, 0>, + Conv::template process_tile<0, 0, 3, 6, 0, 1>, + Conv::template process_tile<0, 0, 3, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 6, 1, 0>, + Conv::template process_tile<0, 0, 3, 6, 1, 1>, + Conv::template process_tile<0, 0, 3, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 3, 6, 2, 0>, + Conv::template process_tile<0, 0, 3, 6, 2, 1>, + Conv::template process_tile<0, 0, 3, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 3 + { // Input pad bottom = 4 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 4, 0, 0, 0>, + Conv::template process_tile<0, 0, 4, 0, 0, 1>, + Conv::template process_tile<0, 0, 4, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 0, 1, 0>, + Conv::template process_tile<0, 0, 4, 0, 1, 1>, 
+ Conv::template process_tile<0, 0, 4, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 4, 0, 2, 0>, + Conv::template process_tile<0, 0, 4, 0, 2, 1>, + Conv::template process_tile<0, 0, 4, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 4, 1, 0, 0>, + Conv::template process_tile<0, 0, 4, 1, 0, 1>, + Conv::template process_tile<0, 0, 4, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 1, 1, 0>, + Conv::template process_tile<0, 0, 4, 1, 1, 1>, + Conv::template process_tile<0, 0, 4, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 4, 1, 2, 0>, + Conv::template process_tile<0, 0, 4, 1, 2, 1>, + Conv::template process_tile<0, 0, 4, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 4, 2, 0, 0>, + Conv::template process_tile<0, 0, 4, 2, 0, 1>, + Conv::template process_tile<0, 0, 4, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 2, 1, 0>, + Conv::template process_tile<0, 0, 4, 2, 1, 1>, + Conv::template process_tile<0, 0, 4, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 4, 2, 2, 0>, + Conv::template process_tile<0, 0, 4, 2, 2, 1>, + Conv::template process_tile<0, 0, 4, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 4, 3, 0, 0>, + Conv::template process_tile<0, 0, 4, 3, 0, 1>, + Conv::template process_tile<0, 0, 4, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 3, 1, 0>, + Conv::template 
process_tile<0, 0, 4, 3, 1, 1>, + Conv::template process_tile<0, 0, 4, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 4, 3, 2, 0>, + Conv::template process_tile<0, 0, 4, 3, 2, 1>, + Conv::template process_tile<0, 0, 4, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 4, 4, 0, 0>, + Conv::template process_tile<0, 0, 4, 4, 0, 1>, + Conv::template process_tile<0, 0, 4, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 4, 1, 0>, + Conv::template process_tile<0, 0, 4, 4, 1, 1>, + Conv::template process_tile<0, 0, 4, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 4, 4, 2, 0>, + Conv::template process_tile<0, 0, 4, 4, 2, 1>, + Conv::template process_tile<0, 0, 4, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 4, 5, 0, 0>, + Conv::template process_tile<0, 0, 4, 5, 0, 1>, + Conv::template process_tile<0, 0, 4, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 5, 1, 0>, + Conv::template process_tile<0, 0, 4, 5, 1, 1>, + Conv::template process_tile<0, 0, 4, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 4, 5, 2, 0>, + Conv::template process_tile<0, 0, 4, 5, 2, 1>, + Conv::template process_tile<0, 0, 4, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 4, 6, 0, 0>, + Conv::template process_tile<0, 0, 4, 6, 0, 1>, + Conv::template process_tile<0, 0, 4, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 6, 1, 
0>, + Conv::template process_tile<0, 0, 4, 6, 1, 1>, + Conv::template process_tile<0, 0, 4, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 4, 6, 2, 0>, + Conv::template process_tile<0, 0, 4, 6, 2, 1>, + Conv::template process_tile<0, 0, 4, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 4 + { // Input pad bottom = 5 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 5, 0, 0, 0>, + Conv::template process_tile<0, 0, 5, 0, 0, 1>, + Conv::template process_tile<0, 0, 5, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 5, 0, 1, 0>, + Conv::template process_tile<0, 0, 5, 0, 1, 1>, + Conv::template process_tile<0, 0, 5, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 5, 0, 2, 0>, + Conv::template process_tile<0, 0, 5, 0, 2, 1>, + Conv::template process_tile<0, 0, 5, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 5, 1, 0, 0>, + Conv::template process_tile<0, 0, 5, 1, 0, 1>, + Conv::template process_tile<0, 0, 5, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 5, 1, 1, 0>, + Conv::template process_tile<0, 0, 5, 1, 1, 1>, + Conv::template process_tile<0, 0, 5, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 5, 1, 2, 0>, + Conv::template process_tile<0, 0, 5, 1, 2, 1>, + Conv::template process_tile<0, 0, 5, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 5, 2, 0, 0>, + Conv::template process_tile<0, 0, 5, 2, 0, 1>, + Conv::template process_tile<0, 0, 5, 2, 0, 2>, + }, // Output pad bottom = 
0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 5, 2, 1, 0>, + Conv::template process_tile<0, 0, 5, 2, 1, 1>, + Conv::template process_tile<0, 0, 5, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 5, 2, 2, 0>, + Conv::template process_tile<0, 0, 5, 2, 2, 1>, + Conv::template process_tile<0, 0, 5, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 5, 3, 0, 0>, + Conv::template process_tile<0, 0, 5, 3, 0, 1>, + Conv::template process_tile<0, 0, 5, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 5, 3, 1, 0>, + Conv::template process_tile<0, 0, 5, 3, 1, 1>, + Conv::template process_tile<0, 0, 5, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 5, 3, 2, 0>, + Conv::template process_tile<0, 0, 5, 3, 2, 1>, + Conv::template process_tile<0, 0, 5, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 5, 4, 0, 0>, + Conv::template process_tile<0, 0, 5, 4, 0, 1>, + Conv::template process_tile<0, 0, 5, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 5, 4, 1, 0>, + Conv::template process_tile<0, 0, 5, 4, 1, 1>, + Conv::template process_tile<0, 0, 5, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 5, 4, 2, 0>, + Conv::template process_tile<0, 0, 5, 4, 2, 1>, + Conv::template process_tile<0, 0, 5, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 5, 5, 0, 0>, + Conv::template process_tile<0, 0, 5, 5, 0, 1>, + Conv::template process_tile<0, 0, 5, 5, 0, 2>, + }, // 
Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 5, 5, 1, 0>, + Conv::template process_tile<0, 0, 5, 5, 1, 1>, + Conv::template process_tile<0, 0, 5, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 5, 5, 2, 0>, + Conv::template process_tile<0, 0, 5, 5, 2, 1>, + Conv::template process_tile<0, 0, 5, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 5, 6, 0, 0>, + Conv::template process_tile<0, 0, 5, 6, 0, 1>, + Conv::template process_tile<0, 0, 5, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 5, 6, 1, 0>, + Conv::template process_tile<0, 0, 5, 6, 1, 1>, + Conv::template process_tile<0, 0, 5, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 5, 6, 2, 0>, + Conv::template process_tile<0, 0, 5, 6, 2, 1>, + Conv::template process_tile<0, 0, 5, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 5 + { // Input pad bottom = 6 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 6, 0, 0, 0>, + Conv::template process_tile<0, 0, 6, 0, 0, 1>, + Conv::template process_tile<0, 0, 6, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 6, 0, 1, 0>, + Conv::template process_tile<0, 0, 6, 0, 1, 1>, + Conv::template process_tile<0, 0, 6, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 6, 0, 2, 0>, + Conv::template process_tile<0, 0, 6, 0, 2, 1>, + Conv::template process_tile<0, 0, 6, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 6, 1, 0, 0>, + Conv::template 
process_tile<0, 0, 6, 1, 0, 1>, + Conv::template process_tile<0, 0, 6, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 6, 1, 1, 0>, + Conv::template process_tile<0, 0, 6, 1, 1, 1>, + Conv::template process_tile<0, 0, 6, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 6, 1, 2, 0>, + Conv::template process_tile<0, 0, 6, 1, 2, 1>, + Conv::template process_tile<0, 0, 6, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 6, 2, 0, 0>, + Conv::template process_tile<0, 0, 6, 2, 0, 1>, + Conv::template process_tile<0, 0, 6, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 6, 2, 1, 0>, + Conv::template process_tile<0, 0, 6, 2, 1, 1>, + Conv::template process_tile<0, 0, 6, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 6, 2, 2, 0>, + Conv::template process_tile<0, 0, 6, 2, 2, 1>, + Conv::template process_tile<0, 0, 6, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 6, 3, 0, 0>, + Conv::template process_tile<0, 0, 6, 3, 0, 1>, + Conv::template process_tile<0, 0, 6, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 6, 3, 1, 0>, + Conv::template process_tile<0, 0, 6, 3, 1, 1>, + Conv::template process_tile<0, 0, 6, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 6, 3, 2, 0>, + Conv::template process_tile<0, 0, 6, 3, 2, 1>, + Conv::template process_tile<0, 0, 6, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 6, 4, 0, 
0>, + Conv::template process_tile<0, 0, 6, 4, 0, 1>, + Conv::template process_tile<0, 0, 6, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 6, 4, 1, 0>, + Conv::template process_tile<0, 0, 6, 4, 1, 1>, + Conv::template process_tile<0, 0, 6, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 6, 4, 2, 0>, + Conv::template process_tile<0, 0, 6, 4, 2, 1>, + Conv::template process_tile<0, 0, 6, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 6, 5, 0, 0>, + Conv::template process_tile<0, 0, 6, 5, 0, 1>, + Conv::template process_tile<0, 0, 6, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 6, 5, 1, 0>, + Conv::template process_tile<0, 0, 6, 5, 1, 1>, + Conv::template process_tile<0, 0, 6, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 6, 5, 2, 0>, + Conv::template process_tile<0, 0, 6, 5, 2, 1>, + Conv::template process_tile<0, 0, 6, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 6, 6, 0, 0>, + Conv::template process_tile<0, 0, 6, 6, 0, 1>, + Conv::template process_tile<0, 0, 6, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 6, 6, 1, 0>, + Conv::template process_tile<0, 0, 6, 6, 1, 1>, + Conv::template process_tile<0, 0, 6, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 6, 6, 2, 0>, + Conv::template process_tile<0, 0, 6, 6, 2, 1>, + Conv::template process_tile<0, 0, 6, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 6 + }, // Input pad left = 0 + { // Input pad left = 1 
+ { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 0, 0, 0>, + Conv::template process_tile<0, 1, 0, 0, 0, 1>, + Conv::template process_tile<0, 1, 0, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 0, 1, 0>, + Conv::template process_tile<0, 1, 0, 0, 1, 1>, + Conv::template process_tile<0, 1, 0, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 0, 0, 2, 0>, + Conv::template process_tile<0, 1, 0, 0, 2, 1>, + Conv::template process_tile<0, 1, 0, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 1, 0, 0>, + Conv::template process_tile<0, 1, 0, 1, 0, 1>, + Conv::template process_tile<0, 1, 0, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 1, 1, 0>, + Conv::template process_tile<0, 1, 0, 1, 1, 1>, + Conv::template process_tile<0, 1, 0, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 0, 1, 2, 0>, + Conv::template process_tile<0, 1, 0, 1, 2, 1>, + Conv::template process_tile<0, 1, 0, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 2, 0, 0>, + Conv::template process_tile<0, 1, 0, 2, 0, 1>, + Conv::template process_tile<0, 1, 0, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 2, 1, 0>, + Conv::template process_tile<0, 1, 0, 2, 1, 1>, + Conv::template process_tile<0, 1, 0, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 0, 2, 2, 0>, + Conv::template process_tile<0, 1, 0, 2, 2, 1>, + Conv::template process_tile<0, 1, 0, 2, 2, 2>, + }, // 
Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 3, 0, 0>, + Conv::template process_tile<0, 1, 0, 3, 0, 1>, + Conv::template process_tile<0, 1, 0, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 3, 1, 0>, + Conv::template process_tile<0, 1, 0, 3, 1, 1>, + Conv::template process_tile<0, 1, 0, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 0, 3, 2, 0>, + Conv::template process_tile<0, 1, 0, 3, 2, 1>, + Conv::template process_tile<0, 1, 0, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 4, 0, 0>, + Conv::template process_tile<0, 1, 0, 4, 0, 1>, + Conv::template process_tile<0, 1, 0, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 4, 1, 0>, + Conv::template process_tile<0, 1, 0, 4, 1, 1>, + Conv::template process_tile<0, 1, 0, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 0, 4, 2, 0>, + Conv::template process_tile<0, 1, 0, 4, 2, 1>, + Conv::template process_tile<0, 1, 0, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 5, 0, 0>, + Conv::template process_tile<0, 1, 0, 5, 0, 1>, + Conv::template process_tile<0, 1, 0, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 5, 1, 0>, + Conv::template process_tile<0, 1, 0, 5, 1, 1>, + Conv::template process_tile<0, 1, 0, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 0, 5, 2, 0>, + Conv::template process_tile<0, 1, 0, 5, 2, 1>, + Conv::template process_tile<0, 1, 
0, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 6, 0, 0>, + Conv::template process_tile<0, 1, 0, 6, 0, 1>, + Conv::template process_tile<0, 1, 0, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 6, 1, 0>, + Conv::template process_tile<0, 1, 0, 6, 1, 1>, + Conv::template process_tile<0, 1, 0, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 0, 6, 2, 0>, + Conv::template process_tile<0, 1, 0, 6, 2, 1>, + Conv::template process_tile<0, 1, 0, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 0, 0, 0>, + Conv::template process_tile<0, 1, 1, 0, 0, 1>, + Conv::template process_tile<0, 1, 1, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 0, 1, 0>, + Conv::template process_tile<0, 1, 1, 0, 1, 1>, + Conv::template process_tile<0, 1, 1, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 1, 0, 2, 0>, + Conv::template process_tile<0, 1, 1, 0, 2, 1>, + Conv::template process_tile<0, 1, 1, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 1, 0, 0>, + Conv::template process_tile<0, 1, 1, 1, 0, 1>, + Conv::template process_tile<0, 1, 1, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 1, 1, 0>, + Conv::template process_tile<0, 1, 1, 1, 1, 1>, + Conv::template process_tile<0, 1, 1, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 1, 1, 2, 0>, + 
Conv::template process_tile<0, 1, 1, 1, 2, 1>, + Conv::template process_tile<0, 1, 1, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 2, 0, 0>, + Conv::template process_tile<0, 1, 1, 2, 0, 1>, + Conv::template process_tile<0, 1, 1, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 2, 1, 0>, + Conv::template process_tile<0, 1, 1, 2, 1, 1>, + Conv::template process_tile<0, 1, 1, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 1, 2, 2, 0>, + Conv::template process_tile<0, 1, 1, 2, 2, 1>, + Conv::template process_tile<0, 1, 1, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 3, 0, 0>, + Conv::template process_tile<0, 1, 1, 3, 0, 1>, + Conv::template process_tile<0, 1, 1, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 3, 1, 0>, + Conv::template process_tile<0, 1, 1, 3, 1, 1>, + Conv::template process_tile<0, 1, 1, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 1, 3, 2, 0>, + Conv::template process_tile<0, 1, 1, 3, 2, 1>, + Conv::template process_tile<0, 1, 1, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 4, 0, 0>, + Conv::template process_tile<0, 1, 1, 4, 0, 1>, + Conv::template process_tile<0, 1, 1, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 4, 1, 0>, + Conv::template process_tile<0, 1, 1, 4, 1, 1>, + Conv::template process_tile<0, 1, 1, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template 
process_tile<0, 1, 1, 4, 2, 0>, + Conv::template process_tile<0, 1, 1, 4, 2, 1>, + Conv::template process_tile<0, 1, 1, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 5, 0, 0>, + Conv::template process_tile<0, 1, 1, 5, 0, 1>, + Conv::template process_tile<0, 1, 1, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 5, 1, 0>, + Conv::template process_tile<0, 1, 1, 5, 1, 1>, + Conv::template process_tile<0, 1, 1, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 1, 5, 2, 0>, + Conv::template process_tile<0, 1, 1, 5, 2, 1>, + Conv::template process_tile<0, 1, 1, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 6, 0, 0>, + Conv::template process_tile<0, 1, 1, 6, 0, 1>, + Conv::template process_tile<0, 1, 1, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 6, 1, 0>, + Conv::template process_tile<0, 1, 1, 6, 1, 1>, + Conv::template process_tile<0, 1, 1, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 1, 6, 2, 0>, + Conv::template process_tile<0, 1, 1, 6, 2, 1>, + Conv::template process_tile<0, 1, 1, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 0, 0, 0>, + Conv::template process_tile<0, 1, 2, 0, 0, 1>, + Conv::template process_tile<0, 1, 2, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 0, 1, 0>, + Conv::template process_tile<0, 1, 2, 0, 1, 1>, + Conv::template process_tile<0, 1, 2, 0, 1, 2>, 
+ }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 2, 0, 2, 0>, + Conv::template process_tile<0, 1, 2, 0, 2, 1>, + Conv::template process_tile<0, 1, 2, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 1, 0, 0>, + Conv::template process_tile<0, 1, 2, 1, 0, 1>, + Conv::template process_tile<0, 1, 2, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 1, 1, 0>, + Conv::template process_tile<0, 1, 2, 1, 1, 1>, + Conv::template process_tile<0, 1, 2, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 2, 1, 2, 0>, + Conv::template process_tile<0, 1, 2, 1, 2, 1>, + Conv::template process_tile<0, 1, 2, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 2, 0, 0>, + Conv::template process_tile<0, 1, 2, 2, 0, 1>, + Conv::template process_tile<0, 1, 2, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 2, 1, 0>, + Conv::template process_tile<0, 1, 2, 2, 1, 1>, + Conv::template process_tile<0, 1, 2, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 2, 2, 2, 0>, + Conv::template process_tile<0, 1, 2, 2, 2, 1>, + Conv::template process_tile<0, 1, 2, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 3, 0, 0>, + Conv::template process_tile<0, 1, 2, 3, 0, 1>, + Conv::template process_tile<0, 1, 2, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 3, 1, 0>, + Conv::template process_tile<0, 1, 2, 3, 1, 1>, + Conv::template 
process_tile<0, 1, 2, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 2, 3, 2, 0>, + Conv::template process_tile<0, 1, 2, 3, 2, 1>, + Conv::template process_tile<0, 1, 2, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 4, 0, 0>, + Conv::template process_tile<0, 1, 2, 4, 0, 1>, + Conv::template process_tile<0, 1, 2, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 4, 1, 0>, + Conv::template process_tile<0, 1, 2, 4, 1, 1>, + Conv::template process_tile<0, 1, 2, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 2, 4, 2, 0>, + Conv::template process_tile<0, 1, 2, 4, 2, 1>, + Conv::template process_tile<0, 1, 2, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 5, 0, 0>, + Conv::template process_tile<0, 1, 2, 5, 0, 1>, + Conv::template process_tile<0, 1, 2, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 5, 1, 0>, + Conv::template process_tile<0, 1, 2, 5, 1, 1>, + Conv::template process_tile<0, 1, 2, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 2, 5, 2, 0>, + Conv::template process_tile<0, 1, 2, 5, 2, 1>, + Conv::template process_tile<0, 1, 2, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 6, 0, 0>, + Conv::template process_tile<0, 1, 2, 6, 0, 1>, + Conv::template process_tile<0, 1, 2, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 6, 1, 0>, + Conv::template process_tile<0, 1, 2, 6, 1, 
1>, + Conv::template process_tile<0, 1, 2, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 2, 6, 2, 0>, + Conv::template process_tile<0, 1, 2, 6, 2, 1>, + Conv::template process_tile<0, 1, 2, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 0, 0, 0>, + Conv::template process_tile<0, 1, 3, 0, 0, 1>, + Conv::template process_tile<0, 1, 3, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 0, 1, 0>, + Conv::template process_tile<0, 1, 3, 0, 1, 1>, + Conv::template process_tile<0, 1, 3, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 3, 0, 2, 0>, + Conv::template process_tile<0, 1, 3, 0, 2, 1>, + Conv::template process_tile<0, 1, 3, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 1, 0, 0>, + Conv::template process_tile<0, 1, 3, 1, 0, 1>, + Conv::template process_tile<0, 1, 3, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 1, 1, 0>, + Conv::template process_tile<0, 1, 3, 1, 1, 1>, + Conv::template process_tile<0, 1, 3, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 3, 1, 2, 0>, + Conv::template process_tile<0, 1, 3, 1, 2, 1>, + Conv::template process_tile<0, 1, 3, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 2, 0, 0>, + Conv::template process_tile<0, 1, 3, 2, 0, 1>, + Conv::template process_tile<0, 1, 3, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template 
process_tile<0, 1, 3, 2, 1, 0>, + Conv::template process_tile<0, 1, 3, 2, 1, 1>, + Conv::template process_tile<0, 1, 3, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 3, 2, 2, 0>, + Conv::template process_tile<0, 1, 3, 2, 2, 1>, + Conv::template process_tile<0, 1, 3, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 3, 0, 0>, + Conv::template process_tile<0, 1, 3, 3, 0, 1>, + Conv::template process_tile<0, 1, 3, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 3, 1, 0>, + Conv::template process_tile<0, 1, 3, 3, 1, 1>, + Conv::template process_tile<0, 1, 3, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 3, 3, 2, 0>, + Conv::template process_tile<0, 1, 3, 3, 2, 1>, + Conv::template process_tile<0, 1, 3, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 4, 0, 0>, + Conv::template process_tile<0, 1, 3, 4, 0, 1>, + Conv::template process_tile<0, 1, 3, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 4, 1, 0>, + Conv::template process_tile<0, 1, 3, 4, 1, 1>, + Conv::template process_tile<0, 1, 3, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 3, 4, 2, 0>, + Conv::template process_tile<0, 1, 3, 4, 2, 1>, + Conv::template process_tile<0, 1, 3, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 5, 0, 0>, + Conv::template process_tile<0, 1, 3, 5, 0, 1>, + Conv::template process_tile<0, 1, 3, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 
1 + Conv::template process_tile<0, 1, 3, 5, 1, 0>, + Conv::template process_tile<0, 1, 3, 5, 1, 1>, + Conv::template process_tile<0, 1, 3, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 3, 5, 2, 0>, + Conv::template process_tile<0, 1, 3, 5, 2, 1>, + Conv::template process_tile<0, 1, 3, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 6, 0, 0>, + Conv::template process_tile<0, 1, 3, 6, 0, 1>, + Conv::template process_tile<0, 1, 3, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 6, 1, 0>, + Conv::template process_tile<0, 1, 3, 6, 1, 1>, + Conv::template process_tile<0, 1, 3, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 3, 6, 2, 0>, + Conv::template process_tile<0, 1, 3, 6, 2, 1>, + Conv::template process_tile<0, 1, 3, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 3 + { // Input pad bottom = 4 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 0, 0, 0>, + Conv::template process_tile<0, 1, 4, 0, 0, 1>, + Conv::template process_tile<0, 1, 4, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 0, 1, 0>, + Conv::template process_tile<0, 1, 4, 0, 1, 1>, + Conv::template process_tile<0, 1, 4, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 4, 0, 2, 0>, + Conv::template process_tile<0, 1, 4, 0, 2, 1>, + Conv::template process_tile<0, 1, 4, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 1, 0, 0>, + Conv::template process_tile<0, 1, 4, 1, 0, 1>, + Conv::template 
process_tile<0, 1, 4, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 1, 1, 0>, + Conv::template process_tile<0, 1, 4, 1, 1, 1>, + Conv::template process_tile<0, 1, 4, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 4, 1, 2, 0>, + Conv::template process_tile<0, 1, 4, 1, 2, 1>, + Conv::template process_tile<0, 1, 4, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 2, 0, 0>, + Conv::template process_tile<0, 1, 4, 2, 0, 1>, + Conv::template process_tile<0, 1, 4, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 2, 1, 0>, + Conv::template process_tile<0, 1, 4, 2, 1, 1>, + Conv::template process_tile<0, 1, 4, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 4, 2, 2, 0>, + Conv::template process_tile<0, 1, 4, 2, 2, 1>, + Conv::template process_tile<0, 1, 4, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 3, 0, 0>, + Conv::template process_tile<0, 1, 4, 3, 0, 1>, + Conv::template process_tile<0, 1, 4, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 3, 1, 0>, + Conv::template process_tile<0, 1, 4, 3, 1, 1>, + Conv::template process_tile<0, 1, 4, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 4, 3, 2, 0>, + Conv::template process_tile<0, 1, 4, 3, 2, 1>, + Conv::template process_tile<0, 1, 4, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 4, 0, 0>, + Conv::template process_tile<0, 1, 4, 4, 0, 
1>, + Conv::template process_tile<0, 1, 4, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 4, 1, 0>, + Conv::template process_tile<0, 1, 4, 4, 1, 1>, + Conv::template process_tile<0, 1, 4, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 4, 4, 2, 0>, + Conv::template process_tile<0, 1, 4, 4, 2, 1>, + Conv::template process_tile<0, 1, 4, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 5, 0, 0>, + Conv::template process_tile<0, 1, 4, 5, 0, 1>, + Conv::template process_tile<0, 1, 4, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 5, 1, 0>, + Conv::template process_tile<0, 1, 4, 5, 1, 1>, + Conv::template process_tile<0, 1, 4, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 4, 5, 2, 0>, + Conv::template process_tile<0, 1, 4, 5, 2, 1>, + Conv::template process_tile<0, 1, 4, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 6, 0, 0>, + Conv::template process_tile<0, 1, 4, 6, 0, 1>, + Conv::template process_tile<0, 1, 4, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 6, 1, 0>, + Conv::template process_tile<0, 1, 4, 6, 1, 1>, + Conv::template process_tile<0, 1, 4, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 4, 6, 2, 0>, + Conv::template process_tile<0, 1, 4, 6, 2, 1>, + Conv::template process_tile<0, 1, 4, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 4 + { // Input pad bottom = 5 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template 
process_tile<0, 1, 5, 0, 0, 0>, + Conv::template process_tile<0, 1, 5, 0, 0, 1>, + Conv::template process_tile<0, 1, 5, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 5, 0, 1, 0>, + Conv::template process_tile<0, 1, 5, 0, 1, 1>, + Conv::template process_tile<0, 1, 5, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 5, 0, 2, 0>, + Conv::template process_tile<0, 1, 5, 0, 2, 1>, + Conv::template process_tile<0, 1, 5, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 5, 1, 0, 0>, + Conv::template process_tile<0, 1, 5, 1, 0, 1>, + Conv::template process_tile<0, 1, 5, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 5, 1, 1, 0>, + Conv::template process_tile<0, 1, 5, 1, 1, 1>, + Conv::template process_tile<0, 1, 5, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 5, 1, 2, 0>, + Conv::template process_tile<0, 1, 5, 1, 2, 1>, + Conv::template process_tile<0, 1, 5, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 5, 2, 0, 0>, + Conv::template process_tile<0, 1, 5, 2, 0, 1>, + Conv::template process_tile<0, 1, 5, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 5, 2, 1, 0>, + Conv::template process_tile<0, 1, 5, 2, 1, 1>, + Conv::template process_tile<0, 1, 5, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 5, 2, 2, 0>, + Conv::template process_tile<0, 1, 5, 2, 2, 1>, + Conv::template process_tile<0, 1, 5, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 
0 + Conv::template process_tile<0, 1, 5, 3, 0, 0>, + Conv::template process_tile<0, 1, 5, 3, 0, 1>, + Conv::template process_tile<0, 1, 5, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 5, 3, 1, 0>, + Conv::template process_tile<0, 1, 5, 3, 1, 1>, + Conv::template process_tile<0, 1, 5, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 5, 3, 2, 0>, + Conv::template process_tile<0, 1, 5, 3, 2, 1>, + Conv::template process_tile<0, 1, 5, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 5, 4, 0, 0>, + Conv::template process_tile<0, 1, 5, 4, 0, 1>, + Conv::template process_tile<0, 1, 5, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 5, 4, 1, 0>, + Conv::template process_tile<0, 1, 5, 4, 1, 1>, + Conv::template process_tile<0, 1, 5, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 5, 4, 2, 0>, + Conv::template process_tile<0, 1, 5, 4, 2, 1>, + Conv::template process_tile<0, 1, 5, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 5, 5, 0, 0>, + Conv::template process_tile<0, 1, 5, 5, 0, 1>, + Conv::template process_tile<0, 1, 5, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 5, 5, 1, 0>, + Conv::template process_tile<0, 1, 5, 5, 1, 1>, + Conv::template process_tile<0, 1, 5, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 5, 5, 2, 0>, + Conv::template process_tile<0, 1, 5, 5, 2, 1>, + Conv::template process_tile<0, 1, 5, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // 
Output pad bottom = 0 + Conv::template process_tile<0, 1, 5, 6, 0, 0>, + Conv::template process_tile<0, 1, 5, 6, 0, 1>, + Conv::template process_tile<0, 1, 5, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 5, 6, 1, 0>, + Conv::template process_tile<0, 1, 5, 6, 1, 1>, + Conv::template process_tile<0, 1, 5, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 5, 6, 2, 0>, + Conv::template process_tile<0, 1, 5, 6, 2, 1>, + Conv::template process_tile<0, 1, 5, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 5 + { // Input pad bottom = 6 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 6, 0, 0, 0>, + Conv::template process_tile<0, 1, 6, 0, 0, 1>, + Conv::template process_tile<0, 1, 6, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 6, 0, 1, 0>, + Conv::template process_tile<0, 1, 6, 0, 1, 1>, + Conv::template process_tile<0, 1, 6, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 6, 0, 2, 0>, + Conv::template process_tile<0, 1, 6, 0, 2, 1>, + Conv::template process_tile<0, 1, 6, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 6, 1, 0, 0>, + Conv::template process_tile<0, 1, 6, 1, 0, 1>, + Conv::template process_tile<0, 1, 6, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 6, 1, 1, 0>, + Conv::template process_tile<0, 1, 6, 1, 1, 1>, + Conv::template process_tile<0, 1, 6, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 6, 1, 2, 0>, + Conv::template process_tile<0, 1, 6, 1, 2, 1>, + Conv::template process_tile<0, 1, 6, 1, 2, 2>, + }, // Output 
pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 6, 2, 0, 0>, + Conv::template process_tile<0, 1, 6, 2, 0, 1>, + Conv::template process_tile<0, 1, 6, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 6, 2, 1, 0>, + Conv::template process_tile<0, 1, 6, 2, 1, 1>, + Conv::template process_tile<0, 1, 6, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 6, 2, 2, 0>, + Conv::template process_tile<0, 1, 6, 2, 2, 1>, + Conv::template process_tile<0, 1, 6, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 6, 3, 0, 0>, + Conv::template process_tile<0, 1, 6, 3, 0, 1>, + Conv::template process_tile<0, 1, 6, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 6, 3, 1, 0>, + Conv::template process_tile<0, 1, 6, 3, 1, 1>, + Conv::template process_tile<0, 1, 6, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 6, 3, 2, 0>, + Conv::template process_tile<0, 1, 6, 3, 2, 1>, + Conv::template process_tile<0, 1, 6, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 6, 4, 0, 0>, + Conv::template process_tile<0, 1, 6, 4, 0, 1>, + Conv::template process_tile<0, 1, 6, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 6, 4, 1, 0>, + Conv::template process_tile<0, 1, 6, 4, 1, 1>, + Conv::template process_tile<0, 1, 6, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 6, 4, 2, 0>, + Conv::template process_tile<0, 1, 6, 4, 2, 1>, + Conv::template process_tile<0, 1, 6, 4, 2, 
2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 6, 5, 0, 0>, + Conv::template process_tile<0, 1, 6, 5, 0, 1>, + Conv::template process_tile<0, 1, 6, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 6, 5, 1, 0>, + Conv::template process_tile<0, 1, 6, 5, 1, 1>, + Conv::template process_tile<0, 1, 6, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 6, 5, 2, 0>, + Conv::template process_tile<0, 1, 6, 5, 2, 1>, + Conv::template process_tile<0, 1, 6, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 6, 6, 0, 0>, + Conv::template process_tile<0, 1, 6, 6, 0, 1>, + Conv::template process_tile<0, 1, 6, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 6, 6, 1, 0>, + Conv::template process_tile<0, 1, 6, 6, 1, 1>, + Conv::template process_tile<0, 1, 6, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 6, 6, 2, 0>, + Conv::template process_tile<0, 1, 6, 6, 2, 1>, + Conv::template process_tile<0, 1, 6, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 6 + }, // Input pad left = 1 + }, // Input pad top = 0 + { // Input pad top = 1 + { // Input pad left = 0 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 0, 0, 0>, + Conv::template process_tile<1, 0, 0, 0, 0, 1>, + Conv::template process_tile<1, 0, 0, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 0, 1, 0>, + Conv::template process_tile<1, 0, 0, 0, 1, 1>, + Conv::template process_tile<1, 0, 0, 0, 1, 2>, + }, // Output pad 
bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 0, 0, 2, 0>, + Conv::template process_tile<1, 0, 0, 0, 2, 1>, + Conv::template process_tile<1, 0, 0, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 1, 0, 0>, + Conv::template process_tile<1, 0, 0, 1, 0, 1>, + Conv::template process_tile<1, 0, 0, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 1, 1, 0>, + Conv::template process_tile<1, 0, 0, 1, 1, 1>, + Conv::template process_tile<1, 0, 0, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 0, 1, 2, 0>, + Conv::template process_tile<1, 0, 0, 1, 2, 1>, + Conv::template process_tile<1, 0, 0, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 2, 0, 0>, + Conv::template process_tile<1, 0, 0, 2, 0, 1>, + Conv::template process_tile<1, 0, 0, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 2, 1, 0>, + Conv::template process_tile<1, 0, 0, 2, 1, 1>, + Conv::template process_tile<1, 0, 0, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 0, 2, 2, 0>, + Conv::template process_tile<1, 0, 0, 2, 2, 1>, + Conv::template process_tile<1, 0, 0, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 3, 0, 0>, + Conv::template process_tile<1, 0, 0, 3, 0, 1>, + Conv::template process_tile<1, 0, 0, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 3, 1, 0>, + Conv::template process_tile<1, 0, 0, 3, 1, 1>, + Conv::template process_tile<1, 0, 0, 3, 1, 2>, 
+ }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 0, 3, 2, 0>, + Conv::template process_tile<1, 0, 0, 3, 2, 1>, + Conv::template process_tile<1, 0, 0, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 4, 0, 0>, + Conv::template process_tile<1, 0, 0, 4, 0, 1>, + Conv::template process_tile<1, 0, 0, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 4, 1, 0>, + Conv::template process_tile<1, 0, 0, 4, 1, 1>, + Conv::template process_tile<1, 0, 0, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 0, 4, 2, 0>, + Conv::template process_tile<1, 0, 0, 4, 2, 1>, + Conv::template process_tile<1, 0, 0, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 5, 0, 0>, + Conv::template process_tile<1, 0, 0, 5, 0, 1>, + Conv::template process_tile<1, 0, 0, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 5, 1, 0>, + Conv::template process_tile<1, 0, 0, 5, 1, 1>, + Conv::template process_tile<1, 0, 0, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 0, 5, 2, 0>, + Conv::template process_tile<1, 0, 0, 5, 2, 1>, + Conv::template process_tile<1, 0, 0, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 6, 0, 0>, + Conv::template process_tile<1, 0, 0, 6, 0, 1>, + Conv::template process_tile<1, 0, 0, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 6, 1, 0>, + Conv::template process_tile<1, 0, 0, 6, 1, 1>, + Conv::template 
process_tile<1, 0, 0, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 0, 6, 2, 0>, + Conv::template process_tile<1, 0, 0, 6, 2, 1>, + Conv::template process_tile<1, 0, 0, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 0, 0, 0>, + Conv::template process_tile<1, 0, 1, 0, 0, 1>, + Conv::template process_tile<1, 0, 1, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 0, 1, 0>, + Conv::template process_tile<1, 0, 1, 0, 1, 1>, + Conv::template process_tile<1, 0, 1, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 1, 0, 2, 0>, + Conv::template process_tile<1, 0, 1, 0, 2, 1>, + Conv::template process_tile<1, 0, 1, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 1, 0, 0>, + Conv::template process_tile<1, 0, 1, 1, 0, 1>, + Conv::template process_tile<1, 0, 1, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 1, 1, 0>, + Conv::template process_tile<1, 0, 1, 1, 1, 1>, + Conv::template process_tile<1, 0, 1, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 1, 1, 2, 0>, + Conv::template process_tile<1, 0, 1, 1, 2, 1>, + Conv::template process_tile<1, 0, 1, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 2, 0, 0>, + Conv::template process_tile<1, 0, 1, 2, 0, 1>, + Conv::template process_tile<1, 0, 1, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 
2, 1, 0>, + Conv::template process_tile<1, 0, 1, 2, 1, 1>, + Conv::template process_tile<1, 0, 1, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 1, 2, 2, 0>, + Conv::template process_tile<1, 0, 1, 2, 2, 1>, + Conv::template process_tile<1, 0, 1, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 3, 0, 0>, + Conv::template process_tile<1, 0, 1, 3, 0, 1>, + Conv::template process_tile<1, 0, 1, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 3, 1, 0>, + Conv::template process_tile<1, 0, 1, 3, 1, 1>, + Conv::template process_tile<1, 0, 1, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 1, 3, 2, 0>, + Conv::template process_tile<1, 0, 1, 3, 2, 1>, + Conv::template process_tile<1, 0, 1, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 4, 0, 0>, + Conv::template process_tile<1, 0, 1, 4, 0, 1>, + Conv::template process_tile<1, 0, 1, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 4, 1, 0>, + Conv::template process_tile<1, 0, 1, 4, 1, 1>, + Conv::template process_tile<1, 0, 1, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 1, 4, 2, 0>, + Conv::template process_tile<1, 0, 1, 4, 2, 1>, + Conv::template process_tile<1, 0, 1, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 5, 0, 0>, + Conv::template process_tile<1, 0, 1, 5, 0, 1>, + Conv::template process_tile<1, 0, 1, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template 
process_tile<1, 0, 1, 5, 1, 0>, + Conv::template process_tile<1, 0, 1, 5, 1, 1>, + Conv::template process_tile<1, 0, 1, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 1, 5, 2, 0>, + Conv::template process_tile<1, 0, 1, 5, 2, 1>, + Conv::template process_tile<1, 0, 1, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 6, 0, 0>, + Conv::template process_tile<1, 0, 1, 6, 0, 1>, + Conv::template process_tile<1, 0, 1, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 6, 1, 0>, + Conv::template process_tile<1, 0, 1, 6, 1, 1>, + Conv::template process_tile<1, 0, 1, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 1, 6, 2, 0>, + Conv::template process_tile<1, 0, 1, 6, 2, 1>, + Conv::template process_tile<1, 0, 1, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 0, 0, 0>, + Conv::template process_tile<1, 0, 2, 0, 0, 1>, + Conv::template process_tile<1, 0, 2, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 0, 1, 0>, + Conv::template process_tile<1, 0, 2, 0, 1, 1>, + Conv::template process_tile<1, 0, 2, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 2, 0, 2, 0>, + Conv::template process_tile<1, 0, 2, 0, 2, 1>, + Conv::template process_tile<1, 0, 2, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 1, 0, 0>, + Conv::template process_tile<1, 0, 2, 1, 0, 1>, + Conv::template process_tile<1, 0, 2, 1, 0, 2>, 
+ }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 1, 1, 0>, + Conv::template process_tile<1, 0, 2, 1, 1, 1>, + Conv::template process_tile<1, 0, 2, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 2, 1, 2, 0>, + Conv::template process_tile<1, 0, 2, 1, 2, 1>, + Conv::template process_tile<1, 0, 2, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 2, 0, 0>, + Conv::template process_tile<1, 0, 2, 2, 0, 1>, + Conv::template process_tile<1, 0, 2, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 2, 1, 0>, + Conv::template process_tile<1, 0, 2, 2, 1, 1>, + Conv::template process_tile<1, 0, 2, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 2, 2, 2, 0>, + Conv::template process_tile<1, 0, 2, 2, 2, 1>, + Conv::template process_tile<1, 0, 2, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 3, 0, 0>, + Conv::template process_tile<1, 0, 2, 3, 0, 1>, + Conv::template process_tile<1, 0, 2, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 3, 1, 0>, + Conv::template process_tile<1, 0, 2, 3, 1, 1>, + Conv::template process_tile<1, 0, 2, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 2, 3, 2, 0>, + Conv::template process_tile<1, 0, 2, 3, 2, 1>, + Conv::template process_tile<1, 0, 2, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 4, 0, 0>, + Conv::template process_tile<1, 0, 2, 4, 0, 1>, + Conv::template 
process_tile<1, 0, 2, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 4, 1, 0>, + Conv::template process_tile<1, 0, 2, 4, 1, 1>, + Conv::template process_tile<1, 0, 2, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 2, 4, 2, 0>, + Conv::template process_tile<1, 0, 2, 4, 2, 1>, + Conv::template process_tile<1, 0, 2, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 5, 0, 0>, + Conv::template process_tile<1, 0, 2, 5, 0, 1>, + Conv::template process_tile<1, 0, 2, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 5, 1, 0>, + Conv::template process_tile<1, 0, 2, 5, 1, 1>, + Conv::template process_tile<1, 0, 2, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 2, 5, 2, 0>, + Conv::template process_tile<1, 0, 2, 5, 2, 1>, + Conv::template process_tile<1, 0, 2, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 6, 0, 0>, + Conv::template process_tile<1, 0, 2, 6, 0, 1>, + Conv::template process_tile<1, 0, 2, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 6, 1, 0>, + Conv::template process_tile<1, 0, 2, 6, 1, 1>, + Conv::template process_tile<1, 0, 2, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 2, 6, 2, 0>, + Conv::template process_tile<1, 0, 2, 6, 2, 1>, + Conv::template process_tile<1, 0, 2, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 
0, 0, 0>, + Conv::template process_tile<1, 0, 3, 0, 0, 1>, + Conv::template process_tile<1, 0, 3, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 0, 1, 0>, + Conv::template process_tile<1, 0, 3, 0, 1, 1>, + Conv::template process_tile<1, 0, 3, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 3, 0, 2, 0>, + Conv::template process_tile<1, 0, 3, 0, 2, 1>, + Conv::template process_tile<1, 0, 3, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 1, 0, 0>, + Conv::template process_tile<1, 0, 3, 1, 0, 1>, + Conv::template process_tile<1, 0, 3, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 1, 1, 0>, + Conv::template process_tile<1, 0, 3, 1, 1, 1>, + Conv::template process_tile<1, 0, 3, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 3, 1, 2, 0>, + Conv::template process_tile<1, 0, 3, 1, 2, 1>, + Conv::template process_tile<1, 0, 3, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 2, 0, 0>, + Conv::template process_tile<1, 0, 3, 2, 0, 1>, + Conv::template process_tile<1, 0, 3, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 2, 1, 0>, + Conv::template process_tile<1, 0, 3, 2, 1, 1>, + Conv::template process_tile<1, 0, 3, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 3, 2, 2, 0>, + Conv::template process_tile<1, 0, 3, 2, 2, 1>, + Conv::template process_tile<1, 0, 3, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template 
process_tile<1, 0, 3, 3, 0, 0>, + Conv::template process_tile<1, 0, 3, 3, 0, 1>, + Conv::template process_tile<1, 0, 3, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 3, 1, 0>, + Conv::template process_tile<1, 0, 3, 3, 1, 1>, + Conv::template process_tile<1, 0, 3, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 3, 3, 2, 0>, + Conv::template process_tile<1, 0, 3, 3, 2, 1>, + Conv::template process_tile<1, 0, 3, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 4, 0, 0>, + Conv::template process_tile<1, 0, 3, 4, 0, 1>, + Conv::template process_tile<1, 0, 3, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 4, 1, 0>, + Conv::template process_tile<1, 0, 3, 4, 1, 1>, + Conv::template process_tile<1, 0, 3, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 3, 4, 2, 0>, + Conv::template process_tile<1, 0, 3, 4, 2, 1>, + Conv::template process_tile<1, 0, 3, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 5, 0, 0>, + Conv::template process_tile<1, 0, 3, 5, 0, 1>, + Conv::template process_tile<1, 0, 3, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 5, 1, 0>, + Conv::template process_tile<1, 0, 3, 5, 1, 1>, + Conv::template process_tile<1, 0, 3, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 3, 5, 2, 0>, + Conv::template process_tile<1, 0, 3, 5, 2, 1>, + Conv::template process_tile<1, 0, 3, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 
0 + Conv::template process_tile<1, 0, 3, 6, 0, 0>, + Conv::template process_tile<1, 0, 3, 6, 0, 1>, + Conv::template process_tile<1, 0, 3, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 6, 1, 0>, + Conv::template process_tile<1, 0, 3, 6, 1, 1>, + Conv::template process_tile<1, 0, 3, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 3, 6, 2, 0>, + Conv::template process_tile<1, 0, 3, 6, 2, 1>, + Conv::template process_tile<1, 0, 3, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 3 + { // Input pad bottom = 4 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 0, 0, 0>, + Conv::template process_tile<1, 0, 4, 0, 0, 1>, + Conv::template process_tile<1, 0, 4, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 0, 1, 0>, + Conv::template process_tile<1, 0, 4, 0, 1, 1>, + Conv::template process_tile<1, 0, 4, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 4, 0, 2, 0>, + Conv::template process_tile<1, 0, 4, 0, 2, 1>, + Conv::template process_tile<1, 0, 4, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 1, 0, 0>, + Conv::template process_tile<1, 0, 4, 1, 0, 1>, + Conv::template process_tile<1, 0, 4, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 1, 1, 0>, + Conv::template process_tile<1, 0, 4, 1, 1, 1>, + Conv::template process_tile<1, 0, 4, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 4, 1, 2, 0>, + Conv::template process_tile<1, 0, 4, 1, 2, 1>, + Conv::template process_tile<1, 0, 4, 1, 2, 2>, + }, // Output pad bottom = 2 + }, 
// Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 2, 0, 0>, + Conv::template process_tile<1, 0, 4, 2, 0, 1>, + Conv::template process_tile<1, 0, 4, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 2, 1, 0>, + Conv::template process_tile<1, 0, 4, 2, 1, 1>, + Conv::template process_tile<1, 0, 4, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 4, 2, 2, 0>, + Conv::template process_tile<1, 0, 4, 2, 2, 1>, + Conv::template process_tile<1, 0, 4, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 3, 0, 0>, + Conv::template process_tile<1, 0, 4, 3, 0, 1>, + Conv::template process_tile<1, 0, 4, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 3, 1, 0>, + Conv::template process_tile<1, 0, 4, 3, 1, 1>, + Conv::template process_tile<1, 0, 4, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 4, 3, 2, 0>, + Conv::template process_tile<1, 0, 4, 3, 2, 1>, + Conv::template process_tile<1, 0, 4, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 4, 0, 0>, + Conv::template process_tile<1, 0, 4, 4, 0, 1>, + Conv::template process_tile<1, 0, 4, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 4, 1, 0>, + Conv::template process_tile<1, 0, 4, 4, 1, 1>, + Conv::template process_tile<1, 0, 4, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 4, 4, 2, 0>, + Conv::template process_tile<1, 0, 4, 4, 2, 1>, + Conv::template process_tile<1, 0, 4, 4, 2, 2>, + }, // Output 
pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 5, 0, 0>, + Conv::template process_tile<1, 0, 4, 5, 0, 1>, + Conv::template process_tile<1, 0, 4, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 5, 1, 0>, + Conv::template process_tile<1, 0, 4, 5, 1, 1>, + Conv::template process_tile<1, 0, 4, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 4, 5, 2, 0>, + Conv::template process_tile<1, 0, 4, 5, 2, 1>, + Conv::template process_tile<1, 0, 4, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 6, 0, 0>, + Conv::template process_tile<1, 0, 4, 6, 0, 1>, + Conv::template process_tile<1, 0, 4, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 6, 1, 0>, + Conv::template process_tile<1, 0, 4, 6, 1, 1>, + Conv::template process_tile<1, 0, 4, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 4, 6, 2, 0>, + Conv::template process_tile<1, 0, 4, 6, 2, 1>, + Conv::template process_tile<1, 0, 4, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 4 + { // Input pad bottom = 5 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 5, 0, 0, 0>, + Conv::template process_tile<1, 0, 5, 0, 0, 1>, + Conv::template process_tile<1, 0, 5, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 5, 0, 1, 0>, + Conv::template process_tile<1, 0, 5, 0, 1, 1>, + Conv::template process_tile<1, 0, 5, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 5, 0, 2, 0>, + Conv::template process_tile<1, 0, 
5, 0, 2, 1>, + Conv::template process_tile<1, 0, 5, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 5, 1, 0, 0>, + Conv::template process_tile<1, 0, 5, 1, 0, 1>, + Conv::template process_tile<1, 0, 5, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 5, 1, 1, 0>, + Conv::template process_tile<1, 0, 5, 1, 1, 1>, + Conv::template process_tile<1, 0, 5, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 5, 1, 2, 0>, + Conv::template process_tile<1, 0, 5, 1, 2, 1>, + Conv::template process_tile<1, 0, 5, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 5, 2, 0, 0>, + Conv::template process_tile<1, 0, 5, 2, 0, 1>, + Conv::template process_tile<1, 0, 5, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 5, 2, 1, 0>, + Conv::template process_tile<1, 0, 5, 2, 1, 1>, + Conv::template process_tile<1, 0, 5, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 5, 2, 2, 0>, + Conv::template process_tile<1, 0, 5, 2, 2, 1>, + Conv::template process_tile<1, 0, 5, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 5, 3, 0, 0>, + Conv::template process_tile<1, 0, 5, 3, 0, 1>, + Conv::template process_tile<1, 0, 5, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 5, 3, 1, 0>, + Conv::template process_tile<1, 0, 5, 3, 1, 1>, + Conv::template process_tile<1, 0, 5, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 5, 3, 2, 0>, + Conv::template 
process_tile<1, 0, 5, 3, 2, 1>, + Conv::template process_tile<1, 0, 5, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 5, 4, 0, 0>, + Conv::template process_tile<1, 0, 5, 4, 0, 1>, + Conv::template process_tile<1, 0, 5, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 5, 4, 1, 0>, + Conv::template process_tile<1, 0, 5, 4, 1, 1>, + Conv::template process_tile<1, 0, 5, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 5, 4, 2, 0>, + Conv::template process_tile<1, 0, 5, 4, 2, 1>, + Conv::template process_tile<1, 0, 5, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 5, 5, 0, 0>, + Conv::template process_tile<1, 0, 5, 5, 0, 1>, + Conv::template process_tile<1, 0, 5, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 5, 5, 1, 0>, + Conv::template process_tile<1, 0, 5, 5, 1, 1>, + Conv::template process_tile<1, 0, 5, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 5, 5, 2, 0>, + Conv::template process_tile<1, 0, 5, 5, 2, 1>, + Conv::template process_tile<1, 0, 5, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 5, 6, 0, 0>, + Conv::template process_tile<1, 0, 5, 6, 0, 1>, + Conv::template process_tile<1, 0, 5, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 5, 6, 1, 0>, + Conv::template process_tile<1, 0, 5, 6, 1, 1>, + Conv::template process_tile<1, 0, 5, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 5, 6, 2, 
0>, + Conv::template process_tile<1, 0, 5, 6, 2, 1>, + Conv::template process_tile<1, 0, 5, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 5 + { // Input pad bottom = 6 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 6, 0, 0, 0>, + Conv::template process_tile<1, 0, 6, 0, 0, 1>, + Conv::template process_tile<1, 0, 6, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 6, 0, 1, 0>, + Conv::template process_tile<1, 0, 6, 0, 1, 1>, + Conv::template process_tile<1, 0, 6, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 6, 0, 2, 0>, + Conv::template process_tile<1, 0, 6, 0, 2, 1>, + Conv::template process_tile<1, 0, 6, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 6, 1, 0, 0>, + Conv::template process_tile<1, 0, 6, 1, 0, 1>, + Conv::template process_tile<1, 0, 6, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 6, 1, 1, 0>, + Conv::template process_tile<1, 0, 6, 1, 1, 1>, + Conv::template process_tile<1, 0, 6, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 6, 1, 2, 0>, + Conv::template process_tile<1, 0, 6, 1, 2, 1>, + Conv::template process_tile<1, 0, 6, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 6, 2, 0, 0>, + Conv::template process_tile<1, 0, 6, 2, 0, 1>, + Conv::template process_tile<1, 0, 6, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 6, 2, 1, 0>, + Conv::template process_tile<1, 0, 6, 2, 1, 1>, + Conv::template process_tile<1, 0, 6, 2, 1, 2>, + }, // Output pad bottom = 
1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 6, 2, 2, 0>, + Conv::template process_tile<1, 0, 6, 2, 2, 1>, + Conv::template process_tile<1, 0, 6, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 6, 3, 0, 0>, + Conv::template process_tile<1, 0, 6, 3, 0, 1>, + Conv::template process_tile<1, 0, 6, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 6, 3, 1, 0>, + Conv::template process_tile<1, 0, 6, 3, 1, 1>, + Conv::template process_tile<1, 0, 6, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 6, 3, 2, 0>, + Conv::template process_tile<1, 0, 6, 3, 2, 1>, + Conv::template process_tile<1, 0, 6, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 6, 4, 0, 0>, + Conv::template process_tile<1, 0, 6, 4, 0, 1>, + Conv::template process_tile<1, 0, 6, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 6, 4, 1, 0>, + Conv::template process_tile<1, 0, 6, 4, 1, 1>, + Conv::template process_tile<1, 0, 6, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 6, 4, 2, 0>, + Conv::template process_tile<1, 0, 6, 4, 2, 1>, + Conv::template process_tile<1, 0, 6, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 6, 5, 0, 0>, + Conv::template process_tile<1, 0, 6, 5, 0, 1>, + Conv::template process_tile<1, 0, 6, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 6, 5, 1, 0>, + Conv::template process_tile<1, 0, 6, 5, 1, 1>, + Conv::template process_tile<1, 0, 6, 5, 1, 2>, + }, // 
Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 6, 5, 2, 0>, + Conv::template process_tile<1, 0, 6, 5, 2, 1>, + Conv::template process_tile<1, 0, 6, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 6, 6, 0, 0>, + Conv::template process_tile<1, 0, 6, 6, 0, 1>, + Conv::template process_tile<1, 0, 6, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 6, 6, 1, 0>, + Conv::template process_tile<1, 0, 6, 6, 1, 1>, + Conv::template process_tile<1, 0, 6, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 6, 6, 2, 0>, + Conv::template process_tile<1, 0, 6, 6, 2, 1>, + Conv::template process_tile<1, 0, 6, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 6 + }, // Input pad left = 0 + { // Input pad left = 1 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 0, 0, 0>, + Conv::template process_tile<1, 1, 0, 0, 0, 1>, + Conv::template process_tile<1, 1, 0, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 0, 1, 0>, + Conv::template process_tile<1, 1, 0, 0, 1, 1>, + Conv::template process_tile<1, 1, 0, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 0, 0, 2, 0>, + Conv::template process_tile<1, 1, 0, 0, 2, 1>, + Conv::template process_tile<1, 1, 0, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 1, 0, 0>, + Conv::template process_tile<1, 1, 0, 1, 0, 1>, + Conv::template process_tile<1, 1, 0, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template 
process_tile<1, 1, 0, 1, 1, 0>, + Conv::template process_tile<1, 1, 0, 1, 1, 1>, + Conv::template process_tile<1, 1, 0, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 0, 1, 2, 0>, + Conv::template process_tile<1, 1, 0, 1, 2, 1>, + Conv::template process_tile<1, 1, 0, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 2, 0, 0>, + Conv::template process_tile<1, 1, 0, 2, 0, 1>, + Conv::template process_tile<1, 1, 0, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 2, 1, 0>, + Conv::template process_tile<1, 1, 0, 2, 1, 1>, + Conv::template process_tile<1, 1, 0, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 0, 2, 2, 0>, + Conv::template process_tile<1, 1, 0, 2, 2, 1>, + Conv::template process_tile<1, 1, 0, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 3, 0, 0>, + Conv::template process_tile<1, 1, 0, 3, 0, 1>, + Conv::template process_tile<1, 1, 0, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 3, 1, 0>, + Conv::template process_tile<1, 1, 0, 3, 1, 1>, + Conv::template process_tile<1, 1, 0, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 0, 3, 2, 0>, + Conv::template process_tile<1, 1, 0, 3, 2, 1>, + Conv::template process_tile<1, 1, 0, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 4, 0, 0>, + Conv::template process_tile<1, 1, 0, 4, 0, 1>, + Conv::template process_tile<1, 1, 0, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 
1 + Conv::template process_tile<1, 1, 0, 4, 1, 0>, + Conv::template process_tile<1, 1, 0, 4, 1, 1>, + Conv::template process_tile<1, 1, 0, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 0, 4, 2, 0>, + Conv::template process_tile<1, 1, 0, 4, 2, 1>, + Conv::template process_tile<1, 1, 0, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 5, 0, 0>, + Conv::template process_tile<1, 1, 0, 5, 0, 1>, + Conv::template process_tile<1, 1, 0, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 5, 1, 0>, + Conv::template process_tile<1, 1, 0, 5, 1, 1>, + Conv::template process_tile<1, 1, 0, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 0, 5, 2, 0>, + Conv::template process_tile<1, 1, 0, 5, 2, 1>, + Conv::template process_tile<1, 1, 0, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 6, 0, 0>, + Conv::template process_tile<1, 1, 0, 6, 0, 1>, + Conv::template process_tile<1, 1, 0, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 6, 1, 0>, + Conv::template process_tile<1, 1, 0, 6, 1, 1>, + Conv::template process_tile<1, 1, 0, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 0, 6, 2, 0>, + Conv::template process_tile<1, 1, 0, 6, 2, 1>, + Conv::template process_tile<1, 1, 0, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 0, 0, 0>, + Conv::template process_tile<1, 1, 1, 0, 0, 1>, + Conv::template 
process_tile<1, 1, 1, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 0, 1, 0>, + Conv::template process_tile<1, 1, 1, 0, 1, 1>, + Conv::template process_tile<1, 1, 1, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 1, 0, 2, 0>, + Conv::template process_tile<1, 1, 1, 0, 2, 1>, + Conv::template process_tile<1, 1, 1, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 1, 0, 0>, + Conv::template process_tile<1, 1, 1, 1, 0, 1>, + Conv::template process_tile<1, 1, 1, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 1, 1, 0>, + Conv::template process_tile<1, 1, 1, 1, 1, 1>, + Conv::template process_tile<1, 1, 1, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 1, 1, 2, 0>, + Conv::template process_tile<1, 1, 1, 1, 2, 1>, + Conv::template process_tile<1, 1, 1, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 2, 0, 0>, + Conv::template process_tile<1, 1, 1, 2, 0, 1>, + Conv::template process_tile<1, 1, 1, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 2, 1, 0>, + Conv::template process_tile<1, 1, 1, 2, 1, 1>, + Conv::template process_tile<1, 1, 1, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 1, 2, 2, 0>, + Conv::template process_tile<1, 1, 1, 2, 2, 1>, + Conv::template process_tile<1, 1, 1, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 3, 0, 0>, + Conv::template process_tile<1, 1, 1, 3, 0, 
1>, + Conv::template process_tile<1, 1, 1, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 3, 1, 0>, + Conv::template process_tile<1, 1, 1, 3, 1, 1>, + Conv::template process_tile<1, 1, 1, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 1, 3, 2, 0>, + Conv::template process_tile<1, 1, 1, 3, 2, 1>, + Conv::template process_tile<1, 1, 1, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 4, 0, 0>, + Conv::template process_tile<1, 1, 1, 4, 0, 1>, + Conv::template process_tile<1, 1, 1, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 4, 1, 0>, + Conv::template process_tile<1, 1, 1, 4, 1, 1>, + Conv::template process_tile<1, 1, 1, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 1, 4, 2, 0>, + Conv::template process_tile<1, 1, 1, 4, 2, 1>, + Conv::template process_tile<1, 1, 1, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 5, 0, 0>, + Conv::template process_tile<1, 1, 1, 5, 0, 1>, + Conv::template process_tile<1, 1, 1, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 5, 1, 0>, + Conv::template process_tile<1, 1, 1, 5, 1, 1>, + Conv::template process_tile<1, 1, 1, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 1, 5, 2, 0>, + Conv::template process_tile<1, 1, 1, 5, 2, 1>, + Conv::template process_tile<1, 1, 1, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 6, 0, 0>, + Conv::template 
process_tile<1, 1, 1, 6, 0, 1>, + Conv::template process_tile<1, 1, 1, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 6, 1, 0>, + Conv::template process_tile<1, 1, 1, 6, 1, 1>, + Conv::template process_tile<1, 1, 1, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 1, 6, 2, 0>, + Conv::template process_tile<1, 1, 1, 6, 2, 1>, + Conv::template process_tile<1, 1, 1, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 0, 0, 0>, + Conv::template process_tile<1, 1, 2, 0, 0, 1>, + Conv::template process_tile<1, 1, 2, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 0, 1, 0>, + Conv::template process_tile<1, 1, 2, 0, 1, 1>, + Conv::template process_tile<1, 1, 2, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 2, 0, 2, 0>, + Conv::template process_tile<1, 1, 2, 0, 2, 1>, + Conv::template process_tile<1, 1, 2, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 1, 0, 0>, + Conv::template process_tile<1, 1, 2, 1, 0, 1>, + Conv::template process_tile<1, 1, 2, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 1, 1, 0>, + Conv::template process_tile<1, 1, 2, 1, 1, 1>, + Conv::template process_tile<1, 1, 2, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 2, 1, 2, 0>, + Conv::template process_tile<1, 1, 2, 1, 2, 1>, + Conv::template process_tile<1, 1, 2, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad 
bottom = 0 + Conv::template process_tile<1, 1, 2, 2, 0, 0>, + Conv::template process_tile<1, 1, 2, 2, 0, 1>, + Conv::template process_tile<1, 1, 2, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 2, 1, 0>, + Conv::template process_tile<1, 1, 2, 2, 1, 1>, + Conv::template process_tile<1, 1, 2, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 2, 2, 2, 0>, + Conv::template process_tile<1, 1, 2, 2, 2, 1>, + Conv::template process_tile<1, 1, 2, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 3, 0, 0>, + Conv::template process_tile<1, 1, 2, 3, 0, 1>, + Conv::template process_tile<1, 1, 2, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 3, 1, 0>, + Conv::template process_tile<1, 1, 2, 3, 1, 1>, + Conv::template process_tile<1, 1, 2, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 2, 3, 2, 0>, + Conv::template process_tile<1, 1, 2, 3, 2, 1>, + Conv::template process_tile<1, 1, 2, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 4, 0, 0>, + Conv::template process_tile<1, 1, 2, 4, 0, 1>, + Conv::template process_tile<1, 1, 2, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 4, 1, 0>, + Conv::template process_tile<1, 1, 2, 4, 1, 1>, + Conv::template process_tile<1, 1, 2, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 2, 4, 2, 0>, + Conv::template process_tile<1, 1, 2, 4, 2, 1>, + Conv::template process_tile<1, 1, 2, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 
5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 5, 0, 0>, + Conv::template process_tile<1, 1, 2, 5, 0, 1>, + Conv::template process_tile<1, 1, 2, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 5, 1, 0>, + Conv::template process_tile<1, 1, 2, 5, 1, 1>, + Conv::template process_tile<1, 1, 2, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 2, 5, 2, 0>, + Conv::template process_tile<1, 1, 2, 5, 2, 1>, + Conv::template process_tile<1, 1, 2, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 6, 0, 0>, + Conv::template process_tile<1, 1, 2, 6, 0, 1>, + Conv::template process_tile<1, 1, 2, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 6, 1, 0>, + Conv::template process_tile<1, 1, 2, 6, 1, 1>, + Conv::template process_tile<1, 1, 2, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 2, 6, 2, 0>, + Conv::template process_tile<1, 1, 2, 6, 2, 1>, + Conv::template process_tile<1, 1, 2, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 0, 0, 0>, + Conv::template process_tile<1, 1, 3, 0, 0, 1>, + Conv::template process_tile<1, 1, 3, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 0, 1, 0>, + Conv::template process_tile<1, 1, 3, 0, 1, 1>, + Conv::template process_tile<1, 1, 3, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 3, 0, 2, 0>, + Conv::template process_tile<1, 1, 3, 0, 2, 1>, + Conv::template process_tile<1, 1, 3, 0, 2, 2>, + }, 
// Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 1, 0, 0>, + Conv::template process_tile<1, 1, 3, 1, 0, 1>, + Conv::template process_tile<1, 1, 3, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 1, 1, 0>, + Conv::template process_tile<1, 1, 3, 1, 1, 1>, + Conv::template process_tile<1, 1, 3, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 3, 1, 2, 0>, + Conv::template process_tile<1, 1, 3, 1, 2, 1>, + Conv::template process_tile<1, 1, 3, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 2, 0, 0>, + Conv::template process_tile<1, 1, 3, 2, 0, 1>, + Conv::template process_tile<1, 1, 3, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 2, 1, 0>, + Conv::template process_tile<1, 1, 3, 2, 1, 1>, + Conv::template process_tile<1, 1, 3, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 3, 2, 2, 0>, + Conv::template process_tile<1, 1, 3, 2, 2, 1>, + Conv::template process_tile<1, 1, 3, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 3, 0, 0>, + Conv::template process_tile<1, 1, 3, 3, 0, 1>, + Conv::template process_tile<1, 1, 3, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 3, 1, 0>, + Conv::template process_tile<1, 1, 3, 3, 1, 1>, + Conv::template process_tile<1, 1, 3, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 3, 3, 2, 0>, + Conv::template process_tile<1, 1, 3, 3, 2, 1>, + Conv::template process_tile<1, 
1, 3, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 4, 0, 0>, + Conv::template process_tile<1, 1, 3, 4, 0, 1>, + Conv::template process_tile<1, 1, 3, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 4, 1, 0>, + Conv::template process_tile<1, 1, 3, 4, 1, 1>, + Conv::template process_tile<1, 1, 3, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 3, 4, 2, 0>, + Conv::template process_tile<1, 1, 3, 4, 2, 1>, + Conv::template process_tile<1, 1, 3, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 5, 0, 0>, + Conv::template process_tile<1, 1, 3, 5, 0, 1>, + Conv::template process_tile<1, 1, 3, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 5, 1, 0>, + Conv::template process_tile<1, 1, 3, 5, 1, 1>, + Conv::template process_tile<1, 1, 3, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 3, 5, 2, 0>, + Conv::template process_tile<1, 1, 3, 5, 2, 1>, + Conv::template process_tile<1, 1, 3, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 6, 0, 0>, + Conv::template process_tile<1, 1, 3, 6, 0, 1>, + Conv::template process_tile<1, 1, 3, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 6, 1, 0>, + Conv::template process_tile<1, 1, 3, 6, 1, 1>, + Conv::template process_tile<1, 1, 3, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 3, 6, 2, 0>, + Conv::template process_tile<1, 1, 3, 6, 2, 1>, + 
Conv::template process_tile<1, 1, 3, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 3 + { // Input pad bottom = 4 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 0, 0, 0>, + Conv::template process_tile<1, 1, 4, 0, 0, 1>, + Conv::template process_tile<1, 1, 4, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 4, 0, 1, 0>, + Conv::template process_tile<1, 1, 4, 0, 1, 1>, + Conv::template process_tile<1, 1, 4, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 4, 0, 2, 0>, + Conv::template process_tile<1, 1, 4, 0, 2, 1>, + Conv::template process_tile<1, 1, 4, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 1, 0, 0>, + Conv::template process_tile<1, 1, 4, 1, 0, 1>, + Conv::template process_tile<1, 1, 4, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 4, 1, 1, 0>, + Conv::template process_tile<1, 1, 4, 1, 1, 1>, + Conv::template process_tile<1, 1, 4, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 4, 1, 2, 0>, + Conv::template process_tile<1, 1, 4, 1, 2, 1>, + Conv::template process_tile<1, 1, 4, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 2, 0, 0>, + Conv::template process_tile<1, 1, 4, 2, 0, 1>, + Conv::template process_tile<1, 1, 4, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 4, 2, 1, 0>, + Conv::template process_tile<1, 1, 4, 2, 1, 1>, + Conv::template process_tile<1, 1, 4, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template 
process_tile<1, 1, 4, 2, 2, 0>, + Conv::template process_tile<1, 1, 4, 2, 2, 1>, + Conv::template process_tile<1, 1, 4, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 3, 0, 0>, + Conv::template process_tile<1, 1, 4, 3, 0, 1>, + Conv::template process_tile<1, 1, 4, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 4, 3, 1, 0>, + Conv::template process_tile<1, 1, 4, 3, 1, 1>, + Conv::template process_tile<1, 1, 4, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 4, 3, 2, 0>, + Conv::template process_tile<1, 1, 4, 3, 2, 1>, + Conv::template process_tile<1, 1, 4, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 4, 0, 0>, + Conv::template process_tile<1, 1, 4, 4, 0, 1>, + Conv::template process_tile<1, 1, 4, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 4, 4, 1, 0>, + Conv::template process_tile<1, 1, 4, 4, 1, 1>, + Conv::template process_tile<1, 1, 4, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 4, 4, 2, 0>, + Conv::template process_tile<1, 1, 4, 4, 2, 1>, + Conv::template process_tile<1, 1, 4, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 5, 0, 0>, + Conv::template process_tile<1, 1, 4, 5, 0, 1>, + Conv::template process_tile<1, 1, 4, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 4, 5, 1, 0>, + Conv::template process_tile<1, 1, 4, 5, 1, 1>, + Conv::template process_tile<1, 1, 4, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 
2 + Conv::template process_tile<1, 1, 4, 5, 2, 0>, + Conv::template process_tile<1, 1, 4, 5, 2, 1>, + Conv::template process_tile<1, 1, 4, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 6, 0, 0>, + Conv::template process_tile<1, 1, 4, 6, 0, 1>, + Conv::template process_tile<1, 1, 4, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 4, 6, 1, 0>, + Conv::template process_tile<1, 1, 4, 6, 1, 1>, + Conv::template process_tile<1, 1, 4, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 4, 6, 2, 0>, + Conv::template process_tile<1, 1, 4, 6, 2, 1>, + Conv::template process_tile<1, 1, 4, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 4 + { // Input pad bottom = 5 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 5, 0, 0, 0>, + Conv::template process_tile<1, 1, 5, 0, 0, 1>, + Conv::template process_tile<1, 1, 5, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 5, 0, 1, 0>, + Conv::template process_tile<1, 1, 5, 0, 1, 1>, + Conv::template process_tile<1, 1, 5, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 5, 0, 2, 0>, + Conv::template process_tile<1, 1, 5, 0, 2, 1>, + Conv::template process_tile<1, 1, 5, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 5, 1, 0, 0>, + Conv::template process_tile<1, 1, 5, 1, 0, 1>, + Conv::template process_tile<1, 1, 5, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 5, 1, 1, 0>, + Conv::template process_tile<1, 1, 5, 1, 1, 1>, + Conv::template 
process_tile<1, 1, 5, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 5, 1, 2, 0>, + Conv::template process_tile<1, 1, 5, 1, 2, 1>, + Conv::template process_tile<1, 1, 5, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 5, 2, 0, 0>, + Conv::template process_tile<1, 1, 5, 2, 0, 1>, + Conv::template process_tile<1, 1, 5, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 5, 2, 1, 0>, + Conv::template process_tile<1, 1, 5, 2, 1, 1>, + Conv::template process_tile<1, 1, 5, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 5, 2, 2, 0>, + Conv::template process_tile<1, 1, 5, 2, 2, 1>, + Conv::template process_tile<1, 1, 5, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 5, 3, 0, 0>, + Conv::template process_tile<1, 1, 5, 3, 0, 1>, + Conv::template process_tile<1, 1, 5, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 5, 3, 1, 0>, + Conv::template process_tile<1, 1, 5, 3, 1, 1>, + Conv::template process_tile<1, 1, 5, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 5, 3, 2, 0>, + Conv::template process_tile<1, 1, 5, 3, 2, 1>, + Conv::template process_tile<1, 1, 5, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 5, 4, 0, 0>, + Conv::template process_tile<1, 1, 5, 4, 0, 1>, + Conv::template process_tile<1, 1, 5, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 5, 4, 1, 0>, + Conv::template process_tile<1, 1, 5, 4, 1, 
1>, + Conv::template process_tile<1, 1, 5, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 5, 4, 2, 0>, + Conv::template process_tile<1, 1, 5, 4, 2, 1>, + Conv::template process_tile<1, 1, 5, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 5, 5, 0, 0>, + Conv::template process_tile<1, 1, 5, 5, 0, 1>, + Conv::template process_tile<1, 1, 5, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 5, 5, 1, 0>, + Conv::template process_tile<1, 1, 5, 5, 1, 1>, + Conv::template process_tile<1, 1, 5, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 5, 5, 2, 0>, + Conv::template process_tile<1, 1, 5, 5, 2, 1>, + Conv::template process_tile<1, 1, 5, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 5, 6, 0, 0>, + Conv::template process_tile<1, 1, 5, 6, 0, 1>, + Conv::template process_tile<1, 1, 5, 6, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 5, 6, 1, 0>, + Conv::template process_tile<1, 1, 5, 6, 1, 1>, + Conv::template process_tile<1, 1, 5, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 5, 6, 2, 0>, + Conv::template process_tile<1, 1, 5, 6, 2, 1>, + Conv::template process_tile<1, 1, 5, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 5 + { // Input pad bottom = 6 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 6, 0, 0, 0>, + Conv::template process_tile<1, 1, 6, 0, 0, 1>, + Conv::template process_tile<1, 1, 6, 0, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template 
process_tile<1, 1, 6, 0, 1, 0>, + Conv::template process_tile<1, 1, 6, 0, 1, 1>, + Conv::template process_tile<1, 1, 6, 0, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 6, 0, 2, 0>, + Conv::template process_tile<1, 1, 6, 0, 2, 1>, + Conv::template process_tile<1, 1, 6, 0, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 6, 1, 0, 0>, + Conv::template process_tile<1, 1, 6, 1, 0, 1>, + Conv::template process_tile<1, 1, 6, 1, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 6, 1, 1, 0>, + Conv::template process_tile<1, 1, 6, 1, 1, 1>, + Conv::template process_tile<1, 1, 6, 1, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 6, 1, 2, 0>, + Conv::template process_tile<1, 1, 6, 1, 2, 1>, + Conv::template process_tile<1, 1, 6, 1, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 6, 2, 0, 0>, + Conv::template process_tile<1, 1, 6, 2, 0, 1>, + Conv::template process_tile<1, 1, 6, 2, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 6, 2, 1, 0>, + Conv::template process_tile<1, 1, 6, 2, 1, 1>, + Conv::template process_tile<1, 1, 6, 2, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 6, 2, 2, 0>, + Conv::template process_tile<1, 1, 6, 2, 2, 1>, + Conv::template process_tile<1, 1, 6, 2, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 6, 3, 0, 0>, + Conv::template process_tile<1, 1, 6, 3, 0, 1>, + Conv::template process_tile<1, 1, 6, 3, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 
1 + Conv::template process_tile<1, 1, 6, 3, 1, 0>, + Conv::template process_tile<1, 1, 6, 3, 1, 1>, + Conv::template process_tile<1, 1, 6, 3, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 6, 3, 2, 0>, + Conv::template process_tile<1, 1, 6, 3, 2, 1>, + Conv::template process_tile<1, 1, 6, 3, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 6, 4, 0, 0>, + Conv::template process_tile<1, 1, 6, 4, 0, 1>, + Conv::template process_tile<1, 1, 6, 4, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 6, 4, 1, 0>, + Conv::template process_tile<1, 1, 6, 4, 1, 1>, + Conv::template process_tile<1, 1, 6, 4, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 6, 4, 2, 0>, + Conv::template process_tile<1, 1, 6, 4, 2, 1>, + Conv::template process_tile<1, 1, 6, 4, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 6, 5, 0, 0>, + Conv::template process_tile<1, 1, 6, 5, 0, 1>, + Conv::template process_tile<1, 1, 6, 5, 0, 2>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 6, 5, 1, 0>, + Conv::template process_tile<1, 1, 6, 5, 1, 1>, + Conv::template process_tile<1, 1, 6, 5, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 6, 5, 2, 0>, + Conv::template process_tile<1, 1, 6, 5, 2, 1>, + Conv::template process_tile<1, 1, 6, 5, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 6, 6, 0, 0>, + Conv::template process_tile<1, 1, 6, 6, 0, 1>, + Conv::template process_tile<1, 1, 6, 6, 0, 2>, + }, // Output pad bottom = 0 + { // 
Output pad bottom = 1 + Conv::template process_tile<1, 1, 6, 6, 1, 0>, + Conv::template process_tile<1, 1, 6, 6, 1, 1>, + Conv::template process_tile<1, 1, 6, 6, 1, 2>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 6, 6, 2, 0>, + Conv::template process_tile<1, 1, 6, 6, 2, 1>, + Conv::template process_tile<1, 1, 6, 6, 2, 2>, + }, // Output pad bottom = 2 + }, // Input pad right = 6 + }, // Input pad bottom = 6 + }, // Input pad left = 1 + }, // Input pad top = 1 +}; + + +template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>; +} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp new file mode 100644 index 0000000000..a1aaaa078c --- /dev/null +++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp @@ -0,0 +1,2695 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp" + +namespace depthwise +{ +using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>; +using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 1, 1, float, float>; + +template <> +const Conv::TileFn Conv::tile_fns + [max_in_pad_top] + [max_in_pad_left] + [max_in_pad_bottom] + [max_in_pad_right] + [max_out_pad_bottom] + [max_out_pad_right] = { + { // Input pad top = 0 + { // Input pad left = 0 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 0, 0, 0, 0>, + ConvImpl::template process_tile<0, 0, 0, 0, 0, 1>, + ConvImpl::template process_tile<0, 0, 0, 0, 0, 2>, + ConvImpl::template process_tile<0, 0, 0, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 0, 0, 1, 0>, + ConvImpl::template process_tile<0, 0, 0, 0, 1, 1>, + ConvImpl::template process_tile<0, 0, 0, 0, 1, 2>, + ConvImpl::template process_tile<0, 0, 0, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 0, 0, 2, 0>, + ConvImpl::template process_tile<0, 0, 0, 0, 2, 1>, + ConvImpl::template process_tile<0, 0, 0, 0, 2, 2>, + ConvImpl::template process_tile<0, 0, 0, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 0, 0, 3, 0>, + ConvImpl::template process_tile<0, 0, 0, 0, 3, 1>, + ConvImpl::template process_tile<0, 0, 0, 0, 3, 2>, + ConvImpl::template process_tile<0, 0, 0, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + 
ConvImpl::template process_tile<0, 0, 0, 1, 0, 0>, + ConvImpl::template process_tile<0, 0, 0, 1, 0, 1>, + ConvImpl::template process_tile<0, 0, 0, 1, 0, 2>, + ConvImpl::template process_tile<0, 0, 0, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 0, 1, 1, 0>, + ConvImpl::template process_tile<0, 0, 0, 1, 1, 1>, + ConvImpl::template process_tile<0, 0, 0, 1, 1, 2>, + ConvImpl::template process_tile<0, 0, 0, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 0, 1, 2, 0>, + ConvImpl::template process_tile<0, 0, 0, 1, 2, 1>, + ConvImpl::template process_tile<0, 0, 0, 1, 2, 2>, + ConvImpl::template process_tile<0, 0, 0, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 0, 1, 3, 0>, + ConvImpl::template process_tile<0, 0, 0, 1, 3, 1>, + ConvImpl::template process_tile<0, 0, 0, 1, 3, 2>, + ConvImpl::template process_tile<0, 0, 0, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 0, 2, 0, 0>, + ConvImpl::template process_tile<0, 0, 0, 2, 0, 1>, + ConvImpl::template process_tile<0, 0, 0, 2, 0, 2>, + ConvImpl::template process_tile<0, 0, 0, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 0, 2, 1, 0>, + ConvImpl::template process_tile<0, 0, 0, 2, 1, 1>, + ConvImpl::template process_tile<0, 0, 0, 2, 1, 2>, + ConvImpl::template process_tile<0, 0, 0, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 0, 2, 2, 0>, + ConvImpl::template process_tile<0, 0, 0, 2, 2, 1>, + ConvImpl::template process_tile<0, 0, 0, 2, 2, 2>, + ConvImpl::template process_tile<0, 0, 0, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 0, 2, 3, 
0>, + ConvImpl::template process_tile<0, 0, 0, 2, 3, 1>, + ConvImpl::template process_tile<0, 0, 0, 2, 3, 2>, + ConvImpl::template process_tile<0, 0, 0, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 0, 3, 0, 0>, + ConvImpl::template process_tile<0, 0, 0, 3, 0, 1>, + ConvImpl::template process_tile<0, 0, 0, 3, 0, 2>, + ConvImpl::template process_tile<0, 0, 0, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 0, 3, 1, 0>, + ConvImpl::template process_tile<0, 0, 0, 3, 1, 1>, + ConvImpl::template process_tile<0, 0, 0, 3, 1, 2>, + ConvImpl::template process_tile<0, 0, 0, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 0, 3, 2, 0>, + ConvImpl::template process_tile<0, 0, 0, 3, 2, 1>, + ConvImpl::template process_tile<0, 0, 0, 3, 2, 2>, + ConvImpl::template process_tile<0, 0, 0, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 0, 3, 3, 0>, + ConvImpl::template process_tile<0, 0, 0, 3, 3, 1>, + ConvImpl::template process_tile<0, 0, 0, 3, 3, 2>, + ConvImpl::template process_tile<0, 0, 0, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 0, 4, 0, 0>, + ConvImpl::template process_tile<0, 0, 0, 4, 0, 1>, + ConvImpl::template process_tile<0, 0, 0, 4, 0, 2>, + ConvImpl::template process_tile<0, 0, 0, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 0, 4, 1, 0>, + ConvImpl::template process_tile<0, 0, 0, 4, 1, 1>, + ConvImpl::template process_tile<0, 0, 0, 4, 1, 2>, + ConvImpl::template process_tile<0, 0, 0, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 
0, 4, 2, 0>, + ConvImpl::template process_tile<0, 0, 0, 4, 2, 1>, + ConvImpl::template process_tile<0, 0, 0, 4, 2, 2>, + ConvImpl::template process_tile<0, 0, 0, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 0, 4, 3, 0>, + ConvImpl::template process_tile<0, 0, 0, 4, 3, 1>, + ConvImpl::template process_tile<0, 0, 0, 4, 3, 2>, + ConvImpl::template process_tile<0, 0, 0, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 1, 0, 0, 0>, + ConvImpl::template process_tile<0, 0, 1, 0, 0, 1>, + ConvImpl::template process_tile<0, 0, 1, 0, 0, 2>, + ConvImpl::template process_tile<0, 0, 1, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 1, 0, 1, 0>, + ConvImpl::template process_tile<0, 0, 1, 0, 1, 1>, + ConvImpl::template process_tile<0, 0, 1, 0, 1, 2>, + ConvImpl::template process_tile<0, 0, 1, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 1, 0, 2, 0>, + ConvImpl::template process_tile<0, 0, 1, 0, 2, 1>, + ConvImpl::template process_tile<0, 0, 1, 0, 2, 2>, + ConvImpl::template process_tile<0, 0, 1, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 1, 0, 3, 0>, + ConvImpl::template process_tile<0, 0, 1, 0, 3, 1>, + ConvImpl::template process_tile<0, 0, 1, 0, 3, 2>, + ConvImpl::template process_tile<0, 0, 1, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 1, 1, 0, 0>, + ConvImpl::template process_tile<0, 0, 1, 1, 0, 1>, + ConvImpl::template process_tile<0, 0, 1, 1, 0, 2>, + ConvImpl::template process_tile<0, 0, 1, 1, 0, 3>, + }, // Output pad bottom = 0 + { 
// Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 1, 1, 1, 0>, + ConvImpl::template process_tile<0, 0, 1, 1, 1, 1>, + ConvImpl::template process_tile<0, 0, 1, 1, 1, 2>, + ConvImpl::template process_tile<0, 0, 1, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 1, 1, 2, 0>, + ConvImpl::template process_tile<0, 0, 1, 1, 2, 1>, + ConvImpl::template process_tile<0, 0, 1, 1, 2, 2>, + ConvImpl::template process_tile<0, 0, 1, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 1, 1, 3, 0>, + ConvImpl::template process_tile<0, 0, 1, 1, 3, 1>, + ConvImpl::template process_tile<0, 0, 1, 1, 3, 2>, + ConvImpl::template process_tile<0, 0, 1, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 1, 2, 0, 0>, + ConvImpl::template process_tile<0, 0, 1, 2, 0, 1>, + ConvImpl::template process_tile<0, 0, 1, 2, 0, 2>, + ConvImpl::template process_tile<0, 0, 1, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 1, 2, 1, 0>, + ConvImpl::template process_tile<0, 0, 1, 2, 1, 1>, + ConvImpl::template process_tile<0, 0, 1, 2, 1, 2>, + ConvImpl::template process_tile<0, 0, 1, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 1, 2, 2, 0>, + ConvImpl::template process_tile<0, 0, 1, 2, 2, 1>, + ConvImpl::template process_tile<0, 0, 1, 2, 2, 2>, + ConvImpl::template process_tile<0, 0, 1, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 1, 2, 3, 0>, + ConvImpl::template process_tile<0, 0, 1, 2, 3, 1>, + ConvImpl::template process_tile<0, 0, 1, 2, 3, 2>, + ConvImpl::template process_tile<0, 0, 1, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right 
= 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 1, 3, 0, 0>, + ConvImpl::template process_tile<0, 0, 1, 3, 0, 1>, + ConvImpl::template process_tile<0, 0, 1, 3, 0, 2>, + ConvImpl::template process_tile<0, 0, 1, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 1, 3, 1, 0>, + ConvImpl::template process_tile<0, 0, 1, 3, 1, 1>, + ConvImpl::template process_tile<0, 0, 1, 3, 1, 2>, + ConvImpl::template process_tile<0, 0, 1, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 1, 3, 2, 0>, + ConvImpl::template process_tile<0, 0, 1, 3, 2, 1>, + ConvImpl::template process_tile<0, 0, 1, 3, 2, 2>, + ConvImpl::template process_tile<0, 0, 1, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 1, 3, 3, 0>, + ConvImpl::template process_tile<0, 0, 1, 3, 3, 1>, + ConvImpl::template process_tile<0, 0, 1, 3, 3, 2>, + ConvImpl::template process_tile<0, 0, 1, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 1, 4, 0, 0>, + ConvImpl::template process_tile<0, 0, 1, 4, 0, 1>, + ConvImpl::template process_tile<0, 0, 1, 4, 0, 2>, + ConvImpl::template process_tile<0, 0, 1, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 1, 4, 1, 0>, + ConvImpl::template process_tile<0, 0, 1, 4, 1, 1>, + ConvImpl::template process_tile<0, 0, 1, 4, 1, 2>, + ConvImpl::template process_tile<0, 0, 1, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 1, 4, 2, 0>, + ConvImpl::template process_tile<0, 0, 1, 4, 2, 1>, + ConvImpl::template process_tile<0, 0, 1, 4, 2, 2>, + ConvImpl::template process_tile<0, 0, 1, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + 
ConvImpl::template process_tile<0, 0, 1, 4, 3, 0>, + ConvImpl::template process_tile<0, 0, 1, 4, 3, 1>, + ConvImpl::template process_tile<0, 0, 1, 4, 3, 2>, + ConvImpl::template process_tile<0, 0, 1, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 2, 0, 0, 0>, + ConvImpl::template process_tile<0, 0, 2, 0, 0, 1>, + ConvImpl::template process_tile<0, 0, 2, 0, 0, 2>, + ConvImpl::template process_tile<0, 0, 2, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 2, 0, 1, 0>, + ConvImpl::template process_tile<0, 0, 2, 0, 1, 1>, + ConvImpl::template process_tile<0, 0, 2, 0, 1, 2>, + ConvImpl::template process_tile<0, 0, 2, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 2, 0, 2, 0>, + ConvImpl::template process_tile<0, 0, 2, 0, 2, 1>, + ConvImpl::template process_tile<0, 0, 2, 0, 2, 2>, + ConvImpl::template process_tile<0, 0, 2, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 2, 0, 3, 0>, + ConvImpl::template process_tile<0, 0, 2, 0, 3, 1>, + ConvImpl::template process_tile<0, 0, 2, 0, 3, 2>, + ConvImpl::template process_tile<0, 0, 2, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 2, 1, 0, 0>, + ConvImpl::template process_tile<0, 0, 2, 1, 0, 1>, + ConvImpl::template process_tile<0, 0, 2, 1, 0, 2>, + ConvImpl::template process_tile<0, 0, 2, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 2, 1, 1, 0>, + ConvImpl::template process_tile<0, 0, 2, 1, 1, 1>, + ConvImpl::template process_tile<0, 0, 2, 1, 1, 2>, + ConvImpl::template process_tile<0, 0, 2, 1, 1, 
3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 2, 1, 2, 0>, + ConvImpl::template process_tile<0, 0, 2, 1, 2, 1>, + ConvImpl::template process_tile<0, 0, 2, 1, 2, 2>, + ConvImpl::template process_tile<0, 0, 2, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 2, 1, 3, 0>, + ConvImpl::template process_tile<0, 0, 2, 1, 3, 1>, + ConvImpl::template process_tile<0, 0, 2, 1, 3, 2>, + ConvImpl::template process_tile<0, 0, 2, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 2, 2, 0, 0>, + ConvImpl::template process_tile<0, 0, 2, 2, 0, 1>, + ConvImpl::template process_tile<0, 0, 2, 2, 0, 2>, + ConvImpl::template process_tile<0, 0, 2, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 2, 2, 1, 0>, + ConvImpl::template process_tile<0, 0, 2, 2, 1, 1>, + ConvImpl::template process_tile<0, 0, 2, 2, 1, 2>, + ConvImpl::template process_tile<0, 0, 2, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 2, 2, 2, 0>, + ConvImpl::template process_tile<0, 0, 2, 2, 2, 1>, + ConvImpl::template process_tile<0, 0, 2, 2, 2, 2>, + ConvImpl::template process_tile<0, 0, 2, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 2, 2, 3, 0>, + ConvImpl::template process_tile<0, 0, 2, 2, 3, 1>, + ConvImpl::template process_tile<0, 0, 2, 2, 3, 2>, + ConvImpl::template process_tile<0, 0, 2, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 2, 3, 0, 0>, + ConvImpl::template process_tile<0, 0, 2, 3, 0, 1>, + ConvImpl::template process_tile<0, 0, 2, 3, 0, 2>, + ConvImpl::template process_tile<0, 0, 
2, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 2, 3, 1, 0>, + ConvImpl::template process_tile<0, 0, 2, 3, 1, 1>, + ConvImpl::template process_tile<0, 0, 2, 3, 1, 2>, + ConvImpl::template process_tile<0, 0, 2, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 2, 3, 2, 0>, + ConvImpl::template process_tile<0, 0, 2, 3, 2, 1>, + ConvImpl::template process_tile<0, 0, 2, 3, 2, 2>, + ConvImpl::template process_tile<0, 0, 2, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 2, 3, 3, 0>, + ConvImpl::template process_tile<0, 0, 2, 3, 3, 1>, + ConvImpl::template process_tile<0, 0, 2, 3, 3, 2>, + ConvImpl::template process_tile<0, 0, 2, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 2, 4, 0, 0>, + ConvImpl::template process_tile<0, 0, 2, 4, 0, 1>, + ConvImpl::template process_tile<0, 0, 2, 4, 0, 2>, + ConvImpl::template process_tile<0, 0, 2, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 2, 4, 1, 0>, + ConvImpl::template process_tile<0, 0, 2, 4, 1, 1>, + ConvImpl::template process_tile<0, 0, 2, 4, 1, 2>, + ConvImpl::template process_tile<0, 0, 2, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 2, 4, 2, 0>, + ConvImpl::template process_tile<0, 0, 2, 4, 2, 1>, + ConvImpl::template process_tile<0, 0, 2, 4, 2, 2>, + ConvImpl::template process_tile<0, 0, 2, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 2, 4, 3, 0>, + ConvImpl::template process_tile<0, 0, 2, 4, 3, 1>, + ConvImpl::template process_tile<0, 0, 2, 4, 3, 2>, + ConvImpl::template process_tile<0, 0, 2, 4, 3, 3>, + }, // Output pad bottom = 3 + }, 
// Input pad right = 4 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 3, 0, 0, 0>, + ConvImpl::template process_tile<0, 0, 3, 0, 0, 1>, + ConvImpl::template process_tile<0, 0, 3, 0, 0, 2>, + ConvImpl::template process_tile<0, 0, 3, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 3, 0, 1, 0>, + ConvImpl::template process_tile<0, 0, 3, 0, 1, 1>, + ConvImpl::template process_tile<0, 0, 3, 0, 1, 2>, + ConvImpl::template process_tile<0, 0, 3, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 3, 0, 2, 0>, + ConvImpl::template process_tile<0, 0, 3, 0, 2, 1>, + ConvImpl::template process_tile<0, 0, 3, 0, 2, 2>, + ConvImpl::template process_tile<0, 0, 3, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 3, 0, 3, 0>, + ConvImpl::template process_tile<0, 0, 3, 0, 3, 1>, + ConvImpl::template process_tile<0, 0, 3, 0, 3, 2>, + ConvImpl::template process_tile<0, 0, 3, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 3, 1, 0, 0>, + ConvImpl::template process_tile<0, 0, 3, 1, 0, 1>, + ConvImpl::template process_tile<0, 0, 3, 1, 0, 2>, + ConvImpl::template process_tile<0, 0, 3, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 3, 1, 1, 0>, + ConvImpl::template process_tile<0, 0, 3, 1, 1, 1>, + ConvImpl::template process_tile<0, 0, 3, 1, 1, 2>, + ConvImpl::template process_tile<0, 0, 3, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 3, 1, 2, 0>, + ConvImpl::template process_tile<0, 0, 3, 1, 2, 1>, + ConvImpl::template process_tile<0, 0, 3, 1, 2, 2>, + ConvImpl::template 
process_tile<0, 0, 3, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 3, 1, 3, 0>, + ConvImpl::template process_tile<0, 0, 3, 1, 3, 1>, + ConvImpl::template process_tile<0, 0, 3, 1, 3, 2>, + ConvImpl::template process_tile<0, 0, 3, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 3, 2, 0, 0>, + ConvImpl::template process_tile<0, 0, 3, 2, 0, 1>, + ConvImpl::template process_tile<0, 0, 3, 2, 0, 2>, + ConvImpl::template process_tile<0, 0, 3, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 3, 2, 1, 0>, + ConvImpl::template process_tile<0, 0, 3, 2, 1, 1>, + ConvImpl::template process_tile<0, 0, 3, 2, 1, 2>, + ConvImpl::template process_tile<0, 0, 3, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 3, 2, 2, 0>, + ConvImpl::template process_tile<0, 0, 3, 2, 2, 1>, + ConvImpl::template process_tile<0, 0, 3, 2, 2, 2>, + ConvImpl::template process_tile<0, 0, 3, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 3, 2, 3, 0>, + ConvImpl::template process_tile<0, 0, 3, 2, 3, 1>, + ConvImpl::template process_tile<0, 0, 3, 2, 3, 2>, + ConvImpl::template process_tile<0, 0, 3, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 3, 3, 0, 0>, + ConvImpl::template process_tile<0, 0, 3, 3, 0, 1>, + ConvImpl::template process_tile<0, 0, 3, 3, 0, 2>, + ConvImpl::template process_tile<0, 0, 3, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 3, 3, 1, 0>, + ConvImpl::template process_tile<0, 0, 3, 3, 1, 1>, + ConvImpl::template process_tile<0, 0, 3, 3, 1, 2>, + 
ConvImpl::template process_tile<0, 0, 3, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 3, 3, 2, 0>, + ConvImpl::template process_tile<0, 0, 3, 3, 2, 1>, + ConvImpl::template process_tile<0, 0, 3, 3, 2, 2>, + ConvImpl::template process_tile<0, 0, 3, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 3, 3, 3, 0>, + ConvImpl::template process_tile<0, 0, 3, 3, 3, 1>, + ConvImpl::template process_tile<0, 0, 3, 3, 3, 2>, + ConvImpl::template process_tile<0, 0, 3, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 3, 4, 0, 0>, + ConvImpl::template process_tile<0, 0, 3, 4, 0, 1>, + ConvImpl::template process_tile<0, 0, 3, 4, 0, 2>, + ConvImpl::template process_tile<0, 0, 3, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 3, 4, 1, 0>, + ConvImpl::template process_tile<0, 0, 3, 4, 1, 1>, + ConvImpl::template process_tile<0, 0, 3, 4, 1, 2>, + ConvImpl::template process_tile<0, 0, 3, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 3, 4, 2, 0>, + ConvImpl::template process_tile<0, 0, 3, 4, 2, 1>, + ConvImpl::template process_tile<0, 0, 3, 4, 2, 2>, + ConvImpl::template process_tile<0, 0, 3, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 3, 4, 3, 0>, + ConvImpl::template process_tile<0, 0, 3, 4, 3, 1>, + ConvImpl::template process_tile<0, 0, 3, 4, 3, 2>, + ConvImpl::template process_tile<0, 0, 3, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 3 + { // Input pad bottom = 4 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 4, 0, 0, 0>, + ConvImpl::template process_tile<0, 0, 4, 
0, 0, 1>, + ConvImpl::template process_tile<0, 0, 4, 0, 0, 2>, + ConvImpl::template process_tile<0, 0, 4, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 4, 0, 1, 0>, + ConvImpl::template process_tile<0, 0, 4, 0, 1, 1>, + ConvImpl::template process_tile<0, 0, 4, 0, 1, 2>, + ConvImpl::template process_tile<0, 0, 4, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 4, 0, 2, 0>, + ConvImpl::template process_tile<0, 0, 4, 0, 2, 1>, + ConvImpl::template process_tile<0, 0, 4, 0, 2, 2>, + ConvImpl::template process_tile<0, 0, 4, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 4, 0, 3, 0>, + ConvImpl::template process_tile<0, 0, 4, 0, 3, 1>, + ConvImpl::template process_tile<0, 0, 4, 0, 3, 2>, + ConvImpl::template process_tile<0, 0, 4, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 4, 1, 0, 0>, + ConvImpl::template process_tile<0, 0, 4, 1, 0, 1>, + ConvImpl::template process_tile<0, 0, 4, 1, 0, 2>, + ConvImpl::template process_tile<0, 0, 4, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 4, 1, 1, 0>, + ConvImpl::template process_tile<0, 0, 4, 1, 1, 1>, + ConvImpl::template process_tile<0, 0, 4, 1, 1, 2>, + ConvImpl::template process_tile<0, 0, 4, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 4, 1, 2, 0>, + ConvImpl::template process_tile<0, 0, 4, 1, 2, 1>, + ConvImpl::template process_tile<0, 0, 4, 1, 2, 2>, + ConvImpl::template process_tile<0, 0, 4, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 4, 1, 3, 0>, + ConvImpl::template process_tile<0, 0, 4, 1, 3, 1>, + ConvImpl::template process_tile<0, 
0, 4, 1, 3, 2>, + ConvImpl::template process_tile<0, 0, 4, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 4, 2, 0, 0>, + ConvImpl::template process_tile<0, 0, 4, 2, 0, 1>, + ConvImpl::template process_tile<0, 0, 4, 2, 0, 2>, + ConvImpl::template process_tile<0, 0, 4, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 4, 2, 1, 0>, + ConvImpl::template process_tile<0, 0, 4, 2, 1, 1>, + ConvImpl::template process_tile<0, 0, 4, 2, 1, 2>, + ConvImpl::template process_tile<0, 0, 4, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 4, 2, 2, 0>, + ConvImpl::template process_tile<0, 0, 4, 2, 2, 1>, + ConvImpl::template process_tile<0, 0, 4, 2, 2, 2>, + ConvImpl::template process_tile<0, 0, 4, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 4, 2, 3, 0>, + ConvImpl::template process_tile<0, 0, 4, 2, 3, 1>, + ConvImpl::template process_tile<0, 0, 4, 2, 3, 2>, + ConvImpl::template process_tile<0, 0, 4, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 4, 3, 0, 0>, + ConvImpl::template process_tile<0, 0, 4, 3, 0, 1>, + ConvImpl::template process_tile<0, 0, 4, 3, 0, 2>, + ConvImpl::template process_tile<0, 0, 4, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 4, 3, 1, 0>, + ConvImpl::template process_tile<0, 0, 4, 3, 1, 1>, + ConvImpl::template process_tile<0, 0, 4, 3, 1, 2>, + ConvImpl::template process_tile<0, 0, 4, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 4, 3, 2, 0>, + ConvImpl::template process_tile<0, 0, 4, 3, 2, 1>, + ConvImpl::template 
process_tile<0, 0, 4, 3, 2, 2>, + ConvImpl::template process_tile<0, 0, 4, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 4, 3, 3, 0>, + ConvImpl::template process_tile<0, 0, 4, 3, 3, 1>, + ConvImpl::template process_tile<0, 0, 4, 3, 3, 2>, + ConvImpl::template process_tile<0, 0, 4, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 0, 4, 4, 0, 0>, + ConvImpl::template process_tile<0, 0, 4, 4, 0, 1>, + ConvImpl::template process_tile<0, 0, 4, 4, 0, 2>, + ConvImpl::template process_tile<0, 0, 4, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 0, 4, 4, 1, 0>, + ConvImpl::template process_tile<0, 0, 4, 4, 1, 1>, + ConvImpl::template process_tile<0, 0, 4, 4, 1, 2>, + ConvImpl::template process_tile<0, 0, 4, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 0, 4, 4, 2, 0>, + ConvImpl::template process_tile<0, 0, 4, 4, 2, 1>, + ConvImpl::template process_tile<0, 0, 4, 4, 2, 2>, + ConvImpl::template process_tile<0, 0, 4, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 0, 4, 4, 3, 0>, + ConvImpl::template process_tile<0, 0, 4, 4, 3, 1>, + ConvImpl::template process_tile<0, 0, 4, 4, 3, 2>, + ConvImpl::template process_tile<0, 0, 4, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 4 + }, // Input pad left = 0 + { // Input pad left = 1 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 0, 0, 0, 0>, + ConvImpl::template process_tile<0, 1, 0, 0, 0, 1>, + ConvImpl::template process_tile<0, 1, 0, 0, 0, 2>, + ConvImpl::template process_tile<0, 1, 0, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + 
ConvImpl::template process_tile<0, 1, 0, 0, 1, 0>, + ConvImpl::template process_tile<0, 1, 0, 0, 1, 1>, + ConvImpl::template process_tile<0, 1, 0, 0, 1, 2>, + ConvImpl::template process_tile<0, 1, 0, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 0, 0, 2, 0>, + ConvImpl::template process_tile<0, 1, 0, 0, 2, 1>, + ConvImpl::template process_tile<0, 1, 0, 0, 2, 2>, + ConvImpl::template process_tile<0, 1, 0, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 0, 0, 3, 0>, + ConvImpl::template process_tile<0, 1, 0, 0, 3, 1>, + ConvImpl::template process_tile<0, 1, 0, 0, 3, 2>, + ConvImpl::template process_tile<0, 1, 0, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 0, 1, 0, 0>, + ConvImpl::template process_tile<0, 1, 0, 1, 0, 1>, + ConvImpl::template process_tile<0, 1, 0, 1, 0, 2>, + ConvImpl::template process_tile<0, 1, 0, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 0, 1, 1, 0>, + ConvImpl::template process_tile<0, 1, 0, 1, 1, 1>, + ConvImpl::template process_tile<0, 1, 0, 1, 1, 2>, + ConvImpl::template process_tile<0, 1, 0, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 0, 1, 2, 0>, + ConvImpl::template process_tile<0, 1, 0, 1, 2, 1>, + ConvImpl::template process_tile<0, 1, 0, 1, 2, 2>, + ConvImpl::template process_tile<0, 1, 0, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 0, 1, 3, 0>, + ConvImpl::template process_tile<0, 1, 0, 1, 3, 1>, + ConvImpl::template process_tile<0, 1, 0, 1, 3, 2>, + ConvImpl::template process_tile<0, 1, 0, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad 
bottom = 0 + ConvImpl::template process_tile<0, 1, 0, 2, 0, 0>, + ConvImpl::template process_tile<0, 1, 0, 2, 0, 1>, + ConvImpl::template process_tile<0, 1, 0, 2, 0, 2>, + ConvImpl::template process_tile<0, 1, 0, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 0, 2, 1, 0>, + ConvImpl::template process_tile<0, 1, 0, 2, 1, 1>, + ConvImpl::template process_tile<0, 1, 0, 2, 1, 2>, + ConvImpl::template process_tile<0, 1, 0, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 0, 2, 2, 0>, + ConvImpl::template process_tile<0, 1, 0, 2, 2, 1>, + ConvImpl::template process_tile<0, 1, 0, 2, 2, 2>, + ConvImpl::template process_tile<0, 1, 0, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 0, 2, 3, 0>, + ConvImpl::template process_tile<0, 1, 0, 2, 3, 1>, + ConvImpl::template process_tile<0, 1, 0, 2, 3, 2>, + ConvImpl::template process_tile<0, 1, 0, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 0, 3, 0, 0>, + ConvImpl::template process_tile<0, 1, 0, 3, 0, 1>, + ConvImpl::template process_tile<0, 1, 0, 3, 0, 2>, + ConvImpl::template process_tile<0, 1, 0, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 0, 3, 1, 0>, + ConvImpl::template process_tile<0, 1, 0, 3, 1, 1>, + ConvImpl::template process_tile<0, 1, 0, 3, 1, 2>, + ConvImpl::template process_tile<0, 1, 0, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 0, 3, 2, 0>, + ConvImpl::template process_tile<0, 1, 0, 3, 2, 1>, + ConvImpl::template process_tile<0, 1, 0, 3, 2, 2>, + ConvImpl::template process_tile<0, 1, 0, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 
1, 0, 3, 3, 0>, + ConvImpl::template process_tile<0, 1, 0, 3, 3, 1>, + ConvImpl::template process_tile<0, 1, 0, 3, 3, 2>, + ConvImpl::template process_tile<0, 1, 0, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 0, 4, 0, 0>, + ConvImpl::template process_tile<0, 1, 0, 4, 0, 1>, + ConvImpl::template process_tile<0, 1, 0, 4, 0, 2>, + ConvImpl::template process_tile<0, 1, 0, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 0, 4, 1, 0>, + ConvImpl::template process_tile<0, 1, 0, 4, 1, 1>, + ConvImpl::template process_tile<0, 1, 0, 4, 1, 2>, + ConvImpl::template process_tile<0, 1, 0, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 0, 4, 2, 0>, + ConvImpl::template process_tile<0, 1, 0, 4, 2, 1>, + ConvImpl::template process_tile<0, 1, 0, 4, 2, 2>, + ConvImpl::template process_tile<0, 1, 0, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 0, 4, 3, 0>, + ConvImpl::template process_tile<0, 1, 0, 4, 3, 1>, + ConvImpl::template process_tile<0, 1, 0, 4, 3, 2>, + ConvImpl::template process_tile<0, 1, 0, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 1, 0, 0, 0>, + ConvImpl::template process_tile<0, 1, 1, 0, 0, 1>, + ConvImpl::template process_tile<0, 1, 1, 0, 0, 2>, + ConvImpl::template process_tile<0, 1, 1, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 1, 0, 1, 0>, + ConvImpl::template process_tile<0, 1, 1, 0, 1, 1>, + ConvImpl::template process_tile<0, 1, 1, 0, 1, 2>, + ConvImpl::template process_tile<0, 1, 1, 0, 1, 3>, + }, // Output pad bottom = 1 + 
{ // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 1, 0, 2, 0>, + ConvImpl::template process_tile<0, 1, 1, 0, 2, 1>, + ConvImpl::template process_tile<0, 1, 1, 0, 2, 2>, + ConvImpl::template process_tile<0, 1, 1, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 1, 0, 3, 0>, + ConvImpl::template process_tile<0, 1, 1, 0, 3, 1>, + ConvImpl::template process_tile<0, 1, 1, 0, 3, 2>, + ConvImpl::template process_tile<0, 1, 1, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 1, 1, 0, 0>, + ConvImpl::template process_tile<0, 1, 1, 1, 0, 1>, + ConvImpl::template process_tile<0, 1, 1, 1, 0, 2>, + ConvImpl::template process_tile<0, 1, 1, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 1, 1, 1, 0>, + ConvImpl::template process_tile<0, 1, 1, 1, 1, 1>, + ConvImpl::template process_tile<0, 1, 1, 1, 1, 2>, + ConvImpl::template process_tile<0, 1, 1, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 1, 1, 2, 0>, + ConvImpl::template process_tile<0, 1, 1, 1, 2, 1>, + ConvImpl::template process_tile<0, 1, 1, 1, 2, 2>, + ConvImpl::template process_tile<0, 1, 1, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 1, 1, 3, 0>, + ConvImpl::template process_tile<0, 1, 1, 1, 3, 1>, + ConvImpl::template process_tile<0, 1, 1, 1, 3, 2>, + ConvImpl::template process_tile<0, 1, 1, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 1, 2, 0, 0>, + ConvImpl::template process_tile<0, 1, 1, 2, 0, 1>, + ConvImpl::template process_tile<0, 1, 1, 2, 0, 2>, + ConvImpl::template process_tile<0, 1, 1, 2, 0, 3>, + }, // Output pad 
bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 1, 2, 1, 0>, + ConvImpl::template process_tile<0, 1, 1, 2, 1, 1>, + ConvImpl::template process_tile<0, 1, 1, 2, 1, 2>, + ConvImpl::template process_tile<0, 1, 1, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 1, 2, 2, 0>, + ConvImpl::template process_tile<0, 1, 1, 2, 2, 1>, + ConvImpl::template process_tile<0, 1, 1, 2, 2, 2>, + ConvImpl::template process_tile<0, 1, 1, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 1, 2, 3, 0>, + ConvImpl::template process_tile<0, 1, 1, 2, 3, 1>, + ConvImpl::template process_tile<0, 1, 1, 2, 3, 2>, + ConvImpl::template process_tile<0, 1, 1, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 1, 3, 0, 0>, + ConvImpl::template process_tile<0, 1, 1, 3, 0, 1>, + ConvImpl::template process_tile<0, 1, 1, 3, 0, 2>, + ConvImpl::template process_tile<0, 1, 1, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 1, 3, 1, 0>, + ConvImpl::template process_tile<0, 1, 1, 3, 1, 1>, + ConvImpl::template process_tile<0, 1, 1, 3, 1, 2>, + ConvImpl::template process_tile<0, 1, 1, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 1, 3, 2, 0>, + ConvImpl::template process_tile<0, 1, 1, 3, 2, 1>, + ConvImpl::template process_tile<0, 1, 1, 3, 2, 2>, + ConvImpl::template process_tile<0, 1, 1, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 1, 3, 3, 0>, + ConvImpl::template process_tile<0, 1, 1, 3, 3, 1>, + ConvImpl::template process_tile<0, 1, 1, 3, 3, 2>, + ConvImpl::template process_tile<0, 1, 1, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // 
Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 1, 4, 0, 0>, + ConvImpl::template process_tile<0, 1, 1, 4, 0, 1>, + ConvImpl::template process_tile<0, 1, 1, 4, 0, 2>, + ConvImpl::template process_tile<0, 1, 1, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 1, 4, 1, 0>, + ConvImpl::template process_tile<0, 1, 1, 4, 1, 1>, + ConvImpl::template process_tile<0, 1, 1, 4, 1, 2>, + ConvImpl::template process_tile<0, 1, 1, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 1, 4, 2, 0>, + ConvImpl::template process_tile<0, 1, 1, 4, 2, 1>, + ConvImpl::template process_tile<0, 1, 1, 4, 2, 2>, + ConvImpl::template process_tile<0, 1, 1, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 1, 4, 3, 0>, + ConvImpl::template process_tile<0, 1, 1, 4, 3, 1>, + ConvImpl::template process_tile<0, 1, 1, 4, 3, 2>, + ConvImpl::template process_tile<0, 1, 1, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 2, 0, 0, 0>, + ConvImpl::template process_tile<0, 1, 2, 0, 0, 1>, + ConvImpl::template process_tile<0, 1, 2, 0, 0, 2>, + ConvImpl::template process_tile<0, 1, 2, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 2, 0, 1, 0>, + ConvImpl::template process_tile<0, 1, 2, 0, 1, 1>, + ConvImpl::template process_tile<0, 1, 2, 0, 1, 2>, + ConvImpl::template process_tile<0, 1, 2, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 2, 0, 2, 0>, + ConvImpl::template process_tile<0, 1, 2, 0, 2, 1>, + ConvImpl::template process_tile<0, 1, 2, 0, 2, 2>, + ConvImpl::template process_tile<0, 1, 2, 0, 2, 3>, 
+ }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 2, 0, 3, 0>, + ConvImpl::template process_tile<0, 1, 2, 0, 3, 1>, + ConvImpl::template process_tile<0, 1, 2, 0, 3, 2>, + ConvImpl::template process_tile<0, 1, 2, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 2, 1, 0, 0>, + ConvImpl::template process_tile<0, 1, 2, 1, 0, 1>, + ConvImpl::template process_tile<0, 1, 2, 1, 0, 2>, + ConvImpl::template process_tile<0, 1, 2, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 2, 1, 1, 0>, + ConvImpl::template process_tile<0, 1, 2, 1, 1, 1>, + ConvImpl::template process_tile<0, 1, 2, 1, 1, 2>, + ConvImpl::template process_tile<0, 1, 2, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 2, 1, 2, 0>, + ConvImpl::template process_tile<0, 1, 2, 1, 2, 1>, + ConvImpl::template process_tile<0, 1, 2, 1, 2, 2>, + ConvImpl::template process_tile<0, 1, 2, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 2, 1, 3, 0>, + ConvImpl::template process_tile<0, 1, 2, 1, 3, 1>, + ConvImpl::template process_tile<0, 1, 2, 1, 3, 2>, + ConvImpl::template process_tile<0, 1, 2, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 2, 2, 0, 0>, + ConvImpl::template process_tile<0, 1, 2, 2, 0, 1>, + ConvImpl::template process_tile<0, 1, 2, 2, 0, 2>, + ConvImpl::template process_tile<0, 1, 2, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 2, 2, 1, 0>, + ConvImpl::template process_tile<0, 1, 2, 2, 1, 1>, + ConvImpl::template process_tile<0, 1, 2, 2, 1, 2>, + ConvImpl::template process_tile<0, 1, 2, 2, 
1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 2, 2, 2, 0>, + ConvImpl::template process_tile<0, 1, 2, 2, 2, 1>, + ConvImpl::template process_tile<0, 1, 2, 2, 2, 2>, + ConvImpl::template process_tile<0, 1, 2, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 2, 2, 3, 0>, + ConvImpl::template process_tile<0, 1, 2, 2, 3, 1>, + ConvImpl::template process_tile<0, 1, 2, 2, 3, 2>, + ConvImpl::template process_tile<0, 1, 2, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 2, 3, 0, 0>, + ConvImpl::template process_tile<0, 1, 2, 3, 0, 1>, + ConvImpl::template process_tile<0, 1, 2, 3, 0, 2>, + ConvImpl::template process_tile<0, 1, 2, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 2, 3, 1, 0>, + ConvImpl::template process_tile<0, 1, 2, 3, 1, 1>, + ConvImpl::template process_tile<0, 1, 2, 3, 1, 2>, + ConvImpl::template process_tile<0, 1, 2, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 2, 3, 2, 0>, + ConvImpl::template process_tile<0, 1, 2, 3, 2, 1>, + ConvImpl::template process_tile<0, 1, 2, 3, 2, 2>, + ConvImpl::template process_tile<0, 1, 2, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 2, 3, 3, 0>, + ConvImpl::template process_tile<0, 1, 2, 3, 3, 1>, + ConvImpl::template process_tile<0, 1, 2, 3, 3, 2>, + ConvImpl::template process_tile<0, 1, 2, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 2, 4, 0, 0>, + ConvImpl::template process_tile<0, 1, 2, 4, 0, 1>, + ConvImpl::template process_tile<0, 1, 2, 4, 0, 2>, + ConvImpl::template process_tile<0, 
1, 2, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 2, 4, 1, 0>, + ConvImpl::template process_tile<0, 1, 2, 4, 1, 1>, + ConvImpl::template process_tile<0, 1, 2, 4, 1, 2>, + ConvImpl::template process_tile<0, 1, 2, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 2, 4, 2, 0>, + ConvImpl::template process_tile<0, 1, 2, 4, 2, 1>, + ConvImpl::template process_tile<0, 1, 2, 4, 2, 2>, + ConvImpl::template process_tile<0, 1, 2, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 2, 4, 3, 0>, + ConvImpl::template process_tile<0, 1, 2, 4, 3, 1>, + ConvImpl::template process_tile<0, 1, 2, 4, 3, 2>, + ConvImpl::template process_tile<0, 1, 2, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 3, 0, 0, 0>, + ConvImpl::template process_tile<0, 1, 3, 0, 0, 1>, + ConvImpl::template process_tile<0, 1, 3, 0, 0, 2>, + ConvImpl::template process_tile<0, 1, 3, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 3, 0, 1, 0>, + ConvImpl::template process_tile<0, 1, 3, 0, 1, 1>, + ConvImpl::template process_tile<0, 1, 3, 0, 1, 2>, + ConvImpl::template process_tile<0, 1, 3, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 3, 0, 2, 0>, + ConvImpl::template process_tile<0, 1, 3, 0, 2, 1>, + ConvImpl::template process_tile<0, 1, 3, 0, 2, 2>, + ConvImpl::template process_tile<0, 1, 3, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 3, 0, 3, 0>, + ConvImpl::template process_tile<0, 1, 3, 0, 3, 1>, + ConvImpl::template process_tile<0, 1, 3, 0, 3, 2>, + ConvImpl::template 
process_tile<0, 1, 3, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 3, 1, 0, 0>, + ConvImpl::template process_tile<0, 1, 3, 1, 0, 1>, + ConvImpl::template process_tile<0, 1, 3, 1, 0, 2>, + ConvImpl::template process_tile<0, 1, 3, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 3, 1, 1, 0>, + ConvImpl::template process_tile<0, 1, 3, 1, 1, 1>, + ConvImpl::template process_tile<0, 1, 3, 1, 1, 2>, + ConvImpl::template process_tile<0, 1, 3, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 3, 1, 2, 0>, + ConvImpl::template process_tile<0, 1, 3, 1, 2, 1>, + ConvImpl::template process_tile<0, 1, 3, 1, 2, 2>, + ConvImpl::template process_tile<0, 1, 3, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 3, 1, 3, 0>, + ConvImpl::template process_tile<0, 1, 3, 1, 3, 1>, + ConvImpl::template process_tile<0, 1, 3, 1, 3, 2>, + ConvImpl::template process_tile<0, 1, 3, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 3, 2, 0, 0>, + ConvImpl::template process_tile<0, 1, 3, 2, 0, 1>, + ConvImpl::template process_tile<0, 1, 3, 2, 0, 2>, + ConvImpl::template process_tile<0, 1, 3, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 3, 2, 1, 0>, + ConvImpl::template process_tile<0, 1, 3, 2, 1, 1>, + ConvImpl::template process_tile<0, 1, 3, 2, 1, 2>, + ConvImpl::template process_tile<0, 1, 3, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 3, 2, 2, 0>, + ConvImpl::template process_tile<0, 1, 3, 2, 2, 1>, + ConvImpl::template process_tile<0, 1, 3, 2, 2, 2>, + 
ConvImpl::template process_tile<0, 1, 3, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 3, 2, 3, 0>, + ConvImpl::template process_tile<0, 1, 3, 2, 3, 1>, + ConvImpl::template process_tile<0, 1, 3, 2, 3, 2>, + ConvImpl::template process_tile<0, 1, 3, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 3, 3, 0, 0>, + ConvImpl::template process_tile<0, 1, 3, 3, 0, 1>, + ConvImpl::template process_tile<0, 1, 3, 3, 0, 2>, + ConvImpl::template process_tile<0, 1, 3, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 3, 3, 1, 0>, + ConvImpl::template process_tile<0, 1, 3, 3, 1, 1>, + ConvImpl::template process_tile<0, 1, 3, 3, 1, 2>, + ConvImpl::template process_tile<0, 1, 3, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 3, 3, 2, 0>, + ConvImpl::template process_tile<0, 1, 3, 3, 2, 1>, + ConvImpl::template process_tile<0, 1, 3, 3, 2, 2>, + ConvImpl::template process_tile<0, 1, 3, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 3, 3, 3, 0>, + ConvImpl::template process_tile<0, 1, 3, 3, 3, 1>, + ConvImpl::template process_tile<0, 1, 3, 3, 3, 2>, + ConvImpl::template process_tile<0, 1, 3, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 3, 4, 0, 0>, + ConvImpl::template process_tile<0, 1, 3, 4, 0, 1>, + ConvImpl::template process_tile<0, 1, 3, 4, 0, 2>, + ConvImpl::template process_tile<0, 1, 3, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 3, 4, 1, 0>, + ConvImpl::template process_tile<0, 1, 3, 4, 1, 1>, + ConvImpl::template process_tile<0, 1, 3, 4, 
1, 2>, + ConvImpl::template process_tile<0, 1, 3, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 3, 4, 2, 0>, + ConvImpl::template process_tile<0, 1, 3, 4, 2, 1>, + ConvImpl::template process_tile<0, 1, 3, 4, 2, 2>, + ConvImpl::template process_tile<0, 1, 3, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 3, 4, 3, 0>, + ConvImpl::template process_tile<0, 1, 3, 4, 3, 1>, + ConvImpl::template process_tile<0, 1, 3, 4, 3, 2>, + ConvImpl::template process_tile<0, 1, 3, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 3 + { // Input pad bottom = 4 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 4, 0, 0, 0>, + ConvImpl::template process_tile<0, 1, 4, 0, 0, 1>, + ConvImpl::template process_tile<0, 1, 4, 0, 0, 2>, + ConvImpl::template process_tile<0, 1, 4, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 4, 0, 1, 0>, + ConvImpl::template process_tile<0, 1, 4, 0, 1, 1>, + ConvImpl::template process_tile<0, 1, 4, 0, 1, 2>, + ConvImpl::template process_tile<0, 1, 4, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 4, 0, 2, 0>, + ConvImpl::template process_tile<0, 1, 4, 0, 2, 1>, + ConvImpl::template process_tile<0, 1, 4, 0, 2, 2>, + ConvImpl::template process_tile<0, 1, 4, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 4, 0, 3, 0>, + ConvImpl::template process_tile<0, 1, 4, 0, 3, 1>, + ConvImpl::template process_tile<0, 1, 4, 0, 3, 2>, + ConvImpl::template process_tile<0, 1, 4, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 4, 1, 0, 0>, + ConvImpl::template 
process_tile<0, 1, 4, 1, 0, 1>, + ConvImpl::template process_tile<0, 1, 4, 1, 0, 2>, + ConvImpl::template process_tile<0, 1, 4, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 4, 1, 1, 0>, + ConvImpl::template process_tile<0, 1, 4, 1, 1, 1>, + ConvImpl::template process_tile<0, 1, 4, 1, 1, 2>, + ConvImpl::template process_tile<0, 1, 4, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 4, 1, 2, 0>, + ConvImpl::template process_tile<0, 1, 4, 1, 2, 1>, + ConvImpl::template process_tile<0, 1, 4, 1, 2, 2>, + ConvImpl::template process_tile<0, 1, 4, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 4, 1, 3, 0>, + ConvImpl::template process_tile<0, 1, 4, 1, 3, 1>, + ConvImpl::template process_tile<0, 1, 4, 1, 3, 2>, + ConvImpl::template process_tile<0, 1, 4, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 4, 2, 0, 0>, + ConvImpl::template process_tile<0, 1, 4, 2, 0, 1>, + ConvImpl::template process_tile<0, 1, 4, 2, 0, 2>, + ConvImpl::template process_tile<0, 1, 4, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 4, 2, 1, 0>, + ConvImpl::template process_tile<0, 1, 4, 2, 1, 1>, + ConvImpl::template process_tile<0, 1, 4, 2, 1, 2>, + ConvImpl::template process_tile<0, 1, 4, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 4, 2, 2, 0>, + ConvImpl::template process_tile<0, 1, 4, 2, 2, 1>, + ConvImpl::template process_tile<0, 1, 4, 2, 2, 2>, + ConvImpl::template process_tile<0, 1, 4, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 4, 2, 3, 0>, + ConvImpl::template process_tile<0, 1, 4, 2, 3, 1>, + 
ConvImpl::template process_tile<0, 1, 4, 2, 3, 2>, + ConvImpl::template process_tile<0, 1, 4, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 4, 3, 0, 0>, + ConvImpl::template process_tile<0, 1, 4, 3, 0, 1>, + ConvImpl::template process_tile<0, 1, 4, 3, 0, 2>, + ConvImpl::template process_tile<0, 1, 4, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 4, 3, 1, 0>, + ConvImpl::template process_tile<0, 1, 4, 3, 1, 1>, + ConvImpl::template process_tile<0, 1, 4, 3, 1, 2>, + ConvImpl::template process_tile<0, 1, 4, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 4, 3, 2, 0>, + ConvImpl::template process_tile<0, 1, 4, 3, 2, 1>, + ConvImpl::template process_tile<0, 1, 4, 3, 2, 2>, + ConvImpl::template process_tile<0, 1, 4, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 4, 3, 3, 0>, + ConvImpl::template process_tile<0, 1, 4, 3, 3, 1>, + ConvImpl::template process_tile<0, 1, 4, 3, 3, 2>, + ConvImpl::template process_tile<0, 1, 4, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<0, 1, 4, 4, 0, 0>, + ConvImpl::template process_tile<0, 1, 4, 4, 0, 1>, + ConvImpl::template process_tile<0, 1, 4, 4, 0, 2>, + ConvImpl::template process_tile<0, 1, 4, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<0, 1, 4, 4, 1, 0>, + ConvImpl::template process_tile<0, 1, 4, 4, 1, 1>, + ConvImpl::template process_tile<0, 1, 4, 4, 1, 2>, + ConvImpl::template process_tile<0, 1, 4, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<0, 1, 4, 4, 2, 0>, + ConvImpl::template process_tile<0, 1, 4, 4, 
2, 1>, + ConvImpl::template process_tile<0, 1, 4, 4, 2, 2>, + ConvImpl::template process_tile<0, 1, 4, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<0, 1, 4, 4, 3, 0>, + ConvImpl::template process_tile<0, 1, 4, 4, 3, 1>, + ConvImpl::template process_tile<0, 1, 4, 4, 3, 2>, + ConvImpl::template process_tile<0, 1, 4, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 4 + }, // Input pad left = 1 + }, // Input pad top = 0 + { // Input pad top = 1 + { // Input pad left = 0 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 0, 0, 0, 0>, + ConvImpl::template process_tile<1, 0, 0, 0, 0, 1>, + ConvImpl::template process_tile<1, 0, 0, 0, 0, 2>, + ConvImpl::template process_tile<1, 0, 0, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 0, 0, 1, 0>, + ConvImpl::template process_tile<1, 0, 0, 0, 1, 1>, + ConvImpl::template process_tile<1, 0, 0, 0, 1, 2>, + ConvImpl::template process_tile<1, 0, 0, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 0, 0, 2, 0>, + ConvImpl::template process_tile<1, 0, 0, 0, 2, 1>, + ConvImpl::template process_tile<1, 0, 0, 0, 2, 2>, + ConvImpl::template process_tile<1, 0, 0, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 0, 0, 3, 0>, + ConvImpl::template process_tile<1, 0, 0, 0, 3, 1>, + ConvImpl::template process_tile<1, 0, 0, 0, 3, 2>, + ConvImpl::template process_tile<1, 0, 0, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 0, 1, 0, 0>, + ConvImpl::template process_tile<1, 0, 0, 1, 0, 1>, + ConvImpl::template process_tile<1, 0, 0, 1, 0, 2>, + ConvImpl::template process_tile<1, 0, 0, 
1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 0, 1, 1, 0>, + ConvImpl::template process_tile<1, 0, 0, 1, 1, 1>, + ConvImpl::template process_tile<1, 0, 0, 1, 1, 2>, + ConvImpl::template process_tile<1, 0, 0, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 0, 1, 2, 0>, + ConvImpl::template process_tile<1, 0, 0, 1, 2, 1>, + ConvImpl::template process_tile<1, 0, 0, 1, 2, 2>, + ConvImpl::template process_tile<1, 0, 0, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 0, 1, 3, 0>, + ConvImpl::template process_tile<1, 0, 0, 1, 3, 1>, + ConvImpl::template process_tile<1, 0, 0, 1, 3, 2>, + ConvImpl::template process_tile<1, 0, 0, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 0, 2, 0, 0>, + ConvImpl::template process_tile<1, 0, 0, 2, 0, 1>, + ConvImpl::template process_tile<1, 0, 0, 2, 0, 2>, + ConvImpl::template process_tile<1, 0, 0, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 0, 2, 1, 0>, + ConvImpl::template process_tile<1, 0, 0, 2, 1, 1>, + ConvImpl::template process_tile<1, 0, 0, 2, 1, 2>, + ConvImpl::template process_tile<1, 0, 0, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 0, 2, 2, 0>, + ConvImpl::template process_tile<1, 0, 0, 2, 2, 1>, + ConvImpl::template process_tile<1, 0, 0, 2, 2, 2>, + ConvImpl::template process_tile<1, 0, 0, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 0, 2, 3, 0>, + ConvImpl::template process_tile<1, 0, 0, 2, 3, 1>, + ConvImpl::template process_tile<1, 0, 0, 2, 3, 2>, + ConvImpl::template process_tile<1, 0, 0, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // 
Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 0, 3, 0, 0>, + ConvImpl::template process_tile<1, 0, 0, 3, 0, 1>, + ConvImpl::template process_tile<1, 0, 0, 3, 0, 2>, + ConvImpl::template process_tile<1, 0, 0, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 0, 3, 1, 0>, + ConvImpl::template process_tile<1, 0, 0, 3, 1, 1>, + ConvImpl::template process_tile<1, 0, 0, 3, 1, 2>, + ConvImpl::template process_tile<1, 0, 0, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 0, 3, 2, 0>, + ConvImpl::template process_tile<1, 0, 0, 3, 2, 1>, + ConvImpl::template process_tile<1, 0, 0, 3, 2, 2>, + ConvImpl::template process_tile<1, 0, 0, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 0, 3, 3, 0>, + ConvImpl::template process_tile<1, 0, 0, 3, 3, 1>, + ConvImpl::template process_tile<1, 0, 0, 3, 3, 2>, + ConvImpl::template process_tile<1, 0, 0, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 0, 4, 0, 0>, + ConvImpl::template process_tile<1, 0, 0, 4, 0, 1>, + ConvImpl::template process_tile<1, 0, 0, 4, 0, 2>, + ConvImpl::template process_tile<1, 0, 0, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 0, 4, 1, 0>, + ConvImpl::template process_tile<1, 0, 0, 4, 1, 1>, + ConvImpl::template process_tile<1, 0, 0, 4, 1, 2>, + ConvImpl::template process_tile<1, 0, 0, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 0, 4, 2, 0>, + ConvImpl::template process_tile<1, 0, 0, 4, 2, 1>, + ConvImpl::template process_tile<1, 0, 0, 4, 2, 2>, + ConvImpl::template process_tile<1, 0, 0, 4, 2, 3>, + }, // Output pad bottom = 2 
+ { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 0, 4, 3, 0>, + ConvImpl::template process_tile<1, 0, 0, 4, 3, 1>, + ConvImpl::template process_tile<1, 0, 0, 4, 3, 2>, + ConvImpl::template process_tile<1, 0, 0, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 1, 0, 0, 0>, + ConvImpl::template process_tile<1, 0, 1, 0, 0, 1>, + ConvImpl::template process_tile<1, 0, 1, 0, 0, 2>, + ConvImpl::template process_tile<1, 0, 1, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 1, 0, 1, 0>, + ConvImpl::template process_tile<1, 0, 1, 0, 1, 1>, + ConvImpl::template process_tile<1, 0, 1, 0, 1, 2>, + ConvImpl::template process_tile<1, 0, 1, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 1, 0, 2, 0>, + ConvImpl::template process_tile<1, 0, 1, 0, 2, 1>, + ConvImpl::template process_tile<1, 0, 1, 0, 2, 2>, + ConvImpl::template process_tile<1, 0, 1, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 1, 0, 3, 0>, + ConvImpl::template process_tile<1, 0, 1, 0, 3, 1>, + ConvImpl::template process_tile<1, 0, 1, 0, 3, 2>, + ConvImpl::template process_tile<1, 0, 1, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 1, 1, 0, 0>, + ConvImpl::template process_tile<1, 0, 1, 1, 0, 1>, + ConvImpl::template process_tile<1, 0, 1, 1, 0, 2>, + ConvImpl::template process_tile<1, 0, 1, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 1, 1, 1, 0>, + ConvImpl::template process_tile<1, 0, 1, 1, 1, 1>, + ConvImpl::template process_tile<1, 0, 1, 1, 1, 2>, + 
ConvImpl::template process_tile<1, 0, 1, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 1, 1, 2, 0>, + ConvImpl::template process_tile<1, 0, 1, 1, 2, 1>, + ConvImpl::template process_tile<1, 0, 1, 1, 2, 2>, + ConvImpl::template process_tile<1, 0, 1, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 1, 1, 3, 0>, + ConvImpl::template process_tile<1, 0, 1, 1, 3, 1>, + ConvImpl::template process_tile<1, 0, 1, 1, 3, 2>, + ConvImpl::template process_tile<1, 0, 1, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 1, 2, 0, 0>, + ConvImpl::template process_tile<1, 0, 1, 2, 0, 1>, + ConvImpl::template process_tile<1, 0, 1, 2, 0, 2>, + ConvImpl::template process_tile<1, 0, 1, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 1, 2, 1, 0>, + ConvImpl::template process_tile<1, 0, 1, 2, 1, 1>, + ConvImpl::template process_tile<1, 0, 1, 2, 1, 2>, + ConvImpl::template process_tile<1, 0, 1, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 1, 2, 2, 0>, + ConvImpl::template process_tile<1, 0, 1, 2, 2, 1>, + ConvImpl::template process_tile<1, 0, 1, 2, 2, 2>, + ConvImpl::template process_tile<1, 0, 1, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 1, 2, 3, 0>, + ConvImpl::template process_tile<1, 0, 1, 2, 3, 1>, + ConvImpl::template process_tile<1, 0, 1, 2, 3, 2>, + ConvImpl::template process_tile<1, 0, 1, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 1, 3, 0, 0>, + ConvImpl::template process_tile<1, 0, 1, 3, 0, 1>, + ConvImpl::template process_tile<1, 0, 1, 3, 
0, 2>, + ConvImpl::template process_tile<1, 0, 1, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 1, 3, 1, 0>, + ConvImpl::template process_tile<1, 0, 1, 3, 1, 1>, + ConvImpl::template process_tile<1, 0, 1, 3, 1, 2>, + ConvImpl::template process_tile<1, 0, 1, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 1, 3, 2, 0>, + ConvImpl::template process_tile<1, 0, 1, 3, 2, 1>, + ConvImpl::template process_tile<1, 0, 1, 3, 2, 2>, + ConvImpl::template process_tile<1, 0, 1, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 1, 3, 3, 0>, + ConvImpl::template process_tile<1, 0, 1, 3, 3, 1>, + ConvImpl::template process_tile<1, 0, 1, 3, 3, 2>, + ConvImpl::template process_tile<1, 0, 1, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 1, 4, 0, 0>, + ConvImpl::template process_tile<1, 0, 1, 4, 0, 1>, + ConvImpl::template process_tile<1, 0, 1, 4, 0, 2>, + ConvImpl::template process_tile<1, 0, 1, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 1, 4, 1, 0>, + ConvImpl::template process_tile<1, 0, 1, 4, 1, 1>, + ConvImpl::template process_tile<1, 0, 1, 4, 1, 2>, + ConvImpl::template process_tile<1, 0, 1, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 1, 4, 2, 0>, + ConvImpl::template process_tile<1, 0, 1, 4, 2, 1>, + ConvImpl::template process_tile<1, 0, 1, 4, 2, 2>, + ConvImpl::template process_tile<1, 0, 1, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 1, 4, 3, 0>, + ConvImpl::template process_tile<1, 0, 1, 4, 3, 1>, + ConvImpl::template process_tile<1, 0, 1, 4, 3, 2>, + ConvImpl::template process_tile<1, 0, 
1, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 2, 0, 0, 0>, + ConvImpl::template process_tile<1, 0, 2, 0, 0, 1>, + ConvImpl::template process_tile<1, 0, 2, 0, 0, 2>, + ConvImpl::template process_tile<1, 0, 2, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 2, 0, 1, 0>, + ConvImpl::template process_tile<1, 0, 2, 0, 1, 1>, + ConvImpl::template process_tile<1, 0, 2, 0, 1, 2>, + ConvImpl::template process_tile<1, 0, 2, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 2, 0, 2, 0>, + ConvImpl::template process_tile<1, 0, 2, 0, 2, 1>, + ConvImpl::template process_tile<1, 0, 2, 0, 2, 2>, + ConvImpl::template process_tile<1, 0, 2, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 2, 0, 3, 0>, + ConvImpl::template process_tile<1, 0, 2, 0, 3, 1>, + ConvImpl::template process_tile<1, 0, 2, 0, 3, 2>, + ConvImpl::template process_tile<1, 0, 2, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 2, 1, 0, 0>, + ConvImpl::template process_tile<1, 0, 2, 1, 0, 1>, + ConvImpl::template process_tile<1, 0, 2, 1, 0, 2>, + ConvImpl::template process_tile<1, 0, 2, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 2, 1, 1, 0>, + ConvImpl::template process_tile<1, 0, 2, 1, 1, 1>, + ConvImpl::template process_tile<1, 0, 2, 1, 1, 2>, + ConvImpl::template process_tile<1, 0, 2, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 2, 1, 2, 0>, + ConvImpl::template process_tile<1, 0, 2, 1, 2, 1>, + ConvImpl::template 
process_tile<1, 0, 2, 1, 2, 2>, + ConvImpl::template process_tile<1, 0, 2, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 2, 1, 3, 0>, + ConvImpl::template process_tile<1, 0, 2, 1, 3, 1>, + ConvImpl::template process_tile<1, 0, 2, 1, 3, 2>, + ConvImpl::template process_tile<1, 0, 2, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 2, 2, 0, 0>, + ConvImpl::template process_tile<1, 0, 2, 2, 0, 1>, + ConvImpl::template process_tile<1, 0, 2, 2, 0, 2>, + ConvImpl::template process_tile<1, 0, 2, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 2, 2, 1, 0>, + ConvImpl::template process_tile<1, 0, 2, 2, 1, 1>, + ConvImpl::template process_tile<1, 0, 2, 2, 1, 2>, + ConvImpl::template process_tile<1, 0, 2, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 2, 2, 2, 0>, + ConvImpl::template process_tile<1, 0, 2, 2, 2, 1>, + ConvImpl::template process_tile<1, 0, 2, 2, 2, 2>, + ConvImpl::template process_tile<1, 0, 2, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 2, 2, 3, 0>, + ConvImpl::template process_tile<1, 0, 2, 2, 3, 1>, + ConvImpl::template process_tile<1, 0, 2, 2, 3, 2>, + ConvImpl::template process_tile<1, 0, 2, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 2, 3, 0, 0>, + ConvImpl::template process_tile<1, 0, 2, 3, 0, 1>, + ConvImpl::template process_tile<1, 0, 2, 3, 0, 2>, + ConvImpl::template process_tile<1, 0, 2, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 2, 3, 1, 0>, + ConvImpl::template process_tile<1, 0, 2, 3, 1, 1>, + 
ConvImpl::template process_tile<1, 0, 2, 3, 1, 2>, + ConvImpl::template process_tile<1, 0, 2, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 2, 3, 2, 0>, + ConvImpl::template process_tile<1, 0, 2, 3, 2, 1>, + ConvImpl::template process_tile<1, 0, 2, 3, 2, 2>, + ConvImpl::template process_tile<1, 0, 2, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 2, 3, 3, 0>, + ConvImpl::template process_tile<1, 0, 2, 3, 3, 1>, + ConvImpl::template process_tile<1, 0, 2, 3, 3, 2>, + ConvImpl::template process_tile<1, 0, 2, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 2, 4, 0, 0>, + ConvImpl::template process_tile<1, 0, 2, 4, 0, 1>, + ConvImpl::template process_tile<1, 0, 2, 4, 0, 2>, + ConvImpl::template process_tile<1, 0, 2, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 2, 4, 1, 0>, + ConvImpl::template process_tile<1, 0, 2, 4, 1, 1>, + ConvImpl::template process_tile<1, 0, 2, 4, 1, 2>, + ConvImpl::template process_tile<1, 0, 2, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 2, 4, 2, 0>, + ConvImpl::template process_tile<1, 0, 2, 4, 2, 1>, + ConvImpl::template process_tile<1, 0, 2, 4, 2, 2>, + ConvImpl::template process_tile<1, 0, 2, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 2, 4, 3, 0>, + ConvImpl::template process_tile<1, 0, 2, 4, 3, 1>, + ConvImpl::template process_tile<1, 0, 2, 4, 3, 2>, + ConvImpl::template process_tile<1, 0, 2, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 3, 
0, 0, 0>, + ConvImpl::template process_tile<1, 0, 3, 0, 0, 1>, + ConvImpl::template process_tile<1, 0, 3, 0, 0, 2>, + ConvImpl::template process_tile<1, 0, 3, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 3, 0, 1, 0>, + ConvImpl::template process_tile<1, 0, 3, 0, 1, 1>, + ConvImpl::template process_tile<1, 0, 3, 0, 1, 2>, + ConvImpl::template process_tile<1, 0, 3, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 3, 0, 2, 0>, + ConvImpl::template process_tile<1, 0, 3, 0, 2, 1>, + ConvImpl::template process_tile<1, 0, 3, 0, 2, 2>, + ConvImpl::template process_tile<1, 0, 3, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 3, 0, 3, 0>, + ConvImpl::template process_tile<1, 0, 3, 0, 3, 1>, + ConvImpl::template process_tile<1, 0, 3, 0, 3, 2>, + ConvImpl::template process_tile<1, 0, 3, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 3, 1, 0, 0>, + ConvImpl::template process_tile<1, 0, 3, 1, 0, 1>, + ConvImpl::template process_tile<1, 0, 3, 1, 0, 2>, + ConvImpl::template process_tile<1, 0, 3, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 3, 1, 1, 0>, + ConvImpl::template process_tile<1, 0, 3, 1, 1, 1>, + ConvImpl::template process_tile<1, 0, 3, 1, 1, 2>, + ConvImpl::template process_tile<1, 0, 3, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 3, 1, 2, 0>, + ConvImpl::template process_tile<1, 0, 3, 1, 2, 1>, + ConvImpl::template process_tile<1, 0, 3, 1, 2, 2>, + ConvImpl::template process_tile<1, 0, 3, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 3, 1, 3, 0>, + ConvImpl::template process_tile<1, 
0, 3, 1, 3, 1>, + ConvImpl::template process_tile<1, 0, 3, 1, 3, 2>, + ConvImpl::template process_tile<1, 0, 3, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 3, 2, 0, 0>, + ConvImpl::template process_tile<1, 0, 3, 2, 0, 1>, + ConvImpl::template process_tile<1, 0, 3, 2, 0, 2>, + ConvImpl::template process_tile<1, 0, 3, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 3, 2, 1, 0>, + ConvImpl::template process_tile<1, 0, 3, 2, 1, 1>, + ConvImpl::template process_tile<1, 0, 3, 2, 1, 2>, + ConvImpl::template process_tile<1, 0, 3, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 3, 2, 2, 0>, + ConvImpl::template process_tile<1, 0, 3, 2, 2, 1>, + ConvImpl::template process_tile<1, 0, 3, 2, 2, 2>, + ConvImpl::template process_tile<1, 0, 3, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 3, 2, 3, 0>, + ConvImpl::template process_tile<1, 0, 3, 2, 3, 1>, + ConvImpl::template process_tile<1, 0, 3, 2, 3, 2>, + ConvImpl::template process_tile<1, 0, 3, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 3, 3, 0, 0>, + ConvImpl::template process_tile<1, 0, 3, 3, 0, 1>, + ConvImpl::template process_tile<1, 0, 3, 3, 0, 2>, + ConvImpl::template process_tile<1, 0, 3, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 3, 3, 1, 0>, + ConvImpl::template process_tile<1, 0, 3, 3, 1, 1>, + ConvImpl::template process_tile<1, 0, 3, 3, 1, 2>, + ConvImpl::template process_tile<1, 0, 3, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 3, 3, 2, 0>, + ConvImpl::template 
process_tile<1, 0, 3, 3, 2, 1>, + ConvImpl::template process_tile<1, 0, 3, 3, 2, 2>, + ConvImpl::template process_tile<1, 0, 3, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 3, 3, 3, 0>, + ConvImpl::template process_tile<1, 0, 3, 3, 3, 1>, + ConvImpl::template process_tile<1, 0, 3, 3, 3, 2>, + ConvImpl::template process_tile<1, 0, 3, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 3, 4, 0, 0>, + ConvImpl::template process_tile<1, 0, 3, 4, 0, 1>, + ConvImpl::template process_tile<1, 0, 3, 4, 0, 2>, + ConvImpl::template process_tile<1, 0, 3, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 3, 4, 1, 0>, + ConvImpl::template process_tile<1, 0, 3, 4, 1, 1>, + ConvImpl::template process_tile<1, 0, 3, 4, 1, 2>, + ConvImpl::template process_tile<1, 0, 3, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 3, 4, 2, 0>, + ConvImpl::template process_tile<1, 0, 3, 4, 2, 1>, + ConvImpl::template process_tile<1, 0, 3, 4, 2, 2>, + ConvImpl::template process_tile<1, 0, 3, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 3, 4, 3, 0>, + ConvImpl::template process_tile<1, 0, 3, 4, 3, 1>, + ConvImpl::template process_tile<1, 0, 3, 4, 3, 2>, + ConvImpl::template process_tile<1, 0, 3, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 3 + { // Input pad bottom = 4 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 4, 0, 0, 0>, + ConvImpl::template process_tile<1, 0, 4, 0, 0, 1>, + ConvImpl::template process_tile<1, 0, 4, 0, 0, 2>, + ConvImpl::template process_tile<1, 0, 4, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + 
ConvImpl::template process_tile<1, 0, 4, 0, 1, 0>, + ConvImpl::template process_tile<1, 0, 4, 0, 1, 1>, + ConvImpl::template process_tile<1, 0, 4, 0, 1, 2>, + ConvImpl::template process_tile<1, 0, 4, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 4, 0, 2, 0>, + ConvImpl::template process_tile<1, 0, 4, 0, 2, 1>, + ConvImpl::template process_tile<1, 0, 4, 0, 2, 2>, + ConvImpl::template process_tile<1, 0, 4, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 4, 0, 3, 0>, + ConvImpl::template process_tile<1, 0, 4, 0, 3, 1>, + ConvImpl::template process_tile<1, 0, 4, 0, 3, 2>, + ConvImpl::template process_tile<1, 0, 4, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 4, 1, 0, 0>, + ConvImpl::template process_tile<1, 0, 4, 1, 0, 1>, + ConvImpl::template process_tile<1, 0, 4, 1, 0, 2>, + ConvImpl::template process_tile<1, 0, 4, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 4, 1, 1, 0>, + ConvImpl::template process_tile<1, 0, 4, 1, 1, 1>, + ConvImpl::template process_tile<1, 0, 4, 1, 1, 2>, + ConvImpl::template process_tile<1, 0, 4, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 4, 1, 2, 0>, + ConvImpl::template process_tile<1, 0, 4, 1, 2, 1>, + ConvImpl::template process_tile<1, 0, 4, 1, 2, 2>, + ConvImpl::template process_tile<1, 0, 4, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 4, 1, 3, 0>, + ConvImpl::template process_tile<1, 0, 4, 1, 3, 1>, + ConvImpl::template process_tile<1, 0, 4, 1, 3, 2>, + ConvImpl::template process_tile<1, 0, 4, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad 
bottom = 0 + ConvImpl::template process_tile<1, 0, 4, 2, 0, 0>, + ConvImpl::template process_tile<1, 0, 4, 2, 0, 1>, + ConvImpl::template process_tile<1, 0, 4, 2, 0, 2>, + ConvImpl::template process_tile<1, 0, 4, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 4, 2, 1, 0>, + ConvImpl::template process_tile<1, 0, 4, 2, 1, 1>, + ConvImpl::template process_tile<1, 0, 4, 2, 1, 2>, + ConvImpl::template process_tile<1, 0, 4, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 4, 2, 2, 0>, + ConvImpl::template process_tile<1, 0, 4, 2, 2, 1>, + ConvImpl::template process_tile<1, 0, 4, 2, 2, 2>, + ConvImpl::template process_tile<1, 0, 4, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 4, 2, 3, 0>, + ConvImpl::template process_tile<1, 0, 4, 2, 3, 1>, + ConvImpl::template process_tile<1, 0, 4, 2, 3, 2>, + ConvImpl::template process_tile<1, 0, 4, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 4, 3, 0, 0>, + ConvImpl::template process_tile<1, 0, 4, 3, 0, 1>, + ConvImpl::template process_tile<1, 0, 4, 3, 0, 2>, + ConvImpl::template process_tile<1, 0, 4, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 4, 3, 1, 0>, + ConvImpl::template process_tile<1, 0, 4, 3, 1, 1>, + ConvImpl::template process_tile<1, 0, 4, 3, 1, 2>, + ConvImpl::template process_tile<1, 0, 4, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 4, 3, 2, 0>, + ConvImpl::template process_tile<1, 0, 4, 3, 2, 1>, + ConvImpl::template process_tile<1, 0, 4, 3, 2, 2>, + ConvImpl::template process_tile<1, 0, 4, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 
0, 4, 3, 3, 0>, + ConvImpl::template process_tile<1, 0, 4, 3, 3, 1>, + ConvImpl::template process_tile<1, 0, 4, 3, 3, 2>, + ConvImpl::template process_tile<1, 0, 4, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 0, 4, 4, 0, 0>, + ConvImpl::template process_tile<1, 0, 4, 4, 0, 1>, + ConvImpl::template process_tile<1, 0, 4, 4, 0, 2>, + ConvImpl::template process_tile<1, 0, 4, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 0, 4, 4, 1, 0>, + ConvImpl::template process_tile<1, 0, 4, 4, 1, 1>, + ConvImpl::template process_tile<1, 0, 4, 4, 1, 2>, + ConvImpl::template process_tile<1, 0, 4, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 0, 4, 4, 2, 0>, + ConvImpl::template process_tile<1, 0, 4, 4, 2, 1>, + ConvImpl::template process_tile<1, 0, 4, 4, 2, 2>, + ConvImpl::template process_tile<1, 0, 4, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 0, 4, 4, 3, 0>, + ConvImpl::template process_tile<1, 0, 4, 4, 3, 1>, + ConvImpl::template process_tile<1, 0, 4, 4, 3, 2>, + ConvImpl::template process_tile<1, 0, 4, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 4 + }, // Input pad left = 0 + { // Input pad left = 1 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 0, 0, 0, 0>, + ConvImpl::template process_tile<1, 1, 0, 0, 0, 1>, + ConvImpl::template process_tile<1, 1, 0, 0, 0, 2>, + ConvImpl::template process_tile<1, 1, 0, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 0, 0, 1, 0>, + ConvImpl::template process_tile<1, 1, 0, 0, 1, 1>, + ConvImpl::template process_tile<1, 1, 0, 0, 1, 2>, + ConvImpl::template 
process_tile<1, 1, 0, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 0, 0, 2, 0>, + ConvImpl::template process_tile<1, 1, 0, 0, 2, 1>, + ConvImpl::template process_tile<1, 1, 0, 0, 2, 2>, + ConvImpl::template process_tile<1, 1, 0, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 0, 0, 3, 0>, + ConvImpl::template process_tile<1, 1, 0, 0, 3, 1>, + ConvImpl::template process_tile<1, 1, 0, 0, 3, 2>, + ConvImpl::template process_tile<1, 1, 0, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 0, 1, 0, 0>, + ConvImpl::template process_tile<1, 1, 0, 1, 0, 1>, + ConvImpl::template process_tile<1, 1, 0, 1, 0, 2>, + ConvImpl::template process_tile<1, 1, 0, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 0, 1, 1, 0>, + ConvImpl::template process_tile<1, 1, 0, 1, 1, 1>, + ConvImpl::template process_tile<1, 1, 0, 1, 1, 2>, + ConvImpl::template process_tile<1, 1, 0, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 0, 1, 2, 0>, + ConvImpl::template process_tile<1, 1, 0, 1, 2, 1>, + ConvImpl::template process_tile<1, 1, 0, 1, 2, 2>, + ConvImpl::template process_tile<1, 1, 0, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 0, 1, 3, 0>, + ConvImpl::template process_tile<1, 1, 0, 1, 3, 1>, + ConvImpl::template process_tile<1, 1, 0, 1, 3, 2>, + ConvImpl::template process_tile<1, 1, 0, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 0, 2, 0, 0>, + ConvImpl::template process_tile<1, 1, 0, 2, 0, 1>, + ConvImpl::template process_tile<1, 1, 0, 2, 0, 2>, + 
ConvImpl::template process_tile<1, 1, 0, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 0, 2, 1, 0>, + ConvImpl::template process_tile<1, 1, 0, 2, 1, 1>, + ConvImpl::template process_tile<1, 1, 0, 2, 1, 2>, + ConvImpl::template process_tile<1, 1, 0, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 0, 2, 2, 0>, + ConvImpl::template process_tile<1, 1, 0, 2, 2, 1>, + ConvImpl::template process_tile<1, 1, 0, 2, 2, 2>, + ConvImpl::template process_tile<1, 1, 0, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 0, 2, 3, 0>, + ConvImpl::template process_tile<1, 1, 0, 2, 3, 1>, + ConvImpl::template process_tile<1, 1, 0, 2, 3, 2>, + ConvImpl::template process_tile<1, 1, 0, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 0, 3, 0, 0>, + ConvImpl::template process_tile<1, 1, 0, 3, 0, 1>, + ConvImpl::template process_tile<1, 1, 0, 3, 0, 2>, + ConvImpl::template process_tile<1, 1, 0, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 0, 3, 1, 0>, + ConvImpl::template process_tile<1, 1, 0, 3, 1, 1>, + ConvImpl::template process_tile<1, 1, 0, 3, 1, 2>, + ConvImpl::template process_tile<1, 1, 0, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 0, 3, 2, 0>, + ConvImpl::template process_tile<1, 1, 0, 3, 2, 1>, + ConvImpl::template process_tile<1, 1, 0, 3, 2, 2>, + ConvImpl::template process_tile<1, 1, 0, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 0, 3, 3, 0>, + ConvImpl::template process_tile<1, 1, 0, 3, 3, 1>, + ConvImpl::template process_tile<1, 1, 0, 3, 3, 2>, + ConvImpl::template process_tile<1, 1, 0, 3, 3, 
3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 0, 4, 0, 0>, + ConvImpl::template process_tile<1, 1, 0, 4, 0, 1>, + ConvImpl::template process_tile<1, 1, 0, 4, 0, 2>, + ConvImpl::template process_tile<1, 1, 0, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 0, 4, 1, 0>, + ConvImpl::template process_tile<1, 1, 0, 4, 1, 1>, + ConvImpl::template process_tile<1, 1, 0, 4, 1, 2>, + ConvImpl::template process_tile<1, 1, 0, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 0, 4, 2, 0>, + ConvImpl::template process_tile<1, 1, 0, 4, 2, 1>, + ConvImpl::template process_tile<1, 1, 0, 4, 2, 2>, + ConvImpl::template process_tile<1, 1, 0, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 0, 4, 3, 0>, + ConvImpl::template process_tile<1, 1, 0, 4, 3, 1>, + ConvImpl::template process_tile<1, 1, 0, 4, 3, 2>, + ConvImpl::template process_tile<1, 1, 0, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 1, 0, 0, 0>, + ConvImpl::template process_tile<1, 1, 1, 0, 0, 1>, + ConvImpl::template process_tile<1, 1, 1, 0, 0, 2>, + ConvImpl::template process_tile<1, 1, 1, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 1, 0, 1, 0>, + ConvImpl::template process_tile<1, 1, 1, 0, 1, 1>, + ConvImpl::template process_tile<1, 1, 1, 0, 1, 2>, + ConvImpl::template process_tile<1, 1, 1, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 1, 0, 2, 0>, + ConvImpl::template process_tile<1, 1, 1, 0, 2, 1>, + ConvImpl::template process_tile<1, 
1, 1, 0, 2, 2>, + ConvImpl::template process_tile<1, 1, 1, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 1, 0, 3, 0>, + ConvImpl::template process_tile<1, 1, 1, 0, 3, 1>, + ConvImpl::template process_tile<1, 1, 1, 0, 3, 2>, + ConvImpl::template process_tile<1, 1, 1, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 1, 1, 0, 0>, + ConvImpl::template process_tile<1, 1, 1, 1, 0, 1>, + ConvImpl::template process_tile<1, 1, 1, 1, 0, 2>, + ConvImpl::template process_tile<1, 1, 1, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 1, 1, 1, 0>, + ConvImpl::template process_tile<1, 1, 1, 1, 1, 1>, + ConvImpl::template process_tile<1, 1, 1, 1, 1, 2>, + ConvImpl::template process_tile<1, 1, 1, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 1, 1, 2, 0>, + ConvImpl::template process_tile<1, 1, 1, 1, 2, 1>, + ConvImpl::template process_tile<1, 1, 1, 1, 2, 2>, + ConvImpl::template process_tile<1, 1, 1, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 1, 1, 3, 0>, + ConvImpl::template process_tile<1, 1, 1, 1, 3, 1>, + ConvImpl::template process_tile<1, 1, 1, 1, 3, 2>, + ConvImpl::template process_tile<1, 1, 1, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 1, 2, 0, 0>, + ConvImpl::template process_tile<1, 1, 1, 2, 0, 1>, + ConvImpl::template process_tile<1, 1, 1, 2, 0, 2>, + ConvImpl::template process_tile<1, 1, 1, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 1, 2, 1, 0>, + ConvImpl::template process_tile<1, 1, 1, 2, 1, 1>, + ConvImpl::template 
process_tile<1, 1, 1, 2, 1, 2>, + ConvImpl::template process_tile<1, 1, 1, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 1, 2, 2, 0>, + ConvImpl::template process_tile<1, 1, 1, 2, 2, 1>, + ConvImpl::template process_tile<1, 1, 1, 2, 2, 2>, + ConvImpl::template process_tile<1, 1, 1, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 1, 2, 3, 0>, + ConvImpl::template process_tile<1, 1, 1, 2, 3, 1>, + ConvImpl::template process_tile<1, 1, 1, 2, 3, 2>, + ConvImpl::template process_tile<1, 1, 1, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 1, 3, 0, 0>, + ConvImpl::template process_tile<1, 1, 1, 3, 0, 1>, + ConvImpl::template process_tile<1, 1, 1, 3, 0, 2>, + ConvImpl::template process_tile<1, 1, 1, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 1, 3, 1, 0>, + ConvImpl::template process_tile<1, 1, 1, 3, 1, 1>, + ConvImpl::template process_tile<1, 1, 1, 3, 1, 2>, + ConvImpl::template process_tile<1, 1, 1, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 1, 3, 2, 0>, + ConvImpl::template process_tile<1, 1, 1, 3, 2, 1>, + ConvImpl::template process_tile<1, 1, 1, 3, 2, 2>, + ConvImpl::template process_tile<1, 1, 1, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 1, 3, 3, 0>, + ConvImpl::template process_tile<1, 1, 1, 3, 3, 1>, + ConvImpl::template process_tile<1, 1, 1, 3, 3, 2>, + ConvImpl::template process_tile<1, 1, 1, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 1, 4, 0, 0>, + ConvImpl::template process_tile<1, 1, 1, 4, 0, 1>, + 
ConvImpl::template process_tile<1, 1, 1, 4, 0, 2>, + ConvImpl::template process_tile<1, 1, 1, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 1, 4, 1, 0>, + ConvImpl::template process_tile<1, 1, 1, 4, 1, 1>, + ConvImpl::template process_tile<1, 1, 1, 4, 1, 2>, + ConvImpl::template process_tile<1, 1, 1, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 1, 4, 2, 0>, + ConvImpl::template process_tile<1, 1, 1, 4, 2, 1>, + ConvImpl::template process_tile<1, 1, 1, 4, 2, 2>, + ConvImpl::template process_tile<1, 1, 1, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 1, 4, 3, 0>, + ConvImpl::template process_tile<1, 1, 1, 4, 3, 1>, + ConvImpl::template process_tile<1, 1, 1, 4, 3, 2>, + ConvImpl::template process_tile<1, 1, 1, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 2, 0, 0, 0>, + ConvImpl::template process_tile<1, 1, 2, 0, 0, 1>, + ConvImpl::template process_tile<1, 1, 2, 0, 0, 2>, + ConvImpl::template process_tile<1, 1, 2, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 2, 0, 1, 0>, + ConvImpl::template process_tile<1, 1, 2, 0, 1, 1>, + ConvImpl::template process_tile<1, 1, 2, 0, 1, 2>, + ConvImpl::template process_tile<1, 1, 2, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 2, 0, 2, 0>, + ConvImpl::template process_tile<1, 1, 2, 0, 2, 1>, + ConvImpl::template process_tile<1, 1, 2, 0, 2, 2>, + ConvImpl::template process_tile<1, 1, 2, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 2, 0, 3, 0>, + ConvImpl::template process_tile<1, 1, 2, 0, 
3, 1>, + ConvImpl::template process_tile<1, 1, 2, 0, 3, 2>, + ConvImpl::template process_tile<1, 1, 2, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 2, 1, 0, 0>, + ConvImpl::template process_tile<1, 1, 2, 1, 0, 1>, + ConvImpl::template process_tile<1, 1, 2, 1, 0, 2>, + ConvImpl::template process_tile<1, 1, 2, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 2, 1, 1, 0>, + ConvImpl::template process_tile<1, 1, 2, 1, 1, 1>, + ConvImpl::template process_tile<1, 1, 2, 1, 1, 2>, + ConvImpl::template process_tile<1, 1, 2, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 2, 1, 2, 0>, + ConvImpl::template process_tile<1, 1, 2, 1, 2, 1>, + ConvImpl::template process_tile<1, 1, 2, 1, 2, 2>, + ConvImpl::template process_tile<1, 1, 2, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 2, 1, 3, 0>, + ConvImpl::template process_tile<1, 1, 2, 1, 3, 1>, + ConvImpl::template process_tile<1, 1, 2, 1, 3, 2>, + ConvImpl::template process_tile<1, 1, 2, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 2, 2, 0, 0>, + ConvImpl::template process_tile<1, 1, 2, 2, 0, 1>, + ConvImpl::template process_tile<1, 1, 2, 2, 0, 2>, + ConvImpl::template process_tile<1, 1, 2, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 2, 2, 1, 0>, + ConvImpl::template process_tile<1, 1, 2, 2, 1, 1>, + ConvImpl::template process_tile<1, 1, 2, 2, 1, 2>, + ConvImpl::template process_tile<1, 1, 2, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 2, 2, 2, 0>, + ConvImpl::template process_tile<1, 
1, 2, 2, 2, 1>, + ConvImpl::template process_tile<1, 1, 2, 2, 2, 2>, + ConvImpl::template process_tile<1, 1, 2, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 2, 2, 3, 0>, + ConvImpl::template process_tile<1, 1, 2, 2, 3, 1>, + ConvImpl::template process_tile<1, 1, 2, 2, 3, 2>, + ConvImpl::template process_tile<1, 1, 2, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 2, 3, 0, 0>, + ConvImpl::template process_tile<1, 1, 2, 3, 0, 1>, + ConvImpl::template process_tile<1, 1, 2, 3, 0, 2>, + ConvImpl::template process_tile<1, 1, 2, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 2, 3, 1, 0>, + ConvImpl::template process_tile<1, 1, 2, 3, 1, 1>, + ConvImpl::template process_tile<1, 1, 2, 3, 1, 2>, + ConvImpl::template process_tile<1, 1, 2, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 2, 3, 2, 0>, + ConvImpl::template process_tile<1, 1, 2, 3, 2, 1>, + ConvImpl::template process_tile<1, 1, 2, 3, 2, 2>, + ConvImpl::template process_tile<1, 1, 2, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 2, 3, 3, 0>, + ConvImpl::template process_tile<1, 1, 2, 3, 3, 1>, + ConvImpl::template process_tile<1, 1, 2, 3, 3, 2>, + ConvImpl::template process_tile<1, 1, 2, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 2, 4, 0, 0>, + ConvImpl::template process_tile<1, 1, 2, 4, 0, 1>, + ConvImpl::template process_tile<1, 1, 2, 4, 0, 2>, + ConvImpl::template process_tile<1, 1, 2, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 2, 4, 1, 0>, + ConvImpl::template 
process_tile<1, 1, 2, 4, 1, 1>, + ConvImpl::template process_tile<1, 1, 2, 4, 1, 2>, + ConvImpl::template process_tile<1, 1, 2, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 2, 4, 2, 0>, + ConvImpl::template process_tile<1, 1, 2, 4, 2, 1>, + ConvImpl::template process_tile<1, 1, 2, 4, 2, 2>, + ConvImpl::template process_tile<1, 1, 2, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 2, 4, 3, 0>, + ConvImpl::template process_tile<1, 1, 2, 4, 3, 1>, + ConvImpl::template process_tile<1, 1, 2, 4, 3, 2>, + ConvImpl::template process_tile<1, 1, 2, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 3, 0, 0, 0>, + ConvImpl::template process_tile<1, 1, 3, 0, 0, 1>, + ConvImpl::template process_tile<1, 1, 3, 0, 0, 2>, + ConvImpl::template process_tile<1, 1, 3, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 3, 0, 1, 0>, + ConvImpl::template process_tile<1, 1, 3, 0, 1, 1>, + ConvImpl::template process_tile<1, 1, 3, 0, 1, 2>, + ConvImpl::template process_tile<1, 1, 3, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 3, 0, 2, 0>, + ConvImpl::template process_tile<1, 1, 3, 0, 2, 1>, + ConvImpl::template process_tile<1, 1, 3, 0, 2, 2>, + ConvImpl::template process_tile<1, 1, 3, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 3, 0, 3, 0>, + ConvImpl::template process_tile<1, 1, 3, 0, 3, 1>, + ConvImpl::template process_tile<1, 1, 3, 0, 3, 2>, + ConvImpl::template process_tile<1, 1, 3, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + 
ConvImpl::template process_tile<1, 1, 3, 1, 0, 0>, + ConvImpl::template process_tile<1, 1, 3, 1, 0, 1>, + ConvImpl::template process_tile<1, 1, 3, 1, 0, 2>, + ConvImpl::template process_tile<1, 1, 3, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 3, 1, 1, 0>, + ConvImpl::template process_tile<1, 1, 3, 1, 1, 1>, + ConvImpl::template process_tile<1, 1, 3, 1, 1, 2>, + ConvImpl::template process_tile<1, 1, 3, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 3, 1, 2, 0>, + ConvImpl::template process_tile<1, 1, 3, 1, 2, 1>, + ConvImpl::template process_tile<1, 1, 3, 1, 2, 2>, + ConvImpl::template process_tile<1, 1, 3, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 3, 1, 3, 0>, + ConvImpl::template process_tile<1, 1, 3, 1, 3, 1>, + ConvImpl::template process_tile<1, 1, 3, 1, 3, 2>, + ConvImpl::template process_tile<1, 1, 3, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 3, 2, 0, 0>, + ConvImpl::template process_tile<1, 1, 3, 2, 0, 1>, + ConvImpl::template process_tile<1, 1, 3, 2, 0, 2>, + ConvImpl::template process_tile<1, 1, 3, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 3, 2, 1, 0>, + ConvImpl::template process_tile<1, 1, 3, 2, 1, 1>, + ConvImpl::template process_tile<1, 1, 3, 2, 1, 2>, + ConvImpl::template process_tile<1, 1, 3, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 3, 2, 2, 0>, + ConvImpl::template process_tile<1, 1, 3, 2, 2, 1>, + ConvImpl::template process_tile<1, 1, 3, 2, 2, 2>, + ConvImpl::template process_tile<1, 1, 3, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 3, 2, 3, 
0>, + ConvImpl::template process_tile<1, 1, 3, 2, 3, 1>, + ConvImpl::template process_tile<1, 1, 3, 2, 3, 2>, + ConvImpl::template process_tile<1, 1, 3, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 3, 3, 0, 0>, + ConvImpl::template process_tile<1, 1, 3, 3, 0, 1>, + ConvImpl::template process_tile<1, 1, 3, 3, 0, 2>, + ConvImpl::template process_tile<1, 1, 3, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 3, 3, 1, 0>, + ConvImpl::template process_tile<1, 1, 3, 3, 1, 1>, + ConvImpl::template process_tile<1, 1, 3, 3, 1, 2>, + ConvImpl::template process_tile<1, 1, 3, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 3, 3, 2, 0>, + ConvImpl::template process_tile<1, 1, 3, 3, 2, 1>, + ConvImpl::template process_tile<1, 1, 3, 3, 2, 2>, + ConvImpl::template process_tile<1, 1, 3, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 3, 3, 3, 0>, + ConvImpl::template process_tile<1, 1, 3, 3, 3, 1>, + ConvImpl::template process_tile<1, 1, 3, 3, 3, 2>, + ConvImpl::template process_tile<1, 1, 3, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 3, 4, 0, 0>, + ConvImpl::template process_tile<1, 1, 3, 4, 0, 1>, + ConvImpl::template process_tile<1, 1, 3, 4, 0, 2>, + ConvImpl::template process_tile<1, 1, 3, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 3, 4, 1, 0>, + ConvImpl::template process_tile<1, 1, 3, 4, 1, 1>, + ConvImpl::template process_tile<1, 1, 3, 4, 1, 2>, + ConvImpl::template process_tile<1, 1, 3, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 
3, 4, 2, 0>, + ConvImpl::template process_tile<1, 1, 3, 4, 2, 1>, + ConvImpl::template process_tile<1, 1, 3, 4, 2, 2>, + ConvImpl::template process_tile<1, 1, 3, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 3, 4, 3, 0>, + ConvImpl::template process_tile<1, 1, 3, 4, 3, 1>, + ConvImpl::template process_tile<1, 1, 3, 4, 3, 2>, + ConvImpl::template process_tile<1, 1, 3, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 3 + { // Input pad bottom = 4 + { // Input pad right = 0 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 4, 0, 0, 0>, + ConvImpl::template process_tile<1, 1, 4, 0, 0, 1>, + ConvImpl::template process_tile<1, 1, 4, 0, 0, 2>, + ConvImpl::template process_tile<1, 1, 4, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 4, 0, 1, 0>, + ConvImpl::template process_tile<1, 1, 4, 0, 1, 1>, + ConvImpl::template process_tile<1, 1, 4, 0, 1, 2>, + ConvImpl::template process_tile<1, 1, 4, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 4, 0, 2, 0>, + ConvImpl::template process_tile<1, 1, 4, 0, 2, 1>, + ConvImpl::template process_tile<1, 1, 4, 0, 2, 2>, + ConvImpl::template process_tile<1, 1, 4, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 4, 0, 3, 0>, + ConvImpl::template process_tile<1, 1, 4, 0, 3, 1>, + ConvImpl::template process_tile<1, 1, 4, 0, 3, 2>, + ConvImpl::template process_tile<1, 1, 4, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 4, 1, 0, 0>, + ConvImpl::template process_tile<1, 1, 4, 1, 0, 1>, + ConvImpl::template process_tile<1, 1, 4, 1, 0, 2>, + ConvImpl::template process_tile<1, 1, 4, 1, 0, 3>, + }, // Output pad bottom = 0 + { 
// Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 4, 1, 1, 0>, + ConvImpl::template process_tile<1, 1, 4, 1, 1, 1>, + ConvImpl::template process_tile<1, 1, 4, 1, 1, 2>, + ConvImpl::template process_tile<1, 1, 4, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 4, 1, 2, 0>, + ConvImpl::template process_tile<1, 1, 4, 1, 2, 1>, + ConvImpl::template process_tile<1, 1, 4, 1, 2, 2>, + ConvImpl::template process_tile<1, 1, 4, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 4, 1, 3, 0>, + ConvImpl::template process_tile<1, 1, 4, 1, 3, 1>, + ConvImpl::template process_tile<1, 1, 4, 1, 3, 2>, + ConvImpl::template process_tile<1, 1, 4, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 4, 2, 0, 0>, + ConvImpl::template process_tile<1, 1, 4, 2, 0, 1>, + ConvImpl::template process_tile<1, 1, 4, 2, 0, 2>, + ConvImpl::template process_tile<1, 1, 4, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 4, 2, 1, 0>, + ConvImpl::template process_tile<1, 1, 4, 2, 1, 1>, + ConvImpl::template process_tile<1, 1, 4, 2, 1, 2>, + ConvImpl::template process_tile<1, 1, 4, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 4, 2, 2, 0>, + ConvImpl::template process_tile<1, 1, 4, 2, 2, 1>, + ConvImpl::template process_tile<1, 1, 4, 2, 2, 2>, + ConvImpl::template process_tile<1, 1, 4, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 4, 2, 3, 0>, + ConvImpl::template process_tile<1, 1, 4, 2, 3, 1>, + ConvImpl::template process_tile<1, 1, 4, 2, 3, 2>, + ConvImpl::template process_tile<1, 1, 4, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right 
= 3 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 4, 3, 0, 0>, + ConvImpl::template process_tile<1, 1, 4, 3, 0, 1>, + ConvImpl::template process_tile<1, 1, 4, 3, 0, 2>, + ConvImpl::template process_tile<1, 1, 4, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 4, 3, 1, 0>, + ConvImpl::template process_tile<1, 1, 4, 3, 1, 1>, + ConvImpl::template process_tile<1, 1, 4, 3, 1, 2>, + ConvImpl::template process_tile<1, 1, 4, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 4, 3, 2, 0>, + ConvImpl::template process_tile<1, 1, 4, 3, 2, 1>, + ConvImpl::template process_tile<1, 1, 4, 3, 2, 2>, + ConvImpl::template process_tile<1, 1, 4, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + ConvImpl::template process_tile<1, 1, 4, 3, 3, 0>, + ConvImpl::template process_tile<1, 1, 4, 3, 3, 1>, + ConvImpl::template process_tile<1, 1, 4, 3, 3, 2>, + ConvImpl::template process_tile<1, 1, 4, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + ConvImpl::template process_tile<1, 1, 4, 4, 0, 0>, + ConvImpl::template process_tile<1, 1, 4, 4, 0, 1>, + ConvImpl::template process_tile<1, 1, 4, 4, 0, 2>, + ConvImpl::template process_tile<1, 1, 4, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + ConvImpl::template process_tile<1, 1, 4, 4, 1, 0>, + ConvImpl::template process_tile<1, 1, 4, 4, 1, 1>, + ConvImpl::template process_tile<1, 1, 4, 4, 1, 2>, + ConvImpl::template process_tile<1, 1, 4, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + ConvImpl::template process_tile<1, 1, 4, 4, 2, 0>, + ConvImpl::template process_tile<1, 1, 4, 4, 2, 1>, + ConvImpl::template process_tile<1, 1, 4, 4, 2, 2>, + ConvImpl::template process_tile<1, 1, 4, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + 
ConvImpl::template process_tile<1, 1, 4, 4, 3, 0>, + ConvImpl::template process_tile<1, 1, 4, 4, 3, 1>, + ConvImpl::template process_tile<1, 1, 4, 4, 3, 2>, + ConvImpl::template process_tile<1, 1, 4, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + }, // Input pad bottom = 4 + }, // Input pad left = 1 + }, // Input pad top = 1 +}; + + +template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>; +} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp new file mode 100644 index 0000000000..2104c0bbf7 --- /dev/null +++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp @@ -0,0 +1,5207 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp" + +namespace depthwise +{ +using Conv = DepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float>; +using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 2, 2, float, float>; + +template <> +const Conv::TileFn Conv::tile_fns + [max_in_pad_top] + [max_in_pad_left] + [max_in_pad_bottom] + [max_in_pad_right] + [max_out_pad_bottom] + [max_out_pad_right] = { + { // Input pad top = 0 + { // Input pad left = 0 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 0, 0, 0>, + Conv::template process_tile<0, 0, 0, 0, 0, 1>, + Conv::template process_tile<0, 0, 0, 0, 0, 2>, + Conv::template process_tile<0, 0, 0, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 0, 1, 0>, + Conv::template process_tile<0, 0, 0, 0, 1, 1>, + Conv::template process_tile<0, 0, 0, 0, 1, 2>, + Conv::template process_tile<0, 0, 0, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 0, 0, 2, 0>, + Conv::template process_tile<0, 0, 0, 0, 2, 1>, + Conv::template process_tile<0, 0, 0, 0, 2, 2>, + Conv::template process_tile<0, 0, 0, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 0, 0, 3, 0>, + Conv::template process_tile<0, 0, 0, 0, 3, 1>, + Conv::template process_tile<0, 0, 0, 0, 3, 2>, + Conv::template process_tile<0, 0, 0, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 1, 0, 0>, + Conv::template process_tile<0, 0, 0, 1, 0, 1>, + Conv::template process_tile<0, 0, 0, 1, 0, 2>, + Conv::template process_tile<0, 0, 0, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 1, 1, 0>, + Conv::template process_tile<0, 0, 0, 1, 1, 
1>, + Conv::template process_tile<0, 0, 0, 1, 1, 2>, + Conv::template process_tile<0, 0, 0, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 0, 1, 2, 0>, + Conv::template process_tile<0, 0, 0, 1, 2, 1>, + Conv::template process_tile<0, 0, 0, 1, 2, 2>, + Conv::template process_tile<0, 0, 0, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 0, 1, 3, 0>, + Conv::template process_tile<0, 0, 0, 1, 3, 1>, + Conv::template process_tile<0, 0, 0, 1, 3, 2>, + Conv::template process_tile<0, 0, 0, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 2, 0, 0>, + Conv::template process_tile<0, 0, 0, 2, 0, 1>, + Conv::template process_tile<0, 0, 0, 2, 0, 2>, + Conv::template process_tile<0, 0, 0, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 2, 1, 0>, + Conv::template process_tile<0, 0, 0, 2, 1, 1>, + Conv::template process_tile<0, 0, 0, 2, 1, 2>, + Conv::template process_tile<0, 0, 0, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 0, 2, 2, 0>, + Conv::template process_tile<0, 0, 0, 2, 2, 1>, + Conv::template process_tile<0, 0, 0, 2, 2, 2>, + Conv::template process_tile<0, 0, 0, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 0, 2, 3, 0>, + Conv::template process_tile<0, 0, 0, 2, 3, 1>, + Conv::template process_tile<0, 0, 0, 2, 3, 2>, + Conv::template process_tile<0, 0, 0, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 3, 0, 0>, + Conv::template process_tile<0, 0, 0, 3, 0, 1>, + Conv::template process_tile<0, 0, 0, 3, 0, 2>, + Conv::template process_tile<0, 0, 0, 3, 0, 3>, + 
}, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 3, 1, 0>, + Conv::template process_tile<0, 0, 0, 3, 1, 1>, + Conv::template process_tile<0, 0, 0, 3, 1, 2>, + Conv::template process_tile<0, 0, 0, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 0, 3, 2, 0>, + Conv::template process_tile<0, 0, 0, 3, 2, 1>, + Conv::template process_tile<0, 0, 0, 3, 2, 2>, + Conv::template process_tile<0, 0, 0, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 0, 3, 3, 0>, + Conv::template process_tile<0, 0, 0, 3, 3, 1>, + Conv::template process_tile<0, 0, 0, 3, 3, 2>, + Conv::template process_tile<0, 0, 0, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 4, 0, 0>, + Conv::template process_tile<0, 0, 0, 4, 0, 1>, + Conv::template process_tile<0, 0, 0, 4, 0, 2>, + Conv::template process_tile<0, 0, 0, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 4, 1, 0>, + Conv::template process_tile<0, 0, 0, 4, 1, 1>, + Conv::template process_tile<0, 0, 0, 4, 1, 2>, + Conv::template process_tile<0, 0, 0, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 0, 4, 2, 0>, + Conv::template process_tile<0, 0, 0, 4, 2, 1>, + Conv::template process_tile<0, 0, 0, 4, 2, 2>, + Conv::template process_tile<0, 0, 0, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 0, 4, 3, 0>, + Conv::template process_tile<0, 0, 0, 4, 3, 1>, + Conv::template process_tile<0, 0, 0, 4, 3, 2>, + Conv::template process_tile<0, 0, 0, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 5, 0, 
0>, + Conv::template process_tile<0, 0, 0, 5, 0, 1>, + Conv::template process_tile<0, 0, 0, 5, 0, 2>, + Conv::template process_tile<0, 0, 0, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 5, 1, 0>, + Conv::template process_tile<0, 0, 0, 5, 1, 1>, + Conv::template process_tile<0, 0, 0, 5, 1, 2>, + Conv::template process_tile<0, 0, 0, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 0, 5, 2, 0>, + Conv::template process_tile<0, 0, 0, 5, 2, 1>, + Conv::template process_tile<0, 0, 0, 5, 2, 2>, + Conv::template process_tile<0, 0, 0, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 0, 5, 3, 0>, + Conv::template process_tile<0, 0, 0, 5, 3, 1>, + Conv::template process_tile<0, 0, 0, 5, 3, 2>, + Conv::template process_tile<0, 0, 0, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 0, 6, 0, 0>, + Conv::template process_tile<0, 0, 0, 6, 0, 1>, + Conv::template process_tile<0, 0, 0, 6, 0, 2>, + Conv::template process_tile<0, 0, 0, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 0, 6, 1, 0>, + Conv::template process_tile<0, 0, 0, 6, 1, 1>, + Conv::template process_tile<0, 0, 0, 6, 1, 2>, + Conv::template process_tile<0, 0, 0, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 0, 6, 2, 0>, + Conv::template process_tile<0, 0, 0, 6, 2, 1>, + Conv::template process_tile<0, 0, 0, 6, 2, 2>, + Conv::template process_tile<0, 0, 0, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 0, 6, 3, 0>, + Conv::template process_tile<0, 0, 0, 6, 3, 1>, + Conv::template process_tile<0, 0, 0, 6, 3, 2>, + Conv::template process_tile<0, 0, 0, 6, 3, 3>, + }, // 
Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 0, 0, 0>, + Conv::template process_tile<0, 0, 1, 0, 0, 1>, + Conv::template process_tile<0, 0, 1, 0, 0, 2>, + Conv::template process_tile<0, 0, 1, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 0, 1, 0>, + Conv::template process_tile<0, 0, 1, 0, 1, 1>, + Conv::template process_tile<0, 0, 1, 0, 1, 2>, + Conv::template process_tile<0, 0, 1, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 1, 0, 2, 0>, + Conv::template process_tile<0, 0, 1, 0, 2, 1>, + Conv::template process_tile<0, 0, 1, 0, 2, 2>, + Conv::template process_tile<0, 0, 1, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 1, 0, 3, 0>, + Conv::template process_tile<0, 0, 1, 0, 3, 1>, + Conv::template process_tile<0, 0, 1, 0, 3, 2>, + Conv::template process_tile<0, 0, 1, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 1, 0, 0>, + Conv::template process_tile<0, 0, 1, 1, 0, 1>, + Conv::template process_tile<0, 0, 1, 1, 0, 2>, + Conv::template process_tile<0, 0, 1, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 1, 1, 0>, + Conv::template process_tile<0, 0, 1, 1, 1, 1>, + Conv::template process_tile<0, 0, 1, 1, 1, 2>, + Conv::template process_tile<0, 0, 1, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 1, 1, 2, 0>, + Conv::template process_tile<0, 0, 1, 1, 2, 1>, + Conv::template process_tile<0, 0, 1, 1, 2, 2>, + Conv::template process_tile<0, 0, 1, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom 
= 3 + Conv::template process_tile<0, 0, 1, 1, 3, 0>, + Conv::template process_tile<0, 0, 1, 1, 3, 1>, + Conv::template process_tile<0, 0, 1, 1, 3, 2>, + Conv::template process_tile<0, 0, 1, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 2, 0, 0>, + Conv::template process_tile<0, 0, 1, 2, 0, 1>, + Conv::template process_tile<0, 0, 1, 2, 0, 2>, + Conv::template process_tile<0, 0, 1, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 2, 1, 0>, + Conv::template process_tile<0, 0, 1, 2, 1, 1>, + Conv::template process_tile<0, 0, 1, 2, 1, 2>, + Conv::template process_tile<0, 0, 1, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 1, 2, 2, 0>, + Conv::template process_tile<0, 0, 1, 2, 2, 1>, + Conv::template process_tile<0, 0, 1, 2, 2, 2>, + Conv::template process_tile<0, 0, 1, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 1, 2, 3, 0>, + Conv::template process_tile<0, 0, 1, 2, 3, 1>, + Conv::template process_tile<0, 0, 1, 2, 3, 2>, + Conv::template process_tile<0, 0, 1, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 3, 0, 0>, + Conv::template process_tile<0, 0, 1, 3, 0, 1>, + Conv::template process_tile<0, 0, 1, 3, 0, 2>, + Conv::template process_tile<0, 0, 1, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 3, 1, 0>, + Conv::template process_tile<0, 0, 1, 3, 1, 1>, + Conv::template process_tile<0, 0, 1, 3, 1, 2>, + Conv::template process_tile<0, 0, 1, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 1, 3, 2, 0>, + Conv::template process_tile<0, 0, 1, 3, 2, 1>, + 
Conv::template process_tile<0, 0, 1, 3, 2, 2>, + Conv::template process_tile<0, 0, 1, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 1, 3, 3, 0>, + Conv::template process_tile<0, 0, 1, 3, 3, 1>, + Conv::template process_tile<0, 0, 1, 3, 3, 2>, + Conv::template process_tile<0, 0, 1, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 4, 0, 0>, + Conv::template process_tile<0, 0, 1, 4, 0, 1>, + Conv::template process_tile<0, 0, 1, 4, 0, 2>, + Conv::template process_tile<0, 0, 1, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 4, 1, 0>, + Conv::template process_tile<0, 0, 1, 4, 1, 1>, + Conv::template process_tile<0, 0, 1, 4, 1, 2>, + Conv::template process_tile<0, 0, 1, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 1, 4, 2, 0>, + Conv::template process_tile<0, 0, 1, 4, 2, 1>, + Conv::template process_tile<0, 0, 1, 4, 2, 2>, + Conv::template process_tile<0, 0, 1, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 1, 4, 3, 0>, + Conv::template process_tile<0, 0, 1, 4, 3, 1>, + Conv::template process_tile<0, 0, 1, 4, 3, 2>, + Conv::template process_tile<0, 0, 1, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 5, 0, 0>, + Conv::template process_tile<0, 0, 1, 5, 0, 1>, + Conv::template process_tile<0, 0, 1, 5, 0, 2>, + Conv::template process_tile<0, 0, 1, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 5, 1, 0>, + Conv::template process_tile<0, 0, 1, 5, 1, 1>, + Conv::template process_tile<0, 0, 1, 5, 1, 2>, + Conv::template process_tile<0, 0, 1, 5, 1, 3>, + }, // 
Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 1, 5, 2, 0>, + Conv::template process_tile<0, 0, 1, 5, 2, 1>, + Conv::template process_tile<0, 0, 1, 5, 2, 2>, + Conv::template process_tile<0, 0, 1, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 1, 5, 3, 0>, + Conv::template process_tile<0, 0, 1, 5, 3, 1>, + Conv::template process_tile<0, 0, 1, 5, 3, 2>, + Conv::template process_tile<0, 0, 1, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 1, 6, 0, 0>, + Conv::template process_tile<0, 0, 1, 6, 0, 1>, + Conv::template process_tile<0, 0, 1, 6, 0, 2>, + Conv::template process_tile<0, 0, 1, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 1, 6, 1, 0>, + Conv::template process_tile<0, 0, 1, 6, 1, 1>, + Conv::template process_tile<0, 0, 1, 6, 1, 2>, + Conv::template process_tile<0, 0, 1, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 1, 6, 2, 0>, + Conv::template process_tile<0, 0, 1, 6, 2, 1>, + Conv::template process_tile<0, 0, 1, 6, 2, 2>, + Conv::template process_tile<0, 0, 1, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 1, 6, 3, 0>, + Conv::template process_tile<0, 0, 1, 6, 3, 1>, + Conv::template process_tile<0, 0, 1, 6, 3, 2>, + Conv::template process_tile<0, 0, 1, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 0, 0, 0>, + Conv::template process_tile<0, 0, 2, 0, 0, 1>, + Conv::template process_tile<0, 0, 2, 0, 0, 2>, + Conv::template process_tile<0, 0, 2, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom 
= 1 + Conv::template process_tile<0, 0, 2, 0, 1, 0>, + Conv::template process_tile<0, 0, 2, 0, 1, 1>, + Conv::template process_tile<0, 0, 2, 0, 1, 2>, + Conv::template process_tile<0, 0, 2, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 2, 0, 2, 0>, + Conv::template process_tile<0, 0, 2, 0, 2, 1>, + Conv::template process_tile<0, 0, 2, 0, 2, 2>, + Conv::template process_tile<0, 0, 2, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 2, 0, 3, 0>, + Conv::template process_tile<0, 0, 2, 0, 3, 1>, + Conv::template process_tile<0, 0, 2, 0, 3, 2>, + Conv::template process_tile<0, 0, 2, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 1, 0, 0>, + Conv::template process_tile<0, 0, 2, 1, 0, 1>, + Conv::template process_tile<0, 0, 2, 1, 0, 2>, + Conv::template process_tile<0, 0, 2, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 1, 1, 0>, + Conv::template process_tile<0, 0, 2, 1, 1, 1>, + Conv::template process_tile<0, 0, 2, 1, 1, 2>, + Conv::template process_tile<0, 0, 2, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 2, 1, 2, 0>, + Conv::template process_tile<0, 0, 2, 1, 2, 1>, + Conv::template process_tile<0, 0, 2, 1, 2, 2>, + Conv::template process_tile<0, 0, 2, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 2, 1, 3, 0>, + Conv::template process_tile<0, 0, 2, 1, 3, 1>, + Conv::template process_tile<0, 0, 2, 1, 3, 2>, + Conv::template process_tile<0, 0, 2, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 2, 0, 0>, + Conv::template process_tile<0, 0, 2, 2, 0, 1>, + 
Conv::template process_tile<0, 0, 2, 2, 0, 2>, + Conv::template process_tile<0, 0, 2, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 2, 1, 0>, + Conv::template process_tile<0, 0, 2, 2, 1, 1>, + Conv::template process_tile<0, 0, 2, 2, 1, 2>, + Conv::template process_tile<0, 0, 2, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 2, 2, 2, 0>, + Conv::template process_tile<0, 0, 2, 2, 2, 1>, + Conv::template process_tile<0, 0, 2, 2, 2, 2>, + Conv::template process_tile<0, 0, 2, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 2, 2, 3, 0>, + Conv::template process_tile<0, 0, 2, 2, 3, 1>, + Conv::template process_tile<0, 0, 2, 2, 3, 2>, + Conv::template process_tile<0, 0, 2, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 3, 0, 0>, + Conv::template process_tile<0, 0, 2, 3, 0, 1>, + Conv::template process_tile<0, 0, 2, 3, 0, 2>, + Conv::template process_tile<0, 0, 2, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 3, 1, 0>, + Conv::template process_tile<0, 0, 2, 3, 1, 1>, + Conv::template process_tile<0, 0, 2, 3, 1, 2>, + Conv::template process_tile<0, 0, 2, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 2, 3, 2, 0>, + Conv::template process_tile<0, 0, 2, 3, 2, 1>, + Conv::template process_tile<0, 0, 2, 3, 2, 2>, + Conv::template process_tile<0, 0, 2, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 2, 3, 3, 0>, + Conv::template process_tile<0, 0, 2, 3, 3, 1>, + Conv::template process_tile<0, 0, 2, 3, 3, 2>, + Conv::template process_tile<0, 0, 2, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { 
// Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 4, 0, 0>, + Conv::template process_tile<0, 0, 2, 4, 0, 1>, + Conv::template process_tile<0, 0, 2, 4, 0, 2>, + Conv::template process_tile<0, 0, 2, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 4, 1, 0>, + Conv::template process_tile<0, 0, 2, 4, 1, 1>, + Conv::template process_tile<0, 0, 2, 4, 1, 2>, + Conv::template process_tile<0, 0, 2, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 2, 4, 2, 0>, + Conv::template process_tile<0, 0, 2, 4, 2, 1>, + Conv::template process_tile<0, 0, 2, 4, 2, 2>, + Conv::template process_tile<0, 0, 2, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 2, 4, 3, 0>, + Conv::template process_tile<0, 0, 2, 4, 3, 1>, + Conv::template process_tile<0, 0, 2, 4, 3, 2>, + Conv::template process_tile<0, 0, 2, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 5, 0, 0>, + Conv::template process_tile<0, 0, 2, 5, 0, 1>, + Conv::template process_tile<0, 0, 2, 5, 0, 2>, + Conv::template process_tile<0, 0, 2, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 5, 1, 0>, + Conv::template process_tile<0, 0, 2, 5, 1, 1>, + Conv::template process_tile<0, 0, 2, 5, 1, 2>, + Conv::template process_tile<0, 0, 2, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 2, 5, 2, 0>, + Conv::template process_tile<0, 0, 2, 5, 2, 1>, + Conv::template process_tile<0, 0, 2, 5, 2, 2>, + Conv::template process_tile<0, 0, 2, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 2, 5, 3, 0>, + Conv::template process_tile<0, 0, 2, 5, 3, 1>, + 
Conv::template process_tile<0, 0, 2, 5, 3, 2>, + Conv::template process_tile<0, 0, 2, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 2, 6, 0, 0>, + Conv::template process_tile<0, 0, 2, 6, 0, 1>, + Conv::template process_tile<0, 0, 2, 6, 0, 2>, + Conv::template process_tile<0, 0, 2, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 2, 6, 1, 0>, + Conv::template process_tile<0, 0, 2, 6, 1, 1>, + Conv::template process_tile<0, 0, 2, 6, 1, 2>, + Conv::template process_tile<0, 0, 2, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 2, 6, 2, 0>, + Conv::template process_tile<0, 0, 2, 6, 2, 1>, + Conv::template process_tile<0, 0, 2, 6, 2, 2>, + Conv::template process_tile<0, 0, 2, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 2, 6, 3, 0>, + Conv::template process_tile<0, 0, 2, 6, 3, 1>, + Conv::template process_tile<0, 0, 2, 6, 3, 2>, + Conv::template process_tile<0, 0, 2, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 0, 0, 0>, + Conv::template process_tile<0, 0, 3, 0, 0, 1>, + Conv::template process_tile<0, 0, 3, 0, 0, 2>, + Conv::template process_tile<0, 0, 3, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 0, 1, 0>, + Conv::template process_tile<0, 0, 3, 0, 1, 1>, + Conv::template process_tile<0, 0, 3, 0, 1, 2>, + Conv::template process_tile<0, 0, 3, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 3, 0, 2, 0>, + Conv::template process_tile<0, 0, 3, 0, 2, 1>, + Conv::template process_tile<0, 0, 3, 0, 2, 2>, 
+ Conv::template process_tile<0, 0, 3, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 3, 0, 3, 0>, + Conv::template process_tile<0, 0, 3, 0, 3, 1>, + Conv::template process_tile<0, 0, 3, 0, 3, 2>, + Conv::template process_tile<0, 0, 3, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 1, 0, 0>, + Conv::template process_tile<0, 0, 3, 1, 0, 1>, + Conv::template process_tile<0, 0, 3, 1, 0, 2>, + Conv::template process_tile<0, 0, 3, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 1, 1, 0>, + Conv::template process_tile<0, 0, 3, 1, 1, 1>, + Conv::template process_tile<0, 0, 3, 1, 1, 2>, + Conv::template process_tile<0, 0, 3, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 3, 1, 2, 0>, + Conv::template process_tile<0, 0, 3, 1, 2, 1>, + Conv::template process_tile<0, 0, 3, 1, 2, 2>, + Conv::template process_tile<0, 0, 3, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 3, 1, 3, 0>, + Conv::template process_tile<0, 0, 3, 1, 3, 1>, + Conv::template process_tile<0, 0, 3, 1, 3, 2>, + Conv::template process_tile<0, 0, 3, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 2, 0, 0>, + Conv::template process_tile<0, 0, 3, 2, 0, 1>, + Conv::template process_tile<0, 0, 3, 2, 0, 2>, + Conv::template process_tile<0, 0, 3, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 2, 1, 0>, + Conv::template process_tile<0, 0, 3, 2, 1, 1>, + Conv::template process_tile<0, 0, 3, 2, 1, 2>, + Conv::template process_tile<0, 0, 3, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom 
= 2 + Conv::template process_tile<0, 0, 3, 2, 2, 0>, + Conv::template process_tile<0, 0, 3, 2, 2, 1>, + Conv::template process_tile<0, 0, 3, 2, 2, 2>, + Conv::template process_tile<0, 0, 3, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 3, 2, 3, 0>, + Conv::template process_tile<0, 0, 3, 2, 3, 1>, + Conv::template process_tile<0, 0, 3, 2, 3, 2>, + Conv::template process_tile<0, 0, 3, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 3, 0, 0>, + Conv::template process_tile<0, 0, 3, 3, 0, 1>, + Conv::template process_tile<0, 0, 3, 3, 0, 2>, + Conv::template process_tile<0, 0, 3, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 3, 1, 0>, + Conv::template process_tile<0, 0, 3, 3, 1, 1>, + Conv::template process_tile<0, 0, 3, 3, 1, 2>, + Conv::template process_tile<0, 0, 3, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 3, 3, 2, 0>, + Conv::template process_tile<0, 0, 3, 3, 2, 1>, + Conv::template process_tile<0, 0, 3, 3, 2, 2>, + Conv::template process_tile<0, 0, 3, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 3, 3, 3, 0>, + Conv::template process_tile<0, 0, 3, 3, 3, 1>, + Conv::template process_tile<0, 0, 3, 3, 3, 2>, + Conv::template process_tile<0, 0, 3, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 4, 0, 0>, + Conv::template process_tile<0, 0, 3, 4, 0, 1>, + Conv::template process_tile<0, 0, 3, 4, 0, 2>, + Conv::template process_tile<0, 0, 3, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 4, 1, 0>, + Conv::template process_tile<0, 0, 3, 4, 1, 1>, + 
Conv::template process_tile<0, 0, 3, 4, 1, 2>, + Conv::template process_tile<0, 0, 3, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 3, 4, 2, 0>, + Conv::template process_tile<0, 0, 3, 4, 2, 1>, + Conv::template process_tile<0, 0, 3, 4, 2, 2>, + Conv::template process_tile<0, 0, 3, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 3, 4, 3, 0>, + Conv::template process_tile<0, 0, 3, 4, 3, 1>, + Conv::template process_tile<0, 0, 3, 4, 3, 2>, + Conv::template process_tile<0, 0, 3, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 5, 0, 0>, + Conv::template process_tile<0, 0, 3, 5, 0, 1>, + Conv::template process_tile<0, 0, 3, 5, 0, 2>, + Conv::template process_tile<0, 0, 3, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 5, 1, 0>, + Conv::template process_tile<0, 0, 3, 5, 1, 1>, + Conv::template process_tile<0, 0, 3, 5, 1, 2>, + Conv::template process_tile<0, 0, 3, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 3, 5, 2, 0>, + Conv::template process_tile<0, 0, 3, 5, 2, 1>, + Conv::template process_tile<0, 0, 3, 5, 2, 2>, + Conv::template process_tile<0, 0, 3, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 3, 5, 3, 0>, + Conv::template process_tile<0, 0, 3, 5, 3, 1>, + Conv::template process_tile<0, 0, 3, 5, 3, 2>, + Conv::template process_tile<0, 0, 3, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 3, 6, 0, 0>, + Conv::template process_tile<0, 0, 3, 6, 0, 1>, + Conv::template process_tile<0, 0, 3, 6, 0, 2>, + Conv::template process_tile<0, 0, 3, 6, 0, 3>, + }, // 
Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 3, 6, 1, 0>, + Conv::template process_tile<0, 0, 3, 6, 1, 1>, + Conv::template process_tile<0, 0, 3, 6, 1, 2>, + Conv::template process_tile<0, 0, 3, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 3, 6, 2, 0>, + Conv::template process_tile<0, 0, 3, 6, 2, 1>, + Conv::template process_tile<0, 0, 3, 6, 2, 2>, + Conv::template process_tile<0, 0, 3, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 3, 6, 3, 0>, + Conv::template process_tile<0, 0, 3, 6, 3, 1>, + Conv::template process_tile<0, 0, 3, 6, 3, 2>, + Conv::template process_tile<0, 0, 3, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 3 + { // Input pad bottom = 4 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 4, 0, 0, 0>, + Conv::template process_tile<0, 0, 4, 0, 0, 1>, + Conv::template process_tile<0, 0, 4, 0, 0, 2>, + Conv::template process_tile<0, 0, 4, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 0, 1, 0>, + Conv::template process_tile<0, 0, 4, 0, 1, 1>, + Conv::template process_tile<0, 0, 4, 0, 1, 2>, + Conv::template process_tile<0, 0, 4, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 4, 0, 2, 0>, + Conv::template process_tile<0, 0, 4, 0, 2, 1>, + Conv::template process_tile<0, 0, 4, 0, 2, 2>, + Conv::template process_tile<0, 0, 4, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 4, 0, 3, 0>, + Conv::template process_tile<0, 0, 4, 0, 3, 1>, + Conv::template process_tile<0, 0, 4, 0, 3, 2>, + Conv::template process_tile<0, 0, 4, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom 
= 0 + Conv::template process_tile<0, 0, 4, 1, 0, 0>, + Conv::template process_tile<0, 0, 4, 1, 0, 1>, + Conv::template process_tile<0, 0, 4, 1, 0, 2>, + Conv::template process_tile<0, 0, 4, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 1, 1, 0>, + Conv::template process_tile<0, 0, 4, 1, 1, 1>, + Conv::template process_tile<0, 0, 4, 1, 1, 2>, + Conv::template process_tile<0, 0, 4, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 4, 1, 2, 0>, + Conv::template process_tile<0, 0, 4, 1, 2, 1>, + Conv::template process_tile<0, 0, 4, 1, 2, 2>, + Conv::template process_tile<0, 0, 4, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 4, 1, 3, 0>, + Conv::template process_tile<0, 0, 4, 1, 3, 1>, + Conv::template process_tile<0, 0, 4, 1, 3, 2>, + Conv::template process_tile<0, 0, 4, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 4, 2, 0, 0>, + Conv::template process_tile<0, 0, 4, 2, 0, 1>, + Conv::template process_tile<0, 0, 4, 2, 0, 2>, + Conv::template process_tile<0, 0, 4, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 2, 1, 0>, + Conv::template process_tile<0, 0, 4, 2, 1, 1>, + Conv::template process_tile<0, 0, 4, 2, 1, 2>, + Conv::template process_tile<0, 0, 4, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 4, 2, 2, 0>, + Conv::template process_tile<0, 0, 4, 2, 2, 1>, + Conv::template process_tile<0, 0, 4, 2, 2, 2>, + Conv::template process_tile<0, 0, 4, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 4, 2, 3, 0>, + Conv::template process_tile<0, 0, 4, 2, 3, 1>, + Conv::template process_tile<0, 0, 4, 2, 3, 2>, + 
Conv::template process_tile<0, 0, 4, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 4, 3, 0, 0>, + Conv::template process_tile<0, 0, 4, 3, 0, 1>, + Conv::template process_tile<0, 0, 4, 3, 0, 2>, + Conv::template process_tile<0, 0, 4, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 3, 1, 0>, + Conv::template process_tile<0, 0, 4, 3, 1, 1>, + Conv::template process_tile<0, 0, 4, 3, 1, 2>, + Conv::template process_tile<0, 0, 4, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 4, 3, 2, 0>, + Conv::template process_tile<0, 0, 4, 3, 2, 1>, + Conv::template process_tile<0, 0, 4, 3, 2, 2>, + Conv::template process_tile<0, 0, 4, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 4, 3, 3, 0>, + Conv::template process_tile<0, 0, 4, 3, 3, 1>, + Conv::template process_tile<0, 0, 4, 3, 3, 2>, + Conv::template process_tile<0, 0, 4, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 4, 4, 0, 0>, + Conv::template process_tile<0, 0, 4, 4, 0, 1>, + Conv::template process_tile<0, 0, 4, 4, 0, 2>, + Conv::template process_tile<0, 0, 4, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 4, 1, 0>, + Conv::template process_tile<0, 0, 4, 4, 1, 1>, + Conv::template process_tile<0, 0, 4, 4, 1, 2>, + Conv::template process_tile<0, 0, 4, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 4, 4, 2, 0>, + Conv::template process_tile<0, 0, 4, 4, 2, 1>, + Conv::template process_tile<0, 0, 4, 4, 2, 2>, + Conv::template process_tile<0, 0, 4, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 
3 + Conv::template process_tile<0, 0, 4, 4, 3, 0>, + Conv::template process_tile<0, 0, 4, 4, 3, 1>, + Conv::template process_tile<0, 0, 4, 4, 3, 2>, + Conv::template process_tile<0, 0, 4, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 4, 5, 0, 0>, + Conv::template process_tile<0, 0, 4, 5, 0, 1>, + Conv::template process_tile<0, 0, 4, 5, 0, 2>, + Conv::template process_tile<0, 0, 4, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 5, 1, 0>, + Conv::template process_tile<0, 0, 4, 5, 1, 1>, + Conv::template process_tile<0, 0, 4, 5, 1, 2>, + Conv::template process_tile<0, 0, 4, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 4, 5, 2, 0>, + Conv::template process_tile<0, 0, 4, 5, 2, 1>, + Conv::template process_tile<0, 0, 4, 5, 2, 2>, + Conv::template process_tile<0, 0, 4, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 4, 5, 3, 0>, + Conv::template process_tile<0, 0, 4, 5, 3, 1>, + Conv::template process_tile<0, 0, 4, 5, 3, 2>, + Conv::template process_tile<0, 0, 4, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 4, 6, 0, 0>, + Conv::template process_tile<0, 0, 4, 6, 0, 1>, + Conv::template process_tile<0, 0, 4, 6, 0, 2>, + Conv::template process_tile<0, 0, 4, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 4, 6, 1, 0>, + Conv::template process_tile<0, 0, 4, 6, 1, 1>, + Conv::template process_tile<0, 0, 4, 6, 1, 2>, + Conv::template process_tile<0, 0, 4, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 4, 6, 2, 0>, + Conv::template process_tile<0, 0, 4, 6, 2, 1>, + 
Conv::template process_tile<0, 0, 4, 6, 2, 2>, + Conv::template process_tile<0, 0, 4, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 4, 6, 3, 0>, + Conv::template process_tile<0, 0, 4, 6, 3, 1>, + Conv::template process_tile<0, 0, 4, 6, 3, 2>, + Conv::template process_tile<0, 0, 4, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 4 + { // Input pad bottom = 5 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 5, 0, 0, 0>, + Conv::template process_tile<0, 0, 5, 0, 0, 1>, + Conv::template process_tile<0, 0, 5, 0, 0, 2>, + Conv::template process_tile<0, 0, 5, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 5, 0, 1, 0>, + Conv::template process_tile<0, 0, 5, 0, 1, 1>, + Conv::template process_tile<0, 0, 5, 0, 1, 2>, + Conv::template process_tile<0, 0, 5, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 5, 0, 2, 0>, + Conv::template process_tile<0, 0, 5, 0, 2, 1>, + Conv::template process_tile<0, 0, 5, 0, 2, 2>, + Conv::template process_tile<0, 0, 5, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 5, 0, 3, 0>, + Conv::template process_tile<0, 0, 5, 0, 3, 1>, + Conv::template process_tile<0, 0, 5, 0, 3, 2>, + Conv::template process_tile<0, 0, 5, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 5, 1, 0, 0>, + Conv::template process_tile<0, 0, 5, 1, 0, 1>, + Conv::template process_tile<0, 0, 5, 1, 0, 2>, + Conv::template process_tile<0, 0, 5, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 5, 1, 1, 0>, + Conv::template process_tile<0, 0, 5, 1, 1, 1>, + Conv::template process_tile<0, 0, 5, 1, 1, 2>, 
+ Conv::template process_tile<0, 0, 5, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 5, 1, 2, 0>, + Conv::template process_tile<0, 0, 5, 1, 2, 1>, + Conv::template process_tile<0, 0, 5, 1, 2, 2>, + Conv::template process_tile<0, 0, 5, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 5, 1, 3, 0>, + Conv::template process_tile<0, 0, 5, 1, 3, 1>, + Conv::template process_tile<0, 0, 5, 1, 3, 2>, + Conv::template process_tile<0, 0, 5, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 5, 2, 0, 0>, + Conv::template process_tile<0, 0, 5, 2, 0, 1>, + Conv::template process_tile<0, 0, 5, 2, 0, 2>, + Conv::template process_tile<0, 0, 5, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 5, 2, 1, 0>, + Conv::template process_tile<0, 0, 5, 2, 1, 1>, + Conv::template process_tile<0, 0, 5, 2, 1, 2>, + Conv::template process_tile<0, 0, 5, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 5, 2, 2, 0>, + Conv::template process_tile<0, 0, 5, 2, 2, 1>, + Conv::template process_tile<0, 0, 5, 2, 2, 2>, + Conv::template process_tile<0, 0, 5, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 5, 2, 3, 0>, + Conv::template process_tile<0, 0, 5, 2, 3, 1>, + Conv::template process_tile<0, 0, 5, 2, 3, 2>, + Conv::template process_tile<0, 0, 5, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 5, 3, 0, 0>, + Conv::template process_tile<0, 0, 5, 3, 0, 1>, + Conv::template process_tile<0, 0, 5, 3, 0, 2>, + Conv::template process_tile<0, 0, 5, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom 
= 1 + Conv::template process_tile<0, 0, 5, 3, 1, 0>, + Conv::template process_tile<0, 0, 5, 3, 1, 1>, + Conv::template process_tile<0, 0, 5, 3, 1, 2>, + Conv::template process_tile<0, 0, 5, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 5, 3, 2, 0>, + Conv::template process_tile<0, 0, 5, 3, 2, 1>, + Conv::template process_tile<0, 0, 5, 3, 2, 2>, + Conv::template process_tile<0, 0, 5, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 5, 3, 3, 0>, + Conv::template process_tile<0, 0, 5, 3, 3, 1>, + Conv::template process_tile<0, 0, 5, 3, 3, 2>, + Conv::template process_tile<0, 0, 5, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 5, 4, 0, 0>, + Conv::template process_tile<0, 0, 5, 4, 0, 1>, + Conv::template process_tile<0, 0, 5, 4, 0, 2>, + Conv::template process_tile<0, 0, 5, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 5, 4, 1, 0>, + Conv::template process_tile<0, 0, 5, 4, 1, 1>, + Conv::template process_tile<0, 0, 5, 4, 1, 2>, + Conv::template process_tile<0, 0, 5, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 5, 4, 2, 0>, + Conv::template process_tile<0, 0, 5, 4, 2, 1>, + Conv::template process_tile<0, 0, 5, 4, 2, 2>, + Conv::template process_tile<0, 0, 5, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 5, 4, 3, 0>, + Conv::template process_tile<0, 0, 5, 4, 3, 1>, + Conv::template process_tile<0, 0, 5, 4, 3, 2>, + Conv::template process_tile<0, 0, 5, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 5, 5, 0, 0>, + Conv::template process_tile<0, 0, 5, 5, 0, 1>, + 
Conv::template process_tile<0, 0, 5, 5, 0, 2>, + Conv::template process_tile<0, 0, 5, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 5, 5, 1, 0>, + Conv::template process_tile<0, 0, 5, 5, 1, 1>, + Conv::template process_tile<0, 0, 5, 5, 1, 2>, + Conv::template process_tile<0, 0, 5, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 5, 5, 2, 0>, + Conv::template process_tile<0, 0, 5, 5, 2, 1>, + Conv::template process_tile<0, 0, 5, 5, 2, 2>, + Conv::template process_tile<0, 0, 5, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 5, 5, 3, 0>, + Conv::template process_tile<0, 0, 5, 5, 3, 1>, + Conv::template process_tile<0, 0, 5, 5, 3, 2>, + Conv::template process_tile<0, 0, 5, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 5, 6, 0, 0>, + Conv::template process_tile<0, 0, 5, 6, 0, 1>, + Conv::template process_tile<0, 0, 5, 6, 0, 2>, + Conv::template process_tile<0, 0, 5, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 5, 6, 1, 0>, + Conv::template process_tile<0, 0, 5, 6, 1, 1>, + Conv::template process_tile<0, 0, 5, 6, 1, 2>, + Conv::template process_tile<0, 0, 5, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 5, 6, 2, 0>, + Conv::template process_tile<0, 0, 5, 6, 2, 1>, + Conv::template process_tile<0, 0, 5, 6, 2, 2>, + Conv::template process_tile<0, 0, 5, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 5, 6, 3, 0>, + Conv::template process_tile<0, 0, 5, 6, 3, 1>, + Conv::template process_tile<0, 0, 5, 6, 3, 2>, + Conv::template process_tile<0, 0, 5, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, 
// Input pad bottom = 5 + { // Input pad bottom = 6 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 6, 0, 0, 0>, + Conv::template process_tile<0, 0, 6, 0, 0, 1>, + Conv::template process_tile<0, 0, 6, 0, 0, 2>, + Conv::template process_tile<0, 0, 6, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 6, 0, 1, 0>, + Conv::template process_tile<0, 0, 6, 0, 1, 1>, + Conv::template process_tile<0, 0, 6, 0, 1, 2>, + Conv::template process_tile<0, 0, 6, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 6, 0, 2, 0>, + Conv::template process_tile<0, 0, 6, 0, 2, 1>, + Conv::template process_tile<0, 0, 6, 0, 2, 2>, + Conv::template process_tile<0, 0, 6, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 6, 0, 3, 0>, + Conv::template process_tile<0, 0, 6, 0, 3, 1>, + Conv::template process_tile<0, 0, 6, 0, 3, 2>, + Conv::template process_tile<0, 0, 6, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 6, 1, 0, 0>, + Conv::template process_tile<0, 0, 6, 1, 0, 1>, + Conv::template process_tile<0, 0, 6, 1, 0, 2>, + Conv::template process_tile<0, 0, 6, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 6, 1, 1, 0>, + Conv::template process_tile<0, 0, 6, 1, 1, 1>, + Conv::template process_tile<0, 0, 6, 1, 1, 2>, + Conv::template process_tile<0, 0, 6, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 6, 1, 2, 0>, + Conv::template process_tile<0, 0, 6, 1, 2, 1>, + Conv::template process_tile<0, 0, 6, 1, 2, 2>, + Conv::template process_tile<0, 0, 6, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 6, 1, 3, 0>, + 
Conv::template process_tile<0, 0, 6, 1, 3, 1>, + Conv::template process_tile<0, 0, 6, 1, 3, 2>, + Conv::template process_tile<0, 0, 6, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 6, 2, 0, 0>, + Conv::template process_tile<0, 0, 6, 2, 0, 1>, + Conv::template process_tile<0, 0, 6, 2, 0, 2>, + Conv::template process_tile<0, 0, 6, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 6, 2, 1, 0>, + Conv::template process_tile<0, 0, 6, 2, 1, 1>, + Conv::template process_tile<0, 0, 6, 2, 1, 2>, + Conv::template process_tile<0, 0, 6, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 6, 2, 2, 0>, + Conv::template process_tile<0, 0, 6, 2, 2, 1>, + Conv::template process_tile<0, 0, 6, 2, 2, 2>, + Conv::template process_tile<0, 0, 6, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 6, 2, 3, 0>, + Conv::template process_tile<0, 0, 6, 2, 3, 1>, + Conv::template process_tile<0, 0, 6, 2, 3, 2>, + Conv::template process_tile<0, 0, 6, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 6, 3, 0, 0>, + Conv::template process_tile<0, 0, 6, 3, 0, 1>, + Conv::template process_tile<0, 0, 6, 3, 0, 2>, + Conv::template process_tile<0, 0, 6, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 6, 3, 1, 0>, + Conv::template process_tile<0, 0, 6, 3, 1, 1>, + Conv::template process_tile<0, 0, 6, 3, 1, 2>, + Conv::template process_tile<0, 0, 6, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 6, 3, 2, 0>, + Conv::template process_tile<0, 0, 6, 3, 2, 1>, + Conv::template process_tile<0, 0, 6, 3, 2, 2>, + 
Conv::template process_tile<0, 0, 6, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 6, 3, 3, 0>, + Conv::template process_tile<0, 0, 6, 3, 3, 1>, + Conv::template process_tile<0, 0, 6, 3, 3, 2>, + Conv::template process_tile<0, 0, 6, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 6, 4, 0, 0>, + Conv::template process_tile<0, 0, 6, 4, 0, 1>, + Conv::template process_tile<0, 0, 6, 4, 0, 2>, + Conv::template process_tile<0, 0, 6, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 6, 4, 1, 0>, + Conv::template process_tile<0, 0, 6, 4, 1, 1>, + Conv::template process_tile<0, 0, 6, 4, 1, 2>, + Conv::template process_tile<0, 0, 6, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 6, 4, 2, 0>, + Conv::template process_tile<0, 0, 6, 4, 2, 1>, + Conv::template process_tile<0, 0, 6, 4, 2, 2>, + Conv::template process_tile<0, 0, 6, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 6, 4, 3, 0>, + Conv::template process_tile<0, 0, 6, 4, 3, 1>, + Conv::template process_tile<0, 0, 6, 4, 3, 2>, + Conv::template process_tile<0, 0, 6, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 6, 5, 0, 0>, + Conv::template process_tile<0, 0, 6, 5, 0, 1>, + Conv::template process_tile<0, 0, 6, 5, 0, 2>, + Conv::template process_tile<0, 0, 6, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 6, 5, 1, 0>, + Conv::template process_tile<0, 0, 6, 5, 1, 1>, + Conv::template process_tile<0, 0, 6, 5, 1, 2>, + Conv::template process_tile<0, 0, 6, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 
2 + Conv::template process_tile<0, 0, 6, 5, 2, 0>, + Conv::template process_tile<0, 0, 6, 5, 2, 1>, + Conv::template process_tile<0, 0, 6, 5, 2, 2>, + Conv::template process_tile<0, 0, 6, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 6, 5, 3, 0>, + Conv::template process_tile<0, 0, 6, 5, 3, 1>, + Conv::template process_tile<0, 0, 6, 5, 3, 2>, + Conv::template process_tile<0, 0, 6, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 0, 6, 6, 0, 0>, + Conv::template process_tile<0, 0, 6, 6, 0, 1>, + Conv::template process_tile<0, 0, 6, 6, 0, 2>, + Conv::template process_tile<0, 0, 6, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 0, 6, 6, 1, 0>, + Conv::template process_tile<0, 0, 6, 6, 1, 1>, + Conv::template process_tile<0, 0, 6, 6, 1, 2>, + Conv::template process_tile<0, 0, 6, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 0, 6, 6, 2, 0>, + Conv::template process_tile<0, 0, 6, 6, 2, 1>, + Conv::template process_tile<0, 0, 6, 6, 2, 2>, + Conv::template process_tile<0, 0, 6, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 0, 6, 6, 3, 0>, + Conv::template process_tile<0, 0, 6, 6, 3, 1>, + Conv::template process_tile<0, 0, 6, 6, 3, 2>, + Conv::template process_tile<0, 0, 6, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 6 + }, // Input pad left = 0 + { // Input pad left = 1 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 0, 0, 0>, + Conv::template process_tile<0, 1, 0, 0, 0, 1>, + Conv::template process_tile<0, 1, 0, 0, 0, 2>, + Conv::template process_tile<0, 1, 0, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad 
bottom = 1 + Conv::template process_tile<0, 1, 0, 0, 1, 0>, + Conv::template process_tile<0, 1, 0, 0, 1, 1>, + Conv::template process_tile<0, 1, 0, 0, 1, 2>, + Conv::template process_tile<0, 1, 0, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 0, 0, 2, 0>, + Conv::template process_tile<0, 1, 0, 0, 2, 1>, + Conv::template process_tile<0, 1, 0, 0, 2, 2>, + Conv::template process_tile<0, 1, 0, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 0, 0, 3, 0>, + Conv::template process_tile<0, 1, 0, 0, 3, 1>, + Conv::template process_tile<0, 1, 0, 0, 3, 2>, + Conv::template process_tile<0, 1, 0, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 1, 0, 0>, + Conv::template process_tile<0, 1, 0, 1, 0, 1>, + Conv::template process_tile<0, 1, 0, 1, 0, 2>, + Conv::template process_tile<0, 1, 0, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 1, 1, 0>, + Conv::template process_tile<0, 1, 0, 1, 1, 1>, + Conv::template process_tile<0, 1, 0, 1, 1, 2>, + Conv::template process_tile<0, 1, 0, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 0, 1, 2, 0>, + Conv::template process_tile<0, 1, 0, 1, 2, 1>, + Conv::template process_tile<0, 1, 0, 1, 2, 2>, + Conv::template process_tile<0, 1, 0, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 0, 1, 3, 0>, + Conv::template process_tile<0, 1, 0, 1, 3, 1>, + Conv::template process_tile<0, 1, 0, 1, 3, 2>, + Conv::template process_tile<0, 1, 0, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 2, 0, 0>, + Conv::template process_tile<0, 1, 0, 2, 0, 
1>, + Conv::template process_tile<0, 1, 0, 2, 0, 2>, + Conv::template process_tile<0, 1, 0, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 2, 1, 0>, + Conv::template process_tile<0, 1, 0, 2, 1, 1>, + Conv::template process_tile<0, 1, 0, 2, 1, 2>, + Conv::template process_tile<0, 1, 0, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 0, 2, 2, 0>, + Conv::template process_tile<0, 1, 0, 2, 2, 1>, + Conv::template process_tile<0, 1, 0, 2, 2, 2>, + Conv::template process_tile<0, 1, 0, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 0, 2, 3, 0>, + Conv::template process_tile<0, 1, 0, 2, 3, 1>, + Conv::template process_tile<0, 1, 0, 2, 3, 2>, + Conv::template process_tile<0, 1, 0, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 3, 0, 0>, + Conv::template process_tile<0, 1, 0, 3, 0, 1>, + Conv::template process_tile<0, 1, 0, 3, 0, 2>, + Conv::template process_tile<0, 1, 0, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 3, 1, 0>, + Conv::template process_tile<0, 1, 0, 3, 1, 1>, + Conv::template process_tile<0, 1, 0, 3, 1, 2>, + Conv::template process_tile<0, 1, 0, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 0, 3, 2, 0>, + Conv::template process_tile<0, 1, 0, 3, 2, 1>, + Conv::template process_tile<0, 1, 0, 3, 2, 2>, + Conv::template process_tile<0, 1, 0, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 0, 3, 3, 0>, + Conv::template process_tile<0, 1, 0, 3, 3, 1>, + Conv::template process_tile<0, 1, 0, 3, 3, 2>, + Conv::template process_tile<0, 1, 0, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 
+ { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 4, 0, 0>, + Conv::template process_tile<0, 1, 0, 4, 0, 1>, + Conv::template process_tile<0, 1, 0, 4, 0, 2>, + Conv::template process_tile<0, 1, 0, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 4, 1, 0>, + Conv::template process_tile<0, 1, 0, 4, 1, 1>, + Conv::template process_tile<0, 1, 0, 4, 1, 2>, + Conv::template process_tile<0, 1, 0, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 0, 4, 2, 0>, + Conv::template process_tile<0, 1, 0, 4, 2, 1>, + Conv::template process_tile<0, 1, 0, 4, 2, 2>, + Conv::template process_tile<0, 1, 0, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 0, 4, 3, 0>, + Conv::template process_tile<0, 1, 0, 4, 3, 1>, + Conv::template process_tile<0, 1, 0, 4, 3, 2>, + Conv::template process_tile<0, 1, 0, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 5, 0, 0>, + Conv::template process_tile<0, 1, 0, 5, 0, 1>, + Conv::template process_tile<0, 1, 0, 5, 0, 2>, + Conv::template process_tile<0, 1, 0, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 5, 1, 0>, + Conv::template process_tile<0, 1, 0, 5, 1, 1>, + Conv::template process_tile<0, 1, 0, 5, 1, 2>, + Conv::template process_tile<0, 1, 0, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 0, 5, 2, 0>, + Conv::template process_tile<0, 1, 0, 5, 2, 1>, + Conv::template process_tile<0, 1, 0, 5, 2, 2>, + Conv::template process_tile<0, 1, 0, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 0, 5, 3, 0>, + Conv::template process_tile<0, 1, 0, 5, 3, 1>, + 
Conv::template process_tile<0, 1, 0, 5, 3, 2>, + Conv::template process_tile<0, 1, 0, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 0, 6, 0, 0>, + Conv::template process_tile<0, 1, 0, 6, 0, 1>, + Conv::template process_tile<0, 1, 0, 6, 0, 2>, + Conv::template process_tile<0, 1, 0, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 0, 6, 1, 0>, + Conv::template process_tile<0, 1, 0, 6, 1, 1>, + Conv::template process_tile<0, 1, 0, 6, 1, 2>, + Conv::template process_tile<0, 1, 0, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 0, 6, 2, 0>, + Conv::template process_tile<0, 1, 0, 6, 2, 1>, + Conv::template process_tile<0, 1, 0, 6, 2, 2>, + Conv::template process_tile<0, 1, 0, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 0, 6, 3, 0>, + Conv::template process_tile<0, 1, 0, 6, 3, 1>, + Conv::template process_tile<0, 1, 0, 6, 3, 2>, + Conv::template process_tile<0, 1, 0, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 0, 0, 0>, + Conv::template process_tile<0, 1, 1, 0, 0, 1>, + Conv::template process_tile<0, 1, 1, 0, 0, 2>, + Conv::template process_tile<0, 1, 1, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 0, 1, 0>, + Conv::template process_tile<0, 1, 1, 0, 1, 1>, + Conv::template process_tile<0, 1, 1, 0, 1, 2>, + Conv::template process_tile<0, 1, 1, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 1, 0, 2, 0>, + Conv::template process_tile<0, 1, 1, 0, 2, 1>, + Conv::template process_tile<0, 1, 1, 0, 2, 2>, 
+ Conv::template process_tile<0, 1, 1, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 1, 0, 3, 0>, + Conv::template process_tile<0, 1, 1, 0, 3, 1>, + Conv::template process_tile<0, 1, 1, 0, 3, 2>, + Conv::template process_tile<0, 1, 1, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 1, 0, 0>, + Conv::template process_tile<0, 1, 1, 1, 0, 1>, + Conv::template process_tile<0, 1, 1, 1, 0, 2>, + Conv::template process_tile<0, 1, 1, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 1, 1, 0>, + Conv::template process_tile<0, 1, 1, 1, 1, 1>, + Conv::template process_tile<0, 1, 1, 1, 1, 2>, + Conv::template process_tile<0, 1, 1, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 1, 1, 2, 0>, + Conv::template process_tile<0, 1, 1, 1, 2, 1>, + Conv::template process_tile<0, 1, 1, 1, 2, 2>, + Conv::template process_tile<0, 1, 1, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 1, 1, 3, 0>, + Conv::template process_tile<0, 1, 1, 1, 3, 1>, + Conv::template process_tile<0, 1, 1, 1, 3, 2>, + Conv::template process_tile<0, 1, 1, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 2, 0, 0>, + Conv::template process_tile<0, 1, 1, 2, 0, 1>, + Conv::template process_tile<0, 1, 1, 2, 0, 2>, + Conv::template process_tile<0, 1, 1, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 2, 1, 0>, + Conv::template process_tile<0, 1, 1, 2, 1, 1>, + Conv::template process_tile<0, 1, 1, 2, 1, 2>, + Conv::template process_tile<0, 1, 1, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom 
= 2 + Conv::template process_tile<0, 1, 1, 2, 2, 0>, + Conv::template process_tile<0, 1, 1, 2, 2, 1>, + Conv::template process_tile<0, 1, 1, 2, 2, 2>, + Conv::template process_tile<0, 1, 1, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 1, 2, 3, 0>, + Conv::template process_tile<0, 1, 1, 2, 3, 1>, + Conv::template process_tile<0, 1, 1, 2, 3, 2>, + Conv::template process_tile<0, 1, 1, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 3, 0, 0>, + Conv::template process_tile<0, 1, 1, 3, 0, 1>, + Conv::template process_tile<0, 1, 1, 3, 0, 2>, + Conv::template process_tile<0, 1, 1, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 3, 1, 0>, + Conv::template process_tile<0, 1, 1, 3, 1, 1>, + Conv::template process_tile<0, 1, 1, 3, 1, 2>, + Conv::template process_tile<0, 1, 1, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 1, 3, 2, 0>, + Conv::template process_tile<0, 1, 1, 3, 2, 1>, + Conv::template process_tile<0, 1, 1, 3, 2, 2>, + Conv::template process_tile<0, 1, 1, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 1, 3, 3, 0>, + Conv::template process_tile<0, 1, 1, 3, 3, 1>, + Conv::template process_tile<0, 1, 1, 3, 3, 2>, + Conv::template process_tile<0, 1, 1, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 4, 0, 0>, + Conv::template process_tile<0, 1, 1, 4, 0, 1>, + Conv::template process_tile<0, 1, 1, 4, 0, 2>, + Conv::template process_tile<0, 1, 1, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 4, 1, 0>, + Conv::template process_tile<0, 1, 1, 4, 1, 1>, + 
Conv::template process_tile<0, 1, 1, 4, 1, 2>, + Conv::template process_tile<0, 1, 1, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 1, 4, 2, 0>, + Conv::template process_tile<0, 1, 1, 4, 2, 1>, + Conv::template process_tile<0, 1, 1, 4, 2, 2>, + Conv::template process_tile<0, 1, 1, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 1, 4, 3, 0>, + Conv::template process_tile<0, 1, 1, 4, 3, 1>, + Conv::template process_tile<0, 1, 1, 4, 3, 2>, + Conv::template process_tile<0, 1, 1, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 5, 0, 0>, + Conv::template process_tile<0, 1, 1, 5, 0, 1>, + Conv::template process_tile<0, 1, 1, 5, 0, 2>, + Conv::template process_tile<0, 1, 1, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 5, 1, 0>, + Conv::template process_tile<0, 1, 1, 5, 1, 1>, + Conv::template process_tile<0, 1, 1, 5, 1, 2>, + Conv::template process_tile<0, 1, 1, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 1, 5, 2, 0>, + Conv::template process_tile<0, 1, 1, 5, 2, 1>, + Conv::template process_tile<0, 1, 1, 5, 2, 2>, + Conv::template process_tile<0, 1, 1, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 1, 5, 3, 0>, + Conv::template process_tile<0, 1, 1, 5, 3, 1>, + Conv::template process_tile<0, 1, 1, 5, 3, 2>, + Conv::template process_tile<0, 1, 1, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 1, 6, 0, 0>, + Conv::template process_tile<0, 1, 1, 6, 0, 1>, + Conv::template process_tile<0, 1, 1, 6, 0, 2>, + Conv::template process_tile<0, 1, 1, 6, 0, 3>, + }, // 
Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 1, 6, 1, 0>, + Conv::template process_tile<0, 1, 1, 6, 1, 1>, + Conv::template process_tile<0, 1, 1, 6, 1, 2>, + Conv::template process_tile<0, 1, 1, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 1, 6, 2, 0>, + Conv::template process_tile<0, 1, 1, 6, 2, 1>, + Conv::template process_tile<0, 1, 1, 6, 2, 2>, + Conv::template process_tile<0, 1, 1, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 1, 6, 3, 0>, + Conv::template process_tile<0, 1, 1, 6, 3, 1>, + Conv::template process_tile<0, 1, 1, 6, 3, 2>, + Conv::template process_tile<0, 1, 1, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 0, 0, 0>, + Conv::template process_tile<0, 1, 2, 0, 0, 1>, + Conv::template process_tile<0, 1, 2, 0, 0, 2>, + Conv::template process_tile<0, 1, 2, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 0, 1, 0>, + Conv::template process_tile<0, 1, 2, 0, 1, 1>, + Conv::template process_tile<0, 1, 2, 0, 1, 2>, + Conv::template process_tile<0, 1, 2, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 2, 0, 2, 0>, + Conv::template process_tile<0, 1, 2, 0, 2, 1>, + Conv::template process_tile<0, 1, 2, 0, 2, 2>, + Conv::template process_tile<0, 1, 2, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 2, 0, 3, 0>, + Conv::template process_tile<0, 1, 2, 0, 3, 1>, + Conv::template process_tile<0, 1, 2, 0, 3, 2>, + Conv::template process_tile<0, 1, 2, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom 
= 0 + Conv::template process_tile<0, 1, 2, 1, 0, 0>, + Conv::template process_tile<0, 1, 2, 1, 0, 1>, + Conv::template process_tile<0, 1, 2, 1, 0, 2>, + Conv::template process_tile<0, 1, 2, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 1, 1, 0>, + Conv::template process_tile<0, 1, 2, 1, 1, 1>, + Conv::template process_tile<0, 1, 2, 1, 1, 2>, + Conv::template process_tile<0, 1, 2, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 2, 1, 2, 0>, + Conv::template process_tile<0, 1, 2, 1, 2, 1>, + Conv::template process_tile<0, 1, 2, 1, 2, 2>, + Conv::template process_tile<0, 1, 2, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 2, 1, 3, 0>, + Conv::template process_tile<0, 1, 2, 1, 3, 1>, + Conv::template process_tile<0, 1, 2, 1, 3, 2>, + Conv::template process_tile<0, 1, 2, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 2, 0, 0>, + Conv::template process_tile<0, 1, 2, 2, 0, 1>, + Conv::template process_tile<0, 1, 2, 2, 0, 2>, + Conv::template process_tile<0, 1, 2, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 2, 1, 0>, + Conv::template process_tile<0, 1, 2, 2, 1, 1>, + Conv::template process_tile<0, 1, 2, 2, 1, 2>, + Conv::template process_tile<0, 1, 2, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 2, 2, 2, 0>, + Conv::template process_tile<0, 1, 2, 2, 2, 1>, + Conv::template process_tile<0, 1, 2, 2, 2, 2>, + Conv::template process_tile<0, 1, 2, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 2, 2, 3, 0>, + Conv::template process_tile<0, 1, 2, 2, 3, 1>, + Conv::template process_tile<0, 1, 2, 2, 3, 2>, + 
Conv::template process_tile<0, 1, 2, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 3, 0, 0>, + Conv::template process_tile<0, 1, 2, 3, 0, 1>, + Conv::template process_tile<0, 1, 2, 3, 0, 2>, + Conv::template process_tile<0, 1, 2, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 3, 1, 0>, + Conv::template process_tile<0, 1, 2, 3, 1, 1>, + Conv::template process_tile<0, 1, 2, 3, 1, 2>, + Conv::template process_tile<0, 1, 2, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 2, 3, 2, 0>, + Conv::template process_tile<0, 1, 2, 3, 2, 1>, + Conv::template process_tile<0, 1, 2, 3, 2, 2>, + Conv::template process_tile<0, 1, 2, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 2, 3, 3, 0>, + Conv::template process_tile<0, 1, 2, 3, 3, 1>, + Conv::template process_tile<0, 1, 2, 3, 3, 2>, + Conv::template process_tile<0, 1, 2, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 4, 0, 0>, + Conv::template process_tile<0, 1, 2, 4, 0, 1>, + Conv::template process_tile<0, 1, 2, 4, 0, 2>, + Conv::template process_tile<0, 1, 2, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 4, 1, 0>, + Conv::template process_tile<0, 1, 2, 4, 1, 1>, + Conv::template process_tile<0, 1, 2, 4, 1, 2>, + Conv::template process_tile<0, 1, 2, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 2, 4, 2, 0>, + Conv::template process_tile<0, 1, 2, 4, 2, 1>, + Conv::template process_tile<0, 1, 2, 4, 2, 2>, + Conv::template process_tile<0, 1, 2, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 
3 + Conv::template process_tile<0, 1, 2, 4, 3, 0>, + Conv::template process_tile<0, 1, 2, 4, 3, 1>, + Conv::template process_tile<0, 1, 2, 4, 3, 2>, + Conv::template process_tile<0, 1, 2, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 5, 0, 0>, + Conv::template process_tile<0, 1, 2, 5, 0, 1>, + Conv::template process_tile<0, 1, 2, 5, 0, 2>, + Conv::template process_tile<0, 1, 2, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 5, 1, 0>, + Conv::template process_tile<0, 1, 2, 5, 1, 1>, + Conv::template process_tile<0, 1, 2, 5, 1, 2>, + Conv::template process_tile<0, 1, 2, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 2, 5, 2, 0>, + Conv::template process_tile<0, 1, 2, 5, 2, 1>, + Conv::template process_tile<0, 1, 2, 5, 2, 2>, + Conv::template process_tile<0, 1, 2, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 2, 5, 3, 0>, + Conv::template process_tile<0, 1, 2, 5, 3, 1>, + Conv::template process_tile<0, 1, 2, 5, 3, 2>, + Conv::template process_tile<0, 1, 2, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 2, 6, 0, 0>, + Conv::template process_tile<0, 1, 2, 6, 0, 1>, + Conv::template process_tile<0, 1, 2, 6, 0, 2>, + Conv::template process_tile<0, 1, 2, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 2, 6, 1, 0>, + Conv::template process_tile<0, 1, 2, 6, 1, 1>, + Conv::template process_tile<0, 1, 2, 6, 1, 2>, + Conv::template process_tile<0, 1, 2, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 2, 6, 2, 0>, + Conv::template process_tile<0, 1, 2, 6, 2, 1>, + 
Conv::template process_tile<0, 1, 2, 6, 2, 2>, + Conv::template process_tile<0, 1, 2, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 2, 6, 3, 0>, + Conv::template process_tile<0, 1, 2, 6, 3, 1>, + Conv::template process_tile<0, 1, 2, 6, 3, 2>, + Conv::template process_tile<0, 1, 2, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 0, 0, 0>, + Conv::template process_tile<0, 1, 3, 0, 0, 1>, + Conv::template process_tile<0, 1, 3, 0, 0, 2>, + Conv::template process_tile<0, 1, 3, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 0, 1, 0>, + Conv::template process_tile<0, 1, 3, 0, 1, 1>, + Conv::template process_tile<0, 1, 3, 0, 1, 2>, + Conv::template process_tile<0, 1, 3, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 3, 0, 2, 0>, + Conv::template process_tile<0, 1, 3, 0, 2, 1>, + Conv::template process_tile<0, 1, 3, 0, 2, 2>, + Conv::template process_tile<0, 1, 3, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 3, 0, 3, 0>, + Conv::template process_tile<0, 1, 3, 0, 3, 1>, + Conv::template process_tile<0, 1, 3, 0, 3, 2>, + Conv::template process_tile<0, 1, 3, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 1, 0, 0>, + Conv::template process_tile<0, 1, 3, 1, 0, 1>, + Conv::template process_tile<0, 1, 3, 1, 0, 2>, + Conv::template process_tile<0, 1, 3, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 1, 1, 0>, + Conv::template process_tile<0, 1, 3, 1, 1, 1>, + Conv::template process_tile<0, 1, 3, 1, 1, 2>, 
+ Conv::template process_tile<0, 1, 3, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 3, 1, 2, 0>, + Conv::template process_tile<0, 1, 3, 1, 2, 1>, + Conv::template process_tile<0, 1, 3, 1, 2, 2>, + Conv::template process_tile<0, 1, 3, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 3, 1, 3, 0>, + Conv::template process_tile<0, 1, 3, 1, 3, 1>, + Conv::template process_tile<0, 1, 3, 1, 3, 2>, + Conv::template process_tile<0, 1, 3, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 2, 0, 0>, + Conv::template process_tile<0, 1, 3, 2, 0, 1>, + Conv::template process_tile<0, 1, 3, 2, 0, 2>, + Conv::template process_tile<0, 1, 3, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 2, 1, 0>, + Conv::template process_tile<0, 1, 3, 2, 1, 1>, + Conv::template process_tile<0, 1, 3, 2, 1, 2>, + Conv::template process_tile<0, 1, 3, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 3, 2, 2, 0>, + Conv::template process_tile<0, 1, 3, 2, 2, 1>, + Conv::template process_tile<0, 1, 3, 2, 2, 2>, + Conv::template process_tile<0, 1, 3, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 3, 2, 3, 0>, + Conv::template process_tile<0, 1, 3, 2, 3, 1>, + Conv::template process_tile<0, 1, 3, 2, 3, 2>, + Conv::template process_tile<0, 1, 3, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 3, 0, 0>, + Conv::template process_tile<0, 1, 3, 3, 0, 1>, + Conv::template process_tile<0, 1, 3, 3, 0, 2>, + Conv::template process_tile<0, 1, 3, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom 
= 1 + Conv::template process_tile<0, 1, 3, 3, 1, 0>, + Conv::template process_tile<0, 1, 3, 3, 1, 1>, + Conv::template process_tile<0, 1, 3, 3, 1, 2>, + Conv::template process_tile<0, 1, 3, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 3, 3, 2, 0>, + Conv::template process_tile<0, 1, 3, 3, 2, 1>, + Conv::template process_tile<0, 1, 3, 3, 2, 2>, + Conv::template process_tile<0, 1, 3, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 3, 3, 3, 0>, + Conv::template process_tile<0, 1, 3, 3, 3, 1>, + Conv::template process_tile<0, 1, 3, 3, 3, 2>, + Conv::template process_tile<0, 1, 3, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 4, 0, 0>, + Conv::template process_tile<0, 1, 3, 4, 0, 1>, + Conv::template process_tile<0, 1, 3, 4, 0, 2>, + Conv::template process_tile<0, 1, 3, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 4, 1, 0>, + Conv::template process_tile<0, 1, 3, 4, 1, 1>, + Conv::template process_tile<0, 1, 3, 4, 1, 2>, + Conv::template process_tile<0, 1, 3, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 3, 4, 2, 0>, + Conv::template process_tile<0, 1, 3, 4, 2, 1>, + Conv::template process_tile<0, 1, 3, 4, 2, 2>, + Conv::template process_tile<0, 1, 3, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 3, 4, 3, 0>, + Conv::template process_tile<0, 1, 3, 4, 3, 1>, + Conv::template process_tile<0, 1, 3, 4, 3, 2>, + Conv::template process_tile<0, 1, 3, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 5, 0, 0>, + Conv::template process_tile<0, 1, 3, 5, 0, 1>, + 
Conv::template process_tile<0, 1, 3, 5, 0, 2>, + Conv::template process_tile<0, 1, 3, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 5, 1, 0>, + Conv::template process_tile<0, 1, 3, 5, 1, 1>, + Conv::template process_tile<0, 1, 3, 5, 1, 2>, + Conv::template process_tile<0, 1, 3, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 3, 5, 2, 0>, + Conv::template process_tile<0, 1, 3, 5, 2, 1>, + Conv::template process_tile<0, 1, 3, 5, 2, 2>, + Conv::template process_tile<0, 1, 3, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 3, 5, 3, 0>, + Conv::template process_tile<0, 1, 3, 5, 3, 1>, + Conv::template process_tile<0, 1, 3, 5, 3, 2>, + Conv::template process_tile<0, 1, 3, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 3, 6, 0, 0>, + Conv::template process_tile<0, 1, 3, 6, 0, 1>, + Conv::template process_tile<0, 1, 3, 6, 0, 2>, + Conv::template process_tile<0, 1, 3, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 3, 6, 1, 0>, + Conv::template process_tile<0, 1, 3, 6, 1, 1>, + Conv::template process_tile<0, 1, 3, 6, 1, 2>, + Conv::template process_tile<0, 1, 3, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 3, 6, 2, 0>, + Conv::template process_tile<0, 1, 3, 6, 2, 1>, + Conv::template process_tile<0, 1, 3, 6, 2, 2>, + Conv::template process_tile<0, 1, 3, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 3, 6, 3, 0>, + Conv::template process_tile<0, 1, 3, 6, 3, 1>, + Conv::template process_tile<0, 1, 3, 6, 3, 2>, + Conv::template process_tile<0, 1, 3, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, 
// Input pad bottom = 3 + { // Input pad bottom = 4 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 0, 0, 0>, + Conv::template process_tile<0, 1, 4, 0, 0, 1>, + Conv::template process_tile<0, 1, 4, 0, 0, 2>, + Conv::template process_tile<0, 1, 4, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 0, 1, 0>, + Conv::template process_tile<0, 1, 4, 0, 1, 1>, + Conv::template process_tile<0, 1, 4, 0, 1, 2>, + Conv::template process_tile<0, 1, 4, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 4, 0, 2, 0>, + Conv::template process_tile<0, 1, 4, 0, 2, 1>, + Conv::template process_tile<0, 1, 4, 0, 2, 2>, + Conv::template process_tile<0, 1, 4, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 4, 0, 3, 0>, + Conv::template process_tile<0, 1, 4, 0, 3, 1>, + Conv::template process_tile<0, 1, 4, 0, 3, 2>, + Conv::template process_tile<0, 1, 4, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 1, 0, 0>, + Conv::template process_tile<0, 1, 4, 1, 0, 1>, + Conv::template process_tile<0, 1, 4, 1, 0, 2>, + Conv::template process_tile<0, 1, 4, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 1, 1, 0>, + Conv::template process_tile<0, 1, 4, 1, 1, 1>, + Conv::template process_tile<0, 1, 4, 1, 1, 2>, + Conv::template process_tile<0, 1, 4, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 4, 1, 2, 0>, + Conv::template process_tile<0, 1, 4, 1, 2, 1>, + Conv::template process_tile<0, 1, 4, 1, 2, 2>, + Conv::template process_tile<0, 1, 4, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 4, 1, 3, 0>, + 
Conv::template process_tile<0, 1, 4, 1, 3, 1>, + Conv::template process_tile<0, 1, 4, 1, 3, 2>, + Conv::template process_tile<0, 1, 4, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 2, 0, 0>, + Conv::template process_tile<0, 1, 4, 2, 0, 1>, + Conv::template process_tile<0, 1, 4, 2, 0, 2>, + Conv::template process_tile<0, 1, 4, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 2, 1, 0>, + Conv::template process_tile<0, 1, 4, 2, 1, 1>, + Conv::template process_tile<0, 1, 4, 2, 1, 2>, + Conv::template process_tile<0, 1, 4, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 4, 2, 2, 0>, + Conv::template process_tile<0, 1, 4, 2, 2, 1>, + Conv::template process_tile<0, 1, 4, 2, 2, 2>, + Conv::template process_tile<0, 1, 4, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 4, 2, 3, 0>, + Conv::template process_tile<0, 1, 4, 2, 3, 1>, + Conv::template process_tile<0, 1, 4, 2, 3, 2>, + Conv::template process_tile<0, 1, 4, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 3, 0, 0>, + Conv::template process_tile<0, 1, 4, 3, 0, 1>, + Conv::template process_tile<0, 1, 4, 3, 0, 2>, + Conv::template process_tile<0, 1, 4, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 3, 1, 0>, + Conv::template process_tile<0, 1, 4, 3, 1, 1>, + Conv::template process_tile<0, 1, 4, 3, 1, 2>, + Conv::template process_tile<0, 1, 4, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 4, 3, 2, 0>, + Conv::template process_tile<0, 1, 4, 3, 2, 1>, + Conv::template process_tile<0, 1, 4, 3, 2, 2>, + 
Conv::template process_tile<0, 1, 4, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 4, 3, 3, 0>, + Conv::template process_tile<0, 1, 4, 3, 3, 1>, + Conv::template process_tile<0, 1, 4, 3, 3, 2>, + Conv::template process_tile<0, 1, 4, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 4, 0, 0>, + Conv::template process_tile<0, 1, 4, 4, 0, 1>, + Conv::template process_tile<0, 1, 4, 4, 0, 2>, + Conv::template process_tile<0, 1, 4, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 4, 1, 0>, + Conv::template process_tile<0, 1, 4, 4, 1, 1>, + Conv::template process_tile<0, 1, 4, 4, 1, 2>, + Conv::template process_tile<0, 1, 4, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 4, 4, 2, 0>, + Conv::template process_tile<0, 1, 4, 4, 2, 1>, + Conv::template process_tile<0, 1, 4, 4, 2, 2>, + Conv::template process_tile<0, 1, 4, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 4, 4, 3, 0>, + Conv::template process_tile<0, 1, 4, 4, 3, 1>, + Conv::template process_tile<0, 1, 4, 4, 3, 2>, + Conv::template process_tile<0, 1, 4, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 5, 0, 0>, + Conv::template process_tile<0, 1, 4, 5, 0, 1>, + Conv::template process_tile<0, 1, 4, 5, 0, 2>, + Conv::template process_tile<0, 1, 4, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 5, 1, 0>, + Conv::template process_tile<0, 1, 4, 5, 1, 1>, + Conv::template process_tile<0, 1, 4, 5, 1, 2>, + Conv::template process_tile<0, 1, 4, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 
2 + Conv::template process_tile<0, 1, 4, 5, 2, 0>, + Conv::template process_tile<0, 1, 4, 5, 2, 1>, + Conv::template process_tile<0, 1, 4, 5, 2, 2>, + Conv::template process_tile<0, 1, 4, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 4, 5, 3, 0>, + Conv::template process_tile<0, 1, 4, 5, 3, 1>, + Conv::template process_tile<0, 1, 4, 5, 3, 2>, + Conv::template process_tile<0, 1, 4, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 4, 6, 0, 0>, + Conv::template process_tile<0, 1, 4, 6, 0, 1>, + Conv::template process_tile<0, 1, 4, 6, 0, 2>, + Conv::template process_tile<0, 1, 4, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 4, 6, 1, 0>, + Conv::template process_tile<0, 1, 4, 6, 1, 1>, + Conv::template process_tile<0, 1, 4, 6, 1, 2>, + Conv::template process_tile<0, 1, 4, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 4, 6, 2, 0>, + Conv::template process_tile<0, 1, 4, 6, 2, 1>, + Conv::template process_tile<0, 1, 4, 6, 2, 2>, + Conv::template process_tile<0, 1, 4, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 4, 6, 3, 0>, + Conv::template process_tile<0, 1, 4, 6, 3, 1>, + Conv::template process_tile<0, 1, 4, 6, 3, 2>, + Conv::template process_tile<0, 1, 4, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 4 + { // Input pad bottom = 5 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 5, 0, 0, 0>, + Conv::template process_tile<0, 1, 5, 0, 0, 1>, + Conv::template process_tile<0, 1, 5, 0, 0, 2>, + Conv::template process_tile<0, 1, 5, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 5, 0, 1, 
0>, + Conv::template process_tile<0, 1, 5, 0, 1, 1>, + Conv::template process_tile<0, 1, 5, 0, 1, 2>, + Conv::template process_tile<0, 1, 5, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 5, 0, 2, 0>, + Conv::template process_tile<0, 1, 5, 0, 2, 1>, + Conv::template process_tile<0, 1, 5, 0, 2, 2>, + Conv::template process_tile<0, 1, 5, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 5, 0, 3, 0>, + Conv::template process_tile<0, 1, 5, 0, 3, 1>, + Conv::template process_tile<0, 1, 5, 0, 3, 2>, + Conv::template process_tile<0, 1, 5, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 5, 1, 0, 0>, + Conv::template process_tile<0, 1, 5, 1, 0, 1>, + Conv::template process_tile<0, 1, 5, 1, 0, 2>, + Conv::template process_tile<0, 1, 5, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 5, 1, 1, 0>, + Conv::template process_tile<0, 1, 5, 1, 1, 1>, + Conv::template process_tile<0, 1, 5, 1, 1, 2>, + Conv::template process_tile<0, 1, 5, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 5, 1, 2, 0>, + Conv::template process_tile<0, 1, 5, 1, 2, 1>, + Conv::template process_tile<0, 1, 5, 1, 2, 2>, + Conv::template process_tile<0, 1, 5, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 5, 1, 3, 0>, + Conv::template process_tile<0, 1, 5, 1, 3, 1>, + Conv::template process_tile<0, 1, 5, 1, 3, 2>, + Conv::template process_tile<0, 1, 5, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 5, 2, 0, 0>, + Conv::template process_tile<0, 1, 5, 2, 0, 1>, + Conv::template process_tile<0, 1, 5, 2, 0, 2>, + 
Conv::template process_tile<0, 1, 5, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 5, 2, 1, 0>, + Conv::template process_tile<0, 1, 5, 2, 1, 1>, + Conv::template process_tile<0, 1, 5, 2, 1, 2>, + Conv::template process_tile<0, 1, 5, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 5, 2, 2, 0>, + Conv::template process_tile<0, 1, 5, 2, 2, 1>, + Conv::template process_tile<0, 1, 5, 2, 2, 2>, + Conv::template process_tile<0, 1, 5, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 5, 2, 3, 0>, + Conv::template process_tile<0, 1, 5, 2, 3, 1>, + Conv::template process_tile<0, 1, 5, 2, 3, 2>, + Conv::template process_tile<0, 1, 5, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 5, 3, 0, 0>, + Conv::template process_tile<0, 1, 5, 3, 0, 1>, + Conv::template process_tile<0, 1, 5, 3, 0, 2>, + Conv::template process_tile<0, 1, 5, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 5, 3, 1, 0>, + Conv::template process_tile<0, 1, 5, 3, 1, 1>, + Conv::template process_tile<0, 1, 5, 3, 1, 2>, + Conv::template process_tile<0, 1, 5, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 5, 3, 2, 0>, + Conv::template process_tile<0, 1, 5, 3, 2, 1>, + Conv::template process_tile<0, 1, 5, 3, 2, 2>, + Conv::template process_tile<0, 1, 5, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 5, 3, 3, 0>, + Conv::template process_tile<0, 1, 5, 3, 3, 1>, + Conv::template process_tile<0, 1, 5, 3, 3, 2>, + Conv::template process_tile<0, 1, 5, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 
0 + Conv::template process_tile<0, 1, 5, 4, 0, 0>, + Conv::template process_tile<0, 1, 5, 4, 0, 1>, + Conv::template process_tile<0, 1, 5, 4, 0, 2>, + Conv::template process_tile<0, 1, 5, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 5, 4, 1, 0>, + Conv::template process_tile<0, 1, 5, 4, 1, 1>, + Conv::template process_tile<0, 1, 5, 4, 1, 2>, + Conv::template process_tile<0, 1, 5, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 5, 4, 2, 0>, + Conv::template process_tile<0, 1, 5, 4, 2, 1>, + Conv::template process_tile<0, 1, 5, 4, 2, 2>, + Conv::template process_tile<0, 1, 5, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 5, 4, 3, 0>, + Conv::template process_tile<0, 1, 5, 4, 3, 1>, + Conv::template process_tile<0, 1, 5, 4, 3, 2>, + Conv::template process_tile<0, 1, 5, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 5, 5, 0, 0>, + Conv::template process_tile<0, 1, 5, 5, 0, 1>, + Conv::template process_tile<0, 1, 5, 5, 0, 2>, + Conv::template process_tile<0, 1, 5, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 5, 5, 1, 0>, + Conv::template process_tile<0, 1, 5, 5, 1, 1>, + Conv::template process_tile<0, 1, 5, 5, 1, 2>, + Conv::template process_tile<0, 1, 5, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 5, 5, 2, 0>, + Conv::template process_tile<0, 1, 5, 5, 2, 1>, + Conv::template process_tile<0, 1, 5, 5, 2, 2>, + Conv::template process_tile<0, 1, 5, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 5, 5, 3, 0>, + Conv::template process_tile<0, 1, 5, 5, 3, 1>, + Conv::template process_tile<0, 1, 5, 5, 3, 2>, + 
Conv::template process_tile<0, 1, 5, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 5, 6, 0, 0>, + Conv::template process_tile<0, 1, 5, 6, 0, 1>, + Conv::template process_tile<0, 1, 5, 6, 0, 2>, + Conv::template process_tile<0, 1, 5, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 5, 6, 1, 0>, + Conv::template process_tile<0, 1, 5, 6, 1, 1>, + Conv::template process_tile<0, 1, 5, 6, 1, 2>, + Conv::template process_tile<0, 1, 5, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 5, 6, 2, 0>, + Conv::template process_tile<0, 1, 5, 6, 2, 1>, + Conv::template process_tile<0, 1, 5, 6, 2, 2>, + Conv::template process_tile<0, 1, 5, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 5, 6, 3, 0>, + Conv::template process_tile<0, 1, 5, 6, 3, 1>, + Conv::template process_tile<0, 1, 5, 6, 3, 2>, + Conv::template process_tile<0, 1, 5, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 5 + { // Input pad bottom = 6 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 6, 0, 0, 0>, + Conv::template process_tile<0, 1, 6, 0, 0, 1>, + Conv::template process_tile<0, 1, 6, 0, 0, 2>, + Conv::template process_tile<0, 1, 6, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 6, 0, 1, 0>, + Conv::template process_tile<0, 1, 6, 0, 1, 1>, + Conv::template process_tile<0, 1, 6, 0, 1, 2>, + Conv::template process_tile<0, 1, 6, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 6, 0, 2, 0>, + Conv::template process_tile<0, 1, 6, 0, 2, 1>, + Conv::template process_tile<0, 1, 6, 0, 2, 2>, + Conv::template process_tile<0, 1, 6, 0, 2, 3>, 
+ }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 6, 0, 3, 0>, + Conv::template process_tile<0, 1, 6, 0, 3, 1>, + Conv::template process_tile<0, 1, 6, 0, 3, 2>, + Conv::template process_tile<0, 1, 6, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 6, 1, 0, 0>, + Conv::template process_tile<0, 1, 6, 1, 0, 1>, + Conv::template process_tile<0, 1, 6, 1, 0, 2>, + Conv::template process_tile<0, 1, 6, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 6, 1, 1, 0>, + Conv::template process_tile<0, 1, 6, 1, 1, 1>, + Conv::template process_tile<0, 1, 6, 1, 1, 2>, + Conv::template process_tile<0, 1, 6, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 6, 1, 2, 0>, + Conv::template process_tile<0, 1, 6, 1, 2, 1>, + Conv::template process_tile<0, 1, 6, 1, 2, 2>, + Conv::template process_tile<0, 1, 6, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 6, 1, 3, 0>, + Conv::template process_tile<0, 1, 6, 1, 3, 1>, + Conv::template process_tile<0, 1, 6, 1, 3, 2>, + Conv::template process_tile<0, 1, 6, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 6, 2, 0, 0>, + Conv::template process_tile<0, 1, 6, 2, 0, 1>, + Conv::template process_tile<0, 1, 6, 2, 0, 2>, + Conv::template process_tile<0, 1, 6, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 6, 2, 1, 0>, + Conv::template process_tile<0, 1, 6, 2, 1, 1>, + Conv::template process_tile<0, 1, 6, 2, 1, 2>, + Conv::template process_tile<0, 1, 6, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 6, 2, 2, 
0>, + Conv::template process_tile<0, 1, 6, 2, 2, 1>, + Conv::template process_tile<0, 1, 6, 2, 2, 2>, + Conv::template process_tile<0, 1, 6, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 6, 2, 3, 0>, + Conv::template process_tile<0, 1, 6, 2, 3, 1>, + Conv::template process_tile<0, 1, 6, 2, 3, 2>, + Conv::template process_tile<0, 1, 6, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 6, 3, 0, 0>, + Conv::template process_tile<0, 1, 6, 3, 0, 1>, + Conv::template process_tile<0, 1, 6, 3, 0, 2>, + Conv::template process_tile<0, 1, 6, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 6, 3, 1, 0>, + Conv::template process_tile<0, 1, 6, 3, 1, 1>, + Conv::template process_tile<0, 1, 6, 3, 1, 2>, + Conv::template process_tile<0, 1, 6, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 6, 3, 2, 0>, + Conv::template process_tile<0, 1, 6, 3, 2, 1>, + Conv::template process_tile<0, 1, 6, 3, 2, 2>, + Conv::template process_tile<0, 1, 6, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 6, 3, 3, 0>, + Conv::template process_tile<0, 1, 6, 3, 3, 1>, + Conv::template process_tile<0, 1, 6, 3, 3, 2>, + Conv::template process_tile<0, 1, 6, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 6, 4, 0, 0>, + Conv::template process_tile<0, 1, 6, 4, 0, 1>, + Conv::template process_tile<0, 1, 6, 4, 0, 2>, + Conv::template process_tile<0, 1, 6, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 6, 4, 1, 0>, + Conv::template process_tile<0, 1, 6, 4, 1, 1>, + Conv::template process_tile<0, 1, 6, 4, 1, 2>, + 
Conv::template process_tile<0, 1, 6, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 6, 4, 2, 0>, + Conv::template process_tile<0, 1, 6, 4, 2, 1>, + Conv::template process_tile<0, 1, 6, 4, 2, 2>, + Conv::template process_tile<0, 1, 6, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 6, 4, 3, 0>, + Conv::template process_tile<0, 1, 6, 4, 3, 1>, + Conv::template process_tile<0, 1, 6, 4, 3, 2>, + Conv::template process_tile<0, 1, 6, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 6, 5, 0, 0>, + Conv::template process_tile<0, 1, 6, 5, 0, 1>, + Conv::template process_tile<0, 1, 6, 5, 0, 2>, + Conv::template process_tile<0, 1, 6, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<0, 1, 6, 5, 1, 0>, + Conv::template process_tile<0, 1, 6, 5, 1, 1>, + Conv::template process_tile<0, 1, 6, 5, 1, 2>, + Conv::template process_tile<0, 1, 6, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 6, 5, 2, 0>, + Conv::template process_tile<0, 1, 6, 5, 2, 1>, + Conv::template process_tile<0, 1, 6, 5, 2, 2>, + Conv::template process_tile<0, 1, 6, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 6, 5, 3, 0>, + Conv::template process_tile<0, 1, 6, 5, 3, 1>, + Conv::template process_tile<0, 1, 6, 5, 3, 2>, + Conv::template process_tile<0, 1, 6, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<0, 1, 6, 6, 0, 0>, + Conv::template process_tile<0, 1, 6, 6, 0, 1>, + Conv::template process_tile<0, 1, 6, 6, 0, 2>, + Conv::template process_tile<0, 1, 6, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 
1 + Conv::template process_tile<0, 1, 6, 6, 1, 0>, + Conv::template process_tile<0, 1, 6, 6, 1, 1>, + Conv::template process_tile<0, 1, 6, 6, 1, 2>, + Conv::template process_tile<0, 1, 6, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<0, 1, 6, 6, 2, 0>, + Conv::template process_tile<0, 1, 6, 6, 2, 1>, + Conv::template process_tile<0, 1, 6, 6, 2, 2>, + Conv::template process_tile<0, 1, 6, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<0, 1, 6, 6, 3, 0>, + Conv::template process_tile<0, 1, 6, 6, 3, 1>, + Conv::template process_tile<0, 1, 6, 6, 3, 2>, + Conv::template process_tile<0, 1, 6, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 6 + }, // Input pad left = 1 + }, // Input pad top = 0 + { // Input pad top = 1 + { // Input pad left = 0 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 0, 0, 0>, + Conv::template process_tile<1, 0, 0, 0, 0, 1>, + Conv::template process_tile<1, 0, 0, 0, 0, 2>, + Conv::template process_tile<1, 0, 0, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 0, 1, 0>, + Conv::template process_tile<1, 0, 0, 0, 1, 1>, + Conv::template process_tile<1, 0, 0, 0, 1, 2>, + Conv::template process_tile<1, 0, 0, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 0, 0, 2, 0>, + Conv::template process_tile<1, 0, 0, 0, 2, 1>, + Conv::template process_tile<1, 0, 0, 0, 2, 2>, + Conv::template process_tile<1, 0, 0, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 0, 0, 3, 0>, + Conv::template process_tile<1, 0, 0, 0, 3, 1>, + Conv::template process_tile<1, 0, 0, 0, 3, 2>, + Conv::template process_tile<1, 0, 0, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 
0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 1, 0, 0>, + Conv::template process_tile<1, 0, 0, 1, 0, 1>, + Conv::template process_tile<1, 0, 0, 1, 0, 2>, + Conv::template process_tile<1, 0, 0, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 1, 1, 0>, + Conv::template process_tile<1, 0, 0, 1, 1, 1>, + Conv::template process_tile<1, 0, 0, 1, 1, 2>, + Conv::template process_tile<1, 0, 0, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 0, 1, 2, 0>, + Conv::template process_tile<1, 0, 0, 1, 2, 1>, + Conv::template process_tile<1, 0, 0, 1, 2, 2>, + Conv::template process_tile<1, 0, 0, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 0, 1, 3, 0>, + Conv::template process_tile<1, 0, 0, 1, 3, 1>, + Conv::template process_tile<1, 0, 0, 1, 3, 2>, + Conv::template process_tile<1, 0, 0, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 2, 0, 0>, + Conv::template process_tile<1, 0, 0, 2, 0, 1>, + Conv::template process_tile<1, 0, 0, 2, 0, 2>, + Conv::template process_tile<1, 0, 0, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 2, 1, 0>, + Conv::template process_tile<1, 0, 0, 2, 1, 1>, + Conv::template process_tile<1, 0, 0, 2, 1, 2>, + Conv::template process_tile<1, 0, 0, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 0, 2, 2, 0>, + Conv::template process_tile<1, 0, 0, 2, 2, 1>, + Conv::template process_tile<1, 0, 0, 2, 2, 2>, + Conv::template process_tile<1, 0, 0, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 0, 2, 3, 0>, + Conv::template process_tile<1, 0, 0, 2, 3, 1>, + 
Conv::template process_tile<1, 0, 0, 2, 3, 2>, + Conv::template process_tile<1, 0, 0, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 3, 0, 0>, + Conv::template process_tile<1, 0, 0, 3, 0, 1>, + Conv::template process_tile<1, 0, 0, 3, 0, 2>, + Conv::template process_tile<1, 0, 0, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 3, 1, 0>, + Conv::template process_tile<1, 0, 0, 3, 1, 1>, + Conv::template process_tile<1, 0, 0, 3, 1, 2>, + Conv::template process_tile<1, 0, 0, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 0, 3, 2, 0>, + Conv::template process_tile<1, 0, 0, 3, 2, 1>, + Conv::template process_tile<1, 0, 0, 3, 2, 2>, + Conv::template process_tile<1, 0, 0, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 0, 3, 3, 0>, + Conv::template process_tile<1, 0, 0, 3, 3, 1>, + Conv::template process_tile<1, 0, 0, 3, 3, 2>, + Conv::template process_tile<1, 0, 0, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 4, 0, 0>, + Conv::template process_tile<1, 0, 0, 4, 0, 1>, + Conv::template process_tile<1, 0, 0, 4, 0, 2>, + Conv::template process_tile<1, 0, 0, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 4, 1, 0>, + Conv::template process_tile<1, 0, 0, 4, 1, 1>, + Conv::template process_tile<1, 0, 0, 4, 1, 2>, + Conv::template process_tile<1, 0, 0, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 0, 4, 2, 0>, + Conv::template process_tile<1, 0, 0, 4, 2, 1>, + Conv::template process_tile<1, 0, 0, 4, 2, 2>, + Conv::template process_tile<1, 0, 0, 4, 2, 3>, + }, // 
Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 0, 4, 3, 0>, + Conv::template process_tile<1, 0, 0, 4, 3, 1>, + Conv::template process_tile<1, 0, 0, 4, 3, 2>, + Conv::template process_tile<1, 0, 0, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 5, 0, 0>, + Conv::template process_tile<1, 0, 0, 5, 0, 1>, + Conv::template process_tile<1, 0, 0, 5, 0, 2>, + Conv::template process_tile<1, 0, 0, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 5, 1, 0>, + Conv::template process_tile<1, 0, 0, 5, 1, 1>, + Conv::template process_tile<1, 0, 0, 5, 1, 2>, + Conv::template process_tile<1, 0, 0, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 0, 5, 2, 0>, + Conv::template process_tile<1, 0, 0, 5, 2, 1>, + Conv::template process_tile<1, 0, 0, 5, 2, 2>, + Conv::template process_tile<1, 0, 0, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 0, 5, 3, 0>, + Conv::template process_tile<1, 0, 0, 5, 3, 1>, + Conv::template process_tile<1, 0, 0, 5, 3, 2>, + Conv::template process_tile<1, 0, 0, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 0, 6, 0, 0>, + Conv::template process_tile<1, 0, 0, 6, 0, 1>, + Conv::template process_tile<1, 0, 0, 6, 0, 2>, + Conv::template process_tile<1, 0, 0, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 0, 6, 1, 0>, + Conv::template process_tile<1, 0, 0, 6, 1, 1>, + Conv::template process_tile<1, 0, 0, 6, 1, 2>, + Conv::template process_tile<1, 0, 0, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 0, 6, 2, 0>, + 
Conv::template process_tile<1, 0, 0, 6, 2, 1>, + Conv::template process_tile<1, 0, 0, 6, 2, 2>, + Conv::template process_tile<1, 0, 0, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 0, 6, 3, 0>, + Conv::template process_tile<1, 0, 0, 6, 3, 1>, + Conv::template process_tile<1, 0, 0, 6, 3, 2>, + Conv::template process_tile<1, 0, 0, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 0, 0, 0>, + Conv::template process_tile<1, 0, 1, 0, 0, 1>, + Conv::template process_tile<1, 0, 1, 0, 0, 2>, + Conv::template process_tile<1, 0, 1, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 0, 1, 0>, + Conv::template process_tile<1, 0, 1, 0, 1, 1>, + Conv::template process_tile<1, 0, 1, 0, 1, 2>, + Conv::template process_tile<1, 0, 1, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 1, 0, 2, 0>, + Conv::template process_tile<1, 0, 1, 0, 2, 1>, + Conv::template process_tile<1, 0, 1, 0, 2, 2>, + Conv::template process_tile<1, 0, 1, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 1, 0, 3, 0>, + Conv::template process_tile<1, 0, 1, 0, 3, 1>, + Conv::template process_tile<1, 0, 1, 0, 3, 2>, + Conv::template process_tile<1, 0, 1, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 1, 0, 0>, + Conv::template process_tile<1, 0, 1, 1, 0, 1>, + Conv::template process_tile<1, 0, 1, 1, 0, 2>, + Conv::template process_tile<1, 0, 1, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 1, 1, 0>, + Conv::template process_tile<1, 0, 1, 1, 1, 1>, 
+ Conv::template process_tile<1, 0, 1, 1, 1, 2>, + Conv::template process_tile<1, 0, 1, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 1, 1, 2, 0>, + Conv::template process_tile<1, 0, 1, 1, 2, 1>, + Conv::template process_tile<1, 0, 1, 1, 2, 2>, + Conv::template process_tile<1, 0, 1, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 1, 1, 3, 0>, + Conv::template process_tile<1, 0, 1, 1, 3, 1>, + Conv::template process_tile<1, 0, 1, 1, 3, 2>, + Conv::template process_tile<1, 0, 1, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 2, 0, 0>, + Conv::template process_tile<1, 0, 1, 2, 0, 1>, + Conv::template process_tile<1, 0, 1, 2, 0, 2>, + Conv::template process_tile<1, 0, 1, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 2, 1, 0>, + Conv::template process_tile<1, 0, 1, 2, 1, 1>, + Conv::template process_tile<1, 0, 1, 2, 1, 2>, + Conv::template process_tile<1, 0, 1, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 1, 2, 2, 0>, + Conv::template process_tile<1, 0, 1, 2, 2, 1>, + Conv::template process_tile<1, 0, 1, 2, 2, 2>, + Conv::template process_tile<1, 0, 1, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 1, 2, 3, 0>, + Conv::template process_tile<1, 0, 1, 2, 3, 1>, + Conv::template process_tile<1, 0, 1, 2, 3, 2>, + Conv::template process_tile<1, 0, 1, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 3, 0, 0>, + Conv::template process_tile<1, 0, 1, 3, 0, 1>, + Conv::template process_tile<1, 0, 1, 3, 0, 2>, + Conv::template process_tile<1, 0, 1, 3, 0, 3>, + }, 
// Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 3, 1, 0>, + Conv::template process_tile<1, 0, 1, 3, 1, 1>, + Conv::template process_tile<1, 0, 1, 3, 1, 2>, + Conv::template process_tile<1, 0, 1, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 1, 3, 2, 0>, + Conv::template process_tile<1, 0, 1, 3, 2, 1>, + Conv::template process_tile<1, 0, 1, 3, 2, 2>, + Conv::template process_tile<1, 0, 1, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 1, 3, 3, 0>, + Conv::template process_tile<1, 0, 1, 3, 3, 1>, + Conv::template process_tile<1, 0, 1, 3, 3, 2>, + Conv::template process_tile<1, 0, 1, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 4, 0, 0>, + Conv::template process_tile<1, 0, 1, 4, 0, 1>, + Conv::template process_tile<1, 0, 1, 4, 0, 2>, + Conv::template process_tile<1, 0, 1, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 4, 1, 0>, + Conv::template process_tile<1, 0, 1, 4, 1, 1>, + Conv::template process_tile<1, 0, 1, 4, 1, 2>, + Conv::template process_tile<1, 0, 1, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 1, 4, 2, 0>, + Conv::template process_tile<1, 0, 1, 4, 2, 1>, + Conv::template process_tile<1, 0, 1, 4, 2, 2>, + Conv::template process_tile<1, 0, 1, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 1, 4, 3, 0>, + Conv::template process_tile<1, 0, 1, 4, 3, 1>, + Conv::template process_tile<1, 0, 1, 4, 3, 2>, + Conv::template process_tile<1, 0, 1, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 5, 0, 0>, + 
Conv::template process_tile<1, 0, 1, 5, 0, 1>, + Conv::template process_tile<1, 0, 1, 5, 0, 2>, + Conv::template process_tile<1, 0, 1, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 5, 1, 0>, + Conv::template process_tile<1, 0, 1, 5, 1, 1>, + Conv::template process_tile<1, 0, 1, 5, 1, 2>, + Conv::template process_tile<1, 0, 1, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 1, 5, 2, 0>, + Conv::template process_tile<1, 0, 1, 5, 2, 1>, + Conv::template process_tile<1, 0, 1, 5, 2, 2>, + Conv::template process_tile<1, 0, 1, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 1, 5, 3, 0>, + Conv::template process_tile<1, 0, 1, 5, 3, 1>, + Conv::template process_tile<1, 0, 1, 5, 3, 2>, + Conv::template process_tile<1, 0, 1, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 1, 6, 0, 0>, + Conv::template process_tile<1, 0, 1, 6, 0, 1>, + Conv::template process_tile<1, 0, 1, 6, 0, 2>, + Conv::template process_tile<1, 0, 1, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 1, 6, 1, 0>, + Conv::template process_tile<1, 0, 1, 6, 1, 1>, + Conv::template process_tile<1, 0, 1, 6, 1, 2>, + Conv::template process_tile<1, 0, 1, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 1, 6, 2, 0>, + Conv::template process_tile<1, 0, 1, 6, 2, 1>, + Conv::template process_tile<1, 0, 1, 6, 2, 2>, + Conv::template process_tile<1, 0, 1, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 1, 6, 3, 0>, + Conv::template process_tile<1, 0, 1, 6, 3, 1>, + Conv::template process_tile<1, 0, 1, 6, 3, 2>, + Conv::template process_tile<1, 0, 1, 6, 3, 3>, + }, // Output 
pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 0, 0, 0>, + Conv::template process_tile<1, 0, 2, 0, 0, 1>, + Conv::template process_tile<1, 0, 2, 0, 0, 2>, + Conv::template process_tile<1, 0, 2, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 0, 1, 0>, + Conv::template process_tile<1, 0, 2, 0, 1, 1>, + Conv::template process_tile<1, 0, 2, 0, 1, 2>, + Conv::template process_tile<1, 0, 2, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 2, 0, 2, 0>, + Conv::template process_tile<1, 0, 2, 0, 2, 1>, + Conv::template process_tile<1, 0, 2, 0, 2, 2>, + Conv::template process_tile<1, 0, 2, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 2, 0, 3, 0>, + Conv::template process_tile<1, 0, 2, 0, 3, 1>, + Conv::template process_tile<1, 0, 2, 0, 3, 2>, + Conv::template process_tile<1, 0, 2, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 1, 0, 0>, + Conv::template process_tile<1, 0, 2, 1, 0, 1>, + Conv::template process_tile<1, 0, 2, 1, 0, 2>, + Conv::template process_tile<1, 0, 2, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 1, 1, 0>, + Conv::template process_tile<1, 0, 2, 1, 1, 1>, + Conv::template process_tile<1, 0, 2, 1, 1, 2>, + Conv::template process_tile<1, 0, 2, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 2, 1, 2, 0>, + Conv::template process_tile<1, 0, 2, 1, 2, 1>, + Conv::template process_tile<1, 0, 2, 1, 2, 2>, + Conv::template process_tile<1, 0, 2, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + 
Conv::template process_tile<1, 0, 2, 1, 3, 0>, + Conv::template process_tile<1, 0, 2, 1, 3, 1>, + Conv::template process_tile<1, 0, 2, 1, 3, 2>, + Conv::template process_tile<1, 0, 2, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 2, 0, 0>, + Conv::template process_tile<1, 0, 2, 2, 0, 1>, + Conv::template process_tile<1, 0, 2, 2, 0, 2>, + Conv::template process_tile<1, 0, 2, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 2, 1, 0>, + Conv::template process_tile<1, 0, 2, 2, 1, 1>, + Conv::template process_tile<1, 0, 2, 2, 1, 2>, + Conv::template process_tile<1, 0, 2, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 2, 2, 2, 0>, + Conv::template process_tile<1, 0, 2, 2, 2, 1>, + Conv::template process_tile<1, 0, 2, 2, 2, 2>, + Conv::template process_tile<1, 0, 2, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 2, 2, 3, 0>, + Conv::template process_tile<1, 0, 2, 2, 3, 1>, + Conv::template process_tile<1, 0, 2, 2, 3, 2>, + Conv::template process_tile<1, 0, 2, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 3, 0, 0>, + Conv::template process_tile<1, 0, 2, 3, 0, 1>, + Conv::template process_tile<1, 0, 2, 3, 0, 2>, + Conv::template process_tile<1, 0, 2, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 3, 1, 0>, + Conv::template process_tile<1, 0, 2, 3, 1, 1>, + Conv::template process_tile<1, 0, 2, 3, 1, 2>, + Conv::template process_tile<1, 0, 2, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 2, 3, 2, 0>, + Conv::template process_tile<1, 0, 2, 3, 2, 1>, + 
Conv::template process_tile<1, 0, 2, 3, 2, 2>, + Conv::template process_tile<1, 0, 2, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 2, 3, 3, 0>, + Conv::template process_tile<1, 0, 2, 3, 3, 1>, + Conv::template process_tile<1, 0, 2, 3, 3, 2>, + Conv::template process_tile<1, 0, 2, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 4, 0, 0>, + Conv::template process_tile<1, 0, 2, 4, 0, 1>, + Conv::template process_tile<1, 0, 2, 4, 0, 2>, + Conv::template process_tile<1, 0, 2, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 4, 1, 0>, + Conv::template process_tile<1, 0, 2, 4, 1, 1>, + Conv::template process_tile<1, 0, 2, 4, 1, 2>, + Conv::template process_tile<1, 0, 2, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 2, 4, 2, 0>, + Conv::template process_tile<1, 0, 2, 4, 2, 1>, + Conv::template process_tile<1, 0, 2, 4, 2, 2>, + Conv::template process_tile<1, 0, 2, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 2, 4, 3, 0>, + Conv::template process_tile<1, 0, 2, 4, 3, 1>, + Conv::template process_tile<1, 0, 2, 4, 3, 2>, + Conv::template process_tile<1, 0, 2, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 5, 0, 0>, + Conv::template process_tile<1, 0, 2, 5, 0, 1>, + Conv::template process_tile<1, 0, 2, 5, 0, 2>, + Conv::template process_tile<1, 0, 2, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 5, 1, 0>, + Conv::template process_tile<1, 0, 2, 5, 1, 1>, + Conv::template process_tile<1, 0, 2, 5, 1, 2>, + Conv::template process_tile<1, 0, 2, 5, 1, 3>, + }, // 
Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 2, 5, 2, 0>, + Conv::template process_tile<1, 0, 2, 5, 2, 1>, + Conv::template process_tile<1, 0, 2, 5, 2, 2>, + Conv::template process_tile<1, 0, 2, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 2, 5, 3, 0>, + Conv::template process_tile<1, 0, 2, 5, 3, 1>, + Conv::template process_tile<1, 0, 2, 5, 3, 2>, + Conv::template process_tile<1, 0, 2, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 2, 6, 0, 0>, + Conv::template process_tile<1, 0, 2, 6, 0, 1>, + Conv::template process_tile<1, 0, 2, 6, 0, 2>, + Conv::template process_tile<1, 0, 2, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 2, 6, 1, 0>, + Conv::template process_tile<1, 0, 2, 6, 1, 1>, + Conv::template process_tile<1, 0, 2, 6, 1, 2>, + Conv::template process_tile<1, 0, 2, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 2, 6, 2, 0>, + Conv::template process_tile<1, 0, 2, 6, 2, 1>, + Conv::template process_tile<1, 0, 2, 6, 2, 2>, + Conv::template process_tile<1, 0, 2, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 2, 6, 3, 0>, + Conv::template process_tile<1, 0, 2, 6, 3, 1>, + Conv::template process_tile<1, 0, 2, 6, 3, 2>, + Conv::template process_tile<1, 0, 2, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 0, 0, 0>, + Conv::template process_tile<1, 0, 3, 0, 0, 1>, + Conv::template process_tile<1, 0, 3, 0, 0, 2>, + Conv::template process_tile<1, 0, 3, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom 
= 1 + Conv::template process_tile<1, 0, 3, 0, 1, 0>, + Conv::template process_tile<1, 0, 3, 0, 1, 1>, + Conv::template process_tile<1, 0, 3, 0, 1, 2>, + Conv::template process_tile<1, 0, 3, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 3, 0, 2, 0>, + Conv::template process_tile<1, 0, 3, 0, 2, 1>, + Conv::template process_tile<1, 0, 3, 0, 2, 2>, + Conv::template process_tile<1, 0, 3, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 3, 0, 3, 0>, + Conv::template process_tile<1, 0, 3, 0, 3, 1>, + Conv::template process_tile<1, 0, 3, 0, 3, 2>, + Conv::template process_tile<1, 0, 3, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 1, 0, 0>, + Conv::template process_tile<1, 0, 3, 1, 0, 1>, + Conv::template process_tile<1, 0, 3, 1, 0, 2>, + Conv::template process_tile<1, 0, 3, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 1, 1, 0>, + Conv::template process_tile<1, 0, 3, 1, 1, 1>, + Conv::template process_tile<1, 0, 3, 1, 1, 2>, + Conv::template process_tile<1, 0, 3, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 3, 1, 2, 0>, + Conv::template process_tile<1, 0, 3, 1, 2, 1>, + Conv::template process_tile<1, 0, 3, 1, 2, 2>, + Conv::template process_tile<1, 0, 3, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 3, 1, 3, 0>, + Conv::template process_tile<1, 0, 3, 1, 3, 1>, + Conv::template process_tile<1, 0, 3, 1, 3, 2>, + Conv::template process_tile<1, 0, 3, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 2, 0, 0>, + Conv::template process_tile<1, 0, 3, 2, 0, 1>, + 
Conv::template process_tile<1, 0, 3, 2, 0, 2>, + Conv::template process_tile<1, 0, 3, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 2, 1, 0>, + Conv::template process_tile<1, 0, 3, 2, 1, 1>, + Conv::template process_tile<1, 0, 3, 2, 1, 2>, + Conv::template process_tile<1, 0, 3, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 3, 2, 2, 0>, + Conv::template process_tile<1, 0, 3, 2, 2, 1>, + Conv::template process_tile<1, 0, 3, 2, 2, 2>, + Conv::template process_tile<1, 0, 3, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 3, 2, 3, 0>, + Conv::template process_tile<1, 0, 3, 2, 3, 1>, + Conv::template process_tile<1, 0, 3, 2, 3, 2>, + Conv::template process_tile<1, 0, 3, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 3, 0, 0>, + Conv::template process_tile<1, 0, 3, 3, 0, 1>, + Conv::template process_tile<1, 0, 3, 3, 0, 2>, + Conv::template process_tile<1, 0, 3, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 3, 1, 0>, + Conv::template process_tile<1, 0, 3, 3, 1, 1>, + Conv::template process_tile<1, 0, 3, 3, 1, 2>, + Conv::template process_tile<1, 0, 3, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 3, 3, 2, 0>, + Conv::template process_tile<1, 0, 3, 3, 2, 1>, + Conv::template process_tile<1, 0, 3, 3, 2, 2>, + Conv::template process_tile<1, 0, 3, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 3, 3, 3, 0>, + Conv::template process_tile<1, 0, 3, 3, 3, 1>, + Conv::template process_tile<1, 0, 3, 3, 3, 2>, + Conv::template process_tile<1, 0, 3, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { 
// Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 4, 0, 0>, + Conv::template process_tile<1, 0, 3, 4, 0, 1>, + Conv::template process_tile<1, 0, 3, 4, 0, 2>, + Conv::template process_tile<1, 0, 3, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 4, 1, 0>, + Conv::template process_tile<1, 0, 3, 4, 1, 1>, + Conv::template process_tile<1, 0, 3, 4, 1, 2>, + Conv::template process_tile<1, 0, 3, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 3, 4, 2, 0>, + Conv::template process_tile<1, 0, 3, 4, 2, 1>, + Conv::template process_tile<1, 0, 3, 4, 2, 2>, + Conv::template process_tile<1, 0, 3, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 3, 4, 3, 0>, + Conv::template process_tile<1, 0, 3, 4, 3, 1>, + Conv::template process_tile<1, 0, 3, 4, 3, 2>, + Conv::template process_tile<1, 0, 3, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 5, 0, 0>, + Conv::template process_tile<1, 0, 3, 5, 0, 1>, + Conv::template process_tile<1, 0, 3, 5, 0, 2>, + Conv::template process_tile<1, 0, 3, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 5, 1, 0>, + Conv::template process_tile<1, 0, 3, 5, 1, 1>, + Conv::template process_tile<1, 0, 3, 5, 1, 2>, + Conv::template process_tile<1, 0, 3, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 3, 5, 2, 0>, + Conv::template process_tile<1, 0, 3, 5, 2, 1>, + Conv::template process_tile<1, 0, 3, 5, 2, 2>, + Conv::template process_tile<1, 0, 3, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 3, 5, 3, 0>, + Conv::template process_tile<1, 0, 3, 5, 3, 1>, + 
Conv::template process_tile<1, 0, 3, 5, 3, 2>, + Conv::template process_tile<1, 0, 3, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 3, 6, 0, 0>, + Conv::template process_tile<1, 0, 3, 6, 0, 1>, + Conv::template process_tile<1, 0, 3, 6, 0, 2>, + Conv::template process_tile<1, 0, 3, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 3, 6, 1, 0>, + Conv::template process_tile<1, 0, 3, 6, 1, 1>, + Conv::template process_tile<1, 0, 3, 6, 1, 2>, + Conv::template process_tile<1, 0, 3, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 3, 6, 2, 0>, + Conv::template process_tile<1, 0, 3, 6, 2, 1>, + Conv::template process_tile<1, 0, 3, 6, 2, 2>, + Conv::template process_tile<1, 0, 3, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 3, 6, 3, 0>, + Conv::template process_tile<1, 0, 3, 6, 3, 1>, + Conv::template process_tile<1, 0, 3, 6, 3, 2>, + Conv::template process_tile<1, 0, 3, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 3 + { // Input pad bottom = 4 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 0, 0, 0>, + Conv::template process_tile<1, 0, 4, 0, 0, 1>, + Conv::template process_tile<1, 0, 4, 0, 0, 2>, + Conv::template process_tile<1, 0, 4, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 0, 1, 0>, + Conv::template process_tile<1, 0, 4, 0, 1, 1>, + Conv::template process_tile<1, 0, 4, 0, 1, 2>, + Conv::template process_tile<1, 0, 4, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 4, 0, 2, 0>, + Conv::template process_tile<1, 0, 4, 0, 2, 1>, + Conv::template process_tile<1, 0, 4, 0, 2, 2>, 
+ Conv::template process_tile<1, 0, 4, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 4, 0, 3, 0>, + Conv::template process_tile<1, 0, 4, 0, 3, 1>, + Conv::template process_tile<1, 0, 4, 0, 3, 2>, + Conv::template process_tile<1, 0, 4, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 1, 0, 0>, + Conv::template process_tile<1, 0, 4, 1, 0, 1>, + Conv::template process_tile<1, 0, 4, 1, 0, 2>, + Conv::template process_tile<1, 0, 4, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 1, 1, 0>, + Conv::template process_tile<1, 0, 4, 1, 1, 1>, + Conv::template process_tile<1, 0, 4, 1, 1, 2>, + Conv::template process_tile<1, 0, 4, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 4, 1, 2, 0>, + Conv::template process_tile<1, 0, 4, 1, 2, 1>, + Conv::template process_tile<1, 0, 4, 1, 2, 2>, + Conv::template process_tile<1, 0, 4, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 4, 1, 3, 0>, + Conv::template process_tile<1, 0, 4, 1, 3, 1>, + Conv::template process_tile<1, 0, 4, 1, 3, 2>, + Conv::template process_tile<1, 0, 4, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 2, 0, 0>, + Conv::template process_tile<1, 0, 4, 2, 0, 1>, + Conv::template process_tile<1, 0, 4, 2, 0, 2>, + Conv::template process_tile<1, 0, 4, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 2, 1, 0>, + Conv::template process_tile<1, 0, 4, 2, 1, 1>, + Conv::template process_tile<1, 0, 4, 2, 1, 2>, + Conv::template process_tile<1, 0, 4, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom 
= 2 + Conv::template process_tile<1, 0, 4, 2, 2, 0>, + Conv::template process_tile<1, 0, 4, 2, 2, 1>, + Conv::template process_tile<1, 0, 4, 2, 2, 2>, + Conv::template process_tile<1, 0, 4, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 4, 2, 3, 0>, + Conv::template process_tile<1, 0, 4, 2, 3, 1>, + Conv::template process_tile<1, 0, 4, 2, 3, 2>, + Conv::template process_tile<1, 0, 4, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 3, 0, 0>, + Conv::template process_tile<1, 0, 4, 3, 0, 1>, + Conv::template process_tile<1, 0, 4, 3, 0, 2>, + Conv::template process_tile<1, 0, 4, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 3, 1, 0>, + Conv::template process_tile<1, 0, 4, 3, 1, 1>, + Conv::template process_tile<1, 0, 4, 3, 1, 2>, + Conv::template process_tile<1, 0, 4, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 4, 3, 2, 0>, + Conv::template process_tile<1, 0, 4, 3, 2, 1>, + Conv::template process_tile<1, 0, 4, 3, 2, 2>, + Conv::template process_tile<1, 0, 4, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 4, 3, 3, 0>, + Conv::template process_tile<1, 0, 4, 3, 3, 1>, + Conv::template process_tile<1, 0, 4, 3, 3, 2>, + Conv::template process_tile<1, 0, 4, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 4, 0, 0>, + Conv::template process_tile<1, 0, 4, 4, 0, 1>, + Conv::template process_tile<1, 0, 4, 4, 0, 2>, + Conv::template process_tile<1, 0, 4, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 4, 1, 0>, + Conv::template process_tile<1, 0, 4, 4, 1, 1>, + 
Conv::template process_tile<1, 0, 4, 4, 1, 2>, + Conv::template process_tile<1, 0, 4, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 4, 4, 2, 0>, + Conv::template process_tile<1, 0, 4, 4, 2, 1>, + Conv::template process_tile<1, 0, 4, 4, 2, 2>, + Conv::template process_tile<1, 0, 4, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 4, 4, 3, 0>, + Conv::template process_tile<1, 0, 4, 4, 3, 1>, + Conv::template process_tile<1, 0, 4, 4, 3, 2>, + Conv::template process_tile<1, 0, 4, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 5, 0, 0>, + Conv::template process_tile<1, 0, 4, 5, 0, 1>, + Conv::template process_tile<1, 0, 4, 5, 0, 2>, + Conv::template process_tile<1, 0, 4, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 5, 1, 0>, + Conv::template process_tile<1, 0, 4, 5, 1, 1>, + Conv::template process_tile<1, 0, 4, 5, 1, 2>, + Conv::template process_tile<1, 0, 4, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 4, 5, 2, 0>, + Conv::template process_tile<1, 0, 4, 5, 2, 1>, + Conv::template process_tile<1, 0, 4, 5, 2, 2>, + Conv::template process_tile<1, 0, 4, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 4, 5, 3, 0>, + Conv::template process_tile<1, 0, 4, 5, 3, 1>, + Conv::template process_tile<1, 0, 4, 5, 3, 2>, + Conv::template process_tile<1, 0, 4, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 4, 6, 0, 0>, + Conv::template process_tile<1, 0, 4, 6, 0, 1>, + Conv::template process_tile<1, 0, 4, 6, 0, 2>, + Conv::template process_tile<1, 0, 4, 6, 0, 3>, + }, // 
Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 4, 6, 1, 0>, + Conv::template process_tile<1, 0, 4, 6, 1, 1>, + Conv::template process_tile<1, 0, 4, 6, 1, 2>, + Conv::template process_tile<1, 0, 4, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 4, 6, 2, 0>, + Conv::template process_tile<1, 0, 4, 6, 2, 1>, + Conv::template process_tile<1, 0, 4, 6, 2, 2>, + Conv::template process_tile<1, 0, 4, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 4, 6, 3, 0>, + Conv::template process_tile<1, 0, 4, 6, 3, 1>, + Conv::template process_tile<1, 0, 4, 6, 3, 2>, + Conv::template process_tile<1, 0, 4, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 4 + { // Input pad bottom = 5 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 5, 0, 0, 0>, + Conv::template process_tile<1, 0, 5, 0, 0, 1>, + Conv::template process_tile<1, 0, 5, 0, 0, 2>, + Conv::template process_tile<1, 0, 5, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 5, 0, 1, 0>, + Conv::template process_tile<1, 0, 5, 0, 1, 1>, + Conv::template process_tile<1, 0, 5, 0, 1, 2>, + Conv::template process_tile<1, 0, 5, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 5, 0, 2, 0>, + Conv::template process_tile<1, 0, 5, 0, 2, 1>, + Conv::template process_tile<1, 0, 5, 0, 2, 2>, + Conv::template process_tile<1, 0, 5, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 5, 0, 3, 0>, + Conv::template process_tile<1, 0, 5, 0, 3, 1>, + Conv::template process_tile<1, 0, 5, 0, 3, 2>, + Conv::template process_tile<1, 0, 5, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom 
= 0 + Conv::template process_tile<1, 0, 5, 1, 0, 0>, + Conv::template process_tile<1, 0, 5, 1, 0, 1>, + Conv::template process_tile<1, 0, 5, 1, 0, 2>, + Conv::template process_tile<1, 0, 5, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 5, 1, 1, 0>, + Conv::template process_tile<1, 0, 5, 1, 1, 1>, + Conv::template process_tile<1, 0, 5, 1, 1, 2>, + Conv::template process_tile<1, 0, 5, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 5, 1, 2, 0>, + Conv::template process_tile<1, 0, 5, 1, 2, 1>, + Conv::template process_tile<1, 0, 5, 1, 2, 2>, + Conv::template process_tile<1, 0, 5, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 5, 1, 3, 0>, + Conv::template process_tile<1, 0, 5, 1, 3, 1>, + Conv::template process_tile<1, 0, 5, 1, 3, 2>, + Conv::template process_tile<1, 0, 5, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 5, 2, 0, 0>, + Conv::template process_tile<1, 0, 5, 2, 0, 1>, + Conv::template process_tile<1, 0, 5, 2, 0, 2>, + Conv::template process_tile<1, 0, 5, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 5, 2, 1, 0>, + Conv::template process_tile<1, 0, 5, 2, 1, 1>, + Conv::template process_tile<1, 0, 5, 2, 1, 2>, + Conv::template process_tile<1, 0, 5, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 5, 2, 2, 0>, + Conv::template process_tile<1, 0, 5, 2, 2, 1>, + Conv::template process_tile<1, 0, 5, 2, 2, 2>, + Conv::template process_tile<1, 0, 5, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 5, 2, 3, 0>, + Conv::template process_tile<1, 0, 5, 2, 3, 1>, + Conv::template process_tile<1, 0, 5, 2, 3, 2>, + 
Conv::template process_tile<1, 0, 5, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 5, 3, 0, 0>, + Conv::template process_tile<1, 0, 5, 3, 0, 1>, + Conv::template process_tile<1, 0, 5, 3, 0, 2>, + Conv::template process_tile<1, 0, 5, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 5, 3, 1, 0>, + Conv::template process_tile<1, 0, 5, 3, 1, 1>, + Conv::template process_tile<1, 0, 5, 3, 1, 2>, + Conv::template process_tile<1, 0, 5, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 5, 3, 2, 0>, + Conv::template process_tile<1, 0, 5, 3, 2, 1>, + Conv::template process_tile<1, 0, 5, 3, 2, 2>, + Conv::template process_tile<1, 0, 5, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 5, 3, 3, 0>, + Conv::template process_tile<1, 0, 5, 3, 3, 1>, + Conv::template process_tile<1, 0, 5, 3, 3, 2>, + Conv::template process_tile<1, 0, 5, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 5, 4, 0, 0>, + Conv::template process_tile<1, 0, 5, 4, 0, 1>, + Conv::template process_tile<1, 0, 5, 4, 0, 2>, + Conv::template process_tile<1, 0, 5, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 5, 4, 1, 0>, + Conv::template process_tile<1, 0, 5, 4, 1, 1>, + Conv::template process_tile<1, 0, 5, 4, 1, 2>, + Conv::template process_tile<1, 0, 5, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 5, 4, 2, 0>, + Conv::template process_tile<1, 0, 5, 4, 2, 1>, + Conv::template process_tile<1, 0, 5, 4, 2, 2>, + Conv::template process_tile<1, 0, 5, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 
3 + Conv::template process_tile<1, 0, 5, 4, 3, 0>, + Conv::template process_tile<1, 0, 5, 4, 3, 1>, + Conv::template process_tile<1, 0, 5, 4, 3, 2>, + Conv::template process_tile<1, 0, 5, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 5, 5, 0, 0>, + Conv::template process_tile<1, 0, 5, 5, 0, 1>, + Conv::template process_tile<1, 0, 5, 5, 0, 2>, + Conv::template process_tile<1, 0, 5, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 5, 5, 1, 0>, + Conv::template process_tile<1, 0, 5, 5, 1, 1>, + Conv::template process_tile<1, 0, 5, 5, 1, 2>, + Conv::template process_tile<1, 0, 5, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 5, 5, 2, 0>, + Conv::template process_tile<1, 0, 5, 5, 2, 1>, + Conv::template process_tile<1, 0, 5, 5, 2, 2>, + Conv::template process_tile<1, 0, 5, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 5, 5, 3, 0>, + Conv::template process_tile<1, 0, 5, 5, 3, 1>, + Conv::template process_tile<1, 0, 5, 5, 3, 2>, + Conv::template process_tile<1, 0, 5, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 5, 6, 0, 0>, + Conv::template process_tile<1, 0, 5, 6, 0, 1>, + Conv::template process_tile<1, 0, 5, 6, 0, 2>, + Conv::template process_tile<1, 0, 5, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 5, 6, 1, 0>, + Conv::template process_tile<1, 0, 5, 6, 1, 1>, + Conv::template process_tile<1, 0, 5, 6, 1, 2>, + Conv::template process_tile<1, 0, 5, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 5, 6, 2, 0>, + Conv::template process_tile<1, 0, 5, 6, 2, 1>, + 
Conv::template process_tile<1, 0, 5, 6, 2, 2>, + Conv::template process_tile<1, 0, 5, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 5, 6, 3, 0>, + Conv::template process_tile<1, 0, 5, 6, 3, 1>, + Conv::template process_tile<1, 0, 5, 6, 3, 2>, + Conv::template process_tile<1, 0, 5, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 5 + { // Input pad bottom = 6 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 6, 0, 0, 0>, + Conv::template process_tile<1, 0, 6, 0, 0, 1>, + Conv::template process_tile<1, 0, 6, 0, 0, 2>, + Conv::template process_tile<1, 0, 6, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 6, 0, 1, 0>, + Conv::template process_tile<1, 0, 6, 0, 1, 1>, + Conv::template process_tile<1, 0, 6, 0, 1, 2>, + Conv::template process_tile<1, 0, 6, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 6, 0, 2, 0>, + Conv::template process_tile<1, 0, 6, 0, 2, 1>, + Conv::template process_tile<1, 0, 6, 0, 2, 2>, + Conv::template process_tile<1, 0, 6, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 6, 0, 3, 0>, + Conv::template process_tile<1, 0, 6, 0, 3, 1>, + Conv::template process_tile<1, 0, 6, 0, 3, 2>, + Conv::template process_tile<1, 0, 6, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 6, 1, 0, 0>, + Conv::template process_tile<1, 0, 6, 1, 0, 1>, + Conv::template process_tile<1, 0, 6, 1, 0, 2>, + Conv::template process_tile<1, 0, 6, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 6, 1, 1, 0>, + Conv::template process_tile<1, 0, 6, 1, 1, 1>, + Conv::template process_tile<1, 0, 6, 1, 1, 2>, 
+ Conv::template process_tile<1, 0, 6, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 6, 1, 2, 0>, + Conv::template process_tile<1, 0, 6, 1, 2, 1>, + Conv::template process_tile<1, 0, 6, 1, 2, 2>, + Conv::template process_tile<1, 0, 6, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 6, 1, 3, 0>, + Conv::template process_tile<1, 0, 6, 1, 3, 1>, + Conv::template process_tile<1, 0, 6, 1, 3, 2>, + Conv::template process_tile<1, 0, 6, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 6, 2, 0, 0>, + Conv::template process_tile<1, 0, 6, 2, 0, 1>, + Conv::template process_tile<1, 0, 6, 2, 0, 2>, + Conv::template process_tile<1, 0, 6, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 6, 2, 1, 0>, + Conv::template process_tile<1, 0, 6, 2, 1, 1>, + Conv::template process_tile<1, 0, 6, 2, 1, 2>, + Conv::template process_tile<1, 0, 6, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 6, 2, 2, 0>, + Conv::template process_tile<1, 0, 6, 2, 2, 1>, + Conv::template process_tile<1, 0, 6, 2, 2, 2>, + Conv::template process_tile<1, 0, 6, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 6, 2, 3, 0>, + Conv::template process_tile<1, 0, 6, 2, 3, 1>, + Conv::template process_tile<1, 0, 6, 2, 3, 2>, + Conv::template process_tile<1, 0, 6, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 6, 3, 0, 0>, + Conv::template process_tile<1, 0, 6, 3, 0, 1>, + Conv::template process_tile<1, 0, 6, 3, 0, 2>, + Conv::template process_tile<1, 0, 6, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom 
= 1 + Conv::template process_tile<1, 0, 6, 3, 1, 0>, + Conv::template process_tile<1, 0, 6, 3, 1, 1>, + Conv::template process_tile<1, 0, 6, 3, 1, 2>, + Conv::template process_tile<1, 0, 6, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 6, 3, 2, 0>, + Conv::template process_tile<1, 0, 6, 3, 2, 1>, + Conv::template process_tile<1, 0, 6, 3, 2, 2>, + Conv::template process_tile<1, 0, 6, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 6, 3, 3, 0>, + Conv::template process_tile<1, 0, 6, 3, 3, 1>, + Conv::template process_tile<1, 0, 6, 3, 3, 2>, + Conv::template process_tile<1, 0, 6, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 6, 4, 0, 0>, + Conv::template process_tile<1, 0, 6, 4, 0, 1>, + Conv::template process_tile<1, 0, 6, 4, 0, 2>, + Conv::template process_tile<1, 0, 6, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 6, 4, 1, 0>, + Conv::template process_tile<1, 0, 6, 4, 1, 1>, + Conv::template process_tile<1, 0, 6, 4, 1, 2>, + Conv::template process_tile<1, 0, 6, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 6, 4, 2, 0>, + Conv::template process_tile<1, 0, 6, 4, 2, 1>, + Conv::template process_tile<1, 0, 6, 4, 2, 2>, + Conv::template process_tile<1, 0, 6, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 6, 4, 3, 0>, + Conv::template process_tile<1, 0, 6, 4, 3, 1>, + Conv::template process_tile<1, 0, 6, 4, 3, 2>, + Conv::template process_tile<1, 0, 6, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 6, 5, 0, 0>, + Conv::template process_tile<1, 0, 6, 5, 0, 1>, + 
Conv::template process_tile<1, 0, 6, 5, 0, 2>, + Conv::template process_tile<1, 0, 6, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 6, 5, 1, 0>, + Conv::template process_tile<1, 0, 6, 5, 1, 1>, + Conv::template process_tile<1, 0, 6, 5, 1, 2>, + Conv::template process_tile<1, 0, 6, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 6, 5, 2, 0>, + Conv::template process_tile<1, 0, 6, 5, 2, 1>, + Conv::template process_tile<1, 0, 6, 5, 2, 2>, + Conv::template process_tile<1, 0, 6, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 6, 5, 3, 0>, + Conv::template process_tile<1, 0, 6, 5, 3, 1>, + Conv::template process_tile<1, 0, 6, 5, 3, 2>, + Conv::template process_tile<1, 0, 6, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 0, 6, 6, 0, 0>, + Conv::template process_tile<1, 0, 6, 6, 0, 1>, + Conv::template process_tile<1, 0, 6, 6, 0, 2>, + Conv::template process_tile<1, 0, 6, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 0, 6, 6, 1, 0>, + Conv::template process_tile<1, 0, 6, 6, 1, 1>, + Conv::template process_tile<1, 0, 6, 6, 1, 2>, + Conv::template process_tile<1, 0, 6, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 0, 6, 6, 2, 0>, + Conv::template process_tile<1, 0, 6, 6, 2, 1>, + Conv::template process_tile<1, 0, 6, 6, 2, 2>, + Conv::template process_tile<1, 0, 6, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 0, 6, 6, 3, 0>, + Conv::template process_tile<1, 0, 6, 6, 3, 1>, + Conv::template process_tile<1, 0, 6, 6, 3, 2>, + Conv::template process_tile<1, 0, 6, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, 
// Input pad bottom = 6 + }, // Input pad left = 0 + { // Input pad left = 1 + { // Input pad bottom = 0 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 0, 0, 0>, + Conv::template process_tile<1, 1, 0, 0, 0, 1>, + Conv::template process_tile<1, 1, 0, 0, 0, 2>, + Conv::template process_tile<1, 1, 0, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 0, 1, 0>, + Conv::template process_tile<1, 1, 0, 0, 1, 1>, + Conv::template process_tile<1, 1, 0, 0, 1, 2>, + Conv::template process_tile<1, 1, 0, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 0, 0, 2, 0>, + Conv::template process_tile<1, 1, 0, 0, 2, 1>, + Conv::template process_tile<1, 1, 0, 0, 2, 2>, + Conv::template process_tile<1, 1, 0, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 0, 0, 3, 0>, + Conv::template process_tile<1, 1, 0, 0, 3, 1>, + Conv::template process_tile<1, 1, 0, 0, 3, 2>, + Conv::template process_tile<1, 1, 0, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 1, 0, 0>, + Conv::template process_tile<1, 1, 0, 1, 0, 1>, + Conv::template process_tile<1, 1, 0, 1, 0, 2>, + Conv::template process_tile<1, 1, 0, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 1, 1, 0>, + Conv::template process_tile<1, 1, 0, 1, 1, 1>, + Conv::template process_tile<1, 1, 0, 1, 1, 2>, + Conv::template process_tile<1, 1, 0, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 0, 1, 2, 0>, + Conv::template process_tile<1, 1, 0, 1, 2, 1>, + Conv::template process_tile<1, 1, 0, 1, 2, 2>, + Conv::template process_tile<1, 1, 0, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 
3 + Conv::template process_tile<1, 1, 0, 1, 3, 0>, + Conv::template process_tile<1, 1, 0, 1, 3, 1>, + Conv::template process_tile<1, 1, 0, 1, 3, 2>, + Conv::template process_tile<1, 1, 0, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 2, 0, 0>, + Conv::template process_tile<1, 1, 0, 2, 0, 1>, + Conv::template process_tile<1, 1, 0, 2, 0, 2>, + Conv::template process_tile<1, 1, 0, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 2, 1, 0>, + Conv::template process_tile<1, 1, 0, 2, 1, 1>, + Conv::template process_tile<1, 1, 0, 2, 1, 2>, + Conv::template process_tile<1, 1, 0, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 0, 2, 2, 0>, + Conv::template process_tile<1, 1, 0, 2, 2, 1>, + Conv::template process_tile<1, 1, 0, 2, 2, 2>, + Conv::template process_tile<1, 1, 0, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 0, 2, 3, 0>, + Conv::template process_tile<1, 1, 0, 2, 3, 1>, + Conv::template process_tile<1, 1, 0, 2, 3, 2>, + Conv::template process_tile<1, 1, 0, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 3, 0, 0>, + Conv::template process_tile<1, 1, 0, 3, 0, 1>, + Conv::template process_tile<1, 1, 0, 3, 0, 2>, + Conv::template process_tile<1, 1, 0, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 3, 1, 0>, + Conv::template process_tile<1, 1, 0, 3, 1, 1>, + Conv::template process_tile<1, 1, 0, 3, 1, 2>, + Conv::template process_tile<1, 1, 0, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 0, 3, 2, 0>, + Conv::template process_tile<1, 1, 0, 3, 2, 1>, + 
Conv::template process_tile<1, 1, 0, 3, 2, 2>, + Conv::template process_tile<1, 1, 0, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 0, 3, 3, 0>, + Conv::template process_tile<1, 1, 0, 3, 3, 1>, + Conv::template process_tile<1, 1, 0, 3, 3, 2>, + Conv::template process_tile<1, 1, 0, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 4, 0, 0>, + Conv::template process_tile<1, 1, 0, 4, 0, 1>, + Conv::template process_tile<1, 1, 0, 4, 0, 2>, + Conv::template process_tile<1, 1, 0, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 4, 1, 0>, + Conv::template process_tile<1, 1, 0, 4, 1, 1>, + Conv::template process_tile<1, 1, 0, 4, 1, 2>, + Conv::template process_tile<1, 1, 0, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 0, 4, 2, 0>, + Conv::template process_tile<1, 1, 0, 4, 2, 1>, + Conv::template process_tile<1, 1, 0, 4, 2, 2>, + Conv::template process_tile<1, 1, 0, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 0, 4, 3, 0>, + Conv::template process_tile<1, 1, 0, 4, 3, 1>, + Conv::template process_tile<1, 1, 0, 4, 3, 2>, + Conv::template process_tile<1, 1, 0, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 5, 0, 0>, + Conv::template process_tile<1, 1, 0, 5, 0, 1>, + Conv::template process_tile<1, 1, 0, 5, 0, 2>, + Conv::template process_tile<1, 1, 0, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 5, 1, 0>, + Conv::template process_tile<1, 1, 0, 5, 1, 1>, + Conv::template process_tile<1, 1, 0, 5, 1, 2>, + Conv::template process_tile<1, 1, 0, 5, 1, 3>, + }, // 
Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 0, 5, 2, 0>, + Conv::template process_tile<1, 1, 0, 5, 2, 1>, + Conv::template process_tile<1, 1, 0, 5, 2, 2>, + Conv::template process_tile<1, 1, 0, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 0, 5, 3, 0>, + Conv::template process_tile<1, 1, 0, 5, 3, 1>, + Conv::template process_tile<1, 1, 0, 5, 3, 2>, + Conv::template process_tile<1, 1, 0, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 0, 6, 0, 0>, + Conv::template process_tile<1, 1, 0, 6, 0, 1>, + Conv::template process_tile<1, 1, 0, 6, 0, 2>, + Conv::template process_tile<1, 1, 0, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 0, 6, 1, 0>, + Conv::template process_tile<1, 1, 0, 6, 1, 1>, + Conv::template process_tile<1, 1, 0, 6, 1, 2>, + Conv::template process_tile<1, 1, 0, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 0, 6, 2, 0>, + Conv::template process_tile<1, 1, 0, 6, 2, 1>, + Conv::template process_tile<1, 1, 0, 6, 2, 2>, + Conv::template process_tile<1, 1, 0, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 0, 6, 3, 0>, + Conv::template process_tile<1, 1, 0, 6, 3, 1>, + Conv::template process_tile<1, 1, 0, 6, 3, 2>, + Conv::template process_tile<1, 1, 0, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 0 + { // Input pad bottom = 1 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 0, 0, 0>, + Conv::template process_tile<1, 1, 1, 0, 0, 1>, + Conv::template process_tile<1, 1, 1, 0, 0, 2>, + Conv::template process_tile<1, 1, 1, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom 
= 1 + Conv::template process_tile<1, 1, 1, 0, 1, 0>, + Conv::template process_tile<1, 1, 1, 0, 1, 1>, + Conv::template process_tile<1, 1, 1, 0, 1, 2>, + Conv::template process_tile<1, 1, 1, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 1, 0, 2, 0>, + Conv::template process_tile<1, 1, 1, 0, 2, 1>, + Conv::template process_tile<1, 1, 1, 0, 2, 2>, + Conv::template process_tile<1, 1, 1, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 1, 0, 3, 0>, + Conv::template process_tile<1, 1, 1, 0, 3, 1>, + Conv::template process_tile<1, 1, 1, 0, 3, 2>, + Conv::template process_tile<1, 1, 1, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 1, 0, 0>, + Conv::template process_tile<1, 1, 1, 1, 0, 1>, + Conv::template process_tile<1, 1, 1, 1, 0, 2>, + Conv::template process_tile<1, 1, 1, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 1, 1, 0>, + Conv::template process_tile<1, 1, 1, 1, 1, 1>, + Conv::template process_tile<1, 1, 1, 1, 1, 2>, + Conv::template process_tile<1, 1, 1, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 1, 1, 2, 0>, + Conv::template process_tile<1, 1, 1, 1, 2, 1>, + Conv::template process_tile<1, 1, 1, 1, 2, 2>, + Conv::template process_tile<1, 1, 1, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 1, 1, 3, 0>, + Conv::template process_tile<1, 1, 1, 1, 3, 1>, + Conv::template process_tile<1, 1, 1, 1, 3, 2>, + Conv::template process_tile<1, 1, 1, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 2, 0, 0>, + Conv::template process_tile<1, 1, 1, 2, 0, 1>, + 
Conv::template process_tile<1, 1, 1, 2, 0, 2>, + Conv::template process_tile<1, 1, 1, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 2, 1, 0>, + Conv::template process_tile<1, 1, 1, 2, 1, 1>, + Conv::template process_tile<1, 1, 1, 2, 1, 2>, + Conv::template process_tile<1, 1, 1, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 1, 2, 2, 0>, + Conv::template process_tile<1, 1, 1, 2, 2, 1>, + Conv::template process_tile<1, 1, 1, 2, 2, 2>, + Conv::template process_tile<1, 1, 1, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 1, 2, 3, 0>, + Conv::template process_tile<1, 1, 1, 2, 3, 1>, + Conv::template process_tile<1, 1, 1, 2, 3, 2>, + Conv::template process_tile<1, 1, 1, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 3, 0, 0>, + Conv::template process_tile<1, 1, 1, 3, 0, 1>, + Conv::template process_tile<1, 1, 1, 3, 0, 2>, + Conv::template process_tile<1, 1, 1, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 3, 1, 0>, + Conv::template process_tile<1, 1, 1, 3, 1, 1>, + Conv::template process_tile<1, 1, 1, 3, 1, 2>, + Conv::template process_tile<1, 1, 1, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 1, 3, 2, 0>, + Conv::template process_tile<1, 1, 1, 3, 2, 1>, + Conv::template process_tile<1, 1, 1, 3, 2, 2>, + Conv::template process_tile<1, 1, 1, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 1, 3, 3, 0>, + Conv::template process_tile<1, 1, 1, 3, 3, 1>, + Conv::template process_tile<1, 1, 1, 3, 3, 2>, + Conv::template process_tile<1, 1, 1, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { 
// Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 4, 0, 0>, + Conv::template process_tile<1, 1, 1, 4, 0, 1>, + Conv::template process_tile<1, 1, 1, 4, 0, 2>, + Conv::template process_tile<1, 1, 1, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 4, 1, 0>, + Conv::template process_tile<1, 1, 1, 4, 1, 1>, + Conv::template process_tile<1, 1, 1, 4, 1, 2>, + Conv::template process_tile<1, 1, 1, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 1, 4, 2, 0>, + Conv::template process_tile<1, 1, 1, 4, 2, 1>, + Conv::template process_tile<1, 1, 1, 4, 2, 2>, + Conv::template process_tile<1, 1, 1, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 1, 4, 3, 0>, + Conv::template process_tile<1, 1, 1, 4, 3, 1>, + Conv::template process_tile<1, 1, 1, 4, 3, 2>, + Conv::template process_tile<1, 1, 1, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 5, 0, 0>, + Conv::template process_tile<1, 1, 1, 5, 0, 1>, + Conv::template process_tile<1, 1, 1, 5, 0, 2>, + Conv::template process_tile<1, 1, 1, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 5, 1, 0>, + Conv::template process_tile<1, 1, 1, 5, 1, 1>, + Conv::template process_tile<1, 1, 1, 5, 1, 2>, + Conv::template process_tile<1, 1, 1, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 1, 5, 2, 0>, + Conv::template process_tile<1, 1, 1, 5, 2, 1>, + Conv::template process_tile<1, 1, 1, 5, 2, 2>, + Conv::template process_tile<1, 1, 1, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 1, 5, 3, 0>, + Conv::template process_tile<1, 1, 1, 5, 3, 1>, + 
Conv::template process_tile<1, 1, 1, 5, 3, 2>, + Conv::template process_tile<1, 1, 1, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 1, 6, 0, 0>, + Conv::template process_tile<1, 1, 1, 6, 0, 1>, + Conv::template process_tile<1, 1, 1, 6, 0, 2>, + Conv::template process_tile<1, 1, 1, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 1, 6, 1, 0>, + Conv::template process_tile<1, 1, 1, 6, 1, 1>, + Conv::template process_tile<1, 1, 1, 6, 1, 2>, + Conv::template process_tile<1, 1, 1, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 1, 6, 2, 0>, + Conv::template process_tile<1, 1, 1, 6, 2, 1>, + Conv::template process_tile<1, 1, 1, 6, 2, 2>, + Conv::template process_tile<1, 1, 1, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 1, 6, 3, 0>, + Conv::template process_tile<1, 1, 1, 6, 3, 1>, + Conv::template process_tile<1, 1, 1, 6, 3, 2>, + Conv::template process_tile<1, 1, 1, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 1 + { // Input pad bottom = 2 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 0, 0, 0>, + Conv::template process_tile<1, 1, 2, 0, 0, 1>, + Conv::template process_tile<1, 1, 2, 0, 0, 2>, + Conv::template process_tile<1, 1, 2, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 0, 1, 0>, + Conv::template process_tile<1, 1, 2, 0, 1, 1>, + Conv::template process_tile<1, 1, 2, 0, 1, 2>, + Conv::template process_tile<1, 1, 2, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 2, 0, 2, 0>, + Conv::template process_tile<1, 1, 2, 0, 2, 1>, + Conv::template process_tile<1, 1, 2, 0, 2, 2>, 
+ Conv::template process_tile<1, 1, 2, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 2, 0, 3, 0>, + Conv::template process_tile<1, 1, 2, 0, 3, 1>, + Conv::template process_tile<1, 1, 2, 0, 3, 2>, + Conv::template process_tile<1, 1, 2, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 1, 0, 0>, + Conv::template process_tile<1, 1, 2, 1, 0, 1>, + Conv::template process_tile<1, 1, 2, 1, 0, 2>, + Conv::template process_tile<1, 1, 2, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 1, 1, 0>, + Conv::template process_tile<1, 1, 2, 1, 1, 1>, + Conv::template process_tile<1, 1, 2, 1, 1, 2>, + Conv::template process_tile<1, 1, 2, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 2, 1, 2, 0>, + Conv::template process_tile<1, 1, 2, 1, 2, 1>, + Conv::template process_tile<1, 1, 2, 1, 2, 2>, + Conv::template process_tile<1, 1, 2, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 2, 1, 3, 0>, + Conv::template process_tile<1, 1, 2, 1, 3, 1>, + Conv::template process_tile<1, 1, 2, 1, 3, 2>, + Conv::template process_tile<1, 1, 2, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 2, 0, 0>, + Conv::template process_tile<1, 1, 2, 2, 0, 1>, + Conv::template process_tile<1, 1, 2, 2, 0, 2>, + Conv::template process_tile<1, 1, 2, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 2, 1, 0>, + Conv::template process_tile<1, 1, 2, 2, 1, 1>, + Conv::template process_tile<1, 1, 2, 2, 1, 2>, + Conv::template process_tile<1, 1, 2, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom 
= 2 + Conv::template process_tile<1, 1, 2, 2, 2, 0>, + Conv::template process_tile<1, 1, 2, 2, 2, 1>, + Conv::template process_tile<1, 1, 2, 2, 2, 2>, + Conv::template process_tile<1, 1, 2, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 2, 2, 3, 0>, + Conv::template process_tile<1, 1, 2, 2, 3, 1>, + Conv::template process_tile<1, 1, 2, 2, 3, 2>, + Conv::template process_tile<1, 1, 2, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 3, 0, 0>, + Conv::template process_tile<1, 1, 2, 3, 0, 1>, + Conv::template process_tile<1, 1, 2, 3, 0, 2>, + Conv::template process_tile<1, 1, 2, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 3, 1, 0>, + Conv::template process_tile<1, 1, 2, 3, 1, 1>, + Conv::template process_tile<1, 1, 2, 3, 1, 2>, + Conv::template process_tile<1, 1, 2, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 2, 3, 2, 0>, + Conv::template process_tile<1, 1, 2, 3, 2, 1>, + Conv::template process_tile<1, 1, 2, 3, 2, 2>, + Conv::template process_tile<1, 1, 2, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 2, 3, 3, 0>, + Conv::template process_tile<1, 1, 2, 3, 3, 1>, + Conv::template process_tile<1, 1, 2, 3, 3, 2>, + Conv::template process_tile<1, 1, 2, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 4, 0, 0>, + Conv::template process_tile<1, 1, 2, 4, 0, 1>, + Conv::template process_tile<1, 1, 2, 4, 0, 2>, + Conv::template process_tile<1, 1, 2, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 4, 1, 0>, + Conv::template process_tile<1, 1, 2, 4, 1, 1>, + 
Conv::template process_tile<1, 1, 2, 4, 1, 2>, + Conv::template process_tile<1, 1, 2, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 2, 4, 2, 0>, + Conv::template process_tile<1, 1, 2, 4, 2, 1>, + Conv::template process_tile<1, 1, 2, 4, 2, 2>, + Conv::template process_tile<1, 1, 2, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 2, 4, 3, 0>, + Conv::template process_tile<1, 1, 2, 4, 3, 1>, + Conv::template process_tile<1, 1, 2, 4, 3, 2>, + Conv::template process_tile<1, 1, 2, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 5, 0, 0>, + Conv::template process_tile<1, 1, 2, 5, 0, 1>, + Conv::template process_tile<1, 1, 2, 5, 0, 2>, + Conv::template process_tile<1, 1, 2, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 5, 1, 0>, + Conv::template process_tile<1, 1, 2, 5, 1, 1>, + Conv::template process_tile<1, 1, 2, 5, 1, 2>, + Conv::template process_tile<1, 1, 2, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 2, 5, 2, 0>, + Conv::template process_tile<1, 1, 2, 5, 2, 1>, + Conv::template process_tile<1, 1, 2, 5, 2, 2>, + Conv::template process_tile<1, 1, 2, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 2, 5, 3, 0>, + Conv::template process_tile<1, 1, 2, 5, 3, 1>, + Conv::template process_tile<1, 1, 2, 5, 3, 2>, + Conv::template process_tile<1, 1, 2, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 2, 6, 0, 0>, + Conv::template process_tile<1, 1, 2, 6, 0, 1>, + Conv::template process_tile<1, 1, 2, 6, 0, 2>, + Conv::template process_tile<1, 1, 2, 6, 0, 3>, + }, // 
Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 2, 6, 1, 0>, + Conv::template process_tile<1, 1, 2, 6, 1, 1>, + Conv::template process_tile<1, 1, 2, 6, 1, 2>, + Conv::template process_tile<1, 1, 2, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 2, 6, 2, 0>, + Conv::template process_tile<1, 1, 2, 6, 2, 1>, + Conv::template process_tile<1, 1, 2, 6, 2, 2>, + Conv::template process_tile<1, 1, 2, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 2, 6, 3, 0>, + Conv::template process_tile<1, 1, 2, 6, 3, 1>, + Conv::template process_tile<1, 1, 2, 6, 3, 2>, + Conv::template process_tile<1, 1, 2, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 2 + { // Input pad bottom = 3 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 0, 0, 0>, + Conv::template process_tile<1, 1, 3, 0, 0, 1>, + Conv::template process_tile<1, 1, 3, 0, 0, 2>, + Conv::template process_tile<1, 1, 3, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 0, 1, 0>, + Conv::template process_tile<1, 1, 3, 0, 1, 1>, + Conv::template process_tile<1, 1, 3, 0, 1, 2>, + Conv::template process_tile<1, 1, 3, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 3, 0, 2, 0>, + Conv::template process_tile<1, 1, 3, 0, 2, 1>, + Conv::template process_tile<1, 1, 3, 0, 2, 2>, + Conv::template process_tile<1, 1, 3, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 3, 0, 3, 0>, + Conv::template process_tile<1, 1, 3, 0, 3, 1>, + Conv::template process_tile<1, 1, 3, 0, 3, 2>, + Conv::template process_tile<1, 1, 3, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom 
= 0 + Conv::template process_tile<1, 1, 3, 1, 0, 0>, + Conv::template process_tile<1, 1, 3, 1, 0, 1>, + Conv::template process_tile<1, 1, 3, 1, 0, 2>, + Conv::template process_tile<1, 1, 3, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 1, 1, 0>, + Conv::template process_tile<1, 1, 3, 1, 1, 1>, + Conv::template process_tile<1, 1, 3, 1, 1, 2>, + Conv::template process_tile<1, 1, 3, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 3, 1, 2, 0>, + Conv::template process_tile<1, 1, 3, 1, 2, 1>, + Conv::template process_tile<1, 1, 3, 1, 2, 2>, + Conv::template process_tile<1, 1, 3, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 3, 1, 3, 0>, + Conv::template process_tile<1, 1, 3, 1, 3, 1>, + Conv::template process_tile<1, 1, 3, 1, 3, 2>, + Conv::template process_tile<1, 1, 3, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 2, 0, 0>, + Conv::template process_tile<1, 1, 3, 2, 0, 1>, + Conv::template process_tile<1, 1, 3, 2, 0, 2>, + Conv::template process_tile<1, 1, 3, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 2, 1, 0>, + Conv::template process_tile<1, 1, 3, 2, 1, 1>, + Conv::template process_tile<1, 1, 3, 2, 1, 2>, + Conv::template process_tile<1, 1, 3, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 3, 2, 2, 0>, + Conv::template process_tile<1, 1, 3, 2, 2, 1>, + Conv::template process_tile<1, 1, 3, 2, 2, 2>, + Conv::template process_tile<1, 1, 3, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 3, 2, 3, 0>, + Conv::template process_tile<1, 1, 3, 2, 3, 1>, + Conv::template process_tile<1, 1, 3, 2, 3, 2>, + 
Conv::template process_tile<1, 1, 3, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 3, 0, 0>, + Conv::template process_tile<1, 1, 3, 3, 0, 1>, + Conv::template process_tile<1, 1, 3, 3, 0, 2>, + Conv::template process_tile<1, 1, 3, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 3, 1, 0>, + Conv::template process_tile<1, 1, 3, 3, 1, 1>, + Conv::template process_tile<1, 1, 3, 3, 1, 2>, + Conv::template process_tile<1, 1, 3, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 3, 3, 2, 0>, + Conv::template process_tile<1, 1, 3, 3, 2, 1>, + Conv::template process_tile<1, 1, 3, 3, 2, 2>, + Conv::template process_tile<1, 1, 3, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 3, 3, 3, 0>, + Conv::template process_tile<1, 1, 3, 3, 3, 1>, + Conv::template process_tile<1, 1, 3, 3, 3, 2>, + Conv::template process_tile<1, 1, 3, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 4, 0, 0>, + Conv::template process_tile<1, 1, 3, 4, 0, 1>, + Conv::template process_tile<1, 1, 3, 4, 0, 2>, + Conv::template process_tile<1, 1, 3, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 4, 1, 0>, + Conv::template process_tile<1, 1, 3, 4, 1, 1>, + Conv::template process_tile<1, 1, 3, 4, 1, 2>, + Conv::template process_tile<1, 1, 3, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 3, 4, 2, 0>, + Conv::template process_tile<1, 1, 3, 4, 2, 1>, + Conv::template process_tile<1, 1, 3, 4, 2, 2>, + Conv::template process_tile<1, 1, 3, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 
3 + Conv::template process_tile<1, 1, 3, 4, 3, 0>, + Conv::template process_tile<1, 1, 3, 4, 3, 1>, + Conv::template process_tile<1, 1, 3, 4, 3, 2>, + Conv::template process_tile<1, 1, 3, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 5, 0, 0>, + Conv::template process_tile<1, 1, 3, 5, 0, 1>, + Conv::template process_tile<1, 1, 3, 5, 0, 2>, + Conv::template process_tile<1, 1, 3, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 5, 1, 0>, + Conv::template process_tile<1, 1, 3, 5, 1, 1>, + Conv::template process_tile<1, 1, 3, 5, 1, 2>, + Conv::template process_tile<1, 1, 3, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 3, 5, 2, 0>, + Conv::template process_tile<1, 1, 3, 5, 2, 1>, + Conv::template process_tile<1, 1, 3, 5, 2, 2>, + Conv::template process_tile<1, 1, 3, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 3, 5, 3, 0>, + Conv::template process_tile<1, 1, 3, 5, 3, 1>, + Conv::template process_tile<1, 1, 3, 5, 3, 2>, + Conv::template process_tile<1, 1, 3, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 3, 6, 0, 0>, + Conv::template process_tile<1, 1, 3, 6, 0, 1>, + Conv::template process_tile<1, 1, 3, 6, 0, 2>, + Conv::template process_tile<1, 1, 3, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 3, 6, 1, 0>, + Conv::template process_tile<1, 1, 3, 6, 1, 1>, + Conv::template process_tile<1, 1, 3, 6, 1, 2>, + Conv::template process_tile<1, 1, 3, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 3, 6, 2, 0>, + Conv::template process_tile<1, 1, 3, 6, 2, 1>, + 
Conv::template process_tile<1, 1, 3, 6, 2, 2>, + Conv::template process_tile<1, 1, 3, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 3, 6, 3, 0>, + Conv::template process_tile<1, 1, 3, 6, 3, 1>, + Conv::template process_tile<1, 1, 3, 6, 3, 2>, + Conv::template process_tile<1, 1, 3, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 3 + { // Input pad bottom = 4 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 0, 0, 0>, + Conv::template process_tile<1, 1, 4, 0, 0, 1>, + Conv::template process_tile<1, 1, 4, 0, 0, 2>, + Conv::template process_tile<1, 1, 4, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 4, 0, 1, 0>, + Conv::template process_tile<1, 1, 4, 0, 1, 1>, + Conv::template process_tile<1, 1, 4, 0, 1, 2>, + Conv::template process_tile<1, 1, 4, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 4, 0, 2, 0>, + Conv::template process_tile<1, 1, 4, 0, 2, 1>, + Conv::template process_tile<1, 1, 4, 0, 2, 2>, + Conv::template process_tile<1, 1, 4, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 4, 0, 3, 0>, + Conv::template process_tile<1, 1, 4, 0, 3, 1>, + Conv::template process_tile<1, 1, 4, 0, 3, 2>, + Conv::template process_tile<1, 1, 4, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 1, 0, 0>, + Conv::template process_tile<1, 1, 4, 1, 0, 1>, + Conv::template process_tile<1, 1, 4, 1, 0, 2>, + Conv::template process_tile<1, 1, 4, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 4, 1, 1, 0>, + Conv::template process_tile<1, 1, 4, 1, 1, 1>, + Conv::template process_tile<1, 1, 4, 1, 1, 2>, 
+ Conv::template process_tile<1, 1, 4, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 4, 1, 2, 0>, + Conv::template process_tile<1, 1, 4, 1, 2, 1>, + Conv::template process_tile<1, 1, 4, 1, 2, 2>, + Conv::template process_tile<1, 1, 4, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 4, 1, 3, 0>, + Conv::template process_tile<1, 1, 4, 1, 3, 1>, + Conv::template process_tile<1, 1, 4, 1, 3, 2>, + Conv::template process_tile<1, 1, 4, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 2, 0, 0>, + Conv::template process_tile<1, 1, 4, 2, 0, 1>, + Conv::template process_tile<1, 1, 4, 2, 0, 2>, + Conv::template process_tile<1, 1, 4, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 4, 2, 1, 0>, + Conv::template process_tile<1, 1, 4, 2, 1, 1>, + Conv::template process_tile<1, 1, 4, 2, 1, 2>, + Conv::template process_tile<1, 1, 4, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 4, 2, 2, 0>, + Conv::template process_tile<1, 1, 4, 2, 2, 1>, + Conv::template process_tile<1, 1, 4, 2, 2, 2>, + Conv::template process_tile<1, 1, 4, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 4, 2, 3, 0>, + Conv::template process_tile<1, 1, 4, 2, 3, 1>, + Conv::template process_tile<1, 1, 4, 2, 3, 2>, + Conv::template process_tile<1, 1, 4, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 3, 0, 0>, + Conv::template process_tile<1, 1, 4, 3, 0, 1>, + Conv::template process_tile<1, 1, 4, 3, 0, 2>, + Conv::template process_tile<1, 1, 4, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom 
= 1 + Conv::template process_tile<1, 1, 4, 3, 1, 0>, + Conv::template process_tile<1, 1, 4, 3, 1, 1>, + Conv::template process_tile<1, 1, 4, 3, 1, 2>, + Conv::template process_tile<1, 1, 4, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 4, 3, 2, 0>, + Conv::template process_tile<1, 1, 4, 3, 2, 1>, + Conv::template process_tile<1, 1, 4, 3, 2, 2>, + Conv::template process_tile<1, 1, 4, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 4, 3, 3, 0>, + Conv::template process_tile<1, 1, 4, 3, 3, 1>, + Conv::template process_tile<1, 1, 4, 3, 3, 2>, + Conv::template process_tile<1, 1, 4, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 4, 0, 0>, + Conv::template process_tile<1, 1, 4, 4, 0, 1>, + Conv::template process_tile<1, 1, 4, 4, 0, 2>, + Conv::template process_tile<1, 1, 4, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 4, 4, 1, 0>, + Conv::template process_tile<1, 1, 4, 4, 1, 1>, + Conv::template process_tile<1, 1, 4, 4, 1, 2>, + Conv::template process_tile<1, 1, 4, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 4, 4, 2, 0>, + Conv::template process_tile<1, 1, 4, 4, 2, 1>, + Conv::template process_tile<1, 1, 4, 4, 2, 2>, + Conv::template process_tile<1, 1, 4, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 4, 4, 3, 0>, + Conv::template process_tile<1, 1, 4, 4, 3, 1>, + Conv::template process_tile<1, 1, 4, 4, 3, 2>, + Conv::template process_tile<1, 1, 4, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 5, 0, 0>, + Conv::template process_tile<1, 1, 4, 5, 0, 1>, + 
Conv::template process_tile<1, 1, 4, 5, 0, 2>, + Conv::template process_tile<1, 1, 4, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 4, 5, 1, 0>, + Conv::template process_tile<1, 1, 4, 5, 1, 1>, + Conv::template process_tile<1, 1, 4, 5, 1, 2>, + Conv::template process_tile<1, 1, 4, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 4, 5, 2, 0>, + Conv::template process_tile<1, 1, 4, 5, 2, 1>, + Conv::template process_tile<1, 1, 4, 5, 2, 2>, + Conv::template process_tile<1, 1, 4, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 4, 5, 3, 0>, + Conv::template process_tile<1, 1, 4, 5, 3, 1>, + Conv::template process_tile<1, 1, 4, 5, 3, 2>, + Conv::template process_tile<1, 1, 4, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 4, 6, 0, 0>, + Conv::template process_tile<1, 1, 4, 6, 0, 1>, + Conv::template process_tile<1, 1, 4, 6, 0, 2>, + Conv::template process_tile<1, 1, 4, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 4, 6, 1, 0>, + Conv::template process_tile<1, 1, 4, 6, 1, 1>, + Conv::template process_tile<1, 1, 4, 6, 1, 2>, + Conv::template process_tile<1, 1, 4, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 4, 6, 2, 0>, + Conv::template process_tile<1, 1, 4, 6, 2, 1>, + Conv::template process_tile<1, 1, 4, 6, 2, 2>, + Conv::template process_tile<1, 1, 4, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 4, 6, 3, 0>, + Conv::template process_tile<1, 1, 4, 6, 3, 1>, + Conv::template process_tile<1, 1, 4, 6, 3, 2>, + Conv::template process_tile<1, 1, 4, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, 
// Input pad bottom = 4 + { // Input pad bottom = 5 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 5, 0, 0, 0>, + Conv::template process_tile<1, 1, 5, 0, 0, 1>, + Conv::template process_tile<1, 1, 5, 0, 0, 2>, + Conv::template process_tile<1, 1, 5, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 5, 0, 1, 0>, + Conv::template process_tile<1, 1, 5, 0, 1, 1>, + Conv::template process_tile<1, 1, 5, 0, 1, 2>, + Conv::template process_tile<1, 1, 5, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 5, 0, 2, 0>, + Conv::template process_tile<1, 1, 5, 0, 2, 1>, + Conv::template process_tile<1, 1, 5, 0, 2, 2>, + Conv::template process_tile<1, 1, 5, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 5, 0, 3, 0>, + Conv::template process_tile<1, 1, 5, 0, 3, 1>, + Conv::template process_tile<1, 1, 5, 0, 3, 2>, + Conv::template process_tile<1, 1, 5, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 5, 1, 0, 0>, + Conv::template process_tile<1, 1, 5, 1, 0, 1>, + Conv::template process_tile<1, 1, 5, 1, 0, 2>, + Conv::template process_tile<1, 1, 5, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 5, 1, 1, 0>, + Conv::template process_tile<1, 1, 5, 1, 1, 1>, + Conv::template process_tile<1, 1, 5, 1, 1, 2>, + Conv::template process_tile<1, 1, 5, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 5, 1, 2, 0>, + Conv::template process_tile<1, 1, 5, 1, 2, 1>, + Conv::template process_tile<1, 1, 5, 1, 2, 2>, + Conv::template process_tile<1, 1, 5, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 5, 1, 3, 0>, + 
Conv::template process_tile<1, 1, 5, 1, 3, 1>, + Conv::template process_tile<1, 1, 5, 1, 3, 2>, + Conv::template process_tile<1, 1, 5, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 5, 2, 0, 0>, + Conv::template process_tile<1, 1, 5, 2, 0, 1>, + Conv::template process_tile<1, 1, 5, 2, 0, 2>, + Conv::template process_tile<1, 1, 5, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 5, 2, 1, 0>, + Conv::template process_tile<1, 1, 5, 2, 1, 1>, + Conv::template process_tile<1, 1, 5, 2, 1, 2>, + Conv::template process_tile<1, 1, 5, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 5, 2, 2, 0>, + Conv::template process_tile<1, 1, 5, 2, 2, 1>, + Conv::template process_tile<1, 1, 5, 2, 2, 2>, + Conv::template process_tile<1, 1, 5, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 5, 2, 3, 0>, + Conv::template process_tile<1, 1, 5, 2, 3, 1>, + Conv::template process_tile<1, 1, 5, 2, 3, 2>, + Conv::template process_tile<1, 1, 5, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 5, 3, 0, 0>, + Conv::template process_tile<1, 1, 5, 3, 0, 1>, + Conv::template process_tile<1, 1, 5, 3, 0, 2>, + Conv::template process_tile<1, 1, 5, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 5, 3, 1, 0>, + Conv::template process_tile<1, 1, 5, 3, 1, 1>, + Conv::template process_tile<1, 1, 5, 3, 1, 2>, + Conv::template process_tile<1, 1, 5, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 5, 3, 2, 0>, + Conv::template process_tile<1, 1, 5, 3, 2, 1>, + Conv::template process_tile<1, 1, 5, 3, 2, 2>, + 
Conv::template process_tile<1, 1, 5, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 5, 3, 3, 0>, + Conv::template process_tile<1, 1, 5, 3, 3, 1>, + Conv::template process_tile<1, 1, 5, 3, 3, 2>, + Conv::template process_tile<1, 1, 5, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 5, 4, 0, 0>, + Conv::template process_tile<1, 1, 5, 4, 0, 1>, + Conv::template process_tile<1, 1, 5, 4, 0, 2>, + Conv::template process_tile<1, 1, 5, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 5, 4, 1, 0>, + Conv::template process_tile<1, 1, 5, 4, 1, 1>, + Conv::template process_tile<1, 1, 5, 4, 1, 2>, + Conv::template process_tile<1, 1, 5, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 5, 4, 2, 0>, + Conv::template process_tile<1, 1, 5, 4, 2, 1>, + Conv::template process_tile<1, 1, 5, 4, 2, 2>, + Conv::template process_tile<1, 1, 5, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 5, 4, 3, 0>, + Conv::template process_tile<1, 1, 5, 4, 3, 1>, + Conv::template process_tile<1, 1, 5, 4, 3, 2>, + Conv::template process_tile<1, 1, 5, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 5, 5, 0, 0>, + Conv::template process_tile<1, 1, 5, 5, 0, 1>, + Conv::template process_tile<1, 1, 5, 5, 0, 2>, + Conv::template process_tile<1, 1, 5, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 5, 5, 1, 0>, + Conv::template process_tile<1, 1, 5, 5, 1, 1>, + Conv::template process_tile<1, 1, 5, 5, 1, 2>, + Conv::template process_tile<1, 1, 5, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 
2 + Conv::template process_tile<1, 1, 5, 5, 2, 0>, + Conv::template process_tile<1, 1, 5, 5, 2, 1>, + Conv::template process_tile<1, 1, 5, 5, 2, 2>, + Conv::template process_tile<1, 1, 5, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 5, 5, 3, 0>, + Conv::template process_tile<1, 1, 5, 5, 3, 1>, + Conv::template process_tile<1, 1, 5, 5, 3, 2>, + Conv::template process_tile<1, 1, 5, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 5, 6, 0, 0>, + Conv::template process_tile<1, 1, 5, 6, 0, 1>, + Conv::template process_tile<1, 1, 5, 6, 0, 2>, + Conv::template process_tile<1, 1, 5, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 5, 6, 1, 0>, + Conv::template process_tile<1, 1, 5, 6, 1, 1>, + Conv::template process_tile<1, 1, 5, 6, 1, 2>, + Conv::template process_tile<1, 1, 5, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 5, 6, 2, 0>, + Conv::template process_tile<1, 1, 5, 6, 2, 1>, + Conv::template process_tile<1, 1, 5, 6, 2, 2>, + Conv::template process_tile<1, 1, 5, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 5, 6, 3, 0>, + Conv::template process_tile<1, 1, 5, 6, 3, 1>, + Conv::template process_tile<1, 1, 5, 6, 3, 2>, + Conv::template process_tile<1, 1, 5, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 5 + { // Input pad bottom = 6 + { // Input pad right = 0 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 6, 0, 0, 0>, + Conv::template process_tile<1, 1, 6, 0, 0, 1>, + Conv::template process_tile<1, 1, 6, 0, 0, 2>, + Conv::template process_tile<1, 1, 6, 0, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 6, 0, 1, 
0>, + Conv::template process_tile<1, 1, 6, 0, 1, 1>, + Conv::template process_tile<1, 1, 6, 0, 1, 2>, + Conv::template process_tile<1, 1, 6, 0, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 6, 0, 2, 0>, + Conv::template process_tile<1, 1, 6, 0, 2, 1>, + Conv::template process_tile<1, 1, 6, 0, 2, 2>, + Conv::template process_tile<1, 1, 6, 0, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 6, 0, 3, 0>, + Conv::template process_tile<1, 1, 6, 0, 3, 1>, + Conv::template process_tile<1, 1, 6, 0, 3, 2>, + Conv::template process_tile<1, 1, 6, 0, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 0 + { // Input pad right = 1 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 6, 1, 0, 0>, + Conv::template process_tile<1, 1, 6, 1, 0, 1>, + Conv::template process_tile<1, 1, 6, 1, 0, 2>, + Conv::template process_tile<1, 1, 6, 1, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 6, 1, 1, 0>, + Conv::template process_tile<1, 1, 6, 1, 1, 1>, + Conv::template process_tile<1, 1, 6, 1, 1, 2>, + Conv::template process_tile<1, 1, 6, 1, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 6, 1, 2, 0>, + Conv::template process_tile<1, 1, 6, 1, 2, 1>, + Conv::template process_tile<1, 1, 6, 1, 2, 2>, + Conv::template process_tile<1, 1, 6, 1, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 6, 1, 3, 0>, + Conv::template process_tile<1, 1, 6, 1, 3, 1>, + Conv::template process_tile<1, 1, 6, 1, 3, 2>, + Conv::template process_tile<1, 1, 6, 1, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 1 + { // Input pad right = 2 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 6, 2, 0, 0>, + Conv::template process_tile<1, 1, 6, 2, 0, 1>, + Conv::template process_tile<1, 1, 6, 2, 0, 2>, + 
Conv::template process_tile<1, 1, 6, 2, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 6, 2, 1, 0>, + Conv::template process_tile<1, 1, 6, 2, 1, 1>, + Conv::template process_tile<1, 1, 6, 2, 1, 2>, + Conv::template process_tile<1, 1, 6, 2, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 6, 2, 2, 0>, + Conv::template process_tile<1, 1, 6, 2, 2, 1>, + Conv::template process_tile<1, 1, 6, 2, 2, 2>, + Conv::template process_tile<1, 1, 6, 2, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 6, 2, 3, 0>, + Conv::template process_tile<1, 1, 6, 2, 3, 1>, + Conv::template process_tile<1, 1, 6, 2, 3, 2>, + Conv::template process_tile<1, 1, 6, 2, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 2 + { // Input pad right = 3 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 6, 3, 0, 0>, + Conv::template process_tile<1, 1, 6, 3, 0, 1>, + Conv::template process_tile<1, 1, 6, 3, 0, 2>, + Conv::template process_tile<1, 1, 6, 3, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 6, 3, 1, 0>, + Conv::template process_tile<1, 1, 6, 3, 1, 1>, + Conv::template process_tile<1, 1, 6, 3, 1, 2>, + Conv::template process_tile<1, 1, 6, 3, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 6, 3, 2, 0>, + Conv::template process_tile<1, 1, 6, 3, 2, 1>, + Conv::template process_tile<1, 1, 6, 3, 2, 2>, + Conv::template process_tile<1, 1, 6, 3, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 6, 3, 3, 0>, + Conv::template process_tile<1, 1, 6, 3, 3, 1>, + Conv::template process_tile<1, 1, 6, 3, 3, 2>, + Conv::template process_tile<1, 1, 6, 3, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 3 + { // Input pad right = 4 + { // Output pad bottom = 
0 + Conv::template process_tile<1, 1, 6, 4, 0, 0>, + Conv::template process_tile<1, 1, 6, 4, 0, 1>, + Conv::template process_tile<1, 1, 6, 4, 0, 2>, + Conv::template process_tile<1, 1, 6, 4, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 6, 4, 1, 0>, + Conv::template process_tile<1, 1, 6, 4, 1, 1>, + Conv::template process_tile<1, 1, 6, 4, 1, 2>, + Conv::template process_tile<1, 1, 6, 4, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 6, 4, 2, 0>, + Conv::template process_tile<1, 1, 6, 4, 2, 1>, + Conv::template process_tile<1, 1, 6, 4, 2, 2>, + Conv::template process_tile<1, 1, 6, 4, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 6, 4, 3, 0>, + Conv::template process_tile<1, 1, 6, 4, 3, 1>, + Conv::template process_tile<1, 1, 6, 4, 3, 2>, + Conv::template process_tile<1, 1, 6, 4, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 4 + { // Input pad right = 5 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 6, 5, 0, 0>, + Conv::template process_tile<1, 1, 6, 5, 0, 1>, + Conv::template process_tile<1, 1, 6, 5, 0, 2>, + Conv::template process_tile<1, 1, 6, 5, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 6, 5, 1, 0>, + Conv::template process_tile<1, 1, 6, 5, 1, 1>, + Conv::template process_tile<1, 1, 6, 5, 1, 2>, + Conv::template process_tile<1, 1, 6, 5, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 6, 5, 2, 0>, + Conv::template process_tile<1, 1, 6, 5, 2, 1>, + Conv::template process_tile<1, 1, 6, 5, 2, 2>, + Conv::template process_tile<1, 1, 6, 5, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 6, 5, 3, 0>, + Conv::template process_tile<1, 1, 6, 5, 3, 1>, + Conv::template process_tile<1, 1, 6, 5, 3, 2>, + 
Conv::template process_tile<1, 1, 6, 5, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 5 + { // Input pad right = 6 + { // Output pad bottom = 0 + Conv::template process_tile<1, 1, 6, 6, 0, 0>, + Conv::template process_tile<1, 1, 6, 6, 0, 1>, + Conv::template process_tile<1, 1, 6, 6, 0, 2>, + Conv::template process_tile<1, 1, 6, 6, 0, 3>, + }, // Output pad bottom = 0 + { // Output pad bottom = 1 + Conv::template process_tile<1, 1, 6, 6, 1, 0>, + Conv::template process_tile<1, 1, 6, 6, 1, 1>, + Conv::template process_tile<1, 1, 6, 6, 1, 2>, + Conv::template process_tile<1, 1, 6, 6, 1, 3>, + }, // Output pad bottom = 1 + { // Output pad bottom = 2 + Conv::template process_tile<1, 1, 6, 6, 2, 0>, + Conv::template process_tile<1, 1, 6, 6, 2, 1>, + Conv::template process_tile<1, 1, 6, 6, 2, 2>, + Conv::template process_tile<1, 1, 6, 6, 2, 3>, + }, // Output pad bottom = 2 + { // Output pad bottom = 3 + Conv::template process_tile<1, 1, 6, 6, 3, 0>, + Conv::template process_tile<1, 1, 6, 6, 3, 1>, + Conv::template process_tile<1, 1, 6, 6, 3, 2>, + Conv::template process_tile<1, 1, 6, 6, 3, 3>, + }, // Output pad bottom = 3 + }, // Input pad right = 6 + }, // Input pad bottom = 6 + }, // Input pad left = 1 + }, // Input pad top = 1 +}; + + +template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float>; +} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.cpp b/src/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.cpp new file mode 100644 index 0000000000..ac83bf9dd2 --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/gemm.hpp" + +using namespace winograd; + +template +BatchedBlockedGemm::BatchedBlockedGemm( + const unsigned int n_gemms, + const int M, const int K, const int N, + const int a_matrix_stride, + const int a_row_stride, + const int b_matrix_stride, + const int b_row_stride, + const int c_matrix_stride, + const int c_row_stride, + const TIn* const a_ptr, + const TIn* const b_ptr, + TOut* const c_ptr +) : n_gemms(n_gemms), M(M), N(N), K(K), + a_matrix_stride(a_matrix_stride), + a_row_stride(a_row_stride), + b_matrix_stride(b_matrix_stride), + b_row_stride(b_row_stride), + c_matrix_stride(c_matrix_stride), + c_row_stride(c_row_stride), + a_ptr(a_ptr), b_ptr(b_ptr), c_ptr(c_ptr) +{ +} + +template +unsigned int BatchedBlockedGemm::get_window() const +{ + return n_gemms; +} + +template +void BatchedBlockedGemm::run( + const unsigned int start, const unsigned int stop +) +{ + // Perform the specified GEMMs + for (unsigned int i = start; i < stop; i++) + { + // Get pointers to the relevant matrices + const TIn* const mtr_a = a_ptr + i*a_matrix_stride; + const TIn* const mtr_b = b_ptr + i*b_matrix_stride; + TOut* const mtr_c = c_ptr + i*c_matrix_stride; + + // Perform the GEMM + BlockedGemm( + mtr_a, mtr_b, mtr_c, M, K, N, + a_row_stride, b_row_stride, c_row_stride + ); + } +} + +template class winograd::BatchedBlockedGemm<4, 16, float, float>; + diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp new file mode 100644 index 0000000000..6d8afc0def --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp @@ -0,0 +1,409 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" + +namespace winograd +{ + +using Transform = WinogradGEMM<2, 2, 3, 3>::InputTransform; + +/****************************************************************************** + * Cost methods for the input transform. + * ===================================== + */ +template <> +template <> +int Transform::ops_performed(const Tensor4DShape &input_shape) +{ + // NOTE: Cost in FLOPs rather than instructions or uops. 
+ const int tile_M = iceildiv(input_shape.n_rows, inner_tile_rows); + const int tile_N = iceildiv(input_shape.n_cols, inner_tile_cols); + return 16 * 16 * tile_M * tile_N * input_shape.n_channels; +} +/*****************************************************************************/ + +/***************************************************************************** +* F(2x2, 3x3) implies the use of a 4x4 input tile. Such tiles can require a +* variety of padding types. For example, tiles at the top and left of an image +* can require one row or column of padding on their top and left sides if the +* padding type is SAME (where X represents a padded value): +* +* _______ _______ +* |X X X X| |X X X X| +* |X | | | . . . +* |X | | | +* |X______| |_______| +* _______ +* |X | . +* |X | . . . . +* |X | . +* |X______| +* +* For tiles near the right or bottom of the image it is more complicated. Such +* tiles might require padding by 0 or 1 rows or columns if the padding type is +* VALID or 1 or 2 rows or columns if the padding type is SAME: +* +* _______ _______ _______ _______ +* |X X X X| |X X X X| |X X X X| |X X X X| +* |X | | | | X| | X X| +* |X | | | | X| | X X| +* |X______| |_______| |______X| |____X_X| +* _______ _______ _______ _______ +* |X | | | | X| | X X| +* |X | | | | X| | X X| +* |X | | | | X| | X X| +* |X______| |_______| |______X| |____X_X| +* _______ _______ _______ _______ +* |X | | | | X| | X X| +* |X | | | | X| | X X| +* |X | | | | X| | X X| +* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| +* _______ _______ _______ _______ +* |X | | | | X| | X X| +* |X | | | | X| | X X| +* |X X X X| |X X X X| |X X X X| |X X X X| +* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| +* +* Additional tiles are required for especially small input images. +* +* Build an array of the specialised methods that deal with each of the +* different padding combinations which may be required. 
These padding +* constraints are the space: +* +* Padding top in {0, 1} +* Padding left in {0, 1} +* Padding bottom in {0, 1, 2} +* Padding right in {0, 1, 2} +*/ +template <> +template <> +template +void Transform::process_tile( + int n_channels, + const float* const input_base, + const int input_row_stride, + const int input_col_stride, + float* const matrix_base, + const int matrix_stride +) +{ + constexpr int inner_tile_i = 4, inner_tile_j = 4; + constexpr int cells_i = inner_tile_i - pad_bottom; + constexpr int cells_j = inner_tile_i - pad_right; + + float *outptr = matrix_base; + + // Get pointers into the input tile + const float *x_ptrs[inner_tile_i][inner_tile_j]; + for (int i = pad_top, xi = 0; i < cells_i; i++, xi++) + { + // Get a pointer into the row + const float* const row_ptr = input_base + xi*input_row_stride; + + for (int j = pad_left, xj = 0; j < cells_j; j++, xj++) + { + x_ptrs[i][j] = row_ptr + xj*input_col_stride; + } + } + + // Matrices used/computed in this kernel. + float x[inner_tile_i][inner_tile_j]; + float XTx[inner_tile_i][inner_tile_j]; + float U[inner_tile_i][inner_tile_j]; + + for (int i = 0; i < inner_tile_i; i++) + { + for (int j = 0; j < inner_tile_j; j++) + { + x[i][j] = XTx[i][j] = 0.0f; + } + } + + // Perform the Winograd input transformation for each channel in the input + // tensor. + int channels_remaining = n_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) + { + // Matrices used/computed in this kernel. 
+ float32x4_t x[inner_tile_i][inner_tile_j]; + float32x4_t XTx[inner_tile_i][inner_tile_j]; + float32x4_t U[inner_tile_i][inner_tile_j]; + + for (int i = 0; i < inner_tile_i; i++) + { + for (int j = 0; j < inner_tile_j; j++) + { + x[i][j] = vdupq_n_f32(0.0f); + XTx[i][j] = vdupq_n_f32(0.0f); + } + } + + // Load x + for (int i = pad_top; i < cells_i; i++) + { + for (int j = pad_left; j < cells_j; j++) + { + x[i][j] = vld1q_f32(x_ptrs[i][j]); + x_ptrs[i][j] += 4; + } + } + + // Compute XT . x + for (int j = pad_left; j < cells_j; j++) + { + // XTx[0][j] = x[0][j] - x[2][j]; + XTx[0][j] = vsubq_f32(x[0][j], x[2][j]); + + // XTx[1][j] = x[1][j] + x[2][j]; + XTx[1][j] = vaddq_f32(x[1][j], x[2][j]); + + // XTx[2][j] = x[2][j] - x[1][j]; + XTx[2][j] = vsubq_f32(x[2][j], x[1][j]); + + // XTx[3][j] = x[1][j] - x[3][j]; + XTx[3][j] = vsubq_f32(x[1][j], x[3][j]); + } + + // Compute U = XT . x . X + for (int i = 0; i < inner_tile_i; i++) + { + // U[i][0] = XTx[i][0] - XTx[i][2]; + U[i][0] = vsubq_f32(XTx[i][0], XTx[i][2]); + + // U[i][1] = XTx[i][1] + XTx[i][2]; + U[i][1] = vaddq_f32(XTx[i][1], XTx[i][2]); + + // U[i][2] = XTx[i][2] - XTx[i][1]; + U[i][2] = vsubq_f32(XTx[i][2], XTx[i][1]); + + // U[i][3] = XTx[i][1] - XTx[i][3]; + U[i][3] = vsubq_f32(XTx[i][1], XTx[i][3]); + } + + // Store the transformed matrix + for (int i = 0, m = 0; i < inner_tile_i; i++) + { + for (int j = 0; j < inner_tile_j; j++, m++) + { + vst1q_f32(outptr + m*matrix_stride, U[i][j]); + } + } + outptr += 4; + } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used/computed in this kernel. 
+ float32x2_t x[inner_tile_i][inner_tile_j]; + float32x2_t XTx[inner_tile_i][inner_tile_j]; + float32x2_t U[inner_tile_i][inner_tile_j]; + + for (int i = 0; i < inner_tile_i; i++) + { + for (int j = 0; j < inner_tile_j; j++) + { + x[i][j] = vdup_n_f32(0.0f); + XTx[i][j] = vdup_n_f32(0.0f); + } + } + + // Load x + for (int i = pad_top; i < cells_i; i++) + { + for (int j = pad_left; j < cells_j; j++) + { + x[i][j] = vld1_f32(x_ptrs[i][j]); + x_ptrs[i][j] += 2; + } + } + + // Compute XT . x + for (int j = pad_left; j < cells_j; j++) + { + // XTx[0][j] = x[0][j] - x[2][j]; + XTx[0][j] = vsub_f32(x[0][j], x[2][j]); + + // XTx[1][j] = x[1][j] + x[2][j]; + XTx[1][j] = vadd_f32(x[1][j], x[2][j]); + + // XTx[2][j] = x[2][j] - x[1][j]; + XTx[2][j] = vsub_f32(x[2][j], x[1][j]); + + // XTx[3][j] = x[1][j] - x[3][j]; + XTx[3][j] = vsub_f32(x[1][j], x[3][j]); + } + + // Compute U = XT . x . X + for (int i = 0; i < inner_tile_i; i++) + { + // U[i][0] = XTx[i][0] - XTx[i][2]; + U[i][0] = vsub_f32(XTx[i][0], XTx[i][2]); + + // U[i][1] = XTx[i][1] + XTx[i][2]; + U[i][1] = vadd_f32(XTx[i][1], XTx[i][2]); + + // U[i][2] = XTx[i][2] - XTx[i][1]; + U[i][2] = vsub_f32(XTx[i][2], XTx[i][1]); + + // U[i][3] = XTx[i][1] - XTx[i][3]; + U[i][3] = vsub_f32(XTx[i][1], XTx[i][3]); + } + + // Store the transformed matrix + for (int i = 0, m = 0; i < inner_tile_i; i++) + { + for (int j = 0; j < inner_tile_j; j++, m++) + { + vst1_f32(outptr + m*matrix_stride, U[i][j]); + } + } + outptr += 2; + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) + { + // Load x + for (int i = pad_top; i < cells_i; i++) + { + for (int j = pad_left; j < cells_j; j++) + { + x[i][j] = *(x_ptrs[i][j]++); + } + } + + // Compute XT . x + for (int j = pad_left; j < cells_j; j++) + { + XTx[0][j] = x[0][j] - x[2][j]; + XTx[1][j] = x[1][j] + x[2][j]; + XTx[2][j] = x[2][j] - x[1][j]; + XTx[3][j] = x[1][j] - x[3][j]; + } + + // Compute U = XT . x . 
X + for (int i = 0; i < inner_tile_i; i++) + { + U[i][0] = XTx[i][0] - XTx[i][2]; + U[i][1] = XTx[i][1] + XTx[i][2]; + U[i][2] = XTx[i][2] - XTx[i][1]; + U[i][3] = XTx[i][1] - XTx[i][3]; + } + + // Store the transformed matrix + for (int i = 0, m = 0; i < inner_tile_i; i++) + { + for (int j = 0; j < inner_tile_j; j++, m++) + { + *(outptr + m*matrix_stride) = U[i][j]; + } + } + outptr++; + } +} + +template <> +template <> +const Transform::TileFn Transform::tile_fns[2][2][max_pad_bottom][max_pad_right] = +{ + { + { + { + Transform::template process_tile<0, 0, 0, 0>, // No padding (template arguments: pad top, left, bottom, right) + Transform::template process_tile<0, 0, 0, 1>, // Right + Transform::template process_tile<0, 0, 0, 2>, // Right + }, + { + Transform::template process_tile<0, 0, 1, 0>, // Bottom + Transform::template process_tile<0, 0, 1, 1>, // Bottom-right + Transform::template process_tile<0, 0, 1, 2>, // Bottom-right + }, + { + Transform::template process_tile<0, 0, 2, 0>, // Bottom + Transform::template process_tile<0, 0, 2, 1>, // Bottom-right + Transform::template process_tile<0, 0, 2, 2>, // Bottom-right + } + }, + { + { + Transform::template process_tile<0, 1, 0, 0>, // Left + Transform::template process_tile<0, 1, 0, 1>, // Left AND right + Transform::template process_tile<0, 1, 0, 2>, // Left AND right + }, + { + Transform::template process_tile<0, 1, 1, 0>, // Left-bottom + Transform::template process_tile<0, 1, 1, 1>, // Left, bottom AND right + Transform::template process_tile<0, 1, 1, 2>, // Left, bottom AND right + }, + { + Transform::template process_tile<0, 1, 2, 0>, // Left-bottom + Transform::template process_tile<0, 1, 2, 1>, // Left, bottom AND right + Transform::template process_tile<0, 1, 2, 2>, // Left, bottom AND right + } + }, + }, + { + { + { + Transform::template process_tile<1, 0, 0, 0>, // Top + Transform::template process_tile<1, 0, 0, 1>, // Top-right + Transform::template process_tile<1, 0, 0, 2>, // Top-right + }, + { + Transform::template process_tile<1, 0, 1, 0>, // 
Top AND bottom + Transform::template process_tile<1, 0, 1, 1>, // Top, bottom AND right + Transform::template process_tile<1, 0, 1, 2>, // Top, bottom AND right + }, + { + Transform::template process_tile<1, 0, 2, 0>, // Top AND bottom + Transform::template process_tile<1, 0, 2, 1>, // Top, bottom AND right + Transform::template process_tile<1, 0, 2, 2>, // Top, bottom AND right + } + }, + { + { + Transform::template process_tile<1, 1, 0, 0>, // Top-left + Transform::template process_tile<1, 1, 0, 1>, // Top, left AND right + Transform::template process_tile<1, 1, 0, 2>, // Top, left AND right + }, + { + Transform::template process_tile<1, 1, 1, 0>, // Top, left AND bottom + Transform::template process_tile<1, 1, 1, 1>, // All padded + Transform::template process_tile<1, 1, 1, 2>, // All padded + }, + { + Transform::template process_tile<1, 1, 2, 0>, // Top, left AND bottom + Transform::template process_tile<1, 1, 2, 1>, // All padded + Transform::template process_tile<1, 1, 2, 2>, // All padded + } + } + } +}; + +template struct WinogradGEMM<2, 2, 3, 3>::InputTransform; +} // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_5x5_fp32.cpp new file mode 100644 index 0000000000..d9ebe8b7cd --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_5x5_fp32.cpp @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" + +namespace winograd +{ + +using Transform = WinogradGEMM<2, 2, 5, 5>::InputTransform; + +template <> +template <> +int Transform::ops_performed(const Tensor4DShape &input_shape) +{ + return 0; // TODO: operation count is not implemented for this transform, so 0 is always reported and input_shape is unused +} + +/***************************************************************************** +* F(2x2, 5x5) implies the use of a 6x6 input tile. +* +* Build an array of the specialised methods that deal with each of the +* different padding combinations which may be required.
These padding +* constraints are the space: +* +* Padding top in {0, 1} +* Padding left in {0, 1} +* Padding bottom in {0, 1, 2, 3, 4} +* Padding right in {0, 1, 2, 3, 4} +*/ +template <> +template <> +template +void Transform::process_tile( + int n_channels, + const float* const input_base, + const int input_row_stride, + const int input_col_stride, + float* const matrix_base, + const int matrix_stride +) +{ + constexpr int cells_i = 6 - pad_bottom; + constexpr int cells_j = 6 - pad_right; + + float *outptr = matrix_base; + + // Get pointers into the input tile (only for cells not covered by padding) + const float *x_ptrs[6][6]; + for (int i = pad_top, xi = 0; i < cells_i; i++, xi++) + { + // Get a pointer into the row + const float* const row_ptr = input_base + xi*input_row_stride; + + for (int j = pad_left, xj = 0; j < cells_j; j++, xj++) + { + x_ptrs[i][j] = row_ptr + xj*input_col_stride; + } + } + + // Matrices used/computed by the scalar tail loop; zeroed once so padded cells contribute zero. + float x[6][6], XTx[6][6], U[6][6]; + for (int i = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++) + { + x[i][j] = XTx[i][j] = 0.0f; + } + } + + // Perform the Winograd input transformation for each channel in the input + // tensor. + int channels_remaining = n_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) + { + // Four-channel matrices local to this loop; zeroed so padded cells contribute zero + float32x4_t x[6][6], XTx[6][6], U[6][6]; + for (int i = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++) + { + x[i][j] = vdupq_n_f32(0.0f); + XTx[i][j] = vdupq_n_f32(0.0f); + } + } + + // Read the unpadded cells of the 6x6 input tile, four channels per cell + for (int i = pad_top; i < cells_i; i++) + { + for (int j = pad_left; j < cells_j; j++) + { + x[i][j] = vld1q_f32(x_ptrs[i][j]); + x_ptrs[i][j] += 4; + } + } + + // Compute XT . 
x + for (int j = pad_left; j < cells_j; j++) + { + // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j]; + XTx[0][j] = vmlsq_n_f32(vmlaq_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f); + + // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j]; + XTx[1][j] = vmlsq_n_f32(vaddq_f32(x[3][j], x[4][j]), vaddq_f32(x[1][j], x[2][j]), 4.0f); + + // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j]; + XTx[2][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[3][j]), vsubq_f32(x[1][j], x[2][j]), 4.0f); + + // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j]; + XTx[3][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[3][j], x[1][j]), 2.0f); + + // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j]; + XTx[4][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[1][j], x[3][j]), 2.0f); + + // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j]; + XTx[5][j] = vmlsq_n_f32(vmlaq_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f); + } + + // Compute U = XT . x . X + for (int i = 0; i < 6; i++) + { + // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4]; + U[i][0] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f); + + // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4]; + U[i][1] = vmlsq_n_f32(vaddq_f32(XTx[i][3], XTx[i][4]), vaddq_f32(XTx[i][1], XTx[i][2]), 4.0f); + + // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4]; + U[i][2] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][3]), vsubq_f32(XTx[i][1], XTx[i][2]), 4.0f); + + // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4]; + U[i][3] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][3], XTx[i][1]), 2.0f); + + // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4]; + U[i][4] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][1], XTx[i][3]), 2.0f); + + // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5]; + U[i][5] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f); + } + + // 
Store the transformed matrix + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + vst1q_f32(outptr + m*matrix_stride, U[i][j]); + } + } + outptr += 4; + } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Two-channel matrices local to this loop; zeroed so padded cells contribute zero + float32x2_t x[6][6], XTx[6][6], U[6][6]; + for (int i = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++) + { + x[i][j] = vdup_n_f32(0.0f); + XTx[i][j] = vdup_n_f32(0.0f); + } + } + + // Read the unpadded cells of the 6x6 input tile, two channels per cell + for (int i = pad_top; i < cells_i; i++) + { + for (int j = pad_left; j < cells_j; j++) + { + x[i][j] = vld1_f32(x_ptrs[i][j]); + x_ptrs[i][j] += 2; + } + } + + // Compute XT . x + for (int j = pad_left; j < cells_j; j++) + { + // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j]; + XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f); + + // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j]; + XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f); + + // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j]; + XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f); + + // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j]; + XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f); + + // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j]; + XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f); + + // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j]; + XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f); + } + + // Compute U = XT . x . 
X + for (int i = 0; i < 6; i++) + { + // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4]; + U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f); + + // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4]; + U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f); + + // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4]; + U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f); + + // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4]; + U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f); + + // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4]; + U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f); + + // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5]; + U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f); + } + + // Store the transformed matrix + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + vst1_f32(outptr + m*matrix_stride, U[i][j]); + } + } + outptr += 2; + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) + { + // Load x for one channel; padded cells keep the zero set before the loops + for (int i = pad_top; i < cells_i; i++) + { + for (int j = pad_left; j < cells_j; j++) + { + x[i][j] = *(x_ptrs[i][j]++); + } + } + + // Compute XT . x + for (int j = pad_left; j < cells_j; j++) + { + XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j]; + XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j]; + XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j]; + XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j]; + XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j]; + XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j]; + } + + // Compute U = XT . x . 
X + for (int i = 0; i < 6; i++) + { + U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4]; + U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4]; + U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4]; + U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4]; + U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4]; + U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5]; + } + + // Store the transformed matrix + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + *(outptr + m*matrix_stride) = U[i][j]; + } + } + outptr++; + } +} + +template <> +template <> +const Transform::TileFn Transform::tile_fns[2][2][max_pad_bottom][max_pad_right] = +{ + { + { + { + Transform::template process_tile<0, 0, 0, 0>, // No padding (template arguments: pad top, left, bottom, right) + Transform::template process_tile<0, 0, 0, 1>, // Right + Transform::template process_tile<0, 0, 0, 2>, // " " + Transform::template process_tile<0, 0, 0, 3>, // " " + Transform::template process_tile<0, 0, 0, 4>, // " " + }, + { + Transform::template process_tile<0, 0, 1, 0>, // Bottom + Transform::template process_tile<0, 0, 1, 1>, // Bottom right + Transform::template process_tile<0, 0, 1, 2>, // " " + Transform::template process_tile<0, 0, 1, 3>, // " " + Transform::template process_tile<0, 0, 1, 4>, // " " + }, + { + Transform::template process_tile<0, 0, 2, 0>, // Bottom + Transform::template process_tile<0, 0, 2, 1>, // Bottom right + Transform::template process_tile<0, 0, 2, 2>, // " " + Transform::template process_tile<0, 0, 2, 3>, // " " + Transform::template process_tile<0, 0, 2, 4>, // " " + }, + { + Transform::template process_tile<0, 0, 3, 0>, // Bottom + Transform::template process_tile<0, 0, 3, 1>, // Bottom right + Transform::template process_tile<0, 0, 3, 2>, // " " + Transform::template process_tile<0, 0, 3, 3>, // " " + Transform::template process_tile<0, 0, 3, 4>, // " " + }, + { + Transform::template process_tile<0, 0, 4, 0>, // Bottom + Transform::template 
process_tile<0, 0, 4, 1>, // Bottom right + Transform::template process_tile<0, 0, 4, 2>, // " " + Transform::template process_tile<0, 0, 4, 3>, // " " + Transform::template process_tile<0, 0, 4, 4>, // " " + } + }, + { + { + Transform::template process_tile<0, 1, 0, 0>, // Left + Transform::template process_tile<0, 1, 0, 1>, + Transform::template process_tile<0, 1, 0, 2>, + Transform::template process_tile<0, 1, 0, 3>, + Transform::template process_tile<0, 1, 0, 4>, + }, + { + Transform::template process_tile<0, 1, 1, 0>, // Bottom left + Transform::template process_tile<0, 1, 1, 1>, + Transform::template process_tile<0, 1, 1, 2>, + Transform::template process_tile<0, 1, 1, 3>, + Transform::template process_tile<0, 1, 1, 4>, + }, + { + Transform::template process_tile<0, 1, 2, 0>, // " " + Transform::template process_tile<0, 1, 2, 1>, + Transform::template process_tile<0, 1, 2, 2>, + Transform::template process_tile<0, 1, 2, 3>, + Transform::template process_tile<0, 1, 2, 4>, + }, + { + Transform::template process_tile<0, 1, 3, 0>, // " " + Transform::template process_tile<0, 1, 3, 1>, + Transform::template process_tile<0, 1, 3, 2>, + Transform::template process_tile<0, 1, 3, 3>, + Transform::template process_tile<0, 1, 3, 4>, + }, + { + Transform::template process_tile<0, 1, 4, 0>, // " " + Transform::template process_tile<0, 1, 4, 1>, + Transform::template process_tile<0, 1, 4, 2>, + Transform::template process_tile<0, 1, 4, 3>, + Transform::template process_tile<0, 1, 4, 4>, + } + } + }, + { + { + { + Transform::template process_tile<1, 0, 0, 0>, // Top + Transform::template process_tile<1, 0, 0, 1>, // Top right + Transform::template process_tile<1, 0, 0, 2>, // " " + Transform::template process_tile<1, 0, 0, 3>, // " " + Transform::template process_tile<1, 0, 0, 4>, // " " + }, + { + Transform::template process_tile<1, 0, 1, 0>, + Transform::template process_tile<1, 0, 1, 1>, + Transform::template process_tile<1, 0, 1, 2>, + Transform::template 
process_tile<1, 0, 1, 3>, + Transform::template process_tile<1, 0, 1, 4>, + }, + { + Transform::template process_tile<1, 0, 2, 0>, + Transform::template process_tile<1, 0, 2, 1>, + Transform::template process_tile<1, 0, 2, 2>, + Transform::template process_tile<1, 0, 2, 3>, + Transform::template process_tile<1, 0, 2, 4>, + }, + { + Transform::template process_tile<1, 0, 3, 0>, + Transform::template process_tile<1, 0, 3, 1>, + Transform::template process_tile<1, 0, 3, 2>, + Transform::template process_tile<1, 0, 3, 3>, + Transform::template process_tile<1, 0, 3, 4>, + }, + { + Transform::template process_tile<1, 0, 4, 0>, + Transform::template process_tile<1, 0, 4, 1>, + Transform::template process_tile<1, 0, 4, 2>, + Transform::template process_tile<1, 0, 4, 3>, + Transform::template process_tile<1, 0, 4, 4>, + }, + }, + { + { + Transform::template process_tile<1, 1, 0, 0>, // Top left + Transform::template process_tile<1, 1, 0, 1>, + Transform::template process_tile<1, 1, 0, 2>, + Transform::template process_tile<1, 1, 0, 3>, + Transform::template process_tile<1, 1, 0, 4>, + }, + { + Transform::template process_tile<1, 1, 1, 0>, + Transform::template process_tile<1, 1, 1, 1>, + Transform::template process_tile<1, 1, 1, 2>, + Transform::template process_tile<1, 1, 1, 3>, + Transform::template process_tile<1, 1, 1, 4>, + }, + { + Transform::template process_tile<1, 1, 2, 0>, + Transform::template process_tile<1, 1, 2, 1>, + Transform::template process_tile<1, 1, 2, 2>, + Transform::template process_tile<1, 1, 2, 3>, + Transform::template process_tile<1, 1, 2, 4>, + }, + { + Transform::template process_tile<1, 1, 3, 0>, + Transform::template process_tile<1, 1, 3, 1>, + Transform::template process_tile<1, 1, 3, 2>, + Transform::template process_tile<1, 1, 3, 3>, + Transform::template process_tile<1, 1, 3, 4>, + }, + { + Transform::template process_tile<1, 1, 4, 0>, + Transform::template process_tile<1, 1, 4, 1>, + Transform::template process_tile<1, 1, 4, 2>, + 
Transform::template process_tile<1, 1, 4, 3>, + Transform::template process_tile<1, 1, 4, 4>, + } + } + } +}; + +template struct WinogradGEMM<2, 2, 5, 5>::InputTransform; +} // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_4x4_3x3_fp32.cpp new file mode 100644 index 0000000000..04d1573e4c --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_4x4_3x3_fp32.cpp @@ -0,0 +1,486 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" + +namespace winograd +{ + +using Transform = WinogradGEMM<4, 4, 3, 3>::InputTransform; + +template <> +template <> +int Transform::ops_performed(const Tensor4DShape &input_shape) +{ + // NOTE: Cost in FLOPs rather than instructions or uops; a fixed per-tile cost scaled by the number of tiles and channels. + const int tile_M = iceildiv(input_shape.n_rows, inner_tile_rows); + const int tile_N = iceildiv(input_shape.n_cols, inner_tile_cols); + return 12 * 24 * tile_M * tile_N * input_shape.n_channels; +} + +/* F(4x4, 3x3) implies the use of a 6x6 input tile. Such tiles can require a +* variety of padding types. For example, tiles at the top and left of an +* image can require one row or column of padding on their top and left sides +* if the padding type is SAME (where X represents a padded value): +* +* ___________ ___________ +* |X X X X X X| |X X X X X X| +* |X | | | +* |X | | | +* |X | | | +* |X | | | +* |X__________| |___________| +* ___________ +* |X | +* |X | +* |X | +* |X | +* |X | +* |X__________| +* +* For tiles near the right or bottom of the image it is more complicated. +* Such tiles might require padding by 0, 1, 2 or 3 rows or columns if the +* padding type is VALID or 1, 2, 3 or 4 rows or columns if the padding +* type is SAME. +* +* Build an array of the specialised methods that deal with each of the +* different padding combinations which may be required.
These padding +* constraints are the space: +* +* Padding top in {0, 1} +* Padding left in {0, 1} +* Padding bottom in {0, 1, 2, 3, 4} +* Padding right in {0, 1, 2, 3, 4} +*/ +template <> +template <> +template +void Transform::process_tile( + int n_channels, + const float* const input_base, + const int input_row_stride, + const int input_col_stride, + float* const matrix_base, + const int matrix_stride +) +{ + constexpr int cells_i = 6 - pad_bottom; + constexpr int cells_j = 6 - pad_right; + + float *outptr = matrix_base; + + // Get pointers into the input tile (only for cells not covered by padding) + const float *x_ptrs[6][6]; + for (int i = pad_top, xi = 0; i < cells_i; i++, xi++) + { + // Get a pointer into the row + const float* const row_ptr = input_base + xi*input_row_stride; + + for (int j = pad_left, xj = 0; j < cells_j; j++, xj++) + { + x_ptrs[i][j] = row_ptr + xj*input_col_stride; + } + } + + // Matrices used/computed by the scalar tail loop; zeroed once so padded cells contribute zero. + float x[6][6], XTx[6][6], U[6][6]; + for (int i = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++) + { + x[i][j] = XTx[i][j] = 0.0f; + } + } + + // Perform the Winograd input transformation for each channel in the input + // tensor. + int channels_remaining = n_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) + { + // Four-channel matrices local to this loop; zeroed so padded cells contribute zero + float32x4_t x[6][6], XTx[6][6], U[6][6]; + for (int i = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++) + { + x[i][j] = vdupq_n_f32(0.0f); + XTx[i][j] = vdupq_n_f32(0.0f); + } + } + + // Read the unpadded cells of the 6x6 input tile, four channels per cell + for (int i = pad_top; i < cells_i; i++) + { + for (int j = pad_left; j < cells_j; j++) + { + x[i][j] = vld1q_f32(x_ptrs[i][j]); + x_ptrs[i][j] += 4; + } + } + + // Compute XT . 
x + for (int j = pad_left; j < cells_j; j++) + { + // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j]; + XTx[0][j] = vmlsq_n_f32(vmlaq_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f); + + // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j]; + XTx[1][j] = vmlsq_n_f32(vaddq_f32(x[3][j], x[4][j]), vaddq_f32(x[1][j], x[2][j]), 4.0f); + + // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j]; + XTx[2][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[3][j]), vsubq_f32(x[1][j], x[2][j]), 4.0f); + + // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j]; + XTx[3][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[3][j], x[1][j]), 2.0f); + + // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j]; + XTx[4][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[1][j], x[3][j]), 2.0f); + + // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j]; + XTx[5][j] = vmlsq_n_f32(vmlaq_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f); + } + + // Compute U = XT . x . X + for (int i = 0; i < 6; i++) + { + // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4]; + U[i][0] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f); + + // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4]; + U[i][1] = vmlsq_n_f32(vaddq_f32(XTx[i][3], XTx[i][4]), vaddq_f32(XTx[i][1], XTx[i][2]), 4.0f); + + // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4]; + U[i][2] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][3]), vsubq_f32(XTx[i][1], XTx[i][2]), 4.0f); + + // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4]; + U[i][3] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][3], XTx[i][1]), 2.0f); + + // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4]; + U[i][4] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][1], XTx[i][3]), 2.0f); + + // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5]; + U[i][5] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f); + } + + // 
Store the transformed matrix + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + vst1q_f32(outptr + m*matrix_stride, U[i][j]); + } + } + outptr += 4; + } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Two-channel matrices local to this loop; zeroed so padded cells contribute zero + float32x2_t x[6][6], XTx[6][6], U[6][6]; + for (int i = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++) + { + x[i][j] = vdup_n_f32(0.0f); + XTx[i][j] = vdup_n_f32(0.0f); + } + } + + // Read the unpadded cells of the 6x6 input tile, two channels per cell + for (int i = pad_top; i < cells_i; i++) + { + for (int j = pad_left; j < cells_j; j++) + { + x[i][j] = vld1_f32(x_ptrs[i][j]); + x_ptrs[i][j] += 2; + } + } + + // Compute XT . x + for (int j = pad_left; j < cells_j; j++) + { + // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j]; + XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f); + + // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j]; + XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f); + + // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j]; + XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f); + + // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j]; + XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f); + + // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j]; + XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f); + + // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j]; + XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f); + } + + // Compute U = XT . x . 
X + for (int i = 0; i < 6; i++) + { + // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4]; + U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f); + + // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4]; + U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f); + + // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4]; + U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f); + + // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4]; + U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f); + + // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4]; + U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f); + + // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5]; + U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f); + } + + // Store the transformed matrix + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + vst1_f32(outptr + m*matrix_stride, U[i][j]); + } + } + outptr += 2; + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) + { + // Load x for one channel; padded cells keep the zero set before the loops + for (int i = pad_top; i < cells_i; i++) + { + for (int j = pad_left; j < cells_j; j++) + { + x[i][j] = *(x_ptrs[i][j]++); + } + } + + // Compute XT . x + for (int j = pad_left; j < cells_j; j++) + { + XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j]; + XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j]; + XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j]; + XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j]; + XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j]; + XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j]; + } + + // Compute U = XT . x . 
X + for (int i = 0; i < 6; i++) + { + U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4]; + U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4]; + U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4]; + U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4]; + U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4]; + U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5]; + } + + // Store the transformed matrix + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + *(outptr + m*matrix_stride) = U[i][j]; + } + } + outptr++; + } +} + +/* Table of per-padding specialisations. NOTE(review): this comment previously said unusual or small tiles take a slow + * path and common or large tiles a faster path, but every entry visible here instantiates process_tile - confirm. + */ +template <> +template <> +const Transform::TileFn Transform::tile_fns[2][2][max_pad_bottom][max_pad_right] = +{ + { + { + { + Transform::template process_tile<0, 0, 0, 0>, // No padding + Transform::template process_tile<0, 0, 0, 1>, // Right + Transform::template process_tile<0, 0, 0, 2>, // " " + Transform::template process_tile<0, 0, 0, 3>, // " " + Transform::template process_tile<0, 0, 0, 4>, // " " + }, + { + Transform::template process_tile<0, 0, 1, 0>, // Bottom + Transform::template process_tile<0, 0, 1, 1>, // Bottom right + Transform::template process_tile<0, 0, 1, 2>, // " " + Transform::template process_tile<0, 0, 1, 3>, // " " + Transform::template process_tile<0, 0, 1, 4>, // " " + }, + { + Transform::template process_tile<0, 0, 2, 0>, // Bottom + Transform::template process_tile<0, 0, 2, 1>, // Bottom right + Transform::template process_tile<0, 0, 2, 2>, // " " + Transform::template process_tile<0, 0, 2, 3>, // " " + Transform::template process_tile<0, 0, 2, 4>, // " " + }, + { + Transform::template process_tile<0, 0, 3, 0>, // Bottom + Transform::template process_tile<0, 0, 3, 1>, // Bottom right + Transform::template process_tile<0, 0, 3, 2>, // " " + Transform::template process_tile<0, 0, 3, 3>, 
// " " + Transform::template process_tile<0, 0, 3, 4>, // " " + }, + { + Transform::template process_tile<0, 0, 4, 0>, // Bottom + Transform::template process_tile<0, 0, 4, 1>, // Bottom right + Transform::template process_tile<0, 0, 4, 2>, // " " + Transform::template process_tile<0, 0, 4, 3>, // " " + Transform::template process_tile<0, 0, 4, 4>, // " " + } + }, + { + { + Transform::template process_tile<0, 1, 0, 0>, // Left + Transform::template process_tile<0, 1, 0, 1>, + Transform::template process_tile<0, 1, 0, 2>, + Transform::template process_tile<0, 1, 0, 3>, + Transform::template process_tile<0, 1, 0, 4>, + }, + { + Transform::template process_tile<0, 1, 1, 0>, // Bottom left + Transform::template process_tile<0, 1, 1, 1>, + Transform::template process_tile<0, 1, 1, 2>, + Transform::template process_tile<0, 1, 1, 3>, + Transform::template process_tile<0, 1, 1, 4>, + }, + { + Transform::template process_tile<0, 1, 2, 0>, // " " + Transform::template process_tile<0, 1, 2, 1>, + Transform::template process_tile<0, 1, 2, 2>, + Transform::template process_tile<0, 1, 2, 3>, + Transform::template process_tile<0, 1, 2, 4>, + }, + { + Transform::template process_tile<0, 1, 3, 0>, // " " + Transform::template process_tile<0, 1, 3, 1>, + Transform::template process_tile<0, 1, 3, 2>, + Transform::template process_tile<0, 1, 3, 3>, + Transform::template process_tile<0, 1, 3, 4>, + }, + { + Transform::template process_tile<0, 1, 4, 0>, // " " + Transform::template process_tile<0, 1, 4, 1>, + Transform::template process_tile<0, 1, 4, 2>, + Transform::template process_tile<0, 1, 4, 3>, + Transform::template process_tile<0, 1, 4, 4>, + } + } + }, + { + { + { + Transform::template process_tile<1, 0, 0, 0>, // Top + Transform::template process_tile<1, 0, 0, 1>, // Top right + Transform::template process_tile<1, 0, 0, 2>, // " " + Transform::template process_tile<1, 0, 0, 3>, // " " + Transform::template process_tile<1, 0, 0, 4>, // " " + }, + { + Transform::template 
process_tile<1, 0, 1, 0>, + Transform::template process_tile<1, 0, 1, 1>, + Transform::template process_tile<1, 0, 1, 2>, + Transform::template process_tile<1, 0, 1, 3>, + Transform::template process_tile<1, 0, 1, 4>, + }, + { + Transform::template process_tile<1, 0, 2, 0>, + Transform::template process_tile<1, 0, 2, 1>, + Transform::template process_tile<1, 0, 2, 2>, + Transform::template process_tile<1, 0, 2, 3>, + Transform::template process_tile<1, 0, 2, 4>, + }, + { + Transform::template process_tile<1, 0, 3, 0>, + Transform::template process_tile<1, 0, 3, 1>, + Transform::template process_tile<1, 0, 3, 2>, + Transform::template process_tile<1, 0, 3, 3>, + Transform::template process_tile<1, 0, 3, 4>, + }, + { + Transform::template process_tile<1, 0, 4, 0>, + Transform::template process_tile<1, 0, 4, 1>, + Transform::template process_tile<1, 0, 4, 2>, + Transform::template process_tile<1, 0, 4, 3>, + Transform::template process_tile<1, 0, 4, 4>, + }, + }, + { + { + Transform::template process_tile<1, 1, 0, 0>, // Top left + Transform::template process_tile<1, 1, 0, 1>, + Transform::template process_tile<1, 1, 0, 2>, + Transform::template process_tile<1, 1, 0, 3>, + Transform::template process_tile<1, 1, 0, 4>, + }, + { + Transform::template process_tile<1, 1, 1, 0>, + Transform::template process_tile<1, 1, 1, 1>, + Transform::template process_tile<1, 1, 1, 2>, + Transform::template process_tile<1, 1, 1, 3>, + Transform::template process_tile<1, 1, 1, 4>, + }, + { + Transform::template process_tile<1, 1, 2, 0>, + Transform::template process_tile<1, 1, 2, 1>, + Transform::template process_tile<1, 1, 2, 2>, + Transform::template process_tile<1, 1, 2, 3>, + Transform::template process_tile<1, 1, 2, 4>, + }, + { + Transform::template process_tile<1, 1, 3, 0>, + Transform::template process_tile<1, 1, 3, 1>, + Transform::template process_tile<1, 1, 3, 2>, + Transform::template process_tile<1, 1, 3, 3>, + Transform::template process_tile<1, 1, 3, 4>, + }, + { + 
Transform::template process_tile<1, 1, 4, 0>, + Transform::template process_tile<1, 1, 4, 1>, + Transform::template process_tile<1, 1, 4, 2>, + Transform::template process_tile<1, 1, 4, 3>, + Transform::template process_tile<1, 1, 4, 4>, + } + } + } +}; + +template struct WinogradGEMM<4, 4, 3, 3>::InputTransform; +} // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp new file mode 100644 index 0000000000..a95ce0e7d2 --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" + +namespace winograd +{ + +using Transform = WinogradGEMM<2, 2, 3, 3>::OutputTransform; + +template <> +template <> +int Transform::ops_performed(const Tensor4DShape &shape) +{ + // NOTE: Cost in FLOPs rather than instructions or uops. + const int tile_M = iceildiv(shape.n_rows, 2); + const int tile_N = iceildiv(shape.n_cols, 2); + return 24 * tile_M * tile_N * shape.n_channels; +} + +/* F(2x2, 3x3) constructs 2x2 output tiles from a 3x3 convolution. Since we use + * enough tiles to cover the output space each output tile may contain 0 or 1 + * padded values to the right and bottom columns or rows of the tile, e.g.: + * + * ___ ___ + * | | | X| + * |___| |__X| + * + * ___ ___ + * | | | X| + * |X_X| |X_X| + * + * + * We provide a specialised output transform for each of these instances. + * Consequently we below construct an array of the various padding options, the + * array contains pointers to the specific implementations. 
+ */ +template <> +template <> +template +void Transform::process_tile( + const int n_channels, + const float* const matrix_base, + const int matrix_stride, + const float* const biases, + float* const output, + const int output_row_stride, + const int output_col_stride +) +{ + constexpr int cells_i = 2 - pad_bottom; + constexpr int cells_j = 2 - pad_right; + + // Construct a map to the output cells + float *outptrs[cells_i][cells_j]; + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + outptrs[i][j] = output + i*output_row_stride + j*output_col_stride; + } + } + const float *inptr = matrix_base; + const float *bptr = biases; + + // For each channel of the output + int channels_remaining = n_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) + { + // Matrices used and computed during this transform + float32x4_t F[4][4], FZ[4][2], f[2][2], b; + + // Read a 4x4 tile in the Winograd domain + for (int i = 0, m = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++, m++) + { + F[i][j] = vld1q_f32(inptr + m*matrix_stride); + } + } + inptr += 4; + + // Compute the matrix F Z + for (int i = 0; i < 4; i++) + { + // FZ[i][0] = F[i][0] + F[i][1] + F[i][2]; + FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]); + + // FZ[i][1] = F[i][1] - F[i][2] - F[i][3]; + FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]); + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j]; + f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]); + + // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; + f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]); + } + + // Load the bias vector + b = vld1q_f32(bptr); + bptr += 4; + + // Write out the output tile + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b)); + outptrs[i][j] += 4; + } + } + } +#endif // __aarch64__ +#ifdef 
__arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used and computed during this transform + float32x2_t F[4][4], FZ[4][2], f[2][2], b; + + // Read a 4x4 tile in the Winograd domain + for (int i = 0, m = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++, m++) + { + F[i][j] = vld1_f32(inptr + m*matrix_stride); + } + } + inptr += 2; + + // Compute the matrix F Z + for (int i = 0; i < 4; i++) + { + // FZ[i][0] = F[i][0] + F[i][1] + F[i][2]; + FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]); + + // FZ[i][1] = F[i][1] - F[i][2] - F[i][3]; + FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]); + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j]; + f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]); + + // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; + f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]); + } + + // Load the bias vector + b = vld1_f32(bptr); + bptr += 2; + + // Write out the output tile + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b)); + outptrs[i][j] += 2; + } + } + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) + { + // Matrices used and computed during this transform + float F[4][4], FZ[4][2], f[2][2], b; + + // Read a 4x4 tile in the Winograd domain + for (int i = 0, m = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++, m++) + { + F[i][j] = *(inptr + m*matrix_stride); + } + } + inptr++; + + // Compute the matrix F Z + for (int i = 0; i < 4; i++) + { + FZ[i][0] = F[i][0] + F[i][1] + F[i][2]; + FZ[i][1] = F[i][1] - F[i][2] - F[i][3]; + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j]; + f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; + } + + // Load the bias + b = *(bptr++); + + // Write out the output tile + for (int i = 0; i < cells_i; i++) + { + 
for (int j = 0; j < cells_j; j++) + { + *(outptrs[i][j]++) = f[i][j] + b; + } + } + } +} + +template <> +template <> +const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = +{ + { + Transform::template process_tile<0, 0>, // No padding + Transform::template process_tile<0, 1>, // Right padding + }, + { + Transform::template process_tile<1, 0>, // Bottom padding + Transform::template process_tile<1, 1>, // Bottom and right padding + } +}; + +template struct WinogradGEMM<2, 2, 3, 3>::OutputTransform; +} // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp new file mode 100644 index 0000000000..262f71118c --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" + +namespace winograd +{ + +using Transform = WinogradGEMM<2, 2, 5, 5>::OutputTransform; + +template <> +template <> +int Transform::ops_performed(const Tensor4DShape &shape) +{ + return 0; // TODO +} + +/* F(2x2, 5x5) constructs 2x2 output tiles from a 5x5 convolution. Since we use + * enough tiles to cover the output space each output tile may contain 0 or 1 + * padded values to the right and bottom columns or rows of the tile, e.g.: + * + * ___ ___ + * | | | X| + * |___| |__X| + * + * ___ ___ + * | | | X| + * |X_X| |X_X| + * + * + * We provide a specialised output transform for each of these instances. + * Consequently we below construct an array of the various padding options, the + * array contains pointers to the specific implementations. 
+ */ +template <> +template <> +template +void Transform::process_tile( + const int n_channels, + const float* const matrix_base, + const int matrix_stride, + const float* const biases, + float* const output, + const int output_row_stride, + const int output_col_stride +) +{ + constexpr int cells_i = 2 - pad_bottom; + constexpr int cells_j = 2 - pad_right; + + // Construct a map to the output cells + float *outptrs[cells_i][cells_j]; + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + outptrs[i][j] = output + i*output_row_stride + j*output_col_stride; + } + } + const float *inptr = matrix_base; + const float *bptr = biases; + + // For each channel of the output + int channels_remaining = n_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) + { + // Matrices used and computed during this transform + float32x4_t F[6][6], FZ[6][2], f[2][2], b; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = vld1q_f32(inptr + m*matrix_stride); + } + } + inptr += 4; + + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]); + + // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5]; + FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]); + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); + + // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j]; + f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), 
FZ[5][j]); + } + + // Write out the output tile + b = vld1q_f32(bptr); + bptr += 4; + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b)); + outptrs[i][j] += 4; + } + } + } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used and computed during this transform + float32x2_t F[6][6], FZ[6][2], f[2][2], b; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = vld1_f32(inptr + m*matrix_stride); + } + } + inptr += 2; + + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]); + + // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5]; + FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]); + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); + + // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j]; + f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]); + } + + // Write out the output tile + b = vld1_f32(bptr); + bptr += 2; + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b)); + outptrs[i][j] += 2; + } + } + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) + { + // Matrices used and computed during this transform + float F[6][6], FZ[6][2], f[2][2], b; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 
0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = *(inptr + m*matrix_stride); + } + } + inptr++; + + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5]; + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j]; + } + + // Write out the output tile + b = *(bptr++); + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + *(outptrs[i][j]++) = f[i][j] + b; + } + } + } +} + +template <> +template <> +const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = +{ + { + Transform::template process_tile<0, 0>, // No padding + Transform::template process_tile<0, 1>, // Right padding + }, + { + Transform::template process_tile<1, 0>, // Bottom padding + Transform::template process_tile<1, 1>, // Bottom and right padding + } +}; + +template struct WinogradGEMM<2, 2, 5, 5>::OutputTransform; +} // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp new file mode 100644 index 0000000000..8f47736f0c --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" + +namespace winograd +{ + +using Transform = WinogradGEMM<4, 4, 3, 3>::OutputTransform; + +template <> +template <> +int Transform::ops_performed(const Tensor4DShape &shape) +{ + // NOTE: Cost in FLOPs rather than instructions or uops. + const int tile_M = iceildiv(shape.n_rows, 4); + const int tile_N = iceildiv(shape.n_cols, 4); + return 170 * tile_M * tile_N * shape.n_channels; +} + +// Instantiate cost methods +template int Transform::ops_performed(const Tensor4DShape&); + +/* F(4x4, 3x3) constructs 4x4 output tiles from a 3x3 convolution. 
Since we use + * enough tiles to cover the output space each output tile may contain up to 3 + * padded values to the right and bottom columns or rows of the tile, e.g.: +* +* ________ ________ ________ ________ +* | | | X| | X X| | X X X| +* | | | X| | X X| | X X X| +* | | | X| | X X| | X X X| +* |_______| |______X| |____X_X| |__X_X_X| +* +* ________ ________ ________ ________ +* | | | X| | X X| | X X X| +* | | | X| | X X| | X X X| +* | | | X| | X X| | X X X| +* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| +* +* ________ ________ ________ ________ +* | | | X| | X X| | X X X| +* | | | X| | X X| | X X X| +* |X X X X| |X X X X| |X X X X| |X X X X| +* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| +* +* ________ ________ ________ ________ +* | | | X| | X X| | X X X| +* |X X X X| |X X X X| |X X X X| |X X X X| +* |X X X X| |X X X X| |X X X X| |X X X X| +* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| +* +* +* We provide a specialised output transform for each of these instances. +*/ +template <> +template <> +template +void Transform::process_tile( + const int n_channels, + const float* const matrix_base, + const int matrix_stride, + const float* const biases, + float* const output, + const int output_row_stride, + const int output_col_stride +) +{ + constexpr int cells_i = 4 - pad_bottom; + constexpr int cells_j = 4 - pad_right; + + // Construct a map to the output cells + float *outptrs[cells_i][cells_j]; + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + outptrs[i][j] = output + i*output_row_stride + j*output_col_stride; + } + } + const float *inptr = matrix_base; + const float *bptr = biases; + + // For each channel of the output + int channels_remaining = n_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) + { + // Matrices used and computed during this transform + float32x4_t F[6][6], FZ[6][4], f[4][4], b; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 
6; j++, m++) + { + F[i][j] = vld1q_f32(inptr + m*matrix_stride); + } + } + inptr += 4; + + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]); + + // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4]; + FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f); + + // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4]; + FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f); + + // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5]; + FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]); + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 4; j++) + { + // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); + + // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j]; + f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f); + + // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j]; + f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f); + + // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j]; + f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]); + } + + // Write out the output tile + b = vld1q_f32(bptr); + bptr += 4; + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b)); + outptrs[i][j] += 4; + } + } + } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used and computed during 
this transform + float32x2_t F[6][6], FZ[6][4], f[4][4], b; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = vld1_f32(inptr + m*matrix_stride); + } + } + inptr += 2; + + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]); + + // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4]; + FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f); + + // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4]; + FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f); + + // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5]; + FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]); + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 4; j++) + { + // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); + + // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j]; + f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f); + + // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j]; + f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f); + + // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j]; + f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]); + } + + // Write out the output tile + b = vld1_f32(bptr); + bptr += 2; + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b)); + outptrs[i][j] += 2; + } + } + } +#endif + 
for (; channels_remaining; channels_remaining--) + { + // Matrices used and computed during this transform + float F[6][6], FZ[6][4], f[4][4], b; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = *(inptr + m*matrix_stride); + } + } + inptr++; + + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4]; + FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4]; + FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5]; + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 4; j++) + { + f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j]; + f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j]; + f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j]; + } + + // Write out the output tile + b = *(bptr++); + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + *(outptrs[i][j]++) = f[i][j] + b; + } + } + } +} + +template <> +template <> +const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = +{ + { + Transform::template process_tile<0, 0>, + Transform::template process_tile<0, 1>, + Transform::template process_tile<0, 2>, + Transform::template process_tile<0, 3>, + }, + { + Transform::template process_tile<1, 0>, + Transform::template process_tile<1, 1>, + Transform::template process_tile<1, 2>, + Transform::template process_tile<1, 3>, + }, + { + Transform::template process_tile<2, 0>, + Transform::template process_tile<2, 1>, + Transform::template process_tile<2, 2>, + Transform::template process_tile<2, 3>, + }, + { + Transform::template process_tile<3, 0>, + Transform::template process_tile<3, 1>, + Transform::template process_tile<3, 2>, + 
Transform::template process_tile<3, 3>, + } +}; + +template struct WinogradGEMM<4, 4, 3, 3>::OutputTransform; +} // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_3x3_fp32.cpp new file mode 100644 index 0000000000..6c71461f81 --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_3x3_fp32.cpp @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp" + +namespace winograd +{ + template <> + template <> + void WinogradGEMM<2, 2, 3, 3>::WeightsTransform::execute( + const int n_output_channels, + const int n_input_channels, + const float* const input, + float* const output, + const int matrix_stride, + const int matrix_row_stride + ) + { + constexpr int inner_tile_i = 4; + constexpr int inner_tile_j = 4; + + // Get pointers to each cell of the weight tensor + const auto weight_col_stride = n_input_channels * n_output_channels; + const auto weight_row_stride = 3 * weight_col_stride; + const float *inptrs[3][3]; + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride; + } + } + + // For each input channel + for (int ic = 0; ic < n_input_channels; ic++) + { + float *outptr = output + ic * matrix_row_stride; + + // For each output channel + int channels_remaining = n_output_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) + { + // Matrices used and computed in this kernel + float32x4_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j]; + + // Read weights + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + w[i][j] = vld1q_f32(inptrs[i][j]); + inptrs[i][j] += 4; + } + } + + // Compute the matrix W w + for (int j = 0; j < 3; j++) + { + Ww[0][j] = w[0][j]; + + // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]); + Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f); + + // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]); + Ww[2][j] = vmulq_n_f32(vaddq_f32(vsubq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f); + + Ww[3][j] = w[2][j]; + } + + // Compute V = W w WT + for (int i = 0; i < inner_tile_i; i++) + { + V[i][0] = Ww[i][0]; + 
+ // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]); + V[i][1] = vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f); + + // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]); + V[i][2] = vmulq_n_f32(vaddq_f32(vsubq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f); + + V[i][3] = Ww[i][2]; + } + + // Store the transformed weights + for (int i = 0, m = 0; i < inner_tile_i; i++) + { + for (int j = 0; j < inner_tile_j; j++, m++) + { + vst1q_f32(outptr + m*matrix_stride, V[i][j]); + } + } + outptr += 4; + } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used and computed in this kernel + float32x2_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j]; + + // Read weights + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + w[i][j] = vld1_f32(inptrs[i][j]); + inptrs[i][j] += 2; + } + } + + // Compute the matrix W w + for (int j = 0; j < 3; j++) + { + Ww[0][j] = w[0][j]; + + // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]); + Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), 0.5f); + + // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]); + Ww[2][j] = vmul_n_f32(vadd_f32(vsub_f32(w[0][j], w[1][j]), w[2][j]), 0.5f); + + Ww[3][j] = w[2][j]; + } + + // Compute V = W w WT + for (int i = 0; i < inner_tile_i; i++) + { + V[i][0] = Ww[i][0]; + + // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]); + V[i][1] = vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f); + + // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]); + V[i][2] = vmul_n_f32(vadd_f32(vsub_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f); + + V[i][3] = Ww[i][2]; + } + + // Store the transformed weights + for (int i = 0, m = 0; i < inner_tile_i; i++) + { + for (int j = 0; j < inner_tile_j; j++, m++) + { + vst1_f32(outptr + m*matrix_stride, V[i][j]); + } + } + outptr += 2; + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) + { + // Matrices used and computed in this 
kernel + float w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j]; + + // Read weights + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + w[i][j] = *(inptrs[i][j]++); + } + } + + // Compute the matrix W w + for (int j = 0; j < 3; j++) + { + Ww[0][j] = w[0][j]; + Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]); + Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]); + Ww[3][j] = w[2][j]; + } + + // Compute V = W w WT + for (int i = 0; i < inner_tile_i; i++) + { + V[i][0] = Ww[i][0]; + V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]); + V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]); + V[i][3] = Ww[i][2]; + } + + // Store the transformed weights + for (int i = 0, m = 0; i < inner_tile_i; i++) + { + for (int j = 0; j < inner_tile_j; j++, m++) + { + *(outptr + m*matrix_stride) = V[i][j]; + } + } + outptr++; + } + } + } + + template <> + template <> + int WinogradGEMM<2, 2, 3, 3>::WeightsTransform::ops_performed(const KernelShape &shape) + { + const int channel_prod = shape.n_input_channels * shape.n_output_channels; + return 2 * 18 * channel_prod; + } + + template struct WinogradGEMM<2, 2, 3, 3>::WeightsTransform; +} // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp new file mode 100644 index 0000000000..2f4f6e1ba2 --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp" + +namespace winograd +{ + template <> + template <> + void WinogradGEMM<2, 2, 5, 5>::WeightsTransform::execute( + const int n_output_channels, + const int n_input_channels, + const float* const input, + float* const output, + const int matrix_stride, + const int matrix_row_stride + ) + { + // Get pointers to each cell of the weight tensor + const auto weight_col_stride = n_input_channels * n_output_channels; + const auto weight_row_stride = 5 * weight_col_stride; + const float *inptrs[5][5]; + for (int i = 0; i < 5; i++) + { + for (int j = 0; j < 5; j++) + { + inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride; + } + } + + // For each input channel + for (int ic = 0; ic < n_input_channels; ic++) + { + float *outptr = output + ic * matrix_row_stride; + + // For each output channel + int channels_remaining = n_output_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) + { + // Matrices used and computed in this kernel + float32x4_t w[5][5], Ww[6][5], V[6][6]; + + // Read weights + for (int i = 0; i < 5; i++) + { + for (int j = 0; j < 5; j++) + { + w[i][j] = vld1q_f32(inptrs[i][j]); + inptrs[i][j] += 4; + } + } + + // Compute the matrix W w + for (int j = 0; j < 5; j++) + { + // Ww[0][j] = w[0][j]/4.0f; + Ww[0][j] = vmulq_n_f32(w[0][j], 1.0f/4.0f); + + // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f; + Ww[1][j] = vmulq_n_f32( + vaddq_f32( + vaddq_f32( + vaddq_f32(w[1][j], w[0][j]), + vaddq_f32(w[3][j], w[2][j]) + ), + w[4][j] + ), + -1.0f/6.0f + ); + + // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f; + // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f; + Ww[2][j] = vmulq_n_f32( + vsubq_f32( + vaddq_f32( + 
vsubq_f32(w[1][j], w[0][j]), + vsubq_f32(w[3][j], w[2][j]) + ), + w[4][j] + ), + 1.0f/6.0f + ); + + // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f; + Ww[3][j] = vmulq_n_f32( + vmlaq_n_f32( + vaddq_f32( + vaddq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)), + vaddq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j]) + ), + w[4][j], 2.0f + ), + 1.0f/3.0f + ); + + // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f; + Ww[4][j] = vmulq_n_f32( + vmlaq_n_f32( + vaddq_f32( + vsubq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)), + vsubq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j]) + ), + w[4][j], 2.0f + ), + 1.0f/3.0f + ); + + // Ww[5][j] = w[4][j]; + Ww[5][j] = w[4][j]; + } + + // Compute V = W w WT + for (int i = 0; i < 6; i++) + { + // V[i][0] = Ww[i][0]/4.0f; + V[i][0] = vmulq_n_f32(Ww[i][0], 1.0f/4.0f); + + // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f; + V[i][1] = vmulq_n_f32( + vaddq_f32( + vaddq_f32( + vaddq_f32(Ww[i][1], Ww[i][0]), + vaddq_f32(Ww[i][3], Ww[i][2]) + ), + Ww[i][4] + ), + -1.0f/6.0f + ); + + // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f; + // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f; + V[i][2] = vmulq_n_f32( + vsubq_f32( + vaddq_f32( + vsubq_f32(Ww[i][1], Ww[i][0]), + vsubq_f32(Ww[i][3], Ww[i][2]) + ), + Ww[i][4] + ), + 1.0f/6.0f + ); + + // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f; + V[i][3] = vmulq_n_f32( + vmlaq_n_f32( + vaddq_f32( + vaddq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)), + vaddq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3]) + ), + Ww[i][4], 2.0f + ), + 1.0f/3.0f + ); + + // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f; + V[i][4] = vmulq_n_f32( + vmlaq_n_f32( + vaddq_f32( + vsubq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), 
vmulq_n_f32(Ww[i][1], 1.0f/4.0f)), + vsubq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3]) + ), + Ww[i][4], 2.0f + ), + 1.0f/3.0f + ); + + // V[i][5] = Ww[i][4]; + V[i][5] = Ww[i][4]; + } + + // Store the transformed weights + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + vst1q_f32(outptr + m*matrix_stride, V[i][j]); + } + } + outptr += 4; + } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used and computed in this kernel + float32x2_t w[5][5], Ww[6][5], V[6][6]; + + // Read weights + for (int i = 0; i < 5; i++) + { + for (int j = 0; j < 5; j++) + { + w[i][j] = vld1_f32(inptrs[i][j]); + inptrs[i][j] += 2; + } + } + + // Compute the matrix W w + for (int j = 0; j < 5; j++) + { + // Ww[0][j] = w[0][j]/4.0f; + Ww[0][j] = vmul_n_f32(w[0][j], 1.0f/4.0f); + + // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f; + Ww[1][j] = vmul_n_f32( + vadd_f32( + vadd_f32( + vadd_f32(w[1][j], w[0][j]), + vadd_f32(w[3][j], w[2][j]) + ), + w[4][j] + ), + -1.0f/6.0f + ); + + // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f; + // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f; + Ww[2][j] = vmul_n_f32( + vsub_f32( + vadd_f32( + vsub_f32(w[1][j], w[0][j]), + vsub_f32(w[3][j], w[2][j]) + ), + w[4][j] + ), + 1.0f/6.0f + ); + + // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f; + Ww[3][j] = vmul_n_f32( + vmla_n_f32( + vadd_f32( + vadd_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)), + vadd_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j]) + ), + w[4][j], 2.0f + ), + 1.0f/3.0f + ); + + // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f; + Ww[4][j] = vmul_n_f32( + vmla_n_f32( + vadd_f32( + vsub_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)), + vsub_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j]) + ), + w[4][j], 2.0f + ), + 
1.0f/3.0f + ); + + // Ww[5][j] = w[4][j]; + Ww[5][j] = w[4][j]; + } + + // Compute V = W w WT + for (int i = 0; i < 6; i++) + { + // V[i][0] = Ww[i][0]/4.0f; + V[i][0] = vmul_n_f32(Ww[i][0], 1.0f/4.0f); + + // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f; + V[i][1] = vmul_n_f32( + vadd_f32( + vadd_f32( + vadd_f32(Ww[i][1], Ww[i][0]), + vadd_f32(Ww[i][3], Ww[i][2]) + ), + Ww[i][4] + ), + -1.0f/6.0f + ); + + // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f; + // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f; + V[i][2] = vmul_n_f32( + vsub_f32( + vadd_f32( + vsub_f32(Ww[i][1], Ww[i][0]), + vsub_f32(Ww[i][3], Ww[i][2]) + ), + Ww[i][4] + ), + 1.0f/6.0f + ); + + // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f; + V[i][3] = vmul_n_f32( + vmla_n_f32( + vadd_f32( + vadd_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)), + vadd_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3]) + ), + Ww[i][4], 2.0f + ), + 1.0f/3.0f + ); + + // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f; + V[i][4] = vmul_n_f32( + vmla_n_f32( + vadd_f32( + vsub_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)), + vsub_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3]) + ), + Ww[i][4], 2.0f + ), + 1.0f/3.0f + ); + + // V[i][5] = Ww[i][4]; + V[i][5] = Ww[i][4]; + } + + // Store the transformed weights + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + vst1_f32(outptr + m*matrix_stride, V[i][j]); + } + } + outptr += 2; + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) + { + // Matrices used and computed in this kernel + float w[5][5], Ww[6][5], V[6][6]; + + // Read weights + for (int i = 0; i < 5; i++) + { + for (int j = 0; j < 5; j++) + { + w[i][j] = *(inptrs[i][j]++); + } + } + + // Compute the matrix W w + for (int j = 0; j < 5; j++) + { + Ww[0][j] = 
w[0][j]/4.0f; + Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f; + Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f; + Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f; + Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f; + Ww[5][j] = w[4][j]; + } + + // Compute V = W w WT + for (int i = 0; i < 6; i++) + { + V[i][0] = Ww[i][0]/4.0f; + V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f; + V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f; + V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f; + V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f; + V[i][5] = Ww[i][4]; + } + + // Store the transformed weights + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + *(outptr + m*matrix_stride) = V[i][j]; + } + } + outptr++; + } + } + } + + template <> + template <> + int WinogradGEMM<2, 2, 5, 5>::WeightsTransform::ops_performed(const KernelShape &shape) + { + return 0; // TODO + } + + template class WinogradGEMM<2, 2, 5, 5>::WeightsTransform; +} // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_4x4_3x3_fp32.cpp new file mode 100644 index 0000000000..a56a475fc9 --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/transforms/weights_4x4_3x3_fp32.cpp @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp" + +namespace winograd +{ + /* Float implementation for kernel transform F(4x4, 3x3) */ + template <> + template <> + void WinogradGEMM<4, 4, 3, 3>::WeightsTransform::execute( + const int n_output_channels, + const int n_input_channels, + const float* const input, // NOTE: Data in HWIO order + float* const output, + const int matrix_stride, + const int matrix_row_stride + ) + { + // Get pointers to each cell of the weight tensor + const auto weight_col_stride = n_input_channels * n_output_channels; + const auto weight_row_stride = 3 * weight_col_stride; + const float *inptrs[3][3]; + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride; + } + } + + // For each input channel + for (int ic = 0; ic < n_input_channels; ic++) + { + float *outptr = output + ic * matrix_row_stride; + + // For each output channel + int channels_remaining = n_output_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) + { + // Matrices used and computed in this kernel + float32x4_t w[3][3], Ww[6][3], V[6][6]; + + // Read weights + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + w[i][j] = vld1q_f32(inptrs[i][j]); + inptrs[i][j] += 4; + } + } + + // Compute the matrix W w + for (int j = 0; j < 3; j++) + { + // Ww[0][j] = 6*w[0][j]; + Ww[0][j] = vmulq_n_f32(w[0][j], 6.0); + + // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j]; + Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), -4.0); + + // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j]; + Ww[2][j] = vmulq_n_f32(vsubq_f32(vsubq_f32(w[1][j], w[0][j]), w[2][j]), 4.0); + + // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j]; + Ww[3][j] = 
vmlaq_n_f32(vmlaq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f); + + // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j]; + Ww[4][j] = vmlaq_n_f32(vmlsq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f); + + // Ww[5][j] = 24*w[2][j]; + Ww[5][j] = vmulq_n_f32(w[2][j], 24.0f); + } + + // Compute V = W w WT + for (int i = 0; i < 6; i++) + { + const float recip576 = 1.0f / 576.0f; + + // V[i][0] = 6*Ww[i][0]; + V[i][0] = vmulq_n_f32(vmulq_n_f32(Ww[i][0], 6.0), recip576); + + // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2]; + V[i][1] = vmulq_n_f32(vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576); + + // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2]; + V[i][2] = vmulq_n_f32(vmulq_n_f32(vsubq_f32(vsubq_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576); + + // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2]; + V[i][3] = vmulq_n_f32(vmlaq_n_f32(vmlaq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576); + + // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2]; + V[i][4] = vmulq_n_f32(vmlaq_n_f32(vmlsq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576); + + // V[i][5] = 24*Ww[i][2]; + V[i][5] = vmulq_n_f32(vmulq_n_f32(Ww[i][2], 24.0f), recip576); + } + + // Store the transformed weights + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + vst1q_f32(outptr + m*matrix_stride, V[i][j]); + } + } + outptr += 4; + } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used and computed in this kernel + float32x2_t w[3][3], Ww[6][3], V[6][6]; + + // Read weights + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + w[i][j] = vld1_f32(inptrs[i][j]); + inptrs[i][j] += 2; + } + } + + // Compute the matrix W w + for (int j = 0; j < 3; j++) + { + // Ww[0][j] = 6*w[0][j]; + Ww[0][j] = vmul_n_f32(w[0][j], 6.0); + + // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j]; + Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), 
w[2][j]), -4.0); + + // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j]; + Ww[2][j] = vmul_n_f32(vsub_f32(vsub_f32(w[1][j], w[0][j]), w[2][j]), 4.0); + + // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j]; + Ww[3][j] = vmla_n_f32(vmla_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f); + + // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j]; + Ww[4][j] = vmla_n_f32(vmls_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f); + + // Ww[5][j] = 24*w[2][j]; + Ww[5][j] = vmul_n_f32(w[2][j], 24.0f); + } + + // Compute V = W w WT + for (int i = 0; i < 6; i++) + { + const float recip576 = 1.0f / 576.0f; + + // V[i][0] = 6*Ww[i][0]; + V[i][0] = vmul_n_f32(vmul_n_f32(Ww[i][0], 6.0), recip576); + + // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2]; + V[i][1] = vmul_n_f32(vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576); + + // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2]; + V[i][2] = vmul_n_f32(vmul_n_f32(vsub_f32(vsub_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576); + + // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2]; + V[i][3] = vmul_n_f32(vmla_n_f32(vmla_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576); + + // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2]; + V[i][4] = vmul_n_f32(vmla_n_f32(vmls_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576); + + // V[i][5] = 24*Ww[i][2]; + V[i][5] = vmul_n_f32(vmul_n_f32(Ww[i][2], 24.0f), recip576); + } + + // Store the transformed weights + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + vst1_f32(outptr + m*matrix_stride, V[i][j]); + } + } + outptr += 2; + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) + { + // Matrices used and computed in this kernel + float w[3][3], Ww[6][3], V[6][6]; + + // Read weights + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + w[i][j] = *(inptrs[i][j]++); + } + } + + // Compute the matrix W w + for (int j = 0; j < 3; j++) + { + Ww[0][j] = 6*w[0][j]; + Ww[1][j] = -4*w[0][j] + 
-4*w[1][j] + -4*w[2][j]; + Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j]; + Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j]; + Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j]; + Ww[5][j] = 24*w[2][j]; + } + + // Compute V = W w WT + for (int i = 0; i < 6; i++) + { + V[i][0] = ( 6*Ww[i][0]) / 576.0; + V[i][1] = (-4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2]) / 576.0; + V[i][2] = (-4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2]) / 576.0; + V[i][3] = ( 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2]) / 576.0; + V[i][4] = ( 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2]) / 576.0; + V[i][5] = (24*Ww[i][2]) / 576.0; + } + + // Store the transformed weights + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + *(outptr + m*matrix_stride) = V[i][j]; + } + } + outptr++; + } + } + } + + template <> + template <> + int WinogradGEMM<4, 4, 3, 3>::WeightsTransform::ops_performed(const KernelShape &shape) + { + const int channel_prod = shape.n_input_channels * shape.n_output_channels; + return 9 * 16 * channel_prod; + } + + template struct WinogradGEMM<4, 4, 3, 3>::WeightsTransform; +} diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp new file mode 100644 index 0000000000..8f8cd250bf --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp @@ -0,0 +1,569 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.hpp" + +using namespace winograd; + +/** Get the output shape of a convolution. */ +template +template +Tensor4DShape WinogradGEMM::Convolution::get_output_shape( + const KernelShape &kernel_shape, + const Tensor4DShape &in_shape, + const PaddingType padding +) +{ + return Tensor4DShape { + in_shape.n_batches, + (padding == PADDING_SAME) ? in_shape.n_rows : in_shape.n_rows - (kernel_rows - 1), + (padding == PADDING_SAME) ? in_shape.n_cols : in_shape.n_cols - (kernel_cols - 1), + kernel_shape.n_output_channels, + in_shape.ordering + }; +} + +/* Get the memory required to transform the kernel. 
+ */ +template +template +size_t WinogradGEMM::Convolution::get_kernel_transform_working_size(const KernelShape &shape) +{ + if (shape.ordering == HWIO) + { + // Kernel is already in the correct order, so no additional memory is + // required. + return 0; + } + else + { + // Need to re-order the kernel into HWIO form, require enough space to + // represent the tensor. + return sizeof(TIn) * shape.size(); + } +} + +/** Get the memory required to store the kernel transformed into the + * Winograd domain. + */ +template +template +size_t WinogradGEMM::Convolution::get_kernel_storage_size(const KernelShape &shape) +{ + return N_GEMMS * get_kernel_matrix_size(shape); +} + + +template +template +size_t WinogradGEMM::Convolution::get_input_storage_size( + const KernelShape &kernel_shape, + const Tensor4DShape &input_shape, + const PaddingType padding +) +{ + return N_GEMMS * get_input_matrix_size(kernel_shape, input_shape, padding); +} + + +template +template +size_t WinogradGEMM::Convolution::get_output_storage_size( + const KernelShape &kernel_shape, + const Tensor4DShape &input_shape, + const PaddingType padding +) +{ + return N_GEMMS * get_output_matrix_size(kernel_shape, input_shape, padding); +} + + +/** Get the memory required to apply a Winograd operator to some input. + */ +template +template +size_t WinogradGEMM::Convolution::get_working_space_size( + const KernelShape &kernel_shape, + const Tensor4DShape &input_shape, + const PaddingType padding_type +) +{ + const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type); + + // Get the memory required to store the matrices + const size_t matrix_sizes = N_GEMMS * ( + get_input_matrix_size(kernel_shape, input_shape, padding_type) + + get_output_matrix_size(kernel_shape, input_shape, padding_type) + ); + + // Add additional space to re-order the input and output if the input tensor + // is not in NHWC format. 
+ if (input_shape.ordering == NHWC) + { + return matrix_sizes; // No extra spacing required + } + else // NCHW, must reorder the input and output tensors + { + // We only need to re-order the input or output at any one time, so request + // enough memory to do the largest of these. + const size_t extra_memory = std::max( + sizeof(TIn) * input_shape.size(), + sizeof(TOut) * output_shape.size() + ); + return matrix_sizes + extra_memory; + } +} + + +/* Get the memory required by a single "input" matrix. + */ +template +template +size_t WinogradGEMM::Convolution::get_input_matrix_size( + const KernelShape &kernel_shape, + const Tensor4DShape &input_shape, + const PaddingType padding_type +) +{ + return get_input_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TIn); +} + +template +template +int WinogradGEMM::Convolution::get_input_matrix_stride( + const KernelShape &kernel_shape, + const Tensor4DShape &input_shape, + const PaddingType padding_type +) +{ + // Compute shape for the GEMM + const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type); + const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows); + const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols); + const int M = roundup(input_shape.n_batches * tile_rows * tile_cols, M_BLOCK); + const int K = kernel_shape.n_input_channels; + + return M * K; +} + + +/* Get the memory required by a single "output" matrix. 
+ */ +template +template +size_t WinogradGEMM::Convolution::get_output_matrix_size( + const KernelShape &kernel_shape, + const Tensor4DShape &input_shape, + const PaddingType padding_type +) +{ + return get_output_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TOut); +} + + +template +template +int WinogradGEMM::Convolution::get_output_matrix_stride( + const KernelShape &kernel_shape, + const Tensor4DShape &input_shape, + const PaddingType padding_type +) +{ + // Compute shape for the GEMM + const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type); + const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows); + const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols); + const int M = roundup(tile_rows * tile_cols, M_BLOCK); + const int N = roundup(kernel_shape.n_output_channels, N_BLOCK); + + return input_shape.n_batches * M * N; +} + + +/* Get the memory required by a single "kernel" matrix. + */ +template +template +size_t WinogradGEMM::Convolution::get_kernel_matrix_size(const KernelShape &shape) +{ + return sizeof(TIn) * get_kernel_matrix_stride(shape); +} + +template +template +int WinogradGEMM::Convolution::get_kernel_matrix_stride(const KernelShape &shape) +{ + const int K = shape.n_input_channels; + const int N = roundup(shape.n_output_channels, N_BLOCK); + return K * N; +} + + +/** Create a new Winograd operator. */ +template +template +WinogradGEMM::Convolution::Convolution( + const KernelShape &kernel_shape, + const Tensor4DShape &input_shape, + const PaddingType padding, + void *kernel_storage +) : kernel_shape(kernel_shape), // Store the kernel shape + kernel_matrix_row_stride(roundup(kernel_shape.n_output_channels, N_BLOCK)), + manage_kernel_storage(kernel_storage == NULL), + _kernel_storage(manage_kernel_storage ? 
+ ALLOCATE(get_kernel_storage_size(kernel_shape)) : + kernel_storage), + input_shape(input_shape), + padding(padding), + output_shape(get_output_shape(kernel_shape, input_shape, padding)), + tile_rows(iceildiv(output_shape.n_rows, output_tile_rows)), + tile_cols(iceildiv(output_shape.n_cols, output_tile_cols)), + M(input_shape.n_batches * tile_rows * tile_cols), + K(kernel_shape.n_input_channels), + N(kernel_shape.n_output_channels), + prof() +{ + // Create pointers to the kernel matrices + const int kernel_matrix_size_bytes = get_kernel_matrix_size(kernel_shape); + int8_t* const ks_bytes = reinterpret_cast(_kernel_storage); + for (int i = 0; i < N_GEMMS; i++) { + kernel_matrices[i] = reinterpret_cast( + ks_bytes + i*kernel_matrix_size_bytes); + } +} + + +/** Create a new Winograd operator and initialise the weights. */ +template +template +WinogradGEMM::Convolution::Convolution( + const KernelShape &kernel_shape, + const Tensor4DShape &input_shape, + const PaddingType padding, + const TIn* const kernel, + void *kernel_storage, + void *transform_working_space +) : Convolution(kernel_shape, input_shape, padding, kernel_storage) +{ + transform_weights(kernel, transform_working_space); +} + + +/** Clean up a convolution engine. */ +template +template +WinogradGEMM:: +Convolution::~Convolution() +{ + // If we were responsible for managing kernel storage ensure that it is + // freed. + if (manage_kernel_storage) + { + free(_kernel_storage); + } +} + + +/** Transform weights into the Winograd domain and store them for later use/reuse. 
*/ +template +template +template +void WinogradGEMM:: +Convolution::transform_weights( + const TIn* const kernel, + void *transform_working_space +) +{ + // Allocate working space if it is required + bool allocated_working_space = false; + if (transform_working_space == NULL && // If no memory has been provided + get_kernel_transform_working_size(kernel_shape) != 0) // And we need the space + { + allocated_working_space = true; + transform_working_space = ALLOCATE( + get_kernel_transform_working_size(kernel_shape) + ); + } + + // The transformation methods only work on weights laid out in HWIO form, if + // the weights are not in this form then we need to re-order them. + const TIn *kernel_hwio = kernel; + if (kernel_shape.ordering != HWIO) + { + kernel_hwio = reinterpret_cast(transform_working_space); + + // Re-order the weights from OIHW to HWIO + this->prof( + "Weight reorder", + [&kernel, &kernel_hwio, this] () { + reorder::ofm_ifm_h_w_to_h_w_ifm_ofm( + kernel, const_cast(kernel_hwio), + kernel_shape.n_output_channels, + kernel_shape.n_input_channels, + kernel_shape.n_rows, + kernel_shape.n_cols + ); + }, + kernel_shape.size() * sizeof(TIn), + 0, + kernel_shape.size() * sizeof(TIn) + ); + } + + const int kernel_matrix_size_bytes = get_kernel_matrix_size(kernel_shape); + WeightsTransformT weights_transform( + kernel_hwio, kernel_matrices[0], + kernel_matrix_size_bytes / sizeof(TIn), + kernel_matrix_row_stride, + kernel_shape.n_output_channels, + kernel_shape.n_input_channels + ); + + // Transform the weights into the Winograd domain + auto kernel_prep = [&] () + { + weights_transform.run(0, weights_transform.get_window()); + }; + + prof( + "Kernel Prep", kernel_prep, + WeightsTransformT::bytes_read(kernel_shape), + WeightsTransformT::ops_performed(kernel_shape), + WeightsTransformT::bytes_written(kernel_shape) + ); + + // Free memory if we allocated it + if (allocated_working_space) + { + free(transform_working_space); + } +} + + +/** Perform a convolution. 
*/ +template +template +void WinogradGEMM:: +Convolution::execute( + TOut* const output, + const TIn* const input, + const TOut* const biases, + void *working_space, + const int n_threads +) +{ + const auto padding_type = padding; + const auto input_shape = this->input_shape; + + // Allocate working space if none has been provided + const bool manage_working_space = (working_space == NULL); + if (manage_working_space) + { + const size_t ws_size = get_working_space_size( + kernel_shape, input_shape, padding_type + ); + working_space = ALLOCATE(ws_size * sizeof(int8_t)); + memset(working_space, 0x00, ws_size); + } + int8_t* const ws_bytes = reinterpret_cast(working_space); + + // Split the working space into that required for 16 input matrices and + // output matrices. + TIn *input_matrices[N_GEMMS]; + TOut *output_matrices[N_GEMMS]; + const int in_matrix_stride_bytes = get_input_matrix_size(kernel_shape, input_shape, padding_type); + const int out_matrix_stride_bytes = get_output_matrix_size(kernel_shape, input_shape, padding_type); + + for (int i = 0; i < N_GEMMS; i++) + { + input_matrices[i] = reinterpret_cast( + ws_bytes + i*in_matrix_stride_bytes); + output_matrices[i] = reinterpret_cast( + ws_bytes + N_GEMMS*in_matrix_stride_bytes + i*out_matrix_stride_bytes); + } + + // If we need to re-order the input and output tensors then the final chunk + // of the working space can be used for this purpose. + // TODO - Overlay the input reorder on top of the output matrices + // - Overlay the output reorder on top of the input matrices + // Reorder the input input form if it was not provided in this ordering. 
+ const TIn* input_nhwc = input; + if (input_shape.ordering == NCHW) + { + input_nhwc = reinterpret_cast( + ws_bytes + N_GEMMS*(in_matrix_stride_bytes + out_matrix_stride_bytes) + ); + + this->prof( + "NCHW -> NHWC", + [input, input_shape, input_nhwc] () { + reorder::nchw_to_nhwc( + input, const_cast(input_nhwc), + input_shape.n_batches, + input_shape.n_channels, + input_shape.n_rows, + input_shape.n_cols + ); + }, + input_shape.size(), 0, input_shape.size() + ); + } + + // Compute shape for the GEMM + const auto output_shape = this->output_shape; + int M = this->M; + int K = this->K; + int N = this->N; + + const int in_matrix_row_stride = K; + const int out_matrix_row_stride = kernel_matrix_row_stride; + + InputTransform input_transform( + input_nhwc, + input_shape.n_batches, + input_shape.n_rows, + input_shape.n_cols, + input_shape.n_channels, + padding_type, + input_matrices[0], + in_matrix_stride_bytes / sizeof(TIn), + in_matrix_row_stride + ); + + // Transform the input into the Winograd domain + auto input_prep = [&] () { + input_transform.run(0, input_transform.get_window()); + }; + prof( + "Input Prep", input_prep, + InputTransform::bytes_read(input_shape), + InputTransform::ops_performed(input_shape), + InputTransform::bytes_written(input_shape) + ); + + // Perform the GEMMs + const int kernel_matrix_stride_bytes = get_kernel_matrix_size(kernel_shape); + BatchedBlockedGemm gemms( + N_GEMMS, M, K, N, + in_matrix_stride_bytes / sizeof(TIn), + in_matrix_row_stride, + kernel_matrix_stride_bytes / sizeof(TIn), + kernel_matrix_row_stride, + out_matrix_stride_bytes / sizeof(TOut), + out_matrix_row_stride, + input_matrices[0], + kernel_matrices[0], + output_matrices[0] + ); + for (unsigned int i = 0; i < gemms.get_window(); i++) + { + auto run_gemm = [&] () { gemms.run(i, i+1); }; + prof("GEMM", run_gemm, 0, 0, 0); + } + + // If the output tensor needs to be in NCHW form then store the NHWC output + // tensor in temporary storage and then reorder. 
If the output tensor needs + // to be in NHWC then just write straight to the output tensor. + TOut *output_nhwc = output; + if (input_shape.ordering == NCHW) + { + output_nhwc = reinterpret_cast( + ws_bytes + N_GEMMS*(in_matrix_stride_bytes + out_matrix_stride_bytes) + ); + } + + // Transform the output tensor from the Winograd domain to the spatial + // domain. + OutputTransform output_transform( + output_matrices[0], + out_matrix_stride_bytes / sizeof(TOut), + out_matrix_row_stride, + biases, + output_nhwc, + output_shape.n_batches, + output_shape.n_rows, + output_shape.n_cols, + output_shape.n_channels + ); + auto output_prep = [&] () { + output_transform.run(0, output_transform.get_window()); + }; + prof( + "Output Comp", output_prep, + OutputTransform::bytes_read(output_shape), + OutputTransform::ops_performed(output_shape), + OutputTransform::bytes_written(output_shape) + ); + + // Reorder the output tensor if it is required to be in NCHW form. + if (input_shape.ordering == NCHW) + { + prof( + "NHWC -> NCHW", + [output_nhwc, output_shape, output] () { + reorder::nhwc_to_nchw( + output_nhwc, output, + output_shape.n_batches, + output_shape.n_rows, + output_shape.n_cols, + output_shape.n_channels + ); + }, + output_shape.size(), 0, output_shape.size() + ); + } + + // Free working space if we were responsible for allocating it + if (manage_working_space) + { + free(working_space); + } +} + + +/** Perform a convolution. 
*/ +template +template +void WinogradGEMM:: +Convolution::execute( + TOut* const output, + const TIn* const input, + const TOut* const biases, + const int n_threads +) +{ + execute(output, input, biases, NULL, n_threads); +} + + +// Instantiate required implementations +template class WinogradGEMM<2, 2, 3, 3>::Convolution; +template class WinogradGEMM<4, 4, 3, 3>::Convolution; + +template class WinogradGEMM<2, 2, 5, 5>::Convolution; diff --git a/src/core/NEON/kernels/winograd/batched_blocked_gemm.cpp b/src/core/NEON/kernels/winograd/batched_blocked_gemm.cpp deleted file mode 100644 index 52c2db866a..0000000000 --- a/src/core/NEON/kernels/winograd/batched_blocked_gemm.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "batched_blocked_gemm.hpp" -#include "gemm.hpp" -using namespace winograd; - -template -BatchedBlockedGemm::BatchedBlockedGemm( - const unsigned int n_gemms, - const int M, const int K, const int N, - const int a_matrix_stride, - const int a_row_stride, - const int b_matrix_stride, - const int b_row_stride, - const int c_matrix_stride, - const int c_row_stride, - const TIn* const a_ptr, - const TIn* const b_ptr, - TOut* const c_ptr -) : n_gemms(n_gemms), M(M), N(N), K(K), - a_matrix_stride(a_matrix_stride), - a_row_stride(a_row_stride), - b_matrix_stride(b_matrix_stride), - b_row_stride(b_row_stride), - c_matrix_stride(c_matrix_stride), - c_row_stride(c_row_stride), - a_ptr(a_ptr), b_ptr(b_ptr), c_ptr(c_ptr) -{ -} - -template -unsigned int BatchedBlockedGemm::get_window() const -{ - return n_gemms; -} - -template -void BatchedBlockedGemm::run( - const unsigned int start, const unsigned int stop -) -{ - // Perform the specified GEMMs - for (unsigned int i = start; i < stop; i++) - { - // Get pointers to the relevant matrices - const TIn* const mtr_a = a_ptr + i*a_matrix_stride; - const TIn* const mtr_b = b_ptr + i*b_matrix_stride; - TOut* const mtr_c = c_ptr + i*c_matrix_stride; - - // Perform the GEMM - BlockedGemm( - mtr_a, mtr_b, mtr_c, M, K, N, - a_row_stride, b_row_stride, c_row_stride - ); - } -} - -template class winograd::BatchedBlockedGemm<4, 16, float, float>; - diff --git a/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3_fp32.cpp deleted file mode 100644 index 381ae92182..0000000000 --- a/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3_fp32.cpp +++ /dev/null @@ -1,409 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "transforms/input.hpp" -#include "winograd_gemm.hpp" -#include "arm.hpp" - -namespace winograd -{ - -using Transform = WinogradGEMM<2, 2, 3, 3>::InputTransform; - -/****************************************************************************** - * Cost methods for the input transform. - * ===================================== - */ -template <> -template <> -int Transform::ops_performed(const Tensor4DShape &input_shape) -{ - // NOTE: Cost in FLOPs rather than instructions or uops. 
- const int tile_M = iceildiv(input_shape.n_rows, inner_tile_rows); - const int tile_N = iceildiv(input_shape.n_cols, inner_tile_cols); - return 16 * 16 * tile_M * tile_N * input_shape.n_channels; -} -/*****************************************************************************/ - -/***************************************************************************** -* F(2x2, 3x3) implies the use of a 4x4 input tile. Such tiles can require a -* variety of padding types. For example, tiles at the top and left of an image -* can require one row or column of padding on their top and left sides if the -* padding type is SAME (where X represents a padded value): -* -* _______ _______ -* |X X X X| |X X X X| -* |X | | | . . . -* |X | | | -* |X______| |_______| -* _______ -* |X | . -* |X | . . . . -* |X | . -* |X______| -* -* For tiles near the right or bottom of the image it is more complicated. Such -* tiles might require padding by 0 or 1 rows or columns if the padding type is -* VALID or 1 or 2 rows or columns if the padding type is SAME: -* -* _______ _______ _______ _______ -* |X X X X| |X X X X| |X X X X| |X X X X| -* |X | | | | X| | X X| -* |X | | | | X| | X X| -* |X______| |_______| |______X| |____X_X| -* _______ _______ _______ _______ -* |X | | | | X| | X X| -* |X | | | | X| | X X| -* |X | | | | X| | X X| -* |X______| |_______| |______X| |____X_X| -* _______ _______ _______ _______ -* |X | | | | X| | X X| -* |X | | | | X| | X X| -* |X | | | | X| | X X| -* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| -* _______ _______ _______ _______ -* |X | | | | X| | X X| -* |X | | | | X| | X X| -* |X X X X| |X X X X| |X X X X| |X X X X| -* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| -* -* Additional tiles are required for especially small input images. -* -* Build an array of the specialised methods that deal with each of the -* different padding combinations which may be required. 
These padding -* constraints are the space: -* -* Padding top in {0, 1} -* Padding left in {0, 1} -* Padding bottom in {0, 1, 2} -* Padding right in {0, 1, 2} -*/ -template <> -template <> -template -void Transform::process_tile( - int n_channels, - const float* const input_base, - const int input_row_stride, - const int input_col_stride, - float* const matrix_base, - const int matrix_stride -) -{ - constexpr int inner_tile_i = 4, inner_tile_j = 4; - constexpr int cells_i = inner_tile_i - pad_bottom; - constexpr int cells_j = inner_tile_i - pad_right; - - float *outptr = matrix_base; - - // Get pointers into the input tile - const float *x_ptrs[inner_tile_i][inner_tile_j]; - for (int i = pad_top, xi = 0; i < cells_i; i++, xi++) - { - // Get a pointer into the row - const float* const row_ptr = input_base + xi*input_row_stride; - - for (int j = pad_left, xj = 0; j < cells_j; j++, xj++) - { - x_ptrs[i][j] = row_ptr + xj*input_col_stride; - } - } - - // Matrices used/computed in this kernel. - float x[inner_tile_i][inner_tile_j]; - float XTx[inner_tile_i][inner_tile_j]; - float U[inner_tile_i][inner_tile_j]; - - for (int i = 0; i < inner_tile_i; i++) - { - for (int j = 0; j < inner_tile_j; j++) - { - x[i][j] = XTx[i][j] = 0.0f; - } - } - - // Perform the Winograd input transformation for each channel in the input - // tensor. - int channels_remaining = n_channels; -#ifdef __aarch64__ - for (; channels_remaining >= 4; channels_remaining -= 4) - { - // Matrices used/computed in this kernel. 
- float32x4_t x[inner_tile_i][inner_tile_j]; - float32x4_t XTx[inner_tile_i][inner_tile_j]; - float32x4_t U[inner_tile_i][inner_tile_j]; - - for (int i = 0; i < inner_tile_i; i++) - { - for (int j = 0; j < inner_tile_j; j++) - { - x[i][j] = vdupq_n_f32(0.0f); - XTx[i][j] = vdupq_n_f32(0.0f); - } - } - - // Load x - for (int i = pad_top; i < cells_i; i++) - { - for (int j = pad_left; j < cells_j; j++) - { - x[i][j] = vld1q_f32(x_ptrs[i][j]); - x_ptrs[i][j] += 4; - } - } - - // Compute XT . x - for (int j = pad_left; j < cells_j; j++) - { - // XTx[0][j] = x[0][j] - x[2][j]; - XTx[0][j] = vsubq_f32(x[0][j], x[2][j]); - - // XTx[1][j] = x[1][j] + x[2][j]; - XTx[1][j] = vaddq_f32(x[1][j], x[2][j]); - - // XTx[2][j] = x[2][j] - x[1][j]; - XTx[2][j] = vsubq_f32(x[2][j], x[1][j]); - - // XTx[3][j] = x[1][j] - x[3][j]; - XTx[3][j] = vsubq_f32(x[1][j], x[3][j]); - } - - // Compute U = XT . x . X - for (int i = 0; i < inner_tile_i; i++) - { - // U[i][0] = XTx[i][0] - XTx[i][2]; - U[i][0] = vsubq_f32(XTx[i][0], XTx[i][2]); - - // U[i][1] = XTx[i][1] + XTx[i][2]; - U[i][1] = vaddq_f32(XTx[i][1], XTx[i][2]); - - // U[i][2] = XTx[i][2] - XTx[i][1]; - U[i][2] = vsubq_f32(XTx[i][2], XTx[i][1]); - - // U[i][3] = XTx[i][1] - XTx[i][3]; - U[i][3] = vsubq_f32(XTx[i][1], XTx[i][3]); - } - - // Store the transformed matrix - for (int i = 0, m = 0; i < inner_tile_i; i++) - { - for (int j = 0; j < inner_tile_j; j++, m++) - { - vst1q_f32(outptr + m*matrix_stride, U[i][j]); - } - } - outptr += 4; - } -#endif // __aarch64__ -#ifdef __arm_any__ - for (; channels_remaining >= 2; channels_remaining -= 2) - { - // Matrices used/computed in this kernel. 
- float32x2_t x[inner_tile_i][inner_tile_j]; - float32x2_t XTx[inner_tile_i][inner_tile_j]; - float32x2_t U[inner_tile_i][inner_tile_j]; - - for (int i = 0; i < inner_tile_i; i++) - { - for (int j = 0; j < inner_tile_j; j++) - { - x[i][j] = vdup_n_f32(0.0f); - XTx[i][j] = vdup_n_f32(0.0f); - } - } - - // Load x - for (int i = pad_top; i < cells_i; i++) - { - for (int j = pad_left; j < cells_j; j++) - { - x[i][j] = vld1_f32(x_ptrs[i][j]); - x_ptrs[i][j] += 2; - } - } - - // Compute XT . x - for (int j = pad_left; j < cells_j; j++) - { - // XTx[0][j] = x[0][j] - x[2][j]; - XTx[0][j] = vsub_f32(x[0][j], x[2][j]); - - // XTx[1][j] = x[1][j] + x[2][j]; - XTx[1][j] = vadd_f32(x[1][j], x[2][j]); - - // XTx[2][j] = x[2][j] - x[1][j]; - XTx[2][j] = vsub_f32(x[2][j], x[1][j]); - - // XTx[3][j] = x[1][j] - x[3][j]; - XTx[3][j] = vsub_f32(x[1][j], x[3][j]); - } - - // Compute U = XT . x . X - for (int i = 0; i < inner_tile_i; i++) - { - // U[i][0] = XTx[i][0] - XTx[i][2]; - U[i][0] = vsub_f32(XTx[i][0], XTx[i][2]); - - // U[i][1] = XTx[i][1] + XTx[i][2]; - U[i][1] = vadd_f32(XTx[i][1], XTx[i][2]); - - // U[i][2] = XTx[i][2] - XTx[i][1]; - U[i][2] = vsub_f32(XTx[i][2], XTx[i][1]); - - // U[i][3] = XTx[i][1] - XTx[i][3]; - U[i][3] = vsub_f32(XTx[i][1], XTx[i][3]); - } - - // Store the transformed matrix - for (int i = 0, m = 0; i < inner_tile_i; i++) - { - for (int j = 0; j < inner_tile_j; j++, m++) - { - vst1_f32(outptr + m*matrix_stride, U[i][j]); - } - } - outptr += 2; - } -#endif // __arm_any__ - for (; channels_remaining; channels_remaining--) - { - // Load x - for (int i = pad_top; i < cells_i; i++) - { - for (int j = pad_left; j < cells_j; j++) - { - x[i][j] = *(x_ptrs[i][j]++); - } - } - - // Compute XT . x - for (int j = pad_left; j < cells_j; j++) - { - XTx[0][j] = x[0][j] - x[2][j]; - XTx[1][j] = x[1][j] + x[2][j]; - XTx[2][j] = x[2][j] - x[1][j]; - XTx[3][j] = x[1][j] - x[3][j]; - } - - // Compute U = XT . x . 
X - for (int i = 0; i < inner_tile_i; i++) - { - U[i][0] = XTx[i][0] - XTx[i][2]; - U[i][1] = XTx[i][1] + XTx[i][2]; - U[i][2] = XTx[i][2] - XTx[i][1]; - U[i][3] = XTx[i][1] - XTx[i][3]; - } - - // Store the transformed matrix - for (int i = 0, m = 0; i < inner_tile_i; i++) - { - for (int j = 0; j < inner_tile_j; j++, m++) - { - *(outptr + m*matrix_stride) = U[i][j]; - } - } - outptr++; - } -} - -template <> -template <> -const Transform::TileFn Transform::tile_fns[2][2][max_pad_bottom][max_pad_right] = -{ - { - { - { - Transform::template process_tile<0, 0, 0, 0>, // No padding - Transform::template process_tile<0, 0, 0, 1>, // Right - Transform::template process_tile<0, 0, 0, 2>, // Right - }, - { - Transform::template process_tile<0, 0, 1, 0>, // Bottom - Transform::template process_tile<0, 0, 1, 1>, // Bottom-right - Transform::template process_tile<0, 0, 1, 2>, // Bottom-right - }, - { - Transform::template process_tile<0, 0, 2, 0>, // Bottom - Transform::template process_tile<0, 0, 2, 1>, // Bottom-right - Transform::template process_tile<0, 0, 2, 2>, // Bottom-right - } - }, - { - { - Transform::template process_tile<0, 1, 0, 0>, // Left - Transform::template process_tile<0, 1, 0, 1>, // Left AND right - Transform::template process_tile<0, 1, 0, 2>, // Left AND right - }, - { - Transform::template process_tile<0, 1, 1, 0>, // Left-bottom - Transform::template process_tile<0, 1, 1, 1>, // Left, bottom AND right - Transform::template process_tile<0, 1, 1, 2>, // Left, bottom AND right - }, - { - Transform::template process_tile<0, 1, 2, 0>, // Left-bottom - Transform::template process_tile<0, 1, 2, 1>, // Left, bottom AND right - Transform::template process_tile<0, 1, 2, 2>, // Left, bottom AND right - } - }, - }, - { - { - { - Transform::template process_tile<1, 0, 0, 0>, // Top - Transform::template process_tile<1, 0, 0, 1>, // Top-right - Transform::template process_tile<1, 0, 0, 2>, // Top-right - }, - { - Transform::template process_tile<1, 0, 1, 0>, // 
Top AND bottom - Transform::template process_tile<1, 0, 1, 1>, // Top, bottom AND right - Transform::template process_tile<1, 0, 1, 2>, // Top, bottom AND right - }, - { - Transform::template process_tile<1, 0, 2, 0>, // Top AND bottom - Transform::template process_tile<1, 0, 2, 1>, // Top, bottom AND right - Transform::template process_tile<1, 0, 2, 2>, // Top, bottom AND right - } - }, - { - { - Transform::template process_tile<1, 1, 0, 0>, // Top-left - Transform::template process_tile<1, 1, 0, 1>, // Top, left AND right - Transform::template process_tile<1, 1, 0, 2>, // Top, left AND right - }, - { - Transform::template process_tile<1, 1, 1, 0>, // Top, left AND bottom - Transform::template process_tile<1, 1, 1, 1>, // All padded - Transform::template process_tile<1, 1, 1, 2>, // All padded - }, - { - Transform::template process_tile<1, 1, 2, 0>, // Top, left AND bottom - Transform::template process_tile<1, 1, 2, 1>, // All padded - Transform::template process_tile<1, 1, 2, 2>, // All padded - } - } - } -}; - -template struct WinogradGEMM<2, 2, 3, 3>::InputTransform; -} // namespace winograd diff --git a/src/core/NEON/kernels/winograd/transforms/input_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/winograd/transforms/input_2x2_5x5_fp32.cpp deleted file mode 100644 index a6ebca1bce..0000000000 --- a/src/core/NEON/kernels/winograd/transforms/input_2x2_5x5_fp32.cpp +++ /dev/null @@ -1,458 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "transforms/input.hpp" -#include "winograd_gemm.hpp" -#include "arm.hpp" - -namespace winograd -{ - -using Transform = WinogradGEMM<2, 2, 5, 5>::InputTransform; - -template <> -template <> -int Transform::ops_performed(const Tensor4DShape &input_shape) -{ - return 0; // TODO -} - -/***************************************************************************** -* F(2x2, 5x5) implies the use of a 6x6 input tile. -* -* Build an array of the specialised methods that deal with each of the -* different padding combinations which may be required. 
These padding -* constraints are the space: -* -* Padding top in {0, 1} -* Padding left in {0, 1} -* Padding bottom in {0, 1, 2, 3, 4} -* Padding right in {0, 1, 2, 3, 4} -*/ -template <> -template <> -template -void Transform::process_tile( - int n_channels, - const float* const input_base, - const int input_row_stride, - const int input_col_stride, - float* const matrix_base, - const int matrix_stride -) -{ - constexpr int cells_i = 6 - pad_bottom; - constexpr int cells_j = 6 - pad_right; - - float *outptr = matrix_base; - - // Get pointers into the input tile - const float *x_ptrs[6][6]; - for (int i = pad_top, xi = 0; i < cells_i; i++, xi++) - { - // Get a pointer into the row - const float* const row_ptr = input_base + xi*input_row_stride; - - for (int j = pad_left, xj = 0; j < cells_j; j++, xj++) - { - x_ptrs[i][j] = row_ptr + xj*input_col_stride; - } - } - - // Matrices used/computed in this kernel. - float x[6][6], XTx[6][6], U[6][6]; - for (int i = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++) - { - x[i][j] = XTx[i][j] = 0.0f; - } - } - - // Perform the Winograd input transformation for each channel in the input - // tensor. - int channels_remaining = n_channels; -#ifdef __aarch64__ - for (; channels_remaining >= 4; channels_remaining -= 4) - { - // Matrices used/computed in this kernel - float32x4_t x[6][6], XTx[6][6], U[6][6]; - for (int i = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++) - { - x[i][j] = vdupq_n_f32(0.0f); - XTx[i][j] = vdupq_n_f32(0.0f); - } - } - - // Read a 6x6 tile in the Winograd domain - for (int i = pad_top; i < cells_i; i++) - { - for (int j = pad_left; j < cells_j; j++) - { - x[i][j] = vld1q_f32(x_ptrs[i][j]); - x_ptrs[i][j] += 4; - } - } - - // Compute XT . 
x - for (int j = pad_left; j < cells_j; j++) - { - // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j]; - XTx[0][j] = vmlsq_n_f32(vmlaq_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f); - - // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j]; - XTx[1][j] = vmlsq_n_f32(vaddq_f32(x[3][j], x[4][j]), vaddq_f32(x[1][j], x[2][j]), 4.0f); - - // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j]; - XTx[2][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[3][j]), vsubq_f32(x[1][j], x[2][j]), 4.0f); - - // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j]; - XTx[3][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[3][j], x[1][j]), 2.0f); - - // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j]; - XTx[4][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[1][j], x[3][j]), 2.0f); - - // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j]; - XTx[5][j] = vmlsq_n_f32(vmlaq_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f); - } - - // Compute U = XT . x . X - for (int i = 0; i < 6; i++) - { - // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4]; - U[i][0] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f); - - // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4]; - U[i][1] = vmlsq_n_f32(vaddq_f32(XTx[i][3], XTx[i][4]), vaddq_f32(XTx[i][1], XTx[i][2]), 4.0f); - - // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4]; - U[i][2] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][3]), vsubq_f32(XTx[i][1], XTx[i][2]), 4.0f); - - // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4]; - U[i][3] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][3], XTx[i][1]), 2.0f); - - // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4]; - U[i][4] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][1], XTx[i][3]), 2.0f); - - // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5]; - U[i][5] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f); - } - - // 
Store the transformed matrix - for (int i = 0, m = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++, m++) - { - vst1q_f32(outptr + m*matrix_stride, U[i][j]); - } - } - outptr += 4; - } -#endif // __aarch64__ -#ifdef __arm_any__ - for (; channels_remaining >= 2; channels_remaining -= 2) - { - // Matrices used/computed in this kernel - float32x2_t x[6][6], XTx[6][6], U[6][6]; - for (int i = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++) - { - x[i][j] = vdup_n_f32(0.0f); - XTx[i][j] = vdup_n_f32(0.0f); - } - } - - // Read a 6x6 tile in the Winograd domain - for (int i = pad_top; i < cells_i; i++) - { - for (int j = pad_left; j < cells_j; j++) - { - x[i][j] = vld1_f32(x_ptrs[i][j]); - x_ptrs[i][j] += 2; - } - } - - // Compute XT . x - for (int j = pad_left; j < cells_j; j++) - { - // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j]; - XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f); - - // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j]; - XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f); - - // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j]; - XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f); - - // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j]; - XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f); - - // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j]; - XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f); - - // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j]; - XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f); - } - - // Compute U = XT . x . 
X - for (int i = 0; i < 6; i++) - { - // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4]; - U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f); - - // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4]; - U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f); - - // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4]; - U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f); - - // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4]; - U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f); - - // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4]; - U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f); - - // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5]; - U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f); - } - - // Store the transformed matrix - for (int i = 0, m = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++, m++) - { - vst1_f32(outptr + m*matrix_stride, U[i][j]); - } - } - outptr += 2; - } -#endif // __arm_any__ - for (; channels_remaining; channels_remaining--) - { - // Load x - for (int i = pad_top; i < cells_i; i++) - { - for (int j = pad_left; j < cells_j; j++) - { - x[i][j] = *(x_ptrs[i][j]++); - } - } - - // Compute XT . x - for (int j = pad_left; j < cells_j; j++) - { - XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j]; - XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j]; - XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j]; - XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j]; - XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j]; - XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j]; - } - - // Compute U = XT . x . 
X - for (int i = 0; i < 6; i++) - { - U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4]; - U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4]; - U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4]; - U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4]; - U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4]; - U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5]; - } - - // Store the transformed matrix - for (int i = 0, m = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++, m++) - { - *(outptr + m*matrix_stride) = U[i][j]; - } - } - outptr++; - } -} - -template <> -template <> -const Transform::TileFn Transform::tile_fns[2][2][max_pad_bottom][max_pad_right] = -{ - { - { - { - Transform::template process_tile<0, 0, 0, 0>, // No padding - Transform::template process_tile<0, 0, 0, 1>, // Right - Transform::template process_tile<0, 0, 0, 2>, // " " - Transform::template process_tile<0, 0, 0, 3>, // " " - Transform::template process_tile<0, 0, 0, 4>, // " " - }, - { - Transform::template process_tile<0, 0, 1, 0>, // Bottom - Transform::template process_tile<0, 0, 1, 1>, // Bottom right - Transform::template process_tile<0, 0, 1, 2>, // " " - Transform::template process_tile<0, 0, 1, 3>, // " " - Transform::template process_tile<0, 0, 1, 4>, // " " - }, - { - Transform::template process_tile<0, 0, 2, 0>, // Bottom - Transform::template process_tile<0, 0, 2, 1>, // Bottom right - Transform::template process_tile<0, 0, 2, 2>, // " " - Transform::template process_tile<0, 0, 2, 3>, // " " - Transform::template process_tile<0, 0, 2, 4>, // " " - }, - { - Transform::template process_tile<0, 0, 3, 0>, // Bottom - Transform::template process_tile<0, 0, 3, 1>, // Bottom right - Transform::template process_tile<0, 0, 3, 2>, // " " - Transform::template process_tile<0, 0, 3, 3>, // " " - Transform::template process_tile<0, 0, 3, 4>, // " " - }, - { - Transform::template process_tile<0, 0, 4, 0>, // Bottom - Transform::template 
process_tile<0, 0, 4, 1>, // Bottom right - Transform::template process_tile<0, 0, 4, 2>, // " " - Transform::template process_tile<0, 0, 4, 3>, // " " - Transform::template process_tile<0, 0, 4, 4>, // " " - } - }, - { - { - Transform::template process_tile<0, 1, 0, 0>, // Left - Transform::template process_tile<0, 1, 0, 1>, - Transform::template process_tile<0, 1, 0, 2>, - Transform::template process_tile<0, 1, 0, 3>, - Transform::template process_tile<0, 1, 0, 4>, - }, - { - Transform::template process_tile<0, 1, 1, 0>, // Bottom left - Transform::template process_tile<0, 1, 1, 1>, - Transform::template process_tile<0, 1, 1, 2>, - Transform::template process_tile<0, 1, 1, 3>, - Transform::template process_tile<0, 1, 1, 4>, - }, - { - Transform::template process_tile<0, 1, 2, 0>, // " " - Transform::template process_tile<0, 1, 2, 1>, - Transform::template process_tile<0, 1, 2, 2>, - Transform::template process_tile<0, 1, 2, 3>, - Transform::template process_tile<0, 1, 2, 4>, - }, - { - Transform::template process_tile<0, 1, 3, 0>, // " " - Transform::template process_tile<0, 1, 3, 1>, - Transform::template process_tile<0, 1, 3, 2>, - Transform::template process_tile<0, 1, 3, 3>, - Transform::template process_tile<0, 1, 3, 4>, - }, - { - Transform::template process_tile<0, 1, 4, 0>, // " " - Transform::template process_tile<0, 1, 4, 1>, - Transform::template process_tile<0, 1, 4, 2>, - Transform::template process_tile<0, 1, 4, 3>, - Transform::template process_tile<0, 1, 4, 4>, - } - } - }, - { - { - { - Transform::template process_tile<1, 0, 0, 0>, // Top - Transform::template process_tile<1, 0, 0, 1>, // Top right - Transform::template process_tile<1, 0, 0, 2>, // " " - Transform::template process_tile<1, 0, 0, 3>, // " " - Transform::template process_tile<1, 0, 0, 4>, // " " - }, - { - Transform::template process_tile<1, 0, 1, 0>, - Transform::template process_tile<1, 0, 1, 1>, - Transform::template process_tile<1, 0, 1, 2>, - Transform::template 
process_tile<1, 0, 1, 3>, - Transform::template process_tile<1, 0, 1, 4>, - }, - { - Transform::template process_tile<1, 0, 2, 0>, - Transform::template process_tile<1, 0, 2, 1>, - Transform::template process_tile<1, 0, 2, 2>, - Transform::template process_tile<1, 0, 2, 3>, - Transform::template process_tile<1, 0, 2, 4>, - }, - { - Transform::template process_tile<1, 0, 3, 0>, - Transform::template process_tile<1, 0, 3, 1>, - Transform::template process_tile<1, 0, 3, 2>, - Transform::template process_tile<1, 0, 3, 3>, - Transform::template process_tile<1, 0, 3, 4>, - }, - { - Transform::template process_tile<1, 0, 4, 0>, - Transform::template process_tile<1, 0, 4, 1>, - Transform::template process_tile<1, 0, 4, 2>, - Transform::template process_tile<1, 0, 4, 3>, - Transform::template process_tile<1, 0, 4, 4>, - }, - }, - { - { - Transform::template process_tile<1, 1, 0, 0>, // Top left - Transform::template process_tile<1, 1, 0, 1>, - Transform::template process_tile<1, 1, 0, 2>, - Transform::template process_tile<1, 1, 0, 3>, - Transform::template process_tile<1, 1, 0, 4>, - }, - { - Transform::template process_tile<1, 1, 1, 0>, - Transform::template process_tile<1, 1, 1, 1>, - Transform::template process_tile<1, 1, 1, 2>, - Transform::template process_tile<1, 1, 1, 3>, - Transform::template process_tile<1, 1, 1, 4>, - }, - { - Transform::template process_tile<1, 1, 2, 0>, - Transform::template process_tile<1, 1, 2, 1>, - Transform::template process_tile<1, 1, 2, 2>, - Transform::template process_tile<1, 1, 2, 3>, - Transform::template process_tile<1, 1, 2, 4>, - }, - { - Transform::template process_tile<1, 1, 3, 0>, - Transform::template process_tile<1, 1, 3, 1>, - Transform::template process_tile<1, 1, 3, 2>, - Transform::template process_tile<1, 1, 3, 3>, - Transform::template process_tile<1, 1, 3, 4>, - }, - { - Transform::template process_tile<1, 1, 4, 0>, - Transform::template process_tile<1, 1, 4, 1>, - Transform::template process_tile<1, 1, 4, 2>, - 
Transform::template process_tile<1, 1, 4, 3>, - Transform::template process_tile<1, 1, 4, 4>, - } - } - } -}; - -template struct WinogradGEMM<2, 2, 5, 5>::InputTransform; -} // namespace winograd diff --git a/src/core/NEON/kernels/winograd/transforms/input_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/winograd/transforms/input_4x4_3x3_fp32.cpp deleted file mode 100644 index 477aaaf34e..0000000000 --- a/src/core/NEON/kernels/winograd/transforms/input_4x4_3x3_fp32.cpp +++ /dev/null @@ -1,486 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "transforms/input.hpp" -#include "winograd_gemm.hpp" -#include "arm.hpp" - -namespace winograd -{ - -using Transform = WinogradGEMM<4, 4, 3, 3>::InputTransform; - -template <> -template <> -int Transform::ops_performed(const Tensor4DShape &input_shape) -{ - // NOTE: Cost in FLOPs rather than instructions or uops. 
- const int tile_M = iceildiv(input_shape.n_rows, inner_tile_rows); - const int tile_N = iceildiv(input_shape.n_cols, inner_tile_cols); - return 12 * 24 * tile_M * tile_N * input_shape.n_channels; -} - -/* F(4x4, 3x3) implies the use of a 6x6 input tile. Such tiles can require a -* variety of padding types. For example, tiles at the top and left of an -* image can require one row or column of padding on their top and left sides -* if the padding type is SAME (where X represents a padded value): -* -* ___________ ___________ -* |X X X X X X| |X X X X X X| -* |X | | | -* |X | | | -* |X | | | -* |X | | | -* |X__________| |___________| -* ___________ -* |X | -* |X | -* |X | -* |X | -* |X | -* |X__________| -* -* For tiles near the right or bottom of the image it is more complicated. -* Such tiles might require padding by 0, 1, 2 or 3 rows or columns if the -* padding type is VALID or 1, 2, 3 or 4 rows or columns if the padding -* type is SAME. -* -* Build an array of the specialised methods that deal with each of the -* different padding combinations which may be required. 
These padding -* constraints are the space: -* -* Padding top in {0, 1} -* Padding left in {0, 1} -* Padding bottom in {0, 1, 2, 3, 4} -* Padding right in {0, 1, 2, 3, 4} -*/ -template <> -template <> -template -void Transform::process_tile( - int n_channels, - const float* const input_base, - const int input_row_stride, - const int input_col_stride, - float* const matrix_base, - const int matrix_stride -) -{ - constexpr int cells_i = 6 - pad_bottom; - constexpr int cells_j = 6 - pad_right; - - float *outptr = matrix_base; - - // Get pointers into the input tile - const float *x_ptrs[6][6]; - for (int i = pad_top, xi = 0; i < cells_i; i++, xi++) - { - // Get a pointer into the row - const float* const row_ptr = input_base + xi*input_row_stride; - - for (int j = pad_left, xj = 0; j < cells_j; j++, xj++) - { - x_ptrs[i][j] = row_ptr + xj*input_col_stride; - } - } - - // Matrices used/computed in this kernel. - float x[6][6], XTx[6][6], U[6][6]; - for (int i = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++) - { - x[i][j] = XTx[i][j] = 0.0f; - } - } - - // Perform the Winograd input transformation for each channel in the input - // tensor. - int channels_remaining = n_channels; -#ifdef __aarch64__ - for (; channels_remaining >= 4; channels_remaining -= 4) - { - // Matrices used/computed in this kernel - float32x4_t x[6][6], XTx[6][6], U[6][6]; - for (int i = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++) - { - x[i][j] = vdupq_n_f32(0.0f); - XTx[i][j] = vdupq_n_f32(0.0f); - } - } - - // Read a 6x6 tile in the Winograd domain - for (int i = pad_top; i < cells_i; i++) - { - for (int j = pad_left; j < cells_j; j++) - { - x[i][j] = vld1q_f32(x_ptrs[i][j]); - x_ptrs[i][j] += 4; - } - } - - // Compute XT . 
x - for (int j = pad_left; j < cells_j; j++) - { - // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j]; - XTx[0][j] = vmlsq_n_f32(vmlaq_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f); - - // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j]; - XTx[1][j] = vmlsq_n_f32(vaddq_f32(x[3][j], x[4][j]), vaddq_f32(x[1][j], x[2][j]), 4.0f); - - // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j]; - XTx[2][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[3][j]), vsubq_f32(x[1][j], x[2][j]), 4.0f); - - // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j]; - XTx[3][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[3][j], x[1][j]), 2.0f); - - // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j]; - XTx[4][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[1][j], x[3][j]), 2.0f); - - // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j]; - XTx[5][j] = vmlsq_n_f32(vmlaq_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f); - } - - // Compute U = XT . x . X - for (int i = 0; i < 6; i++) - { - // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4]; - U[i][0] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f); - - // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4]; - U[i][1] = vmlsq_n_f32(vaddq_f32(XTx[i][3], XTx[i][4]), vaddq_f32(XTx[i][1], XTx[i][2]), 4.0f); - - // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4]; - U[i][2] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][3]), vsubq_f32(XTx[i][1], XTx[i][2]), 4.0f); - - // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4]; - U[i][3] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][3], XTx[i][1]), 2.0f); - - // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4]; - U[i][4] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][1], XTx[i][3]), 2.0f); - - // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5]; - U[i][5] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f); - } - - // 
Store the transformed matrix - for (int i = 0, m = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++, m++) - { - vst1q_f32(outptr + m*matrix_stride, U[i][j]); - } - } - outptr += 4; - } -#endif // __aarch64__ -#ifdef __arm_any__ - for (; channels_remaining >= 2; channels_remaining -= 2) - { - // Matrices used/computed in this kernel - float32x2_t x[6][6], XTx[6][6], U[6][6]; - for (int i = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++) - { - x[i][j] = vdup_n_f32(0.0f); - XTx[i][j] = vdup_n_f32(0.0f); - } - } - - // Read a 6x6 tile in the Winograd domain - for (int i = pad_top; i < cells_i; i++) - { - for (int j = pad_left; j < cells_j; j++) - { - x[i][j] = vld1_f32(x_ptrs[i][j]); - x_ptrs[i][j] += 2; - } - } - - // Compute XT . x - for (int j = pad_left; j < cells_j; j++) - { - // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j]; - XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f); - - // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j]; - XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f); - - // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j]; - XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f); - - // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j]; - XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f); - - // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j]; - XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f); - - // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j]; - XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f); - } - - // Compute U = XT . x . 
X - for (int i = 0; i < 6; i++) - { - // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4]; - U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f); - - // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4]; - U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f); - - // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4]; - U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f); - - // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4]; - U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f); - - // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4]; - U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f); - - // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5]; - U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f); - } - - // Store the transformed matrix - for (int i = 0, m = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++, m++) - { - vst1_f32(outptr + m*matrix_stride, U[i][j]); - } - } - outptr += 2; - } -#endif // __arm_any__ - for (; channels_remaining; channels_remaining--) - { - // Load x - for (int i = pad_top; i < cells_i; i++) - { - for (int j = pad_left; j < cells_j; j++) - { - x[i][j] = *(x_ptrs[i][j]++); - } - } - - // Compute XT . x - for (int j = pad_left; j < cells_j; j++) - { - XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j]; - XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j]; - XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j]; - XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j]; - XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j]; - XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j]; - } - - // Compute U = XT . x . 
X - for (int i = 0; i < 6; i++) - { - U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4]; - U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4]; - U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4]; - U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4]; - U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4]; - U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5]; - } - - // Store the transformed matrix - for (int i = 0, m = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++, m++) - { - *(outptr + m*matrix_stride) = U[i][j]; - } - } - outptr++; - } -} - -/* In the below, unusual or especially small tiles are routed via the slow - * path whereas common or large tiles are routed through a faster path. - */ -template <> -template <> -const Transform::TileFn Transform::tile_fns[2][2][max_pad_bottom][max_pad_right] = -{ - { - { - { - Transform::template process_tile<0, 0, 0, 0>, // No padding - Transform::template process_tile<0, 0, 0, 1>, // Right - Transform::template process_tile<0, 0, 0, 2>, // " " - Transform::template process_tile<0, 0, 0, 3>, // " " - Transform::template process_tile<0, 0, 0, 4>, // " " - }, - { - Transform::template process_tile<0, 0, 1, 0>, // Bottom - Transform::template process_tile<0, 0, 1, 1>, // Bottom right - Transform::template process_tile<0, 0, 1, 2>, // " " - Transform::template process_tile<0, 0, 1, 3>, // " " - Transform::template process_tile<0, 0, 1, 4>, // " " - }, - { - Transform::template process_tile<0, 0, 2, 0>, // Bottom - Transform::template process_tile<0, 0, 2, 1>, // Bottom right - Transform::template process_tile<0, 0, 2, 2>, // " " - Transform::template process_tile<0, 0, 2, 3>, // " " - Transform::template process_tile<0, 0, 2, 4>, // " " - }, - { - Transform::template process_tile<0, 0, 3, 0>, // Bottom - Transform::template process_tile<0, 0, 3, 1>, // Bottom right - Transform::template process_tile<0, 0, 3, 2>, // " " - Transform::template process_tile<0, 0, 3, 3>, 
// " " - Transform::template process_tile<0, 0, 3, 4>, // " " - }, - { - Transform::template process_tile<0, 0, 4, 0>, // Bottom - Transform::template process_tile<0, 0, 4, 1>, // Bottom right - Transform::template process_tile<0, 0, 4, 2>, // " " - Transform::template process_tile<0, 0, 4, 3>, // " " - Transform::template process_tile<0, 0, 4, 4>, // " " - } - }, - { - { - Transform::template process_tile<0, 1, 0, 0>, // Left - Transform::template process_tile<0, 1, 0, 1>, - Transform::template process_tile<0, 1, 0, 2>, - Transform::template process_tile<0, 1, 0, 3>, - Transform::template process_tile<0, 1, 0, 4>, - }, - { - Transform::template process_tile<0, 1, 1, 0>, // Bottom left - Transform::template process_tile<0, 1, 1, 1>, - Transform::template process_tile<0, 1, 1, 2>, - Transform::template process_tile<0, 1, 1, 3>, - Transform::template process_tile<0, 1, 1, 4>, - }, - { - Transform::template process_tile<0, 1, 2, 0>, // " " - Transform::template process_tile<0, 1, 2, 1>, - Transform::template process_tile<0, 1, 2, 2>, - Transform::template process_tile<0, 1, 2, 3>, - Transform::template process_tile<0, 1, 2, 4>, - }, - { - Transform::template process_tile<0, 1, 3, 0>, // " " - Transform::template process_tile<0, 1, 3, 1>, - Transform::template process_tile<0, 1, 3, 2>, - Transform::template process_tile<0, 1, 3, 3>, - Transform::template process_tile<0, 1, 3, 4>, - }, - { - Transform::template process_tile<0, 1, 4, 0>, // " " - Transform::template process_tile<0, 1, 4, 1>, - Transform::template process_tile<0, 1, 4, 2>, - Transform::template process_tile<0, 1, 4, 3>, - Transform::template process_tile<0, 1, 4, 4>, - } - } - }, - { - { - { - Transform::template process_tile<1, 0, 0, 0>, // Top - Transform::template process_tile<1, 0, 0, 1>, // Top right - Transform::template process_tile<1, 0, 0, 2>, // " " - Transform::template process_tile<1, 0, 0, 3>, // " " - Transform::template process_tile<1, 0, 0, 4>, // " " - }, - { - Transform::template 
process_tile<1, 0, 1, 0>, - Transform::template process_tile<1, 0, 1, 1>, - Transform::template process_tile<1, 0, 1, 2>, - Transform::template process_tile<1, 0, 1, 3>, - Transform::template process_tile<1, 0, 1, 4>, - }, - { - Transform::template process_tile<1, 0, 2, 0>, - Transform::template process_tile<1, 0, 2, 1>, - Transform::template process_tile<1, 0, 2, 2>, - Transform::template process_tile<1, 0, 2, 3>, - Transform::template process_tile<1, 0, 2, 4>, - }, - { - Transform::template process_tile<1, 0, 3, 0>, - Transform::template process_tile<1, 0, 3, 1>, - Transform::template process_tile<1, 0, 3, 2>, - Transform::template process_tile<1, 0, 3, 3>, - Transform::template process_tile<1, 0, 3, 4>, - }, - { - Transform::template process_tile<1, 0, 4, 0>, - Transform::template process_tile<1, 0, 4, 1>, - Transform::template process_tile<1, 0, 4, 2>, - Transform::template process_tile<1, 0, 4, 3>, - Transform::template process_tile<1, 0, 4, 4>, - }, - }, - { - { - Transform::template process_tile<1, 1, 0, 0>, // Top left - Transform::template process_tile<1, 1, 0, 1>, - Transform::template process_tile<1, 1, 0, 2>, - Transform::template process_tile<1, 1, 0, 3>, - Transform::template process_tile<1, 1, 0, 4>, - }, - { - Transform::template process_tile<1, 1, 1, 0>, - Transform::template process_tile<1, 1, 1, 1>, - Transform::template process_tile<1, 1, 1, 2>, - Transform::template process_tile<1, 1, 1, 3>, - Transform::template process_tile<1, 1, 1, 4>, - }, - { - Transform::template process_tile<1, 1, 2, 0>, - Transform::template process_tile<1, 1, 2, 1>, - Transform::template process_tile<1, 1, 2, 2>, - Transform::template process_tile<1, 1, 2, 3>, - Transform::template process_tile<1, 1, 2, 4>, - }, - { - Transform::template process_tile<1, 1, 3, 0>, - Transform::template process_tile<1, 1, 3, 1>, - Transform::template process_tile<1, 1, 3, 2>, - Transform::template process_tile<1, 1, 3, 3>, - Transform::template process_tile<1, 1, 3, 4>, - }, - { - 
Transform::template process_tile<1, 1, 4, 0>, - Transform::template process_tile<1, 1, 4, 1>, - Transform::template process_tile<1, 1, 4, 2>, - Transform::template process_tile<1, 1, 4, 3>, - Transform::template process_tile<1, 1, 4, 4>, - } - } - } -}; - -template struct WinogradGEMM<4, 4, 3, 3>::InputTransform; -} // namespace winograd diff --git a/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp deleted file mode 100644 index 58db7d2ecd..0000000000 --- a/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "transforms/output.hpp" -#include "winograd_gemm.hpp" -#include "arm.hpp" - -namespace winograd -{ - -using Transform = WinogradGEMM<2, 2, 3, 3>::OutputTransform; - -template <> -template <> -int Transform::ops_performed(const Tensor4DShape &shape) -{ - // NOTE: Cost in FLOPs rather than instructions or uops. - const int tile_M = iceildiv(shape.n_rows, 2); - const int tile_N = iceildiv(shape.n_cols, 2); - return 24 * tile_M * tile_N * shape.n_channels; -} - -/* F(2x2, 3x3) constructs 2x2 output tiles from a 3x3 convolution. Since we use - * enough tiles to cover the output space each output tile may contain 0 or 1 - * padded values to the right and bottom columns or rows of the tile, e.g.: - * - * ___ ___ - * | | | X| - * |___| |__X| - * - * ___ ___ - * | | | X| - * |X_X| |X_X| - * - * - * We provide a specialised output transform for each of these instances. - * Consequently we below construct an array of the various padding options, the - * array contains pointers to the specific implementations. 
- */ -template <> -template <> -template -void Transform::process_tile( - const int n_channels, - const float* const matrix_base, - const int matrix_stride, - const float* const biases, - float* const output, - const int output_row_stride, - const int output_col_stride -) -{ - constexpr int cells_i = 2 - pad_bottom; - constexpr int cells_j = 2 - pad_right; - - // Construct a map to the output cells - float *outptrs[cells_i][cells_j]; - for (int i = 0; i < cells_i; i++) - { - for (int j = 0; j < cells_j; j++) - { - outptrs[i][j] = output + i*output_row_stride + j*output_col_stride; - } - } - const float *inptr = matrix_base; - const float *bptr = biases; - - // For each channel of the output - int channels_remaining = n_channels; -#ifdef __aarch64__ - for (; channels_remaining >= 4; channels_remaining -= 4) - { - // Matrices used and computed during this transform - float32x4_t F[4][4], FZ[4][2], f[2][2], b; - - // Read a 4x4 tile in the Winograd domain - for (int i = 0, m = 0; i < 4; i++) - { - for (int j = 0; j < 4; j++, m++) - { - F[i][j] = vld1q_f32(inptr + m*matrix_stride); - } - } - inptr += 4; - - // Compute the matrix F Z - for (int i = 0; i < 4; i++) - { - // FZ[i][0] = F[i][0] + F[i][1] + F[i][2]; - FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]); - - // FZ[i][1] = F[i][1] - F[i][2] - F[i][3]; - FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]); - } - - // Compute the output tile f = ZT F Z - for (int j = 0; j < 2; j++) - { - // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j]; - f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]); - - // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; - f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]); - } - - // Load the bias vector - b = vld1q_f32(bptr); - bptr += 4; - - // Write out the output tile - for (int i = 0; i < cells_i; i++) - { - for (int j = 0; j < cells_j; j++) - { - vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b)); - outptrs[i][j] += 4; - } - } - } -#endif // __aarch64__ -#ifdef 
__arm_any__ - for (; channels_remaining >= 2; channels_remaining -= 2) - { - // Matrices used and computed during this transform - float32x2_t F[4][4], FZ[4][2], f[2][2], b; - - // Read a 4x4 tile in the Winograd domain - for (int i = 0, m = 0; i < 4; i++) - { - for (int j = 0; j < 4; j++, m++) - { - F[i][j] = vld1_f32(inptr + m*matrix_stride); - } - } - inptr += 2; - - // Compute the matrix F Z - for (int i = 0; i < 4; i++) - { - // FZ[i][0] = F[i][0] + F[i][1] + F[i][2]; - FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]); - - // FZ[i][1] = F[i][1] - F[i][2] - F[i][3]; - FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]); - } - - // Compute the output tile f = ZT F Z - for (int j = 0; j < 2; j++) - { - // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j]; - f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]); - - // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; - f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]); - } - - // Load the bias vector - b = vld1_f32(bptr); - bptr += 2; - - // Write out the output tile - for (int i = 0; i < cells_i; i++) - { - for (int j = 0; j < cells_j; j++) - { - vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b)); - outptrs[i][j] += 2; - } - } - } -#endif // __arm_any__ - for (; channels_remaining; channels_remaining--) - { - // Matrices used and computed during this transform - float F[4][4], FZ[4][2], f[2][2], b; - - // Read a 4x4 tile in the Winograd domain - for (int i = 0, m = 0; i < 4; i++) - { - for (int j = 0; j < 4; j++, m++) - { - F[i][j] = *(inptr + m*matrix_stride); - } - } - inptr++; - - // Compute the matrix F Z - for (int i = 0; i < 4; i++) - { - FZ[i][0] = F[i][0] + F[i][1] + F[i][2]; - FZ[i][1] = F[i][1] - F[i][2] - F[i][3]; - } - - // Compute the output tile f = ZT F Z - for (int j = 0; j < 2; j++) - { - f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j]; - f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; - } - - // Load the bias - b = *(bptr++); - - // Write out the output tile - for (int i = 0; i < cells_i; i++) - { - 
for (int j = 0; j < cells_j; j++) - { - *(outptrs[i][j]++) = f[i][j] + b; - } - } - } -} - -template <> -template <> -const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = -{ - { - Transform::template process_tile<0, 0>, // No padding - Transform::template process_tile<0, 1>, // Right padding - }, - { - Transform::template process_tile<1, 0>, // Bottom padding - Transform::template process_tile<1, 1>, // Bottom and right padding - } -}; - -template struct WinogradGEMM<2, 2, 3, 3>::OutputTransform; -} // namespace winograd diff --git a/src/core/NEON/kernels/winograd/transforms/output_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/winograd/transforms/output_2x2_5x5_fp32.cpp deleted file mode 100644 index bfd670090a..0000000000 --- a/src/core/NEON/kernels/winograd/transforms/output_2x2_5x5_fp32.cpp +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "transforms/output.hpp" -#include "winograd_gemm.hpp" -#include "arm.hpp" - -namespace winograd -{ - -using Transform = WinogradGEMM<2, 2, 5, 5>::OutputTransform; - -template <> -template <> -int Transform::ops_performed(const Tensor4DShape &shape) -{ - return 0; // TODO -} - -/* F(2x2, 5x5) constructs 2x2 output tiles from a 5x5 convolution. Since we use - * enough tiles to cover the output space each output tile may contain 0 or 1 - * padded values to the right and bottom columns or rows of the tile, e.g.: - * - * ___ ___ - * | | | X| - * |___| |__X| - * - * ___ ___ - * | | | X| - * |X_X| |X_X| - * - * - * We provide a specialised output transform for each of these instances. - * Consequently we below construct an array of the various padding options, the - * array contains pointers to the specific implementations. 
- */ -template <> -template <> -template -void Transform::process_tile( - const int n_channels, - const float* const matrix_base, - const int matrix_stride, - const float* const biases, - float* const output, - const int output_row_stride, - const int output_col_stride -) -{ - constexpr int cells_i = 2 - pad_bottom; - constexpr int cells_j = 2 - pad_right; - - // Construct a map to the output cells - float *outptrs[cells_i][cells_j]; - for (int i = 0; i < cells_i; i++) - { - for (int j = 0; j < cells_j; j++) - { - outptrs[i][j] = output + i*output_row_stride + j*output_col_stride; - } - } - const float *inptr = matrix_base; - const float *bptr = biases; - - // For each channel of the output - int channels_remaining = n_channels; -#ifdef __aarch64__ - for (; channels_remaining >= 4; channels_remaining -= 4) - { - // Matrices used and computed during this transform - float32x4_t F[6][6], FZ[6][2], f[2][2], b; - - // Read a 6x6 tile in the Winograd domain - for (int i = 0, m = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++, m++) - { - F[i][j] = vld1q_f32(inptr + m*matrix_stride); - } - } - inptr += 4; - - // Compute the matrix F Z - for (int i = 0; i < 6; i++) - { - // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; - FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]); - - // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5]; - FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]); - } - - // Compute the output tile f = ZT F Z - for (int j = 0; j < 2; j++) - { - // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; - f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); - - // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j]; - f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), 
FZ[5][j]); - } - - // Write out the output tile - b = vld1q_f32(bptr); - bptr += 4; - for (int i = 0; i < cells_i; i++) - { - for (int j = 0; j < cells_j; j++) - { - vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b)); - outptrs[i][j] += 4; - } - } - } -#endif // __aarch64__ -#ifdef __arm_any__ - for (; channels_remaining >= 2; channels_remaining -= 2) - { - // Matrices used and computed during this transform - float32x2_t F[6][6], FZ[6][2], f[2][2], b; - - // Read a 6x6 tile in the Winograd domain - for (int i = 0, m = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++, m++) - { - F[i][j] = vld1_f32(inptr + m*matrix_stride); - } - } - inptr += 2; - - // Compute the matrix F Z - for (int i = 0; i < 6; i++) - { - // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; - FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]); - - // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5]; - FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]); - } - - // Compute the output tile f = ZT F Z - for (int j = 0; j < 2; j++) - { - // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; - f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); - - // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j]; - f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]); - } - - // Write out the output tile - b = vld1_f32(bptr); - bptr += 2; - for (int i = 0; i < cells_i; i++) - { - for (int j = 0; j < cells_j; j++) - { - vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b)); - outptrs[i][j] += 2; - } - } - } -#endif // __arm_any__ - for (; channels_remaining; channels_remaining--) - { - // Matrices used and computed during this transform - float F[6][6], FZ[6][2], f[2][2], b; - - // Read a 6x6 tile in the Winograd domain - for (int i = 0, m = 
0; i < 6; i++) - { - for (int j = 0; j < 6; j++, m++) - { - F[i][j] = *(inptr + m*matrix_stride); - } - } - inptr++; - - // Compute the matrix F Z - for (int i = 0; i < 6; i++) - { - FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; - FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5]; - } - - // Compute the output tile f = ZT F Z - for (int j = 0; j < 2; j++) - { - f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; - f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j]; - } - - // Write out the output tile - b = *(bptr++); - for (int i = 0; i < cells_i; i++) - { - for (int j = 0; j < cells_j; j++) - { - *(outptrs[i][j]++) = f[i][j] + b; - } - } - } -} - -template <> -template <> -const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = -{ - { - Transform::template process_tile<0, 0>, // No padding - Transform::template process_tile<0, 1>, // Right padding - }, - { - Transform::template process_tile<1, 0>, // Bottom padding - Transform::template process_tile<1, 1>, // Bottom and right padding - } -}; - -template struct WinogradGEMM<2, 2, 5, 5>::OutputTransform; -} // namespace winograd diff --git a/src/core/NEON/kernels/winograd/transforms/output_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/winograd/transforms/output_4x4_3x3_fp32.cpp deleted file mode 100644 index 45210d7976..0000000000 --- a/src/core/NEON/kernels/winograd/transforms/output_4x4_3x3_fp32.cpp +++ /dev/null @@ -1,306 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "transforms/output.hpp" -#include "winograd_gemm.hpp" -#include "arm.hpp" - -namespace winograd -{ - -using Transform = WinogradGEMM<4, 4, 3, 3>::OutputTransform; - -template <> -template <> -int Transform::ops_performed(const Tensor4DShape &shape) -{ - // NOTE: Cost in FLOPs rather than instructions or uops. - const int tile_M = iceildiv(shape.n_rows, 4); - const int tile_N = iceildiv(shape.n_cols, 4); - return 170 * tile_M * tile_N * shape.n_channels; -} - -// Instantiate cost methods -template int Transform::ops_performed(const Tensor4DShape&); - -/* F(4x4, 3x3) constructs 4x4 output tiles from a 3x3 convolution. 
Since we use - * enough tiles to cover the output space each output tile may contain up to 3 - * padded values to the right and bottom columns or rows of the tile, e.g.: -* -* ________ ________ ________ ________ -* | | | X| | X X| | X X X| -* | | | X| | X X| | X X X| -* | | | X| | X X| | X X X| -* |_______| |______X| |____X_X| |__X_X_X| -* -* ________ ________ ________ ________ -* | | | X| | X X| | X X X| -* | | | X| | X X| | X X X| -* | | | X| | X X| | X X X| -* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| -* -* ________ ________ ________ ________ -* | | | X| | X X| | X X X| -* | | | X| | X X| | X X X| -* |X X X X| |X X X X| |X X X X| |X X X X| -* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| -* -* ________ ________ ________ ________ -* | | | X| | X X| | X X X| -* |X X X X| |X X X X| |X X X X| |X X X X| -* |X X X X| |X X X X| |X X X X| |X X X X| -* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| -* -* -* We provide a specialised output transform for each of these instances. -*/ -template <> -template <> -template -void Transform::process_tile( - const int n_channels, - const float* const matrix_base, - const int matrix_stride, - const float* const biases, - float* const output, - const int output_row_stride, - const int output_col_stride -) -{ - constexpr int cells_i = 4 - pad_bottom; - constexpr int cells_j = 4 - pad_right; - - // Construct a map to the output cells - float *outptrs[cells_i][cells_j]; - for (int i = 0; i < cells_i; i++) - { - for (int j = 0; j < cells_j; j++) - { - outptrs[i][j] = output + i*output_row_stride + j*output_col_stride; - } - } - const float *inptr = matrix_base; - const float *bptr = biases; - - // For each channel of the output - int channels_remaining = n_channels; -#ifdef __aarch64__ - for (; channels_remaining >= 4; channels_remaining -= 4) - { - // Matrices used and computed during this transform - float32x4_t F[6][6], FZ[6][4], f[4][4], b; - - // Read a 6x6 tile in the Winograd domain - for (int i = 0, m = 0; i < 6; i++) - { - for (int j = 0; j < 
6; j++, m++) - { - F[i][j] = vld1q_f32(inptr + m*matrix_stride); - } - } - inptr += 4; - - // Compute the matrix F Z - for (int i = 0; i < 6; i++) - { - // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; - FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]); - - // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4]; - FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f); - - // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4]; - FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f); - - // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5]; - FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]); - } - - // Compute the output tile f = ZT F Z - for (int j = 0; j < 4; j++) - { - // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; - f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); - - // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j]; - f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f); - - // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j]; - f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f); - - // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j]; - f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]); - } - - // Write out the output tile - b = vld1q_f32(bptr); - bptr += 4; - for (int i = 0; i < cells_i; i++) - { - for (int j = 0; j < cells_j; j++) - { - vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b)); - outptrs[i][j] += 4; - } - } - } -#endif // __aarch64__ -#ifdef __arm_any__ - for (; channels_remaining >= 2; channels_remaining -= 2) - { - // Matrices used and computed during 
this transform - float32x2_t F[6][6], FZ[6][4], f[4][4], b; - - // Read a 6x6 tile in the Winograd domain - for (int i = 0, m = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++, m++) - { - F[i][j] = vld1_f32(inptr + m*matrix_stride); - } - } - inptr += 2; - - // Compute the matrix F Z - for (int i = 0; i < 6; i++) - { - // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; - FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]); - - // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4]; - FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f); - - // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4]; - FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f); - - // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5]; - FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]); - } - - // Compute the output tile f = ZT F Z - for (int j = 0; j < 4; j++) - { - // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; - f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); - - // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j]; - f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f); - - // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j]; - f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f); - - // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j]; - f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]); - } - - // Write out the output tile - b = vld1_f32(bptr); - bptr += 2; - for (int i = 0; i < cells_i; i++) - { - for (int j = 0; j < cells_j; j++) - { - vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b)); - outptrs[i][j] += 2; - } - } - } -#endif - 
for (; channels_remaining; channels_remaining--) - { - // Matrices used and computed during this transform - float F[6][6], FZ[6][4], f[4][4], b; - - // Read a 6x6 tile in the Winograd domain - for (int i = 0, m = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++, m++) - { - F[i][j] = *(inptr + m*matrix_stride); - } - } - inptr++; - - // Compute the matrix F Z - for (int i = 0; i < 6; i++) - { - FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; - FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4]; - FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4]; - FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5]; - } - - // Compute the output tile f = ZT F Z - for (int j = 0; j < 4; j++) - { - f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; - f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j]; - f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j]; - f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j]; - } - - // Write out the output tile - b = *(bptr++); - for (int i = 0; i < cells_i; i++) - { - for (int j = 0; j < cells_j; j++) - { - *(outptrs[i][j]++) = f[i][j] + b; - } - } - } -} - -template <> -template <> -const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = -{ - { - Transform::template process_tile<0, 0>, - Transform::template process_tile<0, 1>, - Transform::template process_tile<0, 2>, - Transform::template process_tile<0, 3>, - }, - { - Transform::template process_tile<1, 0>, - Transform::template process_tile<1, 1>, - Transform::template process_tile<1, 2>, - Transform::template process_tile<1, 3>, - }, - { - Transform::template process_tile<2, 0>, - Transform::template process_tile<2, 1>, - Transform::template process_tile<2, 2>, - Transform::template process_tile<2, 3>, - }, - { - Transform::template process_tile<3, 0>, - Transform::template process_tile<3, 1>, - Transform::template process_tile<3, 2>, - 
Transform::template process_tile<3, 3>, - } -}; - -template struct WinogradGEMM<4, 4, 3, 3>::OutputTransform; -} // namespace winograd diff --git a/src/core/NEON/kernels/winograd/transforms/weights_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/winograd/transforms/weights_2x2_3x3_fp32.cpp deleted file mode 100644 index c0b282431e..0000000000 --- a/src/core/NEON/kernels/winograd/transforms/weights_2x2_3x3_fp32.cpp +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm.hpp" -#include "winograd_gemm.hpp" -#include "transforms/kernel.hpp" - -namespace winograd -{ - template <> - template <> - void WinogradGEMM<2, 2, 3, 3>::WeightsTransform::execute( - const int n_output_channels, - const int n_input_channels, - const float* const input, - float* const output, - const int matrix_stride, - const int matrix_row_stride - ) - { - constexpr int inner_tile_i = 4; - constexpr int inner_tile_j = 4; - - // Get pointers to each cell of the weight tensor - const auto weight_col_stride = n_input_channels * n_output_channels; - const auto weight_row_stride = 3 * weight_col_stride; - const float *inptrs[3][3]; - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 3; j++) - { - inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride; - } - } - - // For each input channel - for (int ic = 0; ic < n_input_channels; ic++) - { - float *outptr = output + ic * matrix_row_stride; - - // For each output channel - int channels_remaining = n_output_channels; -#ifdef __aarch64__ - for (; channels_remaining >= 4; channels_remaining -= 4) - { - // Matrices used and computed in this kernel - float32x4_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j]; - - // Read weights - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 3; j++) - { - w[i][j] = vld1q_f32(inptrs[i][j]); - inptrs[i][j] += 4; - } - } - - // Compute the matrix W w - for (int j = 0; j < 3; j++) - { - Ww[0][j] = w[0][j]; - - // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]); - Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f); - - // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]); - Ww[2][j] = vmulq_n_f32(vaddq_f32(vsubq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f); - - Ww[3][j] = w[2][j]; - } - - // Compute V = W w WT - for (int i = 0; i < inner_tile_i; i++) - { - V[i][0] = Ww[i][0]; - - // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]); - V[i][1] = vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f); - - // V[i][2] 
= 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]); - V[i][2] = vmulq_n_f32(vaddq_f32(vsubq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f); - - V[i][3] = Ww[i][2]; - } - - // Store the transformed weights - for (int i = 0, m = 0; i < inner_tile_i; i++) - { - for (int j = 0; j < inner_tile_j; j++, m++) - { - vst1q_f32(outptr + m*matrix_stride, V[i][j]); - } - } - outptr += 4; - } -#endif // __aarch64__ -#ifdef __arm_any__ - for (; channels_remaining >= 2; channels_remaining -= 2) - { - // Matrices used and computed in this kernel - float32x2_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j]; - - // Read weights - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 3; j++) - { - w[i][j] = vld1_f32(inptrs[i][j]); - inptrs[i][j] += 2; - } - } - - // Compute the matrix W w - for (int j = 0; j < 3; j++) - { - Ww[0][j] = w[0][j]; - - // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]); - Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), 0.5f); - - // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]); - Ww[2][j] = vmul_n_f32(vadd_f32(vsub_f32(w[0][j], w[1][j]), w[2][j]), 0.5f); - - Ww[3][j] = w[2][j]; - } - - // Compute V = W w WT - for (int i = 0; i < inner_tile_i; i++) - { - V[i][0] = Ww[i][0]; - - // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]); - V[i][1] = vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f); - - // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]); - V[i][2] = vmul_n_f32(vadd_f32(vsub_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f); - - V[i][3] = Ww[i][2]; - } - - // Store the transformed weights - for (int i = 0, m = 0; i < inner_tile_i; i++) - { - for (int j = 0; j < inner_tile_j; j++, m++) - { - vst1_f32(outptr + m*matrix_stride, V[i][j]); - } - } - outptr += 2; - } -#endif // __arm_any__ - for (; channels_remaining; channels_remaining--) - { - // Matrices used and computed in this kernel - float w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j]; - - // Read weights - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 3; 
j++) - { - w[i][j] = *(inptrs[i][j]++); - } - } - - // Compute the matrix W w - for (int j = 0; j < 3; j++) - { - Ww[0][j] = w[0][j]; - Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]); - Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]); - Ww[3][j] = w[2][j]; - } - - // Compute V = W w WT - for (int i = 0; i < inner_tile_i; i++) - { - V[i][0] = Ww[i][0]; - V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]); - V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]); - V[i][3] = Ww[i][2]; - } - - // Store the transformed weights - for (int i = 0, m = 0; i < inner_tile_i; i++) - { - for (int j = 0; j < inner_tile_j; j++, m++) - { - *(outptr + m*matrix_stride) = V[i][j]; - } - } - outptr++; - } - } - } - - template <> - template <> - int WinogradGEMM<2, 2, 3, 3>::WeightsTransform::ops_performed(const KernelShape &shape) - { - const int channel_prod = shape.n_input_channels * shape.n_output_channels; - return 2 * 18 * channel_prod; - } - - template struct WinogradGEMM<2, 2, 3, 3>::WeightsTransform; -} // namespace winograd diff --git a/src/core/NEON/kernels/winograd/transforms/weights_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/winograd/transforms/weights_2x2_5x5_fp32.cpp deleted file mode 100644 index acf6b913f8..0000000000 --- a/src/core/NEON/kernels/winograd/transforms/weights_2x2_5x5_fp32.cpp +++ /dev/null @@ -1,408 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm.hpp" -#include "winograd_gemm.hpp" -#include "transforms/kernel.hpp" - -namespace winograd -{ - template <> - template <> - void WinogradGEMM<2, 2, 5, 5>::WeightsTransform::execute( - const int n_output_channels, - const int n_input_channels, - const float* const input, - float* const output, - const int matrix_stride, - const int matrix_row_stride - ) - { - // Get pointers to each cell of the weight tensor - const auto weight_col_stride = n_input_channels * n_output_channels; - const auto weight_row_stride = 5 * weight_col_stride; - const float *inptrs[5][5]; - for (int i = 0; i < 5; i++) - { - for (int j = 0; j < 5; j++) - { - inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride; - } - } - - // For each input channel - for (int ic = 0; ic < n_input_channels; ic++) - { - float *outptr = output + ic * matrix_row_stride; - - // For each output channel - int channels_remaining = n_output_channels; -#ifdef __aarch64__ - for (; channels_remaining >= 4; channels_remaining -= 4) - { - // Matrices used and computed in this kernel - float32x4_t w[5][5], Ww[6][5], V[6][6]; - - // Read weights - for (int i = 0; i < 5; i++) - { - for (int j = 0; j < 5; j++) - { - w[i][j] = vld1q_f32(inptrs[i][j]); - inptrs[i][j] += 4; - } - } - - // Compute the matrix W w - for (int j = 0; j < 5; j++) - { - // Ww[0][j] = w[0][j]/4.0f; - Ww[0][j] = vmulq_n_f32(w[0][j], 1.0f/4.0f); - - // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f; - Ww[1][j] = 
vmulq_n_f32( - vaddq_f32( - vaddq_f32( - vaddq_f32(w[1][j], w[0][j]), - vaddq_f32(w[3][j], w[2][j]) - ), - w[4][j] - ), - -1.0f/6.0f - ); - - // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f; - // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f; - Ww[2][j] = vmulq_n_f32( - vsubq_f32( - vaddq_f32( - vsubq_f32(w[1][j], w[0][j]), - vsubq_f32(w[3][j], w[2][j]) - ), - w[4][j] - ), - 1.0f/6.0f - ); - - // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f; - Ww[3][j] = vmulq_n_f32( - vmlaq_n_f32( - vaddq_f32( - vaddq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)), - vaddq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j]) - ), - w[4][j], 2.0f - ), - 1.0f/3.0f - ); - - // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f; - Ww[4][j] = vmulq_n_f32( - vmlaq_n_f32( - vaddq_f32( - vsubq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)), - vsubq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j]) - ), - w[4][j], 2.0f - ), - 1.0f/3.0f - ); - - // Ww[5][j] = w[4][j]; - Ww[5][j] = w[4][j]; - } - - // Compute V = W w WT - for (int i = 0; i < 6; i++) - { - // V[i][0] = Ww[i][0]/4.0f; - V[i][0] = vmulq_n_f32(Ww[i][0], 1.0f/4.0f); - - // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f; - V[i][1] = vmulq_n_f32( - vaddq_f32( - vaddq_f32( - vaddq_f32(Ww[i][1], Ww[i][0]), - vaddq_f32(Ww[i][3], Ww[i][2]) - ), - Ww[i][4] - ), - -1.0f/6.0f - ); - - // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f; - // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f; - V[i][2] = vmulq_n_f32( - vsubq_f32( - vaddq_f32( - vsubq_f32(Ww[i][1], Ww[i][0]), - vsubq_f32(Ww[i][3], Ww[i][2]) - ), - Ww[i][4] - ), - 1.0f/6.0f - ); - - // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f; - V[i][3] = vmulq_n_f32( - vmlaq_n_f32( - vaddq_f32( - 
vaddq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)), - vaddq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3]) - ), - Ww[i][4], 2.0f - ), - 1.0f/3.0f - ); - - // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f; - V[i][4] = vmulq_n_f32( - vmlaq_n_f32( - vaddq_f32( - vsubq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)), - vsubq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3]) - ), - Ww[i][4], 2.0f - ), - 1.0f/3.0f - ); - - // V[i][5] = Ww[i][4]; - V[i][5] = Ww[i][4]; - } - - // Store the transformed weights - for (int i = 0, m = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++, m++) - { - vst1q_f32(outptr + m*matrix_stride, V[i][j]); - } - } - outptr += 4; - } -#endif // __aarch64__ -#ifdef __arm_any__ - for (; channels_remaining >= 2; channels_remaining -= 2) - { - // Matrices used and computed in this kernel - float32x2_t w[5][5], Ww[6][5], V[6][6]; - - // Read weights - for (int i = 0; i < 5; i++) - { - for (int j = 0; j < 5; j++) - { - w[i][j] = vld1_f32(inptrs[i][j]); - inptrs[i][j] += 2; - } - } - - // Compute the matrix W w - for (int j = 0; j < 5; j++) - { - // Ww[0][j] = w[0][j]/4.0f; - Ww[0][j] = vmul_n_f32(w[0][j], 1.0f/4.0f); - - // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f; - Ww[1][j] = vmul_n_f32( - vadd_f32( - vadd_f32( - vadd_f32(w[1][j], w[0][j]), - vadd_f32(w[3][j], w[2][j]) - ), - w[4][j] - ), - -1.0f/6.0f - ); - - // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f; - // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f; - Ww[2][j] = vmul_n_f32( - vsub_f32( - vadd_f32( - vsub_f32(w[1][j], w[0][j]), - vsub_f32(w[3][j], w[2][j]) - ), - w[4][j] - ), - 1.0f/6.0f - ); - - // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f; - Ww[3][j] = vmul_n_f32( - vmla_n_f32( - vadd_f32( - vadd_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)), - 
vadd_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j]) - ), - w[4][j], 2.0f - ), - 1.0f/3.0f - ); - - // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f; - Ww[4][j] = vmul_n_f32( - vmla_n_f32( - vadd_f32( - vsub_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)), - vsub_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j]) - ), - w[4][j], 2.0f - ), - 1.0f/3.0f - ); - - // Ww[5][j] = w[4][j]; - Ww[5][j] = w[4][j]; - } - - // Compute V = W w WT - for (int i = 0; i < 6; i++) - { - // V[i][0] = Ww[i][0]/4.0f; - V[i][0] = vmul_n_f32(Ww[i][0], 1.0f/4.0f); - - // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f; - V[i][1] = vmul_n_f32( - vadd_f32( - vadd_f32( - vadd_f32(Ww[i][1], Ww[i][0]), - vadd_f32(Ww[i][3], Ww[i][2]) - ), - Ww[i][4] - ), - -1.0f/6.0f - ); - - // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f; - // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f; - V[i][2] = vmul_n_f32( - vsub_f32( - vadd_f32( - vsub_f32(Ww[i][1], Ww[i][0]), - vsub_f32(Ww[i][3], Ww[i][2]) - ), - Ww[i][4] - ), - 1.0f/6.0f - ); - - // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f; - V[i][3] = vmul_n_f32( - vmla_n_f32( - vadd_f32( - vadd_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)), - vadd_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3]) - ), - Ww[i][4], 2.0f - ), - 1.0f/3.0f - ); - - // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f; - V[i][4] = vmul_n_f32( - vmla_n_f32( - vadd_f32( - vsub_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)), - vsub_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3]) - ), - Ww[i][4], 2.0f - ), - 1.0f/3.0f - ); - - // V[i][5] = Ww[i][4]; - V[i][5] = Ww[i][4]; - } - - // Store the transformed weights - for (int i = 0, m = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++, m++) - { - vst1_f32(outptr + m*matrix_stride, V[i][j]); - } - } 
- outptr += 2; - } -#endif // __arm_any__ - for (; channels_remaining; channels_remaining--) - { - // Matrices used and computed in this kernel - float w[5][5], Ww[6][5], V[6][6]; - - // Read weights - for (int i = 0; i < 5; i++) - { - for (int j = 0; j < 5; j++) - { - w[i][j] = *(inptrs[i][j]++); - } - } - - // Compute the matrix W w - for (int j = 0; j < 5; j++) - { - Ww[0][j] = w[0][j]/4.0f; - Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f; - Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f; - Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f; - Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f; - Ww[5][j] = w[4][j]; - } - - // Compute V = W w WT - for (int i = 0; i < 6; i++) - { - V[i][0] = Ww[i][0]/4.0f; - V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f; - V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f; - V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f; - V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f; - V[i][5] = Ww[i][4]; - } - - // Store the transformed weights - for (int i = 0, m = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++, m++) - { - *(outptr + m*matrix_stride) = V[i][j]; - } - } - outptr++; - } - } - } - - template <> - template <> - int WinogradGEMM<2, 2, 5, 5>::WeightsTransform::ops_performed(const KernelShape &shape) - { - return 0; // TODO - } - - template class WinogradGEMM<2, 2, 5, 5>::WeightsTransform; -} // namespace winograd diff --git a/src/core/NEON/kernels/winograd/transforms/weights_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/winograd/transforms/weights_4x4_3x3_fp32.cpp deleted file mode 100644 index de659c38e0..0000000000 --- a/src/core/NEON/kernels/winograd/transforms/weights_4x4_3x3_fp32.cpp +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm.hpp" -#include "winograd_gemm.hpp" -#include "transforms/kernel.hpp" - -namespace winograd -{ - /* Float implementation for kernel transform F(4x4, 3x3) */ - template <> - template <> - void WinogradGEMM<4, 4, 3, 3>::WeightsTransform::execute( - const int n_output_channels, - const int n_input_channels, - const float* const input, // NOTE: Data in HWIO order - float* const output, - const int matrix_stride, - const int matrix_row_stride - ) - { - // Get pointers to each cell of the weight tensor - const auto weight_col_stride = n_input_channels * n_output_channels; - const auto weight_row_stride = 3 * weight_col_stride; - const float *inptrs[3][3]; - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 3; j++) - { - inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride; - } - } - - // For each input channel - for (int ic = 0; ic < n_input_channels; ic++) - { - float *outptr = output + ic * matrix_row_stride; - - // For each output channel - int channels_remaining = n_output_channels; -#ifdef __aarch64__ - for (; channels_remaining >= 4; channels_remaining -= 4) - { - // Matrices used and computed in this kernel - float32x4_t w[3][3], Ww[6][3], V[6][6]; - - // Read weights - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 3; j++) - { - w[i][j] = vld1q_f32(inptrs[i][j]); - inptrs[i][j] += 4; - } - } - - // Compute the matrix W w - for (int j = 0; j < 3; j++) - { - // Ww[0][j] = 6*w[0][j]; - Ww[0][j] = vmulq_n_f32(w[0][j], 6.0); - - // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j]; - Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), -4.0); - - // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j]; - Ww[2][j] = vmulq_n_f32(vsubq_f32(vsubq_f32(w[1][j], w[0][j]), w[2][j]), 4.0); - - // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j]; - Ww[3][j] = vmlaq_n_f32(vmlaq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f); - - // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j]; - Ww[4][j] = vmlaq_n_f32(vmlsq_n_f32(w[0][j], 
w[1][j], 2.0f), w[2][j], 4.0f); - - // Ww[5][j] = 24*w[2][j]; - Ww[5][j] = vmulq_n_f32(w[2][j], 24.0f); - } - - // Compute V = W w WT - for (int i = 0; i < 6; i++) - { - const float recip576 = 1.0f / 576.0f; - - // V[i][0] = 6*Ww[i][0]; - V[i][0] = vmulq_n_f32(vmulq_n_f32(Ww[i][0], 6.0), recip576); - - // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2]; - V[i][1] = vmulq_n_f32(vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576); - - // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2]; - V[i][2] = vmulq_n_f32(vmulq_n_f32(vsubq_f32(vsubq_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576); - - // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2]; - V[i][3] = vmulq_n_f32(vmlaq_n_f32(vmlaq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576); - - // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2]; - V[i][4] = vmulq_n_f32(vmlaq_n_f32(vmlsq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576); - - // V[i][5] = 24*Ww[i][2]; - V[i][5] = vmulq_n_f32(vmulq_n_f32(Ww[i][2], 24.0f), recip576); - } - - // Store the transformed weights - for (int i = 0, m = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++, m++) - { - vst1q_f32(outptr + m*matrix_stride, V[i][j]); - } - } - outptr += 4; - } -#endif // __aarch64__ -#ifdef __arm_any__ - for (; channels_remaining >= 2; channels_remaining -= 2) - { - // Matrices used and computed in this kernel - float32x2_t w[3][3], Ww[6][3], V[6][6]; - - // Read weights - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 3; j++) - { - w[i][j] = vld1_f32(inptrs[i][j]); - inptrs[i][j] += 2; - } - } - - // Compute the matrix W w - for (int j = 0; j < 3; j++) - { - // Ww[0][j] = 6*w[0][j]; - Ww[0][j] = vmul_n_f32(w[0][j], 6.0); - - // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j]; - Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), -4.0); - - // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j]; - Ww[2][j] = vmul_n_f32(vsub_f32(vsub_f32(w[1][j], w[0][j]), w[2][j]), 4.0); - - // Ww[3][j] = 
1*w[0][j] + 2*w[1][j] + 4*w[2][j]; - Ww[3][j] = vmla_n_f32(vmla_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f); - - // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j]; - Ww[4][j] = vmla_n_f32(vmls_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f); - - // Ww[5][j] = 24*w[2][j]; - Ww[5][j] = vmul_n_f32(w[2][j], 24.0f); - } - - // Compute V = W w WT - for (int i = 0; i < 6; i++) - { - const float recip576 = 1.0f / 576.0f; - - // V[i][0] = 6*Ww[i][0]; - V[i][0] = vmul_n_f32(vmul_n_f32(Ww[i][0], 6.0), recip576); - - // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2]; - V[i][1] = vmul_n_f32(vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576); - - // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2]; - V[i][2] = vmul_n_f32(vmul_n_f32(vsub_f32(vsub_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576); - - // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2]; - V[i][3] = vmul_n_f32(vmla_n_f32(vmla_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576); - - // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2]; - V[i][4] = vmul_n_f32(vmla_n_f32(vmls_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576); - - // V[i][5] = 24*Ww[i][2]; - V[i][5] = vmul_n_f32(vmul_n_f32(Ww[i][2], 24.0f), recip576); - } - - // Store the transformed weights - for (int i = 0, m = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++, m++) - { - vst1_f32(outptr + m*matrix_stride, V[i][j]); - } - } - outptr += 2; - } -#endif // __arm_any__ - for (; channels_remaining; channels_remaining--) - { - // Matrices used and computed in this kernel - float w[3][3], Ww[6][3], V[6][6]; - - // Read weights - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 3; j++) - { - w[i][j] = *(inptrs[i][j]++); - } - } - - // Compute the matrix W w - for (int j = 0; j < 3; j++) - { - Ww[0][j] = 6*w[0][j]; - Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j]; - Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j]; - Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j]; - Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j]; - 
Ww[5][j] = 24*w[2][j]; - } - - // Compute V = W w WT - for (int i = 0; i < 6; i++) - { - V[i][0] = ( 6*Ww[i][0]) / 576.0; - V[i][1] = (-4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2]) / 576.0; - V[i][2] = (-4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2]) / 576.0; - V[i][3] = ( 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2]) / 576.0; - V[i][4] = ( 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2]) / 576.0; - V[i][5] = (24*Ww[i][2]) / 576.0; - } - - // Store the transformed weights - for (int i = 0, m = 0; i < 6; i++) - { - for (int j = 0; j < 6; j++, m++) - { - *(outptr + m*matrix_stride) = V[i][j]; - } - } - outptr++; - } - } - } - - template <> - template <> - int WinogradGEMM<4, 4, 3, 3>::WeightsTransform::ops_performed(const KernelShape &shape) - { - const int channel_prod = shape.n_input_channels * shape.n_output_channels; - return 9 * 16 * channel_prod; - } - - template struct WinogradGEMM<4, 4, 3, 3>::WeightsTransform; -} diff --git a/src/core/NEON/kernels/winograd/utils.cpp b/src/core/NEON/kernels/winograd/utils.cpp deleted file mode 100644 index 24d0386c76..0000000000 --- a/src/core/NEON/kernels/winograd/utils.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include - -double TimeInUs(void) -{ -#ifdef CYCLE_PROFILING - timespec t; - clock_gettime(CLOCK_REALTIME, &t); - return 1e6*t.tv_sec + 1e-3*t.tv_nsec; -#else - return 0; -#endif -} - -void PrintMatrix(const float* const m, const int M, const int N, const int row_stride) -{ - for (int i = 0; i < M; i++) - { - for (int j = 0; j < N; j++) - { - printf("%.3f ", m[i*row_stride + j]); - } - printf("\n"); - } - printf("\n"); -} diff --git a/src/core/NEON/kernels/winograd/winograd_gemm.cpp b/src/core/NEON/kernels/winograd/winograd_gemm.cpp deleted file mode 100644 index 05426450a6..0000000000 --- a/src/core/NEON/kernels/winograd/winograd_gemm.cpp +++ /dev/null @@ -1,568 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "winograd_gemm.hpp" -#include "batched_blocked_gemm.hpp" -using namespace winograd; - -/** Get the output shape of a convolution. */ -template -template -Tensor4DShape WinogradGEMM::Convolution::get_output_shape( - const KernelShape &kernel_shape, - const Tensor4DShape &in_shape, - const PaddingType padding -) -{ - return Tensor4DShape { - in_shape.n_batches, - (padding == PADDING_SAME) ? in_shape.n_rows : in_shape.n_rows - (kernel_rows - 1), - (padding == PADDING_SAME) ? in_shape.n_cols : in_shape.n_cols - (kernel_cols - 1), - kernel_shape.n_output_channels, - in_shape.ordering - }; -} - -/* Get the memory required to transform the kernel. - */ -template -template -size_t WinogradGEMM::Convolution::get_kernel_transform_working_size(const KernelShape &shape) -{ - if (shape.ordering == HWIO) - { - // Kernel is already in the correct order, so no additional memory is - // required. - return 0; - } - else - { - // Need to re-order the kernel into HWIO form, require enough space to - // represent the tensor. - return sizeof(TIn) * shape.size(); - } -} - -/** Get the memory required to store the kernel transformed into the - * Winograd domain. 
- */ -template -template -size_t WinogradGEMM::Convolution::get_kernel_storage_size(const KernelShape &shape) -{ - return N_GEMMS * get_kernel_matrix_size(shape); -} - - -template -template -size_t WinogradGEMM::Convolution::get_input_storage_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding -) -{ - return N_GEMMS * get_input_matrix_size(kernel_shape, input_shape, padding); -} - - -template -template -size_t WinogradGEMM::Convolution::get_output_storage_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding -) -{ - return N_GEMMS * get_output_matrix_size(kernel_shape, input_shape, padding); -} - - -/** Get the memory required to apply a Winograd operator to some input. - */ -template -template -size_t WinogradGEMM::Convolution::get_working_space_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type -) -{ - const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type); - - // Get the memory required to store the matrices - const size_t matrix_sizes = N_GEMMS * ( - get_input_matrix_size(kernel_shape, input_shape, padding_type) + - get_output_matrix_size(kernel_shape, input_shape, padding_type) - ); - - // Add additional space to re-order the input and output if the input tensor - // is not in NHWC format. - if (input_shape.ordering == NHWC) - { - return matrix_sizes; // No extra spacing required - } - else // NCHW, must reorder the input and output tensors - { - // We only need to re-order the input or output at any one time, so request - // enough memory to do the largest of these. - const size_t extra_memory = std::max( - sizeof(TIn) * input_shape.size(), - sizeof(TOut) * output_shape.size() - ); - return matrix_sizes + extra_memory; - } -} - - -/* Get the memory required by a single "input" matrix. 
- */ -template -template -size_t WinogradGEMM::Convolution::get_input_matrix_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type -) -{ - return get_input_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TIn); -} - -template -template -int WinogradGEMM::Convolution::get_input_matrix_stride( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type -) -{ - // Compute shape for the GEMM - const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type); - const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows); - const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols); - const int M = roundup(input_shape.n_batches * tile_rows * tile_cols, M_BLOCK); - const int K = kernel_shape.n_input_channels; - - return M * K; -} - - -/* Get the memory required by a single "output" matrix. - */ -template -template -size_t WinogradGEMM::Convolution::get_output_matrix_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type -) -{ - return get_output_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TOut); -} - - -template -template -int WinogradGEMM::Convolution::get_output_matrix_stride( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type -) -{ - // Compute shape for the GEMM - const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type); - const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows); - const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols); - const int M = roundup(tile_rows * tile_cols, M_BLOCK); - const int N = roundup(kernel_shape.n_output_channels, N_BLOCK); - - return input_shape.n_batches * M * N; -} - - -/* Get the memory required by a single "kernel" matrix. 
- */ -template -template -size_t WinogradGEMM::Convolution::get_kernel_matrix_size(const KernelShape &shape) -{ - return sizeof(TIn) * get_kernel_matrix_stride(shape); -} - -template -template -int WinogradGEMM::Convolution::get_kernel_matrix_stride(const KernelShape &shape) -{ - const int K = shape.n_input_channels; - const int N = roundup(shape.n_output_channels, N_BLOCK); - return K * N; -} - - -/** Create a new Winograd operator. */ -template -template -WinogradGEMM::Convolution::Convolution( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding, - void *kernel_storage -) : kernel_shape(kernel_shape), // Store the kernel shape - kernel_matrix_row_stride(roundup(kernel_shape.n_output_channels, N_BLOCK)), - manage_kernel_storage(kernel_storage == NULL), - _kernel_storage(manage_kernel_storage ? - ALLOCATE(get_kernel_storage_size(kernel_shape)) : - kernel_storage), - input_shape(input_shape), - padding(padding), - output_shape(get_output_shape(kernel_shape, input_shape, padding)), - tile_rows(iceildiv(output_shape.n_rows, output_tile_rows)), - tile_cols(iceildiv(output_shape.n_cols, output_tile_cols)), - M(input_shape.n_batches * tile_rows * tile_cols), - K(kernel_shape.n_input_channels), - N(kernel_shape.n_output_channels), - prof() -{ - // Create pointers to the kernel matrices - const int kernel_matrix_size_bytes = get_kernel_matrix_size(kernel_shape); - int8_t* const ks_bytes = reinterpret_cast(_kernel_storage); - for (int i = 0; i < N_GEMMS; i++) { - kernel_matrices[i] = reinterpret_cast( - ks_bytes + i*kernel_matrix_size_bytes); - } -} - - -/** Create a new Winograd operator and initialise the weights. 
*/ -template -template -WinogradGEMM::Convolution::Convolution( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding, - const TIn* const kernel, - void *kernel_storage, - void *transform_working_space -) : Convolution(kernel_shape, input_shape, padding, kernel_storage) -{ - transform_weights(kernel, transform_working_space); -} - - -/** Clean up a convolution engine. */ -template -template -WinogradGEMM:: -Convolution::~Convolution() -{ - // If we were responsible for managing kernel storage ensure that it is - // freed. - if (manage_kernel_storage) - { - free(_kernel_storage); - } -} - - -/** Transform weights into the Winograd domain and store them for later use/reuse. */ -template -template -template -void WinogradGEMM:: -Convolution::transform_weights( - const TIn* const kernel, - void *transform_working_space -) -{ - // Allocate working space if it is required - bool allocated_working_space = false; - if (transform_working_space == NULL && // If no memory has been provided - get_kernel_transform_working_size(kernel_shape) != 0) // And we need the space - { - allocated_working_space = true; - transform_working_space = ALLOCATE( - get_kernel_transform_working_size(kernel_shape) - ); - } - - // The transformation methods only work on weights laid out in HWIO form, if - // the weights are not in this form then we need to re-order them. 
- const TIn *kernel_hwio = kernel; - if (kernel_shape.ordering != HWIO) - { - kernel_hwio = reinterpret_cast(transform_working_space); - - // Re-order the weights from OIHW to HWIO - this->prof( - "Weight reorder", - [&kernel, &kernel_hwio, this] () { - reorder::ofm_ifm_h_w_to_h_w_ifm_ofm( - kernel, const_cast(kernel_hwio), - kernel_shape.n_output_channels, - kernel_shape.n_input_channels, - kernel_shape.n_rows, - kernel_shape.n_cols - ); - }, - kernel_shape.size() * sizeof(TIn), - 0, - kernel_shape.size() * sizeof(TIn) - ); - } - - const int kernel_matrix_size_bytes = get_kernel_matrix_size(kernel_shape); - WeightsTransformT weights_transform( - kernel_hwio, kernel_matrices[0], - kernel_matrix_size_bytes / sizeof(TIn), - kernel_matrix_row_stride, - kernel_shape.n_output_channels, - kernel_shape.n_input_channels - ); - - // Transform the weights into the Winograd domain - auto kernel_prep = [&] () - { - weights_transform.run(0, weights_transform.get_window()); - }; - - prof( - "Kernel Prep", kernel_prep, - WeightsTransformT::bytes_read(kernel_shape), - WeightsTransformT::ops_performed(kernel_shape), - WeightsTransformT::bytes_written(kernel_shape) - ); - - // Free memory if we allocated it - if (allocated_working_space) - { - free(transform_working_space); - } -} - - -/** Perform a convolution. 
*/ -template -template -void WinogradGEMM:: -Convolution::execute( - TOut* const output, - const TIn* const input, - const TOut* const biases, - void *working_space, - const int n_threads -) -{ - const auto padding_type = padding; - const auto input_shape = this->input_shape; - - // Allocate working space if none has been provided - const bool manage_working_space = (working_space == NULL); - if (manage_working_space) - { - const size_t ws_size = get_working_space_size( - kernel_shape, input_shape, padding_type - ); - working_space = ALLOCATE(ws_size * sizeof(int8_t)); - memset(working_space, 0x00, ws_size); - } - int8_t* const ws_bytes = reinterpret_cast(working_space); - - // Split the working space into that required for 16 input matrices and - // output matrices. - TIn *input_matrices[N_GEMMS]; - TOut *output_matrices[N_GEMMS]; - const int in_matrix_stride_bytes = get_input_matrix_size(kernel_shape, input_shape, padding_type); - const int out_matrix_stride_bytes = get_output_matrix_size(kernel_shape, input_shape, padding_type); - - for (int i = 0; i < N_GEMMS; i++) - { - input_matrices[i] = reinterpret_cast( - ws_bytes + i*in_matrix_stride_bytes); - output_matrices[i] = reinterpret_cast( - ws_bytes + N_GEMMS*in_matrix_stride_bytes + i*out_matrix_stride_bytes); - } - - // If we need to re-order the input and output tensors then the final chunk - // of the working space can be used for this purpose. - // TODO - Overlay the input reorder on top of the output matrices - // - Overlay the output reorder on top of the input matrices - // Reorder the input input form if it was not provided in this ordering. 
- const TIn* input_nhwc = input; - if (input_shape.ordering == NCHW) - { - input_nhwc = reinterpret_cast( - ws_bytes + N_GEMMS*(in_matrix_stride_bytes + out_matrix_stride_bytes) - ); - - this->prof( - "NCHW -> NHWC", - [input, input_shape, input_nhwc] () { - reorder::nchw_to_nhwc( - input, const_cast(input_nhwc), - input_shape.n_batches, - input_shape.n_channels, - input_shape.n_rows, - input_shape.n_cols - ); - }, - input_shape.size(), 0, input_shape.size() - ); - } - - // Compute shape for the GEMM - const auto output_shape = this->output_shape; - int M = this->M; - int K = this->K; - int N = this->N; - - const int in_matrix_row_stride = K; - const int out_matrix_row_stride = kernel_matrix_row_stride; - - InputTransform input_transform( - input_nhwc, - input_shape.n_batches, - input_shape.n_rows, - input_shape.n_cols, - input_shape.n_channels, - padding_type, - input_matrices[0], - in_matrix_stride_bytes / sizeof(TIn), - in_matrix_row_stride - ); - - // Transform the input into the Winograd domain - auto input_prep = [&] () { - input_transform.run(0, input_transform.get_window()); - }; - prof( - "Input Prep", input_prep, - InputTransform::bytes_read(input_shape), - InputTransform::ops_performed(input_shape), - InputTransform::bytes_written(input_shape) - ); - - // Perform the GEMMs - const int kernel_matrix_stride_bytes = get_kernel_matrix_size(kernel_shape); - BatchedBlockedGemm gemms( - N_GEMMS, M, K, N, - in_matrix_stride_bytes / sizeof(TIn), - in_matrix_row_stride, - kernel_matrix_stride_bytes / sizeof(TIn), - kernel_matrix_row_stride, - out_matrix_stride_bytes / sizeof(TOut), - out_matrix_row_stride, - input_matrices[0], - kernel_matrices[0], - output_matrices[0] - ); - for (unsigned int i = 0; i < gemms.get_window(); i++) - { - auto run_gemm = [&] () { gemms.run(i, i+1); }; - prof("GEMM", run_gemm, 0, 0, 0); - } - - // If the output tensor needs to be in NCHW form then store the NHWC output - // tensor in temporary storage and then reorder. 
If the output tensor needs - // to be in NHWC then just write straight to the output tensor. - TOut *output_nhwc = output; - if (input_shape.ordering == NCHW) - { - output_nhwc = reinterpret_cast( - ws_bytes + N_GEMMS*(in_matrix_stride_bytes + out_matrix_stride_bytes) - ); - } - - // Transform the output tensor from the Winograd domain to the spatial - // domain. - OutputTransform output_transform( - output_matrices[0], - out_matrix_stride_bytes / sizeof(TOut), - out_matrix_row_stride, - biases, - output_nhwc, - output_shape.n_batches, - output_shape.n_rows, - output_shape.n_cols, - output_shape.n_channels - ); - auto output_prep = [&] () { - output_transform.run(0, output_transform.get_window()); - }; - prof( - "Output Comp", output_prep, - OutputTransform::bytes_read(output_shape), - OutputTransform::ops_performed(output_shape), - OutputTransform::bytes_written(output_shape) - ); - - // Reorder the output tensor if it is required to be in NCHW form. - if (input_shape.ordering == NCHW) - { - prof( - "NHWC -> NCHW", - [output_nhwc, output_shape, output] () { - reorder::nhwc_to_nchw( - output_nhwc, output, - output_shape.n_batches, - output_shape.n_rows, - output_shape.n_cols, - output_shape.n_channels - ); - }, - output_shape.size(), 0, output_shape.size() - ); - } - - // Free working space if we were responsible for allocating it - if (manage_working_space) - { - free(working_space); - } -} - - -/** Perform a convolution. 
*/ -template -template -void WinogradGEMM:: -Convolution::execute( - TOut* const output, - const TIn* const input, - const TOut* const biases, - const int n_threads -) -{ - execute(output, input, biases, NULL, n_threads); -} - - -// Instantiate required implementations -template class WinogradGEMM<2, 2, 3, 3>::Convolution; -template class WinogradGEMM<4, 4, 3, 3>::Convolution; - -template class WinogradGEMM<2, 2, 5, 5>::Convolution; diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index 83a843de58..f4b45532cf 100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -250,6 +250,21 @@ std::string arm_compute::lower_string(const std::string &val) return res; } +PadStrideInfo arm_compute::calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info) +{ + const auto &strides = conv_info.stride(); + const int out_width = std::ceil(float(input_shape.x()) / float(strides.first)); + const int out_height = std::ceil(float(input_shape.y()) / float(strides.second)); + const int pad_width = ((out_width - 1) * strides.first + weights_shape.x() - input_shape.x()); + const int pad_height = ((out_height - 1) * strides.second + weights_shape.y() - input_shape.y()); + const int same_pad_left = pad_width / 2; + const int same_pad_top = pad_height / 2; + const int same_pad_right = pad_width - same_pad_left; + const int same_pad_bottom = pad_height - same_pad_top; + + return PadStrideInfo(strides.first, strides.second, same_pad_left, same_pad_right, same_pad_top, same_pad_bottom, DimensionRoundingType::CEIL); +} + TensorShape arm_compute::deconvolution_output_shape(const std::pair &out_dims, TensorShape input, TensorShape weights) { TensorShape out_shape(input); diff --git a/src/graph/operations/NESimpleOperations.cpp b/src/graph/operations/NESimpleOperations.cpp index 88bf3ec0a0..5a00e230ea 100644 --- a/src/graph/operations/NESimpleOperations.cpp +++ b/src/graph/operations/NESimpleOperations.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM 
Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -149,13 +149,23 @@ REGISTER_SIMPLE_OPERATION(NEDepthwiseConvolutionOperation, NEON, OperationType:: auto *biases = ctx.num_inputs() == 3 ? dynamic_cast(ctx.input(2)) : nullptr; auto *out = dynamic_cast(ctx.output(0)); const auto conv_info = ctx.parameter("ConvolutionInfo"); + const auto opt3x3 = ctx.parameter("Optimized3x3"); // Create and configure function std::unique_ptr func; - // TODO (COMPMID-769): Add support for asymmetric padding in NEDepthwiseConvolutionLayer3x3 to enable opt3x3 support - auto depwthwise_conv = arm_compute::support::cpp14::make_unique(); - depwthwise_conv->configure(in, weights, biases, out, conv_info); - func = std::move(depwthwise_conv); + bool run_3x3_opt = opt3x3 && weights->info()->dimension(0) == 3; + if(run_3x3_opt) + { + auto depwthwise_conv = arm_compute::support::cpp14::make_unique(); + depwthwise_conv->configure(in, weights, biases, out, conv_info); + func = std::move(depwthwise_conv); + } + else + { + auto depwthwise_conv = arm_compute::support::cpp14::make_unique(); + depwthwise_conv->configure(in, weights, biases, out, conv_info); + func = std::move(depwthwise_conv); + } // Log info ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEDepthwiseConvolutionLayer" diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index 1f3e5d1192..d35e3e6026 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -33,9 +33,11 @@ using namespace arm_compute; using namespace arm_compute::misc; +using namespace arm_compute::misc::shape_calculator; NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3() - : _kernel(), _output_stage_kernel(), _border_handler(), _accumulator(), _has_bias(false), _is_quantized(false) + : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), 
_permute_weights(), _permute_output(), _accumulator(), _input_nhwc(), _weights_hwio(), _output_nhwc(), _has_bias(false), + _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false) { } @@ -48,20 +50,49 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); _has_bias = biases != nullptr; + _is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(), + conv_info, + input->info()->data_type()); + _are_weights_reshaped = false; - // Allocate the intermediate accumulator tensor in case of fixed point input - if(_is_quantized) + if(_is_optimized) { - _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::S32)); - _accumulator.info()->set_quantization_info(input->info()->quantization_info()); - zero_value = PixelValue(static_cast(input->info()->quantization_info().offset)); + // Configure the function to transform the input tensor from NCHW -> NHWC + _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U)); + + // Configure the function to transform the weights tensor from IHW -> HWI + _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U)); + + // Configure optimized depthwise + _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, DataLayout::NHWC); + + // Configure the function to transform the convoluted output to ACL's native ordering format NCHW + _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U)); + + // Allocate tensors + _input_nhwc.allocator()->allocate(); + _weights_hwio.allocator()->allocate(); + _output_nhwc.allocator()->allocate(); + + // Create convolver (deferred) + _dwc_kernel.generate_convolver(); } + else + { + // Allocate the intermediate accumulator tensor in case of fixed point input + if(_is_quantized) + { + 
_accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::S32)); + _accumulator.info()->set_quantization_info(input->info()->quantization_info()); + zero_value = PixelValue(static_cast(input->info()->quantization_info().offset)); + } - // Configure depthwise convolution kernel - _kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info); + // Configure depthwise convolution kernel + _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info); - // Configure border handler - _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, zero_value); + // Configure border handler + _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value); + } // Configure biases accumulation if(_has_bias || _is_quantized) @@ -83,8 +114,35 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we void NEDepthwiseConvolutionLayer3x3::run() { - NEScheduler::get().schedule(&_border_handler, Window::DimX); - NEScheduler::get().schedule(&_kernel, Window::DimX); + // Permute weights in HWIO format if the optimized kernel will be executedd + if(!_are_weights_reshaped && _is_optimized) + { + _are_weights_reshaped = true; + _permute_weights.run(); + } + + // Handle input + if(_is_optimized) + { + // Permute input to NHWC format execution + _permute_input.run(); + } + else + { + // Fill border in NCHW format execution + NEScheduler::get().schedule(&_border_handler, Window::DimX); + } + + // Execute depthwise convolution + NEScheduler::get().schedule(&_dwc_kernel, Window::DimX); + + // Permute output to ACL's native NCHW format in case of NHWC execution + if(_is_optimized) + { + _permute_output.run(); + } + + // Add biases if(_has_bias || _is_quantized) { NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX); diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp index 
dd878ab18a..d2d40dfcb0 100644 --- a/src/runtime/NEON/functions/NEWinogradLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradLayer.cpp @@ -28,7 +28,7 @@ #include "arm_compute/runtime/NEON/NEScheduler.h" #include "support/ToolchainSupport.h" -#include "arm_compute/core/NEON/kernels/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" namespace { -- cgit v1.2.1