aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Pablo Tello <pablo.tello@arm.com> 2018-09-25 16:01:35 +0100
committer Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:55:19 +0000
commit eb027e933758b1e749f0f6bd2817ee8979ef903c (patch)
tree 99543b716f42c040d801a01d7e4674c54a1173a7
parent 4284bfab4594d4babb23123001ef63db7bebeccb (diff)
download ComputeLibrary-eb027e933758b1e749f0f6bd2817ee8979ef903c.tar.gz
COMPMID-1600: Reduce number of tile specialisations.
Change-Id: I4d06eca9404ea6d3df9d0ca52f5d6f5421ab7116
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/150117
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
-rw-r--r-- arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp 98
-rw-r--r-- arm_compute/core/NEON/kernels/convolution/winograd/winograd_input_transform.hpp 127
-rw-r--r-- src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp 179
-rw-r--r-- src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp 191
-rw-r--r-- src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp 400
5 files changed, 419 insertions, 576 deletions
diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp
index 473a13c3b0..b813bbb25c 100644
--- a/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp
@@ -155,15 +155,15 @@ namespace winograd
T* const outptr = matrix_base + tile_j*matrix_row_stride;
// Apply the specific tile processing function
- const int f_pad_top = iceildiv(pad_top, 2);
- const int f_pad_left = iceildiv(t_pad_left, 2);
- tile_fns[f_pad_top][f_pad_left][pad_bottom][t_pad_right](
+ const typename Tiles::TileFn tilefn = Tiles::get_tile_specialization(
+ pad_top, t_pad_left, pad_bottom, t_pad_right
+ );
+
+ tilefn(
n_channels,
- input_base_col,
- input_row_stride,
- input_col_stride,
- outptr,
- matrix_stride
+ input_base_col, input_row_stride, input_col_stride,
+ outptr, matrix_stride,
+ pad_top, t_pad_left, pad_bottom, t_pad_right
);
}
}
@@ -264,4 +264,86 @@ namespace winograd
matrix_stride, matrix_batch_stride, matrix_row_stride
);
}
+
+ template <int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename T>
+ typename InputTransformImplTiles<KernelRows, KernelCols, InnerTileRows, InnerTileCols, T>::TileFn
+ InputTransformImplTiles<KernelRows, KernelCols, InnerTileRows, InnerTileCols, T>::
+ get_tile_specialization(
+ const int pad_top,
+ const int pad_left,
+ const int pad_bottom,
+ const int pad_right
+ )
+ {
+ if (!(pad_top || pad_left || pad_bottom || pad_right))
+ {
+ // No padding, return unpadded specialisation
+ return tilefn_unpadded;
+ }
+ else if (pad_top && !(pad_left || pad_bottom || pad_right))
+ {
+ // Top padding only
+ const int index = (pad_top - min_pad_top) / (InnerTileRows - overlap_rows);
+ return tilefn_top_padded[index];
+ }
+ else if (!(pad_top) && pad_left && !(pad_bottom || pad_right))
+ {
+ // Left padding only
+ const int index = (pad_left - min_pad_left) / (InnerTileCols - overlap_cols);
+ return tilefn_left_padded[index];
+ }
+ else if (!(pad_top || pad_left) && pad_bottom && !(pad_right))
+ {
+ // Bottom padding only
+ return tilefn_bottom_padded[pad_bottom - 1];
+ }
+ else if (!(pad_top || pad_left || pad_bottom) && pad_right)
+ {
+ // Right padding only
+ return tilefn_right_padded[pad_right - 1];
+ }
+ else
+ {
+ // Combination of paddings, return an unspecialised method
+ return tilefn_generic;
+ }
+ }
+
+ template <int KernelCols, int InnerTileCols, typename T>
+ typename InputTransformImplTiles<1, KernelCols, 1, InnerTileCols, T>::TileFn
+ InputTransformImplTiles<1, KernelCols, 1, InnerTileCols, T>::
+ get_tile_specialization(
+ const int pad_top,
+ const int pad_left,
+ const int pad_bottom,
+ const int pad_right
+ )
+ {
+ (void) pad_top;
+ (void) pad_bottom;
+
+ if (!(pad_left || pad_right))
+ {
+ // No padding, return unpadded specialisation
+ return tilefn_unpadded;
+ }
+ else if (pad_left && !pad_right)
+ {
+ // Left padding only
+ const int index = (pad_left - min_pad_left) / (InnerTileCols - overlap_cols);
+ return tilefn_left_padded[index];
+ }
+ else if (!pad_left && pad_right)
+ {
+ // Right padding only
+ return tilefn_right_padded[pad_right - 1];
+ }
+ else
+ {
+ // Combination of paddings, return an unspecialised method
+ return tilefn_generic;
+ }
+ }
}
+
+
diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_input_transform.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/winograd_input_transform.hpp
index abcda53534..995554d7f2 100644
--- a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_input_transform.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/winograd/winograd_input_transform.hpp
@@ -31,6 +31,109 @@ namespace
{
template <int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename T>
+class InputTransformImplTiles
+{
+ public:
+ /** Method to transform a tile of the input tensor into the Winograd domain. */
+ typedef void (*TileFn)(
+ const int n_channels, /** @param[in] Number of channels in the tensor. */
+ const T* const inptr_base, /** @param[in] Pointer to the base of the input tile. */
+ const int input_row_stride, /** @param[in] Stride between rows of the input tensor. */
+ const int input_col_stride, /** @param[in] Stride between columns of the input tensor. */
+ T* const mptr_base, /** @param[out] Base pointer to transformed input matrices. */
+ const int matrix_stride, /** @param[in] Stride between matrices in the input space. */
+ const int _pad_top, /** @param[in] Top padding for unspecialised tiles. */
+ const int _pad_left, /** @param[in] Left padding for unspecialised tiles. */
+ const int _pad_bottom, /** @param[in] Bottom padding for unspecialised tiles. */
+ const int _pad_right /** @param[in] Right padding for unspecialised tiles. */
+ );
+
+ static TileFn get_tile_specialization(
+ const int pad_top,
+ const int pad_left,
+ const int pad_bottom,
+ const int pad_right
+ );
+
+ // Tile overlaps
+ static constexpr int overlap_rows = KernelRows - 1;
+ static constexpr int overlap_cols = KernelCols - 1;
+
+ private:
+
+ // Maximum padding and number of distinct paddings
+ static constexpr int max_pad_top = KernelRows / 2;
+ static constexpr int min_pad_top = KernelRows % (InnerTileRows - overlap_rows);
+ static constexpr int n_pad_top = iceildiv(max_pad_top, InnerTileRows - overlap_rows);
+
+ static constexpr int max_pad_left = KernelCols / 2;
+ static constexpr int min_pad_left = KernelCols % (InnerTileCols - overlap_cols);
+ static constexpr int n_pad_left = iceildiv(max_pad_left, InnerTileCols - overlap_cols);
+
+ static constexpr int n_pad_bottom = InnerTileRows;
+ static constexpr int n_pad_right = InnerTileCols;
+
+ // Pointers to methods implementing a generically padded tile and a totally unpadded tile.
+ static const TileFn tilefn_generic; /** Generic tile processing function. */
+ static const TileFn tilefn_unpadded; /** Tile processor for unpadded tiles. */
+
+ // Arrays of methods covering tiles which are padded only on a single side.
+ static const TileFn tilefn_top_padded[n_pad_top];
+ static const TileFn tilefn_left_padded[n_pad_left];
+ static const TileFn tilefn_bottom_padded[n_pad_bottom];
+ static const TileFn tilefn_right_padded[n_pad_right];
+};
+
+
+template < int KernelCols, int InnerTileCols, typename T>
+class InputTransformImplTiles<1, KernelCols, 1, InnerTileCols, T>
+{
+ public:
+ /** Method to transform a tile of the input tensor into the Winograd domain. */
+ typedef void (*TileFn)(
+ const int n_channels, /** @param[in] Number of channels in the tensor. */
+ const T* const inptr_base, /** @param[in] Pointer to the base of the input tile. */
+ const int input_row_stride, /** @param[in] Stride between rows of the input tensor. */
+ const int input_col_stride, /** @param[in] Stride between columns of the input tensor. */
+ T* const mptr_base, /** @param[out] Base pointer to transformed input matrices. */
+ const int matrix_stride, /** @param[in] Stride between matrices in the input space. */
+ const int _pad_top, /** @param[in] Top padding for unspecialised tiles. */
+ const int _pad_left, /** @param[in] Left padding for unspecialised tiles. */
+ const int _pad_bottom, /** @param[in] Bottom padding for unspecialised tiles. */
+ const int _pad_right /** @param[in] Right padding for unspecialised tiles. */
+ );
+
+ static TileFn get_tile_specialization(
+ const int pad_top,
+ const int pad_left,
+ const int pad_bottom,
+ const int pad_right
+ );
+
+ // Tile overlaps
+ static constexpr int overlap_rows = 0;
+ static constexpr int overlap_cols = KernelCols - 1;
+
+ private:
+ // Maximum padding and number of distinct paddings
+ static constexpr int max_pad_left = KernelCols / 2;
+ static constexpr int min_pad_left = KernelCols % (InnerTileCols - overlap_cols);
+ static constexpr int n_pad_left = iceildiv(max_pad_left, InnerTileCols - overlap_cols);
+
+ static constexpr int n_pad_right = InnerTileCols;
+
+ // Pointers to methods implementing a generically padded tile and a totally unpadded tile.
+ static const TileFn tilefn_generic; /** Generic tile processing function. */
+ static const TileFn tilefn_unpadded; /** Tile processor for unpadded tiles. */
+
+ // Arrays of methods covering tiles which are padded only on a single side.
+ static const TileFn tilefn_left_padded[n_pad_left];
+ static const TileFn tilefn_right_padded[n_pad_right];
+};
+
+
+
+template <int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename T>
class InputTransformImpl
{
public:
@@ -69,29 +172,13 @@ class InputTransformImpl
const int n_cols
);
- // Tile overlaps
- static constexpr int overlap_rows = KernelRows - 1;
- static constexpr int overlap_cols = KernelCols - 1;
-
- // Maximum padding and number of distinct paddings
- static constexpr int max_pad_top = KernelRows / 2;
- static constexpr int n_pad_top = 1 + iceildiv(max_pad_top, InnerTileRows - overlap_rows);
+ using Tiles = InputTransformImplTiles<KernelRows, KernelCols, InnerTileRows, InnerTileCols, T>;
- static constexpr int max_pad_left = KernelCols / 2;
- static constexpr int n_pad_left = 1 + iceildiv(max_pad_left, InnerTileCols - overlap_cols);
+ static constexpr int overlap_rows = Tiles::overlap_rows;
+ static constexpr int overlap_cols = Tiles::overlap_cols;
- static constexpr int n_pad_bottom = InnerTileRows;
- static constexpr int n_pad_right = InnerTileCols;
-
- /** Process a single tile of the input tensor. */
- template <int pad_top, int pad_left, int pad_bottom, int pad_right>
- static void process_tile(int, const T*, int, int, T*, int);
- // Array of methods to transform tiles of the input tensor.
- typedef void (*TileFn)(int, const T*, int, int, T*, int);
- static const TileFn
- tile_fns[n_pad_top][n_pad_left][n_pad_bottom][n_pad_right];
-};
+ };
template <int KernelRows, int InnerTileRows, typename T>
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp
index 042d4debbc..e66300d39a 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp
@@ -29,19 +29,30 @@
namespace
{
-template <int pad_top, int pad_left, int pad_bottom, int pad_right>
+template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0>
void winograd_input_transform_1x8_fp32_process_tile(
int n_channels,
const float* const input_base,
const int input_row_stride,
const int input_col_stride,
float* const matrix_base,
- const int matrix_stride
+ const int matrix_stride,
+ const int _pad_top,
+ const int _pad_left,
+ const int _pad_bottom,
+ const int _pad_right
)
{
(void) input_row_stride; // No rows over which to stride
+ (void) _pad_top; // Never any top padding
+ (void) _pad_bottom; // Never any bottom padding
+
+ // Extract padding arguments
+ const int pad_left = Specialized ? PadLeft : _pad_left;
+ const int pad_right = Specialized ? PadRight : _pad_right;
+
constexpr int inner_tile_cols = 8;
- constexpr int cells_j = inner_tile_cols - pad_right;
+ const int cells_j = inner_tile_cols - pad_right;
float *outptr = matrix_base;
@@ -162,109 +173,85 @@ void winograd_input_transform_1x8_fp32_process_tile(
namespace winograd
{
template <int x>
-using Transform = InputTransformImpl<1, x, 1, 8, float>;
+using Tiles = InputTransformImplTiles<1, x, 1, 8, float>;
+/*****************************************************************************/
+// 1x3 specialisations
template <>
-const Transform<3>::TileFn
- Transform<3>::tile_fns[n_pad_top][n_pad_left][n_pad_bottom][n_pad_right] =
-{
- {
- {
- {
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 0>,
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 1>,
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 2>,
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 3>,
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 4>,
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 5>,
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 6>,
- }
- },
- {
- {
- winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 0>,
- winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 1>,
- winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 2>,
- winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 3>,
- winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 4>,
- winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 5>,
- winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 6>,
- }
- }
- }
+const Tiles<3>::TileFn Tiles<3>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>;
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>;
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_left_padded[n_pad_left] = {
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 1, 0, 0>,
};
template <>
-const Transform<5>::TileFn
- Transform<5>::tile_fns[n_pad_top][n_pad_left][n_pad_bottom][n_pad_right] =
-{
- {
- {
- {
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 0>,
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 1>,
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 2>,
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 3>,
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 4>,
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 5>,
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 6>,
- }
- },
- {
- {
- winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 0>,
- winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 1>,
- winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 2>,
- winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 3>,
- winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 4>,
- winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 5>,
- winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 6>,
- }
- }
- }
+const Tiles<3>::TileFn Tiles<3>::tilefn_right_padded[n_pad_right] = {
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>,
};
+/*****************************************************************************/
+/*****************************************************************************/
+// 1x5 specialisations
template <>
-const Transform<7>::TileFn
- Transform<7>::tile_fns[n_pad_top][n_pad_left][n_pad_bottom][n_pad_right] =
-{
- {
- {
- {
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 0>,
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 1>,
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 2>,
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 3>,
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 4>,
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 5>,
- winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 6>,
- }
- },
- {
- {
- winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 0>,
- winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 1>,
- winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 2>,
- winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 3>,
- winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 4>,
- winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 5>,
- winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 6>,
- }
- },
- {
- {
- winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 0>,
- winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 1>,
- winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 2>,
- winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 3>,
- winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 4>,
- winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 5>,
- winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 6>,
- }
- }
- }
+const Tiles<5>::TileFn Tiles<5>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>;
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>;
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_left_padded[n_pad_left] = {
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 2, 0, 0>,
};
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_right_padded[n_pad_right] = {
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>,
+};
+/*****************************************************************************/
+
+/*****************************************************************************/
+// 1x7 specialisations
+template <>
+const Tiles<7>::TileFn Tiles<7>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>;
+
+template <>
+const Tiles<7>::TileFn Tiles<7>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>;
+
+template <>
+const Tiles<7>::TileFn Tiles<7>::tilefn_left_padded[n_pad_left] = {
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 1, 0, 0>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 3, 0, 0>,
+};
+
+template <>
+const Tiles<7>::TileFn Tiles<7>::tilefn_right_padded[n_pad_right] = {
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>,
+};
+/*****************************************************************************/
+
+
template class InputTransform<1, 3, 1, 8, float>;
template class InputTransform<3, 1, 8, 1, float>;
template class InputTransform<1, 5, 1, 8, float>;
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp
index a9d5d52d15..4203945dd3 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp
@@ -29,75 +29,36 @@
namespace winograd
{
-using Transform = InputTransformImpl<3, 3, 4, 4, float>;
-
-/*****************************************************************************
-* F(2x2, 3x3) implies the use of a 4x4 input tile. Such tiles can require a
-* variety of padding types. For example, tiles at the top and left of an image
-* can require one row or column of padding on their top and left sides if the
-* padding type is SAME (where X represents a padded value):
-*
-* _______ _______
-* |X X X X| |X X X X|
-* |X | | | . . .
-* |X | | |
-* |X______| |_______|
-* _______
-* |X | .
-* |X | . . . .
-* |X | .
-* |X______|
-*
-* For tiles near the right or bottom of the image it is more complicated. Such
-* tiles might require padding by 0 or 1 rows or columns if the padding type is
-* VALID or 1 or 2 rows or columns if the padding type is SAME:
-*
-* _______ _______ _______ _______
-* |X X X X| |X X X X| |X X X X| |X X X X|
-* |X | | | | X| | X X|
-* |X | | | | X| | X X|
-* |X______| |_______| |______X| |____X_X|
-* _______ _______ _______ _______
-* |X | | | | X| | X X|
-* |X | | | | X| | X X|
-* |X | | | | X| | X X|
-* |X______| |_______| |______X| |____X_X|
-* _______ _______ _______ _______
-* |X | | | | X| | X X|
-* |X | | | | X| | X X|
-* |X | | | | X| | X X|
-* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X|
-* _______ _______ _______ _______
-* |X | | | | X| | X X|
-* |X | | | | X| | X X|
-* |X X X X| |X X X X| |X X X X| |X X X X|
-* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X|
-*
-* Additional tiles are required for especially small input images.
-*
-* Build an array of the specialised methods that deal with each of the
-* different padding combinations which may be required. These padding
-* constraints are the space:
-*
-* Padding top in {0, 1}
-* Padding left in {0, 1}
-* Padding bottom in {0, 1, 2}
-* Padding right in {0, 1, 2}
-*/
-template <>
-template <int pad_top, int pad_left, int pad_bottom, int pad_right>
-void Transform::process_tile(
+using Tiles = InputTransformImplTiles<3, 3, 4, 4, float>;
+
+namespace
+{
+
+
+template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0>
+void winograd_input_transform_4x4_fp32_process_tile(
int n_channels,
const float* const input_base,
const int input_row_stride,
const int input_col_stride,
float* const matrix_base,
- const int matrix_stride
-)
+ const int matrix_stride,
+ const int _pad_top,
+ const int _pad_left,
+ const int _pad_bottom,
+ const int _pad_right
+ )
{
+const int pad_top = Specialized ? PadTop : _pad_top;
+ const int pad_left = Specialized ? PadLeft : _pad_left;
+ const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
+ const int pad_right = Specialized ? PadRight : _pad_right;
+
constexpr int inner_tile_i = 4, inner_tile_j = 4;
- constexpr int cells_i = inner_tile_i - pad_bottom;
- constexpr int cells_j = inner_tile_i - pad_right;
+ const int cells_i = inner_tile_i - pad_bottom;
+ const int cells_j = inner_tile_i - pad_right;
+
+
float *outptr = matrix_base;
@@ -311,81 +272,39 @@ void Transform::process_tile(
}
}
+} // namespace (anonymous)
+
template <>
-const Transform::TileFn Transform::tile_fns[n_pad_top][n_pad_left][n_pad_bottom][n_pad_right] =
-{
- {
- {
- {
- Transform::template process_tile<0, 0, 0, 0>, // No padding
- Transform::template process_tile<0, 0, 0, 1>, // Right
- Transform::template process_tile<0, 0, 0, 2>, // Right
- },
- {
- Transform::template process_tile<0, 0, 1, 0>, // Bottom
- Transform::template process_tile<0, 0, 1, 1>, // Bottom-right
- Transform::template process_tile<0, 0, 1, 2>, // Bottom-right
- },
- {
- Transform::template process_tile<0, 0, 2, 0>, // Bottom
- Transform::template process_tile<0, 0, 2, 1>, // Bottom-right
- Transform::template process_tile<0, 0, 2, 2>, // Bottom-right
- }
- },
- {
- {
- Transform::template process_tile<0, 1, 0, 0>, // Left
- Transform::template process_tile<0, 1, 0, 1>, // Left AND right
- Transform::template process_tile<0, 1, 0, 2>, // Left AND right
- },
- {
- Transform::template process_tile<0, 1, 1, 0>, // Left-bottom
- Transform::template process_tile<0, 1, 1, 1>, // Left, bottom AND right
- Transform::template process_tile<0, 1, 1, 2>, // Left, bottom AND right
- },
- {
- Transform::template process_tile<0, 1, 2, 0>, // Left-bottom
- Transform::template process_tile<0, 1, 2, 1>, // Left, bottom AND right
- Transform::template process_tile<0, 1, 2, 2>, // Left, bottom AND right
- }
- },
- },
- {
- {
- {
- Transform::template process_tile<1, 0, 0, 0>, // Top
- Transform::template process_tile<1, 0, 0, 1>, // Top-right
- Transform::template process_tile<1, 0, 0, 2>, // Top-right
- },
- {
- Transform::template process_tile<1, 0, 1, 0>, // Top AND bottom
- Transform::template process_tile<1, 0, 1, 1>, // Top, bottom AND right
- Transform::template process_tile<1, 0, 1, 2>, // Top, bottom AND right
- },
- {
- Transform::template process_tile<1, 0, 2, 0>, // Top AND bottom
- Transform::template process_tile<1, 0, 2, 1>, // Top, bottom AND right
- Transform::template process_tile<1, 0, 2, 2>, // Top, bottom AND right
- }
- },
- {
- {
- Transform::template process_tile<1, 1, 0, 0>, // Top-left
- Transform::template process_tile<1, 1, 0, 1>, // Top, left AND right
- Transform::template process_tile<1, 1, 0, 2>, // Top, left AND right
- },
- {
- Transform::template process_tile<1, 1, 1, 0>, // Top, left AND bottom
- Transform::template process_tile<1, 1, 1, 1>, // All padded
- Transform::template process_tile<1, 1, 1, 2>, // All padded
- },
- {
- Transform::template process_tile<1, 1, 2, 0>, // Top, left AND bottom
- Transform::template process_tile<1, 1, 2, 1>, // All padded
- Transform::template process_tile<1, 1, 2, 2>, // All padded
- }
- }
- }
+const Tiles::TileFn Tiles::tilefn_generic = winograd_input_transform_4x4_fp32_process_tile<false>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_input_transform_4x4_fp32_process_tile<true>;
+
+
+template <>
+const Tiles::TileFn Tiles::tilefn_top_padded[n_pad_top] = {
+ winograd_input_transform_4x4_fp32_process_tile<true, 1, 0, 0, 0>,
+};
+
+template <>
+const Tiles::TileFn Tiles::tilefn_left_padded[n_pad_left] = {
+ winograd_input_transform_4x4_fp32_process_tile<true, 0, 1, 0, 0>,
+};
+
+template <>
+const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
+ winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 1, 0>,
+ winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 2, 0>,
+ winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 3, 0>,
+ winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 4, 0>,
+};
+
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+ winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 1>,
+ winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 2>,
+ winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 3>,
+ winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 4>,
};
template class InputTransform<3, 3, 4, 4, float>;
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp
index 908613068a..893122cc45 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp
@@ -29,20 +29,30 @@
namespace
{
-template <int pad_top, int pad_left, int pad_bottom, int pad_right>
+template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0>
void winograd_input_transform_6x6_fp32_process_tile(
int n_channels,
const float* const input_base,
const int input_row_stride,
const int input_col_stride,
float* const matrix_base,
- const int matrix_stride
+const int matrix_stride,
+ const int _pad_top,
+ const int _pad_left,
+ const int _pad_bottom,
+ const int _pad_right
)
{
- constexpr int inner_tile_rows = 6;
+ const int pad_top = Specialized ? PadTop : _pad_top;
+ const int pad_left = Specialized ? PadLeft : _pad_left;
+ const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
+ const int pad_right = Specialized ? PadRight : _pad_right;
+
+ constexpr int inner_tile_rows = 6;
constexpr int inner_tile_cols = 6;
- constexpr int cells_i = inner_tile_rows - pad_bottom;
- constexpr int cells_j = inner_tile_cols - pad_right;
+
+ const int cells_i = inner_tile_rows - pad_bottom;
+ const int cells_j = inner_tile_cols - pad_right;
float *outptr = matrix_base;
@@ -285,322 +295,80 @@ void winograd_input_transform_6x6_fp32_process_tile(
namespace winograd
{
template <int k>
-using Transform = InputTransformImpl<k, k, 6, 6, float>;
+using Tiles = InputTransformImplTiles<k, k, 6, 6, float>;
template <>
-const Transform<3>::TileFn
- Transform<3>::tile_fns[n_pad_top][n_pad_left][n_pad_bottom][n_pad_right] =
-{
- {
- {
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 0>, // No padding
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 1>, // Right
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 2>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 3>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 4>, // " "
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 0>, // Bottom
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 1>, // Bottom right
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 2>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 3>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 4>, // " "
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 0>, // Bottom
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 1>, // Bottom right
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 2>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 3>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 4>, // " "
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 0>, // Bottom
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 1>, // Bottom right
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 2>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 3>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 4>, // " "
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 0>, // Bottom
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 1>, // Bottom right
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 2>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 3>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 4>, // " "
- }
- },
- {
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 0, 0>, // Left
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 0, 1>,
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 0, 2>,
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 0, 3>,
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 0, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 1, 0>, // Bottom left
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 1, 1>,
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 1, 2>,
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 1, 3>,
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 1, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 2, 0>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 2, 1>,
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 2, 2>,
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 2, 3>,
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 2, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 3, 0>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 3, 1>,
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 3, 2>,
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 3, 3>,
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 3, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 4, 0>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 4, 1>,
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 4, 2>,
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 4, 3>,
- winograd_input_transform_6x6_fp32_process_tile<0, 1, 4, 4>,
- }
- }
- },
- {
- {
- {
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 0, 0>, // Top
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 0, 1>, // Top right
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 0, 2>, // " "
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 0, 3>, // " "
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 0, 4>, // " "
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 1, 0>,
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 1, 1>,
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 1, 2>,
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 1, 3>,
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 1, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 2, 0>,
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 2, 1>,
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 2, 2>,
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 2, 3>,
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 2, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 3, 0>,
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 3, 1>,
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 3, 2>,
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 3, 3>,
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 3, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 4, 0>,
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 4, 1>,
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 4, 2>,
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 4, 3>,
- winograd_input_transform_6x6_fp32_process_tile<1, 0, 4, 4>,
- },
- },
- {
- {
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 0, 0>, // Top left
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 0, 1>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 0, 2>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 0, 3>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 0, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 1, 0>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 1, 1>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 1, 2>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 1, 3>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 1, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 2, 0>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 2, 1>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 2, 2>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 2, 3>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 2, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 3, 0>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 3, 1>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 3, 2>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 3, 3>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 3, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 4, 0>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 4, 1>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 4, 2>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 4, 3>,
- winograd_input_transform_6x6_fp32_process_tile<1, 1, 4, 4>,
- }
- }
- }
+const Tiles<3>::TileFn Tiles<3>::tilefn_generic = winograd_input_transform_6x6_fp32_process_tile<false>; // Specialized=false: padding is read from the runtime _pad_* arguments
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_unpadded = winograd_input_transform_6x6_fp32_process_tile<true>; // Specialized=true with every Pad* left at its default of 0
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_top_padded[n_pad_top] = { // Compile-time specialisations with top padding only
+ winograd_input_transform_6x6_fp32_process_tile<true, 1, 0, 0, 0>, // PadTop = 1
};
template <>
-const Transform<5>::TileFn
- Transform<5>::tile_fns[n_pad_top][n_pad_left][n_pad_bottom][n_pad_right] =
-{
- {
- {
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 0>, // No padding
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 1>, // Right
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 2>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 3>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 4>, // " "
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 0>, // Bottom
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 1>, // Bottom right
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 2>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 3>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 4>, // " "
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 0>, // Bottom
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 1>, // Bottom right
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 2>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 3>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 4>, // " "
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 0>, // Bottom
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 1>, // Bottom right
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 2>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 3>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 4>, // " "
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 0>, // Bottom
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 1>, // Bottom right
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 2>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 3>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 4>, // " "
- }
- },
- {
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 0, 0>, // Left
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 0, 1>,
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 0, 2>,
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 0, 3>,
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 0, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 1, 0>, // Bottom left
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 1, 1>,
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 1, 2>,
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 1, 3>,
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 1, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 2, 0>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 2, 1>,
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 2, 2>,
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 2, 3>,
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 2, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 3, 0>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 3, 1>,
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 3, 2>,
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 3, 3>,
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 3, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 4, 0>, // " "
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 4, 1>,
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 4, 2>,
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 4, 3>,
- winograd_input_transform_6x6_fp32_process_tile<0, 2, 4, 4>,
- }
- }
- },
- {
- {
- {
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 0, 0>, // Top
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 0, 1>, // Top right
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 0, 2>, // " "
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 0, 3>, // " "
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 0, 4>, // " "
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 1, 0>,
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 1, 1>,
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 1, 2>,
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 1, 3>,
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 1, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 2, 0>,
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 2, 1>,
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 2, 2>,
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 2, 3>,
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 2, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 3, 0>,
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 3, 1>,
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 3, 2>,
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 3, 3>,
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 3, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 4, 0>,
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 4, 1>,
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 4, 2>,
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 4, 3>,
- winograd_input_transform_6x6_fp32_process_tile<2, 0, 4, 4>,
- },
- },
- {
- {
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 0, 0>, // Top left
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 0, 1>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 0, 2>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 0, 3>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 0, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 1, 0>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 1, 1>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 1, 2>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 1, 3>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 1, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 2, 0>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 2, 1>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 2, 2>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 2, 3>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 2, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 3, 0>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 3, 1>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 3, 2>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 3, 3>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 3, 4>,
- },
- {
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 4, 0>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 4, 1>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 4, 2>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 4, 3>,
- winograd_input_transform_6x6_fp32_process_tile<2, 2, 4, 4>,
- }
- }
- }
+const Tiles<3>::TileFn Tiles<3>::tilefn_left_padded[n_pad_left] = { // Compile-time specialisations with left padding only
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 1, 0, 0>, // PadLeft = 1
+};
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_bottom_padded[n_pad_bottom] = { // Compile-time specialisations with bottom padding only; entries are PadBottom = 1..6 in order
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 1, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 2, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 3, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 4, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 5, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 6, 0>,
+};
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_right_padded[n_pad_right] = { // Compile-time specialisations with right padding only; entries are PadRight = 1..6 in order
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 1>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 2>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 3>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 4>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 5>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 6>,
+};
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_generic = winograd_input_transform_6x6_fp32_process_tile<false>; // Specialized=false: padding is read from the runtime _pad_* arguments
+
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_unpadded = winograd_input_transform_6x6_fp32_process_tile<true>; // Specialized=true with every Pad* left at its default of 0
+
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_top_padded[n_pad_top] = { // Compile-time specialisations with top padding only
+ winograd_input_transform_6x6_fp32_process_tile<true, 2, 0, 0, 0>, // PadTop = 2
+};
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_left_padded[n_pad_left] = { // Compile-time specialisations with left padding only
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 2, 0, 0>, // PadLeft = 2
+};
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_bottom_padded[n_pad_bottom] = { // Compile-time specialisations with bottom padding only; entries are PadBottom = 1..6 in order
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 1, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 2, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 3, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 4, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 5, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 6, 0>,
+};
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_right_padded[n_pad_right] = { // Compile-time specialisations with right padding only; entries are PadRight = 1..6 in order
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 1>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 2>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 3>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 4>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 5>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 6>,
};
template class InputTransform<3, 3, 6, 6, float>;