diff options
9 files changed, 542 insertions, 431 deletions
diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp index 00327f5102..77cd9de513 100644 --- a/arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp +++ b/arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp @@ -27,10 +27,10 @@ namespace winograd { - template <int output_tile_rows, int output_tile_cols, - int kernel_rows, int kernel_cols> - template <typename T> - void WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::OutputTransform<T>::execute( +/***************************************************************************/ + /* Instance-less API */ + template <int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename T> + void OutputTransformImpl<KernelRows, KernelCols, InnerTileRows, InnerTileCols, T>::execute( const int n_batches, const int output_batch_stride, const int n_rows, @@ -45,28 +45,12 @@ namespace winograd T* const output ) { - // If an Nx1 kernel then transpose and redirect to the 1xN implementation. - if (kernel_cols == 1) - { - WinogradGEMM<output_tile_cols, output_tile_rows, kernel_cols, kernel_rows>:: - template OutputTransform<T>::execute( - n_batches, - output_batch_stride, - n_cols, output_col_stride, - n_rows, output_row_stride, - n_channels, - matrix_base, matrix_stride, matrix_row_stride, - biases, output - ); - return; - } - // Compute the number of tiles and hence the padding required on the bottom // and right of the image. - const int tile_M = iceildiv(n_rows, output_tile_rows); - const int tile_N = iceildiv(n_cols, output_tile_cols); - const int pad_bottom = output_tile_rows*tile_M - n_rows; - const int pad_right = output_tile_cols*tile_N - n_cols; + const int tile_M = iceildiv(n_rows, OutputTileRows); + const int tile_N = iceildiv(n_cols, OutputTileCols); + const int pad_bottom = OutputTileRows*tile_M - n_rows; + const int pad_right = OutputTileCols*tile_N - n_cols; const int matrix_tile_row_stride = tile_N * matrix_row_stride; const int matrix_batch_stride = tile_M * matrix_tile_row_stride; @@ -84,7 +68,7 @@ namespace winograd // Compute properties of this row of output tiles const int row_pad_bottom = (tile_i < tile_M - 1) ? 0: pad_bottom; const T* const matrix_tile_row = matrix_batch + tile_i * matrix_tile_row_stride; - T* const outptr_row = outptr_batch + output_tile_rows*tile_i*output_row_stride; + T* const outptr_row = outptr_batch + OutputTileRows*tile_i*output_row_stride; // Process the row process_tile_row( @@ -97,10 +81,36 @@ namespace winograd } } - template <int output_tile_rows, int output_tile_cols, - int kernel_rows, int kernel_cols> - template <typename T> - void WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::OutputTransform<T>::process_tile_row( +template <int KernelRows, int InnerTileRows, typename T> + void OutputTransformImpl<KernelRows, 1, InnerTileRows, 1, T>::execute( + const int n_batches, + const int output_batch_stride, + const int n_rows, + const int output_row_stride, + const int n_cols, + const int output_col_stride, + const int n_channels, + const T* const matrix_base, + const int matrix_stride, + const int matrix_row_stride, + const T* const biases, + T* const output + ) + { + // If an Nx1 kernel then transpose and redirect to the 1xN implementation. + OutputTransformImpl<1, KernelRows, 1, InnerTileRows, T>::execute( + n_batches, + output_batch_stride, + n_cols, output_col_stride, + n_rows, output_row_stride, + n_channels, + matrix_base, matrix_stride, matrix_row_stride, + biases, output + ); + } + + template <int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename T> + void OutputTransformImpl<KernelRows, KernelCols, InnerTileRows, InnerTileCols, T>::process_tile_row( const int tile_N, const int n_channels, const T* const matrix_base, @@ -114,48 +124,27 @@ namespace winograd const int row_pad_right ) { - if (kernel_cols == 1) - { - // If an Nx1 implementation then this should never be reached. - return; - } - // Loop over columns of tiles for (int tile_j = 0; tile_j < tile_N; tile_j++) { // Properties of this tile const int tile_pad_right = (tile_j < tile_N - 1) ? 0 : row_pad_right; const T* const matrix_row = matrix_base + tile_j * matrix_row_stride; - T* const outptr = output + output_tile_cols*tile_j*output_col_stride; + T* const outptr = output + OutputTileCols *tile_j*output_col_stride; // Perform the output transformation - tile_fns[row_pad_bottom][tile_pad_right]( + const typename Tiles::TileFn tilefn = Tiles::get_tile_specialization(row_pad_bottom, tile_pad_right); + tilefn( n_channels, matrix_row, matrix_stride, biases, - outptr, output_row_stride, output_col_stride + outptr, output_row_stride, output_col_stride, + row_pad_bottom, tile_pad_right ); } } - template <int output_tile_rows, int output_tile_cols, int kr, int kc> - template <typename T> - size_t WinogradGEMM<output_tile_rows, output_tile_cols, kr, kc>::OutputTransform<T>::bytes_read(const Tensor4DShape &shape) - { - const int M = iceildiv(shape.n_rows, output_tile_rows) * - iceildiv(shape.n_cols, output_tile_cols); - const int N = shape.n_channels; - return inner_tile_rows * inner_tile_cols * M * N * sizeof(T); - } - - template <int otr, int otc, int kr, int kc> - template <typename T> - size_t WinogradGEMM<otr, otc, kr, kc>::OutputTransform<T>::bytes_written(const Tensor4DShape &shape) - { - return shape.size() * sizeof(T); - } - - template <int output_tile_rows, int output_tile_cols, int kr, int kc> - template <typename T> - WinogradGEMM<output_tile_rows, output_tile_cols, kr, kc>::OutputTransform<T>::OutputTransform( +/***************************************************************************/ + template <int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename T> + OutputTransform<KernelRows, KernelCols, InnerTileRows, InnerTileCols, T>::OutputTransform( const T* const matrix_base, const int matrix_stride, const int matrix_row_stride, @@ -171,26 +160,24 @@ namespace winograd ) : _matrix_base(matrix_base), _biases(biases), _matrix_stride(matrix_stride), _matrix_row_stride(matrix_row_stride), _outptr(output), _n_batches(n_batches), _n_rows(n_rows), _n_cols(n_cols), - _n_channels(n_channels), _tile_M(iceildiv(n_rows, output_tile_rows)), - _tile_N(iceildiv(n_cols, output_tile_cols)), + _n_channels(n_channels), _tile_M(iceildiv(n_rows, OutputTileRows)), + _tile_N(iceildiv(n_cols, OutputTileCols)), _out_col_stride(out_col_stride ? out_col_stride : n_channels), _out_row_stride(out_row_stride ? out_row_stride : n_cols * _out_col_stride), _out_batch_stride(out_batch_stride ? out_batch_stride : n_rows * _out_row_stride) { } - template <int otr, int otc, int kr, int kc> - template <typename T> - unsigned int WinogradGEMM<otr, otc, kr, kc>::OutputTransform<T>::get_window() const + template <int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename T> + unsigned int OutputTransform<KernelRows, KernelCols, InnerTileRows, InnerTileCols, T>::get_window() const { // The final window includes the tail, all other windows will be a multiple // of the window block in size. return iceildiv(_n_channels, WINDOW_BLOCK); } - template <int otr, int otc, int kr, int kc> - template <typename T> - void WinogradGEMM<otr, otc, kr, kc>::OutputTransform<T>::run( +template <int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename T> + void OutputTransform<KernelRows, KernelCols, InnerTileRows, InnerTileCols, T>::run( const unsigned int start, const unsigned int stop ) { @@ -221,4 +208,71 @@ namespace winograd _outptr + start_channel ); } + + template <int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename T> + void OutputTransform<KernelRows, KernelCols, InnerTileRows, InnerTileCols, T>::execute( + const int n_batches, + const int out_batch_stride, + const int n_rows, + const int out_row_stride, + const int n_cols, + const int out_col_stride, + const int n_channels, + const T* const matrix_base, + const int matrix_stride, + const int matrix_row_stride, + const T* const biases, + T* const output + ) + { + Transform::execute( + n_batches, out_batch_stride, + n_rows, out_row_stride, + n_cols, out_col_stride, n_channels, + matrix_base, matrix_stride, matrix_row_stride, + biases, output + ); + } + + template <int KernelCols, int InnerTileCols, typename T> + typename OutputTransformImplTiles<1, KernelCols, 1, InnerTileCols, T>::TileFn + OutputTransformImplTiles<1, KernelCols, 1, InnerTileCols, T>:: + get_tile_specialization(const int pad_bottom, const int pad_right) + { + (void) pad_bottom; + + if (!pad_right) + { + // No padding, return unpadded specialisation + return tilefn_unpadded; + } + else + { + return tilefn_right_padded[pad_right - 1]; + } + } + + template <int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename T> + typename OutputTransformImplTiles<KernelRows, KernelCols, InnerTileRows, InnerTileCols, T>::TileFn + OutputTransformImplTiles<KernelRows, KernelCols, InnerTileRows, InnerTileCols, T>:: + get_tile_specialization(const int pad_bottom, const int pad_right) + { + if (!(pad_bottom || pad_right)) + { + // No padding, return unpadded specialisation + return tilefn_unpadded; + } + else if (pad_bottom && !pad_right) + { + return tilefn_bottom_padded[pad_bottom - 1]; + } + else if (!pad_bottom && pad_right) + { + return tilefn_right_padded[pad_right - 1]; + } + else + { + return tilefn_generic; + } + } } // namespace winograd diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp index 31aee35fab..71b5fd516f 100644 --- a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp +++ b/arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp @@ -31,6 +31,7 @@ #include "arm_compute/core/NEON/kernels/convolution/common/tensor.hpp" #include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp" #include "winograd_input_transform.hpp" +#include "winograd_output_transform.hpp" #include <thread> #include <utility> @@ -124,95 +125,13 @@ class WinogradGEMM /** Transform output feature maps from the Winograd to the spatial domain. */ template <typename T> - struct OutputTransform - { - /** Get the bytes read during the transform. */ - static size_t bytes_read(const Tensor4DShape &shape); - - /** Get the bytes written during the transform. */ - static size_t bytes_written(const Tensor4DShape &shape); - - /** Get the count of operations performed by the transform. */ - static int ops_performed(const Tensor4DShape &shape); - - /** Apply the transform to create a tensor. */ - static void execute( - const int n_batches, - const int out_batch_stride, - const int n_rows, - const int out_row_stride, - const int n_cols, - const int out_col_stride, - const int n_channels, - const T* const matrix_base, - const int matrix_stride, - const int matrix_row_stride, - const T* const biases, - T* const output - ); - - /***********************************************************************/ - /** Create an OutputTransform operator fixed on a given problem and set - * of pointers. - */ - OutputTransform( - const T* const matrix_base, /** Pointer to base of matrices. */ - const int matrix_stride, /** Stride between matrices. */ - const int matrix_row_stride, /** Stride within a matrix. */ - const T* const biases, /** Pointer to biases vector. */ - T* const output, /** Pointer to output tensor. */ - const int n_batches, /** Number of batches in output tensor. */ - const int n_rows, /** Number of rows in output tensor. */ - const int n_cols, /** Number of columns in output tensor. */ - const int n_channels, /** Number of channels in output tensor. */ - const int out_batch_stride=0, /** Output batch stride. */ - const int out_row_stride=0, /** Output row stride. */ - const int out_col_stride=0 /** Output column stride. */ - ); - - /** Get the window of work a given operator can perform. */ - unsigned int get_window() const; - static constexpr unsigned int WINDOW_BLOCK = 16; // Base size of window - - /** Perform work upon a window of the input. */ - void run(const unsigned int start, const unsigned int stop); - /***********************************************************************/ - - private: - static void process_tile_row( - const int tile_N, - const int n_channels, - const T* const matrix_base, - const int matrix_stride, - const int matrix_row_stride, - const T* const biases, - T* const output, - const int output_row_stride, - const int output_col_stride, - const int row_pad_bottom, - const int row_pad_right - ); - - // Limits on the amount of anti-padding to be applied - static constexpr int max_pad_bottom = output_tile_rows; - static constexpr int max_pad_right = output_tile_cols; - - /** Prepare a single tile of the output tensor. */ - template <int pad_bottom, int pad_right> - static void process_tile(int, const T*, int, const T*, T*, int, int); - - // Array of methods to produce tiles of output tensor. - typedef void (*TileFn)(int, const T*, int, const T*, T*, int, int); - static const TileFn tile_fns[max_pad_bottom][max_pad_right]; + using OutputTransform = OutputTransform< + KernelRows, KernelCols, + (OutputTileRows + KernelRows - 1), + (OutputTileCols + KernelCols - 1), + T + >; - /** Member constants for instances of the transform. */ - const T* const _matrix_base; - const T* const _biases; - const int _matrix_stride, _matrix_row_stride; - T* const _outptr; - const int _n_batches, _n_rows, _n_cols, _n_channels, _tile_M, _tile_N; - const int _out_col_stride, _out_row_stride, _out_batch_stride; - }; /** Perform a convolution. */ diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp new file mode 100644 index 0000000000..07a0b8666a --- /dev/null +++ b/arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +namespace winograd +{ + + +namespace +{ + +template <int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename T> +class OutputTransformImplTiles +{ + public: + typedef void (*TileFn)( + const int n_channels, /** @param[in] Number of channels in output tensor */ + const T* const matrix_base, /** @param[in] Base pointer to Winograd output matrices. */ + const int matrix_stride, /** @param[in] Stride between matrices in the output space. */ + const T* const biases, /** @param[in] Pointer to bias vector (may be nullptr). */ + T* const output, /** @param[out] Pointer to output tensor. */ + const int output_row_stride, /** @param[in] Stride across rows of the output tensor. */ + const int output_col_stride, /** @param[in] Stride between columns of the output tensor. */ + const int _pad_bottom, /** @param[in] Bottom padding for unspecialised tiles. */ + const int _pad_right /** @param[in] Right padding for unspecialised tiles. */ + ); + + static TileFn get_tile_specialization( + const int pad_bottom, + const int pad_right + ); + + static constexpr unsigned int OutputTileRows = InnerTileRows - KernelRows + 1; + static constexpr unsigned int OutputTileCols = InnerTileCols - KernelCols + 1; + + private: + static constexpr unsigned int n_pad_bottom = OutputTileRows - 1; + static constexpr unsigned int n_pad_right = OutputTileCols - 1; + + static const TileFn tilefn_generic; /** Generic tile processing function. */ + static const TileFn tilefn_unpadded; /** Tile processor for unpadded tiles. */ + static const TileFn tilefn_bottom_padded[n_pad_bottom]; /** Bottom padding only. */ + static const TileFn tilefn_right_padded[n_pad_right]; /** Right padding only. */ +}; + +template <int KernelCols, int InnerTileCols, typename T> +class OutputTransformImplTiles<1, KernelCols, 1, InnerTileCols, T> +{ + public: + typedef void (*TileFn)( + const int n_channels, /** @param[in] Number of channels in output tensor */ + const T* const matrix_base, /** @param[in] Base pointer to Winograd output matrices. */ + const int matrix_stride, /** @param[in] Stride between matrices in the output space. */ + const T* const biases, /** @param[in] Pointer to bias vector (may be nullptr). */ + T* const output, /** @param[out] Pointer to output tensor. */ + const int output_row_stride, /** @param[in] Stride across rows of the output tensor. */ + const int output_col_stride, /** @param[in] Stride between columns of the output tensor. */ + const int _pad_bottom, /** @param[in] Bottom padding for unspecialised tiles. */ + const int _pad_right /** @param[in] Right padding for unspecialised tiles. */ + ); + + static TileFn get_tile_specialization( + const int pad_bottom, + const int pad_right + ); + + static constexpr unsigned int OutputTileRows = 1; + static constexpr unsigned int OutputTileCols = InnerTileCols - KernelCols + 1; + + private: + static constexpr unsigned int n_pad_right = OutputTileCols - 1; + + static const TileFn tilefn_unpadded; /** Tile processor for unpadded tiles. */ + static const TileFn tilefn_right_padded[n_pad_right]; /** Right padding only. */ +}; + +template <int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename T> +class OutputTransformImpl +{ + private: + static void process_tile_row( + const int tile_N, + const int n_channels, + const T* const matrix_base, + const int matrix_stride, + const int matrix_row_stride, + const T* const biases, + T* const output, + const int output_row_stride, + const int output_col_stride, + const int row_pad_bottom, + const int row_pad_right + ); + + using Tiles = OutputTransformImplTiles< + KernelRows, KernelCols, InnerTileRows, InnerTileCols, T + >; + + public: + /** Apply the output transform to a tensor. */ + static void execute( + const int n_batches, + const int out_batch_stride, + const int n_rows, + const int out_row_stride, + const int n_cols, + const int out_col_stride, + const int n_channels, + const T* const matrix_base, + const int matrix_stride, + const int matrix_row_stride, + const T* const biases, + T* const output + ); + + static constexpr unsigned int OutputTileRows = Tiles::OutputTileRows; + static constexpr unsigned int OutputTileCols = Tiles::OutputTileCols; +}; + +template <int KernelRows, int InnerTileRows, typename T> +class OutputTransformImpl<KernelRows, 1, InnerTileRows, 1, T> +{ + public: + /** Apply the output transform to a tensor. */ + static void execute( + const int n_batches, + const int out_batch_stride, + const int n_rows, + const int out_row_stride, + const int n_cols, + const int out_col_stride, + const int n_channels, + const T* const matrix_base, + const int matrix_stride, + const int matrix_row_stride, + const T* const biases, + T* const output + ); + + static constexpr unsigned int OutputTileRows = InnerTileRows - KernelRows + 1; + static constexpr unsigned int OutputTileCols = 1; +}; + +} // namespace (anonymous) + +template <int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename T> +class OutputTransform +{ + public: + /***********************************************************************/ + /** Create an OutputTransform operator fixed on a given problem and set + * of pointers. + */ + OutputTransform( + const T* const matrix_base, /** Pointer to base of matrices. */ + const int matrix_stride, /** Stride between matrices. */ + const int matrix_row_stride, /** Stride within a matrix. */ + const T* const biases, /** Pointer to biases vector. */ + T* const output, /** Pointer to output tensor. */ + const int n_batches, /** Number of batches in output tensor. */ + const int n_rows, /** Number of rows in output tensor. */ + const int n_cols, /** Number of columns in output tensor. */ + const int n_channels, /** Number of channels in output tensor. */ + const int out_batch_stride=0, /** Output batch stride. */ + const int out_row_stride=0, /** Output row stride. */ + const int out_col_stride=0 /** Output column stride. */ + ); + + /** Get the window of work a given operator can perform. */ + unsigned int get_window() const; + static constexpr unsigned int WINDOW_BLOCK = 16; // Base size of window + + /** Perform work upon a window of the input. */ + void run(const unsigned int start, const unsigned int stop); + + /** Apply the transform to create a tensor. */ + static void execute( + const int n_batches, + const int out_batch_stride, + const int n_rows, + const int out_row_stride, + const int n_cols, + const int out_col_stride, + const int n_channels, + const T* const matrix_base, + const int matrix_stride, + const int matrix_row_stride, + const T* const biases, + T* const output + ); + + private: + using Transform = OutputTransformImpl< + KernelRows, KernelCols, InnerTileRows, InnerTileCols, T + >; + + static constexpr unsigned int OutputTileRows = Transform::OutputTileRows; + static constexpr unsigned int OutputTileCols = Transform::OutputTileCols; + + /** Member constants for instances of the transform. */ + const T* const _matrix_base; + const T* const _biases; + const int _matrix_stride, _matrix_row_stride; + T* const _outptr; + const int _n_batches, _n_rows, _n_cols, _n_channels, _tile_M, _tile_N; + const int _out_col_stride, _out_row_stride, _out_batch_stride; +}; + +} // namespace winograd + diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp index cfd2029f11..ea842a45ee 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp @@ -23,38 +23,33 @@ */ #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp" -#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp" #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" -namespace winograd -{ - -using Transform = WinogradGEMM<1, 2, 1, 7>::OutputTransform<float>; -using TransformTransposed = WinogradGEMM<2, 1, 7, 1>::OutputTransform<float>; - -template <> -template <> -int Transform::ops_performed(const Tensor4DShape &shape) +namespace { - (void) shape; - return 0; // TODO -} -template <> -template <> -template <int pad_bottom, int pad_right> -void Transform::process_tile( +template <bool Specialized, int PadRight=0> +void winograd_output_transform_2_7_fp32_process_tile( const int n_channels, const float* const matrix_base, const int matrix_stride, const float* const biases, float* const output, const int output_row_stride, - const int output_col_stride + const int output_col_stride, + const int _pad_bottom, + const int _pad_right ) { (void) output_row_stride; - constexpr int cells_j = output_tile_cols - pad_right; + (void) _pad_bottom; + constexpr int output_tile_cols = 2; + constexpr int inner_tile_cols = 8; + + const int pad_right = Specialized ? PadRight : _pad_right; + const int cells_j = output_tile_cols - pad_right; + // Construct a map to the output cells float *outptrs[cells_j]; @@ -149,22 +144,20 @@ void Transform::process_tile( } } } +} // namespace (anonymous) -template <> -template <> -const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = +namespace winograd { - { - Transform::template process_tile<0, 0>, - Transform::template process_tile<0, 1>, - }, -}; - +using Tiles = OutputTransformImplTiles<1, 7, 1, 8, float>; template <> +const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2_7_fp32_process_tile<true>; + template <> -const TransformTransposed::TileFn TransformTransposed::tile_fns[max_pad_bottom][max_pad_right] = {}; +const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = { + winograd_output_transform_2_7_fp32_process_tile<true, 1> +}; -template struct WinogradGEMM<1, 2, 1, 7>::OutputTransform<float>; -template struct WinogradGEMM<2, 1, 7, 1>::OutputTransform<float>; +template class OutputTransform<1, 7, 1, 8, float>; +template class OutputTransform<7, 1, 8, 1, float>; } // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp index 3b3cda0aa9..597b074026 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp @@ -23,59 +23,34 @@ */ #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp" -#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp" #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" -namespace winograd -{ - -using Transform = WinogradGEMM<2, 2, 3, 3>::OutputTransform<float>; - -template <> -template <> -int Transform::ops_performed(const Tensor4DShape &shape) +namespace { - // NOTE: Cost in FLOPs rather than instructions or uops. - const int tile_M = iceildiv(shape.n_rows, 2); - const int tile_N = iceildiv(shape.n_cols, 2); - return 24 * tile_M * tile_N * shape.n_channels; -} -/* F(2x2, 3x3) constructs 2x2 output tiles from a 3x3 convolution. Since we use - * enough tiles to cover the output space each output tile may contain 0 or 1 - * padded values to the right and bottom columns or rows of the tile, e.g.: - * - * ___ ___ - * | | | X| - * |___| |__X| - * - * ___ ___ - * | | | X| - * |X_X| |X_X| - * - * - * We provide a specialised output transform for each of these instances. - * Consequently we below construct an array of the various padding options, the - * array contains pointers to the specific implementations. - */ -template <> -template <> -template <int pad_bottom, int pad_right> -void Transform::process_tile( +template <bool Specialized, int PadBottom=0, int PadRight=0> +void winograd_output_transform_2x2_3x3_fp32_process_tile( const int n_channels, const float* const matrix_base, const int matrix_stride, const float* const biases, float* const output, const int output_row_stride, - const int output_col_stride + const int output_col_stride, + const int _pad_bottom, + const int _pad_right ) { - constexpr int cells_i = 2 - pad_bottom; - constexpr int cells_j = 2 - pad_right; + constexpr int OutputTileRows = 2, OutputTileCols = 2; + const int pad_bottom = Specialized ? PadBottom : _pad_bottom; + const int pad_right = Specialized ? PadRight : _pad_right; + + const int cells_i = OutputTileRows - pad_bottom; + const int cells_j = OutputTileCols - pad_right; // Construct a map to the output cells - float *outptrs[cells_i][cells_j]; + float *outptrs[OutputTileRows][OutputTileCols]; for (int i = 0; i < cells_i; i++) { for (int j = 0; j < cells_j; j++) @@ -373,19 +348,28 @@ void Transform::process_tile( } } +} // namespace (anonymous) + +namespace winograd +{ +using Tiles = OutputTransformImplTiles<3, 3, 4, 4, float>; + template <> +const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_2x2_3x3_fp32_process_tile<false>; + template <> -const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = -{ - { - Transform::template process_tile<0, 0>, // No padding - Transform::template process_tile<0, 1>, // Right padding - }, - { - Transform::template process_tile<1, 0>, // Bottom padding - Transform::template process_tile<1, 1>, // Bottom and right padding - } +const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2x2_3x3_fp32_process_tile<true>; + +template <> +const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = { + winograd_output_transform_2x2_3x3_fp32_process_tile<true, 1, 0> }; -template struct WinogradGEMM<2, 2, 3, 3>::OutputTransform<float>; +template <> +const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = { + winograd_output_transform_2x2_3x3_fp32_process_tile<true, 0, 1> +}; + +template class OutputTransform<3, 3, 4, 4, float>; } // namespace winograd + diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp index cafce9549d..60d7181d97 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp @@ -23,57 +23,34 @@ */ #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp" -#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp" #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" -namespace winograd -{ - -using Transform = WinogradGEMM<2, 2, 5, 5>::OutputTransform<float>; - -template <> -template <> -int Transform::ops_performed(const Tensor4DShape &shape) +namespace { - (void) shape; - return 0; // TODO -} -/* F(2x2, 5x5) constructs 2x2 output tiles from a 5x5 convolution. Since we use - * enough tiles to cover the output space each output tile may contain 0 or 1 - * padded values to the right and bottom columns or rows of the tile, e.g.: - * - * ___ ___ - * | | | X| - * |___| |__X| - * - * ___ ___ - * | | | X| - * |X_X| |X_X| - * - * - * We provide a specialised output transform for each of these instances. - * Consequently we below construct an array of the various padding options, the - * array contains pointers to the specific implementations. - */ -template <> -template <> -template <int pad_bottom, int pad_right> -void Transform::process_tile( +template <bool Specialized, int PadBottom=0, int PadRight=0> +void winograd_output_transform_2x2_5x5_fp32_process_tile( const int n_channels, const float* const matrix_base, const int matrix_stride, const float* const biases, float* const output, const int output_row_stride, - const int output_col_stride + const int output_col_stride, + const int _pad_bottom, + const int _pad_right ) { - constexpr int cells_i = 2 - pad_bottom; - constexpr int cells_j = 2 - pad_right; + constexpr int OutputTileRows = 2, OutputTileCols = 2; + const int pad_bottom = Specialized ? PadBottom : _pad_bottom; + const int pad_right = Specialized ? PadRight : _pad_right; + + const int cells_i = 2 - pad_bottom; + const int cells_j = 2 - pad_right; // Construct a map to the output cells - float *outptrs[cells_i][cells_j]; + float *outptrs[OutputTileRows][OutputTileCols]; for (int i = 0; i < cells_i; i++) { for (int j = 0; j < cells_j; j++) @@ -365,19 +342,28 @@ void Transform::process_tile( } } +} // namespace (anonymous) + +namespace winograd +{ +using Tiles = OutputTransformImplTiles<5, 5, 6, 6, float>; + template <> +const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_2x2_5x5_fp32_process_tile<false>; + template <> -const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = -{ - { - Transform::template process_tile<0, 0>, // No padding - Transform::template process_tile<0, 1>, // Right padding - }, - { - Transform::template process_tile<1, 0>, // Bottom padding - Transform::template process_tile<1, 1>, // Bottom and right padding - } +const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2x2_5x5_fp32_process_tile<true>; + +template <> +const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = { + winograd_output_transform_2x2_5x5_fp32_process_tile<true, 1, 0> }; -template struct WinogradGEMM<2, 2, 5, 5>::OutputTransform<float>; +template <> +const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = { + winograd_output_transform_2x2_5x5_fp32_process_tile<true, 0, 1> +}; + +template class OutputTransform<5, 5, 6, 6, float>; } // namespace winograd + diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp index 2417f527bf..911759b128 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp @@ -23,38 +23,32 @@ */ #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp" -#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp" #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" -namespace winograd -{ - -using Transform = WinogradGEMM<1, 4, 1, 5>::OutputTransform<float>; -using TransformTransposed = WinogradGEMM<4, 1, 5, 1>::OutputTransform<float>; - -template <> -template <> -int Transform::ops_performed(const Tensor4DShape &shape) +namespace { - (void) shape; - return 0; // TODO -} -template <> -template <> -template <int pad_bottom, int pad_right> -void Transform::process_tile( +template <bool Specialized, int PadRight=0> +void winograd_output_transform_4_5_fp32_process_tile( const int n_channels, const float* const matrix_base, const int matrix_stride, const float* const biases, float* const output, const int output_row_stride, - const int output_col_stride + const int output_col_stride, + const int _pad_bottom, + const int _pad_right ) { (void) output_row_stride; - constexpr int cells_j = output_tile_cols - pad_right; + (void) _pad_bottom; + constexpr int output_tile_cols = 4; + constexpr int inner_tile_cols = 8; + + const int pad_right = Specialized ? PadRight : _pad_right; + const int cells_j = output_tile_cols - pad_right; // Construct a map to the output cells float *outptrs[cells_j]; @@ -156,23 +150,22 @@ void Transform::process_tile( } } -template <> -template <> -const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = +} // namespace (anonymous) + +namespace winograd { - { - Transform::template process_tile<0, 0>, - Transform::template process_tile<0, 1>, - Transform::template process_tile<0, 2>, - Transform::template process_tile<0, 3>, - }, -}; +using Tiles = OutputTransformImplTiles<1, 5, 1, 8, float>; template <> -template <> -const TransformTransposed::TileFn TransformTransposed::tile_fns[max_pad_bottom][max_pad_right] = {}; +const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_4_5_fp32_process_tile<true>; +template <> +const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = { + winograd_output_transform_4_5_fp32_process_tile<true, 1>, + winograd_output_transform_4_5_fp32_process_tile<true, 2>, + winograd_output_transform_4_5_fp32_process_tile<true, 3> +}; -template struct WinogradGEMM<1, 4, 1, 5>::OutputTransform<float>; -template struct WinogradGEMM<4, 1, 5, 1>::OutputTransform<float>; +template class OutputTransform<1, 5, 1, 8, float>; +template class OutputTransform<5, 1, 8, 1, float>; } // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp index cd3bdef0d2..15cc04b352 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp @@ -23,73 +23,34 @@ */ #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp" -#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp" #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" -namespace winograd +namespace { -using Transform = WinogradGEMM<4, 4, 3, 3>::OutputTransform<float>; - -template <> -template <> -int Transform::ops_performed(const Tensor4DShape &shape) -{ - // NOTE: Cost in FLOPs rather than instructions or uops. - const int tile_M = iceildiv(shape.n_rows, 4); - const int tile_N = iceildiv(shape.n_cols, 4); - return 170 * tile_M * tile_N * shape.n_channels; -} - -/* F(4x4, 3x3) constructs 4x4 output tiles from a 3x3 convolution. Since we use - * enough tiles to cover the output space each output tile may contain up to 3 - * padded values to the right and bottom columns or rows of the tile, e.g.: -* -* ________ ________ ________ ________ -* | | | X| | X X| | X X X| -* | | | X| | X X| | X X X| -* | | | X| | X X| | X X X| -* |_______| |______X| |____X_X| |__X_X_X| -* -* ________ ________ ________ ________ -* | | | X| | X X| | X X X| -* | | | X| | X X| | X X X| -* | | | X| | X X| | X X X| -* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| -* -* ________ ________ ________ ________ -* | | | X| | X X| | X X X| -* | | | X| | X X| | X X X| -* |X X X X| |X X X X| |X X X X| |X X X X| -* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| -* -* ________ ________ ________ ________ -* | | | X| | X X| | X X X| -* |X X X X| |X X X X| |X X X X| |X X X X| -* |X X X X| |X X X X| |X X X X| |X X X X| -* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| -* -* -* We provide a specialised output transform for each of these instances. -*/ -template <> -template <> -template <int pad_bottom, int pad_right> -void Transform::process_tile( +template <bool Specialized, int PadBottom=0, int PadRight=0> +void winograd_output_transform_4x4_3x3_fp32_process_tile( const int n_channels, const float* const matrix_base, const int matrix_stride, const float* const biases, float* const output, const int output_row_stride, - const int output_col_stride + const int output_col_stride, + const int _pad_bottom, + const int _pad_right ) { - constexpr int cells_i = 4 - pad_bottom; - constexpr int cells_j = 4 - pad_right; + const int pad_bottom = Specialized ? PadBottom : _pad_bottom; + const int pad_right = Specialized ? PadRight : _pad_right; + constexpr int TileRows = 4, TileCols = 4; + + const int cells_i = TileRows - pad_bottom; + const int cells_j = TileCols - pad_right; // Construct a map to the output cells - float *outptrs[cells_i][cells_j]; + float *outptrs[TileRows][TileCols]; for (int i = 0; i < cells_i; i++) { for (int j = 0; j < cells_j; j++) @@ -437,35 +398,31 @@ void Transform::process_tile( } } +} // namespace (anonymous) + +namespace winograd +{ +using Tiles = OutputTransformImplTiles<3, 3, 6, 6, float>; + template <> +const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_4x4_3x3_fp32_process_tile<false>; + template <> -const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = -{ - { - Transform::template process_tile<0, 0>, - Transform::template process_tile<0, 1>, - Transform::template process_tile<0, 2>, - Transform::template process_tile<0, 3>, - }, - { - Transform::template process_tile<1, 0>, - Transform::template process_tile<1, 1>, - Transform::template process_tile<1, 2>, - Transform::template process_tile<1, 3>, - }, - { - Transform::template process_tile<2, 0>, - Transform::template process_tile<2, 1>, - Transform::template process_tile<2, 2>, - Transform::template process_tile<2, 3>, - }, - { - Transform::template process_tile<3, 0>, - Transform::template process_tile<3, 1>, - Transform::template process_tile<3, 2>, - Transform::template process_tile<3, 3>, - } +const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_4x4_3x3_fp32_process_tile<true>; + +template <> +const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = { + winograd_output_transform_4x4_3x3_fp32_process_tile<true, 1, 0>, + winograd_output_transform_4x4_3x3_fp32_process_tile<true, 2, 0>, + winograd_output_transform_4x4_3x3_fp32_process_tile<true, 3, 0>, +}; + +template <> +const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = { + winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 1>, + winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 2>, + winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 3>, }; -template struct WinogradGEMM<4, 4, 3, 3>::OutputTransform<float>; +template class OutputTransform<3, 3, 6, 6, float>; } // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp index 16667ccdb6..58bed71a47 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp @@ -23,38 +23,32 @@ */ #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp" -#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp" #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" -namespace winograd -{ - -using Transform = WinogradGEMM<1, 6, 1, 3>::OutputTransform<float>; -using TransformTransposed = WinogradGEMM<6, 1, 3, 1>::OutputTransform<float>; - -template <> -template <> -int Transform::ops_performed(const Tensor4DShape &shape) +namespace { - (void) shape; - return 0; // TODO -} -template <> -template <> -template <int pad_bottom, int pad_right> -void Transform::process_tile( +template <bool Specialized, int PadRight=0> +void winograd_output_transform_6_3_fp32_process_tile( const int n_channels, const float* const matrix_base, const int matrix_stride, const float* const biases, float* const output, const int output_row_stride, - const int output_col_stride + const int output_col_stride, + const int _pad_bottom, + const int _pad_right ) { (void) output_row_stride; - constexpr int cells_j = output_tile_cols - pad_right; + (void) _pad_bottom; + constexpr int output_tile_cols = 6; + constexpr int inner_tile_cols = 8; + + const int pad_right = Specialized ? PadRight : _pad_right; + const int cells_j = output_tile_cols - pad_right; // Construct a map to the output cells float *outptrs[cells_j]; @@ -162,25 +156,24 @@ void Transform::process_tile( } } -template <> -template <> -const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = +} // namespace (anonymous) + +namespace winograd { - { - Transform::template process_tile<0, 0>, - Transform::template process_tile<0, 1>, - Transform::template process_tile<0, 2>, - Transform::template process_tile<0, 3>, - Transform::template process_tile<0, 4>, - Transform::template process_tile<0, 5>, - }, -}; +using Tiles = OutputTransformImplTiles<1, 3, 1, 8, float>; template <> -template <> -const TransformTransposed::TileFn TransformTransposed::tile_fns[max_pad_bottom][max_pad_right] = {}; +const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_6_3_fp32_process_tile<true>; +template <> +const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = { + winograd_output_transform_6_3_fp32_process_tile<true, 1>, + winograd_output_transform_6_3_fp32_process_tile<true, 2>, + winograd_output_transform_6_3_fp32_process_tile<true, 3>, + winograd_output_transform_6_3_fp32_process_tile<true, 4>, + winograd_output_transform_6_3_fp32_process_tile<true, 5>, +}; -template struct WinogradGEMM<1, 6, 1, 3>::OutputTransform<float>; -template struct WinogradGEMM<6, 1, 3, 1>::OutputTransform<float>; +template class OutputTransform<1, 3, 1, 8, float>; +template class OutputTransform<3, 1, 8, 1, float>; } // namespace winograd |