From 5264b7d5555ec980f9c52c719122479d0d676af8 Mon Sep 17 00:00:00 2001 From: Pablo Tello Date: Mon, 21 Oct 2019 14:25:41 +0100 Subject: COMPMID-2576: Fuse activation in Winograd output transform. Change-Id: I26dd1307847adeaaefae0a7374b9858c07d71372 Signed-off-by: Pablo Tello Reviewed-on: https://review.mlplatform.org/c/2172 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gian Marco Iodice --- .../NEON/kernels/convolution/winograd/winograd.cpp | 250 ++++++++------------- 1 file changed, 96 insertions(+), 154 deletions(-) (limited to 'src/core/NEON/kernels/convolution/winograd/winograd.cpp') diff --git a/src/core/NEON/kernels/convolution/winograd/winograd.cpp b/src/core/NEON/kernels/convolution/winograd/winograd.cpp index 226f303c7d..a4eb9fce59 100644 --- a/src/core/NEON/kernels/convolution/winograd/winograd.cpp +++ b/src/core/NEON/kernels/convolution/winograd/winograd.cpp @@ -21,205 +21,147 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ + #include +#include "utils.hpp" #include "winograd.hpp" + using namespace winograd; +using array2 = std::pair; -/** Get the output shape of a convolution. */ -template -template -Tensor4DShape WinogradGEMM::Convolution::get_output_shape( - const KernelShape &kernel_shape, - const Tensor4DShape &in_shape, - const PaddingType padding -) -{ - return Tensor4DShape { - in_shape.n_batches, - (padding == PADDING_SAME) ? in_shape.n_rows : in_shape.n_rows - (kernel_rows - 1), - (padding == PADDING_SAME) ? in_shape.n_cols : in_shape.n_cols - (kernel_cols - 1), - kernel_shape.n_output_channels, - in_shape.ordering - }; -} +#define MEMBERFN(RTYPE) \ + template \ + template \ + RTYPE WinogradGEMM::Convolution -/* Get the memory required to transform the kernel. - */ -template -template -size_t WinogradGEMM::Convolution::get_kernel_transform_working_size(const KernelShape &shape) -{ - if (shape.ordering == HWIO) - { - // Kernel is already in the correct order, so no additional memory is - // required. - return 0; - } - else - { - // Need to re-order the kernel into HWIO form, require enough space to - // represent the tensor. - return sizeof(TIn) * shape.size(); - } +/** Get the output shape of a convolution. */ +MEMBERFN(array2) +::get_output_shape(const std::pair input_shape, + const bool padding_same) { + const unsigned int n_rows = + padding_same ? input_shape.first : input_shape.first - (kernel_rows - 1); + const unsigned int n_cols = padding_same + ? input_shape.second + : input_shape.second - (kernel_cols - 1); + return {n_rows, n_cols}; } /** Get the memory required to store the kernel transformed into the * Winograd domain. */ -template -template -size_t WinogradGEMM::Convolution::get_kernel_storage_size(const KernelShape &shape) -{ - return N_GEMMS * get_kernel_matrix_size(shape); +MEMBERFN(size_t) +::get_kernel_storage_size(const unsigned int n_input_channels, + const unsigned int n_output_channels) { + return N_GEMMS * get_kernel_matrix_size(n_input_channels, n_output_channels); } - -template -template -size_t WinogradGEMM::Convolution::get_input_storage_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding -) -{ - return N_GEMMS * get_input_matrix_size(kernel_shape, input_shape, padding); +MEMBERFN(size_t) +::get_input_storage_size(const unsigned int n_batches, + const unsigned int n_rows, const unsigned int n_cols, + const unsigned int n_channels, + const bool same_padding) { + return N_GEMMS * get_input_matrix_size(n_batches, n_rows, n_cols, n_channels, + same_padding); } - -template -template -size_t WinogradGEMM::Convolution::get_output_storage_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding -) -{ - return N_GEMMS * get_output_matrix_size(kernel_shape, input_shape, padding); +MEMBERFN(size_t) +::get_output_storage_size(const unsigned int n_batches, + const unsigned int n_rows, const unsigned int n_cols, + const unsigned int n_channels) { + return N_GEMMS * + get_output_matrix_size(n_batches, n_rows, n_cols, n_channels); } - /** Get the memory required to apply a Winograd operator to some input. */ -template -template -size_t WinogradGEMM::Convolution::get_working_space_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type -) -{ - const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type); +MEMBERFN(size_t) +::get_working_space_size(const unsigned int n_batches, + const unsigned int n_rows, const unsigned int n_cols, + const unsigned int n_input_channels, + const unsigned int n_output_channels, + const bool padding_same) { + const auto output_shape = get_output_shape({n_rows, n_cols}, padding_same); // Get the memory required to store the matrices - const size_t matrix_sizes = N_GEMMS * ( - get_input_matrix_size(kernel_shape, input_shape, padding_type) + - get_output_matrix_size(kernel_shape, input_shape, padding_type) - ); - - // Add additional space to re-order the input and output if the input tensor - // is not in NHWC format. - if (input_shape.ordering == NHWC) - { - return matrix_sizes; // No extra spacing required - } - else // NCHW, must reorder the input and output tensors - { - // We only need to re-order the input or output at any one time, so request - // enough memory to do the largest of these. - const size_t extra_memory = std::max( - sizeof(TIn) * input_shape.size(), - sizeof(TOut) * output_shape.size() - ); - return matrix_sizes + extra_memory; - } + const size_t matrix_sizes = + N_GEMMS * + (get_input_matrix_size(n_batches, n_rows, n_cols, n_input_channels, + padding_same) + + get_output_matrix_size(n_batches, output_shape.first, + output_shape.second, n_output_channels)); + return matrix_sizes; } - /* Get the memory required by a single "input" matrix. */ -template -template -size_t WinogradGEMM::Convolution::get_input_matrix_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type -) -{ - return get_input_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TGIn); +MEMBERFN(size_t) +::get_input_matrix_size(const unsigned int n_batches, const unsigned int n_rows, + const unsigned int n_cols, + const unsigned int n_channels, + const bool same_padding) { + return get_input_matrix_stride(n_batches, n_rows, n_cols, n_channels, + same_padding) * + sizeof(TGEMMIn); } -template -template -int WinogradGEMM::Convolution::get_input_matrix_stride( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type -) -{ - // Compute shape for the GEMM - const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type); - const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows); - const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols); - const int M = roundup(input_shape.n_batches * tile_rows * tile_cols, M_BLOCK); - const int K = kernel_shape.n_input_channels; +MEMBERFN(int) +::get_input_matrix_stride(const unsigned int n_batches, const unsigned int n_rows, + const unsigned int n_cols, + const unsigned int n_channels, + const bool same_padding) { + const auto output_shape = get_output_shape({n_rows, n_cols}, same_padding); + const unsigned int tile_rows = iceildiv(output_shape.first, output_tile_rows); + const unsigned int tile_cols = + iceildiv(output_shape.second, output_tile_cols); + const unsigned int M = + roundup(n_batches * tile_rows * tile_cols, M_BLOCK); + const unsigned int K = n_channels; return M * K; } - /* Get the memory required by a single "output" matrix. */ -template -template -size_t WinogradGEMM::Convolution::get_output_matrix_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type -) -{ - return get_output_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TGOut); +MEMBERFN(size_t) +::get_output_matrix_size(const unsigned int n_batches, + const unsigned int n_rows, const unsigned int n_cols, + const unsigned int n_channels) { + return get_output_matrix_stride(n_batches, n_rows, n_cols, n_channels) * + sizeof(TGEMMOut); } - -template -template -int WinogradGEMM::Convolution::get_output_matrix_stride( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type -) -{ +MEMBERFN(int) +::get_output_matrix_stride(const unsigned int n_batches, + const unsigned int n_rows, const unsigned int n_cols, + const unsigned int n_channels) { // Compute shape for the GEMM - const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type); - const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows); - const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols); - const int M = roundup(tile_rows * tile_cols, M_BLOCK); - const int N = roundup(kernel_shape.n_output_channels, N_BLOCK); + const int tile_rows = iceildiv(n_rows, output_tile_rows); + const int tile_cols = iceildiv(n_cols, output_tile_cols); + const int M = roundup(tile_rows * tile_cols, M_BLOCK); + const int N = roundup(n_channels, N_BLOCK); - return input_shape.n_batches * M * N; + return n_batches * M * N; } /* Get the memory required by a single "kernel" matrix. */ -template -template -size_t WinogradGEMM::Convolution::get_kernel_matrix_size(const KernelShape &shape) -{ - return sizeof(TGIn) * get_kernel_matrix_stride(shape); +MEMBERFN(size_t) +::get_kernel_matrix_size(const unsigned int n_input_channels, + const unsigned int n_output_channels) { + return sizeof(TGEMMIn) * + get_kernel_matrix_stride(n_input_channels, n_output_channels); } -template -template -int WinogradGEMM::Convolution::get_kernel_matrix_stride(const KernelShape &shape) -{ - const int K = shape.n_input_channels; - const int N = roundup(shape.n_output_channels, N_BLOCK); - return K * N; +MEMBERFN(int) +::get_kernel_matrix_stride(const unsigned int n_input_channels, + const unsigned int n_output_channels) { + return n_input_channels * roundup(n_output_channels, N_BLOCK); } - // Instantiate required implementations template class WinogradGEMM<2, 2, 3, 3, WinogradRoots::Integers>::Convolution; template class WinogradGEMM<4, 4, 3, 3, WinogradRoots::Integers>::Convolution; -- cgit v1.2.1