From 9ceebbeb8dfe61746fdc7022a147f8e2d24c5493 Mon Sep 17 00:00:00 2001
From: Pablo Tello
Date: Wed, 10 Jan 2018 16:44:13 +0000
Subject: COMPMID-815: Updated NEWinogradLayer with the latest code from
 Research.

Change-Id: I86d7f53b5f5d1dbc22078aea5c32b08a25d1f49e
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/116634
Tested-by: Jenkins
Reviewed-by: Anthony Barbier
---
 src/core/NEON/kernels/winograd/winograd_gemm.cpp | 560 +++++++++++++++++++++++
 1 file changed, 560 insertions(+)
 create mode 100644 src/core/NEON/kernels/winograd/winograd_gemm.cpp

diff --git a/src/core/NEON/kernels/winograd/winograd_gemm.cpp b/src/core/NEON/kernels/winograd/winograd_gemm.cpp
new file mode 100644
index 0000000000..b44a45367f
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/winograd_gemm.cpp
@@ -0,0 +1,560 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "winograd_gemm.hpp"
+#include "batched_blocked_gemm.hpp"
+using namespace winograd;
+
+/** Get the output shape of a convolution. */
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+Tensor4DShape WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::get_output_shape(
+  const KernelShape &kernel_shape,
+  const Tensor4DShape &in_shape,
+  const PaddingType padding
+)
+{
+  // TODO Accept different kernel sizes
+  return Tensor4DShape {
+    in_shape.n_batches,
+    (padding == PADDING_SAME) ? in_shape.n_rows : in_shape.n_rows - 2,
+    (padding == PADDING_SAME) ? in_shape.n_cols : in_shape.n_cols - 2,
+    kernel_shape.n_output_channels,
+    in_shape.ordering
+  };
+}
+
+/* Get the memory required to transform the kernel.
+ */
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+size_t WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::get_kernel_transform_working_size(const KernelShape &shape)
+{
+  if (shape.ordering == HWIO)
+  {
+    // The kernel is already in the correct order, so no additional memory is
+    // required.
+    return 0;
+  }
+  else
+  {
+    // The kernel must be re-ordered into HWIO form, so request enough space
+    // to represent the re-ordered tensor.
+    return sizeof(TIn) * shape.size();
+  }
+}
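// A minimal standalone sketch (not part of the patch) of the VALID-padding
// shape rule used by get_output_shape() above: the hard-coded "- 2" is the
// 3x3-kernel case of output = input - (kernel - 1), while SAME padding keeps
// the input dimension. Plain ints are used here instead of the library's
// Tensor4DShape so the numbers are easy to check by hand.
#include <cassert>

static int valid_output_dim(const int input_dim, const int kernel_dim)
{
  return input_dim - (kernel_dim - 1);  // SAME padding would return input_dim
}

int main()
{
  assert(valid_output_dim(224, 3) == 222);  // 3x3 kernel: shrink by 2
  assert(valid_output_dim(224, 5) == 220);  // 5x5 kernel: shrink by 4
  return 0;
}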
+/** Get the memory required to store the kernel transformed into the
+ * Winograd domain.
+ */
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+size_t WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::get_kernel_storage_size(const KernelShape &shape)
+{
+  return N_GEMMS * get_kernel_matrix_size(shape);
+}
+
+
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+size_t WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::get_input_storage_size(
+  const KernelShape &kernel_shape,
+  const Tensor4DShape &input_shape,
+  const PaddingType padding
+)
+{
+  return N_GEMMS * get_input_matrix_size(kernel_shape, input_shape, padding);
+}
+
+
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+size_t WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::get_output_storage_size(
+  const KernelShape &kernel_shape,
+  const Tensor4DShape &input_shape,
+  const PaddingType padding
+)
+{
+  return N_GEMMS * get_output_matrix_size(kernel_shape, input_shape, padding);
+}
+
+
+/** Get the memory required to apply a Winograd operator to some input.
+ */
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+size_t WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::get_working_space_size(
+  const KernelShape &kernel_shape,
+  const Tensor4DShape &input_shape,
+  const PaddingType padding_type
+)
+{
+  const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type);
+
+  // Get the memory required to store the matrices
+  const size_t matrix_sizes = N_GEMMS * (
+    get_input_matrix_size(kernel_shape, input_shape, padding_type) +
+    get_output_matrix_size(kernel_shape, input_shape, padding_type)
+  );
+
+  // Add additional space to re-order the input and output if the input tensor
+  // is not in NHWC format.
+  if (input_shape.ordering == NHWC)
+  {
+    return matrix_sizes;  // No extra space required
+  }
+  else  // NCHW, must reorder the input and output tensors
+  {
+    // We only need to re-order the input or output at any one time, so request
+    // enough memory to do the larger of the two.
+    const size_t extra_memory = std::max(
+      sizeof(TIn) * input_shape.size(),
+      sizeof(TOut) * output_shape.size()
+    );
+    return matrix_sizes + extra_memory;
+  }
+}
+
+
+/* Get the memory required by a single "input" matrix.
+ */
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+size_t WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::get_input_matrix_size(
+  const KernelShape &kernel_shape,
+  const Tensor4DShape &input_shape,
+  const PaddingType padding_type
+)
+{
+  return get_input_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TIn);
+}
+
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+int WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::get_input_matrix_stride(
+  const KernelShape &kernel_shape,
+  const Tensor4DShape &input_shape,
+  const PaddingType padding_type
+)
+{
+  // Compute shape for the GEMM
+  const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type);
+  const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows);
+  const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols);
+  const int M = roundup(input_shape.n_batches * tile_rows * tile_cols, M_BLOCK);
+  const int K = kernel_shape.n_input_channels;
+
+  return M * K;
+}
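// A sketch (not part of the patch) of how the GEMM "M" dimension computed in
// get_input_matrix_stride() above behaves, using standalone copies of
// iceildiv/roundup; the example sizes and the M_BLOCK value of 4 are
// illustrative assumptions, not the library's definitions.
#include <cstdio>

static int iceildiv_(const int a, const int b) { return (a + b - 1) / b; }
static int roundup_(const int a, const int b) { return b * iceildiv_(a, b); }

int main()
{
  const int tile_rows = iceildiv_(222, 2);  // 111 two-row output tiles
  const int tile_cols = iceildiv_(222, 2);  // 111 two-column output tiles
  const int M = roundup_(1 * tile_rows * tile_cols, 4);  // 12321 -> 12324
  printf("tile_rows=%d tile_cols=%d M=%d\n", tile_rows, tile_cols, M);
  return 0;
}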
+/* Get the memory required by a single "output" matrix.
+ */
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+size_t WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::get_output_matrix_size(
+  const KernelShape &kernel_shape,
+  const Tensor4DShape &input_shape,
+  const PaddingType padding_type
+)
+{
+  return get_output_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TOut);
+}
+
+
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+int WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::get_output_matrix_stride(
+  const KernelShape &kernel_shape,
+  const Tensor4DShape &input_shape,
+  const PaddingType padding_type
+)
+{
+  // Compute shape for the GEMM
+  const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type);
+  const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows);
+  const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols);
+  const int M = roundup(tile_rows * tile_cols, M_BLOCK);
+  const int N = roundup(kernel_shape.n_output_channels, N_BLOCK);
+
+  return input_shape.n_batches * M * N;
+}
+
+
+/* Get the memory required by a single "kernel" matrix.
+ */
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+size_t WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::get_kernel_matrix_size(const KernelShape &shape)
+{
+  return sizeof(TIn) * get_kernel_matrix_stride(shape);
+}
+
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+int WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::get_kernel_matrix_stride(const KernelShape &shape)
+{
+  const int K = shape.n_input_channels;
+  const int N = roundup(shape.n_output_channels, N_BLOCK);
+  return K * N;
+}
+
+
+/** Create a new Winograd operator. */
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::Convolution(
+  const KernelShape &kernel_shape,
+  const Tensor4DShape &input_shape,
+  const PaddingType padding,
+  void *kernel_storage
+) : kernel_shape(kernel_shape),  // Store the kernel shape
+    kernel_matrix_row_stride(roundup(kernel_shape.n_output_channels, N_BLOCK)),
+    manage_kernel_storage(kernel_storage == NULL),
+    _kernel_storage(manage_kernel_storage ?
+                      ALLOCATE(get_kernel_storage_size(kernel_shape)) :
+                      kernel_storage),
+    input_shape(input_shape),
+    padding(padding),
+    output_shape(get_output_shape(kernel_shape, input_shape, padding)),
+    tile_rows(iceildiv(output_shape.n_rows, output_tile_rows)),
+    tile_cols(iceildiv(output_shape.n_cols, output_tile_cols)),
+    M(input_shape.n_batches * tile_rows * tile_cols),
+    K(kernel_shape.n_input_channels),
+    N(kernel_shape.n_output_channels),
+    prof()
+{
+  // Create pointers to the kernel matrices
+  const int kernel_matrix_size_bytes = get_kernel_matrix_size(kernel_shape);
+  int8_t* const ks_bytes = reinterpret_cast<int8_t *>(_kernel_storage);
+  for (int i = 0; i < N_GEMMS; i++) {
+    kernel_matrices[i] = reinterpret_cast<TIn *>(
+      ks_bytes + i*kernel_matrix_size_bytes);
+  }
+}
+
+
+/** Create a new Winograd operator and initialise the weights. */
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::Convolution(
+  const KernelShape &kernel_shape,
+  const Tensor4DShape &input_shape,
+  const PaddingType padding,
+  const TIn* const kernel,
+  void *kernel_storage,
+  void *transform_working_space
+) : Convolution(kernel_shape, input_shape, padding, kernel_storage)
+{
+  transform_weights(kernel, transform_working_space);
+}
+
+
+/** Clean up a convolution engine. */
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::~Convolution()
+{
+  // If we were responsible for managing kernel storage ensure that it is
+  // freed.
+  if (manage_kernel_storage)
+  {
+    free(_kernel_storage);
+  }
+}
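// A sketch (not part of the patch) of the pointer arithmetic the constructor
// above uses to lay the N_GEMMS kernel matrices out back to back in a single
// allocation; the count, size and element type here are illustrative
// stand-ins, not the library's values.
#include <cstdint>
#include <cstdlib>

int main()
{
  const int n_gemms = 16;              // stand-in for N_GEMMS
  const int matrix_size_bytes = 1024;  // stand-in for get_kernel_matrix_size()
  void *storage = malloc(n_gemms * matrix_size_bytes);

  float *matrices[16];
  int8_t *const bytes = reinterpret_cast<int8_t *>(storage);
  for (int i = 0; i < n_gemms; i++)
  {
    // Each matrix begins one full matrix stride after the previous one.
    matrices[i] = reinterpret_cast<float *>(bytes + i * matrix_size_bytes);
  }

  (void) matrices;
  free(storage);
  return 0;
}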
+/** Transform weights into the Winograd domain and store them for later use/reuse.
+ */
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+template <typename WeightsTransformT>
+void WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::transform_weights(
+  const TIn* const kernel,
+  void *transform_working_space
+)
+{
+  // Allocate working space if it is required
+  bool allocated_working_space = false;
+  if (transform_working_space == NULL &&  // If no memory has been provided
+      get_kernel_transform_working_size(kernel_shape) != 0)  // And we need the space
+  {
+    allocated_working_space = true;
+    transform_working_space = ALLOCATE(
+      get_kernel_transform_working_size(kernel_shape)
+    );
+  }
+
+  // The transformation methods only work on weights laid out in HWIO form; if
+  // the weights are not in this form then we need to re-order them.
+  const TIn *kernel_hwio = kernel;
+  if (kernel_shape.ordering != HWIO)
+  {
+    kernel_hwio = reinterpret_cast<TIn *>(transform_working_space);
+
+    // Re-order the weights from OIHW to HWIO
+    this->prof(
+      "Weight reorder",
+      [&kernel, &kernel_hwio, this] () {
+        reorder::ofm_ifm_h_w_to_h_w_ifm_ofm(
+          kernel, const_cast<TIn *>(kernel_hwio),
+          kernel_shape.n_output_channels,
+          kernel_shape.n_input_channels,
+          kernel_shape.n_rows,
+          kernel_shape.n_cols
+        );
+      },
+      kernel_shape.size() * sizeof(TIn),
+      0,
+      kernel_shape.size() * sizeof(TIn)
+    );
+  }
+
+  const int kernel_matrix_size_bytes = get_kernel_matrix_size(kernel_shape);
+  WeightsTransformT weights_transform(
+    kernel_hwio, kernel_matrices[0],
+    kernel_matrix_size_bytes / sizeof(TIn),
+    kernel_matrix_row_stride,
+    kernel_shape.n_output_channels,
+    kernel_shape.n_input_channels
+  );
+
+  // Transform the weights into the Winograd domain
+  auto kernel_prep = [&] ()
+  {
+    weights_transform.run(0, weights_transform.get_window());
+  };
+
+  prof(
+    "Kernel Prep", kernel_prep,
+    WeightsTransformT::bytes_read(kernel_shape),
+    WeightsTransformT::ops_performed(kernel_shape),
+    WeightsTransformT::bytes_written(kernel_shape)
+  );
+
+  // Free memory if we allocated it
+  if (allocated_working_space)
+  {
+    free(transform_working_space);
+  }
+}
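// A sketch (not part of the patch) of the index mapping performed by the
// OIHW-to-HWIO weight reorder invoked in transform_weights() above
// (reorder::ofm_ifm_h_w_to_h_w_ifm_ofm), assuming dense, unpadded tensors:
static void oihw_to_hwio(const float *src, float *dst,
                         const int n_ofm, const int n_ifm,
                         const int n_rows, const int n_cols)
{
  for (int o = 0; o < n_ofm; o++)
    for (int i = 0; i < n_ifm; i++)
      for (int r = 0; r < n_rows; r++)
        for (int c = 0; c < n_cols; c++)
        {
          // OIHW: outermost output channel; HWIO: outermost kernel row.
          const int src_idx = ((o * n_ifm + i) * n_rows + r) * n_cols + c;
          const int dst_idx = ((r * n_cols + c) * n_ifm + i) * n_ofm + o;
          dst[dst_idx] = src[src_idx];
        }
}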
+/** Perform a convolution. */
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+void WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::execute(
+  TOut* const output,
+  const TIn* const input,
+  void *working_space,
+  const int n_threads
+)
+{
+  const auto padding_type = padding;
+  const auto input_shape = this->input_shape;
+
+  // Allocate working space if none has been provided
+  const bool manage_working_space = (working_space == NULL);
+  if (manage_working_space)
+  {
+    const size_t ws_size = get_working_space_size(
+      kernel_shape, input_shape, padding_type
+    );
+    working_space = ALLOCATE(ws_size * sizeof(int8_t));
+    memset(working_space, 0x00, ws_size);
+  }
+  int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space);
+
+  // Split the working space into that required for the N_GEMMS input
+  // matrices and output matrices.
+  TIn *input_matrices[N_GEMMS];
+  TOut *output_matrices[N_GEMMS];
+  const int in_matrix_stride_bytes = get_input_matrix_size(kernel_shape, input_shape, padding_type);
+  const int out_matrix_stride_bytes = get_output_matrix_size(kernel_shape, input_shape, padding_type);
+
+  for (int i = 0; i < N_GEMMS; i++)
+  {
+    input_matrices[i] = reinterpret_cast<TIn *>(
+      ws_bytes + i*in_matrix_stride_bytes);
+    output_matrices[i] = reinterpret_cast<TOut *>(
+      ws_bytes + N_GEMMS*in_matrix_stride_bytes + i*out_matrix_stride_bytes);
+  }
+
+  // If we need to re-order the input and output tensors then the final chunk
+  // of the working space can be used for this purpose.
+  // TODO - Overlay the input reorder on top of the output matrices
+  //      - Overlay the output reorder on top of the input matrices
+  // Reorder the input into NHWC form if it was not provided in this ordering.
+  const TIn* input_nhwc = input;
+  if (input_shape.ordering == NCHW)
+  {
+    input_nhwc = reinterpret_cast<TIn *>(
+      ws_bytes + N_GEMMS*(in_matrix_stride_bytes + out_matrix_stride_bytes)
+    );
+
+    this->prof(
+      "NCHW -> NHWC",
+      [input, input_shape, input_nhwc] () {
+        reorder::nchw_to_nhwc(
+          input, const_cast<TIn *>(input_nhwc),
+          input_shape.n_batches,
+          input_shape.n_channels,
+          input_shape.n_rows,
+          input_shape.n_cols
+        );
+      },
+      input_shape.size(), 0, input_shape.size()
+    );
+  }
+
+  // Compute shape for the GEMM
+  const auto output_shape = this->output_shape;
+  int M = this->M;
+  int K = this->K;
+  int N = this->N;
+
+  const int in_matrix_row_stride = K;
+  const int out_matrix_row_stride = kernel_matrix_row_stride;
+
+  InputTransform input_transform(
+    input_nhwc,
+    input_shape.n_batches,
+    input_shape.n_rows,
+    input_shape.n_cols,
+    input_shape.n_channels,
+    padding_type,
+    input_matrices[0],
+    in_matrix_stride_bytes / sizeof(TIn),
+    in_matrix_row_stride
+  );
+
+  // Transform the input into the Winograd domain
+  auto input_prep = [&] () {
+    input_transform.run(0, input_transform.get_window());
+  };
+  prof(
+    "Input Prep", input_prep,
+    InputTransform::bytes_read(input_shape),
+    InputTransform::ops_performed(input_shape),
+    InputTransform::bytes_written(input_shape)
+  );
+
+  // Perform the GEMMs
+  const int kernel_matrix_stride_bytes = get_kernel_matrix_size(kernel_shape);
+  BatchedBlockedGemm<M_BLOCK, N_BLOCK, TOut, TIn> gemms(
+    N_GEMMS, M, K, N,
+    in_matrix_stride_bytes / sizeof(TIn),
+    in_matrix_row_stride,
+    kernel_matrix_stride_bytes / sizeof(TIn),
+    kernel_matrix_row_stride,
+    out_matrix_stride_bytes / sizeof(TOut),
+    out_matrix_row_stride,
+    input_matrices[0],
+    kernel_matrices[0],
+    output_matrices[0]
+  );
+  gemms.run(0, gemms.get_window());
+
+  // If the output tensor needs to be in NCHW form then store the NHWC output
+  // tensor in temporary storage and then reorder. If the output tensor needs
+  // to be in NHWC then just write straight to the output tensor.
+  TOut *output_nhwc = output;
+  if (input_shape.ordering == NCHW)
+  {
+    output_nhwc = reinterpret_cast<TOut *>(
+      ws_bytes + N_GEMMS*(in_matrix_stride_bytes + out_matrix_stride_bytes)
+    );
+  }
+
+  // Transform the output tensor from the Winograd domain to the spatial
+  // domain.
+  OutputTransform output_transform(
+    output_matrices[0],
+    out_matrix_stride_bytes / sizeof(TOut),
+    out_matrix_row_stride,
+    output_nhwc,
+    output_shape.n_batches,
+    output_shape.n_rows,
+    output_shape.n_cols,
+    output_shape.n_channels
+  );
+  auto output_prep = [&] () {
+    output_transform.run(0, output_transform.get_window());
+  };
+  prof(
+    "Output Comp", output_prep,
+    OutputTransform::bytes_read(output_shape),
+    OutputTransform::ops_performed(output_shape),
+    OutputTransform::bytes_written(output_shape)
+  );
+
+  // Reorder the output tensor if it is required to be in NCHW form.
+  if (input_shape.ordering == NCHW)
+  {
+    prof(
+      "NHWC -> NCHW",
+      [output_nhwc, output_shape, output] () {
+        reorder::nhwc_to_nchw(
+          output_nhwc, output,
+          output_shape.n_batches,
+          output_shape.n_rows,
+          output_shape.n_cols,
+          output_shape.n_channels
+        );
+      },
+      output_shape.size(), 0, output_shape.size()
+    );
+  }
+
+  // Free working space if we were responsible for allocating it
+  if (manage_working_space)
+  {
+    free(working_space);
+  }
+}
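// A sketch (not part of the patch) of the three regions execute() above
// carves out of its working space: N_GEMMS input matrices, then N_GEMMS
// output matrices, then (only for NCHW tensors) reorder scratch. The sizes
// are illustrative stand-ins for the get_*_matrix_size() helpers.
#include <cstddef>

int main()
{
  const size_t n_gemms = 16;      // stand-in for N_GEMMS
  const size_t in_bytes = 2048;   // stand-in for get_input_matrix_size()
  const size_t out_bytes = 1024;  // stand-in for get_output_matrix_size()

  const size_t input_matrices_off = 0;
  const size_t output_matrices_off = n_gemms * in_bytes;
  const size_t reorder_scratch_off = n_gemms * (in_bytes + out_bytes);

  // The regions are laid out back to back, in this order.
  return (input_matrices_off < output_matrices_off &&
          output_matrices_off < reorder_scratch_off) ? 0 : 1;
}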
+/** Perform a convolution. */
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+void WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::execute(
+  TOut* const output,
+  const TIn* const input,
+  const int n_threads
+)
+{
+  execute(output, input, NULL, n_threads);
+}
+
+
+// Instantiate required implementations
+template class WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>;
+template class WinogradGEMM<4, 4, 3, 3>::Convolution<float, float>;
--
cgit v1.2.1
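The two instantiations at the end of the patch differ in output tile size, which determines how many GEMMs each convolution is decomposed into: one GEMM per point of the transformed tile, which spans (output_tile + kernel - 1) points per dimension. A standalone sketch of that arithmetic (the helper below is illustrative, not part of the library):

#include <cstdio>

static int n_gemms(const int output_tile, const int kernel)
{
  const int tile = output_tile + kernel - 1;  // input tile edge length
  return tile * tile;                         // one GEMM per tile point
}

int main()
{
  printf("WinogradGEMM<2, 2, 3, 3>: %d GEMMs\n", n_gemms(2, 3));  // 16
  printf("WinogradGEMM<4, 4, 3, 3>: %d GEMMs\n", n_gemms(4, 3));  // 36
  return 0;
}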