/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "winograd_gemm.hpp"
#include "batched_blocked_gemm.hpp"

#include <algorithm>  // std::max
#include <cstdint>    // int8_t
#include <cstdlib>    // free
#include <cstring>    // memset

using namespace winograd;

/** Get the output shape of a convolution. */
template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
template <typename TOut, typename TIn>
Tensor4DShape WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
Convolution<TOut, TIn>::get_output_shape(
  const KernelShape &kernel_shape,
  const Tensor4DShape &in_shape,
  const PaddingType padding
)
{
  return Tensor4DShape {
    in_shape.n_batches,
    (padding == PADDING_SAME) ? in_shape.n_rows : in_shape.n_rows - (kernel_rows - 1),
    (padding == PADDING_SAME) ? in_shape.n_cols : in_shape.n_cols - (kernel_cols - 1),
    kernel_shape.n_output_channels,
    in_shape.ordering
  };
}

/** Get the memory required to transform the kernel. */
template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
template <typename TOut, typename TIn>
size_t WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
Convolution<TOut, TIn>::get_kernel_transform_working_size(const KernelShape &shape)
{
  if (shape.ordering == HWIO)
  {
    // Kernel is already in the correct order, so no additional memory is
    // required.
    return 0;
  }
  else
  {
    // Need to re-order the kernel into HWIO form; require enough space to
    // represent the tensor.
    return sizeof(TIn) * shape.size();
  }
}

/** Get the memory required to store the kernel transformed into the
 * Winograd domain.
 */
template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
template <typename TOut, typename TIn>
size_t WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
Convolution<TOut, TIn>::get_kernel_storage_size(const KernelShape &shape)
{
  return N_GEMMS * get_kernel_matrix_size(shape);
}

/** Get the memory required to store the input tensor transformed into the
 * Winograd domain.
 */
template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
template <typename TOut, typename TIn>
size_t WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
Convolution<TOut, TIn>::get_input_storage_size(
  const KernelShape &kernel_shape,
  const Tensor4DShape &input_shape,
  const PaddingType padding
)
{
  return N_GEMMS * get_input_matrix_size(kernel_shape, input_shape, padding);
}

/** Get the memory required to store the output tensor while still in the
 * Winograd domain.
 */
template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
template <typename TOut, typename TIn>
size_t WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
Convolution<TOut, TIn>::get_output_storage_size(
  const KernelShape &kernel_shape,
  const Tensor4DShape &input_shape,
  const PaddingType padding
)
{
  return N_GEMMS * get_output_matrix_size(kernel_shape, input_shape, padding);
}
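/* Illustrative sketch (not part of the original source): how a caller might
 * combine the storage queries above to pre-allocate buffers once and pass
 * them to the Convolution constructor defined further below. Only the
 * get_*_size signatures come from this file; PADDING_VALID and the use of
 * malloc/free for the caller-owned buffers are assumptions.
 *
 *   using Conv = WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>;
 *
 *   void *kernel_storage = malloc(Conv::get_kernel_storage_size(kernel_shape));
 *   void *transform_ws   = malloc(Conv::get_kernel_transform_working_size(kernel_shape));
 *   // ... construct a Conv with kernel_storage, then free both buffers once
 *   // the operator is destroyed.
 */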
/** Get the memory required to apply a Winograd operator to some input. */
template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
template <typename TOut, typename TIn>
size_t WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
Convolution<TOut, TIn>::get_working_space_size(
  const KernelShape &kernel_shape,
  const Tensor4DShape &input_shape,
  const PaddingType padding_type
)
{
  const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type);

  // Get the memory required to store the matrices
  const size_t matrix_sizes = N_GEMMS * (
    get_input_matrix_size(kernel_shape, input_shape, padding_type) +
    get_output_matrix_size(kernel_shape, input_shape, padding_type)
  );

  // Add additional space to re-order the input and output if the input tensor
  // is not in NHWC format.
  if (input_shape.ordering == NHWC)
  {
    return matrix_sizes;  // No extra space required
  }
  else  // NCHW, must reorder the input and output tensors
  {
    // We only need to re-order the input or output at any one time, so request
    // enough memory to do the largest of these.
    const size_t extra_memory = std::max(
      sizeof(TIn) * input_shape.size(),
      sizeof(TOut) * output_shape.size()
    );
    return matrix_sizes + extra_memory;
  }
}

/** Get the memory required by a single "input" matrix. */
template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
template <typename TOut, typename TIn>
size_t WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
Convolution<TOut, TIn>::get_input_matrix_size(
  const KernelShape &kernel_shape,
  const Tensor4DShape &input_shape,
  const PaddingType padding_type
)
{
  return get_input_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TIn);
}

template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
template <typename TOut, typename TIn>
int WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
Convolution<TOut, TIn>::get_input_matrix_stride(
  const KernelShape &kernel_shape,
  const Tensor4DShape &input_shape,
  const PaddingType padding_type
)
{
  // Compute shape for the GEMM
  const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type);
  const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows);
  const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols);
  const int M = roundup(input_shape.n_batches * tile_rows * tile_cols, M_BLOCK);
  const int K = kernel_shape.n_input_channels;

  return M * K;
}

/** Get the memory required by a single "output" matrix. */
template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
template <typename TOut, typename TIn>
size_t WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
Convolution<TOut, TIn>::get_output_matrix_size(
  const KernelShape &kernel_shape,
  const Tensor4DShape &input_shape,
  const PaddingType padding_type
)
{
  return get_output_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TOut);
}

template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
template <typename TOut, typename TIn>
int WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
Convolution<TOut, TIn>::get_output_matrix_stride(
  const KernelShape &kernel_shape,
  const Tensor4DShape &input_shape,
  const PaddingType padding_type
)
{
  // Compute shape for the GEMM
  const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type);
  const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows);
  const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols);
  const int M = roundup(tile_rows * tile_cols, M_BLOCK);
  const int N = roundup(kernel_shape.n_output_channels, N_BLOCK);

  return input_shape.n_batches * M * N;
}

/** Get the memory required by a single "kernel" matrix. */
template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
template <typename TOut, typename TIn>
size_t WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
Convolution<TOut, TIn>::get_kernel_matrix_size(const KernelShape &shape)
{
  return sizeof(TIn) * get_kernel_matrix_stride(shape);
}

template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
template <typename TOut, typename TIn>
int WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
Convolution<TOut, TIn>::get_kernel_matrix_stride(const KernelShape &shape)
{
  const int K = shape.n_input_channels;
  const int N = roundup(shape.n_output_channels, N_BLOCK);
  return K * N;
}
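/* Worked example of the tiling arithmetic above (illustrative only). For
 * WinogradGEMM<2, 2, 3, 3> (2x2 output tiles, 3x3 kernels), a 1x56x56x64
 * NHWC input with 128 output channels and non-SAME padding gives:
 *
 *   output rows/cols : 56 - (3 - 1) = 54
 *   tile_rows        : iceildiv(54, 2) = 27    (tile_cols likewise 27)
 *   input matrix     : M = roundup(1 * 27 * 27, M_BLOCK) rows, K = 64 columns
 *   output matrix    : N = roundup(128, N_BLOCK) columns
 *
 * There are N_GEMMS = 16 such matrix pairs, one per element of the
 * (2 + 3 - 1) x (2 + 3 - 1) Winograd input tile.
 */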
/** Create a new Winograd operator. */
template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
template <typename TOut, typename TIn>
WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
Convolution<TOut, TIn>::Convolution(
  const KernelShape &kernel_shape,
  const Tensor4DShape &input_shape,
  const PaddingType padding,
  void *kernel_storage
) : kernel_shape(kernel_shape),  // Store the kernel shape
    kernel_matrix_row_stride(roundup(kernel_shape.n_output_channels, N_BLOCK)),
    manage_kernel_storage(kernel_storage == NULL),
    _kernel_storage(manage_kernel_storage ?
                    ALLOCATE(get_kernel_storage_size(kernel_shape)) :
                    kernel_storage),
    input_shape(input_shape),
    padding(padding),
    output_shape(get_output_shape(kernel_shape, input_shape, padding)),
    tile_rows(iceildiv(output_shape.n_rows, output_tile_rows)),
    tile_cols(iceildiv(output_shape.n_cols, output_tile_cols)),
    M(input_shape.n_batches * tile_rows * tile_cols),
    K(kernel_shape.n_input_channels),
    N(kernel_shape.n_output_channels),
    prof()
{
  // Create pointers to the kernel matrices
  const int kernel_matrix_size_bytes = get_kernel_matrix_size(kernel_shape);
  int8_t* const ks_bytes = reinterpret_cast<int8_t *>(_kernel_storage);
  for (int i = 0; i < N_GEMMS; i++)
  {
    kernel_matrices[i] = reinterpret_cast<TIn *>(
      ks_bytes + i*kernel_matrix_size_bytes);
  }
}

/** Create a new Winograd operator and initialise the weights. */
template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
template <typename TOut, typename TIn>
WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
Convolution<TOut, TIn>::Convolution(
  const KernelShape &kernel_shape,
  const Tensor4DShape &input_shape,
  const PaddingType padding,
  const TIn* const kernel,
  void *kernel_storage,
  void *transform_working_space
) : Convolution(kernel_shape, input_shape, padding, kernel_storage)
{
  transform_weights(kernel, transform_working_space);
}

/** Clean up a convolution engine. */
template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
template <typename TOut, typename TIn>
WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
Convolution<TOut, TIn>::~Convolution()
{
  // If we were responsible for managing kernel storage ensure that it is
  // freed.
  if (manage_kernel_storage)
  {
    free(_kernel_storage);
  }
}

/** Transform weights into the Winograd domain and store them for later use/reuse. */
template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
template <typename TOut, typename TIn>
template <typename WeightsTransformT>
void WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
Convolution<TOut, TIn>::transform_weights(
  const TIn* const kernel,
  void *transform_working_space
)
{
  // Allocate working space if it is required
  bool allocated_working_space = false;
  if (transform_working_space == NULL &&                    // If no memory has been provided
      get_kernel_transform_working_size(kernel_shape) != 0) // And we need the space
  {
    allocated_working_space = true;
    transform_working_space = ALLOCATE(
      get_kernel_transform_working_size(kernel_shape)
    );
  }

  // The transformation methods only work on weights laid out in HWIO form; if
  // the weights are not in this form then we need to re-order them.
  const TIn *kernel_hwio = kernel;
  if (kernel_shape.ordering != HWIO)
  {
    kernel_hwio = reinterpret_cast<TIn *>(transform_working_space);

    // Re-order the weights from OIHW to HWIO
    this->prof(
      "Weight reorder",
      [&kernel, &kernel_hwio, this] () {
        reorder::ofm_ifm_h_w_to_h_w_ifm_ofm(
          kernel, const_cast<TIn *>(kernel_hwio),
          kernel_shape.n_output_channels,
          kernel_shape.n_input_channels,
          kernel_shape.n_rows,
          kernel_shape.n_cols
        );
      },
      kernel_shape.size() * sizeof(TIn),
      0,
      kernel_shape.size() * sizeof(TIn)
    );
  }

  const int kernel_matrix_size_bytes = get_kernel_matrix_size(kernel_shape);
  WeightsTransformT weights_transform(
    kernel_hwio, kernel_matrices[0],
    kernel_matrix_size_bytes / sizeof(TIn),
    kernel_matrix_row_stride,
    kernel_shape.n_output_channels,
    kernel_shape.n_input_channels
  );

  // Transform the weights into the Winograd domain
  auto kernel_prep = [&] ()
  {
    weights_transform.run(0, weights_transform.get_window());
  };

  prof(
    "Kernel Prep", kernel_prep,
    WeightsTransformT::bytes_read(kernel_shape),
    WeightsTransformT::ops_performed(kernel_shape),
    WeightsTransformT::bytes_written(kernel_shape)
  );

  // Free memory if we allocated it
  if (allocated_working_space)
  {
    free(transform_working_space);
  }
}
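/* Illustrative sketch (not from the original source) of the two construction
 * paths defined above: deferring the weight transform versus performing it
 * immediately. transform_weights() is assumed to pick up a default
 * WeightsTransformT template argument from its declaration in
 * winograd_gemm.hpp, since the delegating constructor above calls it without
 * explicit template arguments.
 *
 *   using Conv = WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>;
 *
 *   // Path 1: construct now, transform the weights once they are available.
 *   Conv conv(kernel_shape, input_shape, PADDING_SAME, NULL);
 *   conv.transform_weights(weights, NULL);
 *
 *   // Path 2: construct and transform in a single step.
 *   Conv conv2(kernel_shape, input_shape, PADDING_SAME, weights, NULL, NULL);
 */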
/** Perform a convolution. */
template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
template <typename TOut, typename TIn>
void WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
Convolution<TOut, TIn>::execute(
  TOut* const output,
  const TIn* const input,
  const TOut* const biases,
  void *working_space,
  const int n_threads
)
{
  const auto padding_type = padding;
  const auto input_shape = this->input_shape;

  // Allocate working space if none has been provided
  const bool manage_working_space = (working_space == NULL);
  if (manage_working_space)
  {
    const size_t ws_size = get_working_space_size(
      kernel_shape, input_shape, padding_type
    );
    working_space = ALLOCATE(ws_size * sizeof(int8_t));
    memset(working_space, 0x00, ws_size);
  }
  int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space);

  // Split the working space into the regions required for the N_GEMMS input
  // matrices and N_GEMMS output matrices.
  TIn *input_matrices[N_GEMMS];
  TOut *output_matrices[N_GEMMS];
  const int in_matrix_stride_bytes = get_input_matrix_size(kernel_shape, input_shape, padding_type);
  const int out_matrix_stride_bytes = get_output_matrix_size(kernel_shape, input_shape, padding_type);
  for (int i = 0; i < N_GEMMS; i++)
  {
    input_matrices[i] = reinterpret_cast<TIn *>(
      ws_bytes + i*in_matrix_stride_bytes);
    output_matrices[i] = reinterpret_cast<TOut *>(
      ws_bytes + N_GEMMS*in_matrix_stride_bytes + i*out_matrix_stride_bytes);
  }

  // If we need to re-order the input and output tensors then the final chunk
  // of the working space can be used for this purpose.
  // TODO - Overlay the input reorder on top of the output matrices
  //      - Overlay the output reorder on top of the input matrices

  // Reorder the input into NHWC form if it was not provided in this ordering.
  const TIn* input_nhwc = input;
  if (input_shape.ordering == NCHW)
  {
    input_nhwc = reinterpret_cast<const TIn *>(
      ws_bytes + N_GEMMS*(in_matrix_stride_bytes + out_matrix_stride_bytes)
    );

    this->prof(
      "NCHW -> NHWC",
      [input, input_shape, input_nhwc] () {
        reorder::nchw_to_nhwc(
          input, const_cast<TIn *>(input_nhwc),
          input_shape.n_batches,
          input_shape.n_channels,
          input_shape.n_rows,
          input_shape.n_cols
        );
      },
      input_shape.size(), 0, input_shape.size()
    );
  }

  // Compute shape for the GEMM
  const auto output_shape = this->output_shape;
  const int M = this->M;
  const int K = this->K;
  const int N = this->N;

  const int in_matrix_row_stride = K;
  const int out_matrix_row_stride = kernel_matrix_row_stride;

  InputTransform input_transform(
    input_nhwc,
    input_shape.n_batches,
    input_shape.n_rows,
    input_shape.n_cols,
    input_shape.n_channels,
    padding_type,
    input_matrices[0],
    in_matrix_stride_bytes / sizeof(TIn),
    in_matrix_row_stride
  );

  // Transform the input into the Winograd domain
  auto input_prep = [&] ()
  {
    input_transform.run(0, input_transform.get_window());
  };

  prof(
    "Input Prep", input_prep,
    InputTransform::bytes_read(input_shape),
    InputTransform::ops_performed(input_shape),
    InputTransform::bytes_written(input_shape)
  );

  // Perform the GEMMs
  const int kernel_matrix_stride_bytes = get_kernel_matrix_size(kernel_shape);
  BatchedBlockedGemm<M_BLOCK, N_BLOCK, TOut, TIn> gemms(
    N_GEMMS, M, K, N,
    in_matrix_stride_bytes / sizeof(TIn),
    in_matrix_row_stride,
    kernel_matrix_stride_bytes / sizeof(TIn),
    kernel_matrix_row_stride,
    out_matrix_stride_bytes / sizeof(TOut),
    out_matrix_row_stride,
    input_matrices[0],
    kernel_matrices[0],
    output_matrices[0]
  );
  for (unsigned int i = 0; i < gemms.get_window(); i++)
  {
    auto run_gemm = [&] () { gemms.run(i, i+1); };
    prof("GEMM", run_gemm, 0, 0, 0);
  }

  // If the output tensor needs to be in NCHW form then store the NHWC output
  // tensor in temporary storage and then reorder. If the output tensor needs
  // to be in NHWC then just write straight to the output tensor.
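  // Working-space layout at this point:
  //   [ N_GEMMS input matrices | N_GEMMS output matrices | reorder scratch ]
  // The scratch region that held the NHWC copy of an NCHW input is no longer
  // live, so it is reused below to stage the NHWC output before the final
  // NHWC -> NCHW reorder.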
  TOut *output_nhwc = output;
  if (input_shape.ordering == NCHW)
  {
    output_nhwc = reinterpret_cast<TOut *>(
      ws_bytes + N_GEMMS*(in_matrix_stride_bytes + out_matrix_stride_bytes)
    );
  }

  // Transform the output tensor from the Winograd domain to the spatial
  // domain.
  OutputTransform output_transform(
    output_matrices[0],
    out_matrix_stride_bytes / sizeof(TOut),
    out_matrix_row_stride,
    biases,
    output_nhwc,
    output_shape.n_batches,
    output_shape.n_rows,
    output_shape.n_cols,
    output_shape.n_channels
  );
  auto output_prep = [&] ()
  {
    output_transform.run(0, output_transform.get_window());
  };
  prof(
    "Output Comp", output_prep,
    OutputTransform::bytes_read(output_shape),
    OutputTransform::ops_performed(output_shape),
    OutputTransform::bytes_written(output_shape)
  );

  // Reorder the output tensor if it is required to be in NCHW form.
  if (input_shape.ordering == NCHW)
  {
    prof(
      "NHWC -> NCHW",
      [output_nhwc, output_shape, output] () {
        reorder::nhwc_to_nchw(
          output_nhwc, output,
          output_shape.n_batches,
          output_shape.n_rows,
          output_shape.n_cols,
          output_shape.n_channels
        );
      },
      output_shape.size(), 0, output_shape.size()
    );
  }

  // Free working space if we were responsible for allocating it
  if (manage_working_space)
  {
    free(working_space);
  }
}

/** Perform a convolution, allocating any required working space internally. */
template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
template <typename TOut, typename TIn>
void WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
Convolution<TOut, TIn>::execute(
  TOut* const output,
  const TIn* const input,
  const TOut* const biases,
  const int n_threads
)
{
  execute(output, input, biases, NULL, n_threads);
}

// Instantiate required implementations
template class WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>;
template class WinogradGEMM<4, 4, 3, 3>::Convolution<float, float>;
template class WinogradGEMM<2, 2, 5, 5>::Convolution<float, float>;
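/* End-to-end usage sketch (illustrative, not taken from the library). It ties
 * together the pieces defined above: the weights are transformed once at
 * construction, then execute() runs per input, here with caller-managed
 * working space so the allocation is amortised across calls. PADDING_VALID
 * and the shape values are assumptions; the constructor, execute() and
 * get_working_space_size() signatures come from this file.
 *
 *   using Conv = WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>;
 *   Conv conv(kernel_shape, input_shape, PADDING_VALID, weights, NULL, NULL);
 *
 *   void *ws = malloc(Conv::get_working_space_size(kernel_shape, input_shape, PADDING_VALID));
 *   for (int i = 0; i < n_inputs; i++)
 *   {
 *     conv.execute(outputs[i], inputs[i], biases, ws, 1);  // 1 = single thread
 *   }
 *   free(ws);
 */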