/* * Copyright (c) 2017 ARM Limited. * * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include #include #include "alloc.hpp" #include "gemm.hpp" #include "profiler.hpp" #include "utils.hpp" #include "shims.hpp" #include "winograd_gemm.hpp" #include "transforms.hpp" #ifndef ALLOC_ALIGN #define ALLOC_ALIGN 64 #endif // ALLOC_ALIGN namespace winograd_shim_nchw { /***************************************************************************/ /* Implementation of the Winograd F(2x2, 3x3, 4x4) algorithm using GEMM * internally. */ template class Winograd2x2_3x3GEMM : public winograd::Winograd2x2_3x3GEMM { public: /* Instantiate a new Winograd operator. */ Winograd2x2_3x3GEMM(const KernelShape &kernel_shape, const Tensor4DShape input_shape, const PaddingType padding_type, void *kernel_storage); void nchw2nhwc( const Tensor4DShape& input_shape, const PaddingType padding_type, void *working_space, const TIn* const input); void nhwc2nchw( const Tensor4DShape& input_shape, const PaddingType padding_type, void *working_space, TOut* const output); std::pair get_nhwc_ptrs(const Tensor4DShape& input_shape,const PaddingType padding_type,void *working_space); static size_t get_working_space_size(const Tensor4DShape &input_shape,const KernelShape &k_shape, const PaddingType padding); protected: /* Get the memory required to store an NHWC copy of the input tensor. */ static size_t get_working_nhwc_input_size(const Tensor4DShape &input_shape); /* Get the memory required to store an NHWC copy of the input tensor. */ static size_t get_working_nhwc_output_size(const Tensor4DShape &output_shape, const KernelShape &k_shape, const PaddingType padding) ; }; } // namespace winograd /*****************************************************************************/ template winograd_shim_nchw::Winograd2x2_3x3GEMM::Winograd2x2_3x3GEMM( const KernelShape &kernel_shape, const Tensor4DShape input_shape, const PaddingType padding_type, void *kernel_storage ) : winograd::Winograd2x2_3x3GEMM(kernel_shape,input_shape,padding_type,kernel_storage) { } /*****************************************************************************/ template void winograd_shim_nchw::Winograd2x2_3x3GEMM::nchw2nhwc(const Tensor4DShape& input_shape, const PaddingType padding_type, void *working_space, const TIn* const input) { assert(working_space); int8_t* const ws_bytes = reinterpret_cast(working_space); // Extract the top chunk of the working space to store the input and output // tensors in NHWC format. const int in_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM::get_input_matrix_size(input_shape, this->kernel_shape, padding_type); const int out_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM::get_output_matrix_size(input_shape, this->kernel_shape, padding_type); // Allocate working space for the input and output in NHWC format TIn* const input_nhwc = reinterpret_cast( ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes) ); // Re-order the input tensor this->prof( "NCHW -> NHWC", [input, input_shape, input_nhwc] () { nchw_to_nhwc( input, input_nhwc, input_shape.n_batches, input_shape.n_channels, input_shape.n_rows, input_shape.n_cols ); }, input_shape.size(), 0, input_shape.size() ); } /*****************************************************************************/ template void winograd_shim_nchw::Winograd2x2_3x3GEMM::nhwc2nchw(const Tensor4DShape& input_shape, const PaddingType padding_type, void *working_space, TOut* const output) { assert(working_space); int8_t* const ws_bytes = reinterpret_cast(working_space); // Extract the top chunk of the working space to store the input and output // tensors in NHWC format. const int in_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM::get_input_matrix_size(input_shape, this->kernel_shape, padding_type); const int out_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM::get_output_matrix_size(input_shape, this->kernel_shape, padding_type); TOut* const output_nhwc = reinterpret_cast(ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes) + get_working_nhwc_input_size(input_shape)); // Re-order the output tensor into NCHW const auto output_shape = winograd::Winograd2x2_3x3GEMM::get_output_shape(input_shape, this->kernel_shape, padding_type); this->prof( "NHWC -> NCHW", [output_nhwc, output_shape, output] () { nhwc_to_nchw( output_nhwc, output, output_shape.n_batches, output_shape.n_rows, output_shape.n_cols, output_shape.n_channels ); }, output_shape.size(), 0, output_shape.size() ); } /*****************************************************************************/ template std::pair winograd_shim_nchw::Winograd2x2_3x3GEMM::get_nhwc_ptrs( const Tensor4DShape& input_shape, const PaddingType padding_type, void *working_space ) { assert(working_space); int8_t* const ws_bytes = reinterpret_cast(working_space); // Extract the top chunk of the working space to store the input and output // tensors in NHWC format. const int in_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM::get_input_matrix_size(input_shape, this->kernel_shape, padding_type); const int out_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM::get_output_matrix_size(input_shape, this->kernel_shape, padding_type); // Allocate working space for the input and output in NHWC format TIn* input_nhwc = reinterpret_cast(ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes)); TOut* output_nhwc = reinterpret_cast(ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes) + get_working_nhwc_input_size(input_shape)); return std::make_pair(output_nhwc,input_nhwc); } /*****************************************************************************/ template size_t winograd_shim_nchw::Winograd2x2_3x3GEMM::get_working_space_size( const Tensor4DShape& input_shape, const KernelShape &k_shape, const PaddingType padding_type ) { // TODO Add memory required for NHWC copies of input tensors return winograd::Winograd2x2_3x3GEMM::get_working_space_size( input_shape, k_shape, padding_type) + get_working_nhwc_input_size(input_shape) + get_working_nhwc_output_size(input_shape, k_shape, padding_type); } template size_t winograd_shim_nchw::Winograd2x2_3x3GEMM::get_working_nhwc_input_size( const Tensor4DShape& input_shape ) { return roundup(input_shape.size() * sizeof(TIn), static_cast(ALLOC_ALIGN)); } template size_t winograd_shim_nchw::Winograd2x2_3x3GEMM::get_working_nhwc_output_size( const Tensor4DShape& input_shape, const KernelShape &k_shape, const PaddingType padding_type ) { const auto output_shape = winograd::Winograd2x2_3x3GEMM::get_output_shape(input_shape,k_shape, padding_type); return roundup(output_shape.size() * sizeof(TIn), static_cast(ALLOC_ALIGN)); }