diff options
Diffstat (limited to 'src/core/NEON/kernels/winograd/winograd_shim_nchw.hpp')
-rw-r--r-- | src/core/NEON/kernels/winograd/winograd_shim_nchw.hpp | 191 |
1 files changed, 191 insertions, 0 deletions
diff --git a/src/core/NEON/kernels/winograd/winograd_shim_nchw.hpp b/src/core/NEON/kernels/winograd/winograd_shim_nchw.hpp new file mode 100644 index 0000000000..c5bcffbaef --- /dev/null +++ b/src/core/NEON/kernels/winograd/winograd_shim_nchw.hpp @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once +#include <cstdint> +#include <cstdlib> + +#include "gemm.hpp" +#include "profiler.hpp" +#include "utils.hpp" +#include "shims.hpp" +#include "winograd_gemm.hpp" + +#include "transforms.hpp" + +#ifndef ALLOC_ALIGN +#define ALLOC_ALIGN 64 +#endif // ALLOC_ALIGN + + +namespace winograd_shim_nchw { + /***************************************************************************/ + /* Implementation of the Winograd F(2x2, 3x3, 4x4) algorithm using GEMM + * internally. + */ + template <typename TOut, typename TIn> + class Winograd2x2_3x3GEMM : public winograd::Winograd2x2_3x3GEMM<TOut, TIn> { + public: + /* Instantiate a new Winograd operator. + */ + Winograd2x2_3x3GEMM(const KernelShape &kernel_shape, const Tensor4DShape input_shape, const PaddingType padding_type, void *kernel_storage); + + void nchw2nhwc( const Tensor4DShape& input_shape, const PaddingType padding_type, void *working_space, const TIn* const input); + void nhwc2nchw( const Tensor4DShape& input_shape, const PaddingType padding_type, void *working_space, TOut* const output); + + + std::pair<TOut*,TIn*> get_nhwc_ptrs(const Tensor4DShape& input_shape,const PaddingType padding_type,void *working_space); + + static size_t get_working_space_size(const Tensor4DShape &input_shape,const KernelShape &k_shape, const PaddingType padding); + protected: + /* Get the memory required to store an NHWC copy of the input tensor. */ + static size_t get_working_nhwc_input_size(const Tensor4DShape &input_shape); + + /* Get the memory required to store an NHWC copy of the input tensor. */ + static size_t get_working_nhwc_output_size(const Tensor4DShape &output_shape, const KernelShape &k_shape, const PaddingType padding) ; + }; +} // namespace winograd + +/*****************************************************************************/ +template <typename TOut, typename TIn> +winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::Winograd2x2_3x3GEMM( + const KernelShape &kernel_shape, const Tensor4DShape input_shape, + const PaddingType padding_type, void *kernel_storage +) : winograd::Winograd2x2_3x3GEMM<TOut, TIn>(kernel_shape,input_shape,padding_type,kernel_storage) { +} + +/*****************************************************************************/ +template <typename TOut, typename TIn> +void winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::nchw2nhwc(const Tensor4DShape& input_shape, const PaddingType padding_type, void *working_space, const TIn* const input) { + assert(working_space); + int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space); + + // Extract the top chunk of the working space to store the input and output + // tensors in NHWC format. + const int in_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_input_matrix_size(input_shape, this->kernel_shape, padding_type); + const int out_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_matrix_size(input_shape, this->kernel_shape, padding_type); + + // Allocate working space for the input and output in NHWC format + TIn* const input_nhwc = reinterpret_cast<TIn *>( + ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes) + ); + + // Re-order the input tensor + this->prof( + "NCHW -> NHWC", + [input, input_shape, input_nhwc] () { + nchw_to_nhwc( + input, input_nhwc, + input_shape.n_batches, + input_shape.n_channels, + input_shape.n_rows, + input_shape.n_cols + ); + }, + input_shape.size(), 0, input_shape.size() + ); +} + +/*****************************************************************************/ +template <typename TOut, typename TIn> +void winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::nhwc2nchw(const Tensor4DShape& input_shape, const PaddingType padding_type, + void *working_space, TOut* const output) { + + assert(working_space); + int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space); + + // Extract the top chunk of the working space to store the input and output + // tensors in NHWC format. + const int in_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_input_matrix_size(input_shape, this->kernel_shape, padding_type); + const int out_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_matrix_size(input_shape, this->kernel_shape, padding_type); + + TOut* const output_nhwc = reinterpret_cast<TOut *>(ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes) + get_working_nhwc_input_size(input_shape)); + + // Re-order the output tensor into NCHW + const auto output_shape = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_shape(input_shape, this->kernel_shape, padding_type); + this->prof( + "NHWC -> NCHW", + [output_nhwc, output_shape, output] () { + nhwc_to_nchw( + output_nhwc, output, + output_shape.n_batches, + output_shape.n_rows, + output_shape.n_cols, + output_shape.n_channels + ); + }, + output_shape.size(), 0, output_shape.size() + ); +} + + +/*****************************************************************************/ +template <typename TOut, typename TIn> +std::pair<TOut*,TIn*> winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::get_nhwc_ptrs( + const Tensor4DShape& input_shape, + const PaddingType padding_type, + void *working_space +) { + assert(working_space); + int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space); + + // Extract the top chunk of the working space to store the input and output + // tensors in NHWC format. + const int in_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_input_matrix_size(input_shape, this->kernel_shape, padding_type); + const int out_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_matrix_size(input_shape, this->kernel_shape, padding_type); + + // Allocate working space for the input and output in NHWC format + TIn* input_nhwc = reinterpret_cast<TIn *>(ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes)); + TOut* output_nhwc = reinterpret_cast<TOut *>(ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes) + get_working_nhwc_input_size(input_shape)); + return std::make_pair(output_nhwc,input_nhwc); +} + + + + +/*****************************************************************************/ +template <typename TOut, typename TIn> +size_t winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_space_size( + const Tensor4DShape& input_shape, const KernelShape &k_shape, const PaddingType padding_type +) { + // TODO Add memory required for NHWC copies of input tensors + return winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_space_size( + input_shape, k_shape, padding_type) + + get_working_nhwc_input_size(input_shape) + + get_working_nhwc_output_size(input_shape, k_shape, padding_type); +} + +template <typename TOut, typename TIn> +size_t winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_nhwc_input_size( + const Tensor4DShape& input_shape +) { + return roundup(input_shape.size() * sizeof(TIn), static_cast<size_t>(ALLOC_ALIGN)); +} + +template <typename TOut, typename TIn> +size_t winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_nhwc_output_size( + const Tensor4DShape& input_shape, const KernelShape &k_shape, const PaddingType padding_type +) { + const auto output_shape = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_shape(input_shape,k_shape, padding_type); + return roundup(output_shape.size() * sizeof(TIn), static_cast<size_t>(ALLOC_ALIGN)); +} |