diff options
author | Pablo Tello <pablo.tello@arm.com> | 2018-01-23 09:36:04 +0000 |
---|---|---|
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:45:00 +0000 |
commit | d6ca478a7e410f8f529c2e505305b46d9fe21a9b (patch) | |
tree | 5c50c06e07f812890f127b1c4933996987f74f17 /src/core/NEON/kernels/NEWinogradLayerKernel.cpp | |
parent | d05dce46a14a7b67f322328ecd95bf96bdd30bae (diff) | |
download | ComputeLibrary-d6ca478a7e410f8f529c2e505305b46d9fe21a9b.tar.gz |
COMPMID-784: Added support for biases in WinogradLayer.
1) Updated to the latest code from the RSH repo.
2) Moved winograd transforms into kernels.
3) Added support for biases
Change-Id: I7f39f34a599b49d7d9b549cc10a4f4d4a8007ab8
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/117474
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/NEWinogradLayerKernel.cpp')
-rw-r--r-- | src/core/NEON/kernels/NEWinogradLayerKernel.cpp | 144 |
1 files changed, 125 insertions, 19 deletions
diff --git a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp index ea48e1f32b..e2e4e40fe4 100644 --- a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp +++ b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp @@ -55,7 +55,7 @@ public: float *const output, /** Pointer to NHWC ordered output tensor, in the spatial domain. */ float *const winograd_output /** Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. */ ) - : convolver(n_batches, n_input_channels, n_input_rows, n_input_cols, n_output_channels, same_padding, weights, weights_storage, input, winograd_input, output, winograd_output) + : convolver(n_batches, n_input_channels, n_input_rows, n_input_cols, n_output_channels, same_padding, weights, weights_storage, input, winograd_input, nullptr, output, winograd_output) { } T convolver; @@ -65,24 +65,6 @@ Winograd3x3F32::~Winograd3x3F32() { } -void Winograd3x3F32::transform_output() -{ - auto win = _pimpl->convolver.output_transform.get_window(); - _pimpl->convolver.output_transform.run(0, win); -} - -void Winograd3x3F32::transform_input() -{ - auto win = _pimpl->convolver.input_transform.get_window(); - _pimpl->convolver.input_transform.run(0, win); -} - -void Winograd3x3F32::transform_weights() -{ - auto win = _pimpl->convolver.weights_transform.get_window(); - _pimpl->convolver.weights_transform.run(0, win); -} - Winograd3x3F32::Winograd3x3F32( const int n_batches, /** Number of batches in the input and output tensors. */ const int n_input_channels, /** Number of feature maps in a batch of the input tensor. */ @@ -146,4 +128,128 @@ void NEWinogradLayerKernel::run(const Window &window, const ThreadInfo &info) const size_t last_gemm = window.x().end(); _convolver->_pimpl->convolver.gemms.run(first_gemm, last_gemm); } + +INEWinogradLayerTransformKernel::INEWinogradLayerTransformKernel() + : _convolver(nullptr) +{ +} + +void INEWinogradLayerTransformKernel::configure(Winograd3x3F32 *convolver) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(convolver); + _convolver = convolver; +} + +// Weights transform + +void NEWinogradLayerTransformWeightsKernel::configure(Winograd3x3F32 *convolver) +{ + INEWinogradLayerTransformKernel::configure(convolver); + Window win; + auto win_last = _convolver->_pimpl->convolver.weights_transform.get_window(); + win.set(Window::DimX, Window::Dimension(0, win_last, 1)); + INEKernel::configure(win); +} + +void NEWinogradLayerTransformWeightsKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + const size_t fst = window.x().start(); + const size_t lst = window.x().end(); + _convolver->_pimpl->convolver.weights_transform.run(fst, lst); +} + +bool NEWinogradLayerTransformWeightsKernel::is_parallelisable() const +{ + return false; +} + +// Input transform + +void NEWinogradLayerTransformInputKernel::configure(Winograd3x3F32 *convolver) +{ + INEWinogradLayerTransformKernel::configure(convolver); + Window win; + auto win_last = _convolver->_pimpl->convolver.input_transform.get_window(); + win.set(Window::DimX, Window::Dimension(0, win_last, 1)); + INEKernel::configure(win); +} + +void NEWinogradLayerTransformInputKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + const size_t fst = window.x().start(); + const size_t lst = window.x().end(); + _convolver->_pimpl->convolver.input_transform.run(fst, lst); +} +bool NEWinogradLayerTransformInputKernel::is_parallelisable() const +{ + return false; +} + +// Output transform +NEWinogradLayerTransformOutputKernel::NEWinogradLayerTransformOutputKernel() + : _biases(nullptr), _output_workspace(nullptr), _matrix_stride(0), _matrix_row_stride(0), _output(nullptr), _n_batches(0), _n_rows(0), _n_cols(0), _n_channels(0) +{ +} + +void NEWinogradLayerTransformOutputKernel::configure( + const ITensor *biases, + const float *const output_workingspace, + const int matrix_stride, + float *const output, + const int n_batches, + const int n_rows, + const int n_cols, + const int n_channels) +{ + using WinogradBase = winograd::WinogradGEMM<2, 2, 3, 3>; + using OutputTransform = typename WinogradBase::template OutputTransform<float>; + + _biases = biases; + _output_workspace = output_workingspace; + _matrix_stride = matrix_stride; + _matrix_row_stride = roundup(n_channels, WinogradBase::Convolution<float, float>::N_BLOCK); + _output = output; + _n_batches = n_batches; + _n_rows = n_rows; + _n_cols = n_cols; + _n_channels = n_channels; + + // We don't have the biases buffer at this stage as it hasn't been allocated, we pass in nullptr OutputTransform is only used here to compute the window + OutputTransform output_transform(_output_workspace, _matrix_stride, _matrix_row_stride, nullptr, _output, _n_batches, _n_rows, _n_cols, _n_channels); + Window win; + auto win_last = output_transform.get_window(); + win.set(Window::DimX, Window::Dimension(0, win_last, 1)); + INEKernel::configure(win); +} + +void NEWinogradLayerTransformOutputKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_NULLPTR(_biases->buffer()); + ARM_COMPUTE_ERROR_ON_NULLPTR(_output_workspace); + ARM_COMPUTE_ERROR_ON_NULLPTR(_output); + + using WinogradBase = winograd::WinogradGEMM<2, 2, 3, 3>; + using OutputTransform = typename WinogradBase::template OutputTransform<float>; + + OutputTransform output_transform(_output_workspace, _matrix_stride, _matrix_row_stride, + reinterpret_cast<float *>(_biases->buffer()), _output, + _n_batches, _n_rows, _n_cols, _n_channels); + + // The code below cannot be moved to configure because biases hasn't been allocated at that point + const size_t fst = window.x().start(); + const size_t lst = window.x().end(); + output_transform.run(fst, lst); +} + +bool NEWinogradLayerTransformOutputKernel::is_parallelisable() const +{ + return false; +} + } // namespace arm_compute |