From 9ceebbeb8dfe61746fdc7022a147f8e2d24c5493 Mon Sep 17 00:00:00 2001
From: Pablo Tello
Date: Wed, 10 Jan 2018 16:44:13 +0000
Subject: COMPMID-815: Updated NEWinogradLayer with the latest code from Research.

Change-Id: I86d7f53b5f5d1dbc22078aea5c32b08a25d1f49e
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/116634
Tested-by: Jenkins
Reviewed-by: Anthony Barbier
---
 src/runtime/NEON/functions/NEWinogradLayer.cpp | 113 ++++++++++++-------------
 1 file changed, 55 insertions(+), 58 deletions(-)

diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp
index 21f298ca25..da46f8773c 100644
--- a/src/runtime/NEON/functions/NEWinogradLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,8 +43,8 @@ inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
 namespace arm_compute
 {
 NEWinogradLayer::NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _winograd_kernel(), _permute_input(), _permute_weights(), _permute_output(), _workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(),
-      _weights_hwio(), _input(), _weights(), _output(), _reshaped_kernel(false), _conv()
+    : _memory_group(std::move(memory_manager)), _winograd_kernel(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(),
+      _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(), _reshaped_kernel(false), _conv()
 {
 } /* arm_compute */
 
@@ -72,36 +72,37 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co
     ARM_COMPUTE_ERROR_ON_MSG(stride_y != 1 || stride_x != 1, "Winograd layer only supports unit strides.");
 
     // Get convolved dimensions
-    auto padding = PADDING_VALID;
-    const int in_channels = input->info()->dimension(2);
-    const int out_channels = output->info()->dimension(2);
-    const int weights_width = weights->info()->dimension(0);
-    const int weights_height = weights->info()->dimension(1);
+    const int in_channels = input->info()->dimension(2);
+    const int out_channels = output->info()->dimension(2);
 
-    const KernelShape kernel_shape({ out_channels, weights_height, weights_width, in_channels });
     const Tensor4DShape in_shape(internal_get_input_shape(input));
 
     // Get the memory required to instantiate a new Winograd operator.
-    constexpr size_t kstore_alignment = 64;
-    const size_t kernel_storage_per_thread = NEWinogradLayerKernel::get_kernel_storage_size(kernel_shape);
-    _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_per_thread + kstore_alignment - 1) }, 1, DataType::U8));
+    constexpr size_t storage_alignment = 64;
+    const size_t kernel_storage_size = NEWinogradLayerKernel::get_weight_storage_size(out_channels, in_channels) * sizeof(float);
+    _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
     _memory_group.manage(&_kernel_storage);
-
-    // Get workbench size and allocate memory
-
-    constexpr size_t wspace_alignment = 64;
-    const size_t ws_size = NEWinogradLayerKernel::get_working_space_size(in_shape, kernel_shape, padding);
-    _workspace.allocator()->init(TensorInfo(TensorShape{ (ws_size + wspace_alignment - 1) }, 1, DataType::U8));
-    _memory_group.manage(&_workspace);
     _memory_group.manage(&_input_nhwc);
     _kernel_storage.allocator()->allocate();
-    _workspace.allocator()->allocate();
+    // Input storage
+    const size_t input_storage_size = NEWinogradLayerKernel::get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, false) * sizeof(float);
+    _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
+    _memory_group.manage(&_input_workspace);
+    _input_workspace.allocator()->allocate();
+
+    // Output storage
+    const size_t output_storage_size = NEWinogradLayerKernel::get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, false) * sizeof(float);
+    _output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8));
+    _memory_group.manage(&_output_workspace);
+    _output_workspace.allocator()->allocate();
 
-    // Create Winograd operator object
-    _conv = support::cpp14::make_unique<Winograd3x3F32>(kernel_shape, in_shape, padding, _kernel_storage.buffer());
+    // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
+    TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
+                                _output->info()->dimension(1), _output->info()->dimension(3)),
+                    1, _output->info()->data_type());
+    _output_nhwc.allocator()->init(info);
 
-    // Configure the kernel, padding not needed so it's safe to call configure after allocate
-    _winograd_kernel.configure(_conv.get());
+    _output_nhwc.allocator()->allocate();
 
     // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
     switch(weights->info()->num_dimensions())
@@ -122,60 +123,56 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co
             break;
         }
     }
+
+    _weights_hwio.allocator()->allocate();
+
     // configure the kernel to transform the input tensor from NCHW -> NHWC
     _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
 
-    // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
-    TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
-                                _output->info()->dimension(1), _output->info()->dimension(3)),
-                    1, _output->info()->data_type());
-    _output_nhwc.allocator()->init(info);
-
-    _output_nhwc.allocator()->allocate();
-    _weights_hwio.allocator()->allocate();
     _input_nhwc.allocator()->allocate();
+
+    // Create Winograd operator object
+    _conv = support::cpp14::make_unique<Winograd3x3F32>(
+                in_shape.n_batches,
+                in_shape.n_channels,
+                in_shape.n_rows,
+                in_shape.n_cols,
+                out_channels,
+                false,
+                reinterpret_cast<float *>(_weights_hwio.buffer()),
+                reinterpret_cast<float *>(_kernel_storage.buffer()),
+                reinterpret_cast<float *>(_input_nhwc.buffer()),
+                reinterpret_cast<float *>(_input_workspace.buffer()),
+                reinterpret_cast<float *>(_output_nhwc.buffer()),
+                reinterpret_cast<float *>(_output_workspace.buffer()));
+
+    // Configure the kernel, padding not needed so it's safe to call configure after allocate
+    _winograd_kernel.configure(_conv.get());
+
+    // Reorder the convolved output to ACL's ordering NCHW
+    _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
+
 }
 
 void NEWinogradLayer::run()
 {
-#if defined(__aarch64__)
     _memory_group.acquire();
     if(!_reshaped_kernel)
    {
         _reshaped_kernel = true;
         _permute_weights.run();
-        _conv->transform_weights(reinterpret_cast<float *>(_weights_hwio.buffer()), nullptr);
+        _conv->transform_weights();
     }
-    const Tensor4DShape in_shape(internal_get_input_shape(_input));
-    auto padding = PADDING_VALID;
-
     //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
     _permute_input.run();
-
-    //Setup matrices ptrs and transform the input tensor to the appropriate form before running GEMM.
-    _conv->reshape_input(in_shape, padding, reinterpret_cast<float *>(_input_nhwc.buffer()), _workspace.buffer());
-
+    // Transform input tensor to the winograd domain
+    _conv->transform_input();
     //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
     NEScheduler::get().schedule(&_winograd_kernel, Window::DimX);
-
-    //Transform the output to the appropriate form
-    _conv->reshape_output(in_shape, padding, reinterpret_cast<float *>(_output_nhwc.buffer()));
-
+    // Transform output tensor to the spatial domain
+    _conv->transform_output();
    // Reorder the convolved output to ACL's ordering NCHW
-    _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
     _permute_output.run();
-
     _memory_group.release();
-#else /* __aarch64__ */
-    ARM_COMPUTE_UNUSED(_winograd_kernel);
-    ARM_COMPUTE_UNUSED(_workspace);
-    ARM_COMPUTE_UNUSED(_kernel_storage);
-    ARM_COMPUTE_UNUSED(_input);
-    ARM_COMPUTE_UNUSED(_weights);
-    ARM_COMPUTE_UNUSED(_output);
-    ARM_COMPUTE_UNUSED(_reshaped_kernel);
-    ARM_COMPUTE_UNUSED(_conv);
-    ARM_COMPUTE_ERROR("Winograd only supported for aarch64, recompile with arch=arm64-v8a.");
-#endif /* __aarch64__ */
 }
 } // namespace arm_compute
--
cgit v1.2.1
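
From the caller's point of view the layer interface is unchanged by this patch; only the internals of configure() and run() moved to the new transform_weights/transform_input/transform_output flow. For reference, below is a minimal usage sketch against the ACL API of this period. It is an illustration only: it assumes the five-argument configure(input, weights, biases, output, conv_info) signature used by the NEON convolution functions of that era, an aarch64 build, and purely illustrative tensor shapes and header paths.

// Usage sketch (assumed API of this ACL version; shapes are illustrative only).
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEWinogradLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, weights, dst;

    // NCHW: 16x16 input with 8 channels, 3x3 kernels producing 4 output feature maps.
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U, 4U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(14U, 14U, 4U), 1, DataType::F32));

    // Winograd requires unit strides; biases are not used in this sketch.
    NEWinogradLayer winograd;
    winograd.configure(&src, &weights, nullptr, &dst, PadStrideInfo(1, 1, 0, 0));

    src.allocator()->allocate();
    weights.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src and weights with data ...

    // The first run permutes and transforms the weights; every run then does:
    // input transform -> batched GEMMs (NEWinogradLayerKernel) -> output transform -> permute back to NCHW.
    winograd.run();
    return 0;
}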