From e1553374d037dbf84999258d5bc88927891770cc Mon Sep 17 00:00:00 2001 From: Anthony Barbier Date: Mon, 16 Jul 2018 18:53:52 +0100 Subject: COMPMID-1357: Stop passing around raw pointers in NEWinogradConvolution First step to allow us to enable the memory manager in this function Change-Id: Ic42fdac4c74cd21973c71130b59883e4a87d3dca Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/140167 Reviewed-by: Pablo Tello Reviewed-by: Vidhya Sudhan Loganathan Tested-by: Jenkins --- .../NEON/functions/NEWinogradConvolutionLayer.cpp | 84 ++++++++++++---------- 1 file changed, 47 insertions(+), 37 deletions(-) (limited to 'src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp') diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index 29da0803a3..a71eade9a1 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -155,29 +155,32 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * { if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4) { - transform_input_kernel = support::cpp14::make_unique>(); - transform_weights_kernel = support::cpp14::make_unique>(); - transform_output_kernel = support::cpp14::make_unique>(); - n_gemms = NEWinogradLayerBatchedGEMMKernel::WinogradBase::N_GEMMS; - N_BLOCK = NEWinogradLayerBatchedGEMMKernel::WinogradConv::N_BLOCK; + using config = NEWinogradLayerConfiguration; + transform_input_kernel = support::cpp14::make_unique(); + transform_weights_kernel = support::cpp14::make_unique(); + transform_output_kernel = support::cpp14::make_unique(); + n_gemms = config::WinogradBase::N_GEMMS; + N_BLOCK = config::WinogradConv::N_BLOCK; } else { - transform_input_kernel = support::cpp14::make_unique>(); - transform_weights_kernel = support::cpp14::make_unique>(); - transform_output_kernel = support::cpp14::make_unique>(); - n_gemms = NEWinogradLayerBatchedGEMMKernel::WinogradBase::N_GEMMS; - N_BLOCK = NEWinogradLayerBatchedGEMMKernel::WinogradConv::N_BLOCK; + using config = NEWinogradLayerConfiguration; + transform_input_kernel = support::cpp14::make_unique(); + transform_weights_kernel = support::cpp14::make_unique(); + transform_output_kernel = support::cpp14::make_unique(); + n_gemms = config::WinogradBase::N_GEMMS; + N_BLOCK = config::WinogradConv::N_BLOCK; } break; } case 5: { - transform_input_kernel = support::cpp14::make_unique>(); - transform_weights_kernel = support::cpp14::make_unique>(); - transform_output_kernel = support::cpp14::make_unique>(); - n_gemms = NEWinogradLayerBatchedGEMMKernel::WinogradBase::N_GEMMS; - N_BLOCK = NEWinogradLayerBatchedGEMMKernel::WinogradConv::N_BLOCK; + using config = NEWinogradLayerConfiguration; + transform_input_kernel = support::cpp14::make_unique(); + transform_weights_kernel = support::cpp14::make_unique(); + transform_output_kernel = support::cpp14::make_unique(); + n_gemms = config::WinogradBase::N_GEMMS; + N_BLOCK = config::WinogradConv::N_BLOCK; break; } default: @@ -195,21 +198,28 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * const int out_channels = output->info()->dimension(channel_idx); const Tensor4DShape in_shape(internal_get_input_shape(input)); + const DataType data_type = input->info()->data_type(); const size_t data_type_size = input->info()->element_size(); // Get the memory required to instantiate a new Winograd operator. constexpr size_t storage_alignment = 64; // Kernel Storage const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, - in_channels) * data_type_size + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ + in_channels) + * data_type_size + + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ // Input storage const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, - use_same_padding) * data_type_size + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ + use_same_padding) + * data_type_size + + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ // Output storage const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, - use_same_padding) * data_type_size + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ + use_same_padding) + * data_type_size + + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ ; const KernelShape kernel_shape({ out_channels, static_cast(kernel_size.height), static_cast(kernel_size.width), in_channels }); const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape); @@ -229,28 +239,28 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * const int output_matrix_row_stride = kernel_matrix_row_stride; TensorShape a_shape(k, m, 1, n_gemms); - Strides a_strides(element_size_from_data_type(DataType::F32)); + Strides a_strides(data_type_size); a_strides.set(1, a_strides[0] * k); + //a_strides.set(2, data_type_size * input_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0. a_strides.set(2, 0); - //a_strides.set(2, element_size_from_data_type(DataType::F32) * input_matrix_stride / n_gemms); - a_strides.set(3, element_size_from_data_type(DataType::F32) * input_matrix_stride); + a_strides.set(3, data_type_size * input_matrix_stride); TensorShape b_shape(n, k, n_gemms); - Strides b_strides(element_size_from_data_type(DataType::F32)); - b_strides.set(1, element_size_from_data_type(DataType::F32) * kernel_matrix_row_stride); - b_strides.set(2, element_size_from_data_type(DataType::F32) * kernel_matrix_stride); + Strides b_strides(data_type_size); + b_strides.set(1, data_type_size * kernel_matrix_row_stride); + b_strides.set(2, data_type_size * kernel_matrix_stride); TensorShape d_shape(n, m, 1, n_gemms); - Strides d_strides(element_size_from_data_type(DataType::F32)); - d_strides.set(1, element_size_from_data_type(DataType::F32) * output_matrix_row_stride); + Strides d_strides(data_type_size); + d_strides.set(1, data_type_size * output_matrix_row_stride); + //d_strides.set(2, data_type_size * output_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0. d_strides.set(2, 0); - //d_strides.set(2, element_size_from_data_type(DataType::F32) * output_matrix_stride / n_gemms); - d_strides.set(3, element_size_from_data_type(DataType::F32) * output_matrix_stride); + d_strides.set(3, data_type_size * output_matrix_stride); TensorInfo a_info, b_info, d_info; - a_info.init(a_shape, 1, DataType::F32, a_strides, 0, input_storage_size); - b_info.init(b_shape, 1, DataType::F32, b_strides, 0, kernel_storage_size); - d_info.init(d_shape, 1, DataType::F32, d_strides, 0, output_storage_size); + a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size); + b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size); + d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size); _input_workspace.allocator()->init(a_info, storage_alignment); _input_workspace.allocator()->allocate(); @@ -276,12 +286,12 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U)); _input_nhwc.allocator()->allocate(); transform_input_kernel->configure(&_input_nhwc, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, - reinterpret_cast(_input_workspace.buffer()), input_matrix_stride); + &_input_workspace, input_matrix_stride); } else { transform_input_kernel->configure(_input, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, - reinterpret_cast(_input_workspace.buffer()), input_matrix_stride); + &_input_workspace, input_matrix_stride); } // Configure WeightsTransform @@ -290,14 +300,14 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U)); - transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels); + transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels); } else { // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 0U, 1U, 2U)); - transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels); + transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels); } _weights_hwio.allocator()->allocate(); @@ -306,13 +316,13 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * if(data_layout == DataLayout::NCHW) { - transform_output_kernel->configure(biases, reinterpret_cast(_output_workspace.buffer()), + transform_output_kernel->configure(biases, &_output_workspace, output_matrix_stride, &_output_nhwc, in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels); } else { - transform_output_kernel->configure(biases, reinterpret_cast(_output_workspace.buffer()), + transform_output_kernel->configure(biases, &_output_workspace, output_matrix_stride, _output, in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels); } -- cgit v1.2.1