From e1553374d037dbf84999258d5bc88927891770cc Mon Sep 17 00:00:00 2001 From: Anthony Barbier Date: Mon, 16 Jul 2018 18:53:52 +0100 Subject: COMPMID-1357: Stop passing around raw pointers in NEWinogradConvolution First step to allow us to enable the memory manager in this function Change-Id: Ic42fdac4c74cd21973c71130b59883e4a87d3dca Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/140167 Reviewed-by: Pablo Tello Reviewed-by: Vidhya Sudhan Loganathan Tested-by: Jenkins --- .../kernels/NEWinogradConvolutionLayerKernel.cpp | 125 ++------------------- .../NEON/functions/NEWinogradConvolutionLayer.cpp | 84 ++++++++------ 2 files changed, 58 insertions(+), 151 deletions(-) (limited to 'src') diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp index 50e69a8adf..b295a0c685 100644 --- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp +++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp @@ -40,38 +40,6 @@ namespace arm_compute namespace { -Status validate_arguments_winograd_gemm(const ITensorInfo *a, const ITensorInfo *b, const ITensor *c, const ITensorInfo *output, const float alpha, const float beta, - const GEMMInfo &gemm_info = GEMMInfo()) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(b); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); - - if(c != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The matrix C must have the same number of rows as the matrix A"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The matrix C must have the same number of columns as the matrix B"); - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != output->dimension(0), "The output matrix must have the same number of columns as the matrix B"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != output->dimension(1), "The output matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() != a->num_dimensions()); - } - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_UNUSED(alpha, beta); - return Status{}; -} - Status validate_arguments_winograd_weight_trans(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); @@ -232,78 +200,6 @@ std::pair validate_and_configure_window_winograd_output_trans(IT return std::make_pair(err, win); } } // namespace -template -NEWinogradLayerBatchedGEMMKernel::NEWinogradLayerBatchedGEMMKernel() - : _gemms() -{ -} - -template -void NEWinogradLayerBatchedGEMMKernel::configure( - const unsigned int n_gemms, - const int M, const int K, const int N, - const int a_matrix_stride, - const int a_row_stride, - const int b_matrix_stride, - const int b_row_stride, - const int c_matrix_stride, - const int c_row_stride, - const TIn *const a_ptr, - const TIn *const b_ptr, - TOut *const c_ptr) -{ - _gemms = support::cpp14::make_unique(n_gemms, M, K, N, a_matrix_stride, a_row_stride, b_matrix_stride, b_row_stride, c_matrix_stride, c_row_stride, a_ptr, b_ptr, c_ptr); - Window win; - auto win_last = _gemms->get_window(); - win.set(Window::DimX, Window::Dimension(0, win_last, 1)); - INEKernel::configure(win); -} - -template -void NEWinogradLayerBatchedGEMMKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - const size_t first_gemm = window.x().start(); - const size_t last_gemm = window.x().end(); - _gemms->run(first_gemm, last_gemm); -} - -template -unsigned int NEWinogradLayerBatchedGEMMKernel::get_number_gemms() const -{ - return WinogradBase::N_GEMMS; -} - -template -int NEWinogradLayerBatchedGEMMKernel::get_output_tile_rows() const -{ - return _output_tile_rows; -} - -template -int NEWinogradLayerBatchedGEMMKernel::get_output_tile_cols() const -{ - return _output_tile_cols; -} - -template -int NEWinogradLayerBatchedGEMMKernel::get_number_blocks() const -{ - return WinogradConv::N_BLOCK; -} - -template -Status NEWinogradLayerBatchedGEMMKernel::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensor *c, - const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_gemm(a, b, c, output, alpha, beta, gemm_info)); - return Status{}; -} - -template class NEWinogradLayerBatchedGEMMKernel; -template class NEWinogradLayerBatchedGEMMKernel; -template class NEWinogradLayerBatchedGEMMKernel; // Weights transform @@ -332,7 +228,7 @@ int NEWinogradLayerTransformWeightsKernel void NEWinogradLayerTransformWeightsKernel::configure( const ITensor *weights_hwio, - T *const output, + ITensor *output, const int matrix_stride, /** Stride across matrices in the output. */ const int num_output_channels, /** Number of filters. */ const int num_input_channels) /** Number of channels in each filter. */ @@ -344,7 +240,7 @@ void NEWinogradLayerTransformWeightsKernel(_weights_hwio->buffer()), _output, _matrix_stride, matrix_row_stride, _num_output_channels, _num_input_channels); + WeightsTransform transform(reinterpret_cast(_weights_hwio->buffer()), reinterpret_cast(_output->buffer()), _matrix_stride, matrix_row_stride, _num_output_channels, _num_input_channels); const size_t fst = window.x().start(); const size_t lst = window.x().end(); transform.run(fst, lst); @@ -423,7 +319,7 @@ void NEWinogradLayerTransformInputKernel(_input_nhwc->buffer()), _num_batches, _num_rows, _num_cols, _num_channels, _padding, _output, _matrix_stride, _num_channels); + InputTransform input_transform(reinterpret_cast(_input_nhwc->buffer()), _num_batches, _num_rows, _num_cols, _num_channels, _padding, reinterpret_cast(_output->buffer()), + _matrix_stride, _num_channels); // The code below cannot be moved to configure because biases hasn't been allocated at that point const size_t fst = window.x().start(); @@ -511,9 +408,9 @@ Tensor4DShape NEWinogradLayerTransformOutputKernel void NEWinogradLayerTransformOutputKernel::configure( const ITensor *biases, - const T *const output_workingspace, + const ITensor *output_workingspace, const int matrix_stride, - ITensor *const output_nhwc, + ITensor *output_nhwc, const int num_batches, const int num_rows, const int num_cols, @@ -529,7 +426,7 @@ void NEWinogradLayerTransformOutputKernel(_output_workspace->buffer()), _matrix_stride, _matrix_row_stride, (_biases ? reinterpret_cast(_biases->buffer()) : nullptr), reinterpret_cast(_output_nhwc->buffer()), _num_batches, _num_rows, _num_cols, _num_channels, 0, _output_nhwc->info()->strides_in_bytes()[2] / sizeof(T), _output_nhwc->info()->strides_in_bytes()[1] / sizeof(T)); diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index 29da0803a3..a71eade9a1 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -155,29 +155,32 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * { if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4) { - transform_input_kernel = support::cpp14::make_unique>(); - transform_weights_kernel = support::cpp14::make_unique>(); - transform_output_kernel = support::cpp14::make_unique>(); - n_gemms = NEWinogradLayerBatchedGEMMKernel::WinogradBase::N_GEMMS; - N_BLOCK = NEWinogradLayerBatchedGEMMKernel::WinogradConv::N_BLOCK; + using config = NEWinogradLayerConfiguration; + transform_input_kernel = support::cpp14::make_unique(); + transform_weights_kernel = support::cpp14::make_unique(); + transform_output_kernel = support::cpp14::make_unique(); + n_gemms = config::WinogradBase::N_GEMMS; + N_BLOCK = config::WinogradConv::N_BLOCK; } else { - transform_input_kernel = support::cpp14::make_unique>(); - transform_weights_kernel = support::cpp14::make_unique>(); - transform_output_kernel = support::cpp14::make_unique>(); - n_gemms = NEWinogradLayerBatchedGEMMKernel::WinogradBase::N_GEMMS; - N_BLOCK = NEWinogradLayerBatchedGEMMKernel::WinogradConv::N_BLOCK; + using config = NEWinogradLayerConfiguration; + transform_input_kernel = support::cpp14::make_unique(); + transform_weights_kernel = support::cpp14::make_unique(); + transform_output_kernel = support::cpp14::make_unique(); + n_gemms = config::WinogradBase::N_GEMMS; + N_BLOCK = config::WinogradConv::N_BLOCK; } break; } case 5: { - transform_input_kernel = support::cpp14::make_unique>(); - transform_weights_kernel = support::cpp14::make_unique>(); - transform_output_kernel = support::cpp14::make_unique>(); - n_gemms = NEWinogradLayerBatchedGEMMKernel::WinogradBase::N_GEMMS; - N_BLOCK = NEWinogradLayerBatchedGEMMKernel::WinogradConv::N_BLOCK; + using config = NEWinogradLayerConfiguration; + transform_input_kernel = support::cpp14::make_unique(); + transform_weights_kernel = support::cpp14::make_unique(); + transform_output_kernel = support::cpp14::make_unique(); + n_gemms = config::WinogradBase::N_GEMMS; + N_BLOCK = config::WinogradConv::N_BLOCK; break; } default: @@ -195,21 +198,28 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * const int out_channels = output->info()->dimension(channel_idx); const Tensor4DShape in_shape(internal_get_input_shape(input)); + const DataType data_type = input->info()->data_type(); const size_t data_type_size = input->info()->element_size(); // Get the memory required to instantiate a new Winograd operator. constexpr size_t storage_alignment = 64; // Kernel Storage const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, - in_channels) * data_type_size + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ + in_channels) + * data_type_size + + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ // Input storage const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, - use_same_padding) * data_type_size + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ + use_same_padding) + * data_type_size + + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ // Output storage const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, - use_same_padding) * data_type_size + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ + use_same_padding) + * data_type_size + + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ ; const KernelShape kernel_shape({ out_channels, static_cast(kernel_size.height), static_cast(kernel_size.width), in_channels }); const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape); @@ -229,28 +239,28 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * const int output_matrix_row_stride = kernel_matrix_row_stride; TensorShape a_shape(k, m, 1, n_gemms); - Strides a_strides(element_size_from_data_type(DataType::F32)); + Strides a_strides(data_type_size); a_strides.set(1, a_strides[0] * k); + //a_strides.set(2, data_type_size * input_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0. a_strides.set(2, 0); - //a_strides.set(2, element_size_from_data_type(DataType::F32) * input_matrix_stride / n_gemms); - a_strides.set(3, element_size_from_data_type(DataType::F32) * input_matrix_stride); + a_strides.set(3, data_type_size * input_matrix_stride); TensorShape b_shape(n, k, n_gemms); - Strides b_strides(element_size_from_data_type(DataType::F32)); - b_strides.set(1, element_size_from_data_type(DataType::F32) * kernel_matrix_row_stride); - b_strides.set(2, element_size_from_data_type(DataType::F32) * kernel_matrix_stride); + Strides b_strides(data_type_size); + b_strides.set(1, data_type_size * kernel_matrix_row_stride); + b_strides.set(2, data_type_size * kernel_matrix_stride); TensorShape d_shape(n, m, 1, n_gemms); - Strides d_strides(element_size_from_data_type(DataType::F32)); - d_strides.set(1, element_size_from_data_type(DataType::F32) * output_matrix_row_stride); + Strides d_strides(data_type_size); + d_strides.set(1, data_type_size * output_matrix_row_stride); + //d_strides.set(2, data_type_size * output_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0. d_strides.set(2, 0); - //d_strides.set(2, element_size_from_data_type(DataType::F32) * output_matrix_stride / n_gemms); - d_strides.set(3, element_size_from_data_type(DataType::F32) * output_matrix_stride); + d_strides.set(3, data_type_size * output_matrix_stride); TensorInfo a_info, b_info, d_info; - a_info.init(a_shape, 1, DataType::F32, a_strides, 0, input_storage_size); - b_info.init(b_shape, 1, DataType::F32, b_strides, 0, kernel_storage_size); - d_info.init(d_shape, 1, DataType::F32, d_strides, 0, output_storage_size); + a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size); + b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size); + d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size); _input_workspace.allocator()->init(a_info, storage_alignment); _input_workspace.allocator()->allocate(); @@ -276,12 +286,12 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U)); _input_nhwc.allocator()->allocate(); transform_input_kernel->configure(&_input_nhwc, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, - reinterpret_cast(_input_workspace.buffer()), input_matrix_stride); + &_input_workspace, input_matrix_stride); } else { transform_input_kernel->configure(_input, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, - reinterpret_cast(_input_workspace.buffer()), input_matrix_stride); + &_input_workspace, input_matrix_stride); } // Configure WeightsTransform @@ -290,14 +300,14 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U)); - transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels); + transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels); } else { // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 0U, 1U, 2U)); - transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels); + transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels); } _weights_hwio.allocator()->allocate(); @@ -306,13 +316,13 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * if(data_layout == DataLayout::NCHW) { - transform_output_kernel->configure(biases, reinterpret_cast(_output_workspace.buffer()), + transform_output_kernel->configure(biases, &_output_workspace, output_matrix_stride, &_output_nhwc, in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels); } else { - transform_output_kernel->configure(biases, reinterpret_cast(_output_workspace.buffer()), + transform_output_kernel->configure(biases, &_output_workspace, output_matrix_stride, _output, in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels); } -- cgit v1.2.1