From edfa9f463bed084f8b0953557202b2a1e56da817 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Tue, 15 Aug 2017 11:45:22 +0100 Subject: COMPMID-477 - Optimized batched case in CLConvolutionLayer Change-Id: I4ef18f49f1da0cb816aaa0762466b940792c15ed Reviewed-on: http://mpd-gerrit.cambridge.arm.com/84162 Tested-by: Kaizen Reviewed-by: Anthony Barbier --- src/runtime/CL/functions/CLConvolutionLayer.cpp | 5 +- src/runtime/CL/functions/CLFullyConnectedLayer.cpp | 239 ++++++--------------- src/runtime/CL/functions/CLGEMM.cpp | 30 +-- 3 files changed, 91 insertions(+), 183 deletions(-) (limited to 'src/runtime/CL') diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index b1b83985d0..0bbec94e78 100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -197,9 +197,12 @@ void CLConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weig // Configure kernels _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias); + + // Configure matrix multiply if(_is_fully_connected_convolution) { - _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f); + // The matrix A and Matrix B have not been reshaped + _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f, false); } else { diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index 66a858d3ed..f7cea551f6 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -26,217 +26,127 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "support/ToolchainSupport.h" #include -#include -namespace arm_compute +using namespace arm_compute; + +void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output) { -CLFullyConnectedLayerReshapeWeights::CLFullyConnectedLayerReshapeWeights() - : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false) + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} + +CLFullyConnectedLayer::CLFullyConnectedLayer() + : _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), + _accumulate_biases(false) { } -void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer) +void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 2); - ARM_COMPUTE_ERROR_ON(output == nullptr); - ARM_COMPUTE_ERROR_ON(!transpose_weights && !is_batched_fc_layer); + ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); - const DataType data_type = input->info()->data_type(); + const DataType dt = input->info()->data_type(); const int fixed_point_position = input->info()->fixed_point_position(); - _transpose_weights = 
transpose_weights; - _is_batched_fc_layer = is_batched_fc_layer; + // If the fully connected layer is called after a convolution layer, the input tensor must be linearized - // Check if we need to transpose the weights - if(_transpose_weights) - { - if(_is_batched_fc_layer) - { - // Initialize the output tensor for transpose - TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0)); - _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, data_type, fixed_point_position)); - _transpose_kernel.configure(input, &_transpose_output); + // Initialize output tensor for im2col + TensorShape shape_im2col; + shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)); + shape_im2col.set(1, input->info()->dimension(3)); + shape_im2col.set(2, input->info()->dimension(4)); + shape_im2col.set(3, input->info()->dimension(5)); + _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position)); - // Configure transpose 1xW kernel - _transpose1xW_kernel.configure(&_transpose_output, output); + // Configure im2col kernel + _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false); - // Allocate temporary tensor used for transposing the weights - _transpose_output.allocator()->allocate(); - } - else - { - _transpose_kernel.configure(input, output); - } - } - else - { - if(_is_batched_fc_layer) - { - // Configure transpose 1xW kernel - _transpose1xW_kernel.configure(input, output); - } - else - { - ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported"); - } - } -} - -void CLFullyConnectedLayerReshapeWeights::run() -{ - if(_transpose_weights) - { - CLScheduler::get().enqueue(_transpose_kernel, _is_batched_fc_layer); - } + // Configure matrix multiply kernel + _mm_kernel.configure(&_im2col_output, weights, output, 1.0f, false); - if(_is_batched_fc_layer) - { - CLScheduler::get().enqueue(_transpose1xW_kernel); - } + // Allocate the output tensor for im2col once all the configure methods have been called + _im2col_output.allocator()->allocate(); } -CLFullyConnectedLayer::CLFullyConnectedLayer() - : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(), - _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false) +void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output) { + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); + + // Configure matrix multiply kernel + _mm_kernel.configure(input, weights, output, 1.0f, false); } void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped) { - // With the Fully Connected layer we can have 4 different cases: - // 1) Convolution layer -> Fully Connected layer without batches - // 2) Fully Connected layer -> Fully Connected layer without batches - // 3) Convolution layer -> Fully Connected layer with batches - // 4) Fully Connected layer -> Fully Connected layer with batches - - // Expected shape before transpose and reshaping - // Input: In x B (In and B can be multi-dimensional) - // Weights: flat(In) x Out - // Biases: Out - // Output: Out x B (B can be multi-dimensional) - 
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, weights, output); + ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2); - const DataType data_type = input->info()->data_type(); - const int fixed_point_position = input->info()->fixed_point_position(); - const int num_batch_dimensions = std::max(0, static_cast(output->info()->tensor_shape().num_dimensions()) - 1); - const int num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions; - const size_t linear_input_size = input->info()->tensor_shape().total_size_lower(num_input_dimensions); - - _linearize_input = input->info()->tensor_shape().x() != linear_input_size; - _are_weights_reshaped = are_weights_reshaped; - _accumulate_biases = biases != nullptr; - _is_batched_fc_layer = num_batch_dimensions > 0; - - // Check if number of batches match - ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size_upper(num_input_dimensions) != output->info()->tensor_shape().total_size_upper(1)); - ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2); + _are_weights_reshaped = transpose_weights ? are_weights_reshaped : true; + _is_fc_after_conv = true; + _accumulate_biases = false; - const size_t interleave_width = 16 / input->info()->element_size(); - const ICLTensor *weights_to_use = weights; - - if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer)) + if(biases != nullptr) { - weights_to_use = &_reshape_weights_output; + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + + _accumulate_biases = true; - TensorShape reshaped_weights_shape(weights->info()->tensor_shape()); + // Configure accumulate biases kernel + _accumulate_biases_kernel.configure(output, biases); + } - // Transpose weights if the user hasn't done it - if(transpose_weights) - { - const size_t shape_x = reshaped_weights_shape.x(); - reshaped_weights_shape.set(0, reshaped_weights_shape.y()); - reshaped_weights_shape.set(1, shape_x); - } + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches - // If the we run multiple batches we need 1xW transpose, too. 
- if(_is_batched_fc_layer) - { - const float shape_x = reshaped_weights_shape.x(); - reshaped_weights_shape.set(0, reshaped_weights_shape.y() * interleave_width); - reshaped_weights_shape.set(1, static_cast(std::ceil(shape_x / interleave_width))); - } + const ICLTensor *weights_to_use = weights; - _reshape_weights_output.allocator()->init(TensorInfo(reshaped_weights_shape, 1, data_type, fixed_point_position)); + if(!_are_weights_reshaped) + { + weights_to_use = &_reshape_weights_output; // Reshape the weights - _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer); + _reshape_weights_kernel.configure(weights, &_reshape_weights_output); } - // Check correct shape of weights - if(_is_batched_fc_layer) + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->info()->dimension(1) > 1; + + if(is_batched_fc_layer) { - // Transpose + Transpose1xW - ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().x() != linear_input_size * interleave_width); - ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().y() != static_cast(std::ceil(static_cast(output->info()->tensor_shape().x()) / interleave_width))); + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { - // Transpose - ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().x() != output->info()->tensor_shape().x()); - ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().y() != linear_input_size); + _is_fc_after_conv = input->info()->num_dimensions() > 1; } - const ICLTensor *multiply_input = input; - - if(_linearize_input) + if(_is_fc_after_conv) { - TensorShape shape_im2col(input->info()->tensor_shape()); - shape_im2col.collapse(num_input_dimensions); - _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, data_type, fixed_point_position)); - - // Configure im2col kernel - _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false); - - multiply_input = &_im2col_output; + // Fully Connected layer after a Convolution Layer without batches + configure_conv_fc(input, weights_to_use, output); } - - if(_is_batched_fc_layer) - { - TensorShape shape_interleaved(multiply_input->info()->tensor_shape()); - shape_interleaved.set(0, shape_interleaved.x() * 4); - shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f)); - _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, data_type, fixed_point_position)); - - // Configure interleave4x4 kernel - _interleave4x4_kernel.configure(multiply_input, &_interleave4x4_output); - - multiply_input = &_interleave4x4_output; - } - - // Configure matrix multiply kernel - _mm_kernel.configure(multiply_input, weights_to_use, output, 1.0f); - - if(_accumulate_biases) + else { - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); - ARM_COMPUTE_ERROR_ON(biases->info()->tensor_shape().x() != output->info()->tensor_shape().x()); - - // Configure accumulate biases kernel - _accumulate_biases_kernel.configure(output, biases); + // Fully Connected layer after a Fully Connected Layer without batches + configure_fc_fc(input, weights_to_use, output); } // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called - if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer)) + 
if(!_are_weights_reshaped) { // Allocate the tensor for the weights reshaped _reshape_weights_output.allocator()->allocate(); } - - if(_linearize_input) - { - _im2col_output.allocator()->allocate(); - } - - if(_is_batched_fc_layer) - { - _interleave4x4_output.allocator()->allocate(); - } } void CLFullyConnectedLayer::run() @@ -249,17 +159,11 @@ void CLFullyConnectedLayer::run() } // Linearize input if it comes from a convolutional layer - if(_linearize_input) + if(_is_fc_after_conv) { CLScheduler::get().enqueue(_im2col_kernel, false); } - // Interleave input - if(_is_batched_fc_layer) - { - CLScheduler::get().enqueue(_interleave4x4_kernel, false); - } - // Run matrix multiply CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases); @@ -269,4 +173,3 @@ void CLFullyConnectedLayer::run() CLScheduler::get().enqueue(_accumulate_biases_kernel); } } -} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index e81d8a6b97..9867229a7c 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -39,7 +39,7 @@ using namespace arm_compute; CLGEMM::CLGEMM() - : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false) + : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false) { } @@ -59,12 +59,16 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - _mm_kernel.set_target(CLScheduler::get().target()); + // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors + _is_interleaved_transposed = a->info()->dimension(1) > 16; - // Check if the first input tensor is a vector. If so, all the kernels for reshaping the tensors can be skipped - if(a->info()->dimension(1) != 1) + const ICLTensor *matrix_a = a; + const ICLTensor *matrix_b = b; + + if(_is_interleaved_transposed) { - _run_vector_matrix_multiplication = false; + matrix_a = &_tmp_a; + matrix_b = &_tmp_b; TensorShape shape_tmp_a = a->info()->tensor_shape(); TensorShape shape_tmp_b = b->info()->tensor_shape(); @@ -89,19 +93,17 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * _transpose_kernel.configure(b, &_tmp_b); // Configure matrix multiply kernel - _mm_kernel.configure(&_tmp_a, &_tmp_b, output, alpha); + _mm_kernel.set_target(CLScheduler::get().target()); + } + _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed); + + if(_is_interleaved_transposed) + { // Allocate intermediate tensors _tmp_a.allocator()->allocate(); _tmp_b.allocator()->allocate(); } - else // The first input tensor is a vector - { - _run_vector_matrix_multiplication = true; - - // Configure the matrix multiply kernel - _mm_kernel.configure(a, b, output, alpha); - } // Configure matrix addition kernel if(beta != 0 && c != nullptr) @@ -113,7 +115,7 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * void CLGEMM::run() { - if(!_run_vector_matrix_multiplication) + if(_is_interleaved_transposed) { // Run interleave kernel CLScheduler::get().enqueue(_interleave_kernel, false); -- cgit v1.2.1
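
For readers skimming the diff, the patch boils down to two shape-driven dispatch decisions: CLGEMM now interleaves matrix A and 1xW-transposes matrix B only when A has more than 16 rows (otherwise it configures the matrix-multiply kernel on the un-reshaped inputs), and CLFullyConnectedLayer now picks between a conv->FC path (im2col then GEMM) and an FC->FC path (plain GEMM) by comparing the input and output shapes. The sketch below is a minimal, standalone C++ illustration of those two decisions under stated assumptions: shapes are modelled as plain vectors of dimension sizes, and none of the names are Arm Compute Library symbols.

// Standalone illustration of the dispatch logic introduced by this patch.
// Shapes are vectors of dimension sizes, index 0 = innermost (x), mirroring
// TensorShape ordering. No Arm Compute Library dependency; all names here
// are illustrative only.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

using Shape = std::vector<std::size_t>;

// CLGEMM: reshape (interleave A, 1xW-transpose B) only when A has more than
// 16 rows; smaller cases run the un-reshaped matrix-multiply kernel.
bool use_interleaved_transposed_gemm(const Shape &a)
{
    const std::size_t rows_a = (a.size() > 1) ? a[1] : 1;
    return rows_a > 16;
}

// CLFullyConnectedLayer: the layer is batched when the output has more than
// one row (dimension 1 greater than 1).
bool is_batched_fc(const Shape &output)
{
    return output.size() > 1 && output[1] > 1;
}

// The layer follows a convolution when, in the batched case, the input's
// batch dimensions (index 3 and above) match the output's dimensions from
// index 1 upwards; in the non-batched case, any multi-dimensional input
// implies a preceding convolution whose output still needs im2col.
bool is_fc_after_conv(const Shape &input, const Shape &output)
{
    if(is_batched_fc(output))
    {
        return input.size() >= 4
               && output.size() >= input.size() - 2
               && std::equal(input.begin() + 3, input.end(), output.begin() + 1);
    }
    return input.size() > 1;
}

int main()
{
    // Hypothetical example: 7x7x64 activations, batch of 8, fully connected
    // to 4096 outputs.
    const Shape conv_out{7, 7, 64, 8};
    const Shape fc_out{4096, 8};

    std::cout << "batched FC layer:      " << is_batched_fc(fc_out) << '\n'
              << "FC after convolution:  " << is_fc_after_conv(conv_out, fc_out) << '\n'
              << "reshape GEMM inputs:   " << use_interleaved_transposed_gemm(Shape{3136, 392}) << '\n';
    return 0;
}

In the library itself the same decisions determine which kernels are configured and enqueued: the im2col kernel only runs on the conv->FC path, and the interleave/transpose kernels and intermediate tensors only exist when the interleaved-transposed GEMM path is taken.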