From 27400b90a9cb3fe028c5b724b58ce0e82d89b5e8 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Thu, 1 Nov 2018 13:44:05 +0000 Subject: COMPMID-1707: Create 3 special CLWidthConcatenate kernel to concatenate 2/4 and 8 tensors (Part 1) Creating special cases for concatening 2 and 4 tensors. Change-Id: I6a739a494ae45011acb65369e353f9ef96970b90 --- .../CL/functions/CLWidthConcatenateLayer.cpp | 75 +++++++++++++++++----- 1 file changed, 60 insertions(+), 15 deletions(-) (limited to 'src/runtime/CL') diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp index 5233ff4f52..46a2d80d10 100644 --- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp +++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp @@ -36,26 +36,46 @@ using namespace arm_compute; CLWidthConcatenateLayer::CLWidthConcatenateLayer() // NOLINT : _concat_kernels_vector(), + _concat_x2_kernel(), + _concat_x4_kernel(), _num_inputs(0) { } Status CLWidthConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output) // NOLINT { + const unsigned int num_inputs = inputs_vector.size(); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2); + ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2); // Output auto inizialitation if not yet initialized TensorInfo tmp_output_info = *output->clone(); TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector); auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type()); - unsigned int width_offset = 0; - for(const auto &input : inputs_vector) + switch(num_inputs) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info)); - width_offset += input->dimension(0); + case 2: + // Validate WidthConcatenate2Tensors kernels if there are 2 inputs + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1]); + ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(inputs_vector[0], inputs_vector[1], &tmp_output_info)); + break; + case 4: + // Validate WidthConcatenate4Tensors kernels if there are 4 inputs + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3]); + ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate4TensorsKernel::validate(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3], &tmp_output_info)); + break; + default: + unsigned int width_offset = 0; + // Validate generic case of WidthConcatenate kernel + for(const auto &input : inputs_vector) + { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info)); + width_offset += input->dimension(0); + } + break; } return Status{}; @@ -74,16 +94,30 @@ void CLWidthConcatenateLayer::configure(std::vector inputs_vector, // Output auto inizialitation if not yet initialized auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type()); - ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayer::validate(inputs_vector_info, output->info())); - - unsigned int width_offset = 0; - _concat_kernels_vector = arm_compute::support::cpp14::make_unique(_num_inputs); + ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayer::validate(inputs_vector_info, output->info())); - for(unsigned int i = 0; i < _num_inputs; i++) + switch(_num_inputs) { - _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output); - width_offset += inputs_vector.at(i)->info()->dimension(0); + case 2: + // Configure WidthConcatenate2Tensors kernel + _concat_x2_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), output); + break; + case 4: + // Configure WidthConcatenate4Tensors kernel + _concat_x4_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output); + break; + default: + // Configure generic case WidthConcatenate kernels + _concat_kernels_vector = arm_compute::support::cpp14::make_unique(_num_inputs); + + unsigned int width_offset = 0; + for(unsigned int i = 0; i < _num_inputs; ++i) + { + _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output); + width_offset += inputs_vector.at(i)->info()->dimension(0); + } + break; } } @@ -91,8 +125,19 @@ void CLWidthConcatenateLayer::run() { cl::CommandQueue q = CLScheduler::get().queue(); - for(unsigned i = 0; i < _num_inputs; i++) + switch(_num_inputs) { - CLScheduler::get().enqueue(_concat_kernels_vector[i], true); + case 2: + CLScheduler::get().enqueue(_concat_x2_kernel, true); + break; + case 4: + CLScheduler::get().enqueue(_concat_x4_kernel, true); + break; + default: + for(unsigned int i = 0; i < _num_inputs; ++i) + { + CLScheduler::get().enqueue(_concat_kernels_vector[i], true); + } + break; } } -- cgit v1.2.1