From 678d83a5c3ec1b19ddb9df07a990262ce4bd65e1 Mon Sep 17 00:00:00 2001 From: Manuel Bottini Date: Mon, 7 Jan 2019 16:05:36 +0000 Subject: COMPMID-1838: Add 4D softmax support for NEON and achieve parity with CL Change-Id: I15c4a747cde2536b1caba2baf4ded9ca76e6dae2 Signed-off-by: Manuel Bottini Reviewed-on: https://review.mlplatform.org/487 Tested-by: Arm Jenkins Reviewed-by: VidhyaSudhan Loganathan --- src/runtime/NEON/functions/NESoftmaxLayer.cpp | 152 ++++++++++++++++++++++---- 1 file changed, 132 insertions(+), 20 deletions(-) (limited to 'src/runtime/NEON/functions') diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp index 9be9e6817a..36b7d47d28 100644 --- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp +++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -25,54 +25,155 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "utils/TypePrinter.h" #include -using namespace arm_compute; - +namespace arm_compute +{ NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _fill_border_kernel(), _max(), _tmp() + : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _flat_or_reshape_kernel_ptr(nullptr), _fill_border_kernel(), _reshape_kernel(), _max(), _tmp(), _input_flattened(), + _output_flattened(), _needs_flattening(false) +{ +} + +void NESoftmaxLayer::configure_reshape_input_kernel(const ITensor *input, const ITensor *output, size_t axis) { + // Flatten the input + const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis); + + // Initialize the flat input + 
_input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten)); + + // If we need to flatten the input, we can use NEFlattenKernel or NEReshapeKernel + // If flattening on the third axes, we use NEFlattenKernel. + // In all other cases we have to use NEReshapeKernel + if(axis != 3) + { + auto reshape_kernel_ptr = support::cpp14::make_unique<NEReshapeLayerKernel>(); + reshape_kernel_ptr->configure(input, &_input_flattened); + _flat_or_reshape_kernel_ptr = std::move(reshape_kernel_ptr); + } + else + { + auto flatten_kernel_ptr = support::cpp14::make_unique<NEFlattenLayerKernel>(); + flatten_kernel_ptr->configure(input, &_input_flattened); + _flat_or_reshape_kernel_ptr = std::move(flatten_kernel_ptr); + } + + // We need to init the output tensor here. Indeed, the reshape kernel expects + // both tensors to be already initialized + auto_init_if_empty(*output->info(), *input->info()->clone()); } void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta, size_t axis) { + // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_UNUSED(axis); + ARM_COMPUTE_ERROR_THROW_ON(NESoftmaxLayer::validate(input->info(), output->info(), beta, axis)); - // Configure Kernels - _max_kernel.configure(input, &_max); - _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::REPLICATE); - _softmax_kernel.configure(input, &_max, output, beta, &_tmp); + // We don't need flattening only in the case the input is 2D and axis is 1 + _needs_flattening = axis != 1; + + // If we are dealing with a 4D tensor, we will: + // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor + // - Execute all the pipeline (reduction + normalization) on the flattened tensor + // - Reshape the flattened output into the real output + if(_needs_flattening) + { + // Add to the memory manager _input_flattened + _memory_group.manage(&_input_flattened); + + // Configure _flatten_kernel and 
_input_flattened + configure_reshape_input_kernel(input, output, axis); + } + + // We want to deal with a 2D input. Either it is the flattened version of the original input (4D case) + // or it is the original input case (2D case) + ITensor *input_2D = (_needs_flattening ? &_input_flattened : input); + + // Create intermediate tensors shapes + const TensorInfo input_info = input_2D->info()->clone()->reset_padding().set_is_resizable(true); + DataType tmp_data_type = is_data_type_quantized_asymmetric(input_2D->info()->data_type()) ? DataType::F32 : input_2D->info()->data_type(); + TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); // Init intermediate tensors - _max.allocator()->init(*_max.info()); - _tmp.allocator()->init(*_tmp.info()); + TensorShape max_sum_shape = input_2D->info()->tensor_shape(); + max_sum_shape.set(0, 1); + _max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape)); + _tmp.allocator()->init(tensor_info_tmp); // Manage intermediate buffers _memory_group.manage(&_max); _memory_group.manage(&_tmp); - // Allocate intermediate tensors + // Configure Kernels + _max_kernel.configure(input_2D, &_max); + if(_needs_flattening) + { + // Add to the memory manager _output_flattened + _memory_group.manage(&_output_flattened); + + // The normalization kernel stores the result in a flat output tensor + _softmax_kernel.configure(input_2D, &_max, &_output_flattened, beta, &_tmp); + _input_flattened.allocator()->allocate(); + + // Reshape the flat output into the requested (4D) output + _reshape_kernel.configure(&_output_flattened, output); + + // Allocate the intermediate flat tensors + _output_flattened.allocator()->allocate(); + } + else + { + // Softmax 2D case + _fill_border_kernel.configure(input_2D, _max_kernel.border_size(), BorderMode::REPLICATE); + _softmax_kernel.configure(input_2D, &_max, output, beta, &_tmp); + } + + // Allocate intermediate buffers _max.allocator()->allocate(); _tmp.allocator()->allocate(); 
} Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis must be 1 for NEON"); - // Perform validation step ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Only 2D inputs are supported"); - - const TensorShape max_shape = TensorShape(input->tensor_shape()).set(0, 1); - const TensorInfo tensor_info_max_sum = TensorInfo(*input).set_tensor_shape(max_shape).reset_padding(); - const TensorInfo dont_care; + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported"); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_RETURN_ERROR_ON(axis < 1 || input->num_dimensions() < axis); + + // Create intermediate tensor info + DataType tmp_data_type = input->data_type(); + const TensorInfo tensor_info_tmp(input->clone()->set_data_type(tmp_data_type).set_is_resizable(true)); + + TensorShape max_sum_shape = input->tensor_shape(); + max_sum_shape.set(0, 1); + const TensorInfo tensor_info_max_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(input->quantization_info()).set_is_resizable(true)); + const TensorInfo dont_care; + + const bool needs_flattening = (axis != 1); + + if(needs_flattening) + { + const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, axis); + TensorInfo tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true)); + + if(axis != 3) + { + ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(input, &tensor_info_flat)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &tensor_info_flat)); + } + } ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum)); - ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel::validate(input, &tensor_info_max_sum, output, beta, &dont_care)); + 
ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel::validate(&tensor_info_tmp, &tensor_info_max_sum, output, beta, &dont_care)); return Status{}; } @@ -81,9 +182,20 @@ void NESoftmaxLayer::run() { _memory_group.acquire(); + if(_needs_flattening) + { + NEScheduler::get().schedule(_flat_or_reshape_kernel_ptr.get(), Window::DimY); + } + NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY); NEScheduler::get().schedule(&_max_kernel, Window::DimY); NEScheduler::get().schedule(&_softmax_kernel, Window::DimY); + if(_needs_flattening) + { + NEScheduler::get().schedule(&_reshape_kernel, Window::DimY); + } + _memory_group.release(); } +} // namespace arm_compute \ No newline at end of file -- cgit v1.2.1