/*
 * Copyright (c) 2018-2021, 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/NEON/kernels/NEStackLayerKernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/Utils.h"
#include "src/core/helpers/WindowHelpers.h"

namespace arm_compute
{
using namespace arm_compute::misc::shape_calculator;

namespace
{
Status validate_arguments(const ITensorInfo *input,
                          uint32_t           axis,
                          uint32_t           idx_input,
                          uint32_t           num_tensors,
                          uint32_t           rank,
                          const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
    // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
    ARM_COMPUTE_RETURN_ERROR_ON(idx_input >= num_tensors);
    ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != rank);

    if (output->total_size() != 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
                                                           compute_stack_shape(*input, axis, num_tensors));
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
    }

    return Status{};
}
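// Computes the output coordinates for one input element: every dimension at or
// above 'axis' is shifted up by one place, and the freed slot at 'axis' is set
// to the index of the input tensor the element comes from.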
inline Coordinates
shift_from_axis_and_replace_coordinate(const Coordinates &id, uint32_t axis, uint32_t idx_input, uint32_t num_dims)
{
    Coordinates id_out = id;
    for (uint32_t i = num_dims; i > axis; --i)
    {
        id_out.set(i, id[i - 1]);
    }
    id_out.set(axis, idx_input);
    return id_out;
}

void elementwise_stack(const std::vector<ITensor *> &input, ITensor *output, uint32_t axis, const Window &window)
{
    Window window_out;
    window_out.use_tensor_dimensions(output->info()->tensor_shape());

    const int32_t  num_tensors  = input.size();
    const size_t   element_size = input[0]->info()->element_size();
    const uint32_t num_dims     = static_cast<uint32_t>(input[0]->info()->num_dimensions());

    for (int32_t idx_input = 0; idx_input < num_tensors; ++idx_input)
    {
        Iterator input_it(input[idx_input], window);

        execute_window_loop(
            window,
            [&](const Coordinates &id)
            {
                Coordinates id_out = shift_from_axis_and_replace_coordinate(id, axis, idx_input, num_dims);
                std::memcpy(output->ptr_to_element(id_out), input_it.ptr(), element_size);
            },
            input_it);
    }
}
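// Fast path for tensors without padding: each input is contiguous below 'axis',
// so it can be copied in chunk_size-byte blocks. The 2D window set up in
// prepare() maps X to the input tensor index and Y to the slices above 'axis';
// chunks belonging to consecutive input tensors are interleaved in the output
// with a stride of chunk_size * num_tensors bytes.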
void memcpy_stack(const std::vector<ITensor *> &input, ITensor *output, uint32_t axis, const Window &window)
{
    const int32_t element_size   = input[0]->info()->element_size();
    const int32_t chunk_size     = input[0]->info()->tensor_shape().total_size_lower(axis) * element_size;
    const int32_t num_tensors    = input.size();
    const int32_t out_chunk_step = chunk_size * num_tensors;

    const int32_t start_x = window.x().start();
    const int32_t end_x   = window.x().end();
    const int32_t start_y = window.y().start();
    const int32_t end_y   = window.y().end();

    uint8_t *out_ptr_base = output->buffer() + output->info()->offset_first_element_in_bytes() + start_x * chunk_size;

    for (int32_t x = start_x; x < end_x; ++x)
    {
        const uint8_t *in_ptr =
            input[x]->buffer() + input[x]->info()->offset_first_element_in_bytes() + start_y * chunk_size;
        uint8_t *out_ptr = out_ptr_base + start_y * out_chunk_step;

        for (int32_t y = start_y; y < end_y; ++y)
        {
            std::memcpy(out_ptr, in_ptr, chunk_size);

            in_ptr += chunk_size;
            out_ptr += out_chunk_step;
        }

        out_ptr_base += chunk_size;
    }
}
} // namespace

NEStackLayerKernel::NEStackLayerKernel() : _input(), _output(nullptr), _axis(), _split_dimension(Window::DimY)
{
}

void NEStackLayerKernel::configure(const std::vector<ITensor *> &input, uint32_t axis, ITensor *output)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(output);

    const int32_t num_tensors = input.size();
    ARM_COMPUTE_ERROR_ON(num_tensors == 0);

    const uint32_t rank = input[0]->info()->num_dimensions();
    ARM_COMPUTE_UNUSED(rank);

    for (int32_t i = 0; i < num_tensors; ++i)
    {
        ARM_COMPUTE_ERROR_ON_NULLPTR(input[i]);
        ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input[i]->info(), axis, i, num_tensors, rank, output->info()));
    }

    auto_init_if_empty(*output->info(), input[0]->info()->clone()->set_tensor_shape(
                                            compute_stack_shape(*input[0]->info(), axis, num_tensors)));

    _input  = input;
    _output = output;
    _axis   = axis;
}

Status
NEStackLayerKernel::validate(const std::vector<ITensorInfo *> &input, uint32_t axis, const ITensorInfo *output)
{
    const int32_t num_tensors = input.size();
    const size_t  rank        = input[0]->num_dimensions();

    for (int32_t i = 0; i < num_tensors; ++i)
    {
        ARM_COMPUTE_ERROR_ON_NULLPTR(input[i]);
        ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input[i], axis, i, num_tensors, rank, output));
    }

    return Status{};
}

void NEStackLayerKernel::prepare()
{
    // Prepare calculates the window at runtime, in case there is padding being added after configure()
    const ITensorInfo *input_info  = _input[0]->info();
    const int32_t      num_dims    = input_info->num_dimensions();
    const int32_t      num_tensors = _input.size();

    // Check if there are any paddings in the input tensors
    bool has_padding = false;
    for (const ITensor *in : _input)
    {
        if (has_holes(*in->info(), num_dims - 1))
        {
            has_padding = true;
            break;
        }
    }

    has_padding = has_padding || has_holes(*_output->info(), num_dims);

    Window win;
    if (!has_padding)
    {
        _stack_fn = memcpy_stack;

        // 2D execution window (X,Y): [Num_tensors, Dimensions >= axis]
        win.set(Window::DimX, Window::Dimension(0, num_tensors, 1));
        win.set(Window::DimY, Window::Dimension(0, input_info->tensor_shape().total_size_upper(_axis), 1));
    }
    else
    {
        _stack_fn = elementwise_stack;
        win       = calculate_max_window(*input_info);
    }

    INEKernel::configure(win);
}

void NEStackLayerKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    _stack_fn(_input, _output, _axis, window);
}
} // namespace arm_compute
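
// Usage sketch (illustrative only, not part of this translation unit; the
// NEScheduler call and the get_split_dimension() accessor are assumptions
// about the surrounding library, based on how Compute Library functions
// normally drive kernels):
//
//   std::vector<ITensor *> inputs = /* N tensors, same shape/type/quantization */;
//   Tensor                 output; // shape is auto-initialised by configure()
//
//   NEStackLayerKernel kernel;
//   kernel.configure(inputs, 0 /* axis */, &output);
//   kernel.prepare(); // selects memcpy_stack or elementwise_stack and sets the window
//   NEScheduler::get().schedule(&kernel, kernel.get_split_dimension());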