From afa5d817b1d083837cd7ea30d32f845d82620c12 Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Thu, 30 Nov 2017 14:25:57 +0000 Subject: COMPMID-617 Add validation methods to Kernels - NEActivationLayer - NESoftmax - NEDirectConvolutionLayer - NENormalizationLayer - NEPoolingLayer Change-Id: Ib279f1c1b7f9247679b0d6593aed7393da8fe87b Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111335 Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com Reviewed-by: Georgios Pinitas --- src/core/CL/kernels/CLActivationLayerKernel.cpp | 1 - src/core/NEON/kernels/NEActivationLayerKernel.cpp | 90 +++++--- ...EDirectConvolutionLayerBiasAccumulateKernel.cpp | 98 ++++++-- .../kernels/NEDirectConvolutionLayerKernel.cpp | 249 +++++++++++++-------- .../NEON/kernels/NENormalizationLayerKernel.cpp | 117 ++++++---- src/core/NEON/kernels/NEPoolingLayerKernel.cpp | 232 +++++++++++++------ src/core/NEON/kernels/NESoftmaxLayerKernel.cpp | 230 ++++++++++++++----- src/runtime/NEON/functions/NEActivationLayer.cpp | 5 + .../NEON/functions/NEDirectConvolutionLayer.cpp | 24 ++ .../NEON/functions/NENormalizationLayer.cpp | 13 +- src/runtime/NEON/functions/NEPoolingLayer.cpp | 5 + src/runtime/NEON/functions/NESoftmaxLayer.cpp | 19 +- 12 files changed, 781 insertions(+), 302 deletions(-) (limited to 'src') diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp index 5346dbbe15..eecc94f23c 100644 --- a/src/core/CL/kernels/CLActivationLayerKernel.cpp +++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp @@ -36,7 +36,6 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "support/ToolchainSupport.h" #include diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp index 94bd5f15e3..6ea504a173 100644 --- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp +++ 
b/src/core/NEON/kernels/NEActivationLayerKernel.cpp @@ -39,6 +39,51 @@ #include using namespace arm_compute; +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + + // Checks performed when output is configured + if((output != nullptr) && (output->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + } + + return Status{}; +} + +std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + constexpr unsigned int num_elems_processed_per_iteration = 16; + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + bool window_changed = false; + + if(output != nullptr && (output->total_size() != 0)) + { + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + window_changed = update_window_and_padding(win, + AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration), + output_access); + + output_access.set_valid_region(win, input->valid_region()); + } + else + { + // In-place computation + window_changed = update_window_and_padding(win, + AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration)); + } + + Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace NEActivationLayerKernel::NEActivationLayerKernel() : _input(nullptr), _output(nullptr), _func(nullptr), _act_info(ActivationFunction::LOGISTIC) @@ -47,7 +92,7 @@ NEActivationLayerKernel::NEActivationLayerKernel() void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_NULLPTR(input); _input = input; _act_info = activation_info; @@ -56,15 +101,12 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat if(output != nullptr) { // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); - - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); - + auto_init_if_empty(*output->info(), *input->info()->clone()); _output = output; } + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? 
output->info() : nullptr)); + // Activation functions : FP32 static std::map act_map_f32 = { @@ -149,29 +191,10 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat ARM_COMPUTE_ERROR("Unsupported data type."); } - constexpr unsigned int num_elems_processed_per_iteration = 16; - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - - if(output != nullptr) - { - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), - output_access); - - output_access.set_valid_region(win, input->info()->valid_region()); - } - else - { - // In-place computation - update_window_and_padding(win, - AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration)); - } - - ICPPKernel::configure(win); + auto win_config = validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICPPKernel::configure(win_config.second); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -653,6 +676,15 @@ typename std::enable_if::value, void>::type NEActivati input, output); } +Status NEActivationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(act_info); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output != nullptr) ? 
output->clone().get() : nullptr).first); + + return Status{}; +} + void NEActivationLayerKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp index f00af9f93e..a6585ade12 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp @@ -40,6 +40,62 @@ using namespace arm_compute; namespace { +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, bias); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32); + if(is_data_type_quantized(input->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && bias->data_type() != DataType::QS8, "Wrong data type for bias"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS16 && bias->data_type() != DataType::QS8, "Wrong data type for bias"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS32 && bias->data_type() != DataType::QS16, "Wrong data type for bias"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, bias); + + // Checks performed when output is configured + if((output != nullptr) && (output->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bias, output); + 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(bias, output); + } + + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + + return Status{}; +} + +std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output) +{ + bool window_changed = false; + const unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->data_type()); + + // Configure kernel window + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1)); + if(output != nullptr && (output->total_size() != 0)) + { + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + window_changed = update_window_and_padding(win, input_access, output_access, bias_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + } + else + { + window_changed = update_window_and_padding(win, input_access, bias_access); + input_access.set_valid_region(win, ValidRegion(Coordinates(), input->tensor_shape())); + } + + Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} + // Internal load inline float32x4_t internal_vld1q(const float *in) { @@ -186,40 +242,26 @@ NEDirectConvolutionLayerBiasAccumulateKernel::NEDirectConvolutionLayerBiasAccumu void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, const ITensor *bias, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32); - ARM_COMPUTE_ERROR_ON(input->info()->fixed_point_position() != bias->info()->fixed_point_position()); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, bias); + if(output != nullptr) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(bias, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(bias, output); + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output->info(), *input->info()); } - ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), bias->info(), (output == nullptr) ? 
nullptr : output->info())); _func = nullptr; _bias = bias; _input = input; _output = output; - const unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->info()->data_type()); - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowStatic bias_access(bias->info(), 0, 0, bias->info()->dimension(0), bias->info()->dimension(1)); - if(output != nullptr) - { - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, input_access, output_access, bias_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); - } - else - { - update_window_and_padding(win, input_access, bias_access); - input_access.set_valid_region(win, ValidRegion(Coordinates(), input->info()->tensor_shape())); - } - INEKernel::configure(win); + auto win_config = validate_and_configure_window(input->info(), bias->info(), (output == nullptr) ? nullptr : output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); // Set appropriate function switch(input->info()->data_type()) @@ -266,6 +308,14 @@ void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, con } } +Status NEDirectConvolutionLayerBiasAccumulateKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), bias->clone().get(), output == nullptr ? 
nullptr : output->clone().get()).first); + + return Status{}; +} + void NEDirectConvolutionLayerBiasAccumulateKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp index 78afbc2c20..1ca213b04a 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp @@ -251,10 +251,15 @@ inline qint32x4x2_t internal_vmlal(const qint32x4x2_t &x, const qint16x8_t &y, c return r; } -constexpr int SmallTensorSizeOptim = 8; +constexpr int small_tensor_size_optim = 8; +inline bool run_optim_small_tensor_info(const ITensorInfo *t) +{ + return t->dimension(Window::DimX) <= small_tensor_size_optim && t->dimension(Window::DimY) <= small_tensor_size_optim; +} + inline bool run_optim_small_tensor(const ITensor *t) { - return t->info()->dimension(Window::DimX) <= SmallTensorSizeOptim && t->info()->dimension(Window::DimY) <= SmallTensorSizeOptim; + return run_optim_small_tensor_info(t->info()); } // Optimized convolver for 1x1 kernels used only where input width and height are both <= 8 @@ -266,8 +271,8 @@ class convolver_w1x1_i8x8_f32 public: static void convolve(const Window &window, const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) { - ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimX) > SmallTensorSizeOptim); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimY) > SmallTensorSizeOptim); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimX) > small_tensor_size_optim); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimY) > small_tensor_size_optim); const int input_stride_y = input->info()->strides_in_bytes().y(); const int input_stride_z = input->info()->strides_in_bytes().z(); @@ -302,12 +307,12 @@ public: execute_window_loop(window_out, [&](const Coordinates & id) { - const uint8_t 
*input_ptr = in.ptr(); - uint8_t *out_ptr = out.ptr(); - int ih = 0; - int oh = 0; - float32x4_t accum0[SmallTensorSizeOptim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; - float32x4_t accum1[SmallTensorSizeOptim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; + const uint8_t *input_ptr = in.ptr(); + uint8_t *out_ptr = out.ptr(); + int ih = 0; + int oh = 0; + float32x4_t accum0[small_tensor_size_optim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; + float32x4_t accum1[small_tensor_size_optim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; for(int oz = 0; oz < range_z; ++oz) { accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f); @@ -992,121 +997,118 @@ inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_i } } -} // namespace - -NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel() - : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_weight_elems_read_per_row(0), _num_elems_read_per_iteration(0), - _num_elems_written_per_iteration(0) -{ -} - -BorderSize NEDirectConvolutionLayerKernel::border_size() const +inline TensorShape get_convolved_dimensions(const ITensorInfo *input, const ITensorInfo *weights, const int kernel_size, const PadStrideInfo &conv_info) { - return _border_size; -} - -void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::QS16, DataType::F32); - 
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())), - "Pad > 0 not supported for 1x1 weights"); - ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1), - "Pad > 1 not supported for 3x3 weights"); - ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 5 && (std::get<0>(conv_info.pad()) > 2 || std::get<1>(conv_info.pad()) > 2), - "Pad > 2 not supported for 5x5 weights"); - - ARM_COMPUTE_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported."); - ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2)); - ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1)); - ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4); - - const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - const unsigned int conv_pad_x = std::get<0>(conv_info.pad()); - const unsigned int conv_pad_y = std::get<1>(conv_info.pad()); - - _input = input; - _weights = weights; - _output = output; - _conv_info = conv_info; - _kernel_size = weights->info()->dimension(0); - _border_size = BorderSize(conv_pad_y, conv_pad_x); - - const unsigned int kernel_size = weights->info()->dimension(0); - - // Get convolved dimensions unsigned int output_width = 0; unsigned int output_height = 0; - std::tie(output_width, output_height) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_size, kernel_size, conv_info); + std::tie(output_width, output_height) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_size, kernel_size, conv_info); - TensorShape output_shape = input->info()->tensor_shape(); + TensorShape output_shape = input->tensor_shape(); output_shape.set(0, output_width); output_shape.set(1, output_height); - output_shape.set(2, 
weights->info()->dimension(3)); + output_shape.set(2, weights->dimension(3)); - DataType data_type = input->info()->data_type(); + return output_shape; +} - if(is_data_type_fixed_point(data_type)) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())), + "Pad > 0 not supported for 1x1 weights"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1), + "Pad > 1 not supported for 3x3 weights"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) == 5 && (std::get<0>(conv_info.pad()) > 2 || std::get<1>(conv_info.pad()) > 2), + "Pad > 2 not supported for 5x5 weights"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported."); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); + + // Checks performed when output is configured + if(output->total_size() != 0) { - // Promote data type in case of fixed point - data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32); + TensorShape output_shape = get_convolved_dimensions(input, weights, weights->dimension(0), conv_info); + + DataType data_type = input->data_type(); + if(is_data_type_fixed_point(data_type)) + { + // Promote data type in case of fixed point + data_type = ((data_type == DataType::QS8) ? 
DataType::QS16 : DataType::QS32); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); + ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != data_type); } - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, data_type, input->info()->fixed_point_position()); + return Status{}; +} - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, output->info()->data_type()); +std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row, + unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration) +{ + // Calculate right and bottom border + unsigned int kernel_size = weights->dimension(0); + const unsigned int conv_pad_x = std::get<0>(conv_info.pad()); + const unsigned int conv_pad_y = std::get<1>(conv_info.pad()); + const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + BorderSize border_size = BorderSize(conv_pad_y, conv_pad_x); + const int input_width = input->dimension(0); + const int input_height = input->dimension(1); - switch(_kernel_size) + switch(kernel_size) { case 1: { - switch(input->info()->data_type()) + switch(input->data_type()) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::QS8: case DataType::QS16: - _num_elems_written_per_iteration = 8; + num_elems_written_per_iteration = 8; break; case DataType::F32: - if(run_optim_small_tensor(input)) + if(run_optim_small_tensor_info(input)) { - _num_elems_written_per_iteration = 8; + num_elems_written_per_iteration = 8; } else { - _num_elems_written_per_iteration = 4; + num_elems_written_per_iteration = 4; } break; 
default: ARM_COMPUTE_ERROR("Data type not supported."); break; } - _num_weight_elems_read_per_row = kernel_size; - _num_elems_read_per_iteration = conv_stride_x * _num_elems_written_per_iteration; + num_weight_elems_read_per_row = kernel_size; + num_elems_read_per_iteration = conv_stride_x * num_elems_written_per_iteration; break; } case 3: case 5: { - switch(input->info()->data_type()) + switch(input->data_type()) { case DataType::F32: - _num_weight_elems_read_per_row = 4 + _kernel_size - 1; - _num_elems_read_per_iteration = 12; - _num_elems_written_per_iteration = 16 >> conv_stride_x; + num_weight_elems_read_per_row = 4 + kernel_size - 1; + num_elems_read_per_iteration = 12; + num_elems_written_per_iteration = 16 >> conv_stride_x; break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::QS8: case DataType::QS16: - _num_weight_elems_read_per_row = 8 + _kernel_size - 1; - _num_elems_read_per_iteration = 24; - _num_elems_written_per_iteration = 32 >> conv_stride_x; + num_weight_elems_read_per_row = 8 + kernel_size - 1; + num_elems_read_per_iteration = 24; + num_elems_written_per_iteration = 32 >> conv_stride_x; break; default: ARM_COMPUTE_ERROR("Data type not supported."); @@ -1121,22 +1123,81 @@ void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITens } } - // Calculate right and bottom border - const unsigned int conv_stride_y = std::get<1>(_conv_info.stride()); - const int input_width = input->info()->dimension(0); - const int input_height = input->info()->dimension(1); - const int upper_bound_w = ceil_to_multiple(((output->info()->dimension(0) - 1) * conv_stride_x + _kernel_size), _num_elems_read_per_iteration) - conv_pad_x - input_width; - const int upper_bound_h = ((output->info()->dimension(1) - 1) * conv_stride_y - conv_pad_y + _kernel_size) - input_height; - _border_size.right = std::max(upper_bound_w, static_cast(_kernel_size)); - _border_size.bottom = 
std::max(upper_bound_h, static_cast(_kernel_size)); - Window win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration)); - AccessWindowStatic input_access(input->info(), -conv_pad_x, -conv_pad_y, input_width + _border_size.right, input_height + _border_size.bottom); - AccessWindowStatic weights_access(weights->info(), 0, 0, _num_weight_elems_read_per_row, _kernel_size); - AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration); - update_window_and_padding(win, input_access, weights_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); - - INEKernel::configure(win); + const int upper_bound_w = ceil_to_multiple(((output->dimension(0) - 1) * conv_stride_x + kernel_size), num_elems_read_per_iteration) - conv_pad_x - input_width; + const int upper_bound_h = ((output->dimension(1) - 1) * conv_stride_y - conv_pad_y + kernel_size) - input_height; + border_size.right = std::max(upper_bound_w, static_cast(kernel_size)); + border_size.bottom = std::max(upper_bound_h, static_cast(kernel_size)); + Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration)); + AccessWindowStatic input_access(input, -conv_pad_x, -conv_pad_y, input_width + border_size.right, input_height + border_size.bottom); + AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size); + AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration); + bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + + Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + +NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel() + : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_weight_elems_read_per_row(0), _num_elems_read_per_iteration(0), + _num_elems_written_per_iteration(0) +{ +} + +BorderSize NEDirectConvolutionLayerKernel::border_size() const +{ + return _border_size; +} + +void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + const unsigned int conv_pad_x = std::get<0>(conv_info.pad()); + const unsigned int conv_pad_y = std::get<1>(conv_info.pad()); + + _input = input; + _weights = weights; + _output = output; + _conv_info = conv_info; + _kernel_size = weights->info()->dimension(0); + _border_size = BorderSize(conv_pad_y, conv_pad_x); + + // Get convolved dimensions + TensorShape output_shape = get_convolved_dimensions(input->info(), weights->info(), _kernel_size, conv_info); + + DataType data_type = input->info()->data_type(); + + if(is_data_type_fixed_point(data_type)) + { + // Promote data type in case of fixed point + data_type = ((data_type == DataType::QS8) ? 
DataType::QS16 : DataType::QS32); + } + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, data_type, input->info()->fixed_point_position()); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), output->info(), conv_info)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, _num_weight_elems_read_per_row, + _num_elems_read_per_iteration, _num_elems_written_per_iteration); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} + +Status NEDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info) +{ + unsigned int num_weight_elems_read_per_row = 0; + unsigned int num_elems_read_per_iteration = 0; + unsigned int num_elems_written_per_iteration = 0; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, num_weight_elems_read_per_row, num_elems_read_per_iteration, + num_elems_written_per_iteration) + .first); + + return Status{}; } void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo &info) diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp index b983609e49..776cb27d7a 100644 --- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp @@ -34,6 +34,67 @@ using namespace arm_compute; +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output); 
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, input_squared); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd"); + + if(is_data_type_fixed_point(input->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, input_squared); + ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input); + ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input); + ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input); + } + + // Checks performed when output is configured + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + } + + return Status{}; +} + +std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *input_squared, ITensorInfo *output, const NormalizationLayerInfo &norm_info) +{ + unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); + const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2); + const unsigned int num_rows = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1; + const unsigned int border_width = (norm_info.is_cross_map()) ? 
0 : std::min(norm_info.norm_size() / 2, 3U); + BorderSize border_size = BorderSize(0, border_width); + bool window_changed = false; + + // Configure window + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + + AccessWindowRectangle input_access(input, -border_size.left, 0, num_elems_read_per_iteration, num_rows); + AccessWindowRectangle input_squared_access(input_squared, -border_size.left, 0, num_elems_read_per_iteration, num_rows); + + if(output->total_size() != 0) + { + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + window_changed = update_window_and_padding(win, input_access, input_squared_access, output_access); + output_access.set_valid_region(win, input->valid_region()); + } + else + { + window_changed = update_window_and_padding(win, input_access, input_squared_access); + } + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + NENormalizationLayerKernel::NENormalizationLayerKernel() : _func(nullptr), _input(nullptr), _input_squared(nullptr), _output(nullptr), _norm_info(NormType::IN_MAP_1D), _border_size() { @@ -46,20 +107,12 @@ BorderSize NENormalizationLayerKernel::border_size() const void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_squared, output); // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared, output); - 
ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, input_squared, output); - ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd"); - if(is_data_type_fixed_point(input->info()->data_type())) - { - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, input_squared, output); - ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input); - ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input); - ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input); - } + auto_init_if_empty(*output->info(), *input->info()); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), input_squared->info(), output->info(), norm_info)); const unsigned int border_width = (norm_info.is_cross_map()) ? 0 : std::min(norm_info.norm_size() / 2, 3U); @@ -69,14 +122,10 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * _norm_info = norm_info; _border_size = BorderSize(0, border_width); - unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); - ARM_COMPUTE_UNUSED(num_elems_processed_per_iteration); - switch(_input->info()->data_type()) { case DataType::F32: { - num_elems_processed_per_iteration = 4; switch(norm_info.type()) { case NormType::IN_MAP_1D: @@ -90,14 +139,12 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * _func = &NENormalizationLayerKernel::normalize_float; break; default: - ARM_COMPUTE_ERROR("Not supported"); break; } break; } case DataType::F16: { - num_elems_processed_per_iteration = 8; switch(norm_info.type()) { case NormType::IN_MAP_1D: @@ -111,14 +158,12 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * _func = &NENormalizationLayerKernel::normalize_float; break; default: - ARM_COMPUTE_ERROR("Not supported"); break; } break; } case DataType::QS8: { - num_elems_processed_per_iteration = 16; 
switch(norm_info.type()) { case NormType::IN_MAP_1D: @@ -132,14 +177,12 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * _func = &NENormalizationLayerKernel::normalize_fixed_point; break; default: - ARM_COMPUTE_ERROR("Not supported"); break; } break; } case DataType::QS16: { - num_elems_processed_per_iteration = 8; switch(norm_info.type()) { case NormType::IN_MAP_1D: @@ -153,7 +196,6 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * _func = &NENormalizationLayerKernel::normalize_fixed_point; break; default: - ARM_COMPUTE_ERROR("Not supported"); break; } break; @@ -162,21 +204,10 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * ARM_COMPUTE_ERROR("NOT SUPPORTED!"); } - const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2); - const unsigned int num_rows = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1; - - // Configure window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowRectangle input_access(input->info(), -_border_size.left, 0, num_elems_read_per_iteration, num_rows); - AccessWindowRectangle input_squared_access(input_squared->info(), -_border_size.left, 0, num_elems_read_per_iteration, num_rows); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, input_access, input_squared_access, output_access); - - output_access.set_valid_region(win, input->info()->valid_region()); - - INEKernel::configure(win); + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), input_squared->info(), output->info(), norm_info); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); } template @@ -374,6 +405,14 @@ void NENormalizationLayerKernel::normalize_fixed_point(const Window &window) 
} } +Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), input_squared->clone().get(), output->clone().get(), norm_info).first); + + return Status{}; +} + void NENormalizationLayerKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp index 0e06704666..47372c2d5d 100644 --- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp @@ -47,6 +47,15 @@ using namespace arm_compute; namespace { +void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int pooled_w, unsigned int pooled_h) +{ + TensorShape output_shape{ input->tensor_shape() }; + output_shape.set(0, pooled_w); + output_shape.set(1, pooled_h); + + auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape)); +} + template inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) @@ -88,75 +97,77 @@ inline qint16_t calculate_avg_scale_q16(const Coordinates &id, int pool_size, in const int val = ((end_y - start_y) * (end_x - start_x)); return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position)); } -} // namespace - -NEPoolingLayerKernel::NEPoolingLayerKernel() - : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0) -{ -} -BorderSize NEPoolingLayerKernel::border_size() const +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, 
unsigned int &pooled_w, unsigned int pooled_h, int pool_size) { - return _border_size; -} + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); -void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info) -{ int pool_pad_x = 0; int pool_pad_y = 0; int pool_stride_x = 0; int pool_stride_y = 0; - unsigned int pooled_w = 0; - unsigned int pooled_h = 0; PoolingType pool_type = pool_info.pool_type(); - int pool_size = pool_info.pool_size(); const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); const bool exclude_padding = pool_info.exclude_padding(); const bool is_global_pooling = pool_info.is_global_pooling(); std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad(); std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - static const std::set supported_pool_sizes = { 2, 3 }; - ARM_COMPUTE_UNUSED(supported_pool_sizes); - ARM_COMPUTE_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_fixed_point(input->info()->data_type())); - ARM_COMPUTE_ERROR_ON((supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()) && (input->info()->data_type() != DataType::F32)); - ARM_COMPUTE_ERROR_ON(!is_global_pooling && (pool_pad_x >= pool_size || pool_pad_y >= pool_size)); - ARM_COMPUTE_ERROR_ON(is_global_pooling && (input->info()->tensor_shape().x() != input->info()->tensor_shape().y())); - ARM_COMPUTE_ERROR_ON(is_data_type_fixed_point(input->info()->data_type()) && pool_stride_x > 2); - ARM_COMPUTE_ERROR_ON(exclude_padding && is_data_type_fixed_point(input->info()->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_fixed_point(input->data_type())); + 
ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()) && (input->data_type() != DataType::F32)); + ARM_COMPUTE_RETURN_ERROR_ON(!is_global_pooling && (pool_pad_x >= pool_size || pool_pad_y >= pool_size)); + ARM_COMPUTE_RETURN_ERROR_ON(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y())); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_fixed_point(input->data_type()) && pool_stride_x > 2); + ARM_COMPUTE_RETURN_ERROR_ON(exclude_padding && is_data_type_fixed_point(input->data_type())); - // Update pool size in case of global pooling - pool_size = is_global_pooling ? input->info()->dimension(0) : pool_size; + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h)); + } - // Check output dimensions - std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), - input->info()->dimension(1), - pool_size, - pool_size, - pool_info.pad_stride_info()); + return Status{}; +} - // Output auto initialization if not yet initialized - { - TensorShape output_shape{ input->info()->tensor_shape() }; - output_shape.set(0, pooled_w); - output_shape.set(1, pooled_h); +Status validate_arguments_pool_info(const ITensorInfo *input, const PoolingLayerInfo &pool_info, const unsigned int pool_size) +{ + const bool is_global_pooling = pool_info.is_global_pooling(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()), + "Global pooling is supported only with rectangular inputs!"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_global_pooling && ((pool_info.pad_stride_info().pad().first >= pool_size) || (pool_info.pad_stride_info().pad().second >= pool_size)), + "Invalid pool size and pool pad combination!"); - auto_init_if_empty(*output->info(), 
output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); - } + return Status{}; +} - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); - ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h)); +std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &num_elems_processed_per_iteration, + BorderSize &border_size, + unsigned int pooled_w, unsigned int pooled_h, int pool_size) +{ + unsigned int num_elems_read_per_iteration = 0; + unsigned int num_elems_horizontal_window = 0; + int pool_pad_x = 0; + int pool_pad_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; + const int input_width = input->dimension(0); + const int input_height = input->dimension(1); + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad(); - unsigned int num_elems_read_per_iteration = 0; - unsigned int num_elems_processed_per_iteration = 0; - unsigned int num_elems_horizontal_window = 0; + // Check output dimensions + std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0), + input->dimension(1), + pool_size, + pool_size, + pad_stride_info); // Select element size - switch(input->info()->data_type()) + switch(input->data_type()) { case DataType::QS8: num_elems_read_per_iteration = 16; @@ -233,19 +244,89 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons break; } - _num_elems_processed_per_iteration = num_elems_processed_per_iteration; - const int input_width = input->info()->dimension(0); - const int input_height = input->info()->dimension(1); - const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width; - const int 
upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; + const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; + + border_size = BorderSize(pool_pad_y, pool_pad_x); + border_size.right = std::max(upper_bound_w, pool_pad_x); + border_size.bottom = std::max(upper_bound_h, pool_pad_y); + bool window_changed = false; + + TensorShape output_shape{ input->tensor_shape() }; + output_shape.set(0, pooled_w); + output_shape.set(1, pooled_h); + TensorInfo output_info(input->clone()->set_tensor_shape(output_shape)); + + Window win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration)); + AccessWindowStatic input_access(input, -pool_pad_x, -pool_pad_y, input_width + border_size.right, input_height + border_size.bottom); + + if(output->total_size() != 0) + { + AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window); + window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + } + else + { + window_changed = update_window_and_padding(win, input_access); + } + + Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + +NEPoolingLayerKernel::NEPoolingLayerKernel() + : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0) +{ +} + +BorderSize NEPoolingLayerKernel::border_size() const +{ + return _border_size; +} + +void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + int pool_pad_x = 0; + int pool_pad_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; + unsigned int pooled_w = 0; + unsigned int pooled_h = 0; + PoolingType pool_type = pool_info.pool_type(); + int pool_size = pool_info.pool_size(); + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); + const bool exclude_padding = pool_info.exclude_padding(); + const bool is_global_pooling = pool_info.is_global_pooling(); + std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad(); + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + + // Update pool size in case of global pooling + pool_size = is_global_pooling ? 
input->info()->dimension(0) : pool_size; + + // Validate pool info before calling scaled_dimensions + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(input->info(), pool_info, pool_size)); + + // Check output dimensions + std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), + input->info()->dimension(1), + pool_size, + pool_size, + pool_info.pad_stride_info()); + + // Output auto initialization if not yet initialized + auto_init(input->info(), output->info(), pooled_w, pooled_h); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, pool_size)); // Set instance variables - _input = input; - _output = output; - _pool_info = pool_info; - _border_size = BorderSize(pool_pad_y, pool_pad_x); - _border_size.right = std::max(upper_bound_w, pool_pad_x); - _border_size.bottom = std::max(upper_bound_h, pool_pad_y); + _input = input; + _output = output; + _pool_info = pool_info; // Select appropriate function switch(pool_size) @@ -413,12 +494,9 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons } // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_horizontal_window); - update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); - INEKernel::configure(win); + auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); } template @@ -1154,6 +1232,34 @@ 
void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window input, output); } +Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + + unsigned int pooled_w = 0; + unsigned int pooled_h = 0; + unsigned int num_elems_processed_per_iteration = 0; + BorderSize border_size(0); + + const bool is_global_pooling = pool_info.is_global_pooling(); + const unsigned int pool_size = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size(); + + // Validate pool info before calling scaled_dimensions + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(input, pool_info, pool_size)); + + // Check output dimensions + std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0), + input->dimension(1), + pool_size, + pool_size, + pool_info.pad_stride_info()); + + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, pool_size)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h, pool_size).first); + + return Status{}; +} + void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp index a8a0f59a41..b13fb0e87c 100644 --- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp +++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp @@ -42,6 +42,149 @@ using namespace arm_compute; namespace { +Status validate_arguments_logits_1d_max(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + + // Checks performed when output is 
configured + if(output->total_size() != 0) + { + // Softmax across the x dimension + TensorShape output_shape{ input->tensor_shape() }; + output_shape.set(0, 1); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); + } + + return Status{}; +} + +std::pair validate_and_configure_window_logits_1d_max(ITensorInfo *input, ITensorInfo *output) +{ + // Configure kernel window + constexpr unsigned int num_elems_written_per_row = 1; + const int input_width = input->valid_region().shape.x(); + + unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type()); + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + bool window_changed = false; + + if(output->total_size() != 0) + { + AccessWindowHorizontal output_access(output, 0, num_elems_written_per_row, 1.f / input_width); + window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + } + else + { + window_changed = update_window_and_padding(win, input_access); + } + + Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} + +Status validate_arguments_logits_1d_shift_exp_sum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum, float beta) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, max, sum, output); + ARM_COMPUTE_RETURN_ERROR_ON((beta != 1.0f) && is_data_type_fixed_point(input->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + + // Checks performed when output is configured + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output); + } + + // Checks performed when sum is configured + if(sum->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max, sum); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, max, sum); + } + + return Status{}; +} + +std::pair validate_and_configure_window_logits_1d_shift_exp_sum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum) +{ + unsigned int num_elems_processed_per_iteration = input->valid_region().shape.x(); + + // Configure kernel window + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal max_access(max, 0, 1); + AccessWindowHorizontal sum_access(sum, 0, 1); + bool window_changed = false; + + if(output->total_size() != 0) + { + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + window_changed = update_window_and_padding(win, input_access, max_access, output_access, sum_access); + 
output_access.set_valid_region(win, input->valid_region()); + } + else + { + window_changed = update_window_and_padding(win, input_access, max_access, sum_access); + } + + sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->tensor_shape())); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} + +Status validate_arguments_logits_1d_norm(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum); + + // Checks performed when output is configured + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output); + } + + return Status{}; +} + +std::pair validate_and_configure_window_logits_1d_norm(ITensorInfo *input, ITensorInfo *sum, ITensorInfo *output) +{ + // Configure kernel window + unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type()); + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowStatic sum_access(sum, 0, 0, 1, sum->dimension(1)); + bool window_changed = false; + + if(output->total_size() != 0) + { + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + window_changed = update_window_and_padding(win, input_access, sum_access, output_access); + + output_access.set_valid_region(win, 
input->valid_region()); + } + else + { + window_changed = update_window_and_padding(win, input_access, sum_access); + } + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} + void logits_1d_max_qs8(const ITensor *in, ITensor *out, const Window &window) { Window in_slice = window.first_slice_window_1D(); @@ -184,8 +327,7 @@ BorderSize NELogits1DMaxKernel::border_size() const void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Softmax across the x dimension TensorShape output_shape{ input->info()->tensor_shape() }; @@ -194,9 +336,8 @@ void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output) // Output auto initialization if not yet initialized auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(input->info(), output->info())); const int input_width = input->info()->valid_region().shape.x(); unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type()); @@ -226,17 +367,17 @@ void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output) _border_size = BorderSize(0, num_elems_processed_per_iteration - (input_width % num_elems_processed_per_iteration), 0, 0); // Configure kernel window - constexpr unsigned int num_elems_written_per_row = 1; - - Window win = 
calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_row, 1.f / input_width); - - update_window_and_padding(win, input_access, output_access); + auto win_config = validate_and_configure_window_logits_1d_max(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); +Status NELogits1DMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(input, output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_1d_max(input->clone().get(), output->clone().get()).first); - INEKernel::configure(win); + return Status{}; } void NELogits1DMaxKernel::run(const Window &window, const ThreadInfo &info) @@ -512,20 +653,14 @@ NELogits1DShiftExpSumKernel::NELogits1DShiftExpSumKernel() void NELogits1DShiftExpSumKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum, float beta) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output); - ARM_COMPUTE_ERROR_ON((beta != 1.0f) && is_data_type_fixed_point(input->info()->data_type())); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output); // Output auto initialization if not yet initialized auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum); - 
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output, max, sum); - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum); - - unsigned int num_elems_processed_per_iteration = input->info()->valid_region().shape.x(); + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_shift_exp_sum(input->info(), max->info(), output->info(), sum->info(), beta)); switch(input->info()->data_type()) { @@ -555,18 +690,17 @@ void NELogits1DShiftExpSumKernel::configure(const ITensor *input, const ITensor _beta = beta; // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal max_access(max->info(), 0, 1); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal sum_access(sum->info(), 0, 1); - - update_window_and_padding(win, input_access, max_access, output_access, sum_access); + auto win_config = validate_and_configure_window_logits_1d_shift_exp_sum(input->info(), max->info(), output->info(), sum->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} - output_access.set_valid_region(win, input->info()->valid_region()); - sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape())); +Status NELogits1DShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum, float beta) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_shift_exp_sum(input, max, output, sum, beta)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_1d_shift_exp_sum(input->clone().get(), max->clone().get(), output->clone().get(), sum->clone().get()).first); - INEKernel::configure(win); + return Status{}; } 
void NELogits1DShiftExpSumKernel::run(const Window &window, const ThreadInfo &info) @@ -717,23 +851,18 @@ NELogits1DNormKernel::NELogits1DNormKernel() void NELogits1DNormKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_NULLPTR(sum, output); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output); // Output auto initialization if not yet initialized auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_norm(input->info(), sum->info(), output->info())); _input = input; _sum = sum; _output = output; - // Configure kernel window - unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type()); - switch(input->info()->data_type()) { case DataType::QS8: @@ -755,17 +884,18 @@ void NELogits1DNormKernel::configure(const ITensor *input, const ITensor *sum, I break; } - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowStatic sum_access(sum->info(), 0, 0, 1, sum->info()->dimension(1)); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, input_access, sum_access, output_access); + // Configure kernel window + auto win_config = validate_and_configure_window_logits_1d_norm(input->info(), sum->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + 
INEKernel::configure(win_config.second); +} - output_access.set_valid_region(win, input->info()->valid_region()); +Status NELogits1DNormKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_norm(input, sum, output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_1d_norm(input->clone().get(), sum->clone().get(), output->clone().get()).first); - INEKernel::configure(win); + return Status{}; } void NELogits1DNormKernel::run(const Window &window, const ThreadInfo &info) diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp index 57a1738f85..cdf1b54659 100644 --- a/src/runtime/NEON/functions/NEActivationLayer.cpp +++ b/src/runtime/NEON/functions/NEActivationLayer.cpp @@ -34,3 +34,8 @@ void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLay k->configure(input, output, activation_info); _kernel = std::move(k); } + +Status NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) +{ + return NEActivationLayerKernel::validate(input, output, act_info); +} diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp index 52a4cc158f..2eabe459a5 100644 --- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp @@ -85,6 +85,30 @@ void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0.f))); } +Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, bias, output); + + 
DataType data_type = output->data_type(); + if(is_data_type_fixed_point(data_type)) + { + // Promote data type in case of fixed point + data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32); + } + TensorInfo accumulator(output->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type)); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3), + "Biases size and number of input feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, + "Biases should be one dimensional"); + + ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerKernel::validate(input, weights, &accumulator, conv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerBiasAccumulateKernel::validate(&accumulator, bias, output)); + + return Status{}; +} + void NEDirectConvolutionLayer::run() { NEScheduler::get().schedule(&_input_border_handler, Window::DimZ); diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp index da4314b5ed..af98ac1f17 100644 --- a/src/runtime/NEON/functions/NENormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp @@ -39,7 +39,7 @@ NENormalizationLayer::NENormalizationLayer(std::shared_ptr memor void NENormalizationLayer::configure(const ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info) { - ARM_COMPUTE_ERROR_ON(input == nullptr); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); _input_squared.allocator()->init(tensor_info); @@ -56,6 +56,17 @@ void NENormalizationLayer::configure(const ITensor *input, ITensor *output, cons _input_squared.allocator()->allocate(); } +Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const 
NormalizationLayerInfo &norm_info) +{ + // Perform validation step + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + + ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + + return Status{}; +} + void NENormalizationLayer::run() { _memory_group.acquire(); diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp index f8a85b9897..530c7fca4a 100644 --- a/src/runtime/NEON/functions/NEPoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp @@ -48,6 +48,11 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, PixelValue(static_cast(0.f))); } +Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info) +{ + return NEPoolingLayerKernel::validate(input, output, pool_info); +} + void NEPoolingLayer::run() { // Fill border diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp index 84ecfdaf33..8e6773c5b1 100644 --- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp +++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp @@ -38,7 +38,7 @@ NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr memory_manager) void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Create intermediate tensors shapes TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()); @@ -67,6 +67,23 @@ void NESoftmaxLayer::configure(ITensor *input, 
ITensor *output, float beta) _sum.allocator()->allocate(); } +Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta) +{ + // Perform validation step + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + + TensorShape max_sum_shape = input->tensor_shape(); + max_sum_shape.set(0, 1); + + TensorInfo tensor_info_max_sum(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(max_sum_shape)); + + ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum)); + ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DShiftExpSumKernel::validate(input, &tensor_info_max_sum, input, &tensor_info_max_sum, beta)); + ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DNormKernel::validate(input, &tensor_info_max_sum, output)); + + return Status{}; +} + void NESoftmaxLayer::run() { _memory_group.acquire(); -- cgit v1.2.1