diff options
Diffstat (limited to 'src/core')
8 files changed, 526 insertions, 232 deletions
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp index a29b661a00..1f4d9b176e 100644 --- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp +++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp @@ -40,6 +40,50 @@ using namespace arm_compute; namespace { +Error validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8, + DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + + if(output->total_size() != 0) + { + TensorShape output_shape = input->tensor_shape(); + output_shape.set(0, input->dimension(0) * 4); + output_shape.set(1, std::ceil(input->dimension(1) / 4.0f)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + } + + return Error{}; +} + +std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + unsigned int num_elems_processed_per_iteration_x = (input->element_size() == 1) ? 8 : 4; + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + bool window_changed = false; + + // Configure kernel window + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + window_changed = window_changed || update_window_and_padding(win, input_access); + + // Configure window in case of configured output + if(output->total_size() != 0) + { + AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f); + window_changed = window_changed || update_window_and_padding(win, output_access); + output_access.set_valid_region(win, input->valid_region()); + } + + Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{}; + return std::make_pair(err, win); +} + void gemm_interleave_8bit_elements(const ITensor *input, ITensor *output, const Window &window) { const size_t in_stride = input->info()->strides_in_bytes()[1]; @@ -132,10 +176,7 @@ NEGEMMInterleave4x4Kernel::NEGEMMInterleave4x4Kernel() void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, - DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); TensorShape output_shape = input->info()->tensor_shape(); output_shape.set(0, input->info()->dimension(0) * 4); @@ -144,21 +185,16 @@ void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output) // Output auto inizialitation if not yet initialized auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); _input = input; _output = output; - unsigned int num_elems_processed_per_iteration_x = 4; - constexpr unsigned int num_elems_processed_per_iteration_y = 4; - switch(input->info()->element_size()) { case 1: - num_elems_processed_per_iteration_x = 8; - _func = &gemm_interleave_8bit_elements; + _func = &gemm_interleave_8bit_elements; break; case 2: _func = &gemm_interleave_16bit_elements; @@ -172,15 +208,17 @@ void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output) } // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f); - AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - update_window_and_padding(win, output_access, input_access); + auto win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} - output_access.set_valid_region(win, input->info()->valid_region()); +Error NEGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); - INEKernel::configure(win); + return Error{}; } void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info) diff --git a/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp b/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp index 2a4a46e76c..e971dcba8e 100644 --- a/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp @@ -40,6 +40,60 @@ using namespace arm_compute; namespace { +TensorShape get_output_shape(const ITensorInfo *input, unsigned int block_height) +{ + TensorShape output_shape = input->tensor_shape(); + const float interleave_by_f32 = block_height; + output_shape.set(0, input->dimension(0) * interleave_by_f32); + output_shape.set(1, std::ceil(static_cast<float>(input->dimension(1)) / interleave_by_f32)); + return output_shape; +} + +Error validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int block_width, unsigned int block_height) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_height < 1, "Block height must be greater than 0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_width < 1, "Block window must be greater than 0"); + + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, block_height)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + } + + return Error{}; +} + +std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int block_width, unsigned int block_height) +{ + const unsigned int num_elems_processed_per_iteration_x = block_width; + const unsigned int num_elems_processed_per_iteration_y = block_height; + bool window_changed = false; + + // Configure kernel window + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + const float scaley_factor = 1.f / block_height; + + AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + window_changed = window_changed || update_window_and_padding(win, input_access); + + // Configure window in case of configured output + if(output->total_size() != 0) + { + AccessWindowRectangle output_access(output, + 0, 0, + num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, + 1, num_elems_processed_per_iteration_y, scaley_factor); + window_changed = window_changed || update_window_and_padding(win, output_access); + output_access.set_valid_region(win, input->valid_region()); + } + + Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{}; + return std::make_pair(err, win); +} + inline void gemm_interleave_blocked_transposed_8bit(const ITensor *input, ITensor *output, const Window &window, unsigned int block_width, unsigned int block_height) { const size_t in_stride = input->info()->strides_in_bytes()[1]; @@ -122,20 +176,13 @@ NEGEMMInterleaveBlockedKernel::NEGEMMInterleaveBlockedKernel() void NEGEMMInterleaveBlockedKernel::configure(const ITensor *input, ITensor *output, unsigned int block_height, unsigned int block_width, bool transpose) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_ERROR_ON_MSG(block_height < 1, "Block height must be greater than 0"); - ARM_COMPUTE_ERROR_ON_MSG(block_width < 1, "Block window must be greater than 0"); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = input->info()->tensor_shape(); - const float interleave_by_f32 = block_height; - output_shape.set(0, input->info()->dimension(0) * interleave_by_f32); - output_shape.set(1, std::ceil(static_cast<float>(input->info()->dimension(1)) / interleave_by_f32)); // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + auto_init_if_empty(*output->info(), get_output_shape(input->info(), block_height), 1, input->info()->data_type(), input->info()->fixed_point_position()); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_width, block_height)); _input = input; _output = output; @@ -143,20 +190,19 @@ void NEGEMMInterleaveBlockedKernel::configure(const ITensor *input, ITensor *out _block_width = block_width; _transpose = transpose; - const unsigned int num_elems_processed_per_iteration_x = block_width; - const unsigned int num_elems_processed_per_iteration_y = block_height; - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - const float scaley_factor = 1.f / interleave_by_f32; - - AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, num_elems_processed_per_iteration_y, scaley_factor); - AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - update_window_and_padding(win, output_access, input_access); + auto win_config = validate_and_configure_window(input->info(), output->info(), block_width, block_height); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} - output_access.set_valid_region(win, input->info()->valid_region()); +Error NEGEMMInterleaveBlockedKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int block_height, unsigned int block_width, bool transpose) +{ + ARM_COMPUTE_UNUSED(transpose); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_width, block_height)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), block_width, block_height).first); - INEKernel::configure(win); + return Error{}; } void NEGEMMInterleaveBlockedKernel::run(const Window &window, const ThreadInfo &info) diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp index 5f052f797d..2bc251e91f 100644 --- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp @@ -45,6 +45,48 @@ namespace arm_compute class Coordinates; } // namespace arm_compute +namespace +{ +Error validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + + TensorShape in0_shape = input0->tensor_shape(); + TensorShape in1_shape = input1->tensor_shape(); + TensorShape out_shape = output->tensor_shape(); + + in0_shape.collapse(2); + in1_shape.collapse(2); + out_shape.collapse(2); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1"); + + return Error{}; +} + +std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output) +{ + constexpr unsigned int num_elems_processed_per_iteration_x = 16; + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowStatic in0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), input0->dimension(1)); + AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration_x); + AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + bool window_changed = update_window_and_padding(win, in0_access, in1_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); + + Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{}; + return std::make_pair(err, win); +} +} // namespace + NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel() : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true) { @@ -52,42 +94,29 @@ NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel() void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info())); - // Check if matrix B should be slidden or not - // Don't slide matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - TensorShape in0_shape = input0->info()->tensor_shape(); TensorShape in1_shape = input1->info()->tensor_shape(); - TensorShape out_shape = output->info()->tensor_shape(); - - in0_shape.collapse(2); in1_shape.collapse(2); - out_shape.collapse(2); - - ARM_COMPUTE_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor"); - ARM_COMPUTE_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1"); _input0 = input0; _input1 = input1; _output = output; _slide_matrix_b = in1_shape[2] != 1; - constexpr unsigned int num_elems_processed_per_iteration_x = 16; - constexpr unsigned int num_elems_processed_per_iteration_y = 4; - - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowStatic in0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), 8), input0->info()->dimension(1)); - AccessWindowHorizontal in1_access(input1->info(), 0, num_elems_processed_per_iteration_x); - AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + // Configure kernel window + auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} - update_window_and_padding(win, in0_access, in1_access, output_access); +Error NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first); - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); - INEKernel::configure(win); + return Error{}; } void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, size_t out_stride, const Window &window) diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp index bd550db54c..62f4014acb 100644 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp @@ -44,106 +44,131 @@ namespace arm_compute class Coordinates; } // namespace arm_compute -NEGEMMLowpOffsetContributionKernel::NEGEMMLowpOffsetContributionKernel() - : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true) +namespace { -} - -void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) +Error validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, + int32_t a_offset, int32_t b_offset) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); // If a_offset == 0, vector_sum_col can be a nullptr if(a_offset != 0) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON(vector_sum_col->info()->dimension(0) != mm_result->info()->dimension(0)); - - TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); - vector_sum_col_shape.collapse(1); - - // Check if vector_sum_col_shape should be slidden or not - // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - _slide_vector_sum_col = vector_sum_col_shape[1] != 1; + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); } // If b_offset == 0, vector_sum_row can be a nullptr if(b_offset != 0) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON(vector_sum_row->info()->dimension(0) != mm_result->info()->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->dimension(0) != mm_result->dimension(1)); - TensorShape output_shape = mm_result->info()->tensor_shape(); - TensorShape vector_sum_row_shape = vector_sum_row->info()->tensor_shape(); + TensorShape output_shape = mm_result->tensor_shape(); + TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); vector_sum_row_shape.collapse(1); output_shape.collapse(2); - ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], + "mm_result tensor must have the same number of batches of output tensor"); if(a_offset != 0) { - TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); + TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse(1); - ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 - && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); } } - _vector_sum_col = vector_sum_col; - _vector_sum_row = vector_sum_row; - _mm_result = mm_result; - _a_offset = a_offset; - _b_offset = b_offset; - _k_offset = a_offset * b_offset * k; + return Error{}; +} +std::pair<Error, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, + int32_t a_offset, int32_t b_offset) +{ constexpr unsigned int num_elems_processed_per_iteration = 16; + bool window_changed = false; // Configure kernel window - Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration)); + Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal mm_result_access(mm_result->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration); + window_changed = window_changed || update_window_and_padding(win, + mm_result_access); - // Accordingly with a_offset and b_offset, we can have 4 cases: - // a_offset != 0 && b_offset != 0 - // a_offset = 0 && b_offset != 0 - // a_offset != 0 && b_offset = 0 - // a_offset = 0 && b_offset = 0 - if(a_offset != 0 && b_offset != 0) + if(a_offset != 0) { - AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0); - AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - vector_sum_col_access, - vector_sum_row_access, - mm_result_access); + AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration); + window_changed = window_changed || update_window_and_padding(win, + vector_sum_col_access); } - else if(a_offset == 0 && b_offset != 0) + if(b_offset != 0) { - AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0); - - update_window_and_padding(win, - vector_sum_row_access, - mm_result_access); + AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT + window_changed = window_changed || update_window_and_padding(win, + vector_sum_row_access); } - else if(a_offset != 0 && b_offset == 0) - { - AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, - vector_sum_col_access, - mm_result_access); - } - else + Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{}; + return std::make_pair(err, win); +} +} // namespace + +NEGEMMLowpOffsetContributionKernel::NEGEMMLowpOffsetContributionKernel() + : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true) +{ +} + +void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(), + vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT + vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT + a_offset, b_offset)); // NOLINT + + _vector_sum_col = vector_sum_col; + _vector_sum_row = vector_sum_row; + _mm_result = mm_result; + _a_offset = a_offset; + _b_offset = b_offset; + _k_offset = a_offset * b_offset * k; + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) { - update_window_and_padding(win, - mm_result_access); + TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); // NOLINT + vector_sum_col_shape.collapse(1); + + // Check if vector_sum_col_shape should be slidden or not + // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + _slide_vector_sum_col = vector_sum_col_shape[1] != 1; } - INEKernel::configure(win); + // Configure kernel window + auto win_config = validate_and_configure_window(mm_result->info(), + vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT + vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT + a_offset, b_offset); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} + +Error NEGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, + int32_t a_offset, int32_t b_offset) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(), + vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, + a_offset, b_offset) + .first); // NOLINT + + return Error{}; } void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadInfo &info) diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp index 26aaa2a9d5..670b11fe67 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp @@ -40,6 +40,50 @@ using namespace arm_compute; namespace { +Error validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(max > 255); + ARM_COMPUTE_RETURN_ERROR_ON(min < 0 || min > max); + + // Check biases if exist + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); + } + return Error{}; +} + +std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output) +{ + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, + input_access, + output_result_access); + + if(bias != nullptr) + { + AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]); + window_changed = window_changed || update_window_and_padding(win, bias_access); + } + + output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); + + Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{}; + return std::make_pair(err, win); +} + inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int) { // Add the offset terms to GEMM's result @@ -185,17 +229,13 @@ NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::NEGEMMLowpQuantizeDownInt32ToUint void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); - ARM_COMPUTE_ERROR_ON(max > 255); - ARM_COMPUTE_ERROR_ON(min < 0 || min > max); - - if(bias != nullptr) - { - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); - ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != bias->info()->dimension(0)); - } + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), + (bias != nullptr) ? bias->info() : nullptr, + output->info(), + min, + max)); _input = input; _bias = bias; @@ -206,34 +246,25 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *inp _min = min; _max = max; - constexpr unsigned int num_elems_processed_per_iteration = 16; - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_result_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - input_access, - output_result_access); - - if(bias != nullptr) - { - AccessWindowStatic bias_access(bias->info(), 0, 0, ceil_to_multiple(bias->info()->dimension(0), num_elems_processed_per_iteration), bias->info()->tensor_shape()[1]); - - update_window_and_padding(win, - bias_access); - } - - output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); - - INEKernel::configure(win); + auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); + // Check if we need to clamp the result using min and max const bool is_bounded_relu = ((min != max) && !(min == 0 && max == 255)); + _func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<false>; +} - // Check if we need to clamp the result using min and max - _func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<false>; +Error NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), + (bias != nullptr) ? bias->clone().get() : nullptr, + output->clone().get()) + .first); + + return Error{}; } void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, const ThreadInfo &info) diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp index 81d9b5bb81..a8395a15cb 100644 --- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp @@ -44,6 +44,59 @@ namespace arm_compute class Coordinates; } // namespace arm_compute +namespace +{ +Error validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + + return Error{}; +} +std::pair<Error, Window> validate_and_configure_window_matrix_a_reduction(ITensorInfo *input, ITensorInfo *output, bool is_reshaped) +{ + const unsigned int num_elems_processed_per_iteration = is_reshaped ? 4 : 1; + + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1)); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); + + Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{}; + return std::make_pair(err, win); +} + +Error validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + + return Error{}; +} + +std::pair<Error, Window> validate_and_configure_window_matrix_b_reduction(ITensorInfo *input, ITensorInfo *output) +{ + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1)); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); + + Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{}; + return std::make_pair(err, win); +} +} // namespace + INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel() : _input(), _output(), _k(0), _is_reshaped(false) { @@ -51,29 +104,28 @@ INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel() void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_a, 1, DataType::QASYMM8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info())); _input = mtx_a; _output = vector_sum_row; _k = num_mtx_a_cols; _is_reshaped = is_interleaved4x4; - const unsigned int num_elems_processed_per_iteration = _is_reshaped ? 4 : 1; - // Configure kernel window - Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1)); - AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - input_access, - output_access); + auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info(), _is_reshaped); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape())); +Error NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) +{ + ARM_COMPUTE_UNUSED(num_mtx_a_cols); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get(), is_interleaved4x4).first); - INEKernel::configure(win); + return Error{}; } void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info) @@ -200,29 +252,28 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_b, 1, DataType::QASYMM8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info())); _input = mtx_b; _output = vector_sum_col; _k = num_mtx_b_rows; _is_reshaped = is_transposed1xW; - constexpr unsigned int num_elems_processed_per_iteration = 16; - // Configure kernel window - Window win = calculate_max_window(*vector_sum_col->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1)); - AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - input_access, - output_access); + auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape())); +Error NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) +{ + ARM_COMPUTE_UNUSED(num_mtx_b_rows); + ARM_COMPUTE_UNUSED(is_transposed1xW); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first); - INEKernel::configure(win); + return Error{}; } void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info) diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp index 7f83144e12..7d79c664d1 100644 --- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp @@ -41,45 +41,87 @@ using namespace arm_compute; -void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output) +namespace +{ +TensorShape get_output_shape(const ITensorInfo *input) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, - DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_NULLPTR(output); + TensorShape output_shape{ input->tensor_shape() }; + const size_t transpose_w = 16 / input->element_size(); + output_shape.set(0, input->dimension(1) * transpose_w); + output_shape.set(1, static_cast<size_t>(std::ceil((input->dimension(0) / static_cast<float>(transpose_w))))); + return output_shape; +} - TensorShape output_shape{ input->info()->tensor_shape() }; - const size_t transpose_w = 16 / input->info()->element_size(); - output_shape.set(0, input->info()->dimension(1) * transpose_w); - output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w))))); +Error validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8, + DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + } - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + return Error{}; +} - const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); +std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + const unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); const int scale_x = num_elems_processed_per_iteration; - - _input = input; - _output = output; + bool window_changed = false; // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); ARM_COMPUTE_ERROR_ON_MSG((win.x().end() / scale_x) == 0, "Transposed shape would be 0 in the second dimension"); - AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + window_changed = window_changed || update_window_and_padding(win, input_access); - update_window_and_padding(win, - AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), - output_access); + // Configure window in case of configured output + if(output->total_size() != 0) + { + AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x); + window_changed = window_changed || update_window_and_padding(win, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape())); + } - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->info()->tensor_shape())); + Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{}; + return std::make_pair(err, win); +} +} // namespace + +void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Output tensor auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), get_output_shape(input->info()), 1, input->info()->data_type(), input->info()->fixed_point_position()); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + + _input = input; + _output = output; + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} + +Error NEGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); - INEKernel::configure(win); + return Error{}; } void NEGEMMTranspose1xWKernel::run(const Window &window, const ThreadInfo &info) diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp index ae711af89a..a025cac82d 100644 --- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp +++ b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp @@ -48,13 +48,47 @@ namespace arm_compute // Enable only if compiled for AArch64-V8.2-A targets #ifdef ARM_COMPUTE_AARCH64_V8_2 +namespace +{ +using namespace arm_compute; + +Error validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); + + return Error{}; +} + +std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output) +{ + // Configure kernel window + Window win = calculate_max_window(*output); + + AccessWindowRectangle output_access(output, 0, 0, 12, 8); + + const int input0_access_end = ceil_to_multiple(input0->tensor_shape().x(), 8); + const int input1_access_end = ceil_to_multiple(input1->tensor_shape().x(), 12); + + bool window_changed = update_window_and_padding(win, + AccessWindowStatic(input0, 0, 0, input0_access_end, input0->tensor_shape().y()), + AccessWindowStatic(input1, 0, 0, input1_access_end, input1->tensor_shape().y()), + output_access); + + Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{}; + return std::make_pair(err, win); +} +} // namespace + namespace arm_compute { void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info())); _input0 = input0; _input1 = input1; @@ -66,19 +100,17 @@ void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, cons _transform_1 = transform_1; // Configure kernel window - Window win = calculate_max_window(*output->info()); - - AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8); - - const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 8); - const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12); + auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} - update_window_and_padding(win, - AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()), - AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()), - output_access); +Error NEGEMMLowpAArch64V8P4Kernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first); - INEKernel::configure(win); + return Error{}; } void NEGEMMLowpAArch64V8P4Kernel::run(const Window &window, const ThreadInfo &info) |