aboutsummaryrefslogtreecommitdiff
path: root/src/core
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2017-11-16 19:24:39 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:35:24 +0000
commita3b1b469276b10484cd45901ae3a4b48b506caa9 (patch)
tree8c91176708bdede785edbb98c73ce0a479dff243 /src/core
parentfc35b51d598d12e2a0895ed82d2368f07df68829 (diff)
downloadComputeLibrary-a3b1b469276b10484cd45901ae3a4b48b506caa9.tar.gz
COMPMID-667: Add validation static method to NEON GEMMlowp
Change-Id: I8a470cc1351593ad8eeaf4ec92e04865e83d4f3c Reviewed-on: http://mpd-gerrit.cambridge.arm.com/96147 Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core')
-rw-r--r--src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp76
-rw-r--r--src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp92
-rw-r--r--src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp79
-rw-r--r--src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp151
-rw-r--r--src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp101
-rw-r--r--src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp107
-rw-r--r--src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp92
-rw-r--r--src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp60
8 files changed, 526 insertions, 232 deletions
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index a29b661a00..1f4d9b176e 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -40,6 +40,50 @@ using namespace arm_compute;
namespace
{
+Error validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
+ DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ if(output->total_size() != 0)
+ {
+ TensorShape output_shape = input->tensor_shape();
+ output_shape.set(0, input->dimension(0) * 4);
+ output_shape.set(1, std::ceil(input->dimension(1) / 4.0f));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ }
+
+ return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ unsigned int num_elems_processed_per_iteration_x = (input->element_size() == 1) ? 8 : 4;
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+ bool window_changed = false;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ window_changed = window_changed || update_window_and_padding(win, input_access);
+
+ // Configure window in case of configured output
+ if(output->total_size() != 0)
+ {
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f);
+ window_changed = window_changed || update_window_and_padding(win, output_access);
+ output_access.set_valid_region(win, input->valid_region());
+ }
+
+ Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+ return std::make_pair(err, win);
+}
+
void gemm_interleave_8bit_elements(const ITensor *input, ITensor *output, const Window &window)
{
const size_t in_stride = input->info()->strides_in_bytes()[1];
@@ -132,10 +176,7 @@ NEGEMMInterleave4x4Kernel::NEGEMMInterleave4x4Kernel()
void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
- DataType::F16,
- DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
TensorShape output_shape = input->info()->tensor_shape();
output_shape.set(0, input->info()->dimension(0) * 4);
@@ -144,21 +185,16 @@ void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output)
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
_input = input;
_output = output;
- unsigned int num_elems_processed_per_iteration_x = 4;
- constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-
switch(input->info()->element_size())
{
case 1:
- num_elems_processed_per_iteration_x = 8;
- _func = &gemm_interleave_8bit_elements;
+ _func = &gemm_interleave_8bit_elements;
break;
case 2:
_func = &gemm_interleave_16bit_elements;
@@ -172,15 +208,17 @@ void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output)
}
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f);
- AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- update_window_and_padding(win, output_access, input_access);
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
- output_access.set_valid_region(win, input->info()->valid_region());
+Error NEGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
- INEKernel::configure(win);
+ return Error{};
}
void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp b/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp
index 2a4a46e76c..e971dcba8e 100644
--- a/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp
@@ -40,6 +40,60 @@ using namespace arm_compute;
namespace
{
+TensorShape get_output_shape(const ITensorInfo *input, unsigned int block_height)
+{
+ TensorShape output_shape = input->tensor_shape();
+ const float interleave_by_f32 = block_height;
+ output_shape.set(0, input->dimension(0) * interleave_by_f32);
+ output_shape.set(1, std::ceil(static_cast<float>(input->dimension(1)) / interleave_by_f32));
+ return output_shape;
+}
+
+Error validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int block_width, unsigned int block_height)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_height < 1, "Block height must be greater than 0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_width < 1, "Block window must be greater than 0");
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, block_height));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ }
+
+ return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int block_width, unsigned int block_height)
+{
+ const unsigned int num_elems_processed_per_iteration_x = block_width;
+ const unsigned int num_elems_processed_per_iteration_y = block_height;
+ bool window_changed = false;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ const float scaley_factor = 1.f / block_height;
+
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ window_changed = window_changed || update_window_and_padding(win, input_access);
+
+ // Configure window in case of configured output
+ if(output->total_size() != 0)
+ {
+ AccessWindowRectangle output_access(output,
+ 0, 0,
+ num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y,
+ 1, num_elems_processed_per_iteration_y, scaley_factor);
+ window_changed = window_changed || update_window_and_padding(win, output_access);
+ output_access.set_valid_region(win, input->valid_region());
+ }
+
+ Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+ return std::make_pair(err, win);
+}
+
inline void gemm_interleave_blocked_transposed_8bit(const ITensor *input, ITensor *output, const Window &window, unsigned int block_width, unsigned int block_height)
{
const size_t in_stride = input->info()->strides_in_bytes()[1];
@@ -122,20 +176,13 @@ NEGEMMInterleaveBlockedKernel::NEGEMMInterleaveBlockedKernel()
void NEGEMMInterleaveBlockedKernel::configure(const ITensor *input, ITensor *output, unsigned int block_height, unsigned int block_width, bool transpose)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_ERROR_ON_MSG(block_height < 1, "Block height must be greater than 0");
- ARM_COMPUTE_ERROR_ON_MSG(block_width < 1, "Block window must be greater than 0");
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = input->info()->tensor_shape();
- const float interleave_by_f32 = block_height;
- output_shape.set(0, input->info()->dimension(0) * interleave_by_f32);
- output_shape.set(1, std::ceil(static_cast<float>(input->info()->dimension(1)) / interleave_by_f32));
// Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ auto_init_if_empty(*output->info(), get_output_shape(input->info(), block_height), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_width, block_height));
_input = input;
_output = output;
@@ -143,20 +190,19 @@ void NEGEMMInterleaveBlockedKernel::configure(const ITensor *input, ITensor *out
_block_width = block_width;
_transpose = transpose;
- const unsigned int num_elems_processed_per_iteration_x = block_width;
- const unsigned int num_elems_processed_per_iteration_y = block_height;
-
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- const float scaley_factor = 1.f / interleave_by_f32;
-
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, num_elems_processed_per_iteration_y, scaley_factor);
- AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- update_window_and_padding(win, output_access, input_access);
+ auto win_config = validate_and_configure_window(input->info(), output->info(), block_width, block_height);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
- output_access.set_valid_region(win, input->info()->valid_region());
+Error NEGEMMInterleaveBlockedKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int block_height, unsigned int block_width, bool transpose)
+{
+ ARM_COMPUTE_UNUSED(transpose);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_width, block_height));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), block_width, block_height).first);
- INEKernel::configure(win);
+ return Error{};
}
void NEGEMMInterleaveBlockedKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index 5f052f797d..2bc251e91f 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -45,6 +45,48 @@ namespace arm_compute
class Coordinates;
} // namespace arm_compute
+namespace
+{
+Error validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+ TensorShape in0_shape = input0->tensor_shape();
+ TensorShape in1_shape = input1->tensor_shape();
+ TensorShape out_shape = output->tensor_shape();
+
+ in0_shape.collapse(2);
+ in1_shape.collapse(2);
+ out_shape.collapse(2);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1");
+
+ return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+{
+ constexpr unsigned int num_elems_processed_per_iteration_x = 16;
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowStatic in0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), input0->dimension(1));
+ AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration_x);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ bool window_changed = update_window_and_padding(win, in0_access, in1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+ Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel()
: _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true)
{
@@ -52,42 +94,29 @@ NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel()
void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
- // Check if matrix B should be slidden or not
- // Don't slide matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
- TensorShape in0_shape = input0->info()->tensor_shape();
TensorShape in1_shape = input1->info()->tensor_shape();
- TensorShape out_shape = output->info()->tensor_shape();
-
- in0_shape.collapse(2);
in1_shape.collapse(2);
- out_shape.collapse(2);
-
- ARM_COMPUTE_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor");
- ARM_COMPUTE_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1");
_input0 = input0;
_input1 = input1;
_output = output;
_slide_matrix_b = in1_shape[2] != 1;
- constexpr unsigned int num_elems_processed_per_iteration_x = 16;
- constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic in0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), 8), input0->info()->dimension(1));
- AccessWindowHorizontal in1_access(input1->info(), 0, num_elems_processed_per_iteration_x);
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
- update_window_and_padding(win, in0_access, in1_access, output_access);
+Error NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
- INEKernel::configure(win);
+ return Error{};
}
void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, size_t out_stride, const Window &window)
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
index bd550db54c..62f4014acb 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
@@ -44,106 +44,131 @@ namespace arm_compute
class Coordinates;
} // namespace arm_compute
-NEGEMMLowpOffsetContributionKernel::NEGEMMLowpOffsetContributionKernel()
- : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true)
+namespace
{
-}
-
-void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
+Error validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
+ int32_t a_offset, int32_t b_offset)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
// If a_offset == 0, vector_sum_col can be a nullptr
if(a_offset != 0)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON(vector_sum_col->info()->dimension(0) != mm_result->info()->dimension(0));
-
- TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape();
- vector_sum_col_shape.collapse(1);
-
- // Check if vector_sum_col_shape should be slidden or not
- // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1
- // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
- _slide_vector_sum_col = vector_sum_col_shape[1] != 1;
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
}
// If b_offset == 0, vector_sum_row can be a nullptr
if(b_offset != 0)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON(vector_sum_row->info()->dimension(0) != mm_result->info()->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->dimension(0) != mm_result->dimension(1));
- TensorShape output_shape = mm_result->info()->tensor_shape();
- TensorShape vector_sum_row_shape = vector_sum_row->info()->tensor_shape();
+ TensorShape output_shape = mm_result->tensor_shape();
+ TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
vector_sum_row_shape.collapse(1);
output_shape.collapse(2);
- ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2],
+ "mm_result tensor must have the same number of batches of output tensor");
if(a_offset != 0)
{
- TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape();
+ TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
vector_sum_col_shape.collapse(1);
- ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1
- && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+ "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
}
}
- _vector_sum_col = vector_sum_col;
- _vector_sum_row = vector_sum_row;
- _mm_result = mm_result;
- _a_offset = a_offset;
- _b_offset = b_offset;
- _k_offset = a_offset * b_offset * k;
+ return Error{};
+}
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row,
+ int32_t a_offset, int32_t b_offset)
+{
constexpr unsigned int num_elems_processed_per_iteration = 16;
+ bool window_changed = false;
// Configure kernel window
- Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal mm_result_access(mm_result->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win,
+ mm_result_access);
- // Accordingly with a_offset and b_offset, we can have 4 cases:
- // a_offset != 0 && b_offset != 0
- // a_offset = 0 && b_offset != 0
- // a_offset != 0 && b_offset = 0
- // a_offset = 0 && b_offset = 0
- if(a_offset != 0 && b_offset != 0)
+ if(a_offset != 0)
{
- AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0);
- AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win,
- vector_sum_col_access,
- vector_sum_row_access,
- mm_result_access);
+ AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win,
+ vector_sum_col_access);
}
- else if(a_offset == 0 && b_offset != 0)
+ if(b_offset != 0)
{
- AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0);
-
- update_window_and_padding(win,
- vector_sum_row_access,
- mm_result_access);
+ AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT
+ window_changed = window_changed || update_window_and_padding(win,
+ vector_sum_row_access);
}
- else if(a_offset != 0 && b_offset == 0)
- {
- AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win,
- vector_sum_col_access,
- mm_result_access);
- }
- else
+ Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+NEGEMMLowpOffsetContributionKernel::NEGEMMLowpOffsetContributionKernel()
+ : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true)
+{
+}
+
+void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
+ vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT
+ vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT
+ a_offset, b_offset)); // NOLINT
+
+ _vector_sum_col = vector_sum_col;
+ _vector_sum_row = vector_sum_row;
+ _mm_result = mm_result;
+ _a_offset = a_offset;
+ _b_offset = b_offset;
+ _k_offset = a_offset * b_offset * k;
+
+ // If a_offset == 0, vector_sum_col can be a nullptr
+ if(a_offset != 0)
{
- update_window_and_padding(win,
- mm_result_access);
+ TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); // NOLINT
+ vector_sum_col_shape.collapse(1);
+
+ // Check if vector_sum_col_shape should be slidden or not
+ // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1
+ // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
+ _slide_vector_sum_col = vector_sum_col_shape[1] != 1;
}
- INEKernel::configure(win);
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(mm_result->info(),
+ vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT
+ vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT
+ a_offset, b_offset);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+Error NEGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
+ int32_t a_offset, int32_t b_offset)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(),
+ vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+ vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+ a_offset, b_offset)
+ .first); // NOLINT
+
+ return Error{};
}
void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
index 26aaa2a9d5..670b11fe67 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
@@ -40,6 +40,50 @@ using namespace arm_compute;
namespace
{
+Error validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(max > 255);
+ ARM_COMPUTE_RETURN_ERROR_ON(min < 0 || min > max);
+
+ // Check biases if exist
+ if(bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
+ }
+ return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
+{
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win,
+ input_access,
+ output_result_access);
+
+ if(bias != nullptr)
+ {
+ AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
+ window_changed = window_changed || update_window_and_padding(win, bias_access);
+ }
+
+ output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+ Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+ return std::make_pair(err, win);
+}
+
inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int)
{
// Add the offset terms to GEMM's result
@@ -185,17 +229,13 @@ NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::NEGEMMLowpQuantizeDownInt32ToUint
void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
- ARM_COMPUTE_ERROR_ON(max > 255);
- ARM_COMPUTE_ERROR_ON(min < 0 || min > max);
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != bias->info()->dimension(0));
- }
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
+ (bias != nullptr) ? bias->info() : nullptr,
+ output->info(),
+ min,
+ max));
_input = input;
_bias = bias;
@@ -206,34 +246,25 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *inp
_min = min;
_max = max;
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
// Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_result_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win,
- input_access,
- output_result_access);
-
- if(bias != nullptr)
- {
- AccessWindowStatic bias_access(bias->info(), 0, 0, ceil_to_multiple(bias->info()->dimension(0), num_elems_processed_per_iteration), bias->info()->tensor_shape()[1]);
-
- update_window_and_padding(win,
- bias_access);
- }
-
- output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
-
- INEKernel::configure(win);
+ auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+ // Check if we need to clamp the result using min and max
const bool is_bounded_relu = ((min != max) && !(min == 0 && max == 255));
+ _func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<false>;
+}
- // Check if we need to clamp the result using min and max
- _func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<false>;
+Error NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+ (bias != nullptr) ? bias->clone().get() : nullptr,
+ output->clone().get())
+ .first);
+
+ return Error{};
}
void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
index 81d9b5bb81..a8395a15cb 100644
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
@@ -44,6 +44,59 @@ namespace arm_compute
class Coordinates;
} // namespace arm_compute
+namespace
+{
+Error validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+ return Error{};
+}
+std::pair<Error, Window> validate_and_configure_window_matrix_a_reduction(ITensorInfo *input, ITensorInfo *output, bool is_reshaped)
+{
+ const unsigned int num_elems_processed_per_iteration = is_reshaped ? 4 : 1;
+
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+ Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+ return std::make_pair(err, win);
+}
+
+Error validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+ return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window_matrix_b_reduction(ITensorInfo *input, ITensorInfo *output)
+{
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+ Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
: _input(), _output(), _k(0), _is_reshaped(false)
{
@@ -51,29 +104,28 @@ INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_a, 1, DataType::QASYMM8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
_input = mtx_a;
_output = vector_sum_row;
_k = num_mtx_a_cols;
_is_reshaped = is_interleaved4x4;
- const unsigned int num_elems_processed_per_iteration = _is_reshaped ? 4 : 1;
-
// Configure kernel window
- Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1));
- AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win,
- input_access,
- output_access);
+ auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info(), _is_reshaped);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape()));
+Error NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4)
+{
+ ARM_COMPUTE_UNUSED(num_mtx_a_cols);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get(), is_interleaved4x4).first);
- INEKernel::configure(win);
+ return Error{};
}
void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
@@ -200,29 +252,28 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf
void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_b, 1, DataType::QASYMM8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));
_input = mtx_b;
_output = vector_sum_col;
_k = num_mtx_b_rows;
_is_reshaped = is_transposed1xW;
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
// Configure kernel window
- Window win = calculate_max_window(*vector_sum_col->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1));
- AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win,
- input_access,
- output_access);
+ auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape()));
+Error NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW)
+{
+ ARM_COMPUTE_UNUSED(num_mtx_b_rows);
+ ARM_COMPUTE_UNUSED(is_transposed1xW);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first);
- INEKernel::configure(win);
+ return Error{};
}
void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index 7f83144e12..7d79c664d1 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -41,45 +41,87 @@
using namespace arm_compute;
-void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
+namespace
+{
+TensorShape get_output_shape(const ITensorInfo *input)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
- DataType::F16,
- DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ TensorShape output_shape{ input->tensor_shape() };
+ const size_t transpose_w = 16 / input->element_size();
+ output_shape.set(0, input->dimension(1) * transpose_w);
+ output_shape.set(1, static_cast<size_t>(std::ceil((input->dimension(0) / static_cast<float>(transpose_w)))));
+ return output_shape;
+}
- TensorShape output_shape{ input->info()->tensor_shape() };
- const size_t transpose_w = 16 / input->info()->element_size();
- output_shape.set(0, input->info()->dimension(1) * transpose_w);
- output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
+Error validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
+ DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
- // Output tensor auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ }
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ return Error{};
+}
- const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
const int scale_x = num_elems_processed_per_iteration;
-
- _input = input;
- _output = output;
+ bool window_changed = false;
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
ARM_COMPUTE_ERROR_ON_MSG((win.x().end() / scale_x) == 0, "Transposed shape would be 0 in the second dimension");
- AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, input_access);
- update_window_and_padding(win,
- AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
- output_access);
+ // Configure window in case of configured output
+ if(output->total_size() != 0)
+ {
+ AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
+ window_changed = window_changed || update_window_and_padding(win, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape()));
+ }
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->info()->tensor_shape()));
+ Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Output tensor auto inizialitation if not yet initialized
+ auto_init_if_empty(*output->info(), get_output_shape(input->info()), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+Error NEGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
- INEKernel::configure(win);
+ return Error{};
}
void NEGEMMTranspose1xWKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
index ae711af89a..a025cac82d 100644
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
+++ b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
@@ -48,13 +48,47 @@ namespace arm_compute
// Enable only if compiled for AArch64-V8.2-A targets
#ifdef ARM_COMPUTE_AARCH64_V8_2
+namespace
+{
+using namespace arm_compute;
+
+Error validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+
+ return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+{
+ // Configure kernel window
+ Window win = calculate_max_window(*output);
+
+ AccessWindowRectangle output_access(output, 0, 0, 12, 8);
+
+ const int input0_access_end = ceil_to_multiple(input0->tensor_shape().x(), 8);
+ const int input1_access_end = ceil_to_multiple(input1->tensor_shape().x(), 12);
+
+ bool window_changed = update_window_and_padding(win,
+ AccessWindowStatic(input0, 0, 0, input0_access_end, input0->tensor_shape().y()),
+ AccessWindowStatic(input1, 0, 0, input1_access_end, input1->tensor_shape().y()),
+ output_access);
+
+ Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
namespace arm_compute
{
void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
_input0 = input0;
_input1 = input1;
@@ -66,19 +100,17 @@ void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, cons
_transform_1 = transform_1;
// Configure kernel window
- Window win = calculate_max_window(*output->info());
-
- AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8);
-
- const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 8);
- const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
- update_window_and_padding(win,
- AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
- AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
- output_access);
+Error NEGEMMLowpAArch64V8P4Kernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
- INEKernel::configure(win);
+ return Error{};
}
void NEGEMMLowpAArch64V8P4Kernel::run(const Window &window, const ThreadInfo &info)