From b3182b19251cd010baad8252e7607de7059ac986 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Fri, 4 Sep 2020 08:44:52 +0100 Subject: COMPMID-3157: Remove padding from NEGEMMTranspose1xWKernel - Remove padding from NEGEMMTranspose1xWKernel - Extend test for validating zero padding requirement Change-Id: I9ce4ca95a500229b045dc140cfff21fdf7373700 Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3920 Tested-by: Arm Jenkins Reviewed-by: Michalis Spyrou Comments-Addressed: Arm Jenkins --- src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp | 96 +++++++--------------- tests/validation/NEON/GEMM.cpp | 61 +++++++++----- 2 files changed, 70 insertions(+), 87 deletions(-) diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp index 951cb19679..a8adc45645 100644 --- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp @@ -60,28 +60,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) return Status{}; } - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - const unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); - - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - - // Configure window in case of configured output - if(output->total_size() != 0) - { - AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1)); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - } - - const bool window_changed = update_window_and_padding(win, input_access); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} } // namespace void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output) @@ -97,16 +75,21 @@ void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output) _input = input; _output = output; + const size_t vector_size = 16 / input->info()->element_size(); + // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + Window win = calculate_max_window(*input->info(), Steps(vector_size)); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + INEKernel::configure(win); } Status NEGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); return Status{}; } @@ -136,52 +119,29 @@ void NEGEMMTranspose1xWKernel::run(const Window &window, const ThreadInfo &info) Iterator in(_input, window); Iterator out(_output, win_out); - switch(_input->info()->element_size()) + const size_t in_width = _input->info()->dimension(0); + const size_t element_size = _input->info()->element_size(); + const size_t out_stride = _output->info()->strides_in_bytes()[1]; + const size_t vector_size = 16 / element_size; + + execute_window_loop(window, [&](const Coordinates & id) { - case 1: - { - const size_t out_stride = _output->info()->strides_in_bytes()[1]; - execute_window_loop(window, [&](const Coordinates & id) - { - // Output address = base addr + (y * 16) + (x / 16 ) * stride - const uint8_t *in_ptr = in.ptr(); - uint8_t *const out_ptr = out.ptr() + (id.y() << 4) + (id.x() >> 4) * out_stride; - vst1q_u8(out_ptr, vld1q_u8(in_ptr)); - }, - in, out); - break; - } - case 2: + const uint8_t *in_ptr = in.ptr(); + uint8_t *const out_ptr = out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride; + + for(size_t k = 0; k < vector_size; ++k) { - const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(int16_t); - execute_window_loop(window, [&](const Coordinates & id) + // If the input width is not multiple of W, we fill the reference with 0s + if((id.x() + k) >= in_width) { - // Output address = base addr + (y * 8) + (x / 8 ) * stride - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()) + (id.y() << 3) + (id.x() >> 3) * out_stride; - vst1q_u16(out_ptr, vld1q_u16(in_ptr)); - }, - in, out); - break; - } - case 4: - { - const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(float); - execute_window_loop(window, [&](const Coordinates & id) + std::memset(out_ptr + k * element_size, 0, element_size); + } + else { - // Output address = base addr + (y * 4) + (x / 4 ) * stride - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()) + (id.y() << 2) + (id.x() >> 2) * out_stride; - vst1q_u32(out_ptr, vld1q_u32(in_ptr)); - }, - in, out); - break; + std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size); + } } - default: - { - ARM_COMPUTE_ERROR("Element size not supported"); - break; - } - } + }, + in, out); } } // namespace arm_compute diff --git a/tests/validation/NEON/GEMM.cpp b/tests/validation/NEON/GEMM.cpp index 9105638a6e..dfac72f3a5 100644 --- a/tests/validation/NEON/GEMM.cpp +++ b/tests/validation/NEON/GEMM.cpp @@ -68,27 +68,23 @@ const auto data_interleave = framework::dataset::make("M", 8, 12) * framework::d const auto data_transpose = framework::dataset::make("M", 8, 14) * framework::dataset::make("N", 7, 14); /** Zero padding test */ -bool validate_zero_padding(unsigned int m_value, unsigned int k_value) +template +bool validate_zero_padding(unsigned int dim0_value, unsigned int dim1_value) { - const unsigned int M = m_value; - const unsigned int K = k_value; - - const TensorShape lhs_shape(K, M); - const TensorShape lhs_shape_reshaped(K * 4, std::ceil(M / 4.0f)); + const TensorShape in_shape(dim0_value, dim1_value); // Create tensors - Tensor lhs = create_tensor(lhs_shape, DataType::U32); - Tensor dst = create_tensor(lhs_shape_reshaped, DataType::U32); + Tensor in = create_tensor(in_shape, DataType::U32); + Tensor dst; - ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS); // Validate zero-padding - NEGEMMInterleave4x4Kernel lhs_reshape; + FunctionType func; - lhs_reshape.configure(&lhs, &dst); + func.configure(&in, &dst); - return lhs.info()->padding().empty(); + return in.info()->padding().empty(); } } // namespace @@ -97,15 +93,42 @@ TEST_SUITE(NEON) TEST_SUITE(GEMM) TEST_SUITE(TRANSPOSE_1XW) -using NEGEMMTranspose1xW = NESynthetizeFunctionWithZeroConstantBorder; -using NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture; -TEST_SUITE(FP32) -FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::F32)) +using NEGEMMTranspose1xW = NESynthetizeFunctionWithZeroConstantBorder; +DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip( + framework::dataset::make("N", { 1, 23, 63, 101 }), + framework::dataset::make("K", { 1, 47, 29, 27 })), + n_value, k_value) +{ + bool status = validate_zero_padding(n_value, k_value); + ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS); +} + +TEST_SUITE(U32) +using NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture; +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U32)) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // U32 + +TEST_SUITE(U16) +using NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture; +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U16)) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // U16 + +TEST_SUITE(U8) +using NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture; +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U8)) { // Validate output validate(Accessor(_target), _reference); } -TEST_SUITE_END() // FP32 +TEST_SUITE_END() // U8 TEST_SUITE_END() // TRANSPOSE_1XW @@ -117,7 +140,7 @@ DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip( framework::dataset::make("K", { 1, 47, 29, 27 })), m_value, k_value) { - bool status = validate_zero_padding(m_value, k_value); + bool status = validate_zero_padding(m_value, k_value); ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS); } -- cgit v1.2.1