From feaea101da17b383fe85440b0820132d0e0fa97d Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Thu, 3 Sep 2020 13:20:34 +0100
Subject: COMPMID-3143: Remove padding from NEGEMMInterleave4x4Kernel

- Remove padding from NEGEMMInterleave4x4Kernel
- Extend test for validating zero padding requirement

Change-Id: I94abc271e005f9dd6e1721b185631f55f598dbfd
Signed-off-by: Gian Marco Iodice
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3915
Reviewed-by: Michele Di Giorgio
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
---
 .../core/NEON/kernels/NEGEMMInterleave4x4Kernel.h  |  25 ++-
 .../NEON/kernels/NEGEMMInterleave4x4Kernel.cpp     | 186 ++++++++-------------
 tests/validation/NEON/GEMM.cpp                     |  61 ++++++-
 3 files changed, 143 insertions(+), 129 deletions(-)

diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
index 7ddbf4bca8..322932bab2 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -77,15 +77,26 @@ public:
     void run(const Window &window, const ThreadInfo &info) override;
 
 private:
-    /** Common signature for all the transpose functions
+    /** Template function to run gemm interleave 4x4
      *
-     * @param[in]  input  An input tensor. Data types supported: All
-     * @param[out] output The output tensor. Data type supported: same as @p input
-     * @param[in]  window Region on which to execute the kernel.
+     * @tparam ScalarType Scalar datatype
+     *
+     * @param[in]  input  Input tensor. Data types supported: uint32_t, uint16_t and uint8_t
+     * @param[out] output Output tensor. Data types supported: uint32_t, uint16_t and uint8_t
+     * @param[in]  window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename ScalarType>
+    void gemm_interleave4x4(const ITensor *input, ITensor *output, const Window &window);
+
+    /** Common signature for all the specialised gemm interleave 4x4 functions
+     *
+     * @param[in]  input  Input tensor. Data types supported: uint32_t, uint16_t and uint8_t
+     * @param[out] output Output tensor. Data types supported: uint32_t, uint16_t and uint8_t
+     * @param[in]  window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
      */
-    using GEMMInterleaveFunction = void(const ITensor *input, ITensor *output, const Window &window);
+    using GEMMInterleaveFunctionFuncPtr = void (NEGEMMInterleave4x4Kernel::*)(const ITensor *input, ITensor *output, const Window &window);
 
-    GEMMInterleaveFunction *_func; /**< GEMM interleave function to use for the particular tensor types passed to configure() */
+    GEMMInterleaveFunctionFuncPtr _func;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H*/
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index 3d178316c6..8b4ad0da23 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -61,113 +61,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 
     return Status{};
 }
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-    unsigned int           num_elems_processed_per_iteration_x = (input->element_size() == 1) ? 8 : 4;
-    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-    bool                   window_changed                      = false;
-
-    // Configure kernel window
-    Window                win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-    window_changed = window_changed || update_window_and_padding(win, input_access);
-
-    // Configure window in case of configured output
-    if(output->total_size() != 0)
-    {
-        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f);
-        window_changed = window_changed || update_window_and_padding(win, output_access);
-        output_access.set_valid_region(win, input->valid_region());
-    }
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-
-void gemm_interleave_8bit_elements(const ITensor *input, ITensor *output, const Window &window)
-{
-    const size_t in_stride = input->info()->strides_in_bytes()[1];
-
-    // Set window for output tensor
-    Window win_out(window);
-    win_out.scale(Window::DimY, 0.25f);
-    Iterator in(input, window);
-
-    win_out.set_dimension_step(Window::DimX, 32);
-    Iterator out(output, win_out);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x8x4_t data =
-        {
-            {
-                vld1_u8(in.ptr() + 0 * in_stride),
-                vld1_u8(in.ptr() + 1 * in_stride),
-                vld1_u8(in.ptr() + 2 * in_stride),
-                vld1_u8(in.ptr() + 3 * in_stride),
-            }
-        };
-        vst4_u8(out.ptr(), data);
-    },
-    in, out);
-}
-
-void gemm_interleave_16bit_elements(const ITensor *input, ITensor *output, const Window &window)
-{
-    const size_t in_stride = input->info()->strides_in_bytes()[1];
-
-    // Set window for output tensor
-    Window win_out(window);
-    win_out.scale(Window::DimY, 0.25f);
-    Iterator in(input, window);
-
-    win_out.set_dimension_step(Window::DimX, 16);
-    Iterator out(output, win_out);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint16x4x4_t data =
-        {
-            {
-                vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 0 * in_stride)),
-                vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 1 * in_stride)),
-                vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 2 * in_stride)),
-                vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 3 * in_stride)),
-            }
-        };
-        vst4_u16(reinterpret_cast<uint16_t *>(out.ptr()), data);
-    },
-    in, out);
-}
-
-void gemm_interleave_32bit_elements(const ITensor *input, ITensor *output, const Window &window)
-{
-    const size_t in_stride = input->info()->strides_in_bytes()[1];
-
-    // Set window for output tensor
-    Window win_out(window);
-    win_out.scale(Window::DimY, 0.25f);
-    Iterator in(input, window);
-
-    win_out.set_dimension_step(Window::DimX, 16);
-    Iterator out(output, win_out);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint32x4x4_t data =
-        {
-            {
-                vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 0 * in_stride)),
-                vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 1 * in_stride)),
-                vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 2 * in_stride)),
-                vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 3 * in_stride))
-            }
-        };
-        vst4q_u32(reinterpret_cast<uint32_t *>(out.ptr()), data);
-    },
-    in, out);
-}
 } // namespace
 
 NEGEMMInterleave4x4Kernel::NEGEMMInterleave4x4Kernel()
@@ -191,33 +84,92 @@ void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output)
     switch(input->info()->element_size())
     {
         case 1:
-            _func = &gemm_interleave_8bit_elements;
+            _func = &NEGEMMInterleave4x4Kernel::gemm_interleave4x4<uint8_t>;
             break;
         case 2:
-            _func = &gemm_interleave_16bit_elements;
+            _func = &NEGEMMInterleave4x4Kernel::gemm_interleave4x4<uint16_t>;
            break;
         case 4:
-            _func = &gemm_interleave_32bit_elements;
+            _func = &NEGEMMInterleave4x4Kernel::gemm_interleave4x4<uint32_t>;
             break;
         default:
             ARM_COMPUTE_ERROR_ON("Element size not supported");
             break;
     }
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
+    Window win = calculate_max_window(*input->info(), Steps(1, 4));
+
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
 }
 
 Status NEGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
 
     return Status{};
 }
 
+template <typename ScalarType>
+void NEGEMMInterleave4x4Kernel::gemm_interleave4x4(const ITensor *input, ITensor *output, const Window &window)
+{
+    const size_t window_start_x = window.x().start();
+    const size_t window_end_x   = window.x().end();
+
+    const size_t in_height = input->info()->dimension(1);
+    const size_t in_stride = input->info()->strides_in_bytes()[1];
+
+    const size_t partial_y = in_height % 4;
+
+    // Set window for the input tensor
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Set window for the output tensor
+    Window win_out(window);
+    win_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+    win_out.scale(Window::DimY, 0.25f);
+
+    Iterator in(input, win);
+    Iterator out(output, win_out);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        if(id.y() + 4 <= static_cast<int>(in_height))
+        {
+            for(size_t x = window_start_x; x < window_end_x; ++x)
+            {
+                const ScalarType data[4] =
+                {
+                    *(reinterpret_cast<const ScalarType *>(in.ptr() + 0 * in_stride) + x),
+                    *(reinterpret_cast<const ScalarType *>(in.ptr() + 1 * in_stride) + x),
+                    *(reinterpret_cast<const ScalarType *>(in.ptr() + 2 * in_stride) + x),
+                    *(reinterpret_cast<const ScalarType *>(in.ptr() + 3 * in_stride) + x),
+                };
+                std::memcpy(out.ptr() + x * 4 * sizeof(ScalarType), data, 4 * sizeof(ScalarType));
+            }
+        }
+        else
+        {
+            for(size_t x = window_start_x; x < window_end_x; ++x)
+            {
+                ScalarType data[4] = { 0, 0, 0, 0 };
+
+                for(size_t y = 0; y < partial_y; ++y)
+                {
+                    data[y] = *(reinterpret_cast<const ScalarType *>(in.ptr() + y * in_stride) + x);
+                }
+
+                std::memcpy(out.ptr() + x * 4 * sizeof(ScalarType), data, 4 * sizeof(ScalarType));
+            }
+        }
+    },
+    in, out);
+}
+
 void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
@@ -233,5 +185,5 @@ void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info
      *
      * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
      */
-    (*_func)(_input, _output, window);
+    (this->*_func)(_input, _output, window);
 }
diff --git a/tests/validation/NEON/GEMM.cpp b/tests/validation/NEON/GEMM.cpp
index f817390780..9105638a6e 100644
--- a/tests/validation/NEON/GEMM.cpp
+++ b/tests/validation/NEON/GEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -67,6 +67,30 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
 const auto data_interleave = framework::dataset::make("M", 8, 12) * framework::dataset::make("N", 8, 12);
 const auto data_transpose  = framework::dataset::make("M", 8, 14) * framework::dataset::make("N", 7, 14);
 
+/** Zero padding test */
+bool validate_zero_padding(unsigned int m_value, unsigned int k_value)
+{
+    const unsigned int M = m_value;
+    const unsigned int K = k_value;
+
+    const TensorShape lhs_shape(K, M);
+    const TensorShape lhs_shape_reshaped(K * 4, std::ceil(M / 4.0f));
+
+    // Create tensors
+    Tensor lhs = create_tensor<Tensor>(lhs_shape, DataType::U32);
+    Tensor dst = create_tensor<Tensor>(lhs_shape_reshaped, DataType::U32);
+
+    ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Validate zero-padding
+    NEGEMMInterleave4x4Kernel lhs_reshape;
+
+    lhs_reshape.configure(&lhs, &dst);
+
+    return lhs.info()->padding().empty();
+}
+
 } // namespace
 
 TEST_SUITE(NEON)
@@ -88,14 +112,41 @@ TEST_SUITE_END() // TRANSPOSE_1XW
 TEST_SUITE(INTERLEAVE_4X4)
 using NEGEMMInterleave4x4 = NESynthetizeFunctionWithZeroConstantBorder<NEGEMMInterleave4x4Kernel, 4>;
 
-TEST_SUITE(FP32)
-using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, float>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::F32))
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(
+                   framework::dataset::make("M", { 1, 23, 63, 101 }),
+                   framework::dataset::make("K", { 1, 47, 29, 27 })),
+               m_value, k_value)
+{
+    bool status = validate_zero_padding(m_value, k_value);
+    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
+}
+
+TEST_SUITE(U32)
+using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, uint32_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::U32))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END() // FP32
+TEST_SUITE_END() // U32
+
+TEST_SUITE(U16)
+using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, uint16_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::U16))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // U16
+
+TEST_SUITE(U8)
+using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, uint8_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::QASYMM8))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // U8
 
 TEST_SUITE_END() // INTERLEAVE_4X4
-- 
cgit v1.2.1
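
Note on the reshape this patch rewrites: the kernel walks the input in groups of 4 rows and, for each column, writes the 4 vertically adjacent elements contiguously, so an input with M rows and K columns becomes ceil(M / 4) rows of K * 4 interleaved elements; a partial final group is zero-padded, which is what lets the kernel stop demanding padded tensors. The following is a minimal standalone sketch of that layout, not Compute Library code; the function name interleave4x4, the plain std::vector representation, and the src/dst names are illustrative assumptions.

#include <cstddef>
#include <vector>

// Scalar model of the 4x4 interleave: src is height x width, row-major.
// Each output "super-row" packs 4 input rows; the destination index is
// super_row * (width * 4) + column * 4 + lane, with lane = y % 4.
// Rows past the input height stay zero, mirroring the partial_y path in
// NEGEMMInterleave4x4Kernel::gemm_interleave4x4 above.
template <typename ScalarType>
std::vector<ScalarType> interleave4x4(const std::vector<ScalarType> &src, size_t width, size_t height)
{
    const size_t out_rows = (height + 3) / 4; // ceil(height / 4.0f)
    std::vector<ScalarType> dst(out_rows * width * 4, ScalarType(0)); // zero padding for a partial group

    for(size_t y = 0; y < height; ++y)
    {
        for(size_t x = 0; x < width; ++x)
        {
            dst[(y / 4) * width * 4 + x * 4 + (y % 4)] = src[y * width + x];
        }
    }
    return dst;
}

For example, with M = 23 and K = 47 (one of the ValidateZeroPadding cases), the result has ceil(23 / 4) = 6 super-rows of 47 * 4 elements, the last super-row zero-filled in its fourth lane, which is exactly the lhs_shape_reshaped(K * 4, std::ceil(M / 4.0f)) shape the test constructs.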