From feaea101da17b383fe85440b0820132d0e0fa97d Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Thu, 3 Sep 2020 13:20:34 +0100 Subject: COMPMID-3143: Remove padding from NEGEMMInterleave4x4Kernel - Remove padding from NEGEMMInterleave4x4Kernel - Extend test for validating zero padding requirement Change-Id: I94abc271e005f9dd6e1721b185631f55f598dbfd Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3915 Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- .../NEON/kernels/NEGEMMInterleave4x4Kernel.cpp | 186 ++++++++------------- 1 file changed, 69 insertions(+), 117 deletions(-) (limited to 'src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp') diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp index 3d178316c6..8b4ad0da23 100644 --- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp +++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,113 +61,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) return Status{}; } - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - unsigned int num_elems_processed_per_iteration_x = (input->element_size() == 1) ? 8 : 4; - constexpr unsigned int num_elems_processed_per_iteration_y = 4; - bool window_changed = false; - - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - window_changed = window_changed || update_window_and_padding(win, input_access); - - // Configure window in case of configured output - if(output->total_size() != 0) - { - AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f); - window_changed = window_changed || update_window_and_padding(win, output_access); - output_access.set_valid_region(win, input->valid_region()); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - -void gemm_interleave_8bit_elements(const ITensor *input, ITensor *output, const Window &window) -{ - const size_t in_stride = input->info()->strides_in_bytes()[1]; - - // Set window for output tensor - Window win_out(window); - win_out.scale(Window::DimY, 0.25f); - Iterator in(input, window); - - win_out.set_dimension_step(Window::DimX, 32); - Iterator out(output, win_out); - - execute_window_loop(window, [&](const Coordinates &) - { - const uint8x8x4_t data = - { - { - vld1_u8(in.ptr() + 0 * in_stride), - vld1_u8(in.ptr() + 1 * in_stride), - vld1_u8(in.ptr() + 2 * in_stride), - vld1_u8(in.ptr() + 3 * in_stride), - } - }; - vst4_u8(out.ptr(), data); - }, - in, out); -} - -void gemm_interleave_16bit_elements(const ITensor *input, ITensor *output, const Window &window) -{ - const size_t in_stride = input->info()->strides_in_bytes()[1]; - - // Set window for output tensor - Window win_out(window); - win_out.scale(Window::DimY, 0.25f); - Iterator in(input, window); - - win_out.set_dimension_step(Window::DimX, 16); - Iterator out(output, win_out); - - execute_window_loop(window, [&](const Coordinates &) - { - const uint16x4x4_t data = - { - { - vld1_u16(reinterpret_cast(in.ptr() + 0 * in_stride)), - vld1_u16(reinterpret_cast(in.ptr() + 1 * in_stride)), - vld1_u16(reinterpret_cast(in.ptr() + 2 * in_stride)), - vld1_u16(reinterpret_cast(in.ptr() + 3 * in_stride)), - } - }; - vst4_u16(reinterpret_cast(out.ptr()), data); - }, - in, out); -} - -void gemm_interleave_32bit_elements(const ITensor *input, ITensor *output, const Window &window) -{ - const size_t in_stride = input->info()->strides_in_bytes()[1]; - - // Set window for output tensor - Window win_out(window); - win_out.scale(Window::DimY, 0.25f); - Iterator in(input, window); - - win_out.set_dimension_step(Window::DimX, 16); - Iterator out(output, win_out); - - execute_window_loop(window, [&](const Coordinates &) - { - const uint32x4x4_t data = - { - { - vld1q_u32(reinterpret_cast(in.ptr() + 0 * in_stride)), - vld1q_u32(reinterpret_cast(in.ptr() + 1 * in_stride)), - vld1q_u32(reinterpret_cast(in.ptr() + 2 * in_stride)), - vld1q_u32(reinterpret_cast(in.ptr() + 3 * in_stride)) - } - }; - vst4q_u32(reinterpret_cast(out.ptr()), data); - }, - in, out); -} } // namespace NEGEMMInterleave4x4Kernel::NEGEMMInterleave4x4Kernel() @@ -191,33 +84,92 @@ void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output) switch(input->info()->element_size()) { case 1: - _func = &gemm_interleave_8bit_elements; + _func = &NEGEMMInterleave4x4Kernel::gemm_interleave4x4; break; case 2: - _func = &gemm_interleave_16bit_elements; + _func = &NEGEMMInterleave4x4Kernel::gemm_interleave4x4; break; case 4: - _func = &gemm_interleave_32bit_elements; + _func = &NEGEMMInterleave4x4Kernel::gemm_interleave4x4; break; default: ARM_COMPUTE_ERROR_ON("Element size not supported"); break; } - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + Window win = calculate_max_window(*input->info(), Steps(1, 4)); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + INEKernel::configure(win); } Status NEGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); return Status{}; } +template +void NEGEMMInterleave4x4Kernel::gemm_interleave4x4(const ITensor *input, ITensor *output, const Window &window) +{ + const size_t window_start_x = window.x().start(); + const size_t window_end_x = window.x().end(); + + const size_t in_height = input->info()->dimension(1); + const size_t in_stride = input->info()->strides_in_bytes()[1]; + + const size_t partial_y = in_height % 4; + + // Set window for the input tensor + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Set window for the output tensor + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_out.scale(Window::DimY, 0.25f); + + Iterator in(input, win); + Iterator out(output, win_out); + + execute_window_loop(win, [&](const Coordinates & id) + { + if(id.y() + 4 <= static_cast(in_height)) + { + for(size_t x = window_start_x; x < window_end_x; ++x) + { + const ScalarType data[4] = + { + *(reinterpret_cast(in.ptr() + 0 * in_stride) + x), + *(reinterpret_cast(in.ptr() + 1 * in_stride) + x), + *(reinterpret_cast(in.ptr() + 2 * in_stride) + x), + *(reinterpret_cast(in.ptr() + 3 * in_stride) + x), + }; + std::memcpy(out.ptr() + x * 4 * sizeof(ScalarType), data, 4 * sizeof(ScalarType)); + } + } + else + { + for(size_t x = window_start_x; x < window_end_x; ++x) + { + ScalarType data[4] = { 0, 0, 0, 0 }; + + for(size_t y = 0; y < partial_y; ++y) + { + data[y] = *(reinterpret_cast(in.ptr() + y * in_stride) + x); + } + + std::memcpy(out.ptr() + x * 4 * sizeof(ScalarType), data, 4 * sizeof(ScalarType)); + } + } + }, + in, out); +} + void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); @@ -233,5 +185,5 @@ void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info * * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ] */ - (*_func)(_input, _output, window); + (this->*_func)(_input, _output, window); } -- cgit v1.2.1