From fb92e22c642985a5ea7906e7e7f46285d1d47718 Mon Sep 17 00:00:00 2001 From: David Mansell Date: Wed, 22 Nov 2023 11:33:46 +0000 Subject: arm_gemm: convolution: optimize convolver.hpp. The code in convolver.hpp generates pointers into either the appropriate point in the input activation tensor or the padding buffer for each kernel point of each output point of the convolution. This is done at runtime interspersed with the data transform and matrix multiplication steps. As such, it can have a significant impact on performance, particularly for low input channel counts. This change improves the performance of this code by streamlining the checks for out of range input points (which must be directed to the padding buffer). The previous implementation checked all four borders for every point. The revised code does the checks one at a time, and for any failing check applies the result to as many output points as possible without repeating the other checks. Signed-off-by: David Mansell Change-Id: I36a4fa114b425c1bcba2be40acf36718522519f5 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11004 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir --- src/core/NEON/kernels/arm_gemm/convolver.hpp | 88 ++++++++++++++++++++++++---- 1 file changed, 76 insertions(+), 12 deletions(-) (limited to 'src/core/NEON/kernels') diff --git a/src/core/NEON/kernels/arm_gemm/convolver.hpp b/src/core/NEON/kernels/arm_gemm/convolver.hpp index 879d95f5bb..b15f669132 100644 --- a/src/core/NEON/kernels/arm_gemm/convolver.hpp +++ b/src/core/NEON/kernels/arm_gemm/convolver.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020,2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -103,11 +103,15 @@ private: return (m_length_remaining == 0); } + // Compute a block of output pointers, accounting for padding. + // This is performance critical. 
std::tuple next_block(const T ** const row_ptr) { if (finished()) { return std::make_tuple(0, 0); } + const T *pad_ptr = m_convolver.m_pad_row.data(); + // "in_width" in the amount of data that will be read in (copied) // "out_width" is the total amount of data that will be produced (including padding) unsigned int offset = (m_current_pos == m_parent.m_start_pos) ? m_parent.m_start_offset : 0; @@ -117,23 +121,83 @@ private: unsigned int output_y = m_start_output_y; unsigned int output_x = m_start_output_x; - for (unsigned int row=0; row= m_convolver.m_params.input_height || input_x < 0 || input_x >= m_convolver.m_params.input_width) { - row_ptr[row] = m_convolver.m_pad_row.data(); - } else { - row_ptr[row] = m_parent.m_input_base + ((input_y * m_convolver.m_params.input_width) + input_x) * m_parent.m_input_stride; + // Factor out base pointer computation. + const T *base_ptr = m_parent.m_input_base + + (input_y * m_convolver.m_params.input_width * m_parent.m_input_stride); + + // To start with, check the input row is in-bounds. If + // not, (at least) this entire output row must be + // padding so handle accordingly. + + // If input_y is off the bottom of the input, we are + // going to get padding for every remaining output + // point. + if (input_y >= m_convolver.m_params.input_height) { + while (row < m_active_height) { + row_ptr[row++] = pad_ptr; + } + break; } - output_x++; - if (output_x == m_convolver.m_params.output_width) { - output_y++; - output_x=0; + // If input_y is less than zero, we are going to get + // padding for the rest of this output row. + if (input_y < 0) { + while (output_x < m_convolver.m_params.output_width && row