From 7c850d5ca1aa0dd255eed794134535ece7849f96 Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio <michele.digiorgio@arm.com>
Date: Fri, 4 Sep 2020 15:01:15 +0100
Subject: COMPMID-3155: Remove padding from NEGEMMLowpOffsetContributionKernel

Change-Id: I93c3b795cf6fe0b27008543b6671a3be0a965603
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3916
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 .../kernels/NEGEMMLowpOffsetContributionKernel.cpp | 342 +++++++++++----------
 1 file changed, 178 insertions(+), 164 deletions(-)

diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
index db6cb10995..5710897329 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
@@ -34,16 +34,9 @@
 #include "arm_compute/core/Window.h"
 
 #include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute;
 
 namespace arm_compute
 {
-class Coordinates;
-} // namespace arm_compute
-
 namespace
 {
 Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
@@ -96,42 +89,22 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
 
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row,
-                                                        int32_t a_offset, int32_t b_offset)
-{
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    bool                   window_changed                    = false;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration);
-    window_changed = window_changed || update_window_and_padding(win, mm_result_access);
-
-    if(a_offset != 0)
-    {
-        AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration);
-        window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access);
-    }
-    if(b_offset != 0)
-    {
-        AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT
-        window_changed = window_changed || update_window_and_padding(win, vector_sum_row_access);
-    }
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-
 void run_offset_contribution(const Window &window,
                              ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row,
                              int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, bool is_gemm3d)
 {
     Window collapsed_window = window.collapse_if_possible(window, Window::DimZ);
+    collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
 
     const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0;
     const int depth_input  = is_gemm3d ? mm_result->info()->dimension(2) : 1;
 
+    const int window_start_x = window.x().start();
+    const int window_end_x   = window.x().end();
+    const int window_step_x  = 16;
+
+    Iterator mm_result_it(mm_result, collapsed_window);
+
     if((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true
     {
         // Set window for vector_sum_col
@@ -147,7 +120,6 @@ void run_offset_contribution(const Window &window,
 
         Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col);
         Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row);
-        Iterator mm_result_it(mm_result, window);
 
         const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
 
@@ -156,67 +128,86 @@ void run_offset_contribution(const Window &window,
         execute_window_loop(collapsed_window, [&](const Coordinates & id)
         {
-            const int batch_id = id.z() / depth_input;
-            const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
-
-            // Compute the leftover term due to a_offset.
-            int32x4x4_t a_offset_term_s32 =
-            {
-                {
-                    vld1q_s32(vector_sum_col_ptr + 0),
-                    vld1q_s32(vector_sum_col_ptr + 4),
-                    vld1q_s32(vector_sum_col_ptr + 8),
-                    vld1q_s32(vector_sum_col_ptr + 12)
-                }
-            };
-
-            a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
-            a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
-            a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
-            a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+            const int batch_id           = id.z() / depth_input;
+            auto      vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+            auto      mm_result_ptr      = reinterpret_cast<int32_t *>(mm_result_it.ptr());
 
             // Compute the leftover term due to b_offset.
-            int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y()
-                                                        + (id.z() % depth_input) * height_input);
-            b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset);
+            int32_t b_offset_term_s32 = *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input);
+            b_offset_term_s32 *= b_offset;
+
+            const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
 
-            // Add a_offset_term_s32 and b_offset_term_s32
-            int32x4x4_t offset_term_s32 =
+            int x = window_start_x;
+            for(; x < (window_end_x - window_step_x); x += window_step_x)
             {
+                // Compute the leftover term due to a_offset.
+                int32x4x4_t a_offset_term_s32 =
                 {
-                    vdupq_n_s32(k_offset),
-                    vdupq_n_s32(k_offset),
-                    vdupq_n_s32(k_offset),
-                    vdupq_n_s32(k_offset)
-                }
-            };
-
-            offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32));
-            offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32));
-            offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32));
-            offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32));
-
-            int32x4x4_t in_s32 =
-            {
+                    {
+                        vld1q_s32(vector_sum_col_ptr + x + 0),
+                        vld1q_s32(vector_sum_col_ptr + x + 4),
+                        vld1q_s32(vector_sum_col_ptr + x + 8),
+                        vld1q_s32(vector_sum_col_ptr + x + 12)
+                    }
+                };
+
+                a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+                a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+                a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+                a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+
+                // Add a_offset_term_s32 and b_offset_term_s32
+                int32x4x4_t offset_term_s32 =
+                {
+                    {
+                        vdupq_n_s32(k_offset),
+                        vdupq_n_s32(k_offset),
+                        vdupq_n_s32(k_offset),
+                        vdupq_n_s32(k_offset)
+                    }
+                };
+
+                offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec));
+                offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec));
+                offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec));
+                offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec));
+
+                int32x4x4_t in_s32 =
                 {
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 0),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 4),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 8),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 12)
-                }
-            };
-
-            // Add the offset terms to GEMM's result
-            in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]);
-            in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]);
-            in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]);
-            in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]);
-
-            // Store the result with the offset contribution
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 0, in_s32.val[0]);
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 4, in_s32.val[1]);
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 8, in_s32.val[2]);
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 12, in_s32.val[3]);
+                    {
+                        vld1q_s32(mm_result_ptr + x + 0),
+                        vld1q_s32(mm_result_ptr + x + 4),
+                        vld1q_s32(mm_result_ptr + x + 8),
+                        vld1q_s32(mm_result_ptr + x + 12)
+                    }
+                };
+
+                // Add the offset terms to GEMM's result
+                in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]);
+                in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]);
+                in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]);
+                in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]);
+
+                // Store the result with the offset contribution
+                vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
+                vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
+                vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
+                vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
+            }
+
+            // Left-overs loop
+            for(; x < window_end_x; ++x)
+            {
+                // Compute the leftover term due to a_offset.
+                int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
+
+                a_offset_term_s32 *= a_offset;
+
+                // Add the offset terms to GEMM's result
+                // Store the result with the offset contribution
+                mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32;
+            }
         },
         vector_sum_col_it, vector_sum_row_it, mm_result_it);
     }
@@ -231,40 +222,53 @@ void run_offset_contribution(const Window &window,
         win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
 
         Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row);
-        Iterator mm_result_it(mm_result, window);
 
         const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
 
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(collapsed_window, [&](const Coordinates & id)
         {
-            const int batch_id = id.z() / depth_input;
+            const int batch_id      = id.z() / depth_input;
+            auto      mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
 
             // Compute the leftover term due to b_offset.
-            int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y()
-                                                        + (id.z() % depth_input) * height_input);
-            b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset);
+            int32_t b_offset_term_s32 = *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input);
+            b_offset_term_s32 *= b_offset;
 
-            int32x4x4_t in_s32 =
+            const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
+
+            int x = window_start_x;
+            for(; x < (window_end_x - window_step_x); x += window_step_x)
             {
+                int32x4x4_t in_s32 =
                 {
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 0),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 4),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 8),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 12)
-                }
-            };
-
-            // Add the offset terms to GEMM's result
-            in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32);
-            in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32);
-            in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32);
-            in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32);
-
-            // Store the result with the offset contribution
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 0, in_s32.val[0]);
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 4, in_s32.val[1]);
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 8, in_s32.val[2]);
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 12, in_s32.val[3]);
+                    {
+                        vld1q_s32(mm_result_ptr + x + 0),
+                        vld1q_s32(mm_result_ptr + x + 4),
+                        vld1q_s32(mm_result_ptr + x + 8),
+                        vld1q_s32(mm_result_ptr + x + 12)
+                    }
+                };
+
+                // Add the offset terms to GEMM's result
+                in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec);
+                in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec);
+                in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec);
+                in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec);
+
+                // Store the result with the offset contribution
+                vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
+                vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
+                vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
+                vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
+            }
+
+            // Left-overs loop
+            for(; x < window_end_x; ++x)
+            {
+                // Add the offset terms to GEMM's result
+                // Store the result with the offset contribution
+                mm_result_ptr[x] += b_offset_term_s32;
+            }
         },
         vector_sum_row_it, mm_result_it);
     }
@@ -276,53 +280,68 @@ void run_offset_contribution(const Window &window,
         win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
 
         Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col);
-        Iterator mm_result_it(mm_result, window);
 
         // Offset in case vector_sum_col is batched
        const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
 
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(collapsed_window, [&](const Coordinates & id)
         {
-            const int batch_id = id.z() / depth_input;
-            const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+            const int batch_id           = id.z() / depth_input;
+            auto      vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+            auto      mm_result_ptr      = reinterpret_cast<int32_t *>(mm_result_it.ptr());
 
-            // Compute the leftover term due to a_offset.
-            int32x4x4_t a_offset_term_s32 =
+            int x = window_start_x;
+            for(; x < (window_end_x - window_step_x); x += window_step_x)
             {
+                // Compute the leftover term due to a_offset.
+                int32x4x4_t a_offset_term_s32 =
                 {
-                    vld1q_s32(vector_sum_col_ptr + 0),
-                    vld1q_s32(vector_sum_col_ptr + 4),
-                    vld1q_s32(vector_sum_col_ptr + 8),
-                    vld1q_s32(vector_sum_col_ptr + 12)
-                }
-            };
-
-            a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
-            a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
-            a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
-            a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
-
-            int32x4x4_t in_s32 =
-            {
+                    {
+                        vld1q_s32(vector_sum_col_ptr + x + 0),
+                        vld1q_s32(vector_sum_col_ptr + x + 4),
+                        vld1q_s32(vector_sum_col_ptr + x + 8),
+                        vld1q_s32(vector_sum_col_ptr + x + 12)
+                    }
+                };
+
+                a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+                a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+                a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+                a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+
+                int32x4x4_t in_s32 =
                 {
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 0),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 4),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 8),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 12)
-                }
-            };
-
-            // Add the offset terms to GEMM's result
-            in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]);
-            in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]);
-            in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]);
-            in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]);
-
-            // Store the result with the offset contribution
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 0, in_s32.val[0]);
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 4, in_s32.val[1]);
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 8, in_s32.val[2]);
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 12, in_s32.val[3]);
+                    {
+                        vld1q_s32(mm_result_ptr + x + 0),
+                        vld1q_s32(mm_result_ptr + x + 4),
+                        vld1q_s32(mm_result_ptr + x + 8),
+                        vld1q_s32(mm_result_ptr + x + 12)
+                    }
+                };
+
+                // Add the offset terms to GEMM's result
+                in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]);
+                in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]);
+                in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]);
+                in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]);
+
+                // Store the result with the offset contribution
+                vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
+                vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
+                vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
+                vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
+            }
+
+            // Left-overs loop
+            for(; x < window_end_x; ++x)
+            {
+                // Compute the leftover term due to a_offset.
+                const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
+
+                // Add the offset terms to GEMM's result
+                // Store the result with the offset contribution
+                mm_result_ptr[x] += a_offset_term_s32 * a_offset;
+            }
         },
         vector_sum_col_it, mm_result_it);
     }
@@ -365,23 +384,17 @@ void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITe
     }
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(mm_result->info(),
-                                                    vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT
-                                                    vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT
-                                                    a_offset, b_offset);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
+    Window win = calculate_max_window(*mm_result->info(), Steps());
+    Coordinates coord;
+    coord.set_num_dimensions(mm_result->info()->num_dimensions());
+    mm_result->info()->set_valid_region(ValidRegion(coord, mm_result->info()->tensor_shape()));
+    INEKernel::configure(win);
 }
 
 Status NEGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
                                                     int32_t a_offset, int32_t b_offset)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(),
-                                                              vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
-                                                              vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
-                                                              a_offset, b_offset)
-                                .first); // NOLINT
 
     return Status{};
 }
@@ -399,3 +412,4 @@ void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadI
 
     run_offset_contribution(window, _mm_result, _vector_sum_col, _vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, reinterpret_as_3d);
 }
+} // namespace arm_compute
-- 
cgit v1.2.1
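
Illustration (not part of the patch): the reason the padding requirement disappears is the loop structure adopted above. Instead of relying on padded tensors so that 16-wide vector accesses are always in bounds, each row is processed by a main loop that steps 16 int32 values at a time, followed by a scalar "left-overs" loop for the tail. A minimal standalone sketch of that pattern follows; the add_offset function and buffer are hypothetical, not library code, and it assumes an Arm target where <arm_neon.h> is available.

#include <arm_neon.h>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical helper: add a constant offset to 'len' int32 values in place.
void add_offset(int32_t *data, int len, int32_t offset)
{
    const int32x4_t offset_vec = vdupq_n_s32(offset);

    int x = 0;
    // Main loop: 16 elements per iteration, held in four 4-lane NEON registers.
    for(; x <= len - 16; x += 16)
    {
        vst1q_s32(data + x + 0, vaddq_s32(vld1q_s32(data + x + 0), offset_vec));
        vst1q_s32(data + x + 4, vaddq_s32(vld1q_s32(data + x + 4), offset_vec));
        vst1q_s32(data + x + 8, vaddq_s32(vld1q_s32(data + x + 8), offset_vec));
        vst1q_s32(data + x + 12, vaddq_s32(vld1q_s32(data + x + 12), offset_vec));
    }
    // Left-overs loop: scalar tail, so no access past the end of the buffer is needed.
    for(; x < len; ++x)
    {
        data[x] += offset;
    }
}

int main()
{
    std::vector<int32_t> buf(19, 1); // length deliberately not a multiple of 16
    add_offset(buf.data(), static_cast<int>(buf.size()), 41);
    std::cout << buf.front() << " " << buf.back() << "\n"; // prints "42 42"
    return 0;
}

Because the scalar tail never touches memory beyond the buffer, the kernel window can be configured with Steps() and no access windows, which is what the new configure() above does in place of the removed validate_and_configure_window().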