From c226853f80d53619a2f49e646635e04ee0885c3b Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Fri, 9 Oct 2020 11:52:10 +0100 Subject: COMPMID-3794: Fix window loops causing performance regression Signed-off-by: Michalis Spyrou Change-Id: Id4d95c6ce5fed91bb079b8bfe1abceedefd20c97 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4117 Reviewed-by: Sheri Zhang Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- .../NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp | 8 ++++---- src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp | 6 +++--- src/core/NEON/kernels/NEPoolingLayerKernel.cpp | 10 +++++----- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp index c022fa05a0..8c11574755 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp @@ -110,7 +110,7 @@ output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITe execute_window_loop(win, [&](const Coordinates & id) { int x = window_start_x; - for(; x < (window_end_x - window_step_x); x += window_step_x) + for(; x <= (window_end_x - window_step_x); x += window_step_x) { // Get bias and pointer to input const auto in_ptr = reinterpret_cast(in.ptr()) + x; @@ -175,7 +175,7 @@ output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITe execute_window_loop(win, [&](const Coordinates &) { int x = window_start_x; - for(; x < (window_end_x - window_step_x); x += window_step_x) + for(; x <= (window_end_x - window_step_x); x += window_step_x) { // Get bias and pointer to input const auto in_ptr = reinterpret_cast(in.ptr()); @@ -238,7 +238,7 @@ void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window { int x = window_start_x; - for(; x < (window_end_x - window_step_x); x += window_step_x) + for(; x <= (window_end_x - window_step_x); x += window_step_x) { // Get bias and pointer to input const auto in_ptr = reinterpret_cast(in.ptr()) + x; @@ -323,7 +323,7 @@ void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window execute_window_loop(win, [&](const Coordinates &) { int x = window_start_x; - for(; x < (window_end_x - window_step_x); x += window_step_x) + for(; x <= (window_end_x - window_step_x); x += window_step_x) { // Get bias and pointer to input const auto in_ptr = reinterpret_cast(in.ptr()) + x; diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp index 5710897329..4ac33d1e29 100644 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp @@ -139,7 +139,7 @@ void run_offset_contribution(const Window &window, const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); int x = window_start_x; - for(; x < (window_end_x - window_step_x); x += window_step_x) + for(; x <= (window_end_x - window_step_x); x += window_step_x) { // Compute the leftover term due to a_offset. int32x4x4_t a_offset_term_s32 = @@ -237,7 +237,7 @@ void run_offset_contribution(const Window &window, const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); int x = window_start_x; - for(; x < (window_end_x - window_step_x); x += window_step_x) + for(; x <= (window_end_x - window_step_x); x += window_step_x) { int32x4x4_t in_s32 = { @@ -291,7 +291,7 @@ void run_offset_contribution(const Window &window, auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); int x = window_start_x; - for(; x < (window_end_x - window_step_x); x += window_step_x) + for(; x <= (window_end_x - window_step_x); x += window_step_x) { // Compute the leftover term due to a_offset. int32x4x4_t a_offset_term_s32 = diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp index 1310ef3521..397eae94ea 100644 --- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp @@ -1316,7 +1316,7 @@ void NEPoolingLayerKernel::pooling2_f16_nhwc_maxpool_indices(const Window &windo (_input->info()->strides_in_bytes().z()); int x_off = window_start_x; - for(; x_off < (window_end_x - window_step_x); x_off += window_step_x) + for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { const auto in_x0_ptr = reinterpret_cast(input.ptr() + in_x0_offset) + x_off; const auto in_x1_ptr = reinterpret_cast(input.ptr() + in_x1_offset) + x_off; @@ -1432,7 +1432,7 @@ void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x); int x_off = window_start_x; - for(; x_off < (window_end_x - window_step_x); x_off += window_step_x) + for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { if(pooling_type != PoolingType::MAX) { @@ -1943,7 +1943,7 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x); int x_off = window_start_x; - for(; x_off < (window_end_x - window_step_x); x_off += window_step_x) + for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { if(pooling_type != PoolingType::MAX) { @@ -2113,7 +2113,7 @@ void NEPoolingLayerKernel::pooling2_f32_nhwc_maxpool_indices(const Window &windo (_input->info()->strides_in_bytes().z()); int x_off = window_start_x; - for(; x_off < (window_end_x - window_step_x); x_off += window_step_x) + for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { const auto in_x0_ptr = reinterpret_cast(input.ptr() + in_x0_offset); const auto in_x1_ptr = reinterpret_cast(input.ptr() + in_x1_offset); @@ -2337,7 +2337,7 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x); int x_off = window_start_x; - for(; x_off < (window_end_x - window_step_x); x_off += window_step_x) + for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { if(pooling_type != PoolingType::MAX) { -- cgit v1.2.1