diff options
author | Sang-Hoon Park <sang-hoon.park@arm.com> | 2020-11-12 17:41:32 +0000 |
---|---|---|
committer | Sang-Hoon Park <sang-hoon.park@arm.com> | 2020-11-13 11:02:13 +0000 |
commit | 1a0a4bc78a12e85e1bd6b3207f244c91566ebdce (patch) | |
tree | b2807e01171f47b4394476a111007167f54b4e38 /src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp | |
parent | 412b789582c5992431028e9b91c4d8b99d5c4900 (diff) | |
download | ComputeLibrary-1a0a4bc78a12e85e1bd6b3207f244c91566ebdce.tar.gz |
COMPMID-3851: Fix regression on NEDepthwiseConvolutionLayerNativeKernel
The exit condition of some for loops in the quantized version
of the kernel with depth_multiplier=1 is now decided
at compile time, fixing a performance regression.
Change-Id: I849b3d63b2a2cf5eb374ae681898ae1c296fb4fe
Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4392
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp')
-rw-r--r-- | src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp | 4 |
1 file changed, 2 insertions, 2 deletions
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
index 90a81b30c9..87315909d8 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
@@ -372,7 +372,7 @@ void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *w
                 out_of_bound_vector;
             const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);

-            for(size_t i = 0; i < run_info.x_step; ++i)
+            for(size_t i = 0; i < element_per_vector; ++i)
             {
                 acc.at(i) += input_vals[i] * weights_vals[i];
                 in_sum.at(i) += input_vals[i];
@@ -387,7 +387,7 @@ void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *w
             }

             VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});

-            for(size_t i = 0; i < run_info.x_step; ++i)
+            for(size_t i = 0; i < element_per_vector; ++i)
             {
                 acc.at(i) -= in_sum.at(i) * weights_qoffset;
                 acc.at(i) -= we_sum.at(i) * input_qoffset;