From 642680abde9d9021398695b495f9da63f4688d76 Mon Sep 17 00:00:00 2001 From: Vidhya Sudhan Loganathan Date: Tue, 2 Apr 2019 09:40:08 +0100 Subject: COMPMID-1983 : Optimize NEIm2Col with NHWC data layout Improves NEIm2Col performance by ~45% for single threaded run. Change-Id: I6cfb9e3af51ab0756e880f0e0f82cad4c644702c Signed-off-by: Vidhya Sudhan Loganathan Reviewed-on: https://review.mlplatform.org/c/920 Reviewed-by: Gian Marco Iodice Comments-Addressed: Arm Jenkins Reviewed-by: Georgios Pinitas Tested-by: Arm Jenkins --- src/core/NEON/kernels/NEIm2ColKernel.cpp | 48 +++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp index 2e3d9de5f1..34af0cf3fd 100644 --- a/src/core/NEON/kernels/NEIm2ColKernel.cpp +++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp @@ -208,32 +208,48 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr, const int end_x = start_x + kernel_width * dilation_x; const int end_y = start_y + kernel_height * dilation_y; const int pad_quant = kernel_width * input_c; - - for(int y = start_y; y < end_y; y += dilation_y) + if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1)) { - if(y < 0 || y >= input_h) + for(int y = start_y; y < end_y; y += dilation_y) { - memset(out_ptr, pad_value, pad_quant * sizeof(T)); - out_ptr += pad_quant; + //optimized for no dilation and no boundary pixels + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * sizeof(T)); + out_ptr += input_c * kernel_width; } - else + } + else + { + for(int y = start_y; y < end_y; y += dilation_y) { - for(int x = start_x; x < end_x; x += dilation_x) + if(y < 0 || y >= input_h) { - if(x < 0 || x >= input_w) - { - memset(out_ptr, pad_value, input_c * sizeof(T)); - out_ptr += input_c; - } - else + memset(out_ptr, pad_value, pad_quant * sizeof(T)); + out_ptr 
+= pad_quant; + } + else if(dilation_x > 1 || start_x < 0 || end_x >= input_w) + { + for(int x = start_x; x < end_x; x += dilation_x) { - memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * sizeof(T)); - out_ptr += input_c; + if(x < 0 || x >= input_w) + { + memset(out_ptr, pad_value, input_c * sizeof(T)); + out_ptr += input_c; + } + else + { + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * sizeof(T)); + out_ptr += input_c; + } } } + else + { + //optimized for no dilation and no boundary pixels + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * sizeof(T)); + out_ptr += input_c * kernel_width; + } } } - // Append 1 if the convolution layer has biases if(has_bias) { -- cgit v1.2.1