From 642680abde9d9021398695b495f9da63f4688d76 Mon Sep 17 00:00:00 2001
From: Vidhya Sudhan Loganathan <vidhyasudhan.loganathan@arm.com>
Date: Tue, 2 Apr 2019 09:40:08 +0100
Subject: COMPMID-1983 : Optimize NEIm2Col with NHWC data layout

Improves NEIm2Col performance by ~45% for single-threaded runs.

Change-Id: I6cfb9e3af51ab0756e880f0e0f82cad4c644702c
Signed-off-by: Vidhya Sudhan Loganathan <vidhyasudhan.loganathan@arm.com>
Reviewed-on: https://review.mlplatform.org/c/920
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
---
 src/core/NEON/kernels/NEIm2ColKernel.cpp | 48 +++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 16 deletions(-)
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 2e3d9de5f1..34af0cf3fd 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -208,32 +208,48 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
     const int end_x     = start_x + kernel_width * dilation_x;
     const int end_y     = start_y + kernel_height * dilation_y;
     const int pad_quant = kernel_width * input_c;
-
-    for(int y = start_y; y < end_y; y += dilation_y)
+    if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1))
     {
-        if(y < 0 || y >= input_h)
+        for(int y = start_y; y < end_y; y += dilation_y)
         {
-            memset(out_ptr, pad_value, pad_quant * sizeof(T));
-            out_ptr += pad_quant;
+            //optimized for no dilation and no boundary pixels
+            memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * sizeof(T));
+            out_ptr += input_c * kernel_width;
         }
-        else
+    }
+    else
+    {
+        for(int y = start_y; y < end_y; y += dilation_y)
         {
-            for(int x = start_x; x < end_x; x += dilation_x)
+            if(y < 0 || y >= input_h)
             {
-                if(x < 0 || x >= input_w)
-                {
-                    memset(out_ptr, pad_value, input_c * sizeof(T));
-                    out_ptr += input_c;
-                }
-                else
+                memset(out_ptr, pad_value, pad_quant * sizeof(T));
+                out_ptr += pad_quant;
+            }
+            else if(dilation_x > 1 || start_x < 0 || end_x >= input_w)
+            {
+                for(int x = start_x; x < end_x; x += dilation_x)
                 {
-                    memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * sizeof(T));
-                    out_ptr += input_c;
+                    if(x < 0 || x >= input_w)
+                    {
+                        memset(out_ptr, pad_value, input_c * sizeof(T));
+                        out_ptr += input_c;
+                    }
+                    else
+                    {
+                        memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * sizeof(T));
+                        out_ptr += input_c;
+                    }
                 }
             }
+            else
+            {
+                //optimized for no dilation and no boundary pixels
+                memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * sizeof(T));
+                out_ptr += input_c * kernel_width;
+            }
         }
     }
-
     // Append 1 if the convolution layer has biases
     if(has_bias)
     {
-- 
cgit v1.2.1