From 95f9361416a0db7fee2e35bd0f7afbecbb7352c2 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Thu, 13 Jun 2019 15:58:32 +0100
Subject: COMPMID-2185: Implement direct convolution 9x9 (NHWC) on NEON

Change-Id: I3f47947bdbc80946668165c8fc2ce52eca7722a6
Signed-off-by: giuros01
Reviewed-on: https://review.mlplatform.org/c/1409
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Georgios Pinitas
---
 .../kernels/NEDirectConvolutionLayerKernel.cpp | 344 ++++++++++++++++++++-
 1 file changed, 333 insertions(+), 11 deletions(-)

diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index d557cfa1bd..bcf70b3ad8 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -865,6 +865,256 @@ public:
     }
 };
 
+inline void convolve_row1x9_nhwc(const float *row_ptr, const float *weights_ptr, size_t src_stride_y, size_t weights_stride_y,
+                                 float32x4_t &acc0, float32x4_t &acc1, float32x4_t &acc2, float32x4_t &acc3)
+{
+    // Load 4 channels for each of the 12 input values along the same X spatial dimension
+    const float32x4_t src0  = wrapper::vloadq(row_ptr);
+    const float32x4_t src1  = wrapper::vloadq(row_ptr + 1 * src_stride_y);
+    const float32x4_t src2  = wrapper::vloadq(row_ptr + 2 * src_stride_y);
+    const float32x4_t src3  = wrapper::vloadq(row_ptr + 3 * src_stride_y);
+    const float32x4_t src4  = wrapper::vloadq(row_ptr + 4 * src_stride_y);
+    const float32x4_t src5  = wrapper::vloadq(row_ptr + 5 * src_stride_y);
+    const float32x4_t src6  = wrapper::vloadq(row_ptr + 6 * src_stride_y);
+    const float32x4_t src7  = wrapper::vloadq(row_ptr + 7 * src_stride_y);
+    const float32x4_t src8  = wrapper::vloadq(row_ptr + 8 * src_stride_y);
+    const float32x4_t src9  = wrapper::vloadq(row_ptr + 9 * src_stride_y);
+    const float32x4_t src10 = wrapper::vloadq(row_ptr + 10 * src_stride_y);
+    const float32x4_t src11 = wrapper::vloadq(row_ptr + 11 * src_stride_y);
+
+    // Load 4 channels for each of the 9 weight values along the same X spatial dimension
+    const float32x4_t w0 = wrapper::vloadq(weights_ptr);
+    const float32x4_t w1 = wrapper::vloadq(weights_ptr + 1 * weights_stride_y);
+    const float32x4_t w2 = wrapper::vloadq(weights_ptr + 2 * weights_stride_y);
+    const float32x4_t w3 = wrapper::vloadq(weights_ptr + 3 * weights_stride_y);
+    const float32x4_t w4 = wrapper::vloadq(weights_ptr + 4 * weights_stride_y);
+    const float32x4_t w5 = wrapper::vloadq(weights_ptr + 5 * weights_stride_y);
+    const float32x4_t w6 = wrapper::vloadq(weights_ptr + 6 * weights_stride_y);
+    const float32x4_t w7 = wrapper::vloadq(weights_ptr + 7 * weights_stride_y);
+    const float32x4_t w8 = wrapper::vloadq(weights_ptr + 8 * weights_stride_y);
+
+    // Compute 4 channels for each of the 4 output values along the same X spatial dimension
+    acc0 = wrapper::vmla(acc0, w0, src0);
+    acc0 = wrapper::vmla(acc0, w1, src1);
+    acc0 = wrapper::vmla(acc0, w2, src2);
+    acc0 = wrapper::vmla(acc0, w3, src3);
+    acc0 = wrapper::vmla(acc0, w4, src4);
+    acc0 = wrapper::vmla(acc0, w5, src5);
+    acc0 = wrapper::vmla(acc0, w6, src6);
+    acc0 = wrapper::vmla(acc0, w7, src7);
+    acc0 = wrapper::vmla(acc0, w8, src8);
+
+    acc1 = wrapper::vmla(acc1, w0, src1);
+    acc1 = wrapper::vmla(acc1, w1, src2);
+    acc1 = wrapper::vmla(acc1, w2, src3);
+    acc1 = wrapper::vmla(acc1, w3, src4);
+    acc1 = wrapper::vmla(acc1, w4, src5);
+    acc1 = wrapper::vmla(acc1, w5, src6);
+    acc1 = wrapper::vmla(acc1, w6, src7);
+    acc1 = wrapper::vmla(acc1, w7, src8);
+    acc1 = wrapper::vmla(acc1, w8, src9);
+
+    acc2 = wrapper::vmla(acc2, w0, src2);
+    acc2 = wrapper::vmla(acc2, w1, src3);
+    acc2 = wrapper::vmla(acc2, w2, src4);
+    acc2 = wrapper::vmla(acc2, w3, src5);
+    acc2 = wrapper::vmla(acc2, w4, src6);
+    acc2 = wrapper::vmla(acc2, w5, src7);
+    acc2 = wrapper::vmla(acc2, w6, src8);
+    acc2 = wrapper::vmla(acc2, w7, src9);
+    acc2 = wrapper::vmla(acc2, w8, src10);
+
+    acc3 = wrapper::vmla(acc3, w0, src3);
+    acc3 = wrapper::vmla(acc3, w1, src4);
+    acc3 = wrapper::vmla(acc3, w2, src5);
+    acc3 = wrapper::vmla(acc3, w3, src6);
+    acc3 = wrapper::vmla(acc3, w4, src7);
+    acc3 = wrapper::vmla(acc3, w5, src8);
+    acc3 = wrapper::vmla(acc3, w6, src9);
+    acc3 = wrapper::vmla(acc3, w7, src10);
+    acc3 = wrapper::vmla(acc3, w8, src11);
+}
+
+float vreduce(const float32x4_t &v)
+{
+    auto v0    = wrapper::vgethigh(v);
+    auto v1    = wrapper::vgetlow(v);
+    auto v_out = wrapper::vadd(v0, v1);
+
+    float a = wrapper::vgetlane(v_out, 0);
+    float b = wrapper::vgetlane(v_out, 1);
+    return a + b;
+}
+
+template <typename V>
+class convolver_9x9_nhwc
+{
+public:
+    static void convolve(const Window &window, unsigned int num_elems_read_per_iteration,
+                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+    {
+        // Declare useful types
+        using vector_type = typename V::type;
+        using scalar_type = typename V::scalar_type;
+        using tag_type    = typename V::tag_type;
+
+        // Scalar quantities
+        const int element_size    = input->info()->element_size();
+        const int input_width     = input->info()->dimension(0);
+        const int input_depth     = input->info()->dimension(2);
+        const int input_stride_y  = input->info()->strides_in_bytes().y() / element_size;
+        const int input_stride_z  = input->info()->strides_in_bytes().z() / element_size;
+        const int input_stride_w  = input->info()->strides_in_bytes()[3];
+        const int output_stride_x = output->info()->strides_in_bytes().x();
+        const int output_stride_y = output->info()->strides_in_bytes().y();
+        const int kernel_stride_y = weights->info()->strides_in_bytes().y() / element_size;
+        const int kernel_stride_z = weights->info()->strides_in_bytes().z() / element_size;
+        const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+        const unsigned int conv_pad_top  = conv_info.pad_top();
+        const unsigned int conv_pad_left = conv_info.pad_left();
+
+        // Setup input window for the input iterator
+        Window window_in = window;
+        window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+        window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+        window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+        // Setup window for the output iterator
+        Window window_out = window;
+        window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        // Setup window for the weights iterator
+        Window window_k = calculate_max_window(*weights->info(), Steps());
+        window_k.set(Window::DimX, Window::Dimension(0, 1, 1));
+        window_k.set(Window::DimY, Window::Dimension(0, 1, 1));
+        window_k.set(Window::DimZ, Window::Dimension(0, 1, 1));
+        window_k.set(3, Window::Dimension(0, weights->info()->dimension(3), 1));
+
+        Iterator in(input, window_in);
+        Iterator out(output, window_out);
+        Iterator k(weights, window_k);
+
+        // Calculate the max_offset.
+        // max_offset is the offset of the last non-valid value in the Z dimension (spatial dimension Y for NHWC)
+        //  |******************|
+        //  |      pad_top     |
+        //  |******************|
+        //  |                  |
+        //  |      plane0      |
+        //  |      batch0      |
+        //  |__________________|
+        //  |******************|       Batch 0
+        //  |    pad_bottom    |
+        //  |      pad_top     |
+        //  |******************|
+        //  |                  |
+        //  |      plane1      |
+        //  |      batch0      |
+        //  |__________________|-----> max_offset
+        //  |******************|
+        //  |    pad_bottom    |
+        //  |      pad_top     |
+        //  |******************|
+        //  |                  |
+        //  |      plane0      |
+        //  |      batch1      |
+        //  |__________________|
+        //  |******************|       Batch 1
+        //  |    pad_bottom    |
+        //  |      pad_top     |
+        //  |******************|
+        //  |                  |
+        //  |      plane1      |
+        //  |      batch1      |
+        //  |__________________|
+        //  |    pad_bottom    |
+        //  |******************|
+        const int max_offset = input_stride_z * input_depth - (input->info()->padding().bottom + input->info()->padding().top) * input_stride_y;
+        execute_window_loop(window_k, [&](const Coordinates & id_k) // loop over the kernels (output channels)
+        {
+
+            execute_window_loop(window_out, [&](const Coordinates & id)
+            {
+                const auto y_offset = int(id.y() - conv_pad_left) * input_stride_y;
+
+                // Buffer pointers
+                const scalar_type *in_ptr      = reinterpret_cast<const scalar_type *>(input->buffer() + input->info()->offset_first_element_in_bytes() + id[3] * input_stride_w);
+                const scalar_type *weights_ptr = reinterpret_cast<const scalar_type *>(k.ptr());
+                uint8_t           *out_ptr     = out.ptr() + id_k[3] * output_stride_x;
+
+                // Output elements
+                vector_type out0 = wrapper::vdup_n(scalar_type(0), tag_type());
+                vector_type out1 = wrapper::vdup_n(scalar_type(0), tag_type());
+                vector_type out2 = wrapper::vdup_n(scalar_type(0), tag_type());
+                vector_type out3 = wrapper::vdup_n(scalar_type(0), tag_type());
+
+                // Reduce along the feature maps
+                for(int x = 0; x < input_width; x += num_elems_read_per_iteration)
+                {
+                    // z == 0
+                    auto in_z   = static_cast<int>(id.z() * conv_stride_y - conv_pad_top);
+                    in_z        = std::min(static_cast<unsigned int>(in_z), static_cast<unsigned int>(input_depth));
+                    auto offset = y_offset + in_z * input_stride_z;
+                    offset      = std::min(offset, max_offset);
+                    convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 0 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+                    // z == 1
+                    in_z   = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 1);
+                    in_z   = std::min(static_cast<unsigned int>(in_z), static_cast<unsigned int>(input_depth));
+                    offset = y_offset + in_z * input_stride_z;
+                    offset = std::min(offset, max_offset);
+                    convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 1 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+                    // z == 2
+                    in_z   = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 2);
+                    in_z   = std::min(static_cast<unsigned int>(in_z), static_cast<unsigned int>(input_depth));
+                    offset = y_offset + in_z * input_stride_z;
+                    offset = std::min(offset, max_offset);
+                    convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 2 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+                    // z == 3
+                    in_z   = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 3);
+                    offset = y_offset + in_z * input_stride_z;
+                    offset = std::min(offset, max_offset);
+                    convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 3 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+                    // z == 4
+                    in_z   = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 4);
+                    offset = y_offset + in_z * input_stride_z;
+                    convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 4 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+                    // z == 5
+                    offset += input_stride_z;
+                    offset = std::min(offset, max_offset);
+                    convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 5 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+                    // z == 6
+                    offset += input_stride_z;
+                    offset = std::min(offset, max_offset);
+                    convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 6 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+                    // z == 7
+                    offset += input_stride_z;
+                    offset = std::min(offset, max_offset);
+                    convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 7 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+                    // z == 8
+                    offset += input_stride_z;
+                    offset = std::min(offset, max_offset);
+                    convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 8 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+                }
+
+                *(reinterpret_cast<scalar_type *>(out_ptr + 0 * output_stride_y)) = vreduce(out0);
+                *(reinterpret_cast<scalar_type *>(out_ptr + 1 * output_stride_y)) = vreduce(out1);
+                *(reinterpret_cast<scalar_type *>(out_ptr + 2 * output_stride_y)) = vreduce(out2);
+                *(reinterpret_cast<scalar_type *>(out_ptr + 3 * output_stride_y)) = vreduce(out3);
+            },
+            in, out);
+        },
+        k);
+    }
+};
+
 template <typename T1, typename T2>
 inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
                          const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
@@ -969,6 +1219,21 @@ inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_i
     }
 }
 
+template <typename V>
+inline void convolve_9x9_nhwc(const Window &window, unsigned int num_elems_read_per_iteration,
+                              const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+    switch(conv_stride_x)
+    {
+        case 1:
+            convolver_9x9_nhwc<V>::convolve(window, num_elems_read_per_iteration, input, weights, output, conv_info);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
+}
+
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
@@ -1122,18 +1387,62 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     }
     else
     {
-        border_size.left   = 0;
-        border_size.top    = conv_info.pad_left();
-        border_size.right  = 0;
-        border_size.bottom = conv_info.pad_right();
+        if(kernel_size == 9)
+        {
+            border_size.left = 0;
+            border_size.top  = conv_info.pad_left();
+
+            const int num_elems_read_per_iteration_x    = 4;
+            const int num_elems_written_per_iteration_x = 1;
+            const int num_elems_read_per_iteration_y    = 12;
+            const int num_elems_written_per_iteration_y = 4;
+
+            num_elems_read_per_iteration    = num_elems_read_per_iteration_x;
+            num_elems_written_per_iteration = num_elems_written_per_iteration_x;
+
+            border_size.right = num_elems_read_per_iteration_x;
+            if((conv_info.pad_bottom() != 0) || (conv_info.pad_top() != 0))
+            {
+                // If bottom or top padding is set, we need to be able to read num_elems_read_per_iteration_y zeroed rows.
+                // Since num_elems_read_per_iteration_y is always greater than conv_info.pad_right() we can set
+                // the bottom padding to num_elems_read_per_iteration_y
+                border_size.bottom = num_elems_read_per_iteration_y;
+            }
+            else if(conv_info.pad_right() != 0)
+            {
+                // Conventional border padding. Fill the bottom padding so that we can read in batches of num_elems_read_per_iteration_y
+                border_size.bottom = ceil_to_multiple(input->dimension(1) + conv_info.pad_right(), num_elems_read_per_iteration_y) - input->dimension(1);
+            }
+            else
+            {
+                // No padding
+                border_size.bottom = 0;
+            }
 
-        num_elems_read_per_iteration = 16 / element_size_from_data_type(input->data_type());
+            win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
 
-        win = calculate_max_window(*output, Steps());
+            AccessWindowStatic input_access(input, 0, -border_size.top,
+                                            ceil_to_multiple(input->dimension(0), num_elems_read_per_iteration_x),
+                                            input->dimension(1) + border_size.bottom);
 
-        AccessWindowRectangle input_access(input, 0, -border_size.top, num_elems_read_per_iteration, kernel_size, 1.f, conv_stride_x);
-        AccessWindowRectangle weights_access(weights, 0, 0, num_elems_read_per_iteration, kernel_size);
-        window_changed = update_window_and_padding(win, input_access, weights_access);
+            AccessWindowStatic weights_access(weights, 0, 0, ceil_to_multiple(weights->dimension(0), num_elems_read_per_iteration_x), weights->dimension(1));
+            AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
+            window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+            output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+        }
+        else
+        {
+            border_size.left   = 0;
+            border_size.top    = conv_info.pad_left();
+            border_size.right  = 0;
+            border_size.bottom = conv_info.pad_right();
+            num_elems_read_per_iteration = 16 / element_size_from_data_type(input->data_type());
+            win = calculate_max_window(*output, Steps());
+
+            AccessWindowRectangle input_access(input, 0, -border_size.top, num_elems_read_per_iteration, kernel_size, 1.f, conv_stride_x);
+            AccessWindowRectangle weights_access(weights, 0, 0, num_elems_read_per_iteration, kernel_size);
+            window_changed = update_window_and_padding(win, input_access, weights_access);
+        }
     }
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
@@ -1268,7 +1577,6 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo
             }
             break;
         }
-
         default:
         {
             ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported.");
@@ -1278,11 +1586,25 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo
     }
     else
    {
+        const int kernel_size = _weights->info()->dimension(get_data_layout_dimension_index(_weights->info()->data_layout(), DataLayoutDimension::WIDTH));
+        const int stride_x    = std::get<0>(_conv_info.stride());
+        const int stride_y    = std::get<1>(_conv_info.stride());
+
         switch(_input->info()->data_type())
         {
             case DataType::F32:
-                convolver_nhwc<float>::convolve(window, kernel_size, _num_elems_read_per_iteration, _input, _weights, _output, _conv_info);
+            {
+                if(kernel_size == 9 && stride_x == 1 && stride_y == 1)
+                {
+                    using vtype = wrapper::traits::neon_vector<float, 4>;
+                    convolve_9x9_nhwc<vtype>(window, _num_elems_read_per_iteration, _input, _weights, _output, _conv_info);
+                }
+                else
+                {
+                    convolver_nhwc<float>::convolve(window, kernel_size, _num_elems_read_per_iteration, _input, _weights, _output, _conv_info);
+                }
                 break;
+            }
             default:
                 ARM_COMPUTE_ERROR("Data type not supported");
                 break;
-- 
cgit v1.2.1
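
Note (not part of the patch above): the per-row accumulation done by convolve_row1x9_nhwc and the final cross-lane reduction done by vreduce can be written directly with NEON intrinsics instead of the ACL wrapper:: helpers. The sketch below is illustrative only; it assumes an AArch64/NEON toolchain, mirrors a single accumulator (the acc0 path) for one output position, and the helper names mac_row1x9 and reduce4 are invented for this example.

    // Illustrative sketch (assumption: AArch64/NEON target); not the library's implementation.
    #include <arm_neon.h>
    #include <cstddef>

    // Accumulate one 1x9 kernel row into one output position, 4 channels at a time.
    // Corresponds to the acc0 updates in convolve_row1x9_nhwc.
    static inline float32x4_t mac_row1x9(const float *row_ptr, const float *weights_ptr,
                                         size_t src_stride_y, size_t weights_stride_y,
                                         float32x4_t acc)
    {
        for(int i = 0; i < 9; ++i)
        {
            const float32x4_t src = vld1q_f32(row_ptr + i * src_stride_y);         // 4 channels of input value x + i
            const float32x4_t w   = vld1q_f32(weights_ptr + i * weights_stride_y); // 4 channels of weight value i
            acc                   = vmlaq_f32(acc, w, src);                        // per-lane acc += w * src
        }
        return acc;
    }

    // Sum the 4 lanes of the accumulator into one scalar, as vreduce() does
    // with vgethigh/vgetlow/vadd/vgetlane.
    static inline float reduce4(float32x4_t v)
    {
        const float32x2_t sum2 = vadd_f32(vget_high_f32(v), vget_low_f32(v)); // {v0 + v2, v1 + v3}
        return vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1);
    }

The kernel itself keeps four such accumulators (out0..out3) live so that the twelve input values loaded per call are reused across four neighbouring output positions before each per-channel sum is reduced to a scalar and stored.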