From 2b3129ebb9e4366e91de5031d1e1d3759cc42c8e Mon Sep 17 00:00:00 2001
From: Michalis Spyrou <michalis.spyrou@arm.com>
Date: Wed, 25 Apr 2018 18:10:13 +0100
Subject: COMPMID-1041 NEON Winograd: update function to use GEMM function

Change-Id: I1ecdf10e02193de7f47a72b75cce0d58a1fa1a1c
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/128411
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
---
 .../runtime/NEON/functions/NEWinogradLayer.h       |  7 +-
 src/runtime/NEON/functions/NEWinogradLayer.cpp     | 88 ++++++++++++++--------
 2 files changed, 61 insertions(+), 34 deletions(-)
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradLayer.h
index 27b1e84201..8010810253 100644
--- a/arm_compute/runtime/NEON/functions/NEWinogradLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEWinogradLayer.h
@@ -27,6 +27,7 @@
 #include "arm_compute/runtime/IFunction.h"
 
 #include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CPP/functions/CPPPermute.h"
 #include "arm_compute/runtime/MemoryGroup.h"
@@ -93,8 +94,9 @@ public:
     NEWinogradLayer &operator=(const NEWinogradLayer &) = delete;
 
 private:
-    MemoryGroup                _memory_group;
-    std::unique_ptr<INEKernel> _batched_gemm_kernel;
+    MemoryGroup _memory_group;
+    std::unique_ptr<arm_gemm::GemmCommon<float, float>> _arm_gemm;
+    std::unique_ptr<INEKernel> _gemm_kernel;
     std::unique_ptr<INEKernel> _transform_input_kernel;
     std::unique_ptr<INEKernel> _transform_output_kernel;
     std::unique_ptr<INEKernel> _transform_weights_kernel;
@@ -109,6 +111,7 @@ private:
     Tensor         _input_nhwc;
     Tensor         _output_nhwc;
     Tensor         _weights_hwio;
+    Tensor         _workspace;
     const ITensor *_input;
     const ITensor *_weights;
     ITensor       *_output;
diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp
index 264b97f7c1..7d93bcff07 100644
--- a/src/runtime/NEON/functions/NEWinogradLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradLayer.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "support/ToolchainSupport.h"
 
@@ -79,9 +80,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
 } //namespace
 
 NEWinogradLayer::NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _batched_gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr),
+    : _memory_group(std::move(memory_manager)), _arm_gemm(nullptr), _gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr),
       _activationlayer_function(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(),
-      _input(), _weights(), _output(), _reshaped_kernel(false), _is_activationlayer_enabled(false)
+      _workspace(), _input(), _weights(), _output(), _reshaped_kernel(false), _is_activationlayer_enabled(false)
 {
 } /* arm_compute */
 
@@ -95,27 +96,40 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co
     _input   = input;
     _output  = output;
 
-    std::unique_ptr<INEWinogradLayerBatchedGEMMKernel<float, float>> batched_gemm_kernel;
     std::unique_ptr<INEWinogradLayerTransformInputKernel<float>>   transform_input_kernel;
     std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
     std::unique_ptr<INEWinogradLayerTransformOutputKernel<float>>  transform_output_kernel;
 
-    switch(weights->info()->dimension(0))
+    const int weights_width  = weights->info()->dimension(0);
+    const int weights_height = weights->info()->dimension(1);
+
+    int output_tile_rows = 0;
+    int output_tile_cols = 0;
+    int n_gemms          = 0;
+    int N_BLOCK          = 0; // Size of block used by GEMM.
+
+    switch(weights_width)
     {
         case 3:
         {
-            batched_gemm_kernel      = support::cpp14::make_unique<NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>>();
             transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>>();
             transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>>();
             transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>>();
+            output_tile_rows         = 2;
+            output_tile_cols         = 2;
+            n_gemms                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradBase::N_GEMMS;
+            N_BLOCK                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradConv::N_BLOCK;
             break;
         }
         case 5:
         {
-            batched_gemm_kernel      = support::cpp14::make_unique<NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>>();
             transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>>();
             transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>>();
             transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>>();
+            output_tile_rows         = 2;
+            output_tile_cols         = 2;
+            n_gemms                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradBase::N_GEMMS;
+            N_BLOCK                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradConv::N_BLOCK;
             break;
         }
         default:
@@ -170,8 +184,6 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co
     _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
     _input_nhwc.allocator()->allocate();
 
-    const int         weights_width  = weights->info()->dimension(0);
-    const int         weights_height = weights->info()->dimension(1);
     const KernelShape kernel_shape({ out_channels, weights_height, weights_width, in_channels });
 
     // Configure the InputTransform
@@ -192,27 +204,41 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co
                                        output_matrix_stride, reinterpret_cast<float *>(_output_nhwc.buffer()),
                                        in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
 
-    // Configure Batched GEMMs
-    const int      output_tile_rows         = batched_gemm_kernel->get_output_tile_rows();
-    const int      output_tile_cols         = batched_gemm_kernel->get_output_tile_cols();
-    const int      n_block                  = batched_gemm_kernel->get_number_blocks();
-    const int      tile_rows                = iceildiv(output_shape.n_rows, output_tile_rows);
-    const int      tile_cols                = iceildiv(output_shape.n_cols, output_tile_cols);
-    const int      m                        = in_shape.n_batches * tile_rows * tile_cols;
-    const int      k                        = in_shape.n_channels;
-    const int      n                        = out_channels;
-    const int      input_matrix_row_stride  = in_shape.n_channels;
-    const int      kernel_matrix_row_stride = roundup(out_channels, n_block);
-    const int      output_matrix_row_stride = kernel_matrix_row_stride;
-    const unsigned n_gemms                  = batched_gemm_kernel->get_number_gemms();
-
-    batched_gemm_kernel->configure(n_gemms, m, k, n,
-                                   input_matrix_stride, input_matrix_row_stride,
-                                   kernel_matrix_stride, kernel_matrix_row_stride,
-                                   output_matrix_stride, output_matrix_row_stride,
-                                   reinterpret_cast<float *>(_input_workspace.buffer()),
-                                   reinterpret_cast<float *>(_kernel_storage.buffer()),
-                                   reinterpret_cast<float *>(_output_workspace.buffer()));
+    // Configure one arm_gemm object (m x n x k, n_gemms multis) in place of the batched GEMM kernel
+    const int    tile_rows                = iceildiv(output_shape.n_rows, output_tile_rows);
+    const int    tile_cols                = iceildiv(output_shape.n_cols, output_tile_cols);
+    const int    m                        = in_shape.n_batches * tile_rows * tile_cols;
+    const int    k                        = in_shape.n_channels;
+    const int    n                        = out_channels;
+    const int    input_matrix_row_stride  = in_shape.n_channels;
+    const int    kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
+    const int    output_matrix_row_stride = kernel_matrix_row_stride;
+    unsigned int num_threads              = NEScheduler::get().num_threads();
+
+    _arm_gemm = arm_gemm::gemm<float, float>(NEScheduler::get().cpu_info(), m, n, k, 1, n_gemms, false, false, 1.f, 0.f, num_threads, false);
+    _arm_gemm->set_arrays(reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_row_stride, 0, input_matrix_stride, reinterpret_cast<float *>(_kernel_storage.buffer()),
+                          kernel_matrix_row_stride, kernel_matrix_stride, reinterpret_cast<float *>(_output_workspace.buffer()), output_matrix_row_stride, 0, output_matrix_stride);
+
+    auto acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapper<arm_gemm::GemmCommon<float, float>>>();
+    acl_gemm_wrapper->configure(_arm_gemm.get());
+    const size_t workspace_size = _arm_gemm->get_working_size();
+
+    // Allocate the GEMM scratch workspace only if arm_gemm reports a non-zero working size
+    if(workspace_size > 0)
+    {
+        const unsigned int alignment = 4096;
+        allocate_workspace(workspace_size, _workspace, _memory_group, alignment, 1);
+        _arm_gemm->set_working_space(reinterpret_cast<float *>(_workspace.buffer()));
+    }
+
+    const unsigned int window_size = _arm_gemm->get_window_size();
+    if(window_size < num_threads)
+    {
+        num_threads = window_size;
+        _arm_gemm->set_nthreads(num_threads);
+    }
+
+    _gemm_kernel = std::move(acl_gemm_wrapper);
 
     // Reorder the convoluted output to ACL's ordering NCHW
     _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
@@ -220,7 +246,6 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co
     _transform_input_kernel   = std::move(transform_input_kernel);
     _transform_weights_kernel = std::move(transform_weights_kernel);
     _transform_output_kernel  = std::move(transform_output_kernel);
-    _batched_gemm_kernel      = std::move(batched_gemm_kernel);
 
     //Configure Activation Layer
     _is_activationlayer_enabled = act_info.enabled();
@@ -246,7 +271,7 @@ void NEWinogradLayer::run()
     NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
 
     //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
-    NEScheduler::get().schedule(_batched_gemm_kernel.get(), Window::DimX);
+    NEScheduler::get().schedule(_gemm_kernel.get(), Window::DimX);
 
     // Transform output tensor to the spatial domain
     NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
@@ -258,7 +283,6 @@ void NEWinogradLayer::run()
     {
         _activationlayer_function.run();
     }
-
     _memory_group.release();
 }
 
-- 
cgit v1.2.1