From ef776a844741826fc4740ad24714866aaeb35b8e Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Wed, 25 Jul 2018 17:57:49 +0100
Subject: COMPMID-1386: Add FC convert weights on NEON

Change-Id: I7a3c6db9285e3899494f496b2562d80cec1b6521
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/141407
Reviewed-by: Giorgio Arena
Tested-by: Jenkins
---
 .../runtime/NEON/functions/NEFullyConnectedLayer.h |   4 +
 examples/graph_lenet.cpp                           |   1 -
 examples/graph_vgg16.cpp                           |   1 -
 examples/graph_vgg19.cpp                           |   1 -
 .../NEON/functions/NEFullyConnectedLayer.cpp       | 128 ++++++++++++++-------
 5 files changed, 93 insertions(+), 42 deletions(-)

diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index 92ca17a3a4..fe0f2f03f7 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -30,6 +30,7 @@
 #include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
 #include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
@@ -129,6 +130,7 @@ private:
     MemoryGroup                         _memory_group;
     NEIm2ColKernel                      _im2col_kernel;
+    NEConvertFullyConnectedWeights      _convert_weights;
     NEFullyConnectedLayerReshapeWeights _reshape_weights_function;
     NEGEMM                              _mm_gemm;
     NEGEMMLowpMatrixMultiplyCore        _mm_gemmlowp;
@@ -136,8 +138,10 @@
     NEGEMMMatrixAccumulateBiasesKernel  _accumulate_biases_kernel;
     Tensor                              _im2col_output;
     Tensor                              _gemmlowp_output;
+    Tensor                              _converted_weights_output;
     Tensor                              _reshape_weights_output;
     const ITensor                      *_original_weights;
+    bool                                _are_weights_converted;
     bool                                _are_weights_reshaped;
     bool                                _is_fc_after_conv;
     bool                                _accumulate_biases;
diff --git a/examples/graph_lenet.cpp b/examples/graph_lenet.cpp
index f3aa266c50..c658d1b361 100644
--- a/examples/graph_lenet.cpp
+++ b/examples/graph_lenet.cpp
@@ -60,7 +60,6 @@ public:
 
         // Checks
         ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "Unsupported data type!");
-        ARM_COMPUTE_EXIT_ON_MSG(common_params.data_layout == DataLayout::NHWC && common_params.target != Target::CL, "Unsupported data layout!");
 
         // Print parameter values
         std::cout << common_params << std::endl;
diff --git a/examples/graph_vgg16.cpp b/examples/graph_vgg16.cpp
index e23ea65dd7..808ce08ce7 100644
--- a/examples/graph_vgg16.cpp
+++ b/examples/graph_vgg16.cpp
@@ -60,7 +60,6 @@ public:
 
         // Checks
         ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "Unsupported data type!");
-        ARM_COMPUTE_EXIT_ON_MSG(common_params.data_layout == DataLayout::NHWC && common_params.target != Target::CL, "Unsupported data layout!");
 
         // Print parameter values
         std::cout << common_params << std::endl;
diff --git a/examples/graph_vgg19.cpp b/examples/graph_vgg19.cpp
index 6cb6b1fae2..96a1d8a9d4 100644
--- a/examples/graph_vgg19.cpp
+++ b/examples/graph_vgg19.cpp
@@ -59,7 +59,6 @@ public:
 
         // Checks
         ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "Unsupported data type!");
-        ARM_COMPUTE_EXIT_ON_MSG(common_params.data_layout == DataLayout::NHWC && common_params.target != Target::CL, "Unsupported data layout!");
 
         // Print parameter values
         std::cout << common_params << std::endl;
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 9d3cb31c9a..34cabb5c2e 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -74,8 +74,9 @@ Status NEFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, c
 }
 
 NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), _accumulate_biases_kernel(), _im2col_output(),
-      _gemmlowp_output(), _reshape_weights_output(), _original_weights(nullptr), _are_weights_reshaped(false), _is_fc_after_conv(false), _accumulate_biases(false), _is_quantized(false), _is_prepared(false)
+    : _memory_group(std::move(memory_manager)), _im2col_kernel(), _convert_weights(), _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), _accumulate_biases_kernel(),
+      _im2col_output(), _gemmlowp_output(), _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true), _are_weights_reshaped(false),
+      _is_fc_after_conv(false), _accumulate_biases(false), _is_quantized(false), _is_prepared(false)
 {
 }
 
@@ -146,11 +147,12 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh
                                                   output->info(),
                                                   fc_info));
 
-    _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
-    _is_fc_after_conv     = true;
-    _accumulate_biases    = false;
-    _is_quantized         = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _original_weights     = weights;
+    _are_weights_converted = true;
+    _are_weights_reshaped  = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+    _is_fc_after_conv      = true;
+    _accumulate_biases     = false;
+    _is_quantized          = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _original_weights      = weights;
 
     // Configure gemmlowp output
     if(_is_quantized)
@@ -175,17 +177,8 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh
 
     const ITensor *weights_to_use = weights;
 
-    if(!_are_weights_reshaped)
-    {
-        weights_to_use = &_reshape_weights_output;
-
-        // Reshape the weights
-        _reshape_weights_function.configure(weights, &_reshape_weights_output);
-    }
-
     // Check if we have a fully connected layer with batches
     const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
-
     if(is_batched_fc_layer)
     {
         _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
@@ -197,6 +190,27 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh
         _is_fc_after_conv = input->info()->num_dimensions() > 1;
     }
 
+    // Reshape weights if needed
+    if(!_are_weights_reshaped)
+    {
+        // Reshape the weights
+        _reshape_weights_function.configure(weights, &_reshape_weights_output);
+        weights_to_use = &_reshape_weights_output;
+    }
+
+    // Convert weights if needed
+    if(_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
+    {
+        // Convert weights
+        _convert_weights.configure(weights_to_use,
+                                   &_converted_weights_output,
+                                   input->info()->tensor_shape(),
+                                   fc_info.weights_trained_layout);
+
+        weights_to_use         = &_converted_weights_output;
+        _are_weights_converted = false;
+    }
+
     ITensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output;
     if(_is_fc_after_conv)
     {
@@ -235,9 +249,10 @@ Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
     bool is_fc_after_conv = true;
     bool is_quantized     = is_data_type_quantized_asymmetric(input->data_type());
 
-    const ITensorInfo &im2col_input     = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input)));
-    const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
-    const ITensorInfo &gemmlowp_output  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+    const ITensorInfo &im2col_input      = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input)));
+    const ITensorInfo &reshaped_weights  = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
+    const ITensorInfo &converted_weights = TensorInfo(reshaped_weights.clone()->set_is_resizable(true).reset_padding());
+    const ITensorInfo &gemmlowp_output   = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
 
     // Configure accumulate biases kernel for non quantized asymmetric types
     if(biases != nullptr && !is_quantized)
@@ -256,13 +271,6 @@ Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
     const ITensorInfo *weights_to_use = weights;
     const ITensorInfo *tmp_output     = (is_quantized) ? &gemmlowp_output : output;
 
-    if(!weights_reshaped)
-    {
-        // Validate reshape weights kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
-        weights_to_use = &reshaped_weights;
-    }
-
     // Check if we have a fully connected layer with batches
     const bool is_batched_fc_layer = output->dimension(1) > 1;
 
@@ -277,6 +285,23 @@ Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
         is_fc_after_conv = input->num_dimensions() > 1;
     }
 
+    if(!weights_reshaped)
+    {
+        // Validate reshape weights kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
+        weights_to_use = &reshaped_weights;
+    }
+
+    if(is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
+    {
+        // Validate convert weights kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(weights_to_use,
+                                                                             &converted_weights,
+                                                                             input->tensor_shape(),
+                                                                             fc_info.weights_trained_layout));
+        weights_to_use = &converted_weights;
+    }
+
     if(is_fc_after_conv)
     {
         // Fully Connected layer after a Convolution Layer without batches
@@ -345,29 +370,54 @@ void NEFullyConnectedLayer::prepare()
 {
     if(!_is_prepared)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        auto release_unused = [](Tensor * w)
+        {
+            if(!w->is_used())
+            {
+                w->allocator()->free();
+            }
+        };
+
+        // Pointer to current weights
+        const ITensor *cur_weights = _original_weights;
+
         // Reshape of the weights (happens only once)
         if(!_are_weights_reshaped)
        {
-            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
             // Run reshape weights kernel and mark weights as unused
             _reshape_weights_output.allocator()->allocate();
             _reshape_weights_function.run();
-            _original_weights->mark_as_unused();
-
-            // Prepare GEMM prepare and release unused weights
-            if(!_is_quantized)
-            {
-                _mm_gemm.prepare();
-                if(!_reshape_weights_output.is_used())
-                {
-                    _reshape_weights_output.allocator()->free();
-                }
-            }
+            cur_weights->mark_as_unused();
+            cur_weights = &_reshape_weights_output;
 
             _are_weights_reshaped = true;
         }
 
+        // Convert weights if needed (happens only once)
+        if(!_are_weights_converted)
+        {
+            _converted_weights_output.allocator()->allocate();
+            _convert_weights.run();
+
+            cur_weights->mark_as_unused();
+            _are_weights_converted = true;
+        }
+
+        // Release reshaped weights if unused
+        release_unused(&_reshape_weights_output);
+
+        // Prepare GEMM and release unused weights
+        if(!_is_quantized)
+        {
+            _mm_gemm.prepare();
+        }
+
+        // Release converted weights if unused
+        release_unused(&_reshape_weights_output);
+        release_unused(&_converted_weights_output);
+
         _is_prepared = true;
     }
 }
--
cgit v1.2.1
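
A note on exercising the new path: after this change, a NEON fully connected layer fed by an
NHWC tensor can consume weights trained in NCHW by setting weights_trained_layout on the
FullyConnectedLayerInfo passed to configure(); the function then wires in the
NEConvertFullyConnectedWeights step above and runs it once inside prepare(). The sketch below
shows the intended call pattern only; the tensor sizes are illustrative and it assumes
Compute Library headers that include this change.

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // NHWC feature map produced by a convolution: C=64, W=7, H=7
        // (ACL lists shapes fastest dimension first, so NHWC is (C, W, H)).
        Tensor     src, weights, bias, dst;
        TensorInfo src_info(TensorShape(64U, 7U, 7U), 1, DataType::F32);
        src_info.set_data_layout(DataLayout::NHWC);
        src.allocator()->init(src_info);

        // FC weights as (num_inputs, num_outputs) = (7*7*64, 16); sizes illustrative.
        weights.allocator()->init(TensorInfo(TensorShape(3136U, 16U), 1, DataType::F32));
        bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));

        // The weights were trained in NCHW; the layout mismatch with the NHWC input
        // is what makes configure() insert the convert-weights step.
        FullyConnectedLayerInfo fc_info;
        fc_info.weights_trained_layout = DataLayout::NCHW;

        NEFullyConnectedLayer fc;
        fc.configure(&src, &weights, &bias, &dst, fc_info);

        src.allocator()->allocate();
        weights.allocator()->allocate();
        bias.allocator()->allocate();
        dst.allocator()->allocate();

        // ... fill src/weights/bias ...

        fc.prepare(); // one-shot weight reshape + convert, as in prepare() above
        fc.run();     // im2col + GEMM
        return 0;
    }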
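
The conversion itself is a row permutation of the weight matrix: a layer trained against
NCHW-flattened activations expects input feature k = c*H*W + h*W + w, while at run time an
NHWC tensor flattens the same element to k' = h*W*C + w*C + c, so the rows must be reordered
for the dot products to line up. A standalone sketch of that remapping idea follows; the
function name and row-major (num_inputs x num_outputs) storage order are assumptions for
illustration, not the kernel's actual implementation.

    #include <cstddef>
    #include <vector>

    // Reorder FC weight rows so weights trained on NCHW-flattened inputs can be
    // applied to NHWC-flattened inputs. Hypothetical helper, illustrative layout.
    std::vector<float> convert_fc_weights_nchw_to_nhwc(const std::vector<float> &weights,
                                                       std::size_t num_outputs,
                                                       std::size_t W, std::size_t H, std::size_t C)
    {
        std::vector<float> converted(weights.size());
        for(std::size_t c = 0; c < C; ++c)
            for(std::size_t h = 0; h < H; ++h)
                for(std::size_t w = 0; w < W; ++w)
                {
                    const std::size_t nchw_idx = c * H * W + h * W + w; // trained position
                    const std::size_t nhwc_idx = h * W * C + w * C + c; // runtime position
                    for(std::size_t n = 0; n < num_outputs; ++n)
                    {
                        // Weight matrix stored row-major as (inputs) x (outputs).
                        converted[nhwc_idx * num_outputs + n] = weights[nchw_idx * num_outputs + n];
                    }
                }
        return converted;
    }

In the patch, NEConvertFullyConnectedWeights writes the remapped weights into
_converted_weights_output during prepare(), so the GEMM that follows stays layout-agnostic.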