From 932b561159cd6a8c9230bbd0343790c85755846e Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Thu, 3 May 2018 13:44:35 +0100
Subject: COMPMID-959: Perform pretranspose if allowed on NEON assembly

Change-Id: I281699ce7270aec1317c47b5a13799954cf6c9e8
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/130010
Tested-by: Jenkins
Reviewed-by: Pablo Tello
Reviewed-by: Anthony Barbier
---
 src/runtime/NEON/functions/NEGEMM.cpp                       |  5 +++--
 src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp       | 10 ++++++++--
 .../NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp |  6 +++---
 src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp |  8 ++++----
 src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp   |  2 +-
 5 files changed, 19 insertions(+), 12 deletions(-)

(limited to 'src/runtime/NEON')

diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index e0859be93e..9168ed4327 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -39,7 +39,7 @@ namespace arm_compute
 {
 NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(),
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(), _B_pretransposed(),
       _run_vector_matrix_multiplication(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
 {
 }
@@ -66,7 +66,8 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
     _reshape_b_only_on_first_run      = gemm_info.reshape_b_only_on_first_run();
     _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
 
-    const bool run_optimised = a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f) && setup_assembly_kernel(a, b, d, alpha, beta, _workspace, _memory_group, _asm_glue);
+    const bool run_optimised = a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f)
+                               && setup_assembly_kernel(a, b, d, alpha, beta, _reshape_b_only_on_first_run, _workspace, _B_pretransposed, _memory_group, _asm_glue);
 
     // Check if the first input tensor is a vector.
     // If so, all the kernels for reshaping the tensors can be skipped
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 3c48d691ed..1ffeaf227d 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -212,7 +212,8 @@ Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInf
 NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
     : _asm_glue(), _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
       _output_col2im_kernel(), _activationlayer_function(), _original_weights(nullptr), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _tmp_output(),
-      _workspace(), _append_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false), _is_quantized(false), _is_interleaved(false), _is_activationlayer_enabled(false)
+      _workspace(), _B_pretransposed(), _append_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false), _is_quantized(false), _is_interleaved(false),
+      _is_activationlayer_enabled(false)
 {
 }
@@ -365,7 +366,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
     // Configure matrix multiply
     if(run_optimised)
     {
-        if(!setup_assembly_kernel(&_input_im2col_reshaped, weights, &_gemm_output, 1.f, 0.f, _workspace, _memory_group, _asm_glue))
+        if(!setup_assembly_kernel(&_input_im2col_reshaped, weights, &_gemm_output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue))
         {
             ARM_COMPUTE_ERROR("setup_assembly_kernel failed.");
         }
@@ -559,6 +560,11 @@ void NEGEMMConvolutionLayer::run()
     if(_asm_glue._optimised_kernel != nullptr)
     {
         _asm_glue.run();
+        // Release weights in case buffer is pretransposed
+        if(_B_pretransposed.buffer() != nullptr && _weights_reshaped.is_used())
+        {
+            _weights_reshaped.allocator()->free();
+        }
     }
     else
     {
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index 27dd6c51d7..bd81bf202f 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -39,7 +39,7 @@ using namespace arm_compute;
 
 NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(),
-      _workspace()
+      _workspace(), _B_pretransposed()
 {
 }
@@ -58,13 +58,13 @@ void NEGEMMLowpAssemblyMatrixMultiplyCore::configure(const ITensor *a, const ITe
     {
         case DataType::S8:
         {
-            run_optimised = setup_assembly_kernel(a, b, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_signed);
+            run_optimised = setup_assembly_kernel(a, b, output, 1.f, 1.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue_signed);
             break;
         }
         case DataType::QASYMM8:
         case DataType::U8:
         {
-            run_optimised = setup_assembly_kernel(a, b, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_unsigned);
+            run_optimised = setup_assembly_kernel(a, b, output, 1.f, 1.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue_unsigned);
             break;
         }
         default:
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index cbec73fc31..30dd289326 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -42,8 +42,8 @@ using namespace arm_compute::misc::shape_calculator;
 
 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
-      _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false),
-      _dot_product_path(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
+      _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _B_pretranspose(), _a_offset(0), _b_offset(0),
+      _run_vector_matrix_multiplication(false), _dot_product_path(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
 {
 }
@@ -62,13 +62,13 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
     {
         case DataType::S8:
        {
-            _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_signed);
+            _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 1.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_signed);
             break;
         }
         case DataType::QASYMM8:
         case DataType::U8:
         {
-            _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_unsigned);
+            _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 1.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_unsigned);
             break;
         }
         default:
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index a1256ac8cb..f4640fb0b6 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -227,7 +227,7 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *
     if(workspace_size > 0)
    {
         const unsigned int alignment = 4096;
-        allocate_workspace(workspace_size, _workspace, _memory_group, alignment, 1);
+        allocate_workspace(workspace_size, _workspace, &_memory_group, alignment, 1);
         _arm_gemm->set_working_space(reinterpret_cast<float *>(_workspace.buffer()));
     }
-- 
cgit v1.2.1
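Note (not part of the patch): setup_assembly_kernel() gains two arguments here, a hint saying whether B may be reshaped only on the first run, and a tensor (_B_pretransposed, or _B_pretranspose in the GEMMLowp core) that receives a pretransposed copy of B. With the hint set, the assembly glue can transpose the constant B operand once, later runs reuse the cached copy, and NEGEMMConvolutionLayer::run() can free _weights_reshaped once the pretransposed buffer exists. The standalone sketch below illustrates that transpose-once-then-reuse pattern; it is not Compute Library code, and every name in it (CachedGemm, _bt, pretranspose_b) is hypothetical.

// pretranspose_demo.cpp - standalone illustration of the pattern introduced by
// the patch above: cache a pretransposed copy of B on the first run, reuse it
// on later runs, and release the original storage. All names are hypothetical.
#include <cstddef>
#include <iostream>
#include <vector>

class CachedGemm
{
public:
    // B is (k x n), row-major. When pretranspose_b is true, B is transposed once
    // on the first call to run() and the original buffer is then freed, mirroring
    // the patch's _B_pretransposed tensor and the free() of _weights_reshaped.
    CachedGemm(std::vector<float> b, std::size_t k, std::size_t n, bool pretranspose_b)
        : _b(std::move(b)), _k(k), _n(n), _pretranspose_b(pretranspose_b)
    {
    }

    // Computes D = A * B for a row-major A of shape (m x k).
    std::vector<float> run(const std::vector<float> &a, std::size_t m)
    {
        if(_pretranspose_b && _bt.empty())
        {
            // First run only: build the column-major (transposed) copy of B...
            _bt.resize(_k * _n);
            for(std::size_t r = 0; r < _k; ++r)
            {
                for(std::size_t c = 0; c < _n; ++c)
                {
                    _bt[c * _k + r] = _b[r * _n + c];
                }
            }
            // ...then release the original, since later runs never touch it.
            _b.clear();
            _b.shrink_to_fit();
        }

        std::vector<float> d(m * _n, 0.f);
        for(std::size_t i = 0; i < m; ++i)
        {
            for(std::size_t c = 0; c < _n; ++c)
            {
                float acc = 0.f;
                for(std::size_t r = 0; r < _k; ++r)
                {
                    // The transposed layout gives unit-stride reads over B's column.
                    acc += a[i * _k + r] * (_pretranspose_b ? _bt[c * _k + r] : _b[r * _n + c]);
                }
                d[i * _n + c] = acc;
            }
        }
        return d;
    }

private:
    std::vector<float> _b;  // original B; freed after the first run when pretransposing
    std::vector<float> _bt; // cached pretransposed B, built on the first run
    std::size_t        _k;
    std::size_t        _n;
    bool               _pretranspose_b;
};

int main()
{
    // 2x2 identity as B, so D should equal A on every run.
    CachedGemm gemm({ 1.f, 0.f, 0.f, 1.f }, 2, 2, /* pretranspose_b = */ true);
    const std::vector<float> a{ 1.f, 2.f, 3.f, 4.f };
    for(int run = 0; run < 2; ++run) // the second run reuses the cached transpose
    {
        const auto d = gemm.run(a, 2);
        std::cout << d[0] << " " << d[1] << " " << d[2] << " " << d[3] << "\n";
    }
    return 0;
}

In the patch itself, NEGEMM forwards GEMMInfo::reshape_b_only_on_first_run() as the hint, while the convolution and GEMMLowp paths pass true outright, since their B operand holds constant weights.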