 src/runtime/NEON/functions/NEFullyConnectedLayer.cpp           | 19
 src/runtime/cpu/operators/CpuFullyConnected.cpp                | 56
 src/runtime/cpu/operators/CpuFullyConnected.h                  | 12
 src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp    |  9
 src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp |  2
 5 files changed, 45 insertions(+), 53 deletions(-)
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index d815a73b93..504200e9ce 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -44,7 +44,6 @@ struct NEFullyConnectedLayer::Impl
const ITensor *original_weights{ nullptr };
ITensorPack run_pack{};
- ITensorPack prep_pack{};
WorkspaceData<Tensor> workspace{};
experimental::MemoryRequirements aux_mem_req{};
@@ -79,8 +78,7 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh
_impl->aux_mem_req = _impl->op->workspace();
_impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
- _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } };
- _impl->workspace = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ _impl->workspace = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
}
Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
@@ -101,20 +99,7 @@ void NEFullyConnectedLayer::prepare()
{
if(!_impl->is_prepared)
{
- _impl->op->prepare(_impl->prep_pack);
-
- auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
- _impl->aux_mem_req.end(),
- [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
-
- if(has_reshape != std::end(_impl->aux_mem_req))
- {
- _impl->original_weights->mark_as_unused();
- }
- else
- {
- _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->original_weights);
- }
+ _impl->op->prepare(_impl->run_pack);
// Release temporary tensors that are only used in prepare stage
release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace);
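Note on the NEON wrapper change: with prep_pack removed, prepare() simply forwards the single run pack to the operator, and the original-weights bookkeeping that used to live here moves into the operators themselves (see the CpuFullyConnected and CpuGemmAssemblyDispatch hunks below). A minimal sketch of the resulting function, reconstructed from the hunks above; the trailing is_prepared update is assumed since it falls outside the shown context:

void NEFullyConnectedLayer::prepare()
{
    if(!_impl->is_prepared)
    {
        // The operator now reads weights/biases straight from the single run pack.
        _impl->op->prepare(_impl->run_pack);

        // Release temporary tensors that are only used in the prepare stage.
        release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace);

        _impl->is_prepared = true; // assumed: outside the diff context shown above
    }
}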
diff --git a/src/runtime/cpu/operators/CpuFullyConnected.cpp b/src/runtime/cpu/operators/CpuFullyConnected.cpp
index e7808fbc82..eeabce0753 100644
--- a/src/runtime/cpu/operators/CpuFullyConnected.cpp
+++ b/src/runtime/cpu/operators/CpuFullyConnected.cpp
@@ -150,9 +150,11 @@ CpuFullyConnected::CpuFullyConnected()
_flattened_src(),
_converted_weights(),
_reshaped_weights(),
+ _trans_weights(),
+ _trans_weights_idx(AuxTensorIdx::Count),
_aux_mem(Count),
- _are_weights_converted(false),
- _are_weights_reshaped(false),
+ _needs_weights_conversion(false),
+ _needs_weights_reshape(false),
_is_fc_after_conv(false),
_is_quantized_asymmetric(false),
_is_prepared(false)
@@ -230,11 +232,13 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
dst,
fc_info));
- _are_weights_converted = true;
- _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
- _is_fc_after_conv = true;
- _is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
- _is_prepared = false;
+ _needs_weights_conversion = false;
+ _needs_weights_reshape = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false;
+ _needs_weights_reshape = _needs_weights_reshape && !fc_info.retain_internal_weights;
+ _is_fc_after_conv = true;
+ _is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
+ _is_prepared = false;
+ _trans_weights_idx = AuxTensorIdx::Count;
// With the Fully Connected layer we can have 4 different cases:
// 1) Convolution layer -> Fully Connected layer without batches
@@ -258,12 +262,13 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
}
// Reshape weights if needed
- if(!_are_weights_reshaped)
+ if(_needs_weights_reshape)
{
// Reshape the weights
_transpose_weights = std::make_unique<kernels::CpuTransposeKernel>();
_transpose_weights->configure(weights, &_reshaped_weights);
- weights_to_use = &_reshaped_weights;
+ weights_to_use     = &_reshaped_weights;
+ _trans_weights_idx = AuxTensorIdx::TransposedWeights;
}
// Convert weights if needed
@@ -276,8 +281,9 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
src->tensor_shape(),
fc_info.weights_trained_layout);
- weights_to_use = &_converted_weights;
- _are_weights_converted = false;
+ weights_to_use            = &_converted_weights;
+ _needs_weights_conversion = true;
+ _trans_weights_idx        = AuxTensorIdx::ConvertedWeights;
}
if(_is_fc_after_conv)
@@ -291,7 +297,11 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
configure_fc_fc(src, weights_to_use, biases, dst, fc_info.activation_info);
}
- _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+ // Retain the tensorinfo with the weights to use
+ if(_needs_weights_reshape || _needs_weights_conversion)
+ {
+ _trans_weights = *weights_to_use;
+ }
// Set auxiliary memory requirements
auto gemm_mem_req = (_is_quantized_asymmetric) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
@@ -308,7 +318,7 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
}
else
{
- _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), MemoryLifetime::Persistent, _reshaped_weights.total_size());
+ _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), _needs_weights_conversion ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, _reshaped_weights.total_size());
_aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Persistent, _converted_weights.total_size());
}
_aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
@@ -401,6 +411,7 @@ void CpuFullyConnected::run(ITensorPack &tensors)
auto src = tensors.get_const_tensor(ACL_SRC_0);
CpuAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false);
+ CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false);
// Linearize src if it comes from a convolutional layer
if(_is_fc_after_conv)
@@ -411,6 +422,10 @@ void CpuFullyConnected::run(ITensorPack &tensors)
ITensorPack gemm_pack = tensors;
gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
+ if(_needs_weights_reshape || _needs_weights_conversion)
+ {
+ gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get());
+ }
// Run matrix multiply
if(_is_quantized_asymmetric)
@@ -436,7 +451,7 @@ void CpuFullyConnected::prepare(ITensorPack &tensors)
const ITensor *cur_weights = weights;
// Reshape of the weights (happens only once)
- if(!_are_weights_reshaped)
+ if(_needs_weights_reshape)
{
// Run reshape weights kernel and mark weights as unused
ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } };
@@ -444,32 +459,29 @@ void CpuFullyConnected::prepare(ITensorPack &tensors)
cur_weights->mark_as_unused();
cur_weights = reshaped_weights.get();
-
- _are_weights_reshaped = true;
}
// Convert weights if needed (happens only once)
- if(!_are_weights_converted)
+ if(_needs_weights_conversion)
{
ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } };
_convert_weights->run(convert_pack);
cur_weights->mark_as_unused();
cur_weights = converted_weights.get();
-
- _are_weights_converted = true;
}
- tensors.add_const_tensor(ACL_SRC_1, cur_weights);
+ ITensorPack gemm_pack = tensors;
+ gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights);
// Prepare GEMM prepare and release unused weights
if(!_is_quantized_asymmetric)
{
- _mm_gemm->prepare(tensors);
+ _mm_gemm->prepare(gemm_pack);
}
else
{
- _mm_gemmlowp->prepare(tensors);
+ _mm_gemmlowp->prepare(gemm_pack);
}
_is_prepared = true;
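How the pieces above fit together at run time: run() resolves the transformed weights through a CpuAuxTensorHandler keyed by _trans_weights_idx and injects them into a local copy of the pack, so the caller's pack is no longer mutated. The sketch below is pieced together from the hunks above and is not a verbatim excerpt; the prepare() call, the flatten step and the final dispatch are assumed from the surrounding context.

void CpuFullyConnected::run(ITensorPack &tensors) // sketch, not verbatim
{
    prepare(tensors); // assumed: standard cpu-operator pattern

    auto src = tensors.get_const_tensor(ACL_SRC_0);

    CpuAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false);
    CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false);

    // Linearize src if it comes from a convolutional layer (flatten kernel call omitted here).

    ITensorPack gemm_pack = tensors;
    gemm_pack.add_const_tensor(ACL_SRC_0, _is_fc_after_conv ? flattened_src.get() : src);
    if(_needs_weights_reshape || _needs_weights_conversion)
    {
        // Hand the GEMM the reshaped/converted weights instead of the original ACL_SRC_1.
        gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get());
    }

    // Run matrix multiply on the quantized or float back-end.
    if(_is_quantized_asymmetric)
    {
        _mm_gemmlowp->run(gemm_pack);
    }
    else
    {
        _mm_gemm->run(gemm_pack);
    }
}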
diff --git a/src/runtime/cpu/operators/CpuFullyConnected.h b/src/runtime/cpu/operators/CpuFullyConnected.h
index 954a7b7ffc..498ceae68d 100644
--- a/src/runtime/cpu/operators/CpuFullyConnected.h
+++ b/src/runtime/cpu/operators/CpuFullyConnected.h
@@ -128,14 +128,16 @@ private:
std::unique_ptr<CpuGemm> _mm_gemm;
std::unique_ptr<CpuGemmLowpMatrixMultiplyCore> _mm_gemmlowp;
- TensorInfo _flattened_src;
- TensorInfo _converted_weights;
- TensorInfo _reshaped_weights;
+ TensorInfo   _flattened_src;
+ TensorInfo   _converted_weights;
+ TensorInfo   _reshaped_weights;
+ TensorInfo   _trans_weights;
+ AuxTensorIdx _trans_weights_idx;
experimental::MemoryRequirements _aux_mem;
- bool _are_weights_converted;
- bool _are_weights_reshaped;
+ bool _needs_weights_conversion;
+ bool _needs_weights_reshape;
bool _is_fc_after_conv;
bool _is_quantized_asymmetric;
bool _is_prepared;
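The two new members work as a pair: _trans_weights caches the TensorInfo of whichever transformed weights the GEMM will consume, and _trans_weights_idx records the auxiliary slot that backs them, with AuxTensorIdx::Count acting as a "no transformation" sentinel. A hedged summary of how configure() populates them, condensed from the CpuFullyConnected.cpp hunks above; needs_conversion is a hypothetical stand-in for the existing layout-mismatch check, which is outside the shown context:

_needs_weights_reshape    = fc_info.transpose_weights && !fc_info.are_weights_reshaped
                            && !fc_info.retain_internal_weights;
_needs_weights_conversion = false;
_trans_weights_idx        = AuxTensorIdx::Count; // sentinel: no transformed weights

if(_needs_weights_reshape)
{
    weights_to_use     = &_reshaped_weights;
    _trans_weights_idx = AuxTensorIdx::TransposedWeights;
}
if(needs_conversion) // hypothetical name for the layout-mismatch condition
{
    weights_to_use            = &_converted_weights;
    _needs_weights_conversion = true;
    _trans_weights_idx        = AuxTensorIdx::ConvertedWeights; // conversion output wins if both run
}
if(_needs_weights_reshape || _needs_weights_conversion)
{
    _trans_weights = *weights_to_use; // the TensorInfo that run() materializes via CpuAuxTensorHandler
}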
diff --git a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
index 8adf7047fd..f22446863c 100644
--- a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
@@ -672,15 +672,6 @@ void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
if(_asm_glue->is_configured())
{
_asm_glue->prepare(tensors);
-
- auto has_reshape = std::find_if(_aux_mem.begin(),
- _aux_mem.end(),
- [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
-
- if(has_reshape != std::end(_aux_mem))
- {
- original_b->mark_as_unused();
- }
}
// Run non-assembly reshape
else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
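After this removal the low-precision core no longer inspects its own memory requirements to decide whether the original B can be dropped; that decision now sits next to the code that actually consumes B (next hunk). The assembly branch of prepare() reduces to the following (sketch of the branch shown in the hunk above):

if(_asm_glue->is_configured())
{
    // CpuGemmAssemblyDispatch now marks the original B as unused itself,
    // right after pretransposing it (see the following hunk).
    _asm_glue->prepare(tensors);
}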
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index bbbd5ac458..9786161dee 100644
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -424,6 +424,8 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
_gemm_kernel_asm->pretranspose_B_array(pretranspose.get()->buffer(), in1_ptr, ldb, multi_stride_b);
+
+ b->mark_as_unused();
}
if(_gemm_info.method == AsmConvMethod::Indirect)
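The counterpart to the removals above: once the assembly kernel has repacked B into its auxiliary pretranspose buffer, the dispatch releases the original tensor itself instead of relying on callers to infer this from the memory requirements. A sketch of the surrounding step; the guard is assumed, while b, in1_ptr, ldb and multi_stride_b come from the hunk itself:

if(_gemm_kernel_asm->B_pretranspose_required()) // assumed guard, outside the shown context
{
    CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
    ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);

    _gemm_kernel_asm->pretranspose_B_array(pretranspose.get()->buffer(), in1_ptr, ldb, multi_stride_b);

    // B has been consumed into the auxiliary buffer; release the original here
    // rather than in NEFullyConnectedLayer or CpuGemmLowpMatrixMultiplyCore.
    b->mark_as_unused();
}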