 arm_compute/core/Types.h                               |  2
 src/gpu/cl/operators/ClFullyConnected.cpp              | 54
 src/gpu/cl/operators/ClFullyConnected.h                |  8
 src/runtime/CL/functions/CLFullyConnectedLayer.cpp     | 14
 tests/validation/CL/FullyConnectedLayer.cpp            | 24
 tests/validation/fixtures/FullyConnectedLayerFixture.h |  2
 utils/TypePrinter.h                                    |  1
 7 files changed, 78 insertions(+), 27 deletions(-)
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 8434611f7a..fd45bbaa1e 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1833,7 +1833,7 @@ struct FullyConnectedLayerInfo
/* Information about weights */
DataLayout weights_trained_layout{ DataLayout::NCHW }; /**< Layout that the weights have been trained with. */
bool transpose_weights{ true }; /**< Transpose weights if true. */
- bool are_weights_reshaped{ false }; /**< Reshape the weights tensor if false. */
+ bool are_weights_reshaped{ false }; /**< @deprecated Reshape the weights tensor if false. */
bool retain_internal_weights{ false }; /**< Retain internal reshaped weights. */
bool enable_fast_math{ false }; /**< Enable fast math computation. */
/* Other parameters */
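
With are_weights_reshaped deprecated, the supported way to request dynamic weights is to clear the constant-values flag on the weights' tensor info, which the operator checks via are_values_constant(). A minimal sketch of the caller side, assuming the public ITensorInfo::set_are_values_constant() setter:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

// Weights whose values change between runs: mark them non-constant instead
// of touching the deprecated are_weights_reshaped field.
TensorInfo weights_info(TensorShape(128U, 64U), 1, DataType::F32);
weights_info.set_are_values_constant(false);

FullyConnectedLayerInfo fc_info{};
fc_info.transpose_weights = true; // the FC transpose now happens on every run
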
diff --git a/src/gpu/cl/operators/ClFullyConnected.cpp b/src/gpu/cl/operators/ClFullyConnected.cpp
index 8afd036e7c..b289cc0104 100644
--- a/src/gpu/cl/operators/ClFullyConnected.cpp
+++ b/src/gpu/cl/operators/ClFullyConnected.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -162,7 +162,7 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe
const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
false, // is_b_reshaped
- true, // reshape_b_only_on_first_run
+ !_dynamic_weights, // reshape_b_only_on_first_run
0, // depth_output_gemm3d
false, // reinterpret_input_as_3d
fc_info.retain_internal_weights, // retain_internal_weights
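
reshape_b_only_on_first_run is the knob that lets the GEMM backend cache the reshaped B matrix (the weights) across runs; with dynamic weights that cache would be stale, so the flag is disabled. A minimal illustration of the two regimes, relying on GEMMInfo's defaulted trailing arguments:

// Static weights: B is reshaped once on the first run and reused afterwards.
const GEMMInfo static_weights_info(false /*is_a_reshaped*/, false /*is_b_reshaped*/, true /*reshape_b_only_on_first_run*/);

// Dynamic weights: B changes between runs and must be reshaped every time.
const GEMMInfo dynamic_weights_info(false, false, false);
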
@@ -240,6 +240,7 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso
_is_prepared = fc_info.retain_internal_weights;
_weights_to_use = TensorInfo(*weights);
_weights_to_use_idx = ACL_SRC_1;
+ _dynamic_weights = !weights->are_values_constant() && !_are_weights_reshaped;
// With the Fully Connected layer we can have 4 different cases:
// 1) Convolution layer -> Fully Connected layer without batches
@@ -310,8 +311,15 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso
if(_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs
{
// Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
- _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), MemoryLifetime::Prepare, _reshaped_weights.total_size());
- _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Prepare, _converted_weights.total_size());
+ // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time
+ _aux_mem[TransposedWeights] = MemoryInfo(
+ offset_int_vec(TransposedWeights),
+ _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
+ _reshaped_weights.total_size());
+ _aux_mem[ConvertedWeights] = MemoryInfo(
+ offset_int_vec(ConvertedWeights),
+ _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
+ _converted_weights.total_size());
}
else
{
@@ -319,8 +327,14 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso
const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare;
const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare;
- _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), transposed_wei_lft, _reshaped_weights.total_size());
- _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), converted_wei_lft, _converted_weights.total_size());
+ _aux_mem[TransposedWeights] = MemoryInfo(
+ offset_int_vec(TransposedWeights),
+ _dynamic_weights ? MemoryLifetime::Temporary : transposed_wei_lft,
+ _reshaped_weights.total_size());
+ _aux_mem[ConvertedWeights] = MemoryInfo(
+ offset_int_vec(ConvertedWeights),
+ _dynamic_weights ? MemoryLifetime::Temporary : converted_wei_lft,
+ _converted_weights.total_size());
}
_aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
}
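
The lifetime downgrade above follows from the semantics of experimental::MemoryLifetime: Prepare buffers can be released once prepare() finishes, Persistent ones survive across runs, and Temporary ones exist only for the duration of a single run. Since dynamic weights are re-reshaped and re-converted on every run, their auxiliary buffers only ever need run-local storage. A compact sketch of the selection logic, under those assumptions:

using arm_compute::experimental::MemoryLifetime;

// Sketch of the lifetime choice: dynamic weights force run-local buffers;
// otherwise keep whichever buffer the GEMM reads at runtime and release the rest.
MemoryLifetime weights_buffer_lifetime(bool dynamic_weights, bool read_at_runtime)
{
    if(dynamic_weights)
    {
        return MemoryLifetime::Temporary;
    }
    return read_at_runtime ? MemoryLifetime::Persistent : MemoryLifetime::Prepare;
}
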
@@ -334,7 +348,6 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
&& fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
- ARM_COMPUTE_RETURN_ERROR_ON(!weights->are_values_constant() && (!fc_info.are_weights_reshaped || fc_info.transpose_weights));
bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
bool is_fc_after_conv = true;
@@ -420,6 +433,11 @@ void ClFullyConnected::run(ITensorPack &tensors)
{
prepare(tensors);
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+ ++_asrt_run_count;
+ ARM_COMPUTE_ERROR_ON(_dynamic_weights && _asrt_prepare_count != _asrt_run_count);
+#endif // ARM_COMPUTE_ASSERTS_ENABLED
+
auto src = tensors.get_const_tensor(ACL_SRC_0);
CLAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false);
@@ -452,8 +470,13 @@ void ClFullyConnected::run(ITensorPack &tensors)
void ClFullyConnected::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if(!_is_prepared || _dynamic_weights)
{
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+ ++_asrt_prepare_count;
+ ARM_COMPUTE_ERROR_ON(!_dynamic_weights && _asrt_prepare_count > 1);
+#endif // ARM_COMPUTE_ASSERTS_ENABLED
+
auto weights = tensors.get_const_tensor(ACL_SRC_1);
CLAuxTensorHandler reshaped_weights(offset_int_vec(TransposedWeights), _reshaped_weights, tensors, false);
@@ -462,7 +485,7 @@ void ClFullyConnected::prepare(ITensorPack &tensors)
// Pointer to current weights
const ITensor *cur_weights = weights;
- // Reshape of the weights if needed (happens only once)
+ // Reshape of the weights if needed
if(!_are_weights_reshaped)
{
// Run reshape weights kernel and mark weights as unused
@@ -471,11 +494,9 @@ void ClFullyConnected::prepare(ITensorPack &tensors)
cur_weights->mark_as_unused();
cur_weights = reshaped_weights.get();
-
- _are_weights_reshaped = true;
}
- // Convert weights if needed (happens only once)
+ // Convert weights if needed
if(!_are_weights_converted)
{
ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } };
@@ -483,20 +504,19 @@ void ClFullyConnected::prepare(ITensorPack &tensors)
cur_weights->mark_as_unused();
cur_weights = converted_weights.get();
-
- _are_weights_converted = true;
}
- tensors.add_const_tensor(ACL_SRC_1, cur_weights);
+ ITensorPack gemm_pack = tensors;
+ gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights);
// Run the GEMM prepare stage and release unused weights
if(!_is_quantized)
{
- _mm_gemm->prepare(tensors);
+ _mm_gemm->prepare(gemm_pack);
}
else
{
- _mm_gemmlowp->prepare(tensors);
+ _mm_gemmlowp->prepare(gemm_pack);
}
_is_prepared = true;
}
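
Copying the pack before overriding ACL_SRC_1 matters precisely because prepare() can now run more than once: mutating the caller's pack would make the next iteration's reshape read the previous iteration's reshaped buffer instead of the fresh user weights. A hedged sketch of the pattern; GemmOp is a hypothetical stand-in for the ClGemm / ClGemmLowpMatrixMultiplyCore member:

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

template <typename GemmOp>
void prepare_gemm(ITensorPack &tensors, const ITensor *processed_weights, GemmOp &gemm)
{
    ITensorPack gemm_pack = tensors; // copy: the caller's ACL_SRC_1 keeps pointing at the raw weights
    gemm_pack.add_const_tensor(ACL_SRC_1, processed_weights);
    gemm.prepare(gemm_pack);
}
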
diff --git a/src/gpu/cl/operators/ClFullyConnected.h b/src/gpu/cl/operators/ClFullyConnected.h
index b5ac70c93b..d08d5db8a4 100644
--- a/src/gpu/cl/operators/ClFullyConnected.h
+++ b/src/gpu/cl/operators/ClFullyConnected.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -132,6 +132,12 @@ private:
bool _is_fc_after_conv{ true };
bool _is_quantized{ false };
bool _is_prepared{ false };
+ bool _dynamic_weights{ false };
+
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+ int _asrt_run_count{};
+ int _asrt_prepare_count{};
+#endif // ARM_COMPUTE_ASSERTS_ENABLED
};
} // namespace opencl
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 02b2042a6c..1c162db79a 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,6 +46,7 @@ struct CLFullyConnectedLayer::Impl
experimental::MemoryRequirements aux_mem_req{};
bool is_prepared{ false };
+ bool dynamic_weights{ false };
};
CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
@@ -96,6 +97,12 @@ void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, c
_impl->run_pack.add_tensor(ACL_SRC_0, input);
_impl->run_pack.add_tensor(ACL_DST, output);
}
+
+ _impl->dynamic_weights =
+ !weights->info()->are_values_constant() &&
+ fc_info.transpose_weights &&
+ !fc_info.are_weights_reshaped &&
+ !fc_info.retain_internal_weights;
}
Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
@@ -106,7 +113,10 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
void CLFullyConnectedLayer::run()
{
- prepare();
+ if(!_impl->dynamic_weights)
+ {
+ prepare();
+ }
MemoryGroupResourceScope scope_mg(_impl->memory_group);
_impl->op->run(_impl->run_pack);
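
At the public API level the change means a fully connected layer can be configured once and fed new weights on every run. A minimal usage sketch (shapes illustrative, data filling and error handling omitted; assumes a default-initialised CL scheduler):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"

using namespace arm_compute;

CLScheduler::get().default_init();

CLTensor src, weights, bias, dst;
src.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
TensorInfo w_info(TensorShape(128U, 64U), 1, DataType::F32);
w_info.set_are_values_constant(false); // weights change between runs
weights.allocator()->init(w_info);
bias.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
dst.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));

CLFullyConnectedLayer fc;
fc.configure(&src, &weights, &bias, &dst, FullyConnectedLayerInfo());

src.allocator()->allocate();
weights.allocator()->allocate();
bias.allocator()->allocate();
dst.allocator()->allocate();

for(int i = 0; i < 3; ++i)
{
    // ... refill `weights` (and `src`) with this iteration's values ...
    fc.run(); // the weights are re-transposed internally on each call
}
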
diff --git a/tests/validation/CL/FullyConnectedLayer.cpp b/tests/validation/CL/FullyConnectedLayer.cpp
index fcfae4e156..9213ab541d 100644
--- a/tests/validation/CL/FullyConnectedLayer.cpp
+++ b/tests/validation/CL/FullyConnectedLayer.cpp
@@ -150,6 +150,12 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerFixture<half>, framework::
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
}
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(),
+ framework::dataset::make("DataType", DataType::F16)),
+ framework::dataset::make("ActivationInfo", ActivationLayerInfo())),
+ framework::dataset::make("WeightsReshaped", { false, true })))
+{
+}
TEST_SUITE_END()
TEST_SUITE(FP32)
@@ -173,9 +179,9 @@ FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLFullyConnectedLayerMixedDataLayoutF
validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32);
}
FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(),
- framework::dataset::make("DataType", DataType::F32)),
- framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))),
- framework::dataset::make("WeightsReshaped", { true })))
+ framework::dataset::make("DataType", DataType::F32)),
+ framework::dataset::make("ActivationInfo", ActivationLayerInfo())),
+ framework::dataset::make("WeightsReshaped", { false, true })))
{
}
FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters),
@@ -223,6 +229,12 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerQuantizedFixture<uint8_t>,
// Validate output
validate(CLAccessor(_target), _reference, tolerance_qasymm8);
}
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(),
+ framework::dataset::make("DataType", DataType::QASYMM8)),
+ framework::dataset::make("ActivationInfo", ActivationLayerInfo())),
+ framework::dataset::make("WeightsReshaped", { false /* COMPMID-6000: Support FullyConnected with quantized dynamic weights already reshaped */ })))
+{
+}
TEST_SUITE_END() /* QASYMM8 */
TEST_SUITE(QASYMM8_SIGNED)
FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
@@ -246,6 +258,12 @@ FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLFullyConnectedLayerQuantizedMixedDa
// Validate output
validate(CLAccessor(_target), _reference, tolerance_qasymm8);
}
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(),
+ framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+ framework::dataset::make("ActivationInfo", ActivationLayerInfo())),
+ framework::dataset::make("WeightsReshaped", { false /* COMPMID-6000: Support FullyConnected with quantized dynamic weights already reshaped */ })))
+{
+}
TEST_SUITE_END() // QASYMM8_SIGNED
TEST_SUITE_END() // Quantized
TEST_SUITE_END() // FullyConnectedLayer
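
The RunDynamicWeights cases drive exactly that loop through the validation framework: CLFullyConnectedLayerDynamicWeightsFixture configures the function once, then repeatedly refreshes the tensors, runs, and checks against a recomputed reference. A hedged sketch of the shape of such a loop; fill_tensor() and compute_reference() are hypothetical stand-ins for the fixture's helpers, not the actual fixture code:

for(int iteration = 0; iteration < num_iterations; ++iteration)
{
    fill_tensor(src, iteration);     // fresh input values
    fill_tensor(weights, iteration); // fresh weight values: no caching allowed
    fc.run();
    validate_with_tolerance(dst, compute_reference(src, weights, bias, iteration));
}
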
diff --git a/tests/validation/fixtures/FullyConnectedLayerFixture.h b/tests/validation/fixtures/FullyConnectedLayerFixture.h
index 7d1aa494ba..75bef144ad 100644
--- a/tests/validation/fixtures/FullyConnectedLayerFixture.h
+++ b/tests/validation/fixtures/FullyConnectedLayerFixture.h
@@ -333,7 +333,6 @@ private:
validate(AccessorType(target), ref, rel_tolerance_f32, 0, abs_tolerance_f32);
}
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
void validate_with_tolerance(TensorType &target, SimpleTensor<half_float::half> &ref)
{
constexpr AbsoluteTolerance<float> abs_tolerance_f16(0.3f);
@@ -342,7 +341,6 @@ private:
validate(AccessorType(target), ref, rel_tolerance_f16, tolerance_num_f16, abs_tolerance_f16);
}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
void validate_with_tolerance(TensorType &target, SimpleTensor<uint8_t> &ref)
{
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 9b9c7b5b34..d40c3eb3d7 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -2772,7 +2772,6 @@ inline ::std::ostream &operator<<(::std::ostream &os, const FullyConnectedLayerI
<< "transpose_weights=" << layer_info.transpose_weights << ", "
<< "are_weights_reshaped=" << layer_info.are_weights_reshaped << ", "
<< "retain_internal_weights=" << layer_info.retain_internal_weights << ", "
- << "constant_weights=" << layer_info.transpose_weights << ", "
<< "fp_mixed_precision=" << layer_info.fp_mixed_precision << "}";
return os;
}