author    Jakub Sujak <jakub.sujak@arm.com>    2023-03-29 11:16:18 +0100
committer Jakub Sujak <jakub.sujak@arm.com>    2023-04-04 14:09:21 +0000
commit    617ed50000532877296fff93973590a8ab67f96d (patch)
tree      d571654a2c31dfb6d5ec1529160b9f3d3348938c
parent    f26ea2f8cc957a1e6faf0361dea805fb2e236061 (diff)
Support dynamic weights for Fully Connected layers on GPU
The fully connected function and operator running on GPU have been adapted to
support dynamic weights. Dynamic weights require the weight tensors to be
reshaped and their data layout converted at runtime, in the prepare stage of
the operator. The GPU implementation mirrors the CPU implementation. This
patch also deprecates the `are_weights_reshaped` option in Fully Connected.

Resolves: COMPMID-5870

Change-Id: I28f967695879d82cc91a928d95308a4e0e52a597
Signed-off-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9403
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
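For orientation, here is a minimal sketch (not part of the patch) of how a caller
might exercise the new dynamic-weights path through CLFullyConnectedLayer. The
shapes and the helper name dynamic_weights_sketch are illustrative only; the
sketch assumes the ITensorInfo::set_are_values_constant() setter matching the
are_values_constant() check that the patch relies on.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"

using namespace arm_compute;

void dynamic_weights_sketch()
{
    CLScheduler::get().default_init();

    // Illustrative shapes: 128 inputs, 16 outputs.
    CLTensor src, weights, bias, dst;
    src.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(128U, 16U), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));

    // Non-constant weight values are what select the dynamic-weights path:
    // the operator then re-runs the weight reshape/conversion in every prepare().
    weights.info()->set_are_values_constant(false);

    FullyConnectedLayerInfo fc_info{};
    fc_info.transpose_weights = true; // the defaults already satisfy the dynamic-weights conditions

    CLFullyConnectedLayer fc;
    fc.configure(&src, &weights, &bias, &dst, fc_info);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();

    for(int i = 0; i < 2; ++i)
    {
        // ... write fresh src and weights values here (e.g. via map()/unmap()) ...
        fc.run(); // reshape and layout conversion of the weights happen inside run()
    }
    CLScheduler::get().sync();
}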
-rw-r--r--  arm_compute/core/Types.h                                 2
-rw-r--r--  src/gpu/cl/operators/ClFullyConnected.cpp               54
-rw-r--r--  src/gpu/cl/operators/ClFullyConnected.h                  8
-rw-r--r--  src/runtime/CL/functions/CLFullyConnectedLayer.cpp      14
-rw-r--r--  tests/validation/CL/FullyConnectedLayer.cpp             24
-rw-r--r--  tests/validation/fixtures/FullyConnectedLayerFixture.h   2
-rw-r--r--  utils/TypePrinter.h                                      1
7 files changed, 78 insertions(+), 27 deletions(-)
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 8434611f7a..fd45bbaa1e 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1833,7 +1833,7 @@ struct FullyConnectedLayerInfo
/* Information about weights */
DataLayout weights_trained_layout{ DataLayout::NCHW }; /**< Layout that the weights have been trained with. */
bool transpose_weights{ true }; /**< Transpose weights if true. */
- bool are_weights_reshaped{ false }; /**< Reshape the weights tensor if false. */
+ bool are_weights_reshaped{ false }; /**< @deprecated Reshape the weights tensor if false. */
bool retain_internal_weights{ false }; /**< Retain internal reshaped weights. */
bool enable_fast_math{ false }; /**< Enable fast math computation. */
/* Other parameters */
diff --git a/src/gpu/cl/operators/ClFullyConnected.cpp b/src/gpu/cl/operators/ClFullyConnected.cpp
index 8afd036e7c..b289cc0104 100644
--- a/src/gpu/cl/operators/ClFullyConnected.cpp
+++ b/src/gpu/cl/operators/ClFullyConnected.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -162,7 +162,7 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe
const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
false, // is_b_reshaped
- true, // reshape_b_only_on_first_run
+ !_dynamic_weights, // reshape_b_only_on_first_run
0, // depth_output_gemm3d
false, // reinterpret_input_as_3d
fc_info.retain_internal_weights, // retain_internal_weights
@@ -240,6 +240,7 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso
_is_prepared = fc_info.retain_internal_weights;
_weights_to_use = TensorInfo(*weights);
_weights_to_use_idx = ACL_SRC_1;
+ _dynamic_weights = !weights->are_values_constant() && !_are_weights_reshaped;
// With the Fully Connected layer we can have 4 different cases:
// 1) Convolution layer -> Fully Connected layer without batches
@@ -310,8 +311,15 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso
if(_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs
{
// Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
- _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), MemoryLifetime::Prepare, _reshaped_weights.total_size());
- _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Prepare, _converted_weights.total_size());
+ // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time
+ _aux_mem[TransposedWeights] = MemoryInfo(
+ offset_int_vec(TransposedWeights),
+ _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
+ _reshaped_weights.total_size());
+ _aux_mem[ConvertedWeights] = MemoryInfo(
+ offset_int_vec(ConvertedWeights),
+ _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
+ _converted_weights.total_size());
}
else
{
@@ -319,8 +327,14 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso
const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare;
const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare;
- _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), transposed_wei_lft, _reshaped_weights.total_size());
- _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), converted_wei_lft, _converted_weights.total_size());
+ _aux_mem[TransposedWeights] = MemoryInfo(
+ offset_int_vec(TransposedWeights),
+ _dynamic_weights ? MemoryLifetime::Temporary : transposed_wei_lft,
+ _reshaped_weights.total_size());
+ _aux_mem[ConvertedWeights] = MemoryInfo(
+ offset_int_vec(ConvertedWeights),
+ _dynamic_weights ? MemoryLifetime::Temporary : converted_wei_lft,
+ _converted_weights.total_size());
}
_aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
}
@@ -334,7 +348,6 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
&& fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
- ARM_COMPUTE_RETURN_ERROR_ON(!weights->are_values_constant() && (!fc_info.are_weights_reshaped || fc_info.transpose_weights));
bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
bool is_fc_after_conv = true;
@@ -420,6 +433,11 @@ void ClFullyConnected::run(ITensorPack &tensors)
{
prepare(tensors);
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+ ++_asrt_run_count;
+ ARM_COMPUTE_ERROR_ON(_dynamic_weights && _asrt_prepare_count != _asrt_run_count);
+#endif // ARM_COMPUTE_ASSERTS_ENABLED
+
auto src = tensors.get_const_tensor(ACL_SRC_0);
CLAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false);
@@ -452,8 +470,13 @@ void ClFullyConnected::run(ITensorPack &tensors)
void ClFullyConnected::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if(!_is_prepared || _dynamic_weights)
{
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+ ++_asrt_prepare_count;
+ ARM_COMPUTE_ERROR_ON(!_dynamic_weights && _asrt_prepare_count > 1);
+#endif // ARM_COMPUTE_ASSERTS_ENABLED
+
auto weights = tensors.get_const_tensor(ACL_SRC_1);
CLAuxTensorHandler reshaped_weights(offset_int_vec(TransposedWeights), _reshaped_weights, tensors, false);
@@ -462,7 +485,7 @@ void ClFullyConnected::prepare(ITensorPack &tensors)
// Pointer to current weights
const ITensor *cur_weights = weights;
- // Reshape of the weights if needed (happens only once)
+ // Reshape of the weights if needed
if(!_are_weights_reshaped)
{
// Run reshape weights kernel and mark weights as unused
@@ -471,11 +494,9 @@ void ClFullyConnected::prepare(ITensorPack &tensors)
cur_weights->mark_as_unused();
cur_weights = reshaped_weights.get();
-
- _are_weights_reshaped = true;
}
- // Convert weights if needed (happens only once)
+ // Convert weights if needed
if(!_are_weights_converted)
{
ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } };
@@ -483,20 +504,19 @@ void ClFullyConnected::prepare(ITensorPack &tensors)
cur_weights->mark_as_unused();
cur_weights = converted_weights.get();
-
- _are_weights_converted = true;
}
- tensors.add_const_tensor(ACL_SRC_1, cur_weights);
+ ITensorPack gemm_pack = tensors;
+ gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights);
// Prepare GEMM and release unused weights
if(!_is_quantized)
{
- _mm_gemm->prepare(tensors);
+ _mm_gemm->prepare(gemm_pack);
}
else
{
- _mm_gemmlowp->prepare(tensors);
+ _mm_gemmlowp->prepare(gemm_pack);
}
_is_prepared = true;
}
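The ARM_COMPUTE_ASSERTS_ENABLED counters above pin down the contract between
run() and prepare(): static weights are prepared at most once, while dynamic
weights must be re-prepared on every run. A toy model of that invariant,
illustrative only and not library code:

#include <cassert>

// Toy model of the prepare()/run() bookkeeping introduced by this patch.
struct PrepareRunModel
{
    bool dynamic_weights{ false };
    bool is_prepared{ false };
    int  prepare_count{ 0 };
    int  run_count{ 0 };

    void prepare()
    {
        if(!is_prepared || dynamic_weights)
        {
            ++prepare_count;
            // Static weights are prepared at most once ...
            assert(dynamic_weights || prepare_count <= 1);
            is_prepared = true;
        }
    }

    void run()
    {
        prepare();
        ++run_count;
        // ... while dynamic weights are re-prepared on every run.
        assert(!dynamic_weights || prepare_count == run_count);
    }
};

int main()
{
    PrepareRunModel m;
    m.dynamic_weights = true;
    m.run(); // prepare_count == run_count == 1
    m.run(); // prepare_count == run_count == 2
}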
diff --git a/src/gpu/cl/operators/ClFullyConnected.h b/src/gpu/cl/operators/ClFullyConnected.h
index b5ac70c93b..d08d5db8a4 100644
--- a/src/gpu/cl/operators/ClFullyConnected.h
+++ b/src/gpu/cl/operators/ClFullyConnected.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -132,6 +132,12 @@ private:
bool _is_fc_after_conv{ true };
bool _is_quantized{ false };
bool _is_prepared{ false };
+ bool _dynamic_weights{ false };
+
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+ int _asrt_run_count{};
+ int _asrt_prepare_count{};
+#endif // ARM_COMPUTE_ASSERTS_ENABLED
};
} // namespace opencl
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 02b2042a6c..1c162db79a 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,6 +46,7 @@ struct CLFullyConnectedLayer::Impl
experimental::MemoryRequirements aux_mem_req{};
bool is_prepared{ false };
+ bool dynamic_weights{ false };
};
CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
@@ -96,6 +97,12 @@ void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, c
_impl->run_pack.add_tensor(ACL_SRC_0, input);
_impl->run_pack.add_tensor(ACL_DST, output);
}
+
+ _impl->dynamic_weights =
+ !weights->info()->are_values_constant() &&
+ fc_info.transpose_weights &&
+ !fc_info.are_weights_reshaped &&
+ !fc_info.retain_internal_weights;
}
Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
@@ -106,7 +113,10 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
void CLFullyConnectedLayer::run()
{
- prepare();
+ if(!_impl->dynamic_weights)
+ {
+ prepare();
+ }
MemoryGroupResourceScope scope_mg(_impl->memory_group);
_impl->op->run(_impl->run_pack);
diff --git a/tests/validation/CL/FullyConnectedLayer.cpp b/tests/validation/CL/FullyConnectedLayer.cpp
index fcfae4e156..9213ab541d 100644
--- a/tests/validation/CL/FullyConnectedLayer.cpp
+++ b/tests/validation/CL/FullyConnectedLayer.cpp
@@ -150,6 +150,12 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerFixture<half>, framework::
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
}
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(),
+ framework::dataset::make("DataType", DataType::F16)),
+ framework::dataset::make("ActivationInfo", ActivationLayerInfo())),
+ framework::dataset::make("WeightsReshaped", { false, true })))
+{
+}
TEST_SUITE_END()
TEST_SUITE(FP32)
@@ -173,9 +179,9 @@ FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLFullyConnectedLayerMixedDataLayoutF
validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32);
}
FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(),
- framework::dataset::make("DataType", DataType::F32)),
- framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))),
- framework::dataset::make("WeightsReshaped", { true })))
+ framework::dataset::make("DataType", DataType::F32)),
+ framework::dataset::make("ActivationInfo", ActivationLayerInfo())),
+ framework::dataset::make("WeightsReshaped", { false, true })))
{
}
FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters),
@@ -223,6 +229,12 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerQuantizedFixture<uint8_t>,
// Validate output
validate(CLAccessor(_target), _reference, tolerance_qasymm8);
}
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(),
+ framework::dataset::make("DataType", DataType::QASYMM8)),
+ framework::dataset::make("ActivationInfo", ActivationLayerInfo())),
+ framework::dataset::make("WeightsReshaped", { false /* COMPMID-6000: Support FullyConnected with quantized dynamic weights already reshaped */ })))
+{
+}
TEST_SUITE_END() /* QASYMM8 */
TEST_SUITE(QASYMM8_SIGNED)
FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
@@ -246,6 +258,12 @@ FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLFullyConnectedLayerQuantizedMixedDa
// Validate output
validate(CLAccessor(_target), _reference, tolerance_qasymm8);
}
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(),
+ framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+ framework::dataset::make("ActivationInfo", ActivationLayerInfo())),
+ framework::dataset::make("WeightsReshaped", { false /* COMPMID-6000: Support FullyConnected with quantized dynamic weights already reshaped */ })))
+{
+}
TEST_SUITE_END() // QASYMM8_SIGNED
TEST_SUITE_END() // Quantized
TEST_SUITE_END() // FullyConnectedLayer
diff --git a/tests/validation/fixtures/FullyConnectedLayerFixture.h b/tests/validation/fixtures/FullyConnectedLayerFixture.h
index 7d1aa494ba..75bef144ad 100644
--- a/tests/validation/fixtures/FullyConnectedLayerFixture.h
+++ b/tests/validation/fixtures/FullyConnectedLayerFixture.h
@@ -333,7 +333,6 @@ private:
validate(AccessorType(target), ref, rel_tolerance_f32, 0, abs_tolerance_f32);
}
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
void validate_with_tolerance(TensorType &target, SimpleTensor<half_float::half> &ref)
{
constexpr AbsoluteTolerance<float> abs_tolerance_f16(0.3f);
@@ -342,7 +341,6 @@ private:
validate(AccessorType(target), ref, rel_tolerance_f16, tolerance_num_f16, abs_tolerance_f16);
}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
void validate_with_tolerance(TensorType &target, SimpleTensor<uint8_t> &ref)
{
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 9b9c7b5b34..d40c3eb3d7 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -2772,7 +2772,6 @@ inline ::std::ostream &operator<<(::std::ostream &os, const FullyConnectedLayerI
<< "transpose_weights=" << layer_info.transpose_weights << ", "
<< "are_weights_reshaped=" << layer_info.are_weights_reshaped << ", "
<< "retain_internal_weights=" << layer_info.retain_internal_weights << ", "
- << "constant_weights=" << layer_info.transpose_weights << ", "
<< "fp_mixed_precision=" << layer_info.fp_mixed_precision << "}";
return os;
}