From aed63ee175e0d64c934389e9d1b2edd0cb1a5cdd Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio
Date: Mon, 26 Jul 2021 13:18:50 +0100
Subject: Add support for non-constant weights and biases in CpuFullyConnected

Changing the approach for specifying that weights and biases tensors are
non-constant by making it a member of TensorInfo rather than an option
of the functions.

Resolves: COMPMID-4222

Change-Id: I96e6f3868f51785c9700a3ef6a1fe7b05747862c
Signed-off-by: Michele Di Giorgio
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6162
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
Reviewed-by: Georgios Pinitas
---
 src/cpu/kernels/assembly/gemm_common.hpp           |  3 +++
 src/cpu/operators/CpuFullyConnected.cpp            | 27 +++++++++++++++----
 .../operators/internal/CpuGemmAssemblyDispatch.cpp | 31 ++++++++++++++++++++++
 3 files changed, 56 insertions(+), 5 deletions(-)

(limited to 'src/cpu')

diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp
index 378f1041be..ece9ca5802 100644
--- a/src/cpu/kernels/assembly/gemm_common.hpp
+++ b/src/cpu/kernels/assembly/gemm_common.hpp
@@ -212,6 +212,9 @@ public:
 
     /*** "Pretransposed" interface ***/
 
+    /* Compute col sums over all columns */
+    virtual void requantize_bias(void *, const To *, const int, const int) {};
+
     /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
     /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
     virtual void pretranspose_B_array(void *, const To *, const int, const int) {};
diff --git a/src/cpu/operators/CpuFullyConnected.cpp b/src/cpu/operators/CpuFullyConnected.cpp
index cafb3484b6..d952724cdc 100644
--- a/src/cpu/operators/CpuFullyConnected.cpp
+++ b/src/cpu/operators/CpuFullyConnected.cpp
@@ -312,9 +312,14 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
 
     if(_aux_mem[Pretranspose].size > 0)
     {
-        // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch
-        _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), MemoryLifetime::Prepare, _reshaped_weights.total_size());
-        _aux_mem[ConvertedWeights]  = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Prepare, _converted_weights.total_size());
+        // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
+        // Do not release them if biases are dynamic and data type is quantized, since the weights tensor will be used for biases offset calculation
+        _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), (_is_quantized_asymmetric
+                                                                                     && !(biases->are_values_constant())) ?
+                                                 MemoryLifetime::Persistent :
+                                                 MemoryLifetime::Prepare,
+                                                 _reshaped_weights.total_size());
+        _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Prepare, _converted_weights.total_size());
     }
     else
     {
@@ -332,10 +337,9 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(biases != nullptr && biases->num_dimensions() > 1);
     ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
                                 && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!fc_info.constant_weights, "Non-constant weights are currently not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON(!weights->are_values_constant() && (!fc_info.are_weights_reshaped || fc_info.transpose_weights));
 
     bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
     bool is_fc_after_conv = true;
@@ -356,6 +360,19 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we
     // Check if we have a fully connected layer with batches
     const bool is_batched_fc_layer = dst->dimension(1) > 1;
 
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+        if(is_data_type_quantized(src->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+        }
+    }
+
     if(is_batched_fc_layer)
     {
         is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 97893b0672..1dd6286dbf 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -206,6 +206,7 @@ private:
     std::vector<TypeInput>           _indirect_pad{};
     arm_gemm::ConvolutionParameters  _cp{};
     experimental::MemoryRequirements _aux_mem{ Count };
+    bool                             _B_pretranspose_required{ false };
 };
 
 template <typename TypeInput, typename TypeOutput, class OutputStage>
@@ -391,6 +392,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
         const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
         _pretranspose_info               = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8);
         _aux_mem[Pretranspose]           = MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
+        _B_pretranspose_required         = true;
     }
 
     // Handle indirect GEMM convolution
@@ -485,6 +487,35 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
         in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
     }
 
+    // If necessary, run pretranspose every time if either weights or biases are non-constant
+    if((b && !b->info()->are_values_constant()) || (c && !c->info()->are_values_constant() && c->info()->data_type() == DataType::S32))
+    {
+        if(c && c->info()->data_type() == DataType::S32)
+        {
+            _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
+        }
+
+        // Pretranspose B if required
+        if(_B_pretranspose_required)
+        {
+            const int  ldb            = b->info()->strides_in_bytes().y() / sizeof(TypeInput);
+            const auto b_ptr          = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
+            const int  multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput);
+
+            CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, true);
+            ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
+
+            if(b->info()->are_values_constant())
+            {
+                _gemm_kernel_asm->requantize_bias(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b);
+            }
+            else
+            {
+                _gemm_kernel_asm->pretranspose_B_array(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b);
+            }
+        }
+    }
+
     const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, d->info()->data_type());
 
     // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
--
cgit v1.2.1
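
Caller-side usage sketch (illustrative only, not part of the diff above): with this change, dynamic fully connected weights are signalled through the tensor metadata rather than through FullyConnectedLayerInfo::constant_weights. The sketch assumes the public NEFullyConnectedLayer wrapper and a TensorInfo::set_are_values_constant() setter matching the are_values_constant() getter used in the diff; per the new validate() check, non-constant weights must be supplied already reshaped (transpose_weights = false, are_weights_reshaped = true). Shapes and names below are hypothetical.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"

using namespace arm_compute;

int main()
{
    // src: 128 input features, 1 batch; weights already reshaped to (num_outputs, num_inputs) = (64, 128)
    TensorInfo src_info(TensorShape(128U, 1U), 1, DataType::F32);
    TensorInfo wei_info(TensorShape(64U, 128U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(64U, 1U), 1, DataType::F32);

    // Assumed setter paired with are_values_constant(); marks the weights as dynamic
    wei_info.set_are_values_constant(false);

    Tensor src, wei, dst;
    src.allocator()->init(src_info);
    wei.allocator()->init(wei_info);
    dst.allocator()->init(dst_info);

    // Non-constant weights must be passed pre-reshaped and without transposition
    FullyConnectedLayerInfo fc_info{};
    fc_info.transpose_weights    = false;
    fc_info.are_weights_reshaped = true;

    NEFullyConnectedLayer fc;
    fc.configure(&src, &wei, nullptr, &dst, fc_info);

    src.allocator()->allocate();
    wei.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src and wei; wei may now be updated between runs ...
    fc.run();

    return 0;
}

With the weights flagged this way, CpuGemmAssemblyDispatch::run() above re-runs pretranspose_B_array() on every execution, or calls the new requantize_bias() hook to refresh only the bias-related column sums when just a quantized S32 bias is dynamic.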