From fb45e2f86a6c6ba7ff08554c872c8876820f0a7f Mon Sep 17 00:00:00 2001
From: Pablo Tello
Date: Fri, 18 Oct 2019 16:51:57 +0100
Subject: MLCE-133 Fixed ASR hero use-case

* Added workaround in FC to deal with non const weights
* Added workaround in LSTM to deal with non const weights

Signed-off-by: Pablo Tello
Signed-off-by: Matteo Martincigh
Change-Id: I854eea6a74a6959606ff25b52a0ed80b3e0a18ab
---
 1.2/ArmnnDriverImpl.cpp |  12 +++
 1.2/HalPolicy.cpp       |  82 +++++---------------
 ConversionUtils.hpp     | 201 +++++++++++++++++++++++++++++++++++++++++++++---
 Utils.cpp               |   1 +
 4 files changed, 222 insertions(+), 74 deletions(-)

diff --git a/1.2/ArmnnDriverImpl.cpp b/1.2/ArmnnDriverImpl.cpp
index 8a444e5d..7309c2a1 100644
--- a/1.2/ArmnnDriverImpl.cpp
+++ b/1.2/ArmnnDriverImpl.cpp
@@ -38,6 +38,12 @@ const char *g_OperandTypeTensorQuant16SymmPerformanceExecTime =
 const char *g_OperandTypeTensorQuant16SymmPerformancePowerUsage =
         "Armnn.operandTypeTensorQuant16SymmPerformance.powerUsage";

+const char *g_OperandTypeTensorQuant8SymmPerformanceExecTime =
+        "Armnn.operandTypeTensorQuant8SymmPerformance.execTime";
+const char *g_OperandTypeTensorQuant8SymmPerformancePowerUsage =
+        "Armnn.operandTypeTensorQuant8SymmPerformance.powerUsage";
+
+
 const char *g_OperandTypeTensorInt32PerformanceExecTime = "Armnn.operandTypeTensorInt32Performance.execTime";
 const char *g_OperandTypeTensorInt32PerformancePowerUsage = "Armnn.operandTypeTensorInt32Performance.powerUsage";

@@ -256,6 +262,12 @@ Return<void> ArmnnDriverImpl::getCapabilities_1_2(const armnn::IRuntimePtr& runt
                .powerUsage = ParseSystemProperty(g_OperandTypeTensorQuant8AsymmPerformancePowerUsage, defaultValue)
            });

+    update(&capabilities.operandPerformance, OperandType::TENSOR_QUANT8_SYMM,
+           {
+               .execTime = ParseSystemProperty(g_OperandTypeTensorQuant8SymmPerformanceExecTime, defaultValue),
+               .powerUsage = ParseSystemProperty(g_OperandTypeTensorQuant8SymmPerformancePowerUsage, defaultValue)
+           });
+
     update(&capabilities.operandPerformance, OperandType::TENSOR_QUANT16_SYMM,
            {
                .execTime = ParseSystemProperty(g_OperandTypeTensorQuant16SymmPerformanceExecTime, defaultValue),
diff --git a/1.2/HalPolicy.cpp b/1.2/HalPolicy.cpp
index 7e9a2233..5d6274fc 100644
--- a/1.2/HalPolicy.cpp
+++ b/1.2/HalPolicy.cpp
@@ -2010,6 +2010,8 @@ bool HalPolicy::ConvertTanH(const Operation& operation, const Model& model, Conv

 bool HalPolicy::ConvertLstm(const Operation& operation, const Model& model, ConversionData& data)
 {
+    ALOGV("hal_1_2::HalPolicy::ConvertLstm()");
+
     // Inputs:
     // 00: The input: A 2-D tensor of ANEURALNETWORKS_TENSOR_FLOAT32, of shape [batch_size, input_size], where
     //     “batch_size” corresponds to the batching dimension, and “input_size” is the size of the input.
@@ -2035,27 +2037,27 @@ bool HalPolicy::ConvertLstm(const Operation& operation, const Model& model, Conv
     // 02: The input-to-forget weights: A 2-D tensor of ANEURALNETWORKS_TENSOR_FLOAT32, of shape
     //     [num_units, input_size].
     const ConstTensorPin inputToForgetWeightsPin =
-        ConvertOperationInputToConstTensorPin<hal_1_2::HalPolicy>(operation, 2, model, data);
+        (DequantizeAndMakeConstTensorPin<hal_1_2::HalPolicy>(operation, model, data, 2));
     // 03: The input-to-cell weights: A 2-D tensor of ANEURALNETWORKS_TENSOR_FLOAT32, of shape
     //     [num_units, input_size].
     const ConstTensorPin inputToCellWeightsPin =
-        ConvertOperationInputToConstTensorPin<hal_1_2::HalPolicy>(operation, 3, model, data);
+        (DequantizeAndMakeConstTensorPin<hal_1_2::HalPolicy>(operation, model, data, 3));
     // 04: The input-to-output weights: A 2-D tensor of ANEURALNETWORKS_TENSOR_FLOAT32, of shape
     //     [num_units, input_size].
     const ConstTensorPin inputToOutputWeightsPin =
-        ConvertOperationInputToConstTensorPin<hal_1_2::HalPolicy>(operation, 4, model, data);
+        (DequantizeAndMakeConstTensorPin<hal_1_2::HalPolicy>(operation, model, data, 4));
     // 06: The recurrent-to-forget weights: A 2-D tensor of ANEURALNETWORKS_TENSOR_FLOAT32, of shape
     //     [num_units, output_size].
     const ConstTensorPin recurrentToForgetWeightsPin =
-        ConvertOperationInputToConstTensorPin<hal_1_2::HalPolicy>(operation, 6, model, data);
+        (DequantizeAndMakeConstTensorPin<hal_1_2::HalPolicy>(operation, model, data, 6));
     // 07: The recurrent-to-cell weights: A 2-D tensor of ANEURALNETWORKS_TENSOR_FLOAT32, of shape
     //     [num_units, output_size].
     const ConstTensorPin recurrentToCellWeightsPin =
-        ConvertOperationInputToConstTensorPin<hal_1_2::HalPolicy>(operation, 7, model, data);
+        (DequantizeAndMakeConstTensorPin<hal_1_2::HalPolicy>(operation, model, data, 7));
     // 08: The recurrent-to-output weights: A 2-D tensor of ANEURALNETWORKS_TENSOR_FLOAT32, of shape
     //     [num_units, output_size].
     const ConstTensorPin recurrentToOutputWeightsPin =
-        ConvertOperationInputToConstTensorPin<hal_1_2::HalPolicy>(operation, 8, model, data);
+        (DequantizeAndMakeConstTensorPin<hal_1_2::HalPolicy>(operation, model, data, 8));
     // 13: The forget gate bias: A 1-D tensor of ANEURALNETWORKS_TENSOR_FLOAT32, of shape [num_units].
     const ConstTensorPin forgetGateBiasPin =
         ConvertOperationInputToConstTensorPin<hal_1_2::HalPolicy>(operation, 13, model, data);
@@ -2083,56 +2085,21 @@ bool HalPolicy::ConvertLstm(const Operation& operation, const Model& model, Conv
     // 01: The input-to-input weights: Optional. A 2-D tensor of ANEURALNETWORKS_TENSOR_FLOAT32, of shape
     //     [num_units, input_size], where “num_units” corresponds to the number of cell units.
     const ConstTensorPin inputToInputWeightsPin =
-        ConvertOperationInputToConstTensorPin<hal_1_2::HalPolicy>(operation,
-                                                                   1,
-                                                                   model,
-                                                                   data,
-                                                                   g_DontPermute,
-                                                                   nullptr,
-                                                                   true);
-
+        (DequantizeAndMakeConstTensorPin<hal_1_2::HalPolicy>(operation, model, data, 1, true));
     // 05: The recurrent-to-input weights: Optional. A 2-D tensor of ANEURALNETWORKS_TENSOR_FLOAT32, of shape
     //     [num_units, output_size], where “output_size” corresponds to either the number of cell units (i.e.,
     //     “num_units”), or the second dimension of the “projection_weights”, if defined.
     const ConstTensorPin recurrentToInputWeightsPin =
-        ConvertOperationInputToConstTensorPin<hal_1_2::HalPolicy>(operation,
-                                                                   5,
-                                                                   model,
-                                                                   data,
-                                                                   g_DontPermute,
-                                                                   nullptr,
-                                                                   true);
-
+        (DequantizeAndMakeConstTensorPin<hal_1_2::HalPolicy>(operation, model, data, 5, true));
     // 09: The cell-to-input weights: Optional. A 1-D tensor of ANEURALNETWORKS_TENSOR_FLOAT32, of shape [num_units].
     const ConstTensorPin cellToInputWeightsPin =
-        ConvertOperationInputToConstTensorPin<hal_1_2::HalPolicy>(operation,
-                                                                   9,
-                                                                   model,
-                                                                   data,
-                                                                   g_DontPermute,
-                                                                   nullptr,
-                                                                   true);
-
+        (DequantizeAndMakeConstTensorPin<hal_1_2::HalPolicy>(operation, model, data, 9, true));
     // 10: The cell-to-forget weights: Optional. A 1-D tensor of ANEURALNETWORKS_TENSOR_FLOAT32, of shape [num_units].
     const ConstTensorPin cellToForgetWeightsPin =
-        ConvertOperationInputToConstTensorPin<hal_1_2::HalPolicy>(operation,
-                                                                   10,
-                                                                   model,
-                                                                   data,
-                                                                   g_DontPermute,
-                                                                   nullptr,
-                                                                   true);
-
+        (DequantizeAndMakeConstTensorPin<hal_1_2::HalPolicy>(operation, model, data, 10, true));
     // 11: The cell-to-output weights: Optional. A 1-D tensor of ANEURALNETWORKS_TENSOR_FLOAT32, of shape [num_units].
     const ConstTensorPin cellToOutputWeightsPin =
-        ConvertOperationInputToConstTensorPin<hal_1_2::HalPolicy>(operation,
-                                                                   11,
-                                                                   model,
-                                                                   data,
-                                                                   g_DontPermute,
-                                                                   nullptr,
-                                                                   true);
-
+        (DequantizeAndMakeConstTensorPin<hal_1_2::HalPolicy>(operation, model, data, 11, true));
     // 12: The input gate bias: Optional. A 1-D tensor of ANEURALNETWORKS_TENSOR_FLOAT32, of shape [num_units].
     const ConstTensorPin inputGateBiasPin =
         ConvertOperationInputToConstTensorPin<hal_1_2::HalPolicy>(operation,
@@ -2146,14 +2113,7 @@ bool HalPolicy::ConvertLstm(const Operation& operation, const Model& model, Conv
     // 16: The projection weights: Optional. A 2-D tensor of ANEURALNETWORKS_TENSOR_FLOAT32, of shape
     //     [output_size, num_units].
     const ConstTensorPin projectionWeightsPin =
-        ConvertOperationInputToConstTensorPin<hal_1_2::HalPolicy>(operation,
-                                                                   16,
-                                                                   model,
-                                                                   data,
-                                                                   g_DontPermute,
-                                                                   nullptr,
-                                                                   true);
-
+        (DequantizeAndMakeConstTensorPin<hal_1_2::HalPolicy>(operation, model, data, 16, true));
     // 17: The projection bias: Optional. A 1-D tensor of ANEURALNETWORKS_TENSOR_FLOAT32, of shape [output_size].
     const ConstTensorPin projectionBiasPin =
         ConvertOperationInputToConstTensorPin<hal_1_2::HalPolicy>(operation,
@@ -2196,14 +2156,8 @@ bool HalPolicy::ConvertLstm(const Operation& operation, const Model& model, Conv
     // Get the normalization tensors
     // 23: The input layer normalization weights. A 1-D tensor of shape [num_units].
     //     Used to rescale normalized inputs to activation at input gate.
-    const ConstTensorPin inputLayerNormWeightsPin =
-        ConvertOperationInputToConstTensorPin<hal_1_2::HalPolicy>(operation,
-                                                                   23,
-                                                                   model,
-                                                                   data,
-                                                                   g_DontPermute,
-                                                                   nullptr,
-                                                                   true);
+    const ConstTensorPin inputLayerNormWeightsPin
+        (DequantizeAndMakeConstTensorPin<hal_1_2::HalPolicy>(operation, model, data, 23, true));

     // 24: The forget layer normalization weights. A 1-D tensor of shape [num_units].
     //     Used to rescale normalized inputs to activation at forget gate.
@@ -2357,7 +2311,9 @@ bool HalPolicy::ConvertLstm(const Operation& operation, const Model& model, Conv
         IsDynamicTensor(cellStateOutInfo)   ||
         IsDynamicTensor(outputInfo))
     {
-        return Fail("%s: Dynamic output tensors are not supported", __func__);
+        return Fail("%s: Dynamic output tensors are not supported %d %d %d %d", __func__,
+                    IsDynamicTensor(scratchBufferInfo), IsDynamicTensor(outputStateOutInfo),
+                    IsDynamicTensor(cellStateOutInfo), IsDynamicTensor(outputInfo));
     }

     // Basic parameters
diff --git a/ConversionUtils.hpp b/ConversionUtils.hpp
index 1975434a..88c15375 100644
--- a/ConversionUtils.hpp
+++ b/ConversionUtils.hpp
@@ -187,6 +187,7 @@ inline bool IsOperandTypeSupportedForTensors(V1_2::OperandType type)
            type == V1_2::OperandType::TENSOR_FLOAT16 ||
            type == V1_2::OperandType::TENSOR_FLOAT32 ||
            type == V1_2::OperandType::TENSOR_QUANT8_ASYMM ||
+           type == V1_2::OperandType::TENSOR_QUANT8_SYMM ||
            type == V1_2::OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL ||
            type == V1_2::OperandType::TENSOR_QUANT16_SYMM ||
            type == V1_2::OperandType::TENSOR_INT32;
@@ -646,8 +647,8 @@ const HalOperand* GetOutputOperand(const HalOperation& operation,
 }

 template<typename HalPolicy,
+         typename HalOperand = typename HalPolicy::Operand,
+         typename HalModel   = typename HalPolicy::Model>
 const void* GetOperandValueReadOnlyAddress(const HalOperand& operand,
                                            const HalModel& model,
                                            const ConversionData& data,
@@ -2117,6 +2118,30 @@ bool ConvertDepthwiseConv2d(const HalOperation& operation, const HalModel& model
     return SetupAndTrackLayerOutputSlot<HalPolicy>(operation, 0, *endLayer, model, data);
 }

+template<typename HalPolicy,
+         typename HalOperation = typename HalPolicy::Operation,
+         typename HalModel     = typename HalPolicy::Model>
+bool IsOperandConstant(const HalOperation& operation,
+                       uint32_t inputIndex,
+                       const HalModel& model,
+                       bool& isConstant)
+{
+    using HalOperand         = typename HalPolicy::Operand;
+    using HalOperandLifeTime = typename HalPolicy::OperandLifeTime;
+
+    const HalOperand* operand = GetInputOperand<HalPolicy>(operation, inputIndex, model);
+    if (!operand)
+    {
+        return Fail("%s: invalid input operand at index %i", __func__, inputIndex);
+    }
+
+    isConstant = operand->lifetime == HalOperandLifeTime::CONSTANT_COPY ||
+                 operand->lifetime == HalOperandLifeTime::CONSTANT_REFERENCE ||
+                 operand->lifetime == HalOperandLifeTime::NO_VALUE;
+
+    return true;
+}
+
 template<typename HalPolicy,
          typename Operation = typename HalPolicy::Operation,
          typename Model     = typename HalPolicy::Model>
@@ -2136,6 +2161,43 @@ bool ConvertDequantize(const Operation& operation, const Model& model, Conversio
         return Fail("%s: Operation has invalid outputs", __func__);
     }

+    // If the output is going into the FC weights and input is const just return true
+    const size_t outputIndex = operation.outputs[0];
+    bool input_is_constant = false;
+    if (!IsOperandConstant<HalPolicy>(operation, 0, model, input_is_constant) && input_is_constant)
+    {
+        return Fail("Non const input not supported");
+    }
+
+    // Iterate through the nodes and find the operation feeding from the Dequantize output operand
+    for (uint32_t operationIdx = 0; operationIdx < model.operations.size(); ++operationIdx)
+    {
+        // Search for the FC op which consumes the output of Dequantize with index equal to outputIndex
+        const auto& operationIt = model.operations[operationIdx];
+        switch (operationIt.type)
+        {
+            case HalPolicy::OperationType::FULLY_CONNECTED:
+                if (outputIndex == operationIt.inputs[1]) // Weights are bound to slot 1
+                {
+                    // If the output is going into the FC weights and input is const just return true
+                    return true;
+                }
+                break;
+            case HalPolicy::OperationType::LSTM:
+                for (size_t k = 0; k < operationIt.inputs.size(); ++k)
+                {
+                    if (outputIndex == operationIt.inputs[k])
+                    {
+                        // If the output is going into the LSTM weights and input is const just return true
+                        return true;
+                    }
+                }
+                break;
+            default:
+                break;
+        }
+    }
+
     const armnn::TensorInfo& outputInfo = GetTensorInfoForOperand(*outputOperand);
     if (IsDynamicTensor(outputInfo))
     {
@@ -2269,13 +2331,125 @@ bool ConvertFloor(const Operation& operation, const Model& model, ConversionData
     return SetupAndTrackLayerOutputSlot<HalPolicy>(operation, 0, *layer, model, data);
 }

+inline bool IsQSymm8(const V1_0::Operand&)
+{
+    return false;
+}
+
+#ifdef ARMNN_ANDROID_NN_V1_2
+
+inline bool IsQSymm8(const V1_2::Operand& operand)
+{
+    return operand.type == V1_2::OperandType::TENSOR_QUANT8_SYMM;
+}
+
+#endif
+
 template<typename HalPolicy,
          typename Operation = typename HalPolicy::Operation,
          typename Model     = typename HalPolicy::Model>
-bool ConvertFullyConnected(const Operation& operation, const Model& model, ConversionData& data)
+std::tuple<std::unique_ptr<float[]>, size_t, armnn::TensorInfo>
+DequantizeIfRequired(size_t operand_index, const Operation& operation, const Model& model, const ConversionData& data)
 {
     using Operand = typename HalPolicy::Operand;

+    bool weights_constant = false;
+    if (!(IsOperandConstant<HalPolicy>(operation, operand_index, model, weights_constant) && !weights_constant))
+    {
+        return { nullptr, 0, armnn::TensorInfo() };
+    }
+
+    const size_t weightsInputIndex = operation.inputs[operand_index];
+
+    // The weights are a non const tensor, this indicates they might be the output of a dequantize op.
+    // Iterate over the nodes and find the previous operation which should be DEQUANTIZE
+    for (uint32_t operationIdx = 0; operationIdx < model.operations.size(); ++operationIdx)
+    {
+        const auto& operationIt = model.operations[operationIdx];
+        size_t outOpIndex = weightsInputIndex + 1;
+
+        // Search for the DEQUANTIZE op which has the operand with index equal to operandIndex
+        if (operationIt.type != HalPolicy::OperationType::DEQUANTIZE)
+        {
+            continue;
+        }
+
+        for (size_t i = 0; outOpIndex != weightsInputIndex && i < operation.outputs.size(); ++i)
+        {
+            outOpIndex = operationIt.outputs[i];
+            break;
+        }
+
+        if (outOpIndex != weightsInputIndex)
+        {
+            break;
+        }
+
+        const Operand* operand = GetInputOperand<HalPolicy>(operationIt, 0, model);
+        BOOST_ASSERT(operand);
+
+        armnn::TensorInfo tensorInfo = GetTensorInfoForOperand(*operand);
+        if (!IsQSymm8(*operand))
+        {
+            // Only supporting dequantize from QSYMM8 to FLOAT
+            break;
+        }
+
+        // Allocate a new buffer for the dequantized data and manually dequantize
+        const void* startValue = GetOperandValueReadOnlyAddress<HalPolicy>(*operand, model, data);
+        if (!startValue)
+        {
+            // Failed to get the operand address
+            break;
+        }
+
+        const uint8_t* quantizedBuffer = reinterpret_cast<const uint8_t*>(startValue);
+        size_t dequantizedBufferLength = operand->location.length;
+        const float quantizationScale = tensorInfo.GetQuantizationScale();
+        auto dequantizedBuffer = std::make_unique<float[]>(dequantizedBufferLength + 1);
+        for (size_t i = 0; i < dequantizedBufferLength; ++i)
+        {
+            float* dstPtr = dequantizedBuffer.get();
+            BOOST_ASSERT(dstPtr);
+            *dstPtr++ = quantizedBuffer[i] * quantizationScale;
+        }
+
+        tensorInfo.SetDataType(armnn::DataType::Float32);
+        return { std::move(dequantizedBuffer), dequantizedBufferLength * sizeof(float), std::move(tensorInfo) };
+    }
+
+    return { nullptr, 0, armnn::TensorInfo() };
+}
+
+template<typename HalPolicy,
+         typename Operation = typename HalPolicy::Operation,
+         typename Model     = typename HalPolicy::Model>
+ConstTensorPin DequantizeAndMakeConstTensorPin(const Operation& operation,
+                                               const Model& model,
+                                               const ConversionData& data,
+                                               size_t operandIndex,
+                                               bool optional = false)
+{
+    auto dequantized = DequantizeIfRequired<HalPolicy>(operandIndex, operation, model, data);
+    if (std::get<1>(dequantized) == 0 && optional)
+    {
+        // Optional tensor with no values is not really an error. Return it as invalid, but marked as optional
+        return ConstTensorPin(true);
+    }
+
+    return std::get<1>(dequantized) ?
+           ConstTensorPin(std::get<2>(dequantized), std::get<0>(dequantized).get(),
+                          std::get<1>(dequantized), g_DontPermute):
+           ConvertOperationInputToConstTensorPin<HalPolicy>(operation, operandIndex, model, data);
+}
+
+template<typename HalPolicy,
+         typename Operation = typename HalPolicy::Operation,
+         typename Model     = typename HalPolicy::Model>
+bool ConvertFullyConnected(const Operation& operation, const Model& model, ConversionData& data)
+{
+    using Operand = typename HalPolicy::Operand;
+
     LayerInputHandle input = ConvertToLayerInputHandle<HalPolicy>(operation, 0, model, data);
     if (!input.IsValid())
     {
@@ -2296,15 +2470,18 @@ bool ConvertFullyConnected(const Operation& operation, const Model& model, Conve
         return Fail("%s: Dynamic output tensors are not supported", __func__);
     }

-    // ArmNN does not currently support non-fixed weights or bias
-    ConstTensorPin weightsPin =
-        ConvertOperationInputToConstTensorPin<HalPolicy>(operation, 1, model, data); // 2D
-    ConstTensorPin biasPin =
-        ConvertOperationInputToConstTensorPin<HalPolicy>(operation, 2, model, data); // 1D
+    ConstTensorPin weightsPin = DequantizeAndMakeConstTensorPin<HalPolicy>(operation, model, data, 1);

-    if (!weightsPin.IsValid() || !biasPin.IsValid())
+    ConstTensorPin biasPin = ConvertOperationInputToConstTensorPin<HalPolicy>(operation, 2, model, data); // 1D
+
+    if (!weightsPin.IsValid())
     {
-        return Fail("%s: Operation has invalid inputs", __func__);
+        return Fail("%s: Operation has invalid weights", __func__);
+    }
+
+    if (!biasPin.IsValid())
+    {
+        return Fail("%s: Operation has invalid bias", __func__);
     }

     armnn::ConstTensor weights = weightsPin.GetConstTensor();
@@ -2314,7 +2491,9 @@ bool ConvertFullyConnected(const Operation& operation, const Model& model, Conve
     try
     {
         reshapedInfo.SetShape(FlattenFullyConnectedInput(inputInfo.GetShape(), weights.GetInfo().GetShape()));
-    } catch (const std::exception &e) {
+    }
+    catch (const std::exception& e)
+    {
         return Fail("%s: %s", __func__, e.what());
     }

diff --git a/Utils.cpp b/Utils.cpp
index 246d6415..555039ca 100644
--- a/Utils.cpp
+++ b/Utils.cpp
@@ -127,6 +127,7 @@ armnn::TensorInfo GetTensorInfoForOperand(const V1_2::Operand& operand)
             type = armnn::DataType::QuantizedSymm8PerAxis;
             break;
         case V1_2::OperandType::TENSOR_QUANT8_ASYMM:
+        case V1_2::OperandType::TENSOR_QUANT8_SYMM:
             type = armnn::DataType::QuantisedAsymm8;
             break;
        case V1_2::OperandType::TENSOR_QUANT16_SYMM:
--
cgit v1.2.1
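Editor's note on the workaround above: when FULLY_CONNECTED or LSTM weights arrive as the output of a
DEQUANTIZE operation rather than as constant operands, the patch dequantizes the underlying QSYMM8
constant by hand and wraps the resulting float buffer in a ConstTensorPin. The listing below is a
minimal, standalone C++ sketch of that per-tensor transformation only (real_value = quantized_value *
scale; TENSOR_QUANT8_SYMM is signed with no zero point). The helper name and example values are
hypothetical and this is not part of the driver source.

    // dequantize_sketch.cpp - illustrative only, not driver code.
    #include <cstdint>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Dequantize a per-tensor QSYMM8 buffer: each signed 8-bit value is scaled by
    // a single quantization scale to recover the approximate float value.
    std::vector<float> DequantizeQSymm8(const int8_t* quantized, size_t count, float scale)
    {
        std::vector<float> dequantized(count);
        for (size_t i = 0; i < count; ++i)
        {
            dequantized[i] = static_cast<float>(quantized[i]) * scale;
        }
        return dequantized;
    }

    int main()
    {
        // Example weight values as they might appear in a constant QSYMM8 operand.
        const int8_t weights[] = { -128, -64, 0, 64, 127 };
        const float scale = 0.05f;

        for (float v : DequantizeQSymm8(weights, 5, scale))
        {
            std::cout << v << " ";   // prints: -6.4 -3.2 0 3.2 6.35
        }
        std::cout << std::endl;
        return 0;
    }

The sketch writes one output element per input element into a correctly sized buffer; in the driver,
the dequantized data is additionally retagged as Float32 via TensorInfo::SetDataType before being bound
as constant weights.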