From 1299496996bc332f02218f926640a9255ed60310 Mon Sep 17 00:00:00 2001 From: Mike Kelly Date: Thu, 21 Apr 2022 11:57:09 +0100 Subject: IVGCVSW-6806 Add Unidirectional Sequence Lstm support to Neon * Corrected TensorInfo order for IsUnidirectionalSequenceLstmSupported * outputStateOut TensorInfo is not optional. * cellStateOut TensorInfo is not optional. * TensorInfo Order matches other QLSTM/LSTM layers. * Added missing parameters to UnidirectionalSequenceLstmOperator for delegate. * Added quantized UnidirectionalSequenceLstm support to Neon !android-nn-driver:7457 Signed-off-by: Mike Kelly Change-Id: I26dde1bb96793dd25eb9081ca5ae5f63752288c4 --- delegate/src/UnidirectionalSequenceLstm.hpp | 49 +- include/armnn/BackendHelper.hpp | 4 +- include/armnn/backends/ILayerSupport.hpp | 4 +- src/armnn/BackendHelper.cpp | 8 +- src/armnn/ILayerSupport.cpp | 68 +- .../layers/UnidirectionalSequenceLstmLayer.cpp | 4 +- src/armnnDeserializer/Deserializer.cpp | 12 +- .../test/LstmSerializationTests.cpp | 40 +- src/backends/backendsCommon/LayerSupportBase.cpp | 4 +- src/backends/backendsCommon/LayerSupportBase.hpp | 4 +- src/backends/backendsCommon/WorkloadData.cpp | 26 +- src/backends/backendsCommon/WorkloadFactory.cpp | 11 +- .../UnidirectionalSequenceLstmTestImpl.cpp | 175 +++- src/backends/cl/ClLayerSupport.cpp | 8 +- src/backends/cl/ClLayerSupport.hpp | 4 +- .../ClUnidirectionalSequenceLstmFloatWorkload.cpp | 6 +- src/backends/neon/NeonLayerSupport.cpp | 46 +- src/backends/neon/NeonLayerSupport.hpp | 4 +- src/backends/neon/NeonWorkloadFactory.cpp | 15 +- src/backends/neon/backend.mk | 3 +- src/backends/neon/workloads/CMakeLists.txt | 2 + ...NeonUnidirectionalSequenceLstmFloatWorkload.cpp | 40 +- ...NeonUnidirectionalSequenceLstmFloatWorkload.hpp | 5 +- .../NeonUnidirectionalSequenceLstmWorkload.cpp | 879 +++++++++++++++++++++ .../NeonUnidirectionalSequenceLstmWorkload.hpp | 90 +++ src/backends/neon/workloads/NeonWorkloads.hpp | 1 + src/backends/reference/RefLayerSupport.cpp | 112 +-- src/backends/reference/RefLayerSupport.hpp | 4 +- .../RefUnidirectionalSequenceLstmWorkload.cpp | 8 +- 29 files changed, 1373 insertions(+), 263 deletions(-) create mode 100644 src/backends/neon/workloads/NeonUnidirectionalSequenceLstmWorkload.cpp create mode 100644 src/backends/neon/workloads/NeonUnidirectionalSequenceLstmWorkload.hpp diff --git a/delegate/src/UnidirectionalSequenceLstm.hpp b/delegate/src/UnidirectionalSequenceLstm.hpp index a923874a74..bcf01cf2a9 100644 --- a/delegate/src/UnidirectionalSequenceLstm.hpp +++ b/delegate/src/UnidirectionalSequenceLstm.hpp @@ -151,6 +151,36 @@ TfLiteStatus VisitUnidirectionalSequenceLstmOperator(DelegateData& delegateData, || params.m_OutputLayerNormWeights != nullptr); desc.m_TimeMajor = nodeParams->time_major; + if (tfLiteNode->intermediates->size > 3 && desc.m_LayerNormEnabled) + { + auto inputIntermediateTensorInfo = GetTensorInfoForTfLiteTensor( + tfLiteTensors[tfLiteNode->intermediates->data[0]]); + auto forgetIntermediateTensorInfo = GetTensorInfoForTfLiteTensor( + tfLiteTensors[tfLiteNode->intermediates->data[1]]); + auto cellIntermediateTensorInfo = GetTensorInfoForTfLiteTensor( + tfLiteTensors[tfLiteNode->intermediates->data[2]]); + auto outputIntermediateTensorInfo = GetTensorInfoForTfLiteTensor( + tfLiteTensors[tfLiteNode->intermediates->data[3]]); + + desc.m_InputIntermediateScale = inputIntermediateTensorInfo.GetQuantizationScale(); + desc.m_ForgetIntermediateScale = forgetIntermediateTensorInfo.GetQuantizationScale(); + desc.m_CellIntermediateScale = 
cellIntermediateTensorInfo.GetQuantizationScale(); + desc.m_OutputIntermediateScale = outputIntermediateTensorInfo.GetQuantizationScale(); + } + else + { + float defaultIntermediate = std::pow(2, -12); + desc.m_InputIntermediateScale = defaultIntermediate; + desc.m_ForgetIntermediateScale = defaultIntermediate; + desc.m_CellIntermediateScale = defaultIntermediate; + desc.m_OutputIntermediateScale = defaultIntermediate; + } + if (tfLiteNode->intermediates->size > 4) + { + auto hiddentensorInfo = GetTensorInfoForTfLiteTensor(tfLiteTensors[tfLiteNode->intermediates->data[4]]); + desc.m_HiddenStateScale = hiddentensorInfo.GetQuantizationScale(); + desc.m_HiddenStateZeroPoint = hiddentensorInfo.GetQuantizationOffset(); + } const armnn::TensorInfo& inputTensorInfo = GetTensorInfoForTfLiteTensor(tfLiteInputTensor); const armnn::TensorInfo& outputTensorInfo = GetTensorInfoForTfLiteTensor(tfLiteOutputTensor); @@ -167,7 +197,11 @@ TfLiteStatus VisitUnidirectionalSequenceLstmOperator(DelegateData& delegateData, { scratchBufferTensorInfo = armnn::TensorInfo({batchSize, numUnits * 4}, dataType, qScale, qOffset); } - armnn::TensorInfo cellStateOutTensorInfo({batchSize, numUnits}, dataType, qScale, qOffset); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, numUnits}, + cellStateInInfo.GetDataType(), + cellStateInInfo.GetQuantizationScale(), + cellStateInInfo.GetQuantizationOffset()); + armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, dataType, qScale, qOffset); armnn::LstmInputParamsInfo paramsInfo; @@ -218,9 +252,6 @@ TfLiteStatus VisitUnidirectionalSequenceLstmOperator(DelegateData& delegateData, paramsInfo.m_OutputLayerNormWeights = &(params.m_OutputLayerNormWeights->GetInfo()); } - // hiddenStateOutput and cellStateOutput do not present in TfLite UnidirectionalSequenceLstm - armnn::Optional optionalTensor; - bool isSupported = false; auto validateFunc = [&](const armnn::TensorInfo& outputInfo, bool& isSupported) { @@ -232,9 +263,9 @@ TfLiteStatus VisitUnidirectionalSequenceLstmOperator(DelegateData& delegateData, inputTensorInfo, outputStateInInfo, cellStateInInfo, + outputStateOutTensorInfo, + cellStateOutTensorInfo, outputInfo, - optionalTensor, - optionalTensor, desc, paramsInfo); }; @@ -248,7 +279,9 @@ TfLiteStatus VisitUnidirectionalSequenceLstmOperator(DelegateData& delegateData, armnn::IConnectableLayer* layer = delegateData.m_Network->AddUnidirectionalSequenceLstmLayer(desc, params); ARMNN_ASSERT(layer != nullptr); - layer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); + layer->GetOutputSlot(0).SetTensorInfo(outputStateOutTensorInfo); + layer->GetOutputSlot(1).SetTensorInfo(cellStateOutTensorInfo); + layer->GetOutputSlot(2).SetTensorInfo(outputTensorInfo); // Connect the inputs // input_layer @@ -258,7 +291,7 @@ TfLiteStatus VisitUnidirectionalSequenceLstmOperator(DelegateData& delegateData, //outputStateIn delegateData.m_OutputSlotForNode[tfLiteNode->inputs->data[19]]->Connect(layer->GetInputSlot(2)); - armnn::IOutputSlot& outputSlot = layer->GetOutputSlot(0); + armnn::IOutputSlot& outputSlot = layer->GetOutputSlot(2); delegateData.m_OutputSlotForNode[static_cast(tfLiteNode->outputs->data[0])] = &outputSlot; return kTfLiteOk; } diff --git a/include/armnn/BackendHelper.hpp b/include/armnn/BackendHelper.hpp index 4772ca97cd..8bcdbc4d3c 100644 --- a/include/armnn/BackendHelper.hpp +++ b/include/armnn/BackendHelper.hpp @@ -415,9 +415,9 @@ public: const TensorInfo& input, const TensorInfo& outputStateIn, const TensorInfo& cellStateIn, + const TensorInfo& 
outputStateOut, + const TensorInfo& cellStateOut, const TensorInfo& output, - const Optional& hiddenStateOutput, - const Optional& cellStateOutput, const LstmDescriptor& descriptor, const LstmInputParamsInfo& paramsInfo, Optional reasonIfUnsupported = EmptyOptional()); diff --git a/include/armnn/backends/ILayerSupport.hpp b/include/armnn/backends/ILayerSupport.hpp index bfdede686f..a31e398a67 100644 --- a/include/armnn/backends/ILayerSupport.hpp +++ b/include/armnn/backends/ILayerSupport.hpp @@ -560,9 +560,9 @@ public: const TensorInfo& input, const TensorInfo& outputStateIn, const TensorInfo& cellStateIn, + const TensorInfo& outputStateOut, + const TensorInfo& cellStateOut, const TensorInfo& output, - const Optional& hiddenStateOutput, - const Optional& cellStateOutput, const LstmDescriptor& descriptor, const LstmInputParamsInfo& paramsInfo, Optional reasonIfUnsupported = EmptyOptional()) const; diff --git a/src/armnn/BackendHelper.cpp b/src/armnn/BackendHelper.cpp index 056fbb08fa..e2aa67275f 100644 --- a/src/armnn/BackendHelper.cpp +++ b/src/armnn/BackendHelper.cpp @@ -1332,16 +1332,14 @@ bool LayerSupportHandle::IsTransposeSupported(const TensorInfo& input, bool LayerSupportHandle::IsUnidirectionalSequenceLstmSupported(const TensorInfo& input, const TensorInfo& outputStateIn, const TensorInfo& cellStateIn, + const TensorInfo& outputStateOut, + const TensorInfo& cellStateOut, const TensorInfo& output, - const Optional& hiddenStateOutput, - const Optional& cellStateOutput, const LstmDescriptor& descriptor, const LstmInputParamsInfo& paramsInfo, Optional reasonIfUnsupported) { - TensorInfo hiddenStateOutputVal = hiddenStateOutput.has_value() ? hiddenStateOutput.value() : TensorInfo(); - TensorInfo cellStateOutputVal = cellStateOutput.has_value() ? 
cellStateOutput.value() : TensorInfo(); - TensorInfos infos{input, outputStateIn, cellStateIn, hiddenStateOutputVal, cellStateOutputVal, output}; + TensorInfos infos{input, outputStateIn, cellStateIn, outputStateOut, cellStateOut, output}; return m_LayerSupport->IsLayerSupported(LayerType::UnidirectionalSequenceLstm, infos, diff --git a/src/armnn/ILayerSupport.cpp b/src/armnn/ILayerSupport.cpp index bf54223414..5366b13088 100644 --- a/src/armnn/ILayerSupport.cpp +++ b/src/armnn/ILayerSupport.cpp @@ -488,57 +488,15 @@ bool ILayerSupport::IsLayerSupported(const LayerType& type, "hiddenStateOutputVal, cellStateOutputVal, output}"); } auto desc = *(PolymorphicDowncast(&descriptor)); - - bool isHiddenStateOutputOptional = (infos[4] == TensorInfo()); - bool isCellStateOutput = (infos[5] == TensorInfo()); - if (isHiddenStateOutputOptional && isCellStateOutput) - { - return IsUnidirectionalSequenceLstmSupported(infos[0], - infos[1], - infos[2], - infos[3], - EmptyOptional(), - EmptyOptional(), - desc, - lstmParamsInfo.value(), - reasonIfUnsupported); - } - else if (isHiddenStateOutputOptional) - { - return IsUnidirectionalSequenceLstmSupported(infos[0], - infos[1], - infos[2], - infos[3], - EmptyOptional(), - infos[5], - desc, - lstmParamsInfo.value(), - reasonIfUnsupported); - } - else if (isCellStateOutput) - { - return IsUnidirectionalSequenceLstmSupported(infos[0], - infos[1], - infos[2], - infos[3], - infos[4], - EmptyOptional(), - desc, - lstmParamsInfo.value(), - reasonIfUnsupported); - } - else - { - return IsUnidirectionalSequenceLstmSupported(infos[0], - infos[1], - infos[2], - infos[3], - infos[4], - infos[5], - desc, - lstmParamsInfo.value(), - reasonIfUnsupported); - } + return IsUnidirectionalSequenceLstmSupported(infos[0], + infos[1], + infos[2], + infos[3], + infos[4], + infos[5], + desc, + lstmParamsInfo.value(), + reasonIfUnsupported); } case LayerType::ChannelShuffle: return IsChannelShuffleSupported(infos[0], @@ -1285,9 +1243,9 @@ bool ILayerSupport::IsUnidirectionalSequenceLstmSupported( const TensorInfo& input, const TensorInfo& outputStateIn, const TensorInfo& cellStateIn, + const TensorInfo& outputStateOut, + const TensorInfo& cellStateOut, const TensorInfo& output, - const Optional& hiddenStateOutput, - const Optional& cellStateOutput, const LstmDescriptor& descriptor, const LstmInputParamsInfo& paramsInfo, Optional reasonIfUnsupported) const @@ -1295,9 +1253,9 @@ bool ILayerSupport::IsUnidirectionalSequenceLstmSupported( IgnoreUnused(input, outputStateIn, cellStateIn, + outputStateOut, + cellStateOut, output, - hiddenStateOutput, - cellStateOutput, descriptor, paramsInfo, reasonIfUnsupported); diff --git a/src/armnn/layers/UnidirectionalSequenceLstmLayer.cpp b/src/armnn/layers/UnidirectionalSequenceLstmLayer.cpp index 199961449e..e5f89bd017 100644 --- a/src/armnn/layers/UnidirectionalSequenceLstmLayer.cpp +++ b/src/armnn/layers/UnidirectionalSequenceLstmLayer.cpp @@ -15,7 +15,7 @@ namespace armnn { UnidirectionalSequenceLstmLayer::UnidirectionalSequenceLstmLayer(const LstmDescriptor& param, const char* name) - : LayerWithParameters(3, 1, LayerType::UnidirectionalSequenceLstm, param, name) + : LayerWithParameters(3, 3, LayerType::UnidirectionalSequenceLstm, param, name) { } @@ -171,7 +171,7 @@ void UnidirectionalSequenceLstmLayer::ValidateTensorShapesFromInputs() { VerifyLayerConnections(3, CHECK_LOCATION()); - const TensorShape& outputShape = GetOutputSlot(0).GetTensorInfo().GetShape(); + const TensorShape& outputShape = GetOutputSlot(2).GetTensorInfo().GetShape(); 
VerifyShapeInferenceType(outputShape, m_ShapeInferenceMethod); diff --git a/src/armnnDeserializer/Deserializer.cpp b/src/armnnDeserializer/Deserializer.cpp index 75c60cc906..93fa99dcc3 100644 --- a/src/armnnDeserializer/Deserializer.cpp +++ b/src/armnnDeserializer/Deserializer.cpp @@ -3616,7 +3616,7 @@ void IDeserializer::DeserializerImpl::ParseUnidirectionalSequenceLstm(GraphPtr g CHECK_VALID_SIZE(inputs.size(), 3); auto outputs = GetOutputs(graph, layerIndex); - CHECK_VALID_SIZE(outputs.size(), 1); + CHECK_VALID_SIZE(outputs.size(), 3); auto flatBufferLayer = graph->layers()->Get(layerIndex)->layer_as_UnidirectionalSequenceLstmLayer(); auto layerName = GetLayerName(graph, layerIndex); @@ -3714,8 +3714,14 @@ void IDeserializer::DeserializerImpl::ParseUnidirectionalSequenceLstm(GraphPtr g lstmInputParams, layerName.c_str()); - armnn::TensorInfo outputTensorInfo1 = ToTensorInfo(outputs[0]); - layer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo1); + armnn::TensorInfo outputTensorInfo0 = ToTensorInfo(outputs[0]); + layer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo0); + + armnn::TensorInfo outputTensorInfo1 = ToTensorInfo(outputs[1]); + layer->GetOutputSlot(1).SetTensorInfo(outputTensorInfo1); + + armnn::TensorInfo outputTensorInfo2 = ToTensorInfo(outputs[2]); + layer->GetOutputSlot(2).SetTensorInfo(outputTensorInfo2); RegisterInputSlots(graph, layerIndex, layer); RegisterOutputSlots(graph, layerIndex, layer); diff --git a/src/armnnSerializer/test/LstmSerializationTests.cpp b/src/armnnSerializer/test/LstmSerializationTests.cpp index d8f8967bcd..ae2d813fc0 100644 --- a/src/armnnSerializer/test/LstmSerializationTests.cpp +++ b/src/armnnSerializer/test/LstmSerializationTests.cpp @@ -2299,6 +2299,8 @@ TEST_CASE("SerializeDeserializeUnidirectionalSequenceLstmCifgPeepholeNoProjectio armnn::TensorInfo inputTensorInfo({ batchSize, timeSize, inputSize }, armnn::DataType::Float32); armnn::TensorInfo cellStateTensorInfo({ batchSize, numUnits}, armnn::DataType::Float32); armnn::TensorInfo outputStateTensorInfo({ batchSize, outputSize }, armnn::DataType::Float32); + armnn::TensorInfo outputStateOutTensorInfo({ batchSize, timeSize, outputSize }, armnn::DataType::Float32); + armnn::TensorInfo cellStateOutTensorInfo({ batchSize, outputSize }, armnn::DataType::Float32); armnn::TensorInfo outputTensorInfo({ batchSize, timeSize, outputSize }, armnn::DataType::Float32); inputLayer->GetOutputSlot(0).Connect(unidirectionalSequenceLstmLayer->GetInputSlot(0)); @@ -2310,8 +2312,10 @@ TEST_CASE("SerializeDeserializeUnidirectionalSequenceLstmCifgPeepholeNoProjectio cellStateIn->GetOutputSlot(0).Connect(unidirectionalSequenceLstmLayer->GetInputSlot(2)); cellStateIn->GetOutputSlot(0).SetTensorInfo(cellStateTensorInfo); - unidirectionalSequenceLstmLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); - unidirectionalSequenceLstmLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); + unidirectionalSequenceLstmLayer->GetOutputSlot(0).SetTensorInfo(outputStateOutTensorInfo); + unidirectionalSequenceLstmLayer->GetOutputSlot(1).SetTensorInfo(cellStateOutTensorInfo); + unidirectionalSequenceLstmLayer->GetOutputSlot(2).Connect(outputLayer->GetInputSlot(0)); + unidirectionalSequenceLstmLayer->GetOutputSlot(2).SetTensorInfo(outputTensorInfo); armnn::INetworkPtr deserializedNetwork = DeserializeNetwork(SerializeNetwork(*network)); CHECK(deserializedNetwork); @@ -2319,7 +2323,7 @@ TEST_CASE("SerializeDeserializeUnidirectionalSequenceLstmCifgPeepholeNoProjectio VerifyLstmLayer checker( layerName, {inputTensorInfo, 
outputStateTensorInfo, cellStateTensorInfo}, - {outputTensorInfo}, + {outputStateOutTensorInfo, cellStateOutTensorInfo, outputTensorInfo}, descriptor, params); deserializedNetwork->ExecuteStrategy(checker); @@ -2436,6 +2440,8 @@ TEST_CASE("SerializeDeserializeUnidirectionalSequenceLstmNoCifgWithPeepholeAndPr armnn::TensorInfo inputTensorInfo({ batchSize, timeSize, inputSize }, armnn::DataType::Float32); armnn::TensorInfo cellStateTensorInfo({ batchSize, numUnits}, armnn::DataType::Float32); armnn::TensorInfo outputStateTensorInfo({ batchSize, outputSize }, armnn::DataType::Float32); + armnn::TensorInfo outputStateOutTensorInfo({ batchSize, timeSize, outputSize }, armnn::DataType::Float32); + armnn::TensorInfo cellStateOutTensorInfo({ batchSize, outputSize }, armnn::DataType::Float32); armnn::TensorInfo outputTensorInfo({ batchSize, timeSize, outputSize }, armnn::DataType::Float32); inputLayer->GetOutputSlot(0).Connect(unidirectionalSequenceLstmLayer->GetInputSlot(0)); @@ -2447,8 +2453,10 @@ TEST_CASE("SerializeDeserializeUnidirectionalSequenceLstmNoCifgWithPeepholeAndPr cellStateIn->GetOutputSlot(0).Connect(unidirectionalSequenceLstmLayer->GetInputSlot(2)); cellStateIn->GetOutputSlot(0).SetTensorInfo(cellStateTensorInfo); - unidirectionalSequenceLstmLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); - unidirectionalSequenceLstmLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); + unidirectionalSequenceLstmLayer->GetOutputSlot(0).SetTensorInfo(outputStateOutTensorInfo); + unidirectionalSequenceLstmLayer->GetOutputSlot(1).SetTensorInfo(cellStateOutTensorInfo); + unidirectionalSequenceLstmLayer->GetOutputSlot(2).Connect(outputLayer->GetInputSlot(0)); + unidirectionalSequenceLstmLayer->GetOutputSlot(2).SetTensorInfo(outputTensorInfo); armnn::INetworkPtr deserializedNetwork = DeserializeNetwork(SerializeNetwork(*network)); CHECK(deserializedNetwork); @@ -2456,7 +2464,7 @@ TEST_CASE("SerializeDeserializeUnidirectionalSequenceLstmNoCifgWithPeepholeAndPr VerifyLstmLayer checker( layerName, {inputTensorInfo, outputStateTensorInfo, cellStateTensorInfo}, - {outputTensorInfo}, + {outputStateOutTensorInfo, cellStateOutTensorInfo, outputTensorInfo}, descriptor, params); deserializedNetwork->ExecuteStrategy(checker); @@ -2592,6 +2600,8 @@ TEST_CASE("SerializeDeserializeUnidirectionalSequenceLstmNoCifgWithPeepholeWithP armnn::TensorInfo inputTensorInfo({ batchSize, timeSize, inputSize }, armnn::DataType::Float32); armnn::TensorInfo cellStateTensorInfo({ batchSize, numUnits}, armnn::DataType::Float32); armnn::TensorInfo outputStateTensorInfo({ batchSize, outputSize }, armnn::DataType::Float32); + armnn::TensorInfo outputStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); armnn::TensorInfo outputTensorInfo({ batchSize, timeSize, outputSize }, armnn::DataType::Float32); inputLayer->GetOutputSlot(0).Connect(unidirectionalSequenceLstmLayer->GetInputSlot(0)); @@ -2603,8 +2613,10 @@ TEST_CASE("SerializeDeserializeUnidirectionalSequenceLstmNoCifgWithPeepholeWithP cellStateIn->GetOutputSlot(0).Connect(unidirectionalSequenceLstmLayer->GetInputSlot(2)); cellStateIn->GetOutputSlot(0).SetTensorInfo(cellStateTensorInfo); - unidirectionalSequenceLstmLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); - unidirectionalSequenceLstmLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); + 
unidirectionalSequenceLstmLayer->GetOutputSlot(0).SetTensorInfo(outputStateOutTensorInfo); + unidirectionalSequenceLstmLayer->GetOutputSlot(1).SetTensorInfo(cellStateOutTensorInfo); + unidirectionalSequenceLstmLayer->GetOutputSlot(2).Connect(outputLayer->GetInputSlot(0)); + unidirectionalSequenceLstmLayer->GetOutputSlot(2).SetTensorInfo(outputTensorInfo); armnn::INetworkPtr deserializedNetwork = DeserializeNetwork(SerializeNetwork(*network)); CHECK(deserializedNetwork); @@ -2612,7 +2624,7 @@ TEST_CASE("SerializeDeserializeUnidirectionalSequenceLstmNoCifgWithPeepholeWithP VerifyLstmLayer checker( layerName, {inputTensorInfo, outputStateTensorInfo, cellStateTensorInfo}, - {outputTensorInfo}, + {outputStateOutTensorInfo, cellStateOutTensorInfo, outputTensorInfo}, descriptor, params); deserializedNetwork->ExecuteStrategy(checker); @@ -2697,6 +2709,8 @@ TEST_CASE("SerializeDeserializeUnidirectionalSequenceLstmCifgPeepholeNoProjectio armnn::TensorInfo inputTensorInfo({ timeSize, batchSize, inputSize }, armnn::DataType::Float32); armnn::TensorInfo cellStateTensorInfo({ batchSize, numUnits}, armnn::DataType::Float32); armnn::TensorInfo outputStateTensorInfo({ batchSize, outputSize }, armnn::DataType::Float32); + armnn::TensorInfo outputStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); armnn::TensorInfo outputTensorInfo({ timeSize, batchSize, outputSize }, armnn::DataType::Float32); inputLayer->GetOutputSlot(0).Connect(unidirectionalSequenceLstmLayer->GetInputSlot(0)); @@ -2708,8 +2722,10 @@ TEST_CASE("SerializeDeserializeUnidirectionalSequenceLstmCifgPeepholeNoProjectio cellStateIn->GetOutputSlot(0).Connect(unidirectionalSequenceLstmLayer->GetInputSlot(2)); cellStateIn->GetOutputSlot(0).SetTensorInfo(cellStateTensorInfo); - unidirectionalSequenceLstmLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); - unidirectionalSequenceLstmLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); + unidirectionalSequenceLstmLayer->GetOutputSlot(0).SetTensorInfo(outputStateOutTensorInfo); + unidirectionalSequenceLstmLayer->GetOutputSlot(1).SetTensorInfo(cellStateOutTensorInfo); + unidirectionalSequenceLstmLayer->GetOutputSlot(2).Connect(outputLayer->GetInputSlot(0)); + unidirectionalSequenceLstmLayer->GetOutputSlot(2).SetTensorInfo(outputTensorInfo); armnn::INetworkPtr deserializedNetwork = DeserializeNetwork(SerializeNetwork(*network)); CHECK(deserializedNetwork); @@ -2717,7 +2733,7 @@ TEST_CASE("SerializeDeserializeUnidirectionalSequenceLstmCifgPeepholeNoProjectio VerifyLstmLayer checker( layerName, {inputTensorInfo, outputStateTensorInfo, cellStateTensorInfo}, - {outputTensorInfo}, + {outputStateOutTensorInfo, cellStateOutTensorInfo, outputTensorInfo}, descriptor, params); deserializedNetwork->ExecuteStrategy(checker); diff --git a/src/backends/backendsCommon/LayerSupportBase.cpp b/src/backends/backendsCommon/LayerSupportBase.cpp index 89a0772602..001037908d 100644 --- a/src/backends/backendsCommon/LayerSupportBase.cpp +++ b/src/backends/backendsCommon/LayerSupportBase.cpp @@ -680,9 +680,9 @@ bool LayerSupportBase::IsTransposeSupported(const TensorInfo&, // input bool LayerSupportBase::IsUnidirectionalSequenceLstmSupported(const TensorInfo&, // input const TensorInfo&, // outputStateIn const TensorInfo&, // cellStateIn + const TensorInfo&, // outputStateOut + const TensorInfo&, // cellStateOut const TensorInfo&, // output - const Optional&, // hiddenStateOut - 
const Optional&, // cellStateOut const LstmDescriptor&, // descriptor const LstmInputParamsInfo&, // paramsInfo Optional reasonIfUnsupported) const diff --git a/src/backends/backendsCommon/LayerSupportBase.hpp b/src/backends/backendsCommon/LayerSupportBase.hpp index e911c00f95..618d21e5be 100644 --- a/src/backends/backendsCommon/LayerSupportBase.hpp +++ b/src/backends/backendsCommon/LayerSupportBase.hpp @@ -465,9 +465,9 @@ public: const TensorInfo& input, const TensorInfo& outputStateIn, const TensorInfo& cellStateIn, + const TensorInfo& outputStateOut, + const TensorInfo& cellStateOut, const TensorInfo& output, - const Optional& hiddenStateOutput, - const Optional& cellStateOutput, const LstmDescriptor& descriptor, const LstmInputParamsInfo& paramsInfo, Optional reasonIfUnsupported = EmptyOptional()) const override; diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp index 70d7641f41..a2dcd63726 100644 --- a/src/backends/backendsCommon/WorkloadData.cpp +++ b/src/backends/backendsCommon/WorkloadData.cpp @@ -3860,38 +3860,20 @@ void UnidirectionalSequenceLstmQueueDescriptor::Validate(const WorkloadInfo& wor { throw InvalidArgumentException(descriptorName + ": Invalid number of inputs."); } - if (workloadInfo.m_OutputTensorInfos.size() != 1) + if (workloadInfo.m_OutputTensorInfos.size() != 3) { throw InvalidArgumentException(descriptorName + ": Invalid number of outputs."); } std::vector supportedTypes = { - DataType::Float32 + DataType::Float32, + DataType::QAsymmS8 }; // check for supported type of one input and match them with all the other input and output ValidateDataTypes(workloadInfo.m_InputTensorInfos[0], supportedTypes, descriptorName); - // type matches all other inputs - for (uint32_t i = 1u; i < workloadInfo.m_InputTensorInfos.size(); ++i) - { - ValidateTensorDataTypesMatch(workloadInfo.m_InputTensorInfos[0], - workloadInfo.m_InputTensorInfos[i], - descriptorName, - "input_0", - "input_" + std::to_string(i)); - } - // type matches all other outputs - for (uint32_t i = 0u; i < workloadInfo.m_OutputTensorInfos.size(); ++i) - { - ValidateTensorDataTypesMatch(workloadInfo.m_InputTensorInfos[0], - workloadInfo.m_OutputTensorInfos[i], - "LstmQueueDescriptor", - "input_0", - "output_" + std::to_string(i)); - } - // Making sure clipping parameters have valid values. 
// == 0 means no clipping // > 0 means clipping @@ -3936,7 +3918,7 @@ void UnidirectionalSequenceLstmQueueDescriptor::Validate(const WorkloadInfo& wor descriptorName + " input_2"); // outputTensor - ValidateTensorNumDimNumElem(workloadInfo.m_OutputTensorInfos[0], 3, (timeStep * n_batch * n_output), + ValidateTensorNumDimNumElem(workloadInfo.m_OutputTensorInfos[2], 3, (timeStep * n_batch * n_output), descriptorName + " output_0"); // check that dimensions of inputs/outputs and QueueDescriptor data match with each other diff --git a/src/backends/backendsCommon/WorkloadFactory.cpp b/src/backends/backendsCommon/WorkloadFactory.cpp index f955aec30f..5847e8cc21 100644 --- a/src/backends/backendsCommon/WorkloadFactory.cpp +++ b/src/backends/backendsCommon/WorkloadFactory.cpp @@ -1367,7 +1367,9 @@ bool IWorkloadFactory::IsLayerConfigurationSupported(const BackendId& backendId, const TensorInfo& cellStateIn = OverrideDataType(layer.GetInputSlot(2).GetConnection()->GetTensorInfo(), dataType); // Outputs - const TensorInfo& output = OverrideDataType(layer.GetOutputSlot(0).GetTensorInfo(), dataType); + const TensorInfo& outputStateOut = OverrideDataType(layer.GetOutputSlot(0).GetTensorInfo(), dataType); + const TensorInfo& cellStateOut = OverrideDataType(layer.GetOutputSlot(1).GetTensorInfo(), dataType); + const TensorInfo& output = OverrideDataType(layer.GetOutputSlot(2).GetTensorInfo(), dataType); // Basic parameters const TensorInfo& inputToForgetWeights @@ -1481,15 +1483,12 @@ bool IWorkloadFactory::IsLayerConfigurationSupported(const BackendId& backendId, paramsInfo.m_OutputLayerNormWeights = &optOutputLayerNormWeights; } - Optional hiddenStateOut; - Optional cellStateOut; - result = layerSupportObject.IsUnidirectionalSequenceLstmSupported(input, outputStateIn, cellStateIn, - output, - hiddenStateOut, + outputStateOut, cellStateOut, + output, descriptor, paramsInfo, reason); diff --git a/src/backends/backendsCommon/test/layerTests/UnidirectionalSequenceLstmTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/UnidirectionalSequenceLstmTestImpl.cpp index c719472711..6effa9c85d 100644 --- a/src/backends/backendsCommon/test/layerTests/UnidirectionalSequenceLstmTestImpl.cpp +++ b/src/backends/backendsCommon/test/layerTests/UnidirectionalSequenceLstmTestImpl.cpp @@ -31,7 +31,7 @@ UnidirectionalSequenceLstmTimeMajorSingleBatchTestImpl( armnn::DataType constantDataType = armnn::DataType::Float32) { IgnoreUnused(memoryManager); - unsigned int batchSize = armnn::numeric_cast(inputShape[1]); + unsigned int batchSize = armnn::numeric_cast(inputShape[0]); unsigned int inputSize = armnn::numeric_cast(inputShape[2]); unsigned int outputSize = armnn::numeric_cast(outputExpectedShape[2]); unsigned numUnits = outputSize; @@ -39,7 +39,8 @@ UnidirectionalSequenceLstmTimeMajorSingleBatchTestImpl( armnn::TensorInfo inputTensorInfo({1, batchSize , inputSize}, ArmnnType, qScale, qOffset ); armnn::TensorInfo cellStateInTensorInfo({batchSize , numUnits}, ArmnnType, qScale, qOffset); armnn::TensorInfo outputStateInTensorInfo({batchSize , outputSize}, ArmnnType, qScale, qOffset); - + armnn::TensorInfo outputStateOutTensorInfo({ batchSize, 1, outputSize }, ArmnnType, qScale, qOffset); + armnn::TensorInfo cellStateOutTensorInfo({ batchSize, 1, outputSize }, ArmnnType, qScale, qOffset); armnn::TensorInfo outputTensorInfo({1, batchSize, outputSize}, ArmnnType, qScale, qOffset); std::vector inputVector; @@ -48,6 +49,8 @@ UnidirectionalSequenceLstmTimeMajorSingleBatchTestImpl( std::vector cellStateInVector(batchSize * 
numUnits, T()); std::vector outputStateInVector(batchSize * outputSize, T()); + std::vector actualOutputStateOut(outputStateOutTensorInfo.GetNumElements()); + std::vector actualCellStateOut(cellStateOutTensorInfo.GetNumElements()); std::vector actualOutput(outputTensorInfo.GetNumElements()); std::vector outputVector; @@ -59,6 +62,10 @@ UnidirectionalSequenceLstmTimeMajorSingleBatchTestImpl( std::unique_ptr outputStateInHandle = tensorHandleFactory.CreateTensorHandle(outputStateInTensorInfo); + std::unique_ptr outputStateOutHandle = + tensorHandleFactory.CreateTensorHandle(outputStateOutTensorInfo); + std::unique_ptr cellStateOutHandle = + tensorHandleFactory.CreateTensorHandle(cellStateOutTensorInfo); std::unique_ptr outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo); armnn::UnidirectionalSequenceLstmQueueDescriptor data; @@ -68,6 +75,8 @@ UnidirectionalSequenceLstmTimeMajorSingleBatchTestImpl( AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get()); AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get()); + AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get()); + AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get()); AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); armnn::TensorInfo tensorInfo4({numUnits}, constantDataType , qScale, qOffset); @@ -184,6 +193,8 @@ UnidirectionalSequenceLstmTimeMajorSingleBatchTestImpl( outputStateInHandle->Allocate(); cellStateInHandle->Allocate(); + outputStateOutHandle->Allocate(); + cellStateOutHandle->Allocate(); outputHandle->Allocate(); CopyDataToITensorHandle(inputHandle.get(), inputVector.data()); @@ -192,6 +203,8 @@ UnidirectionalSequenceLstmTimeMajorSingleBatchTestImpl( workload->Execute(); + CopyDataFromITensorHandle(actualOutputStateOut.data(), outputStateOutHandle.get()); + CopyDataFromITensorHandle(actualCellStateOut.data(), cellStateOutHandle.get()); CopyDataFromITensorHandle(actualOutput.data(), outputHandle.get()); return LayerTestResult(actualOutput, @@ -222,7 +235,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerFloat32TestImpl( armnn::TensorInfo inputTensorInfo({batchSize, timeSize, inputSize}, ArmnnType, qScale, qOffset); armnn::TensorInfo cellStateInTensorInfo({batchSize, numUnits}, ArmnnType, qScale, qOffset); armnn::TensorInfo outputStateInTensorInfo({batchSize, outputSize}, ArmnnType, qScale, qOffset); - + armnn::TensorInfo outputStateOutTensorInfo({batchSize, timeSize, outputSize}, ArmnnType, qScale, qOffset); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, timeSize, outputSize}, ArmnnType, qScale, qOffset); armnn::TensorInfo outputTensorInfo({batchSize, timeSize, outputSize}, ArmnnType, qScale, qOffset); std::vector inputVector; @@ -231,6 +245,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerFloat32TestImpl( std::vector cellStateInVector(batchSize * numUnits, T()); std::vector outputStateInVector(batchSize * outputSize, T()); + std::vector actualOutputStateOut(outputStateOutTensorInfo.GetNumElements()); + std::vector actualCellStateOut(cellStateOutTensorInfo.GetNumElements()); std::vector actualOutput(outputTensorInfo.GetNumElements()); std::vector outputVector; @@ -242,6 +258,10 @@ LayerTestResult UnidirectionalSequenceLstmLayerFloat32TestImpl( std::unique_ptr outputStateInHandle = tensorHandleFactory.CreateTensorHandle(outputStateInTensorInfo); + std::unique_ptr outputStateOutHandle = + tensorHandleFactory.CreateTensorHandle(outputStateOutTensorInfo); + 
std::unique_ptr cellStateOutHandle = + tensorHandleFactory.CreateTensorHandle(cellStateOutTensorInfo); std::unique_ptr outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo); armnn::UnidirectionalSequenceLstmQueueDescriptor data; @@ -251,6 +271,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerFloat32TestImpl( AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get()); AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get()); + AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get()); + AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get()); AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); armnn::TensorInfo tensorInfo4({numUnits}, constantDataType, qScale, qOffset); @@ -359,6 +381,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerFloat32TestImpl( outputStateInHandle->Allocate(); cellStateInHandle->Allocate(); + outputStateOutHandle->Allocate(); + cellStateOutHandle->Allocate(); outputHandle->Allocate(); CopyDataToITensorHandle(inputHandle.get(), inputVector.data()); @@ -367,6 +391,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerFloat32TestImpl( workload->Execute(); + CopyDataFromITensorHandle(actualOutputStateOut.data(), outputStateOutHandle.get()); + CopyDataFromITensorHandle(actualCellStateOut.data(), cellStateOutHandle.get()); CopyDataFromITensorHandle(actualOutput.data(), outputHandle.get()); return LayerTestResult(actualOutput, @@ -398,7 +424,8 @@ UnidirectionalSequenceLstmLayerFloat32TimeMajorTestImpl( armnn::TensorInfo inputTensorInfo({timeSize, batchSize, inputSize}, ArmnnType, qScale, qOffset); armnn::TensorInfo cellStateInTensorInfo({batchSize, numUnits}, ArmnnType, qScale, qOffset); armnn::TensorInfo outputStateInTensorInfo({batchSize, outputSize}, ArmnnType, qScale, qOffset); - + armnn::TensorInfo outputStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); armnn::TensorInfo outputTensorInfo({timeSize, batchSize, outputSize}, ArmnnType, qScale, qOffset); std::vector inputVector; @@ -407,6 +434,8 @@ UnidirectionalSequenceLstmLayerFloat32TimeMajorTestImpl( std::vector cellStateInVector(batchSize * numUnits, T()); std::vector outputStateInVector(batchSize * outputSize, T()); + std::vector actualOutputStateOut(outputStateOutTensorInfo.GetNumElements()); + std::vector actualCellStateOut(cellStateOutTensorInfo.GetNumElements()); std::vector actualOutput(outputTensorInfo.GetNumElements()); std::vector outputVector; @@ -418,6 +447,10 @@ UnidirectionalSequenceLstmLayerFloat32TimeMajorTestImpl( std::unique_ptr outputStateInHandle = tensorHandleFactory.CreateTensorHandle(outputStateInTensorInfo); + std::unique_ptr outputStateOutHandle = + tensorHandleFactory.CreateTensorHandle(outputStateOutTensorInfo); + std::unique_ptr cellStateOutHandle = + tensorHandleFactory.CreateTensorHandle(cellStateOutTensorInfo); std::unique_ptr outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo); armnn::UnidirectionalSequenceLstmQueueDescriptor data; @@ -427,6 +460,8 @@ UnidirectionalSequenceLstmLayerFloat32TimeMajorTestImpl( AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get()); AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get()); + AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get()); + AddOutputToWorkload(data, info, 
cellStateOutTensorInfo, cellStateOutHandle.get()); AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); armnn::TensorInfo tensorInfo4({numUnits}, constantDataType, qScale, qOffset); @@ -535,6 +570,8 @@ UnidirectionalSequenceLstmLayerFloat32TimeMajorTestImpl( outputStateInHandle->Allocate(); cellStateInHandle->Allocate(); + outputStateOutHandle->Allocate(); + cellStateOutHandle->Allocate(); outputHandle->Allocate(); CopyDataToITensorHandle(inputHandle.get(), inputVector.data()); @@ -543,6 +580,8 @@ UnidirectionalSequenceLstmLayerFloat32TimeMajorTestImpl( workload->Execute(); + CopyDataFromITensorHandle(actualOutputStateOut.data(), outputStateOutHandle.get()); + CopyDataFromITensorHandle(actualCellStateOut.data(), cellStateOutHandle.get()); CopyDataFromITensorHandle(actualOutput.data(), outputHandle.get()); return LayerTestResult(actualOutput, @@ -644,6 +683,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerNoCifgWithPeepholeWithP armnn::TensorInfo inputTensorInfo({batchSize, timeSize, inputSize}, armnn::DataType::Float32); armnn::TensorInfo cellStateInTensorInfo({batchSize , numUnits}, armnn::DataType::Float32); armnn::TensorInfo outputStateInTensorInfo({batchSize , outputSize}, armnn::DataType::Float32); + armnn::TensorInfo outputStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); armnn::TensorInfo outputTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); const std::vector inputVector = { 1., 2., 3., 4., 5., 4., @@ -654,6 +695,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerNoCifgWithPeepholeWithP std::vector cellStateInVector(batchSize * numUnits, 0.f); std::vector outputStateInVector(batchSize * outputSize, 0.f); + std::vector actualOutputStateOut(outputStateOutTensorInfo.GetNumElements()); + std::vector actualCellStateOut(cellStateOutTensorInfo.GetNumElements()); std::vector actualOutput(outputTensorInfo.GetNumElements()); const std::vector expectedOutput = { -0.0135612f, -0.0263441f, 0.0314008f, -0.00883455f, 0.00763052f, @@ -668,6 +711,11 @@ LayerTestResult UnidirectionalSequenceLstmLayerNoCifgWithPeepholeWithP tensorHandleFactory.CreateTensorHandle(cellStateInTensorInfo); std::unique_ptr outputStateInHandle = tensorHandleFactory.CreateTensorHandle(outputStateInTensorInfo); + + std::unique_ptr outputStateOutHandle = + tensorHandleFactory.CreateTensorHandle(outputStateOutTensorInfo); + std::unique_ptr cellStateOutHandle = + tensorHandleFactory.CreateTensorHandle(cellStateOutTensorInfo); std::unique_ptr outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo); armnn::UnidirectionalSequenceLstmQueueDescriptor data; @@ -676,6 +724,9 @@ LayerTestResult UnidirectionalSequenceLstmLayerNoCifgWithPeepholeWithP AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get()); AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get()); + + AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get()); + AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get()); AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); armnn::TensorInfo tensorInfo5({outputSize}, armnn::DataType::Float32); @@ -849,6 +900,9 @@ LayerTestResult UnidirectionalSequenceLstmLayerNoCifgWithPeepholeWithP inputHandle->Allocate(); outputStateInHandle->Allocate(); 
cellStateInHandle->Allocate(); + + outputStateOutHandle->Allocate(); + cellStateOutHandle->Allocate(); outputHandle->Allocate(); CopyDataToITensorHandle(inputHandle.get(), inputVector.data()); @@ -857,6 +911,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerNoCifgWithPeepholeWithP workload->Execute(); + CopyDataFromITensorHandle(actualOutputStateOut.data(), outputStateOutHandle.get()); + CopyDataFromITensorHandle(actualCellStateOut.data(), cellStateOutHandle.get()); CopyDataFromITensorHandle(actualOutput.data(), outputHandle.get()); return LayerTestResult(actualOutput, @@ -880,6 +936,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerNoCifgWithPeepholeWithP armnn::TensorInfo inputTensorInfo({batchSize, timeSize, inputSize}, armnn::DataType::Float32); armnn::TensorInfo cellStateInTensorInfo({batchSize , numUnits}, armnn::DataType::Float32); armnn::TensorInfo outputStateInTensorInfo({batchSize , outputSize}, armnn::DataType::Float32); + armnn::TensorInfo outputStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); armnn::TensorInfo outputTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); const std::vector inputVector = { 1., 2., 3., 4., 5., 4., @@ -889,6 +947,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerNoCifgWithPeepholeWithP std::vector cellStateInVector(batchSize * numUnits, 0.f); std::vector outputStateInVector(batchSize * outputSize, 0.f); + std::vector actualOutputStateOut(outputStateOutTensorInfo.GetNumElements()); + std::vector actualCellStateOut(cellStateOutTensorInfo.GetNumElements()); std::vector actualOutput(outputTensorInfo.GetNumElements()); const std::vector expectedOutput = { 0.0642256f, 0.0343966f, 0.184122f, 0.114717f, @@ -904,6 +964,10 @@ LayerTestResult UnidirectionalSequenceLstmLayerNoCifgWithPeepholeWithP std::unique_ptr outputStateInHandle = tensorHandleFactory.CreateTensorHandle(outputStateInTensorInfo); + std::unique_ptr outputStateOutHandle = + tensorHandleFactory.CreateTensorHandle(outputStateOutTensorInfo); + std::unique_ptr cellStateOutHandle = + tensorHandleFactory.CreateTensorHandle(cellStateOutTensorInfo); std::unique_ptr outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo); armnn::UnidirectionalSequenceLstmQueueDescriptor data; @@ -913,6 +977,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerNoCifgWithPeepholeWithP AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get()); AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get()); + AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get()); + AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get()); AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); armnn::TensorInfo tensorInfo4({outputSize}, armnn::DataType::Float32); @@ -1074,6 +1140,9 @@ LayerTestResult UnidirectionalSequenceLstmLayerNoCifgWithPeepholeWithP inputHandle->Allocate(); outputStateInHandle->Allocate(); cellStateInHandle->Allocate(); + + outputStateOutHandle->Allocate(); + cellStateOutHandle->Allocate(); outputHandle->Allocate(); CopyDataToITensorHandle(inputHandle.get(), inputVector.data()); @@ -1082,6 +1151,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerNoCifgWithPeepholeWithP workload->Execute(); + CopyDataFromITensorHandle(actualOutputStateOut.data(), outputStateOutHandle.get()); + CopyDataFromITensorHandle(actualCellStateOut.data(), 
cellStateOutHandle.get()); CopyDataFromITensorHandle(actualOutput.data(), outputHandle.get()); return LayerTestResult(actualOutput, @@ -1105,7 +1176,8 @@ LayerTestResult UnidirectionalSequenceLstmWithCifgWithPeepholeNoProjec armnn::TensorInfo inputTensorInfo({batchSize, timeSize, inputSize}, armnn::DataType::Float32); armnn::TensorInfo cellStateInTensorInfo({batchSize, numUnits}, armnn::DataType::Float32); armnn::TensorInfo outputStateInTensorInfo({batchSize, outputSize}, armnn::DataType::Float32); - + armnn::TensorInfo outputStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); armnn::TensorInfo outputTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); std::vector inputVector = { 1., 2., 3., 4., 5., 4., @@ -1115,6 +1187,8 @@ LayerTestResult UnidirectionalSequenceLstmWithCifgWithPeepholeNoProjec std::vector cellStateInVector(batchSize * numUnits, 0.f); std::vector outputStateInVector(batchSize * outputSize, 0.f); + std::vector actualOutputStateOut(outputStateOutTensorInfo.GetNumElements()); + std::vector actualCellStateOut(cellStateOutTensorInfo.GetNumElements()); std::vector actualOutput(outputTensorInfo.GetNumElements()); std::vector outputVector = { -0.0129257f, -0.070531f, -0.153508f, -0.0392391f, @@ -1130,6 +1204,10 @@ LayerTestResult UnidirectionalSequenceLstmWithCifgWithPeepholeNoProjec std::unique_ptr outputStateInHandle = tensorHandleFactory.CreateTensorHandle(outputStateInTensorInfo); + std::unique_ptr outputStateOutHandle = + tensorHandleFactory.CreateTensorHandle(outputStateOutTensorInfo); + std::unique_ptr cellStateOutHandle = + tensorHandleFactory.CreateTensorHandle(cellStateOutTensorInfo); std::unique_ptr outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo); armnn::UnidirectionalSequenceLstmQueueDescriptor data; @@ -1139,6 +1217,8 @@ LayerTestResult UnidirectionalSequenceLstmWithCifgWithPeepholeNoProjec AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get()); AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get()); + AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get()); + AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get()); AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); armnn::TensorInfo tensorInfo4({numUnits}, armnn::DataType::Float32); @@ -1236,6 +1316,8 @@ LayerTestResult UnidirectionalSequenceLstmWithCifgWithPeepholeNoProjec outputStateInHandle->Allocate(); cellStateInHandle->Allocate(); + outputStateOutHandle->Allocate(); + cellStateOutHandle->Allocate(); outputHandle->Allocate(); CopyDataToITensorHandle(inputHandle.get(), inputVector.data()); @@ -1244,6 +1326,8 @@ LayerTestResult UnidirectionalSequenceLstmWithCifgWithPeepholeNoProjec workload->Execute(); + CopyDataFromITensorHandle(actualOutputStateOut.data(), outputStateOutHandle.get()); + CopyDataFromITensorHandle(actualCellStateOut.data(), cellStateOutHandle.get()); CopyDataFromITensorHandle(actualOutput.data(), outputHandle.get()); return LayerTestResult(actualOutput, @@ -1267,7 +1351,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8Test( armnn::TensorInfo inputTensorInfo({batchSize, timeSize, inputSize}, armnn::DataType::Float32); armnn::TensorInfo cellStateInTensorInfo({batchSize, numUnits}, armnn::DataType::Float32); armnn::TensorInfo outputStateInTensorInfo({batchSize, outputSize}, 
armnn::DataType::Float32); - + armnn::TensorInfo outputStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); armnn::TensorInfo outputTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); const std::vector inputVector = { 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.4f, @@ -1277,6 +1362,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8Test( std::vector cellStateInVector(batchSize * numUnits, 0.f); std::vector outputStateInVector(batchSize * outputSize, 0.f); + std::vector actualOutputStateOut(outputStateOutTensorInfo.GetNumElements()); + std::vector actualCellStateOut(cellStateOutTensorInfo.GetNumElements()); std::vector actualOutput(outputTensorInfo.GetNumElements()); const std::vector outputVector = { -0.0142517f, -0.0198845f, -0.0120569f, -0.0116868f, @@ -1292,8 +1379,13 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8Test( std::unique_ptr outputStateInHandle = tensorHandleFactory.CreateTensorHandle(outputStateInTensorInfo); + std::unique_ptr outputStateOutHandle = + tensorHandleFactory.CreateTensorHandle(outputStateOutTensorInfo); + std::unique_ptr cellStateOutHandle = + tensorHandleFactory.CreateTensorHandle(cellStateOutTensorInfo); std::unique_ptr outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo); + armnn::UnidirectionalSequenceLstmQueueDescriptor data; armnn::WorkloadInfo info; @@ -1301,6 +1393,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8Test( AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get()); AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get()); + AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get()); + AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get()); AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); armnn::TensorInfo tensorInfoNumFp({numUnits}, armnn::DataType::Float32); @@ -1376,6 +1470,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8Test( outputStateInHandle->Allocate(); cellStateInHandle->Allocate(); + outputStateOutHandle->Allocate(); + cellStateOutHandle->Allocate(); outputHandle->Allocate(); CopyDataToITensorHandle(inputHandle.get(), inputVector.data()); @@ -1384,6 +1480,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8Test( workload->Execute(); + CopyDataFromITensorHandle(actualOutputStateOut.data(), outputStateOutHandle.get()); + CopyDataFromITensorHandle(actualCellStateOut.data(), cellStateOutHandle.get()); CopyDataFromITensorHandle(actualOutput.data(), outputHandle.get()); return LayerTestResult(actualOutput, @@ -1407,7 +1505,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8TimeMajorTest( armnn::TensorInfo inputTensorInfo({timeSize, batchSize, inputSize}, armnn::DataType::Float32); armnn::TensorInfo cellStateInTensorInfo({batchSize, numUnits}, armnn::DataType::Float32); armnn::TensorInfo outputStateInTensorInfo({batchSize, outputSize}, armnn::DataType::Float32); - + armnn::TensorInfo outputStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); armnn::TensorInfo outputTensorInfo({timeSize, batchSize, outputSize}, armnn::DataType::Float32); const std::vector inputVector = { 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.4f, @@ -1417,6 +1516,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8TimeMajorTest( 
std::vector cellStateInVector(batchSize * numUnits, 0.f); std::vector outputStateInVector(batchSize * outputSize, 0.f); + std::vector actualOutputStateOut(outputStateOutTensorInfo.GetNumElements()); + std::vector actualCellStateOut(cellStateOutTensorInfo.GetNumElements()); std::vector actualOutput(outputTensorInfo.GetNumElements()); const std::vector outputVector = { -0.0142517f, -0.0198845f, -0.0120122f, -0.0116868f, @@ -1431,8 +1532,13 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8TimeMajorTest( std::unique_ptr outputStateInHandle = tensorHandleFactory.CreateTensorHandle(outputStateInTensorInfo); + std::unique_ptr outputStateOutHandle = + tensorHandleFactory.CreateTensorHandle(outputStateOutTensorInfo); + std::unique_ptr cellStateOutHandle = + tensorHandleFactory.CreateTensorHandle(cellStateOutTensorInfo); std::unique_ptr outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo); + armnn::UnidirectionalSequenceLstmQueueDescriptor data; armnn::WorkloadInfo info; @@ -1440,6 +1546,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8TimeMajorTest( AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get()); AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get()); + AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get()); + AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get()); AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); armnn::TensorInfo tensorInfoNumFp({numUnits}, armnn::DataType::Float32); @@ -1516,6 +1624,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8TimeMajorTest( outputStateInHandle->Allocate(); cellStateInHandle->Allocate(); + outputStateOutHandle->Allocate(); + cellStateOutHandle->Allocate(); outputHandle->Allocate(); CopyDataToITensorHandle(inputHandle.get(), inputVector.data()); @@ -1524,6 +1634,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8TimeMajorTest( workload->Execute(); + CopyDataFromITensorHandle(actualOutputStateOut.data(), outputStateOutHandle.get()); + CopyDataFromITensorHandle(actualCellStateOut.data(), cellStateOutHandle.get()); CopyDataFromITensorHandle(actualOutput.data(), outputHandle.get()); return LayerTestResult(actualOutput, @@ -1547,6 +1659,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8NoCifgWithPeepholeW armnn::TensorInfo inputTensorInfo({batchSize, timeSize, inputSize}, armnn::DataType::Float32); armnn::TensorInfo cellStateInTensorInfo({batchSize , numUnits}, armnn::DataType::Float32); armnn::TensorInfo outputStateInTensorInfo({batchSize , outputSize}, armnn::DataType::Float32); + armnn::TensorInfo outputStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); armnn::TensorInfo outputTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); const std::vector inputVector = { 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.4f, @@ -1556,6 +1670,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8NoCifgWithPeepholeW std::vector cellStateInVector(batchSize * numUnits, 0.f); std::vector outputStateInVector(batchSize * outputSize, 0.f); + std::vector actualOutputStateOut(outputStateOutTensorInfo.GetNumElements()); + std::vector actualCellStateOut(cellStateOutTensorInfo.GetNumElements()); std::vector actualOutput(outputTensorInfo.GetNumElements()); const std::vector expectedOutput = { 0.612103f, 1.56788f, 0.31966f, 1.42956f, @@ -1570,6 +1686,11 @@ 
LayerTestResult UnidirectionalSequenceLstmLayerInt8NoCifgWithPeepholeW tensorHandleFactory.CreateTensorHandle(cellStateInTensorInfo); std::unique_ptr outputStateInHandle = tensorHandleFactory.CreateTensorHandle(outputStateInTensorInfo); + + std::unique_ptr outputStateOutHandle = + tensorHandleFactory.CreateTensorHandle(outputStateOutTensorInfo); + std::unique_ptr cellStateOutHandle = + tensorHandleFactory.CreateTensorHandle(cellStateOutTensorInfo); std::unique_ptr outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo); armnn::UnidirectionalSequenceLstmQueueDescriptor data; @@ -1578,6 +1699,9 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8NoCifgWithPeepholeW AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get()); AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get()); + + AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get()); + AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get()); AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); armnn::TensorInfo tensorInfoOut({outputSize}, armnn::DataType::Float32); @@ -1679,6 +1803,9 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8NoCifgWithPeepholeW inputHandle->Allocate(); outputStateInHandle->Allocate(); cellStateInHandle->Allocate(); + + outputStateOutHandle->Allocate(); + cellStateOutHandle->Allocate(); outputHandle->Allocate(); CopyDataToITensorHandle(inputHandle.get(), inputVector.data()); @@ -1687,6 +1814,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8NoCifgWithPeepholeW workload->Execute(); + CopyDataFromITensorHandle(actualOutputStateOut.data(), outputStateOutHandle.get()); + CopyDataFromITensorHandle(actualCellStateOut.data(), cellStateOutHandle.get()); CopyDataFromITensorHandle(actualOutput.data(), outputHandle.get()); return LayerTestResult(actualOutput, @@ -1710,6 +1839,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8NoCifgWithPeepholeW armnn::TensorInfo inputTensorInfo({batchSize, timeSize, inputSize}, armnn::DataType::Float32); armnn::TensorInfo cellStateInTensorInfo({batchSize , numUnits}, armnn::DataType::Float32); armnn::TensorInfo outputStateInTensorInfo({batchSize , outputSize}, armnn::DataType::Float32); + armnn::TensorInfo outputStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); armnn::TensorInfo outputTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); const std::vector inputVector = { 1., 8., 3., 4., 5., 4., @@ -1719,6 +1850,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8NoCifgWithPeepholeW std::vector cellStateInVector(batchSize * numUnits, 0.f); std::vector outputStateInVector(batchSize * outputSize, 0.f); + std::vector actualOutputStateOut(outputStateOutTensorInfo.GetNumElements()); + std::vector actualCellStateOut(cellStateOutTensorInfo.GetNumElements()); std::vector actualOutput(outputTensorInfo.GetNumElements()); const std::vector expectedOutput = { 0.0471276f, 0.0168155f, 0.0789885f, 0.16550f, @@ -1734,6 +1867,10 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8NoCifgWithPeepholeW std::unique_ptr outputStateInHandle = tensorHandleFactory.CreateTensorHandle(outputStateInTensorInfo); + std::unique_ptr outputStateOutHandle = + tensorHandleFactory.CreateTensorHandle(outputStateOutTensorInfo); + std::unique_ptr 
cellStateOutHandle = + tensorHandleFactory.CreateTensorHandle(cellStateOutTensorInfo); std::unique_ptr outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo); armnn::UnidirectionalSequenceLstmQueueDescriptor data; @@ -1743,6 +1880,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8NoCifgWithPeepholeW AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get()); AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get()); + AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get()); + AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get()); AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); armnn::TensorInfo tensorInfoOut({outputSize}, armnn::DataType::Float32); @@ -1871,6 +2010,9 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8NoCifgWithPeepholeW inputHandle->Allocate(); outputStateInHandle->Allocate(); cellStateInHandle->Allocate(); + + outputStateOutHandle->Allocate(); + cellStateOutHandle->Allocate(); outputHandle->Allocate(); CopyDataToITensorHandle(inputHandle.get(), inputVector.data()); @@ -1879,6 +2021,8 @@ LayerTestResult UnidirectionalSequenceLstmLayerInt8NoCifgWithPeepholeW workload->Execute(); + CopyDataFromITensorHandle(actualOutputStateOut.data(), outputStateOutHandle.get()); + CopyDataFromITensorHandle(actualCellStateOut.data(), cellStateOutHandle.get()); CopyDataFromITensorHandle(actualOutput.data(), outputHandle.get()); return LayerTestResult(actualOutput, @@ -1902,7 +2046,8 @@ LayerTestResult UnidirectionalSequenceLstmInt8WithCifgWithPeepholeNoPr armnn::TensorInfo inputTensorInfo({batchSize, timeSize, inputSize}, armnn::DataType::Float32); armnn::TensorInfo cellStateInTensorInfo({batchSize, numUnits}, armnn::DataType::Float32); armnn::TensorInfo outputStateInTensorInfo({batchSize, outputSize}, armnn::DataType::Float32); - + armnn::TensorInfo outputStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); armnn::TensorInfo outputTensorInfo({batchSize, timeSize, outputSize}, armnn::DataType::Float32); const std::vector inputVector = { 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.4f, @@ -1912,6 +2057,8 @@ LayerTestResult UnidirectionalSequenceLstmInt8WithCifgWithPeepholeNoPr std::vector cellStateInVector(batchSize * numUnits, 0.f); std::vector outputStateInVector(batchSize * outputSize, 0.f); + std::vector actualOutputStateOut(outputStateOutTensorInfo.GetNumElements()); + std::vector actualCellStateOut(cellStateOutTensorInfo.GetNumElements()); std::vector actualOutput(outputTensorInfo.GetNumElements()); const std::vector outputVector = { -0.0072104f, -0.00991171f, -0.00650478f, -0.00713055f, @@ -1927,6 +2074,10 @@ LayerTestResult UnidirectionalSequenceLstmInt8WithCifgWithPeepholeNoPr std::unique_ptr outputStateInHandle = tensorHandleFactory.CreateTensorHandle(outputStateInTensorInfo); + std::unique_ptr outputStateOutHandle = + tensorHandleFactory.CreateTensorHandle(outputStateOutTensorInfo); + std::unique_ptr cellStateOutHandle = + tensorHandleFactory.CreateTensorHandle(cellStateOutTensorInfo); std::unique_ptr outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo); armnn::UnidirectionalSequenceLstmQueueDescriptor data; @@ -1936,6 +2087,8 @@ LayerTestResult UnidirectionalSequenceLstmInt8WithCifgWithPeepholeNoPr AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get()); 
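+    // The workload is now wired with three inputs (input, outputStateIn, cellStateIn) and three
+    // outputs: the state outputs are added before the sequence output, matching the layer's
+    // output slot order of outputStateOut (0), cellStateOut (1), output (2).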
AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get()); + AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get()); + AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get()); AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); armnn::TensorInfo tensorInfoNumFp({numUnits}, armnn::DataType::Float32); @@ -2009,6 +2162,8 @@ LayerTestResult UnidirectionalSequenceLstmInt8WithCifgWithPeepholeNoPr outputStateInHandle->Allocate(); cellStateInHandle->Allocate(); + outputStateOutHandle->Allocate(); + cellStateOutHandle->Allocate(); outputHandle->Allocate(); CopyDataToITensorHandle(inputHandle.get(), inputVector.data()); @@ -2017,10 +2172,12 @@ LayerTestResult UnidirectionalSequenceLstmInt8WithCifgWithPeepholeNoPr workload->Execute(); + CopyDataFromITensorHandle(actualOutputStateOut.data(), outputStateOutHandle.get()); + CopyDataFromITensorHandle(actualCellStateOut.data(), cellStateOutHandle.get()); CopyDataFromITensorHandle(actualOutput.data(), outputHandle.get()); return LayerTestResult(actualOutput, outputVector, outputHandle->GetShape(), outputTensorInfo.GetShape()); -} \ No newline at end of file +} diff --git a/src/backends/cl/ClLayerSupport.cpp b/src/backends/cl/ClLayerSupport.cpp index 4dcaca9576..09b3f43838 100644 --- a/src/backends/cl/ClLayerSupport.cpp +++ b/src/backends/cl/ClLayerSupport.cpp @@ -1444,9 +1444,9 @@ bool ClLayerSupport::IsTransposeSupported(const TensorInfo& input, bool ClLayerSupport::IsUnidirectionalSequenceLstmSupported(const TensorInfo& input, const TensorInfo& outputStateIn, const TensorInfo& cellStateIn, + const TensorInfo& outputStateOut, + const TensorInfo& cellStateOut, const TensorInfo& output, - const Optional& hiddenStateOutput, - const Optional& cellStateOutput, const UnidirectionalSequenceLstmDescriptor& descriptor, const LstmInputParamsInfo& paramsInfo, Optional reasonIfUnsupported) const @@ -1456,9 +1456,9 @@ bool ClLayerSupport::IsUnidirectionalSequenceLstmSupported(const TensorInfo& inp input, outputStateIn, cellStateIn, + outputStateOut, + cellStateOut, output, - hiddenStateOutput, - cellStateOutput, descriptor, paramsInfo); } diff --git a/src/backends/cl/ClLayerSupport.hpp b/src/backends/cl/ClLayerSupport.hpp index b4d0e8298a..4f4e64e113 100644 --- a/src/backends/cl/ClLayerSupport.hpp +++ b/src/backends/cl/ClLayerSupport.hpp @@ -334,9 +334,9 @@ public: bool IsUnidirectionalSequenceLstmSupported(const TensorInfo& input, const TensorInfo& outputStateIn, const TensorInfo& cellStateIn, + const TensorInfo& outputStateOut, + const TensorInfo& cellStateOut, const TensorInfo& output, - const Optional& hiddenStateOutput, - const Optional& cellStateOutput, const UnidirectionalSequenceLstmDescriptor& descriptor, const LstmInputParamsInfo& paramsInfo, Optional reasonIfUnsupported) const override; diff --git a/src/backends/cl/workloads/ClUnidirectionalSequenceLstmFloatWorkload.cpp b/src/backends/cl/workloads/ClUnidirectionalSequenceLstmFloatWorkload.cpp index cc9aea8486..ac24120804 100644 --- a/src/backends/cl/workloads/ClUnidirectionalSequenceLstmFloatWorkload.cpp +++ b/src/backends/cl/workloads/ClUnidirectionalSequenceLstmFloatWorkload.cpp @@ -41,17 +41,17 @@ ClUnidirectionalSequenceLstmFloatWorkload::ClUnidirectionalSequenceLstmFloatWork GetGuid()); const arm_compute::ICLTensor& input = static_cast(m_Data.m_Inputs[0])->GetTensor(); - arm_compute::ICLTensor& output = static_cast(m_Data.m_Outputs[0])->GetTensor(); + arm_compute::ICLTensor& output = 
static_cast(m_Data.m_Outputs[2])->GetTensor(); TensorInfo inputInfo = info.m_InputTensorInfos[0]; - TensorInfo outputInfo = info.m_OutputTensorInfos[0]; + TensorInfo outputInfo = info.m_OutputTensorInfos[2]; arm_compute::DataType armComputeDataType = static_cast(m_Data.m_Inputs[0])->GetDataType(); armnn::DataType armnnDataType = GetArmNNDataType(armComputeDataType); TensorShape inputLayerShape = static_cast(m_Data.m_Inputs[0])->GetShape(); TensorShape cellStateLayerShape = static_cast(m_Data.m_Inputs[2])->GetShape(); - TensorShape outputLayerShape = static_cast(m_Data.m_Outputs[0])->GetShape(); + TensorShape outputLayerShape = static_cast(m_Data.m_Outputs[2])->GetShape(); unsigned int maxTime = m_Data.m_Parameters.m_TimeMajor ? inputLayerShape[0] : inputLayerShape[1]; unsigned int batchSize = m_Data.m_Parameters.m_TimeMajor ? inputLayerShape[1] : inputLayerShape[0]; diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp index e2098a310f..210535536e 100644 --- a/src/backends/neon/NeonLayerSupport.cpp +++ b/src/backends/neon/NeonLayerSupport.cpp @@ -77,6 +77,7 @@ #include "workloads/NeonTransposeConvolution2dWorkload.hpp" #include "workloads/NeonTransposeWorkload.hpp" #include "workloads/NeonUnidirectionalSequenceLstmFloatWorkload.hpp" +#include "workloads/NeonUnidirectionalSequenceLstmWorkload.hpp" #endif namespace armnn @@ -1436,23 +1437,44 @@ bool NeonLayerSupport::IsTransposeSupported(const TensorInfo& input, bool NeonLayerSupport::IsUnidirectionalSequenceLstmSupported(const TensorInfo& input, const TensorInfo& outputStateIn, const TensorInfo& cellStateIn, + const TensorInfo& outputStateOut, + const TensorInfo& cellStateOut, const TensorInfo& output, - const Optional& hiddenStateOutput, - const Optional& cellStateOutput, const UnidirectionalSequenceLstmDescriptor& descriptor, const LstmInputParamsInfo& paramsInfo, Optional reasonIfUnsupported) const { - FORWARD_WORKLOAD_VALIDATE_FUNC(NeonUnidirectionalSequenceLstmFloatWorkloadValidate, - reasonIfUnsupported, - input, - outputStateIn, - cellStateIn, - output, - hiddenStateOutput, - cellStateOutput, - descriptor, - paramsInfo); + if (input.GetDataType() == armnn::DataType::QAsymmS8 && + outputStateIn.GetDataType() == armnn::DataType::QAsymmS8 && + cellStateIn.GetDataType() == armnn::DataType::QSymmS16 && + outputStateOut.GetDataType() == armnn::DataType::QAsymmS8 && + cellStateOut.GetDataType() == armnn::DataType::QSymmS16 && + output.GetDataType() == armnn::DataType::QAsymmS8) + { + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonUnidirectionalSequenceLstmWorkloadValidate, + reasonIfUnsupported, + input, + outputStateIn, + cellStateIn, + outputStateOut, + cellStateOut, + output, + descriptor, + paramsInfo); + } + else + { + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonUnidirectionalSequenceLstmFloatWorkloadValidate, + reasonIfUnsupported, + input, + outputStateIn, + cellStateIn, + outputStateOut, + cellStateOut, + output, + descriptor, + paramsInfo); + } } } // namespace armnn diff --git a/src/backends/neon/NeonLayerSupport.hpp b/src/backends/neon/NeonLayerSupport.hpp index 1eef41fda5..511bb035d2 100644 --- a/src/backends/neon/NeonLayerSupport.hpp +++ b/src/backends/neon/NeonLayerSupport.hpp @@ -339,9 +339,9 @@ public: bool IsUnidirectionalSequenceLstmSupported(const TensorInfo& input, const TensorInfo& outputStateIn, const TensorInfo& cellStateIn, + const TensorInfo& outputStateOut, + const TensorInfo& cellStateOut, const TensorInfo& output, - const Optional& hiddenStateOutput, - const Optional& cellStateOutput, const 
UnidirectionalSequenceLstmDescriptor& descriptor, const LstmInputParamsInfo& paramsInfo, Optional reasonIfUnsupported) const override; diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp index 7d94dafc9a..c83e8b3e6d 100644 --- a/src/backends/neon/NeonWorkloadFactory.cpp +++ b/src/backends/neon/NeonWorkloadFactory.cpp @@ -558,7 +558,20 @@ std::unique_ptr NeonWorkloadFactory::CreateWorkload(LayerType type, case LayerType::UnidirectionalSequenceLstm : { auto desc = PolymorphicDowncast(&descriptor); - return MakeWorkloadHelper(*desc, info); + + if ((info.m_InputTensorInfos[0].GetDataType() == armnn::DataType::Float32) && + (info.m_InputTensorInfos[1].GetDataType() == armnn::DataType::Float32) && + (info.m_InputTensorInfos[2].GetDataType() == armnn::DataType::Float32) && + (info.m_OutputTensorInfos[0].GetDataType() == armnn::DataType::Float32) && + (info.m_OutputTensorInfos[1].GetDataType() == armnn::DataType::Float32) && + (info.m_OutputTensorInfos[2].GetDataType() == armnn::DataType::Float32)) + { + return std::make_unique(*desc, info); + } + else + { + return std::make_unique(*desc, info); + } } default: return nullptr; diff --git a/src/backends/neon/backend.mk b/src/backends/neon/backend.mk index d43426f7f4..0d6fd8035c 100644 --- a/src/backends/neon/backend.mk +++ b/src/backends/neon/backend.mk @@ -85,7 +85,8 @@ BACKEND_SOURCES := \ workloads/NeonSubtractionWorkload.cpp \ workloads/NeonTransposeConvolution2dWorkload.cpp \ workloads/NeonTransposeWorkload.cpp \ - workloads/NeonUnidirectionalSequenceLstmFloatWorkload.cpp + workloads/NeonUnidirectionalSequenceLstmFloatWorkload.cpp \ + workloads/NeonUnidirectionalSequenceLstmWorkload.cpp else diff --git a/src/backends/neon/workloads/CMakeLists.txt b/src/backends/neon/workloads/CMakeLists.txt index 41c5f5a950..33a18e38da 100644 --- a/src/backends/neon/workloads/CMakeLists.txt +++ b/src/backends/neon/workloads/CMakeLists.txt @@ -133,6 +133,8 @@ list(APPEND armnnNeonBackendWorkloads_sources NeonTransposeWorkload.hpp NeonUnidirectionalSequenceLstmFloatWorkload.cpp NeonUnidirectionalSequenceLstmFloatWorkload.hpp + NeonUnidirectionalSequenceLstmWorkload.cpp + NeonUnidirectionalSequenceLstmWorkload.hpp NeonWorkloads.hpp NeonWorkloadUtils.hpp ) diff --git a/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmFloatWorkload.cpp b/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmFloatWorkload.cpp index c911afb237..8dba719d91 100644 --- a/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmFloatWorkload.cpp +++ b/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmFloatWorkload.cpp @@ -39,7 +39,7 @@ NeonUnidirectionalSequenceLstmFloatWorkload::NeonUnidirectionalSequenceLstmFloat GetGuid()); const arm_compute::ITensor& input = static_cast(m_Data.m_Inputs[0])->GetTensor(); - arm_compute::ITensor& output = static_cast(m_Data.m_Outputs[0])->GetTensor(); + arm_compute::ITensor& output = static_cast(m_Data.m_Outputs[2])->GetTensor(); TensorInfo inputInfo = info.m_InputTensorInfos[0]; TensorInfo outputInfo = info.m_OutputTensorInfos[0]; @@ -49,7 +49,7 @@ NeonUnidirectionalSequenceLstmFloatWorkload::NeonUnidirectionalSequenceLstmFloat TensorShape inputLayerShape = static_cast(m_Data.m_Inputs[0])->GetShape(); TensorShape cellStateLayerShape = static_cast(m_Data.m_Inputs[2])->GetShape(); - TensorShape outputLayerShape = static_cast(m_Data.m_Outputs[0])->GetShape(); + TensorShape outputLayerShape = static_cast(m_Data.m_Outputs[2])->GetShape(); unsigned int maxTime = 
m_Data.m_Parameters.m_TimeMajor ? inputLayerShape[0] : inputLayerShape[1]; unsigned int batchSize = m_Data.m_Parameters.m_TimeMajor ? inputLayerShape[1] : inputLayerShape[0]; @@ -288,7 +288,7 @@ NeonUnidirectionalSequenceLstmFloatWorkload::NeonUnidirectionalSequenceLstmFloat // LSTM input/output cannot be > 2 dimensions so need to resize its TensorInfo. if (maxTime == 1 && m_Data.m_Parameters.m_TimeMajor) { - TensorShape inputShape = GetTensorShape((&input)->info()->tensor_shape(), 1U); + TensorShape inputShape = GetTensorShape(input.info()->tensor_shape(), 1U); TensorShape outputShape = GetTensorShape((&output)->info()->tensor_shape(), 1U); TensorShape inputShapeShrink({inputShape[1], inputShape[2]}); @@ -297,10 +297,10 @@ NeonUnidirectionalSequenceLstmFloatWorkload::NeonUnidirectionalSequenceLstmFloat auto acl_input_shape_shrink = BuildArmComputeTensorShape(inputShapeShrink); auto acl_output_shape_shrink = BuildArmComputeTensorShape(outputShapeShrink); - (&input)->info()->set_tensor_shape(acl_input_shape_shrink); + input.info()->set_tensor_shape(acl_input_shape_shrink); inputLSTM = const_cast(&input); - (&output)->info()->set_tensor_shape(acl_output_shape_shrink); + output.info()->set_tensor_shape(acl_output_shape_shrink); outputLSTM = &output; } // If there is only one LSTM batch major batch, we will not concat, only permute. @@ -432,9 +432,9 @@ NeonUnidirectionalSequenceLstmFloatWorkload::NeonUnidirectionalSequenceLstmFloat unsigned int aclAxisConcat = CalcAclAxis(concatDescriptor.GetNumDimensions(), concatDescriptor.GetConcatAxis()); if (!m_Data.m_Parameters.m_TimeMajor) { - TensorInfo concatOuputTensorInfo = outputInfo; - concatOuputTensorInfo.SetShape(timeMajorShapeOutput); - BuildArmComputeTensor(concat_out, concatOuputTensorInfo); + TensorInfo concatOutputTensorInfo = outputInfo; + concatOutputTensorInfo.SetShape(timeMajorShapeOutput); + BuildArmComputeTensor(concat_out, concatOutputTensorInfo); armcomputetensorutils::InitialiseArmComputeTensorEmpty(concat_out); m_Concat->configure(m_ConcatInputs, &concat_out, aclAxisConcat); @@ -452,11 +452,11 @@ NeonUnidirectionalSequenceLstmFloatWorkload::NeonUnidirectionalSequenceLstmFloat { if (!m_Data.m_Parameters.m_TimeMajor) { - (&output)->info()->set_tensor_shape(BuildArmComputeTensorShape(shapeExpandBatchMajor)); + output.info()->set_tensor_shape(BuildArmComputeTensorShape(shapeExpandBatchMajor)); } else { - (&output)->info()->set_tensor_shape(BuildArmComputeTensorShape(shapeExpandTimeMajor)); + output.info()->set_tensor_shape(BuildArmComputeTensorShape(shapeExpandTimeMajor)); } } @@ -510,14 +510,12 @@ arm_compute::Status NeonUnidirectionalSequenceLstmFloatWorkloadValidate(const TensorInfo& input, const TensorInfo& outputStateIn, const TensorInfo& cellStateIn, + const TensorInfo& outputStateOut, + const TensorInfo& cellStateOut, const TensorInfo& output, - const Optional& hiddenStateOutput, - const Optional& cellStateOutput, const UnidirectionalSequenceLstmDescriptor& descriptor, const LstmInputParamsInfo& paramsInfo) { - IgnoreUnused(hiddenStateOutput, cellStateOutput); - TensorShape inputLayerShape = input.GetShape(); TensorShape outputLayerShape = outputStateIn.GetShape(); @@ -612,8 +610,6 @@ NeonUnidirectionalSequenceLstmFloatWorkloadValidate(const TensorInfo& input, arm_compute::LSTMParams lstm_params_info; const TensorInfo& scratchBuffer = TensorInfo(cellStateIn.GetShape(), input.GetDataType()); - const TensorInfo& outputStateOut = TensorInfo(outputStateIn.GetShape(), input.GetDataType()); - const TensorInfo& cellStateOut = 
TensorInfo(cellStateIn.GetShape(), input.GetDataType()); // The inputs and outputs const arm_compute::TensorInfo aclOutputStateInInfo = BuildArmComputeTensorInfo(outputStateIn); @@ -704,7 +700,7 @@ NeonUnidirectionalSequenceLstmFloatWorkloadValidate(const TensorInfo& input, aclOutputLayerNormWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetOutputLayerNormWeights()); lstm_params_info.set_layer_normalization_params(descriptor.m_CifgEnabled ? nullptr : - &aclInputLayerNormWeightsInfo, + &aclInputLayerNormWeightsInfo, &aclForgetLayerNormWeightsInfo, &aclCellLayerNormWeightsInfo, &aclOutputLayerNormWeightsInfo); @@ -803,9 +799,9 @@ NeonUnidirectionalSequenceLstmFloatWorkloadValidate(const TensorInfo& input, TensorShape shapeExpandTimeMajor({1, shape[0], shape[1]}); TensorShape shapeExpandBatchMajor({shape[0], 1, shape[1]}); - TensorInfo concatOuputTensorInfo = TensorInfo(output); - concatOuputTensorInfo.SetShape(timeMajorShapeOutput); - arm_compute::TensorInfo aclConcatOuputTensorInfo= BuildArmComputeTensorInfo(concatOuputTensorInfo); + TensorInfo concatOutputTensorInfo = TensorInfo(output); + concatOutputTensorInfo.SetShape(timeMajorShapeOutput); + arm_compute::TensorInfo aclConcatOutputTensorInfo= BuildArmComputeTensorInfo(concatOutputTensorInfo); if (maxTime != 1) // ACL concat does not work with only one element to concatenate. { @@ -819,7 +815,7 @@ NeonUnidirectionalSequenceLstmFloatWorkloadValidate(const TensorInfo& input, if (!descriptor.m_TimeMajor) { statusConcat = arm_compute::NEConcatenateLayer::validate(concatInputsTensorInfosPtr, - &aclConcatOuputTensorInfo, + &aclConcatOutputTensorInfo, aclAxisConcat); } else @@ -853,7 +849,7 @@ NeonUnidirectionalSequenceLstmFloatWorkloadValidate(const TensorInfo& input, // Output now time major. Permute output back to batch major. if (maxTime != 1) { - statusPermute2 = arm_compute::NEPermute::validate(&aclConcatOuputTensorInfo, + statusPermute2 = arm_compute::NEPermute::validate(&aclConcatOutputTensorInfo, &aclOutputInfo, arm_compute::PermutationVector(0U, 2U, 1U)); } diff --git a/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmFloatWorkload.hpp b/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmFloatWorkload.hpp index 776afd3965..48cf7dc7e4 100644 --- a/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmFloatWorkload.hpp +++ b/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmFloatWorkload.hpp @@ -10,7 +10,6 @@ #include #include -#include "arm_compute/graph/Tensor.h" #include "arm_compute/runtime/NEON/functions/NELSTMLayer.h" #include "arm_compute/runtime/NEON/functions/NEPermute.h" #include "arm_compute/runtime/NEON/functions/NESplit.h" @@ -86,9 +85,9 @@ arm_compute::Status NeonUnidirectionalSequenceLstmFloatWorkloadValidate(const TensorInfo& input, const TensorInfo& outputStateIn, const TensorInfo& cellStateIn, + const TensorInfo& outputStateOut, + const TensorInfo& cellStateOut, const TensorInfo& output, - const Optional& hiddenStateOutput, - const Optional& cellStateOutput, const UnidirectionalSequenceLstmDescriptor& descriptor, const LstmInputParamsInfo& paramsInfo); diff --git a/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmWorkload.cpp b/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmWorkload.cpp new file mode 100644 index 0000000000..dfbbb3c879 --- /dev/null +++ b/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmWorkload.cpp @@ -0,0 +1,879 @@ +// +// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#include "NeonUnidirectionalSequenceLstmWorkload.hpp" +#include "NeonWorkloadUtils.hpp" + +#include +#include + +#include +#include +#include +#include + +#include "neon/NeonTensorHandle.hpp" + +namespace +{ + +unsigned int CalcAclAxis(unsigned int numDimensions, unsigned int axis) +{ + return (numDimensions - axis) - 1; +} +} //namespace + +namespace armnn +{ +using namespace armcomputetensorutils; + +NeonUnidirectionalSequenceLstmWorkload::NeonUnidirectionalSequenceLstmWorkload + (const UnidirectionalSequenceLstmQueueDescriptor& descriptor, const WorkloadInfo& info) + : NeonBaseWorkload(descriptor, info) +{ + // Report Profiling Details + ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonUnidirectionalSequenceLstmWorkload_Construct", + descriptor.m_Parameters, + info, + GetGuid()); + + // Input/Output tensors + const arm_compute::ITensor& input = static_cast(m_Data.m_Inputs[0])->GetTensor(); + arm_compute::ITensor& outputStateIn = static_cast(m_Data.m_Inputs[1])->GetTensor(); + const arm_compute::ITensor& cellStateIn = static_cast(m_Data.m_Inputs[2])->GetTensor(); + + arm_compute::ITensor& outputStateOut = static_cast(m_Data.m_Outputs[0])->GetTensor(); + arm_compute::ITensor& cellStateOut = static_cast(m_Data.m_Outputs[1])->GetTensor(); + arm_compute::ITensor& output = static_cast(m_Data.m_Outputs[2])->GetTensor(); + + TensorInfo inputInfo = info.m_InputTensorInfos[0]; + TensorInfo outputInfo = info.m_OutputTensorInfos[2]; + + TensorShape inputLayerShape = static_cast(m_Data.m_Inputs[0])->GetShape(); + TensorShape outputLayerShape = static_cast(m_Data.m_Outputs[2])->GetShape(); + + unsigned int maxTime = m_Data.m_Parameters.m_TimeMajor ? inputLayerShape[0] : inputLayerShape[1]; + unsigned int batchSize = m_Data.m_Parameters.m_TimeMajor ? inputLayerShape[1] : inputLayerShape[0]; + unsigned int inputSize = inputLayerShape[2]; + unsigned int outputSize = outputLayerShape[2]; + + const TensorShape timeMajorShapeInput({maxTime, batchSize, inputSize}); + const TensorShape timeMajorShapeOutput({maxTime, batchSize, outputSize}); + + // + // Permute: performed if Unidirectional Sequence Layer inputs/outputs are in batch major format. + // + if (!m_Data.m_Parameters.m_TimeMajor) + { + std::unique_ptr layer(new arm_compute::NEPermute()); + + TensorInfo permuteOutInfo = inputInfo; + permuteOutInfo.SetShape(timeMajorShapeInput); + BuildArmComputeTensor(m_PermuteFirstOut, permuteOutInfo); + armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_PermuteFirstOut); + + // Permute to time major format. 
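+        // NEQLSTMLayer consumes one 2D [batch, input] slice per timestep, so a batch-major
+        // [batch, time, input] tensor is first permuted to time-major [time, batch, input];
+        // the split below can then slice it along dimension 0, one slice per timestep.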
+        layer->configure(&input, &m_PermuteFirstOut, arm_compute::PermutationVector(0U,2U,1U));
+        m_Permute1.reset(layer.release());
+    }
+
+    //
+    // Split and Concat Tensors
+    //
+    for (unsigned int i = 0; i < maxTime; ++i)
+    {
+        arm_compute::Tensor splitter_out;
+        arm_compute::Tensor concat_in;
+
+        auto splitterTensorInfo = inputInfo;
+        auto concatTensorInfo = outputInfo;
+        splitterTensorInfo.SetShape({batchSize, inputSize});
+        concatTensorInfo.SetShape({batchSize, outputSize});
+        BuildArmComputeTensor(splitter_out, splitterTensorInfo);
+        BuildArmComputeTensor(concat_in, concatTensorInfo);
+
+        armcomputetensorutils::InitialiseArmComputeTensorEmpty(splitter_out);
+        armcomputetensorutils::InitialiseArmComputeTensorEmpty(concat_in);
+
+        // append to std::vector
+        m_SplitterOutputsTensors.push_back(std::move(splitter_out));
+        m_ConcatInputsTensors.push_back(std::move(concat_in));
+    }
+
+    for (unsigned int i = 0; i < maxTime; ++i)
+    {
+        // append to std::vector
+        m_SplitterOutputs.push_back(&m_SplitterOutputsTensors[i]);
+        m_ConcatInputs.push_back(&m_ConcatInputsTensors[i]);
+    }
+
+    //
+    // Split
+    //
+    unsigned int numberDimensions = 3;
+    unsigned int dimension = 0; // splitting on 0-dimension (i.e. maxTime dimension)
+
+    if (maxTime != 1) // ACL split does not work with only one element to split.
+    {
+        ViewsDescriptor splitterDesc(maxTime, numberDimensions);
+        unsigned int splitterDimSizes[3] = {1, batchSize, inputSize};
+        for (unsigned int outputIdx = 0u; outputIdx < maxTime; ++outputIdx)
+        {
+            splitterDesc.SetViewOriginCoord(outputIdx, dimension, splitterDimSizes[dimension] * outputIdx);
+            for (unsigned int dimIdx = 0u; dimIdx < numberDimensions; ++dimIdx)
+            {
+                splitterDesc.SetViewSize(outputIdx, dimIdx, splitterDimSizes[dimIdx]);
+            }
+        }
+
+        std::set<unsigned int> splitAxis = ComputeSplitAxis(splitterDesc, timeMajorShapeInput);
+
+        std::unique_ptr<arm_compute::NESplit> split_layer(new arm_compute::NESplit());
+        unsigned int aclAxisSplit = CalcAclAxis(splitterDesc.GetNumDimensions(),
+                                                *splitAxis.begin());
+        if (!m_Data.m_Parameters.m_TimeMajor)
+        {
+            split_layer->configure(&m_PermuteFirstOut, m_SplitterOutputs, aclAxisSplit);
+        }
+        else
+        {
+            split_layer->configure(&input, m_SplitterOutputs, aclAxisSplit);
+        }
+
+        split_layer->prepare();
+        m_Splitter.reset(split_layer.release());
+    }
+
+    //
+    // Lstm
+    //
+    arm_compute::LSTMParams<arm_compute::ITensor> lstm_param;
+
+    lstm_param.set_cell_clip_params(descriptor.m_Parameters.m_ClippingThresCell);
+    lstm_param.set_projection_clip_params(descriptor.m_Parameters.m_ClippingThresProj);
+
+    lstm_param.set_matmul_scale_params(descriptor.m_Parameters.m_InputIntermediateScale,
+                                       descriptor.m_Parameters.m_ForgetIntermediateScale,
+                                       descriptor.m_Parameters.m_CellIntermediateScale,
+                                       descriptor.m_Parameters.m_OutputIntermediateScale);
+
+    lstm_param.set_hidden_state_params(descriptor.m_Parameters.m_HiddenStateZeroPoint,
+                                       descriptor.m_Parameters.m_HiddenStateScale);
+
+    m_InputToForgetWeightsTensor = std::make_unique<arm_compute::Tensor>();
+    BuildArmComputeTensor(*m_InputToForgetWeightsTensor, m_Data.m_InputToForgetWeights->GetTensorInfo());
+
+    m_InputToCellWeightsTensor = std::make_unique<arm_compute::Tensor>();
+    BuildArmComputeTensor(*m_InputToCellWeightsTensor, m_Data.m_InputToCellWeights->GetTensorInfo());
+
+    m_InputToOutputWeightsTensor = std::make_unique<arm_compute::Tensor>();
+    BuildArmComputeTensor(*m_InputToOutputWeightsTensor, m_Data.m_InputToOutputWeights->GetTensorInfo());
+
+    m_RecurrentToForgetWeightsTensor = std::make_unique<arm_compute::Tensor>();
+    BuildArmComputeTensor(*m_RecurrentToForgetWeightsTensor, m_Data.m_RecurrentToForgetWeights->GetTensorInfo());
+
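+    // Every constant weight and bias is mirrored into its own arm_compute::Tensor here;
+    // BuildArmComputeTensor only creates the ACL-side descriptor, and the matching
+    // InitializeArmComputeTensorData calls below copy the ConstTensorHandle data across
+    // once every NEQLSTMLayer has been configured.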
m_RecurrentToCellWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_RecurrentToCellWeightsTensor, m_Data.m_RecurrentToCellWeights->GetTensorInfo()); + + m_RecurrentToOutputWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_RecurrentToOutputWeightsTensor, m_Data.m_RecurrentToOutputWeights->GetTensorInfo()); + + m_ForgetGateBiasTensor = std::make_unique(); + BuildArmComputeTensor(*m_ForgetGateBiasTensor, m_Data.m_ForgetGateBias->GetTensorInfo()); + + m_CellBiasTensor = std::make_unique(); + BuildArmComputeTensor(*m_CellBiasTensor, m_Data.m_CellBias->GetTensorInfo()); + + m_OutputGateBiasTensor = std::make_unique(); + BuildArmComputeTensor(*m_OutputGateBiasTensor, m_Data.m_OutputGateBias->GetTensorInfo()); + + // for future reference: check the AndroidNN API for the logic here + if (!m_Data.m_Parameters.m_CifgEnabled) + { + m_InputToInputWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_InputToInputWeightsTensor, m_Data.m_InputToInputWeights->GetTensorInfo()); + + m_RecurrentToInputWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_RecurrentToInputWeightsTensor, m_Data.m_RecurrentToInputWeights->GetTensorInfo()); + + m_CellToInputWeightsTensor = std::make_unique(); + if (m_Data.m_CellToInputWeights != nullptr) + { + BuildArmComputeTensor(*m_CellToInputWeightsTensor, m_Data.m_CellToInputWeights->GetTensorInfo()); + } + + m_InputGateBiasTensor = std::make_unique(); + BuildArmComputeTensor(*m_InputGateBiasTensor, m_Data.m_InputGateBias->GetTensorInfo()); + lstm_param.set_cifg_params(m_InputToInputWeightsTensor.get(), + m_RecurrentToInputWeightsTensor.get(), + m_Data.m_CellToInputWeights ? m_CellToInputWeightsTensor.get() : nullptr, + m_InputGateBiasTensor.get()); + } + + if (m_Data.m_Parameters.m_ProjectionEnabled) + { + m_ProjectionWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_ProjectionWeightsTensor, m_Data.m_ProjectionWeights->GetTensorInfo()); + + m_ProjectionBiasTensor = std::make_unique(); + if (m_Data.m_ProjectionBias != nullptr) + { + BuildArmComputeTensor(*m_ProjectionBiasTensor, m_Data.m_ProjectionBias->GetTensorInfo()); + } + + lstm_param.set_projection_params(m_ProjectionWeightsTensor.get(), + m_Data.m_ProjectionBias ? 
m_ProjectionBiasTensor.get() : nullptr); + } + + if (m_Data.m_Parameters.m_PeepholeEnabled) + { + m_CellToForgetWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_CellToForgetWeightsTensor, m_Data.m_CellToForgetWeights->GetTensorInfo()); + + m_CellToOutputWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_CellToOutputWeightsTensor, m_Data.m_CellToOutputWeights->GetTensorInfo()); + + lstm_param.set_peephole_params(m_CellToForgetWeightsTensor.get(), m_CellToOutputWeightsTensor.get()); + } + + if (m_Data.m_Parameters.m_LayerNormEnabled) + { + m_InputLayerNormWeightsTensor = std::make_unique(); + if (!m_Data.m_Parameters.m_CifgEnabled) + { + BuildArmComputeTensor(*m_InputLayerNormWeightsTensor, m_Data.m_InputLayerNormWeights->GetTensorInfo()); + } + + m_ForgetLayerNormWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_ForgetLayerNormWeightsTensor, m_Data.m_ForgetLayerNormWeights->GetTensorInfo()); + + m_CellLayerNormWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_CellLayerNormWeightsTensor, m_Data.m_CellLayerNormWeights->GetTensorInfo()); + + m_OutputLayerNormWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_OutputLayerNormWeightsTensor, m_Data.m_OutputLayerNormWeights->GetTensorInfo()); + + auto inputNormWeightTensor = m_Data.m_Parameters.m_CifgEnabled ? nullptr : m_InputLayerNormWeightsTensor.get(); + lstm_param.set_layer_normalization_params(inputNormWeightTensor, + m_ForgetLayerNormWeightsTensor.get(), + m_CellLayerNormWeightsTensor.get(), + m_OutputLayerNormWeightsTensor.get()); + } + + for (unsigned int i = 0; i != maxTime; ++i) + { + // Set LSTM input and output ITensors depending on: + // input format (timeMajor) & number of LSTM batches (maxTime). + arm_compute::ITensor* outputLSTM; + arm_compute::ITensor* inputLSTM; + + // If there is only one LSTM time major batch, we will not concat OR permute. + // Set input of LSTM to be first input ITensor. + // Set output of LSTM to be final output ITensor. + // LSTM input/output cannot be > 2 dimensions so need to resize its TensorInfo. + if (maxTime == 1 && m_Data.m_Parameters.m_TimeMajor) + { + TensorShape inputShape = GetTensorShape(input.info()->tensor_shape(), 1U); + TensorShape outputShape = GetTensorShape(output.info()->tensor_shape(), 1U); + + TensorShape inputShapeShrink({inputShape[1], inputShape[2]}); + TensorShape outputShapeShrink({outputShape[1], outputShape[2]}); + + auto acl_input_shape_shrink = BuildArmComputeTensorShape(inputShapeShrink); + auto acl_output_shape_shrink = BuildArmComputeTensorShape(outputShapeShrink); + + input.info()->set_tensor_shape(acl_input_shape_shrink); + inputLSTM = const_cast(&input); + + output.info()->set_tensor_shape(acl_output_shape_shrink); + outputLSTM = &output; + } + // If there is only one LSTM batch major batch, we will not concat, only permute. + // Set input of LSTM to be output of initial permute. + // Set output of LSTM to be first element of m_ConcatInputs & use that value later in permute. + // LSTM output cannot be > 2 dimensions so need to resize its TensorInfo. 
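+        // For example, with m_TimeMajor == false and a single timestep, the permuted input of
+        // shape {1, batchSize, inputSize} is viewed as {batchSize, inputSize} before it reaches
+        // the LSTM.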
+ else if (maxTime == 1 && !m_Data.m_Parameters.m_TimeMajor) + { + TensorShape inputShape = GetTensorShape(m_PermuteFirstOut.info()->tensor_shape(), 1U); + TensorShape inputShapeShrink({inputShape[1], inputShape[2]}); + auto acl_input_shape_shrink = BuildArmComputeTensorShape(inputShapeShrink); + m_PermuteFirstOut.info()->set_tensor_shape(acl_input_shape_shrink); + inputLSTM = &m_PermuteFirstOut; + + outputLSTM = const_cast(m_ConcatInputs[i]); + } + // Batch major AND/OR 2+ LSTM batches so will use concat AND/OR permute later on. + else + { + inputLSTM = m_SplitterOutputs[i]; + outputLSTM = const_cast(m_ConcatInputs[i]); + } + + std::unique_ptr lstm_layer(new arm_compute::NEQLSTMLayer()); + + lstm_layer->configure(inputLSTM, + m_InputToForgetWeightsTensor.get(), + m_InputToCellWeightsTensor.get(), + m_InputToOutputWeightsTensor.get(), + m_RecurrentToForgetWeightsTensor.get(), + m_RecurrentToCellWeightsTensor.get(), + m_RecurrentToOutputWeightsTensor.get(), + m_ForgetGateBiasTensor.get(), + m_CellBiasTensor.get(), + m_OutputGateBiasTensor.get(), + &cellStateIn, + &outputStateIn, + &cellStateOut, + &outputStateOut, + outputLSTM, + lstm_param); + + m_Layers.emplace_back(std::move(lstm_layer)); + } + + InitializeArmComputeTensorData(*m_InputToForgetWeightsTensor, m_Data.m_InputToForgetWeights); + InitializeArmComputeTensorData(*m_InputToCellWeightsTensor, m_Data.m_InputToCellWeights); + InitializeArmComputeTensorData(*m_InputToOutputWeightsTensor, m_Data.m_InputToOutputWeights); + InitializeArmComputeTensorData(*m_RecurrentToForgetWeightsTensor, m_Data.m_RecurrentToForgetWeights); + InitializeArmComputeTensorData(*m_RecurrentToCellWeightsTensor, m_Data.m_RecurrentToCellWeights); + InitializeArmComputeTensorData(*m_RecurrentToOutputWeightsTensor, m_Data.m_RecurrentToOutputWeights); + InitializeArmComputeTensorData(*m_ForgetGateBiasTensor, m_Data.m_ForgetGateBias); + InitializeArmComputeTensorData(*m_CellBiasTensor, m_Data.m_CellBias); + InitializeArmComputeTensorData(*m_OutputGateBiasTensor, m_Data.m_OutputGateBias); + + if (!m_Data.m_Parameters.m_CifgEnabled) + { + InitializeArmComputeTensorData(*m_InputToInputWeightsTensor, m_Data.m_InputToInputWeights); + InitializeArmComputeTensorData(*m_RecurrentToInputWeightsTensor, m_Data.m_RecurrentToInputWeights); + if (m_Data.m_CellToInputWeights != nullptr) + { + InitializeArmComputeTensorData(*m_CellToInputWeightsTensor, m_Data.m_CellToInputWeights); + } + InitializeArmComputeTensorData(*m_InputGateBiasTensor, m_Data.m_InputGateBias); + } + + if (m_Data.m_Parameters.m_ProjectionEnabled) + { + InitializeArmComputeTensorData(*m_ProjectionWeightsTensor, m_Data.m_ProjectionWeights); + if (m_Data.m_ProjectionBias != nullptr) + { + InitializeArmComputeTensorData(*m_ProjectionBiasTensor, m_Data.m_ProjectionBias); + } + } + + if (m_Data.m_Parameters.m_PeepholeEnabled) + { + InitializeArmComputeTensorData(*m_CellToForgetWeightsTensor, m_Data.m_CellToForgetWeights); + InitializeArmComputeTensorData(*m_CellToOutputWeightsTensor, m_Data.m_CellToOutputWeights); + } + + if (m_Data.m_Parameters.m_LayerNormEnabled) + { + if (!m_Data.m_Parameters.m_CifgEnabled) + { + InitializeArmComputeTensorData(*m_InputLayerNormWeightsTensor, m_Data.m_InputLayerNormWeights); + } + InitializeArmComputeTensorData(*m_ForgetLayerNormWeightsTensor, m_Data.m_ForgetLayerNormWeights); + InitializeArmComputeTensorData(*m_CellLayerNormWeightsTensor, m_Data.m_CellLayerNormWeights); + InitializeArmComputeTensorData(*m_OutputLayerNormWeightsTensor, m_Data.m_OutputLayerNormWeights); + } + + // 
Force Compute Library to perform the necessary copying and reshaping. + // After which delete all the input tensors that will no longer be needed. + for (uint32_t i = 0; i < m_Layers.size(); ++i) + { + m_Layers[i]->prepare(); + } + + // + // Concat + // + + // Expand dimensions of LSTM outputs adding one empty dimension to fit concatenate inputs. + TensorShape shape = GetTensorShape(m_ConcatInputs[0]->info()->tensor_shape(), 1U); + TensorShape shapeExpandTimeMajor({1, shape[0], shape[1]}); + TensorShape shapeExpandBatchMajor({shape[0], 1, shape[1]}); + + if (maxTime != 1) // ACL concat does not work with only one element to concatenate. + { + for (unsigned int i = 0; i < maxTime; ++i) + { + m_ConcatInputs[i]->info()->set_tensor_shape(BuildArmComputeTensorShape(shapeExpandTimeMajor)); + } + ConcatDescriptor concatDescriptor(maxTime, numberDimensions); // maxTime = num inputs (aka. number of views). + + for (unsigned int inputIdx = 0u; inputIdx < maxTime; ++inputIdx) + { + concatDescriptor.SetViewOriginCoord(inputIdx, dimension, inputIdx); + concatDescriptor.SetConcatAxis(dimension); + } + m_Concat.reset(new arm_compute::NEConcatenateLayer()); + + unsigned int aclAxisConcat = CalcAclAxis(concatDescriptor.GetNumDimensions(), concatDescriptor.GetConcatAxis()); + if (!m_Data.m_Parameters.m_TimeMajor) + { + TensorInfo concatOutputTensorInfo = outputInfo; + concatOutputTensorInfo.SetShape(timeMajorShapeOutput); + BuildArmComputeTensor(concat_out, concatOutputTensorInfo); + armcomputetensorutils::InitialiseArmComputeTensorEmpty(concat_out); + + m_Concat->configure(m_ConcatInputs, &concat_out, aclAxisConcat); + } + else + { + m_Concat->configure(m_ConcatInputs, &output, aclAxisConcat); + } + + m_Concat->prepare(); + } + // If only one LSTM batch, we do not concat and/or permute. + // Must ensure final output info is expanded to correct batch major dimensions. + else + { + if (!m_Data.m_Parameters.m_TimeMajor) + { + output.info()->set_tensor_shape(BuildArmComputeTensorShape(shapeExpandBatchMajor)); + } + else + { + output.info()->set_tensor_shape(BuildArmComputeTensorShape(shapeExpandTimeMajor)); + } + } + + // + // Permute: only done if input/output are in batch major format. + // + if (!m_Data.m_Parameters.m_TimeMajor) + { + // Output now time major. Permute output back to batch major. 
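+        // arm_compute::PermutationVector is expressed in ACL's reversed dimension order, so
+        // (0U, 2U, 1U) keeps the innermost (feature) axis and swaps the two outer axes,
+        // mapping time-major [time, batch, output] back to batch-major [batch, time, output].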
+ std::unique_ptr layer(new arm_compute::NEPermute()); + if (maxTime != 1) + { + layer->configure(&concat_out, &output, arm_compute::PermutationVector(0U, 2U, 1U)); + } + else + { + layer->configure(m_ConcatInputs[0], &output, arm_compute::PermutationVector(0U, 2U, 1U)); + } + m_Permute2.reset(layer.release()); + } + + FreeUnusedTensors(); +} + +void NeonUnidirectionalSequenceLstmWorkload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonUnidirectionalSequenceLstmWorkload_Execute", GetGuid()); + if (m_Permute1) + { + m_Permute1->run(); + } + if (m_Splitter) + { + m_Splitter->run(); + } + for (uint32_t i = 0; i < m_Layers.size(); ++i) + { + m_Layers[i]->run(); + } + if (m_Concat) + { + m_Concat->run(); + } + if (m_Permute2) + { + m_Permute2->run(); + } +} + +arm_compute::Status +NeonUnidirectionalSequenceLstmWorkloadValidate(const TensorInfo& input, + const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, + const TensorInfo& outputStateOut, + const TensorInfo& cellStateOut, + const TensorInfo& output, + const UnidirectionalSequenceLstmDescriptor& descriptor, + const LstmInputParamsInfo& paramsInfo) +{ + TensorShape inputLayerShape = input.GetShape(); + TensorShape outputLayerShape = output.GetShape(); + + unsigned int maxTime = descriptor.m_TimeMajor ? inputLayerShape[0] : inputLayerShape[1]; + unsigned int batchSize = descriptor.m_TimeMajor ? inputLayerShape[1] : inputLayerShape[0]; + unsigned int inputSize = inputLayerShape[2]; + unsigned int outputSize = outputLayerShape[2]; + + const TensorShape timeMajorShapeInput({maxTime, batchSize, inputSize}); + const TensorShape timeMajorShapeOutput({maxTime, batchSize, outputSize}); + + arm_compute::Status statusPermute1 = arm_compute::Status(arm_compute::ErrorCode::OK, + "Permute1 status"); + arm_compute::Status statusSplit = arm_compute::Status(arm_compute::ErrorCode::OK, + "Split status"); + arm_compute::Status statusLSTM = arm_compute::Status(arm_compute::ErrorCode::OK, + "LSTM status"); + arm_compute::Status statusConcat = arm_compute::Status(arm_compute::ErrorCode::OK, + "Concat status"); + arm_compute::Status statusPermute2 = arm_compute::Status(arm_compute::ErrorCode::OK, + "Permute2 status"); + + const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + // + // Permute validate + // + TensorInfo permuteOutInfo = TensorInfo(input); + arm_compute::TensorInfo aclPermuteOutInfo = armcomputetensorutils::BuildArmComputeTensorInfo(permuteOutInfo); + if (!descriptor.m_TimeMajor) + { + statusPermute1 = arm_compute::NEPermute::validate(&aclInputInfo, + &aclPermuteOutInfo, + arm_compute::PermutationVector(0U, 2U, 1U)); + } + + // + // Split and Concat Tensors validate + // + std::vector splitterOutputsTensorInfos; + std::vector concatInputsTensorInfos; + std::vector splitterOutputsTensorInfosPtr; + std::vector concatInputsTensorInfosPtr; + splitterOutputsTensorInfos.reserve(maxTime); + concatInputsTensorInfos.reserve(maxTime); + for (unsigned int i = 0; i < maxTime; ++i) + { + arm_compute::TensorInfo splitter_out; + arm_compute::TensorInfo concat_in; + + auto splitterTensorInfo = TensorInfo(input); + auto concatTensorInfo = TensorInfo(output); + splitterTensorInfo.SetShape({batchSize, inputSize}); + concatTensorInfo.SetShape({batchSize, outputSize}); + + arm_compute::TensorInfo aclSplitterTensorInfo + = 
armcomputetensorutils::BuildArmComputeTensorInfo(splitterTensorInfo); + arm_compute::TensorInfo aclConcatTensorInfo + = armcomputetensorutils::BuildArmComputeTensorInfo(concatTensorInfo); + + splitterOutputsTensorInfos.emplace_back(aclSplitterTensorInfo); + concatInputsTensorInfos.emplace_back(aclConcatTensorInfo); + splitterOutputsTensorInfosPtr.emplace_back(&splitterOutputsTensorInfos[i]); + concatInputsTensorInfosPtr.emplace_back(&concatInputsTensorInfos[i]); + } + + // + // Split validate + // + unsigned int numberDimensions = 3; + unsigned int dimension = 0; // splitting on 0-dimension (i.e. maxTime dimension) + unsigned int aclAxisSplit = CalcAclAxis(numberDimensions, dimension); + + if (maxTime != 1) // ACL split does not work with only one element to split. + { + if (!descriptor.m_TimeMajor) + { + statusSplit = arm_compute::NESplit::validate(&aclPermuteOutInfo, + splitterOutputsTensorInfosPtr, + aclAxisSplit); + } else + { + statusSplit = arm_compute::NESplit::validate(&aclInputInfo, splitterOutputsTensorInfosPtr, aclAxisSplit); + } + } + + // + // LSTM validate + // + + arm_compute::LSTMParams lstm_params_info; + + const TensorInfo& scratchBuffer = TensorInfo(cellStateIn.GetShape(), input.GetDataType()); + + lstm_params_info.set_cell_clip_params(descriptor.m_ClippingThresCell); + lstm_params_info.set_projection_clip_params(descriptor.m_ClippingThresProj); + // The inputs and outputs + const arm_compute::TensorInfo aclOutputStateInInfo = BuildArmComputeTensorInfo(outputStateIn); + const arm_compute::TensorInfo aclCellStateInInfo = BuildArmComputeTensorInfo(cellStateIn); + const arm_compute::TensorInfo aclScratchBufferInfo = BuildArmComputeTensorInfo(scratchBuffer); + const arm_compute::TensorInfo aclOutputStateOutInfo = BuildArmComputeTensorInfo(outputStateOut); + const arm_compute::TensorInfo aclCellStateOutInfo = BuildArmComputeTensorInfo(cellStateOut); + + // Basic parameters + const arm_compute::TensorInfo aclInputToForgetWeightsInfo + = BuildArmComputeTensorInfo(paramsInfo.GetInputToForgetWeights()); + const arm_compute::TensorInfo aclInputToCellWeightsInfo + = BuildArmComputeTensorInfo(paramsInfo.GetInputToCellWeights()); + const arm_compute::TensorInfo aclInputToOutputWeightsInfo + = BuildArmComputeTensorInfo(paramsInfo.GetInputToOutputWeights()); + const arm_compute::TensorInfo aclRecurrentToForgetWeightsInfo + = BuildArmComputeTensorInfo(paramsInfo.GetRecurrentToForgetWeights()); + const arm_compute::TensorInfo aclRecurrentToCellWeightsInfo + = BuildArmComputeTensorInfo(paramsInfo.GetRecurrentToCellWeights()); + const arm_compute::TensorInfo aclRecurrentToOutputWeightsInfo + = BuildArmComputeTensorInfo(paramsInfo.GetRecurrentToOutputWeights()); + const arm_compute::TensorInfo aclForgetGateBiasInfo + = BuildArmComputeTensorInfo(paramsInfo.GetForgetGateBias()); + const arm_compute::TensorInfo aclCellBiasInfo + = BuildArmComputeTensorInfo(paramsInfo.GetCellBias()); + const arm_compute::TensorInfo aclOutputGateBiasInfo + = BuildArmComputeTensorInfo(paramsInfo.GetOutputGateBias()); + + arm_compute::TensorInfo aclInputToInputWeightsInfo; + arm_compute::TensorInfo aclRecurrentToInputWeightsInfo; + arm_compute::TensorInfo aclCellToInputWeightsInfo; + arm_compute::TensorInfo aclInputGateBiasInfo; + arm_compute::TensorInfo aclProjectionWeightsInfo; + arm_compute::TensorInfo aclProjectionBiasInfo; + arm_compute::TensorInfo aclCellToForgetWeightsInfo; + arm_compute::TensorInfo aclCellToOutputWeightsInfo; + + arm_compute::TensorInfo aclInputLayerNormWeightsInfo; + 
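+    // The optional gate TensorInfos declared here stay default-constructed; only the ones
+    // enabled by the descriptor (CIFG disabled, projection, peephole, layer normalization)
+    // are filled in and handed to lstm_params_info further down, mirroring what the workload
+    // constructor configures.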
arm_compute::TensorInfo aclForgetLayerNormWeightsInfo; + arm_compute::TensorInfo aclCellLayerNormWeightsInfo; + arm_compute::TensorInfo aclOutputLayerNormWeightsInfo; + + if (!descriptor.m_CifgEnabled) + { + if (descriptor.m_PeepholeEnabled) + { + aclCellToInputWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetCellToInputWeights()); + } + aclInputToInputWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetInputToInputWeights()); + aclRecurrentToInputWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetRecurrentToInputWeights()); + aclInputGateBiasInfo = BuildArmComputeTensorInfo(paramsInfo.GetInputGateBias()); + + lstm_params_info.set_cifg_params(&aclInputToInputWeightsInfo, + &aclRecurrentToInputWeightsInfo, + descriptor.m_PeepholeEnabled ? &aclCellToInputWeightsInfo : nullptr, + &aclInputGateBiasInfo); + } + + if (descriptor.m_ProjectionEnabled) + { + if (paramsInfo.m_ProjectionBias != nullptr) + { + aclProjectionBiasInfo = BuildArmComputeTensorInfo(paramsInfo.GetProjectionBias()); + } + aclProjectionWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetProjectionWeights()); + + lstm_params_info.set_projection_params(&aclProjectionWeightsInfo, + paramsInfo.m_ProjectionBias ? &aclProjectionBiasInfo : nullptr); + } + + if (descriptor.m_PeepholeEnabled) + { + aclCellToForgetWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetCellToForgetWeights()); + aclCellToOutputWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetCellToOutputWeights()); + + lstm_params_info.set_peephole_params(&aclCellToForgetWeightsInfo, &aclCellToOutputWeightsInfo); + } + + if (descriptor.m_LayerNormEnabled) + { + if (!descriptor.m_CifgEnabled) + { + aclInputLayerNormWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetInputLayerNormWeights()); + } + aclForgetLayerNormWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetForgetLayerNormWeights()); + aclCellLayerNormWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetCellLayerNormWeights()); + aclOutputLayerNormWeightsInfo = BuildArmComputeTensorInfo(paramsInfo.GetOutputLayerNormWeights()); + + lstm_params_info.set_layer_normalization_params(descriptor.m_CifgEnabled ? nullptr : + &aclInputLayerNormWeightsInfo, + &aclForgetLayerNormWeightsInfo, + &aclCellLayerNormWeightsInfo, + &aclOutputLayerNormWeightsInfo); + } + + lstm_params_info.set_matmul_scale_params(descriptor.m_InputIntermediateScale, + descriptor.m_ForgetIntermediateScale, + descriptor.m_CellIntermediateScale, + descriptor.m_OutputIntermediateScale); + + lstm_params_info.set_hidden_state_params(descriptor.m_HiddenStateZeroPoint, descriptor.m_HiddenStateScale); + + for (unsigned int i = 0; i != maxTime; ++i) + { + + // Set LSTM input and output ITensors depending on: + // input format (timeMajor) & number of LSTM batches (maxTime). + arm_compute::ITensorInfo* outputLSTM; + arm_compute::ITensorInfo* inputLSTM; + + // If there is only one LSTM time major batch, we will not concat OR permute. + // Set input of LSTM to be first input ITensor. + // Set output of LSTM to be final output ITensor. + // LSTM input/output cannot be > 2 dimensions so need to resize its TensorInfo. 
+        if (maxTime == 1 && descriptor.m_TimeMajor)
+        {
+            TensorShape inputShape = GetTensorShape(aclInputInfo.tensor_shape(), 1U);
+            TensorShape outputShape = GetTensorShape(aclOutputInfo.tensor_shape(), 1U);
+
+            TensorShape inputShapeShrink({inputShape[1], inputShape[2]});
+            TensorShape outputShapeShrink({outputShape[1], outputShape[2]});
+
+            auto acl_input_shape_shrink = BuildArmComputeTensorShape(inputShapeShrink);
+            auto acl_output_shape_shrink = BuildArmComputeTensorShape(outputShapeShrink);
+
+            const_cast<arm_compute::TensorInfo*>(&aclInputInfo)->set_tensor_shape(acl_input_shape_shrink);
+            inputLSTM = const_cast<arm_compute::TensorInfo*>(&aclInputInfo);
+
+            const_cast<arm_compute::TensorInfo*>(&aclOutputInfo)->set_tensor_shape(acl_output_shape_shrink);
+            outputLSTM = const_cast<arm_compute::TensorInfo*>(&aclOutputInfo);
+        }
+        // If there is only one LSTM batch major batch, we will not concat, only permute.
+        // Set input of LSTM to be output of initial permute.
+        // Set output of LSTM to be first element of m_ConcatInputs & use that value later in permute.
+        // LSTM output cannot be > 2 dimensions so need to resize its TensorInfo.
+        else if (maxTime == 1 && !descriptor.m_TimeMajor)
+        {
+            TensorShape inputShape = GetTensorShape(aclPermuteOutInfo.tensor_shape(), 1U);
+            TensorShape inputShapeShrink({inputShape[1], inputShape[2]});
+            auto acl_input_shape_shrink = BuildArmComputeTensorShape(inputShapeShrink);
+            aclPermuteOutInfo.set_tensor_shape(acl_input_shape_shrink);
+            inputLSTM = &aclPermuteOutInfo;
+
+            outputLSTM = const_cast<arm_compute::ITensorInfo*>(concatInputsTensorInfosPtr[i]);
+        }
+        // Batch major AND/OR 2+ LSTM batches so will use concat AND/OR permute later on.
+        else
+        {
+            inputLSTM = splitterOutputsTensorInfosPtr[i];
+            outputLSTM = const_cast<arm_compute::ITensorInfo*>(concatInputsTensorInfosPtr[i]);
+        }
+
+        statusLSTM = arm_compute::NEQLSTMLayer::validate(inputLSTM,
+                                                         &aclInputToForgetWeightsInfo,
+                                                         &aclInputToCellWeightsInfo,
+                                                         &aclInputToOutputWeightsInfo,
+                                                         &aclRecurrentToForgetWeightsInfo,
+                                                         &aclRecurrentToCellWeightsInfo,
+                                                         &aclRecurrentToOutputWeightsInfo,
+                                                         &aclForgetGateBiasInfo,
+                                                         &aclCellBiasInfo,
+                                                         &aclOutputGateBiasInfo,
+                                                         &aclCellStateInInfo,
+                                                         &aclOutputStateInInfo,
+                                                         &aclCellStateOutInfo,
+                                                         &aclOutputStateOutInfo,
+                                                         outputLSTM,
+                                                         lstm_params_info);
+    }
+
+    //
+    // Concat validate
+    //
+
+    // Expand dimensions of LSTM outputs adding one empty dimension to fit concatenate inputs.
+    TensorShape shape = GetTensorShape(concatInputsTensorInfosPtr[0]->tensor_shape(), 1U);
+    TensorShape shapeExpandTimeMajor({1, shape[0], shape[1]});
+    TensorShape shapeExpandBatchMajor({shape[0], 1, shape[1]});
+
+    TensorInfo concatOutputTensorInfo = TensorInfo(output);
+    concatOutputTensorInfo.SetShape(timeMajorShapeOutput);
+    arm_compute::TensorInfo aclConcatOutputTensorInfo = BuildArmComputeTensorInfo(concatOutputTensorInfo);
+
+    if (maxTime != 1) // ACL concat does not work with only one element to concatenate.
+    {
+        for (unsigned int i = 0; i < maxTime; ++i)
+        {
+            auto acl_shape_expand = BuildArmComputeTensorShape(shapeExpandTimeMajor);
+            concatInputsTensorInfos[i].set_tensor_shape(acl_shape_expand);
+        }
+
+        unsigned int aclAxisConcat = CalcAclAxis(numberDimensions, dimension);
+        if (!descriptor.m_TimeMajor)
+        {
+            statusConcat = arm_compute::NEConcatenateLayer::validate(concatInputsTensorInfosPtr,
+                                                                     &aclConcatOutputTensorInfo,
+                                                                     aclAxisConcat);
+        }
+        else
+        {
+            statusConcat = arm_compute::NEConcatenateLayer::validate(concatInputsTensorInfosPtr,
+                                                                     &aclOutputInfo,
+                                                                     aclAxisConcat);
+        }
+    }
+    // If only one LSTM batch, we do not concat and/or permute.
+    // Must ensure final output info is expanded to correct batch major dimensions.
+ else + { + if (!descriptor.m_TimeMajor) + { + const_cast(&aclInputInfo)->set_tensor_shape( + BuildArmComputeTensorShape(shapeExpandBatchMajor)); + } + else + { + const_cast(&aclInputInfo)->set_tensor_shape( + BuildArmComputeTensorShape(shapeExpandTimeMajor)); + } + } + + // + // Permute validate + // + if (!descriptor.m_TimeMajor) + { + // Output now time major. Permute output back to batch major. + if (maxTime != 1) + { + statusPermute2 = arm_compute::NEPermute::validate(&aclConcatOutputTensorInfo, + &aclOutputInfo, + arm_compute::PermutationVector(0U, 2U, 1U)); + } + else + { + statusPermute2 = arm_compute::NEPermute::validate(concatInputsTensorInfosPtr[0], + &aclOutputInfo, + arm_compute::PermutationVector(0U, 2U, 1U)); + } + } + + auto okCode = arm_compute::ErrorCode::OK; + if (statusPermute1.error_code() == okCode && + statusSplit.error_code() == okCode && + statusLSTM .error_code() == okCode && + statusConcat.error_code() == okCode && + statusPermute2.error_code() == okCode) + { + return arm_compute::Status(arm_compute::ErrorCode::OK, + "All Unidirectional Sequence LSTM layer validate status OK."); + } + else + { + return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, + "Unidirectional Sequence LSTM layer validate status failed."); + } +} + +void NeonUnidirectionalSequenceLstmWorkload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_InputToInputWeightsTensor); + FreeTensorIfUnused(m_InputToForgetWeightsTensor); + FreeTensorIfUnused(m_InputToCellWeightsTensor); + FreeTensorIfUnused(m_InputToOutputWeightsTensor); + FreeTensorIfUnused(m_RecurrentToInputWeightsTensor); + FreeTensorIfUnused(m_RecurrentToForgetWeightsTensor); + FreeTensorIfUnused(m_RecurrentToCellWeightsTensor); + FreeTensorIfUnused(m_RecurrentToOutputWeightsTensor); + FreeTensorIfUnused(m_CellToInputWeightsTensor); + FreeTensorIfUnused(m_CellToForgetWeightsTensor); + FreeTensorIfUnused(m_CellToOutputWeightsTensor); + FreeTensorIfUnused(m_InputGateBiasTensor); + FreeTensorIfUnused(m_ForgetGateBiasTensor); + FreeTensorIfUnused(m_CellBiasTensor); + FreeTensorIfUnused(m_OutputGateBiasTensor); + FreeTensorIfUnused(m_ProjectionWeightsTensor); + FreeTensorIfUnused(m_ProjectionBiasTensor); + FreeTensorIfUnused(m_InputLayerNormWeightsTensor); + FreeTensorIfUnused(m_ForgetLayerNormWeightsTensor); + FreeTensorIfUnused(m_CellLayerNormWeightsTensor); + FreeTensorIfUnused(m_OutputLayerNormWeightsTensor); +} + +} //namespace armnn diff --git a/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmWorkload.hpp b/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmWorkload.hpp new file mode 100644 index 0000000000..f0122589a4 --- /dev/null +++ b/src/backends/neon/workloads/NeonUnidirectionalSequenceLstmWorkload.hpp @@ -0,0 +1,90 @@ +// +// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include "NeonBaseWorkload.hpp"
+
+#include "arm_compute/runtime/NEON/functions/NEQLSTMLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+#include "arm_compute/runtime/NEON/functions/NESplit.h"
+#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
+
+namespace armnn
+{
+
+class NeonUnidirectionalSequenceLstmWorkload : public NeonBaseWorkload<UnidirectionalSequenceLstmQueueDescriptor>
+{
+public:
+    NeonUnidirectionalSequenceLstmWorkload(const UnidirectionalSequenceLstmQueueDescriptor& descriptor,
+                                           const WorkloadInfo& info);
+    virtual void Execute() const override;
+
+private:
+
+    //
+    // ACL layers required to fully form a Unidirectional Sequence LSTM layer.
+    //
+    mutable std::unique_ptr<arm_compute::NEPermute> m_Permute1;
+    mutable std::unique_ptr<arm_compute::NESplit> m_Splitter;
+    mutable std::vector<std::unique_ptr<arm_compute::NEQLSTMLayer>> m_Layers;
+    mutable std::unique_ptr<arm_compute::NEConcatenateLayer> m_Concat;
+    mutable std::unique_ptr<arm_compute::NEPermute> m_Permute2;
+
+    //
+    // ACL LSTM arm_compute::Tensors.
+    //
+    std::unique_ptr<arm_compute::Tensor> m_InputToInputWeightsTensor;
+    std::unique_ptr<arm_compute::Tensor> m_InputToForgetWeightsTensor;
+    std::unique_ptr<arm_compute::Tensor> m_InputToCellWeightsTensor;
+    std::unique_ptr<arm_compute::Tensor> m_InputToOutputWeightsTensor;
+    std::unique_ptr<arm_compute::Tensor> m_RecurrentToInputWeightsTensor;
+    std::unique_ptr<arm_compute::Tensor> m_RecurrentToForgetWeightsTensor;
+    std::unique_ptr<arm_compute::Tensor> m_RecurrentToCellWeightsTensor;
+    std::unique_ptr<arm_compute::Tensor> m_RecurrentToOutputWeightsTensor;
+    std::unique_ptr<arm_compute::Tensor> m_CellToInputWeightsTensor;
+    std::unique_ptr<arm_compute::Tensor> m_CellToForgetWeightsTensor;
+    std::unique_ptr<arm_compute::Tensor> m_CellToOutputWeightsTensor;
+    std::unique_ptr<arm_compute::Tensor> m_InputGateBiasTensor;
+    std::unique_ptr<arm_compute::Tensor> m_ForgetGateBiasTensor;
+    std::unique_ptr<arm_compute::Tensor> m_CellBiasTensor;
+    std::unique_ptr<arm_compute::Tensor> m_OutputGateBiasTensor;
+    std::unique_ptr<arm_compute::Tensor> m_ProjectionWeightsTensor;
+    std::unique_ptr<arm_compute::Tensor> m_ProjectionBiasTensor;
+
+    std::unique_ptr<arm_compute::Tensor> m_InputLayerNormWeightsTensor;
+    std::unique_ptr<arm_compute::Tensor> m_ForgetLayerNormWeightsTensor;
+    std::unique_ptr<arm_compute::Tensor> m_CellLayerNormWeightsTensor;
+    std::unique_ptr<arm_compute::Tensor> m_OutputLayerNormWeightsTensor;
+
+    //
+    // Additional ACL arm_compute::Tensors and std::vector.
+    // Required to perform splitting, concatenation and permutations.
diff --git a/src/backends/neon/workloads/NeonWorkloads.hpp b/src/backends/neon/workloads/NeonWorkloads.hpp
index 4f5ba2d708..8b99f03a7f 100644
--- a/src/backends/neon/workloads/NeonWorkloads.hpp
+++ b/src/backends/neon/workloads/NeonWorkloads.hpp
@@ -69,3 +69,4 @@
 #include "NeonTransposeConvolution2dWorkload.hpp"
 #include "NeonTransposeWorkload.hpp"
 #include "NeonUnidirectionalSequenceLstmFloatWorkload.hpp"
+#include "NeonUnidirectionalSequenceLstmWorkload.hpp"
\ No newline at end of file
diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp
index 66661cb521..919c6db6ff 100644
--- a/src/backends/reference/RefLayerSupport.cpp
+++ b/src/backends/reference/RefLayerSupport.cpp
@@ -465,57 +465,15 @@ bool RefLayerSupport::IsLayerSupported(const LayerType& type,
                                        "hiddenStateOutputVal, cellStateOutputVal, output}");
             }
             auto desc = *(PolymorphicDowncast<const UnidirectionalSequenceLstmDescriptor*>(&descriptor));
-
-            bool isHiddenStateOutputOptional = (infos[4] == TensorInfo());
-            bool isCellStateOutput = (infos[5] == TensorInfo());
-            if (isHiddenStateOutputOptional && isCellStateOutput)
-            {
-                return IsUnidirectionalSequenceLstmSupported(infos[0],
-                                                             infos[1],
-                                                             infos[2],
-                                                             infos[3],
-                                                             EmptyOptional(),
-                                                             EmptyOptional(),
-                                                             desc,
-                                                             lstmParamsInfo.value(),
-                                                             reasonIfUnsupported);
-            }
-            else if (isHiddenStateOutputOptional)
-            {
-                return IsUnidirectionalSequenceLstmSupported(infos[0],
-                                                             infos[1],
-                                                             infos[2],
-                                                             infos[3],
-                                                             EmptyOptional(),
-                                                             infos[5],
-                                                             desc,
-                                                             lstmParamsInfo.value(),
-                                                             reasonIfUnsupported);
-            }
-            else if (isCellStateOutput)
-            {
-                return IsUnidirectionalSequenceLstmSupported(infos[0],
-                                                             infos[1],
-                                                             infos[2],
-                                                             infos[3],
-                                                             infos[4],
-                                                             EmptyOptional(),
-                                                             desc,
-                                                             lstmParamsInfo.value(),
-                                                             reasonIfUnsupported);
-            }
-            else
-            {
-                return IsUnidirectionalSequenceLstmSupported(infos[0],
-                                                             infos[1],
-                                                             infos[2],
-                                                             infos[3],
-                                                             infos[4],
-                                                             infos[5],
-                                                             desc,
-                                                             lstmParamsInfo.value(),
-                                                             reasonIfUnsupported);
-            }
+            return IsUnidirectionalSequenceLstmSupported(infos[0],
+                                                         infos[1],
+                                                         infos[2],
+                                                         infos[3],
+                                                         infos[4],
+                                                         infos[5],
+                                                         desc,
+                                                         lstmParamsInfo.value(),
+                                                         reasonIfUnsupported);
         }
         case LayerType::Pooling3d:
             return IsPooling3dSupported(infos[0],
@@ -2841,9 +2799,9 @@ bool RefLayerSupport::IsUnidirectionalSequenceLstmSupported(
     const TensorInfo& input,
     const TensorInfo& outputStateIn,
     const TensorInfo& cellStateIn,
+    const TensorInfo& outputStateOut,
+    const TensorInfo& cellStateOut,
     const TensorInfo& output,
-    const Optional<TensorInfo>& hiddenStateOutput,
-    const Optional<TensorInfo>& cellStateOutput,
     const UnidirectionalSequenceLstmDescriptor& descriptor,
     const LstmInputParamsInfo& paramsInfo,
     Optional<std::string&> reasonIfUnsupported) const
@@ -2852,17 +2810,14 @@ bool RefLayerSupport::IsUnidirectionalSequenceLstmSupported(
     IgnoreUnused(paramsInfo);
     IgnoreUnused(outputStateIn);
     IgnoreUnused(cellStateIn);
+    IgnoreUnused(outputStateOut);
+    IgnoreUnused(cellStateOut);
     bool supported = true;
 
-    if (hiddenStateOutput.has_value() || cellStateOutput.has_value())
+    std::array<DataType, 2> supportedTypes =
     {
-        reasonIfUnsupported.value() += "Reference UnidirectionalSequenceLstm: hidden state output "
-                                       "and cell state output are not supported at the moment.";
-    }
-
-    std::array<DataType, 1> supportedTypes =
-    {
-        DataType::Float32
+        DataType::Float32,
+        DataType::QAsymmS8
     };
 
     std::array<DataType, 2> supportedWeightTypes =
     {
         DataType::Float32,
         DataType::QAsymmS8
     };
 
+    std::array<DataType, 3> supportedBiasTypes =
+    {
+        DataType::Float32,
+        DataType::QAsymmS8,
+        DataType::Signed32
+    };
+
     // check inputs and outputs
     supported &= CheckSupportRule(TypeAnyOf(input, supportedTypes), reasonIfUnsupported,
                                   "Reference UnidirectionalSequenceLstm: input is not a supported type.");
-    supported &= CheckSupportRule(TypesAreEqual(input, outputStateIn), reasonIfUnsupported,
-                                  "Reference UnidirectionalSequenceLstm: input and outputStateIn types are mismatched");
-    supported &= CheckSupportRule(TypesAreEqual(input, cellStateIn), reasonIfUnsupported,
-                                  "Reference UnidirectionalSequenceLstm: input and cellStateIn types are mismatched");
+    supported &= CheckSupportRule(TypeAnyOf(output, supportedTypes), reasonIfUnsupported,
+                                  "Reference UnidirectionalSequenceLstm: output is not a supported type.");
-    supported &= CheckSupportRule(TypesAreEqual(input, output), reasonIfUnsupported,
-                                  "Reference UnidirectionalSequenceLstm: input and output types are mismatched");
 
     // check layer parameters
     supported &= CheckSupportRule(TypeAnyOf(paramsInfo.GetInputToForgetWeights(), supportedWeightTypes),
                                   reasonIfUnsupported,
@@ -2905,14 +2863,13 @@ bool RefLayerSupport::IsUnidirectionalSequenceLstmSupported(
                                   reasonIfUnsupported,
                                   "Reference UnidirectionalSequenceLstm: RecurrentToOutputWeights "
                                   "is not a supported type.");
-    supported &= CheckSupportRule(TypesAreEqual(input, paramsInfo.GetForgetGateBias()), reasonIfUnsupported,
-                                  "Reference UnidirectionalSequenceLstm: input and ForgetGateBias types "
-                                  "are mismatched");
-    supported &= CheckSupportRule(TypesAreEqual(input, paramsInfo.GetCellBias()), reasonIfUnsupported,
-                                  "Reference UnidirectionalSequenceLstm: input and CellBias types are mismatched");
-    supported &= CheckSupportRule(TypesAreEqual(input, paramsInfo.GetOutputGateBias()), reasonIfUnsupported,
-                                  "Reference UnidirectionalSequenceLstm: input and OutputGateBias types "
-                                  "are mismatched");
+
+    supported &= CheckSupportRule(TypeAnyOf(paramsInfo.GetForgetGateBias(), supportedBiasTypes), reasonIfUnsupported,
+                                  "Reference UnidirectionalSequenceLstm: ForgetGateBias is not a supported type.");
+    supported &= CheckSupportRule(TypeAnyOf(paramsInfo.GetCellBias(), supportedBiasTypes), reasonIfUnsupported,
+                                  "Reference UnidirectionalSequenceLstm: CellBias is not a supported type.");
+    supported &= CheckSupportRule(TypeAnyOf(paramsInfo.GetOutputGateBias(), supportedBiasTypes), reasonIfUnsupported,
+                                  "Reference UnidirectionalSequenceLstm: OutputGateBias is not a supported type.");
     if (!descriptor.m_CifgEnabled)
     {
         supported &= CheckSupportRule(TypeAnyOf(paramsInfo.GetInputToInputWeights(), supportedWeightTypes),
@@ -2923,9 +2880,8 @@ bool RefLayerSupport::IsUnidirectionalSequenceLstmSupported(
                                       reasonIfUnsupported,
                                       "Reference UnidirectionalSequenceLstm: RecurrentToInputWeights "
                                       "is not a supported type.");
-        supported &= CheckSupportRule(TypesAreEqual(input, paramsInfo.GetInputGateBias()), reasonIfUnsupported,
-                                      "Reference UnidirectionalSequenceLstm: input and InputGateBias types "
-                                      "are mismatched");
+        supported &= CheckSupportRule(TypeAnyOf(paramsInfo.GetInputGateBias(), supportedBiasTypes), reasonIfUnsupported,
+                                      "Reference UnidirectionalSequenceLstm: InputGateBias is not a supported type.");
         if (descriptor.m_PeepholeEnabled)
         {
             supported &= CheckSupportRule(TypeAnyOf(paramsInfo.GetCellToInputWeights(), supportedWeightTypes),
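The reference support checks above swap exact-type-equality rules (TypesAreEqual) for allow-list rules (TypeAnyOf) with a separate bias list, which is what admits the quantized path: QAsymmS8 activations and weights can now pair with Signed32 biases instead of failing a comparison against the input type. A minimal sketch of the allow-list rule shape, assuming a simplified DataType enum and TypeAnyOf helper rather than ArmNN's CheckSupportRule machinery:

#include <algorithm>
#include <array>
#include <cstddef>
#include <iostream>

enum class DataType { Float32, QAsymmS8, Signed32 };

// Simplified stand-in for ArmNN's TypeAnyOf rule: a tensor type is
// supported if it appears in the layer's allow-list.
template <std::size_t N>
bool TypeAnyOf(DataType type, const std::array<DataType, N>& allowed)
{
    return std::find(allowed.begin(), allowed.end(), type) != allowed.end();
}

int main()
{
    std::array<DataType, 2> supportedTypes     { DataType::Float32, DataType::QAsymmS8 };
    std::array<DataType, 3> supportedBiasTypes { DataType::Float32, DataType::QAsymmS8,
                                                 DataType::Signed32 };

    bool supported = true;
    supported &= TypeAnyOf(DataType::QAsymmS8, supportedTypes);     // quantized input/weights
    supported &= TypeAnyOf(DataType::Signed32, supportedBiasTypes); // 32-bit bias
    std::cout << (supported ? "supported" : "unsupported") << "\n"; // supported
    return 0;
}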
diff --git a/src/backends/reference/RefLayerSupport.hpp b/src/backends/reference/RefLayerSupport.hpp
index 98770ad64a..aa8bd8dda4 100644
--- a/src/backends/reference/RefLayerSupport.hpp
+++ b/src/backends/reference/RefLayerSupport.hpp
@@ -367,9 +367,9 @@ public:
         const TensorInfo& input,
         const TensorInfo& outputStateIn,
         const TensorInfo& cellStateIn,
+        const TensorInfo& outputStateOut,
+        const TensorInfo& cellStateOut,
         const TensorInfo& output,
-        const Optional<TensorInfo>& hiddenStateOutput,
-        const Optional<TensorInfo>& cellStateOutput,
         const UnidirectionalSequenceLstmDescriptor& descriptor,
         const LstmInputParamsInfo& paramsInfo,
         Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
diff --git a/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.cpp b/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.cpp
index d447a46b23..c4345d4978 100644
--- a/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.cpp
+++ b/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.cpp
@@ -59,7 +59,9 @@ void RefUnidirectionalSequenceLstmWorkload::Execute(std::vector<ITensorHandle*>
     TensorInfo inputInfo = GetTensorInfo(inputs[0]);
     const TensorInfo& outputStateInfo = GetTensorInfo(inputs[1]);
     const TensorInfo& cellStateInfo = GetTensorInfo(inputs[2]);
-    TensorInfo outputInfo = GetTensorInfo(outputs[0]);
+    TensorInfo outputStateOutInfo = GetTensorInfo(outputs[0]);
+    TensorInfo cellStateOutInfo = GetTensorInfo(outputs[1]);
+    TensorInfo outputInfo = GetTensorInfo(outputs[2]);
     TensorShape& inputShape = inputInfo.GetShape();
     TensorShape& outputShape = outputInfo.GetShape();
     auto inputTensor = reinterpret_cast<const float*>(inputs[0]->Map());
@@ -140,7 +142,7 @@ void RefUnidirectionalSequenceLstmWorkload::Execute(std::vector<ITensorHandle*>
     auto currentInputData = reinterpret_cast<const float*>(inputs[0]->Map());
     std::unique_ptr<Decoder<float>> inputData = MakeDecoder<float>(lstmInputInfo, currentInputData);
-    auto currentOutputData = reinterpret_cast<float*>(outputs[0]->Map());
+    auto currentOutputData = reinterpret_cast<float*>(outputs[2]->Map());
     std::unique_ptr<Encoder<float>> output = MakeEncoder<float>(lstmOutputInfo, currentOutputData);
     std::unique_ptr<Decoder<float>> outputDecoder = MakeDecoder<float>(lstmOutputInfo, currentOutputData);
@@ -296,7 +298,7 @@ void RefUnidirectionalSequenceLstmWorkload::Execute(std::vector<ITensorHandle*>
     {
         // Permute Output back to batch major
         const PermutationVector& mappings = {1U, 0U, 2U};
-        auto outputData = reinterpret_cast<float*>(outputs[0]->Map());
+        auto outputData = reinterpret_cast<float*>(outputs[2]->Map());
         std::vector<float> outputValue(outputData, outputData + outputInfo.GetNumElements());
         outputShape = armnnUtils::Permuted(outputInfo.GetShape(), mappings);
         outputInfo.SetShape(outputShape);
-- 
cgit v1.2.1
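Taken together with the delegate and WorkloadData changes earlier in the patch, the Ref workload edits above pin down a fixed three-output layout for UnidirectionalSequenceLstm: slot 0 carries the output (hidden) state, slot 1 the cell state, and slot 2 the sequence output, hence the reads of outputs[2]. A minimal sketch of addressing that layout, with an illustrative OutputHandle type standing in for ArmNN's ITensorHandle:

#include <cstddef>
#include <vector>

// Illustrative stand-in for a mapped output buffer (not an ArmNN type).
struct OutputHandle
{
    std::vector<float> data;
};

// Fixed output slot layout of UnidirectionalSequenceLstm after this patch:
//   slot 0 -> output state (hidden state)
//   slot 1 -> cell state
//   slot 2 -> output for the whole sequence
enum OutputSlot : std::size_t
{
    OutputStateOut = 0,
    CellStateOut   = 1,
    Output         = 2
};

float FirstSequenceOutput(const std::vector<OutputHandle*>& outputs)
{
    // Equivalent of the workload's outputs[2]->Map() reads above.
    return outputs[Output]->data[0];
}

int main()
{
    OutputHandle h0{ { 0.1f } }, h1{ { 0.2f } }, h2{ { 0.3f } };
    std::vector<OutputHandle*> outputs = { &h0, &h1, &h2 };
    return FirstSequenceOutput(outputs) > 0.0f ? 0 : 1;
}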