From a65b7aeafc0ef6acf40e4a8a6d36206bf53d717c Mon Sep 17 00:00:00 2001
From: Matteo Martincigh
Date: Wed, 14 Nov 2018 12:39:55 +0000
Subject: IVGCVSW-2092 Port LSTMCell::Eval to ArmNN

 * Ported Google's LSTM implementation to RefLstmFloat32Workload
 * Fixed the scratch buffer size throughout: the documentation has the
   CIFG and non-CIFG sizes swapped (the scratch buffer holds three
   gate buffers with CIFG enabled and four without, not the other
   way around)
 * Updated IsLstmSupported
 * Added the unit tests

!android-nn-driver:127

Change-Id: I5577b7e39ca52df1a7f102a9b437df6aa99520b6
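For reviewers, a standalone sketch of the sizing rule this change enforces (illustrative only, not part of the patch; the helper name is hypothetical). With CIFG the input gate is coupled to the forget gate (computed as 1 - forgetGate), so the scratch buffer only needs three nCell-wide gate buffers per batch; without CIFG it needs four:

#include <cstdint>
#include <iostream>

// Hypothetical helper, for illustration only: number of nCell-wide gate
// buffers ([cell, forget, output] vs. [input, cell, forget, output])
// the LSTM scratch buffer must hold.
uint32_t NumScratchGateBuffers(bool cifgEnabled)
{
    return cifgEnabled ? 3 : 4;
}

int main()
{
    const uint32_t batchSize = 2;
    const uint32_t numUnits  = 4;
    // Matches the shapes used in the hunks below:
    // [batchSize, numUnits * 3] with CIFG, [batchSize, numUnits * 4] without.
    std::cout << "CIFG:    [" << batchSize << ", " << numUnits * NumScratchGateBuffers(true)  << "]\n"
              << "no CIFG: [" << batchSize << ", " << numUnits * NumScratchGateBuffers(false) << "]\n";
}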
---
 src/armnn/layers/LstmLayer.cpp                     |   9 +-
 src/armnn/test/CreateWorkload.hpp                  |   8 +-
 src/armnn/test/OptimizerTests.cpp                  |   7 +-
 src/backends/backendsCommon/test/LayerTests.cpp    |   8 +-
 src/backends/backendsCommon/test/LstmTestImpl.hpp  |  23 +-
 src/backends/cl/workloads/ClLstmFloatWorkload.cpp  |   8 +-
 .../neon/workloads/NeonLstmFloatWorkload.cpp       |   8 +-
 src/backends/reference/RefLayerSupport.cpp         |   7 +-
 src/backends/reference/test/RefLayerTests.cpp      |   8 +
 .../reference/workloads/RefLstmFloat32Workload.cpp | 365 ++++++++++++++++++++-
 .../reference/workloads/RefLstmFloat32Workload.hpp |  24 +-
 11 files changed, 426 insertions(+), 49 deletions(-)

diff --git a/src/armnn/layers/LstmLayer.cpp b/src/armnn/layers/LstmLayer.cpp
index 866c837357..bd104d49fe 100644
--- a/src/armnn/layers/LstmLayer.cpp
+++ b/src/armnn/layers/LstmLayer.cpp
@@ -123,14 +123,7 @@ std::vector<TensorShape> LstmLayer::InferOutputShapes(const std::vector<TensorSh
     std::vector<TensorShape> outShapes;
-    if (!m_Param.m_CifgEnabled)
-    {
-        outShapes.push_back(TensorShape({batchSize, numUnits*3}));
-    }
-    else
-    {
-        outShapes.push_back(TensorShape({batchSize, numUnits*4}));
-    }
+    outShapes.push_back(TensorShape({batchSize, numUnits * (m_Param.m_CifgEnabled ? 3 : 4)}));
     outShapes.push_back(TensorShape({batchSize, outputSize}));
     outShapes.push_back(TensorShape({batchSize, numUnits}));
     outShapes.push_back(TensorShape({batchSize, outputSize}));
diff --git a/src/armnn/test/CreateWorkload.hpp b/src/armnn/test/CreateWorkload.hpp
index 07f9079b5d..111df4b328 100644
--- a/src/armnn/test/CreateWorkload.hpp
+++ b/src/armnn/test/CreateWorkload.hpp
@@ -321,12 +321,8 @@ std::unique_ptr<LstmWorkload> CreateLstmWorkloadTest(armnn::IWorkloadFactory& fa
     armnn::TensorInfo lstmTensorInfo1({ batchSize, inputSize }, DataType::Float32);
     armnn::TensorInfo lstmTensorInfo2({ batchSize, numUnits}, DataType::Float32);
     armnn::TensorInfo lstmTensorInfo3({ batchSize, outputSize }, DataType::Float32);
-    armnn::TensorInfo lstmTensorInfoScratchBuff({ batchSize, numUnits*3 }, DataType::Float32);
-    if (layerDesc.m_CifgEnabled)
-    {
-        lstmTensorInfoScratchBuff.SetShape({ batchSize, numUnits*4 });
-    }
-
+    armnn::TensorInfo lstmTensorInfoScratchBuff({ batchSize, numUnits * (layerDesc.m_CifgEnabled ? 3 : 4) },
+                                                DataType::Float32);
     Connect(input, layer, lstmTensorInfo1, 0, 0);
     Connect(cellStateIn, layer, lstmTensorInfo2, 0, 1);
     Connect(outputStateIn, layer, lstmTensorInfo3, 0, 2);
diff --git a/src/armnn/test/OptimizerTests.cpp b/src/armnn/test/OptimizerTests.cpp
index 8bd7d3dbee..30ca52092a 100644
--- a/src/armnn/test/OptimizerTests.cpp
+++ b/src/armnn/test/OptimizerTests.cpp
@@ -154,11 +154,8 @@ void CreateLSTMLayerHelper(Graph &graph, bool CifgEnabled)
     armnn::TensorInfo lstmTensorInfo1({ batchSize, inputSize }, DataType::Float32);
     armnn::TensorInfo lstmTensorInfo2({ batchSize, numUnits}, DataType::Float32);
     armnn::TensorInfo lstmTensorInfo3({ batchSize, outputSize }, DataType::Float32);
-    armnn::TensorInfo lstmTensorInfoScratchBuff({ batchSize, numUnits*3 }, DataType::Float32);
-    if (layerDesc.m_CifgEnabled)
-    {
-        lstmTensorInfoScratchBuff.SetShape({ batchSize, numUnits*4 });
-    }
+    armnn::TensorInfo lstmTensorInfoScratchBuff({ batchSize, numUnits * (layerDesc.m_CifgEnabled ? 3 : 4) },
+                                                DataType::Float32);
 
     Connect(input, layer, lstmTensorInfo1, 0, 0);
     Connect(cellStateIn, layer, lstmTensorInfo2, 0, 1);
diff --git a/src/backends/backendsCommon/test/LayerTests.cpp b/src/backends/backendsCommon/test/LayerTests.cpp
index dad13413b4..bd8b38da01 100755
--- a/src/backends/backendsCommon/test/LayerTests.cpp
+++ b/src/backends/backendsCommon/test/LayerTests.cpp
@@ -925,8 +925,7 @@ LayerTestResult<float, 2> LstmLayerFloat32NoCifgWithPeepholeWithProjectionTest(
         -0.0186926f, 0.0193662f, -0.0115437f, 0.00422612f,
         -0.0345232f, 0.00223253f, -0.00957321f, 0.0210624f,
         0.013331f, 0.0150954f, 0.02168f}));
-    return LstmLayerFloat32NoCifgWithPeepholeWithProjectionTestImpl(
-        workloadFactory, memoryManager, input, expectedOutput);
+    return LstmLayerNoCifgWithPeepholeWithProjectionTestImpl(workloadFactory, memoryManager, input, expectedOutput);
 }
 
 LayerTestResult<float, 2> LstmLayerFloat32NoCifgNoPeepholeNoProjectionTest(
@@ -6684,7 +6683,6 @@ LayerTestResult<uint8_t, 4> BatchToSpaceNdNhwcUintTest1(
     std::vector<unsigned int> blockShape({2, 2});
    std::vector<std::pair<unsigned int, unsigned int>> crops = {{0, 0}, {0, 0}};
 
-    return BatchToSpaceNdHelper<uint8_t, 4, 4>(workloadFactory, memoryManager,
-                                               armnn::DataLayout::NHWC, inputShape, input, blockShape,
-                                               crops, outputShape, expectedOutput);
+    return BatchToSpaceNdHelper<uint8_t, 4, 4>(workloadFactory, memoryManager, armnn::DataLayout::NHWC, inputShape,
+                                               input, blockShape, crops, outputShape, expectedOutput);
 }
diff --git a/src/backends/backendsCommon/test/LstmTestImpl.hpp b/src/backends/backendsCommon/test/LstmTestImpl.hpp
index dfe24aa541..56f40aba84 100644
--- a/src/backends/backendsCommon/test/LstmTestImpl.hpp
+++ b/src/backends/backendsCommon/test/LstmTestImpl.hpp
@@ -34,7 +34,7 @@ LayerTestResult<float, 2> LstmNoCifgNoPeepholeNoProjectionTestImpl(
     armnn::TensorInfo outputStateInTensorInfo({batchSize , outputSize}, armnn::GetDataType<float>());
-    armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 3}, armnn::GetDataType<float>());
+    armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 4}, armnn::GetDataType<float>());
     armnn::TensorInfo cellStateOutTensorInfo({batchSize, numUnits}, armnn::GetDataType<float>());
     armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
     armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
@@ -52,7 +52,7 @@ LayerTestResult<float, 2> LstmNoCifgNoPeepholeNoProjectionTestImpl(
     std::vector<float> outputStateInVector(batchSize * outputSize, 0.f);
     auto outputStateInTensor = MakeTensor<float, 2>(outputStateInTensorInfo, outputStateInVector);
 
-    std::vector<float> scratchBufferVector(batchSize * numUnits * 3, 0.f);
+    std::vector<float> scratchBufferVector(batchSize * numUnits * 4, 0.f);
     auto scratchBufferTensor = MakeTensor<float, 2>(scratchBufferTensorInfo, scratchBufferVector);
 
     std::vector<float> outputStateOutVector(batchSize * outputSize, 0.f);
@@ -153,8 +153,8 @@ LayerTestResult<float, 2> LstmNoCifgNoPeepholeNoProjectionTestImpl(
     armnn::ScopedCpuTensorHandle inputToForgetWeightsTensor(tensorInfo8);
     armnn::ScopedCpuTensorHandle inputToCellWeightsTensor(tensorInfo8);
     armnn::ScopedCpuTensorHandle inputToOutputWeightsTensor(tensorInfo8);
-    armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfo16);
     armnn::ScopedCpuTensorHandle recurrentToInputWeightsTensor(tensorInfo16);
+    armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfo16);
     armnn::ScopedCpuTensorHandle recurrentToCellWeightsTensor(tensorInfo16);
     armnn::ScopedCpuTensorHandle recurrentToOutputWeightsTensor(tensorInfo16);
     armnn::ScopedCpuTensorHandle cellToInputWeightsTensor(tensorInfo4);
@@ -222,11 +222,10 @@
 
 LayerTestResult<float, 2>
-LstmLayerFloat32NoCifgWithPeepholeWithProjectionTestImpl(
-    armnn::IWorkloadFactory& workloadFactory,
-    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
-    const boost::multi_array<float, 2>& input,
-    const boost::multi_array<float, 2>& outputExpected)
+LstmLayerNoCifgWithPeepholeWithProjectionTestImpl(armnn::IWorkloadFactory& workloadFactory,
+                                                  const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+                                                  const boost::multi_array<float, 2>& input,
+                                                  const boost::multi_array<float, 2>& outputExpected)
 {
     unsigned int batchSize = 2;
     unsigned int outputSize = 16;
@@ -237,8 +236,8 @@ LstmLayerFloat32NoCifgWithPeepholeWithProjectionTestImpl(
     armnn::TensorInfo cellStateInTensorInfo({batchSize , numUnits}, armnn::GetDataType<float>());
     armnn::TensorInfo outputStateInTensorInfo({batchSize , outputSize}, armnn::GetDataType<float>());
-    // Scratch buffer size without CIFG [batchSize, numUnits * 3]
-    armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 3}, armnn::GetDataType<float>());
+    // Scratch buffer size without CIFG [batchSize, numUnits * 4]
+    armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 4}, armnn::GetDataType<float>());
     armnn::TensorInfo cellStateOutTensorInfo({batchSize, numUnits}, armnn::GetDataType<float>());
     armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
     armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
@@ -255,7 +254,7 @@ LstmLayerFloat32NoCifgWithPeepholeWithProjectionTestImpl(
     std::vector<float> outputStateInVector(batchSize * outputSize, 0.f);
     auto outputStateInTensor = MakeTensor<float, 2>(outputStateInTensorInfo, outputStateInVector);
 
-    std::vector<float> scratchBufferVector(batchSize * numUnits * 3, 0.f);
+    std::vector<float> scratchBufferVector(batchSize * numUnits * 4, 0.f);
     auto scratchBufferTensor = MakeTensor<float, 2>(scratchBufferTensorInfo, scratchBufferVector);
 
     std::vector<float> outputStateOutVector(batchSize * outputSize, 0.f);
@@ -955,7 +954,7 @@ LayerTestResult<T, 2> LstmLayerWithCifgWithPeepholeNoProjectionTestImpl(
     armnn::TensorInfo outputStateInTensorInfo({batchSize, outputSize}, armnn::GetDataType<T>());
     armnn::TensorInfo cellStateInTensorInfo({batchSize, cellSize}, armnn::GetDataType<T>());
 
-    unsigned int scratchBufferSize = cifgEnabled ? cellSize * 4 : cellSize * 3;
+    unsigned int scratchBufferSize = cifgEnabled ? cellSize * 3 : cellSize * 4;
     armnn::TensorInfo scratchBufferTensorInfo({batchSize, scratchBufferSize}, armnn::GetDataType<T>());
     armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, armnn::GetDataType<T>());
     armnn::TensorInfo cellStateOutTensorInfo({batchSize, cellSize}, armnn::GetDataType<T>());
diff --git a/src/backends/cl/workloads/ClLstmFloatWorkload.cpp b/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
index 2a664454e1..f4d8974226 100644
--- a/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
@@ -116,14 +116,14 @@ ClLstmFloatWorkload::ClLstmFloatWorkload(const LstmQueueDescriptor &descriptor,
     m_ScratchBuffer = std::make_unique<arm_compute::CLTensor>();
     if (m_Data.m_Parameters.m_CifgEnabled)
     {
-        // 2D tensor with dimensions [num_units * 4, batch_size] with CIFG
-        armnn::TensorInfo scratchBuffer1({ batch_size, num_units * 4 }, DataType::Float32);
+        // 2D tensor with dimensions [num_units * 3, batch_size] with CIFG
+        armnn::TensorInfo scratchBuffer1({ batch_size, num_units * 3 }, DataType::Float32);
         BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer1);
     }
     else
     {
-        // scratch_buffer [num_units * 3, batch_size] without CIFG
-        armnn::TensorInfo scratchBuffer2({ batch_size, num_units * 3 }, DataType::Float32);
+        // scratch_buffer [num_units * 4, batch_size] without CIFG
+        armnn::TensorInfo scratchBuffer2({ batch_size, num_units * 4 }, DataType::Float32);
         BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer2);
     }
 
diff --git a/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp b/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
index d03454b705..1ab269ff56 100644
--- a/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
@@ -114,13 +114,13 @@ NeonLstmFloatWorkload::NeonLstmFloatWorkload(const LstmQueueDescriptor &descript
     if (m_Data.m_Parameters.m_CifgEnabled)
     {
-        // 2D tensor with dimensions [num_units * 4, batch_size] with CIFG
-        armnn::TensorInfo scratchBuffer1({ batch_size, num_units * 4 }, DataType::Float32);
+        // 2D tensor with dimensions [num_units * 3, batch_size] with CIFG
+        armnn::TensorInfo scratchBuffer1({ batch_size, num_units * 3 }, DataType::Float32);
         BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer1);
     }
     else
     {
-        // scratch_buffer [num_units * 3, batch_size] without CIFG
-        armnn::TensorInfo scratchBuffer2({ batch_size, num_units * 3 }, DataType::Float32);
+        // scratch_buffer [num_units * 4, batch_size] without CIFG
+        armnn::TensorInfo scratchBuffer2({ batch_size, num_units * 4 }, DataType::Float32);
         BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer2);
     }
 
diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp
index d6c1e66626..167cba54e8 100644
--- a/src/backends/reference/RefLayerSupport.cpp
+++ b/src/backends/reference/RefLayerSupport.cpp
@@ -278,7 +278,6 @@ bool RefLayerSupport::IsLstmSupported(const TensorInfo& input,
                                       const TensorInfo* cellToOutputWeights,
                                       Optional<std::string&> reasonIfUnsupported) const
 {
-    ignore_unused(input);
     ignore_unused(outputStateIn);
     ignore_unused(cellStateIn);
     ignore_unused(scratchBuffer);
@@ -303,8 +302,10 @@ bool RefLayerSupport::IsLstmSupported(const TensorInfo& input,
     ignore_unused(projectionBias);
     ignore_unused(cellToForgetWeights);
     ignore_unused(cellToOutputWeights);
-    ignore_unused(reasonIfUnsupported);
-    return false;
+    return IsSupportedForDataTypeRef(reasonIfUnsupported,
+                                     input.GetDataType(),
+                                     &TrueFunc<>,
+                                     &FalseFuncU8<>);
 }
 
 bool RefLayerSupport::IsMeanSupported(const TensorInfo& input,
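The new return value follows the pattern used by the other Ref backend checks: supported for Float32, unsupported for the quantised U8 data type. A rough sketch of how such a dispatch works (editor's illustration; names and signature are hypothetical, not ArmNN's actual helpers):

#include <string>

enum class SketchDataType { Float32, QuantisedAsymm8 };

// Hypothetical stand-ins for TrueFunc<> / FalseFuncU8<>.
bool SupportedFunc(std::string*) { return true; }
bool UnsupportedU8Func(std::string* reason)
{
    if (reason) { *reason = "Reference LSTM: U8 input is not supported"; }
    return false;
}

// The predicate picks one of two callables based on the tensor's data type.
template <typename FloatFunc, typename Uint8Func>
bool IsSupportedForDataTypeSketch(std::string* reasonIfUnsupported,
                                  SketchDataType dataType,
                                  FloatFunc floatFunc,
                                  Uint8Func uint8Func)
{
    return dataType == SketchDataType::Float32 ? floatFunc(reasonIfUnsupported)
                                               : uint8Func(reasonIfUnsupported);
}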
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index 4ff5cf2a2e..35981ea4b3 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -336,6 +336,14 @@ ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet1, PermuteFloat32ValueSet1Test)
 ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet2, PermuteFloat32ValueSet2Test)
 ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet3, PermuteFloat32ValueSet3Test)
 
+// Lstm
+ARMNN_AUTO_TEST_CASE(LstmLayerFloat32WithCifgWithPeepholeNoProjection,
+                     LstmLayerFloat32WithCifgWithPeepholeNoProjectionTest)
+ARMNN_AUTO_TEST_CASE(LstmLayerFloat32NoCifgNoPeepholeNoProjection,
+                     LstmLayerFloat32NoCifgNoPeepholeNoProjectionTest)
+ARMNN_AUTO_TEST_CASE(LstmLayerFloat32NoCifgWithPeepholeWithProjection,
+                     LstmLayerFloat32NoCifgWithPeepholeWithProjectionTest)
+
 // Convert from Float16 to Float32
 ARMNN_AUTO_TEST_CASE(SimpleConvertFp16ToFp32, SimpleConvertFp16ToFp32Test)
 // Convert from Float32 to Float16
diff --git a/src/backends/reference/workloads/RefLstmFloat32Workload.cpp b/src/backends/reference/workloads/RefLstmFloat32Workload.cpp
index 50ff605701..c697b66658 100644
--- a/src/backends/reference/workloads/RefLstmFloat32Workload.cpp
+++ b/src/backends/reference/workloads/RefLstmFloat32Workload.cpp
@@ -4,13 +4,376 @@
 //
 
 #include "RefLstmFloat32Workload.hpp"
+#include "RefWorkloadUtils.hpp"
+#include "Activation.hpp"
+
+namespace
+{
+
+// Helper functions ported from the Android code base
+// Refer to: android/external/tensorflow/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
+
+void MatrixBatchVectorMultiplyAccumulate(const float* matrix,
+                                         uint32_t mRows,
+                                         uint32_t mCols,
+                                         const float* vector,
+                                         uint32_t nBatch,
+                                         float* outResult,
+                                         int resultStride = 1)
+{
+    float* resultInBatch = outResult;
+    for (uint32_t b = 0; b < nBatch; b++)
+    {
+        const float* matrixPtr = matrix;
+        for (uint32_t r = 0; r < mRows; r++)
+        {
+            const float* vectorInBatch = vector + b * mCols;
+            for (uint32_t c = 0; c < mCols; c++)
+            {
+                *resultInBatch += *matrixPtr++ * *vectorInBatch++;
+            }
+            resultInBatch += resultStride;
+        }
+    }
+}
+
+void VectorBatchVectorAssign(const float* vector,
+                             uint32_t vSize,
+                             uint32_t nBatch,
+                             float* outBatchVector)
+{
+    for (uint32_t b = 0; b < nBatch; b++)
+    {
+        memcpy(outBatchVector + b * vSize, vector, vSize * sizeof(float));
+    }
+}
+
+void VectorBatchVectorCwiseProductAccumulate(const float* vector,
+                                             uint32_t vSize,
+                                             const float* batchVector,
+                                             uint32_t nBatch,
+                                             float* outResult)
+{
+    for (uint32_t b = 0; b < nBatch; b++)
+    {
+        for (uint32_t v = 0; v < vSize; v++)
+        {
+            *outResult++ += vector[v] * *batchVector++;
+        }
+    }
+}
+
+void Sub1Vector(const float* vector,
+                uint32_t vSize,
+                float* result)
+{
+    for (uint32_t v = 0; v < vSize; v++)
+    {
+        *result++ = 1.0f - *vector++;
+    }
+}
+
+void VectorVectorCwiseProduct(const float* vector1,
+                              const float* vector2,
+                              uint32_t vSize,
+                              float* outResult)
+{
+    for (uint32_t v = 0; v < vSize; v++)
+    {
+        *outResult++ = *vector1++ * *vector2++;
+    }
+}
+
+void VectorVectorCwiseProductAccumulate(const float* vector1,
+                                        const float* vector2,
+                                        uint32_t vSize,
+                                        float* outResult)
+{
+    for (uint32_t v = 0; v < vSize; v++)
+    {
+        *outResult++ += *vector1++ * *vector2++;
+    }
+}
+
+float Clip(float f,
+           float absLimit)
+{
+    float result = (absLimit < f) ? absLimit : f;
+    result = (-absLimit > result) ? -absLimit : result;
+    return result;
+}
+
+void ClipVector(const float* vector,
+                uint32_t vSize,
+                float absLimit,
+                float* outResult)
+{
+    for (uint32_t v = 0; v < vSize; v++)
+    {
+        *outResult++ = Clip(*vector++, absLimit);
+    }
+}
+
+void CopyVector(const float* vector,
+                uint32_t vSize,
+                float* outResult)
+{
+    memcpy(outResult, vector, vSize * sizeof(float));
+}
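A quick sanity check of the core helper's semantics (editor's illustration, not part of the patch; it assumes the definitions above are in scope). The matrix is row-major with mRows = nCell and mCols = nInput, and the results for each batch are laid out consecutively:

#include <cassert>

int main()
{
    // 2x3 row-major matrix: row 0 = {1, 2, 3}, row 1 = {4, 5, 6}.
    const float matrix[6]  = {1, 2, 3, 4, 5, 6};
    // Two batches of 3-element vectors.
    const float vectors[6] = {1, 1, 1,    // batch 0
                              0, 1, 0};   // batch 1
    float result[4] = {10, 10, 10, 10};   // pre-filled to show accumulation

    MatrixBatchVectorMultiplyAccumulate(matrix, 2, 3, vectors, 2, result);

    assert(result[0] == 16.0f); // batch 0, row 0: 10 + (1 + 2 + 3)
    assert(result[1] == 25.0f); // batch 0, row 1: 10 + (4 + 5 + 6)
    assert(result[2] == 12.0f); // batch 1, row 0: 10 + 2
    assert(result[3] == 15.0f); // batch 1, row 1: 10 + 5
}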
+
+void SetActivationParameters(uint32_t activation,
+                             armnn::ActivationFunction& outArmnnActivation,
+                             float& outA,
+                             float& outB)
+{
+    switch (activation)
+    {
+        case 0: // None
+            outA = 0;
+            outB = 0;
+            return;
+
+        case 1: // Relu
+            outArmnnActivation = armnn::ActivationFunction::ReLu;
+            outA = 0;
+            outB = 0;
+            return;
+
+        case 3: // Relu6
+            outArmnnActivation = armnn::ActivationFunction::BoundedReLu;
+            outA = 6;
+            outB = 0;
+            return;
+
+        case 4: // Tanh
+            outArmnnActivation = armnn::ActivationFunction::TanH;
+            outA = 1;
+            outB = 1;
+            return;
+
+        case 6: // Sigmoid
+            outArmnnActivation = armnn::ActivationFunction::Sigmoid;
+            outA = 0;
+            outB = 0;
+            return;
+
+        default:
+            throw armnn::Exception("Unsupported activation function: " + std::to_string(activation));
+    }
+}
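The integer codes handled here follow the Android NN / TensorFlow Lite fused-activation encoding (0 = none, 1 = relu, 3 = relu6, 4 = tanh, 6 = sigmoid; an observation from the NNAPI LSTM documentation, not something defined by this patch). An editor's usage sketch:

// Illustration only, not part of the patch:
armnn::ActivationFunction fn = armnn::ActivationFunction::Sigmoid;
float a = 0.0f;
float b = 0.0f;
SetActivationParameters(4, fn, a, b); // 4 == tanh in the TfLite encoding
// fn is now ActivationFunction::TanH with a == 1 and b == 1, i.e. plain tanh.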
+
+std::unique_ptr<armnn::ScopedCpuTensorHandle> AssignScopedCpuTensorHandle(const armnn::ConstCpuTensorHandle* ptr)
+{
+    if (!ptr)
+    {
+        return nullptr;
+    }
+
+    return std::make_unique<armnn::ScopedCpuTensorHandle>(*ptr);
+}
+
+} // anonymous namespace
 
 namespace armnn
 {
 
+RefLstmFloat32Workload::RefLstmFloat32Workload(const LstmQueueDescriptor &descriptor, const WorkloadInfo &info)
+    : Float32Workload<LstmQueueDescriptor>(descriptor, info)
+    , m_InputToInputWeightsTensor     (AssignScopedCpuTensorHandle(descriptor.m_InputToInputWeights))
+    , m_InputToForgetWeightsTensor    (AssignScopedCpuTensorHandle(descriptor.m_InputToForgetWeights))
+    , m_InputToCellWeightsTensor      (AssignScopedCpuTensorHandle(descriptor.m_InputToCellWeights))
+    , m_InputToOutputWeightsTensor    (AssignScopedCpuTensorHandle(descriptor.m_InputToOutputWeights))
+    , m_RecurrentToInputWeightsTensor (AssignScopedCpuTensorHandle(descriptor.m_RecurrentToInputWeights))
+    , m_RecurrentToForgetWeightsTensor(AssignScopedCpuTensorHandle(descriptor.m_RecurrentToForgetWeights))
+    , m_RecurrentToCellWeightsTensor  (AssignScopedCpuTensorHandle(descriptor.m_RecurrentToCellWeights))
+    , m_RecurrentToOutputWeightsTensor(AssignScopedCpuTensorHandle(descriptor.m_RecurrentToOutputWeights))
+    , m_CellToInputWeightsTensor      (AssignScopedCpuTensorHandle(descriptor.m_CellToInputWeights))
+    , m_CellToForgetWeightsTensor     (AssignScopedCpuTensorHandle(descriptor.m_CellToForgetWeights))
+    , m_CellToOutputWeightsTensor     (AssignScopedCpuTensorHandle(descriptor.m_CellToOutputWeights))
+    , m_InputGateBiasTensor           (AssignScopedCpuTensorHandle(descriptor.m_InputGateBias))
+    , m_ForgetGateBiasTensor          (AssignScopedCpuTensorHandle(descriptor.m_ForgetGateBias))
+    , m_CellBiasTensor                (AssignScopedCpuTensorHandle(descriptor.m_CellBias))
+    , m_OutputGateBiasTensor          (AssignScopedCpuTensorHandle(descriptor.m_OutputGateBias))
+    , m_ProjectionWeightsTensor       (AssignScopedCpuTensorHandle(descriptor.m_ProjectionWeights))
+    , m_ProjectionBiasTensor          (AssignScopedCpuTensorHandle(descriptor.m_ProjectionBias))
+{}
+
 void RefLstmFloat32Workload::Execute() const
 {
-    throw armnn::Exception("No implementation of Lstm in the Ref backend!");
+    // This is a port of the LSTM::Eval() method in the Android code base
+    // Refer to: android/frameworks/ml/nn/common/operations/LSTM.cpp
+
+    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
+    const TensorShape& inputShape = inputInfo.GetShape();
+
+    float* scratchBuffer  = GetOutputTensorDataFloat(0, m_Data);
+    float* outputStateOut = GetOutputTensorDataFloat(1, m_Data);
+    float* cellStateOut   = GetOutputTensorDataFloat(2, m_Data);
+    float* output         = GetOutputTensorDataFloat(3, m_Data);
+
+    const float* inputData     = GetInputTensorDataFloat(0, m_Data);
+    const float* outputStateIn = GetInputTensorDataFloat(1, m_Data);
+    const float* cellStateIn   = GetInputTensorDataFloat(2, m_Data);
+
+    const uint32_t nBatch = inputShape[0];
+    const uint32_t nInput = inputShape[1];
+
+    const uint32_t nCell   = m_InputToOutputWeightsTensor->GetShape()[0];
+    const uint32_t nOutput = m_RecurrentToOutputWeightsTensor->GetShape()[1];
+
+    const bool useCifg     = m_Data.m_Parameters.m_CifgEnabled;
+    const bool usePeephole = m_Data.m_Parameters.m_PeepholeEnabled;
+
+    // Index the scratch buffer pointers into the global scratch buffer.
+    float* inputGateScratch  = nullptr;
+    float* cellScratch       = nullptr;
+    float* forgetGateScratch = nullptr;
+    float* outputGateScratch = nullptr;
+
+    if (useCifg)
+    {
+        cellScratch       = scratchBuffer + 0 * nCell * nBatch;
+        forgetGateScratch = scratchBuffer + 1 * nCell * nBatch;
+        outputGateScratch = scratchBuffer + 2 * nCell * nBatch;
+    }
+    else
+    {
+        inputGateScratch  = scratchBuffer + 0 * nCell * nBatch;
+        cellScratch       = scratchBuffer + 1 * nCell * nBatch;
+        forgetGateScratch = scratchBuffer + 2 * nCell * nBatch;
+        outputGateScratch = scratchBuffer + 3 * nCell * nBatch;
+    }
+
+    // Initialize scratch buffers with bias.
+    if (!useCifg)
+    {
+        VectorBatchVectorAssign(m_InputGateBiasTensor->GetTensor<float>(),
+                                nCell, nBatch, inputGateScratch);
+    }
+    VectorBatchVectorAssign(m_ForgetGateBiasTensor->GetTensor<float>(),
+                            nCell, nBatch, forgetGateScratch);
+    VectorBatchVectorAssign(m_CellBiasTensor->GetTensor<float>(),
+                            nCell, nBatch, cellScratch);
+    VectorBatchVectorAssign(m_OutputGateBiasTensor->GetTensor<float>(),
+                            nCell, nBatch, outputGateScratch);
+
+    // For each batch and cell: compute input_weight * input.
+    if (!useCifg)
+    {
+        MatrixBatchVectorMultiplyAccumulate(m_InputToInputWeightsTensor->GetTensor<float>(),
+                                            nCell, nInput, inputData, nBatch, inputGateScratch);
+    }
+    MatrixBatchVectorMultiplyAccumulate(m_InputToForgetWeightsTensor->GetTensor<float>(),
+                                        nCell, nInput, inputData, nBatch, forgetGateScratch);
+    MatrixBatchVectorMultiplyAccumulate(m_InputToCellWeightsTensor->GetTensor<float>(),
+                                        nCell, nInput, inputData, nBatch, cellScratch);
+    MatrixBatchVectorMultiplyAccumulate(m_InputToOutputWeightsTensor->GetTensor<float>(),
+                                        nCell, nInput, inputData, nBatch, outputGateScratch);
+
+    // For each batch and cell: compute recurrent_weight * output_state.
+    if (!useCifg)
+    {
+        MatrixBatchVectorMultiplyAccumulate(m_RecurrentToInputWeightsTensor->GetTensor<float>(),
+                                            nCell, nOutput, outputStateIn, nBatch, inputGateScratch);
+    }
+    MatrixBatchVectorMultiplyAccumulate(m_RecurrentToForgetWeightsTensor->GetTensor<float>(),
+                                        nCell, nOutput, outputStateIn, nBatch, forgetGateScratch);
+    MatrixBatchVectorMultiplyAccumulate(m_RecurrentToCellWeightsTensor->GetTensor<float>(),
+                                        nCell, nOutput, outputStateIn, nBatch, cellScratch);
+    MatrixBatchVectorMultiplyAccumulate(m_RecurrentToOutputWeightsTensor->GetTensor<float>(),
+                                        nCell, nOutput, outputStateIn, nBatch, outputGateScratch);
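After the two accumulation passes above, each gate scratch buffer holds bias_g + W_g * x_t + R_g * h_(t-1) for every batch. The rest of Execute() applies the standard LSTM cell equations; as a reading aid (editor's summary in common LSTM notation, with (.) denoting element-wise multiplication):

//   i_t = sigmoid(gate_i + p_i (.) c_(t-1))      (skipped with CIFG)
//   f_t = sigmoid(gate_f + p_f (.) c_(t-1))
//   g_t = act(gate_c)                            (act set by m_ActivationFunc)
//   c_t = clip(f_t (.) c_(t-1) + i_t (.) g_t, m_ClippingThresCell)
//         with i_t = 1 - f_t when CIFG is enabled
//   o_t = sigmoid(gate_o + p_o (.) c_t)
//   h_t = o_t (.) act(c_t)
// where gate_* are the pre-activation scratch buffers and p_* the peephole weights.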
+
+    // For each batch and cell: update input gate.
+    if (!useCifg)
+    {
+        if (usePeephole)
+        {
+            VectorBatchVectorCwiseProductAccumulate(m_CellToInputWeightsTensor->GetTensor<float>(),
+                                                    nCell, cellStateIn, nBatch, inputGateScratch);
+        }
+        Activation(inputGateScratch, inputGateScratch,
+                   TensorInfo({nCell, nBatch}, DataType::Float32),
+                   ActivationFunction::Sigmoid, 0, 0);
+    }
+
+    // For each batch and cell: update forget gate.
+    if (usePeephole)
+    {
+        VectorBatchVectorCwiseProductAccumulate(m_CellToForgetWeightsTensor->GetTensor<float>(), nCell,
+                                                cellStateIn, nBatch, forgetGateScratch);
+    }
+    Activation(forgetGateScratch, forgetGateScratch,
+               TensorInfo({nCell, nBatch}, DataType::Float32),
+               ActivationFunction::Sigmoid, 0, 0);
+
+    // For each batch and cell: update the cell.
+    VectorVectorCwiseProduct(forgetGateScratch, cellStateIn, nBatch * nCell, cellStateOut);
+
+    ActivationFunction armnnActivationFunc = ActivationFunction::Sigmoid;
+    float a = 0;
+    float b = 0;
+    SetActivationParameters(m_Data.m_Parameters.m_ActivationFunc, armnnActivationFunc, a, b);
+
+    if (m_Data.m_Parameters.m_ActivationFunc > 0)
+    {
+        Activation(cellScratch, cellScratch,
+                   TensorInfo({nCell, nBatch}, DataType::Float32),
+                   armnnActivationFunc, a, b);
+    }
+    if (useCifg)
+    {
+        Sub1Vector(forgetGateScratch, nBatch * nCell, forgetGateScratch);
+        VectorVectorCwiseProductAccumulate(cellScratch, forgetGateScratch, nBatch * nCell, cellStateOut);
+    }
+    else
+    {
+        VectorVectorCwiseProductAccumulate(cellScratch, inputGateScratch, nBatch * nCell, cellStateOut);
+    }
+    if (m_Data.m_Parameters.m_ClippingThresCell > 0.0)
+    {
+        ClipVector(cellStateOut, nBatch * nCell, m_Data.m_Parameters.m_ClippingThresCell, cellStateOut);
+    }
+
+    // For each batch and cell: update the output gate.
+    if (usePeephole)
+    {
+        VectorBatchVectorCwiseProductAccumulate(m_CellToOutputWeightsTensor->GetTensor<float>(),
+                                                nCell, cellStateOut, nBatch, outputGateScratch);
+    }
+    Activation(outputGateScratch, outputGateScratch,
+               TensorInfo({nCell, nBatch}, DataType::Float32),
+               ActivationFunction::Sigmoid, 0, 0);
+
+    if (m_Data.m_Parameters.m_ActivationFunc > 0)
+    {
+        Activation(cellStateOut, cellScratch,
+                   TensorInfo({nCell, nBatch}, DataType::Float32),
+                   armnnActivationFunc, a, b);
+    }
+    VectorVectorCwiseProduct(outputGateScratch, cellScratch, nBatch * nCell, outputGateScratch);
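With projection enabled, the nCell-wide gated activation is mapped down to nOutput elements; either way the final output doubles as the next step's output state (editor's note, not part of the patch):

//   output = clip(W_proj * h_t + b_proj, m_ClippingThresProj)   if projection is enabled
//   output = h_t                                                otherwise (nOutput == nCell)
// The final CopyVector below makes output_state_out = output, i.e. the
// h_(t-1) fed back in on the next invocation.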
+
+    // For each batch: update the projection and output_state.
+    if (m_Data.m_Parameters.m_ProjectionEnabled)
+    {
+        if (m_ProjectionBiasTensor)
+        {
+            VectorBatchVectorAssign(m_ProjectionBiasTensor->GetTensor<float>(),
+                                    nOutput, nBatch, output);
+        }
+        MatrixBatchVectorMultiplyAccumulate(m_ProjectionWeightsTensor->GetTensor<float>(),
+                                            nOutput, nCell, outputGateScratch, nBatch, output);
+
+        if (m_Data.m_Parameters.m_ClippingThresProj > 0.0)
+        {
+            ClipVector(output, nBatch * nOutput, m_Data.m_Parameters.m_ClippingThresProj, output);
+        }
+    }
+    else
+    {
+        CopyVector(outputGateScratch, nBatch * nOutput, output);
+    }
+
+    CopyVector(output, nBatch * nOutput, outputStateOut);
 }
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefLstmFloat32Workload.hpp b/src/backends/reference/workloads/RefLstmFloat32Workload.hpp
index 1f634d3ca1..a2dead8b9c 100644
--- a/src/backends/reference/workloads/RefLstmFloat32Workload.hpp
+++ b/src/backends/reference/workloads/RefLstmFloat32Workload.hpp
@@ -5,6 +5,8 @@
 
 #pragma once
 
+#include <backendsCommon/CpuTensorHandle.hpp>
+
 #include <backendsCommon/Workload.hpp>
 #include <backendsCommon/WorkloadData.hpp>
 
@@ -14,8 +16,28 @@ namespace armnn
 class RefLstmFloat32Workload : public Float32Workload<LstmQueueDescriptor>
 {
 public:
-    using Float32Workload<LstmQueueDescriptor>::Float32Workload;
+    explicit RefLstmFloat32Workload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info);
+
     virtual void Execute() const override;
+
+private:
+    std::unique_ptr<ScopedCpuTensorHandle> m_InputToInputWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_InputToForgetWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_InputToCellWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_InputToOutputWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_RecurrentToInputWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_RecurrentToForgetWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_RecurrentToCellWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_RecurrentToOutputWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_CellToInputWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_CellToForgetWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_CellToOutputWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_InputGateBiasTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_ForgetGateBiasTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_CellBiasTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_OutputGateBiasTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_ProjectionWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_ProjectionBiasTensor;
 };
 
 } //namespace armnn
-- 
cgit v1.2.1