From a65b7aeafc0ef6acf40e4a8a6d36206bf53d717c Mon Sep 17 00:00:00 2001
From: Matteo Martincigh
Date: Wed, 14 Nov 2018 12:39:55 +0000
Subject: IVGCVSW-2092 Port LSTMCell::Eval to ArmNN

 * Ported Google's LSTM implementation to RefLstmFloat32Workload
 * Fixed the scratch buffer size throughout: the documentation has the
   CIFG and non-CIFG sizes swapped (the scratch buffer holds three
   gate buffers with CIFG enabled and four without, not the other
   way around)
 * Updated IsLstmSupported
 * Added the unit tests

!android-nn-driver:127

Change-Id: I5577b7e39ca52df1a7f102a9b437df6aa99520b6
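For reviewers, a standalone sketch of the sizing rule this change enforces (illustrative only, not part of the patch; the helper name is hypothetical). With CIFG the input gate is coupled to the forget gate (computed as 1 - forgetGate), so the scratch buffer only needs three nCell-wide gate buffers per batch; without CIFG it needs four:

#include <cstdint>
#include <iostream>

// Hypothetical helper, for illustration only: number of nCell-wide gate
// buffers ([cell, forget, output] vs. [input, cell, forget, output])
// the LSTM scratch buffer must hold.
uint32_t NumScratchGateBuffers(bool cifgEnabled)
{
    return cifgEnabled ? 3 : 4;
}

int main()
{
    const uint32_t batchSize = 2;
    const uint32_t numUnits  = 4;
    // Matches the shapes used in the hunks below:
    // [batchSize, numUnits * 3] with CIFG, [batchSize, numUnits * 4] without.
    std::cout << "CIFG:    [" << batchSize << ", " << numUnits * NumScratchGateBuffers(true)  << "]\n"
              << "no CIFG: [" << batchSize << ", " << numUnits * NumScratchGateBuffers(false) << "]\n";
}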
---
 src/armnn/layers/LstmLayer.cpp                     |   9 +-
 src/armnn/test/CreateWorkload.hpp                  |   8 +-
 src/armnn/test/OptimizerTests.cpp                  |   7 +-
 src/backends/backendsCommon/test/LayerTests.cpp    |   8 +-
 src/backends/backendsCommon/test/LstmTestImpl.hpp  |  23 +-
 src/backends/cl/workloads/ClLstmFloatWorkload.cpp  |   8 +-
 .../neon/workloads/NeonLstmFloatWorkload.cpp       |   8 +-
 src/backends/reference/RefLayerSupport.cpp         |   7 +-
 src/backends/reference/test/RefLayerTests.cpp      |   8 +
 .../reference/workloads/RefLstmFloat32Workload.cpp | 365 ++++++++++++++++++++-
 .../reference/workloads/RefLstmFloat32Workload.hpp |  24 +-
 11 files changed, 426 insertions(+), 49 deletions(-)

diff --git a/src/armnn/layers/LstmLayer.cpp b/src/armnn/layers/LstmLayer.cpp
index 866c837357..bd104d49fe 100644
--- a/src/armnn/layers/LstmLayer.cpp
+++ b/src/armnn/layers/LstmLayer.cpp
@@ -123,14 +123,7 @@ std::vector<TensorShape> LstmLayer::InferOutputShapes(const std::vector<TensorSh
     std::vector<TensorShape> outShapes;
-    if (!m_Param.m_CifgEnabled)
-    {
-        outShapes.push_back(TensorShape({batchSize, numUnits*3}));
-    }
-    else
-    {
-        outShapes.push_back(TensorShape({batchSize, numUnits*4}));
-    }
+    outShapes.push_back(TensorShape({batchSize, numUnits * (m_Param.m_CifgEnabled ? 3 : 4)}));
     outShapes.push_back(TensorShape({batchSize, outputSize}));
     outShapes.push_back(TensorShape({batchSize, numUnits}));
     outShapes.push_back(TensorShape({batchSize, outputSize}));
diff --git a/src/armnn/test/CreateWorkload.hpp b/src/armnn/test/CreateWorkload.hpp
index 07f9079b5d..111df4b328 100644
--- a/src/armnn/test/CreateWorkload.hpp
+++ b/src/armnn/test/CreateWorkload.hpp
@@ -321,12 +321,8 @@ std::unique_ptr<LstmWorkload> CreateLstmWorkloadTest(armnn::IWorkloadFactory& fa
     armnn::TensorInfo lstmTensorInfo1({ batchSize, inputSize }, DataType::Float32);
     armnn::TensorInfo lstmTensorInfo2({ batchSize, numUnits}, DataType::Float32);
     armnn::TensorInfo lstmTensorInfo3({ batchSize, outputSize }, DataType::Float32);
-    armnn::TensorInfo lstmTensorInfoScratchBuff({ batchSize, numUnits*3 }, DataType::Float32);
-    if (layerDesc.m_CifgEnabled)
-    {
-        lstmTensorInfoScratchBuff.SetShape({ batchSize, numUnits*4 });
-    }
-
+    armnn::TensorInfo lstmTensorInfoScratchBuff({ batchSize, numUnits * (layerDesc.m_CifgEnabled ? 3 : 4) },
+                                                DataType::Float32);
     Connect(input, layer, lstmTensorInfo1, 0, 0);
     Connect(cellStateIn, layer, lstmTensorInfo2, 0, 1);
     Connect(outputStateIn, layer, lstmTensorInfo3, 0, 2);
diff --git a/src/armnn/test/OptimizerTests.cpp b/src/armnn/test/OptimizerTests.cpp
index 8bd7d3dbee..30ca52092a 100644
--- a/src/armnn/test/OptimizerTests.cpp
+++ b/src/armnn/test/OptimizerTests.cpp
@@ -154,11 +154,8 @@ void CreateLSTMLayerHelper(Graph &graph, bool CifgEnabled)
     armnn::TensorInfo lstmTensorInfo1({ batchSize, inputSize }, DataType::Float32);
     armnn::TensorInfo lstmTensorInfo2({ batchSize, numUnits}, DataType::Float32);
     armnn::TensorInfo lstmTensorInfo3({ batchSize, outputSize }, DataType::Float32);
-    armnn::TensorInfo lstmTensorInfoScratchBuff({ batchSize, numUnits*3 }, DataType::Float32);
-    if (layerDesc.m_CifgEnabled)
-    {
-        lstmTensorInfoScratchBuff.SetShape({ batchSize, numUnits*4 });
-    }
+    armnn::TensorInfo lstmTensorInfoScratchBuff({ batchSize, numUnits * (layerDesc.m_CifgEnabled ? 3 : 4) },
+                                                DataType::Float32);
 
     Connect(input, layer, lstmTensorInfo1, 0, 0);
     Connect(cellStateIn, layer, lstmTensorInfo2, 0, 1);
diff --git a/src/backends/backendsCommon/test/LayerTests.cpp b/src/backends/backendsCommon/test/LayerTests.cpp
index dad13413b4..bd8b38da01 100755
--- a/src/backends/backendsCommon/test/LayerTests.cpp
+++ b/src/backends/backendsCommon/test/LayerTests.cpp
@@ -925,8 +925,7 @@ LayerTestResult<float, 2> LstmLayerFloat32NoCifgWithPeepholeWithProjectionTest(
         -0.0186926f, 0.0193662f, -0.0115437f, 0.00422612f,
         -0.0345232f, 0.00223253f, -0.00957321f, 0.0210624f,
         0.013331f, 0.0150954f, 0.02168f}));
-    return LstmLayerFloat32NoCifgWithPeepholeWithProjectionTestImpl(
-        workloadFactory, memoryManager, input, expectedOutput);
+    return LstmLayerNoCifgWithPeepholeWithProjectionTestImpl(workloadFactory, memoryManager, input, expectedOutput);
 }
 
 LayerTestResult<float, 2> LstmLayerFloat32NoCifgNoPeepholeNoProjectionTest(
@@ -6684,7 +6683,6 @@ LayerTestResult<uint8_t, 4> BatchToSpaceNdNhwcUintTest1(
     std::vector<unsigned int> blockShape({2, 2});
    std::vector<std::pair<unsigned int, unsigned int>> crops = {{0, 0}, {0, 0}};
 
-    return BatchToSpaceNdHelper<uint8_t, 4, 4>(workloadFactory, memoryManager,
-                                               armnn::DataLayout::NHWC, inputShape, input, blockShape,
-                                               crops, outputShape, expectedOutput);
+    return BatchToSpaceNdHelper<uint8_t, 4, 4>(workloadFactory, memoryManager, armnn::DataLayout::NHWC, inputShape,
+                                               input, blockShape, crops, outputShape, expectedOutput);
 }
diff --git a/src/backends/backendsCommon/test/LstmTestImpl.hpp b/src/backends/backendsCommon/test/LstmTestImpl.hpp
index dfe24aa541..56f40aba84 100644
--- a/src/backends/backendsCommon/test/LstmTestImpl.hpp
+++ b/src/backends/backendsCommon/test/LstmTestImpl.hpp
@@ -34,7 +34,7 @@ LayerTestResult<float, 2> LstmNoCifgNoPeepholeNoProjectionTestImpl(
     armnn::TensorInfo outputStateInTensorInfo({batchSize , outputSize}, armnn::GetDataType<float>());
-    armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 3}, armnn::GetDataType<float>());
+    armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 4}, armnn::GetDataType<float>());
     armnn::TensorInfo cellStateOutTensorInfo({batchSize, numUnits}, armnn::GetDataType<float>());
     armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
     armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
@@ -52,7 +52,7 @@ LayerTestResult<float, 2> LstmNoCifgNoPeepholeNoProjectionTestImpl(
     std::vector<float> outputStateInVector(batchSize * outputSize, 0.f);
     auto outputStateInTensor = MakeTensor<float, 2>(outputStateInTensorInfo, outputStateInVector);
 
-    std::vector<float> scratchBufferVector(batchSize * numUnits * 3, 0.f);
+    std::vector<float> scratchBufferVector(batchSize * numUnits * 4, 0.f);
     auto scratchBufferTensor = MakeTensor<float, 2>(scratchBufferTensorInfo, scratchBufferVector);
 
     std::vector<float> outputStateOutVector(batchSize * outputSize, 0.f);
@@ -153,8 +153,8 @@ LayerTestResult<float, 2> LstmNoCifgNoPeepholeNoProjectionTestImpl(
     armnn::ScopedCpuTensorHandle inputToForgetWeightsTensor(tensorInfo8);
     armnn::ScopedCpuTensorHandle inputToCellWeightsTensor(tensorInfo8);
     armnn::ScopedCpuTensorHandle inputToOutputWeightsTensor(tensorInfo8);
-    armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfo16);
     armnn::ScopedCpuTensorHandle recurrentToInputWeightsTensor(tensorInfo16);
+    armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfo16);
     armnn::ScopedCpuTensorHandle recurrentToCellWeightsTensor(tensorInfo16);
     armnn::ScopedCpuTensorHandle recurrentToOutputWeightsTensor(tensorInfo16);
     armnn::ScopedCpuTensorHandle cellToInputWeightsTensor(tensorInfo4);
@@ -222,11 +222,10 @@
 
 LayerTestResult<float, 2>
-LstmLayerFloat32NoCifgWithPeepholeWithProjectionTestImpl(
-    armnn::IWorkloadFactory& workloadFactory,
-    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
-    const boost::multi_array<float, 2>& input,
-    const boost::multi_array<float, 2>& outputExpected)
+LstmLayerNoCifgWithPeepholeWithProjectionTestImpl(armnn::IWorkloadFactory& workloadFactory,
+                                                  const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+                                                  const boost::multi_array<float, 2>& input,
+                                                  const boost::multi_array<float, 2>& outputExpected)
 {
     unsigned int batchSize = 2;
     unsigned int outputSize = 16;
@@ -237,8 +236,8 @@ LstmLayerFloat32NoCifgWithPeepholeWithProjectionTestImpl(
     armnn::TensorInfo cellStateInTensorInfo({batchSize , numUnits}, armnn::GetDataType<float>());
     armnn::TensorInfo outputStateInTensorInfo({batchSize , outputSize}, armnn::GetDataType<float>());
-    // Scratch buffer size without CIFG [batchSize, numUnits * 3]
-    armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 3}, armnn::GetDataType<float>());
+    // Scratch buffer size without CIFG [batchSize, numUnits * 4]
+    armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 4}, armnn::GetDataType<float>());
     armnn::TensorInfo cellStateOutTensorInfo({batchSize, numUnits}, armnn::GetDataType<float>());
     armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
     armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
@@ -255,7 +254,7 @@ LstmLayerFloat32NoCifgWithPeepholeWithProjectionTestImpl(
     std::vector<float> outputStateInVector(batchSize * outputSize, 0.f);
     auto outputStateInTensor = MakeTensor<float, 2>(outputStateInTensorInfo, outputStateInVector);
 
-    std::vector<float> scratchBufferVector(batchSize * numUnits * 3, 0.f);
+    std::vector<float> scratchBufferVector(batchSize * numUnits * 4, 0.f);
     auto scratchBufferTensor = MakeTensor<float, 2>(scratchBufferTensorInfo, scratchBufferVector);
 
     std::vector<float> outputStateOutVector(batchSize * outputSize, 0.f);
@@ -955,7 +954,7 @@ LayerTestResult<T, 2> LstmLayerWithCifgWithPeepholeNoProjectionTestImpl(
     armnn::TensorInfo outputStateInTensorInfo({batchSize, outputSize}, armnn::GetDataType<T>());
     armnn::TensorInfo cellStateInTensorInfo({batchSize, cellSize}, armnn::GetDataType<T>());
 
-    unsigned int scratchBufferSize = cifgEnabled ? cellSize * 4 : cellSize * 3;
+    unsigned int scratchBufferSize = cifgEnabled ? cellSize * 3 : cellSize * 4;
     armnn::TensorInfo scratchBufferTensorInfo({batchSize, scratchBufferSize}, armnn::GetDataType<T>());
     armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, armnn::GetDataType<T>());
     armnn::TensorInfo cellStateOutTensorInfo({batchSize, cellSize}, armnn::GetDataType<T>());
diff --git a/src/backends/cl/workloads/ClLstmFloatWorkload.cpp b/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
index 2a664454e1..f4d8974226 100644
--- a/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
@@ -116,14 +116,14 @@ ClLstmFloatWorkload::ClLstmFloatWorkload(const LstmQueueDescriptor &descriptor,
     m_ScratchBuffer = std::make_unique<arm_compute::CLTensor>();
     if (m_Data.m_Parameters.m_CifgEnabled)
     {
-        // 2D tensor with dimensions [num_units * 4, batch_size] with CIFG
-        armnn::TensorInfo scratchBuffer1({ batch_size, num_units * 4 }, DataType::Float32);
+        // 2D tensor with dimensions [num_units * 3, batch_size] with CIFG
+        armnn::TensorInfo scratchBuffer1({ batch_size, num_units * 3 }, DataType::Float32);
         BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer1);
     }
     else
     {
-        // scratch_buffer [num_units * 3, batch_size] without CIFG
-        armnn::TensorInfo scratchBuffer2({ batch_size, num_units * 3 }, DataType::Float32);
+        // scratch_buffer [num_units * 4, batch_size] without CIFG
+        armnn::TensorInfo scratchBuffer2({ batch_size, num_units * 4 }, DataType::Float32);
         BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer2);
     }
 
diff --git a/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp b/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
index d03454b705..1ab269ff56 100644
--- a/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
@@ -114,13 +114,13 @@ NeonLstmFloatWorkload::NeonLstmFloatWorkload(const LstmQueueDescriptor &descript
     if (m_Data.m_Parameters.m_CifgEnabled)
     {
-        // 2D tensor with dimensions [num_units * 4, batch_size] with CIFG
-        armnn::TensorInfo scratchBuffer1({ batch_size, num_units * 4 }, DataType::Float32);
+        // 2D tensor with dimensions [num_units * 3, batch_size] with CIFG
+        armnn::TensorInfo scratchBuffer1({ batch_size, num_units * 3 }, DataType::Float32);
         BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer1);
     }
     else
     {
-        // scratch_buffer [num_units * 3, batch_size] without CIFG
-        armnn::TensorInfo scratchBuffer2({ batch_size, num_units * 3 }, DataType::Float32);
+        // scratch_buffer [num_units * 4, batch_size] without CIFG
+        armnn::TensorInfo scratchBuffer2({ batch_size, num_units * 4 }, DataType::Float32);
         BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer2);
     }
 
diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp
index d6c1e66626..167cba54e8 100644
--- a/src/backends/reference/RefLayerSupport.cpp
+++ b/src/backends/reference/RefLayerSupport.cpp
@@ -278,7 +278,6 @@ bool RefLayerSupport::IsLstmSupported(const TensorInfo& input,
                                       const TensorInfo* cellToOutputWeights,
                                       Optional<std::string&> reasonIfUnsupported) const
 {
-    ignore_unused(input);
     ignore_unused(outputStateIn);
     ignore_unused(cellStateIn);
     ignore_unused(scratchBuffer);
@@ -303,8 +302,10 @@ bool RefLayerSupport::IsLstmSupported(const TensorInfo& input,
     ignore_unused(projectionBias);
     ignore_unused(cellToForgetWeights);
     ignore_unused(cellToOutputWeights);
-    ignore_unused(reasonIfUnsupported);
-    return false;
+    return IsSupportedForDataTypeRef(reasonIfUnsupported,
+                                     input.GetDataType(),
+                                     &TrueFunc<>,
+                                     &FalseFuncU8<>);
 }
 
 bool RefLayerSupport::IsMeanSupported(const TensorInfo& input,
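The new return value follows the pattern used by the other Ref backend checks: supported for Float32, unsupported for the quantised U8 data type. A rough sketch of how such a dispatch works (editor's illustration; names and signature are hypothetical, not ArmNN's actual helpers):

#include <string>

enum class SketchDataType { Float32, QuantisedAsymm8 };

// Hypothetical stand-ins for TrueFunc<> / FalseFuncU8<>.
bool SupportedFunc(std::string*) { return true; }
bool UnsupportedU8Func(std::string* reason)
{
    if (reason) { *reason = "Reference LSTM: U8 input is not supported"; }
    return false;
}

// The predicate picks one of two callables based on the tensor's data type.
template <typename FloatFunc, typename Uint8Func>
bool IsSupportedForDataTypeSketch(std::string* reasonIfUnsupported,
                                  SketchDataType dataType,
                                  FloatFunc floatFunc,
                                  Uint8Func uint8Func)
{
    return dataType == SketchDataType::Float32 ? floatFunc(reasonIfUnsupported)
                                               : uint8Func(reasonIfUnsupported);
}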
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index 4ff5cf2a2e..35981ea4b3 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -336,6 +336,14 @@ ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet1, PermuteFloat32ValueSet1Test)
 ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet2, PermuteFloat32ValueSet2Test)
 ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet3, PermuteFloat32ValueSet3Test)
 
+// Lstm
+ARMNN_AUTO_TEST_CASE(LstmLayerFloat32WithCifgWithPeepholeNoProjection,
+                     LstmLayerFloat32WithCifgWithPeepholeNoProjectionTest)
+ARMNN_AUTO_TEST_CASE(LstmLayerFloat32NoCifgNoPeepholeNoProjection,
+                     LstmLayerFloat32NoCifgNoPeepholeNoProjectionTest)
+ARMNN_AUTO_TEST_CASE(LstmLayerFloat32NoCifgWithPeepholeWithProjection,
+                     LstmLayerFloat32NoCifgWithPeepholeWithProjectionTest)
+
 // Convert from Float16 to Float32
 ARMNN_AUTO_TEST_CASE(SimpleConvertFp16ToFp32, SimpleConvertFp16ToFp32Test)
 // Convert from Float32 to Float16
diff --git a/src/backends/reference/workloads/RefLstmFloat32Workload.cpp b/src/backends/reference/workloads/RefLstmFloat32Workload.cpp
index 50ff605701..c697b66658 100644
--- a/src/backends/reference/workloads/RefLstmFloat32Workload.cpp
+++ b/src/backends/reference/workloads/RefLstmFloat32Workload.cpp
@@ -4,13 +4,376 @@
 //
 
 #include "RefLstmFloat32Workload.hpp"
+#include "RefWorkloadUtils.hpp"
+#include "Activation.hpp"
+
+namespace
+{
+
+// Helper functions ported from the Android code base
+// Refer to: android/external/tensorflow/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
+
+void MatrixBatchVectorMultiplyAccumulate(const float* matrix,
+                                         uint32_t mRows,
+                                         uint32_t mCols,
+                                         const float* vector,
+                                         uint32_t nBatch,
+                                         float* outResult,
+                                         int resultStride = 1)
+{
+    float* resultInBatch = outResult;
+    for (uint32_t b = 0; b < nBatch; b++)
+    {
+        const float* matrixPtr = matrix;
+        for (uint32_t r = 0; r < mRows; r++)
+        {
+            const float* vectorInBatch = vector + b * mCols;
+            for (uint32_t c = 0; c < mCols; c++)
+            {
+                *resultInBatch += *matrixPtr++ * *vectorInBatch++;
+            }
+            resultInBatch += resultStride;
+        }
+    }
+}
+
+void VectorBatchVectorAssign(const float* vector,
+                             uint32_t vSize,
+                             uint32_t nBatch,
+                             float* outBatchVector)
+{
+    for (uint32_t b = 0; b < nBatch; b++)
+    {
+        memcpy(outBatchVector + b * vSize, vector, vSize * sizeof(float));
+    }
+}
+
+void VectorBatchVectorCwiseProductAccumulate(const float* vector,
+                                             uint32_t vSize,
+                                             const float* batchVector,
+                                             uint32_t nBatch,
+                                             float* outResult)
+{
+    for (uint32_t b = 0; b < nBatch; b++)
+    {
+        for (uint32_t v = 0; v < vSize; v++)
+        {
+            *outResult++ += vector[v] * *batchVector++;
+        }
+    }
+}
+
+void Sub1Vector(const float* vector,
+                uint32_t vSize,
+                float* result)
+{
+    for (uint32_t v = 0; v < vSize; v++)
+    {
+        *result++ = 1.0f - *vector++;
+    }
+}
+
+void VectorVectorCwiseProduct(const float* vector1,
+                              const float* vector2,
+                              uint32_t vSize,
+                              float* outResult)
+{
+    for (uint32_t v = 0; v < vSize; v++)
+    {
+        *outResult++ = *vector1++ * *vector2++;
+    }
+}
+
+void VectorVectorCwiseProductAccumulate(const float* vector1,
+                                        const float* vector2,
+                                        uint32_t vSize,
+                                        float* outResult)
+{
+    for (uint32_t v = 0; v < vSize; v++)
+    {
+        *outResult++ += *vector1++ * *vector2++;
+    }
+}
+
+float Clip(float f,
+           float absLimit)
+{
+    float result = (absLimit < f) ? absLimit : f;
+    result = (-absLimit > result) ? -absLimit : result;
+    return result;
+}
+
+void ClipVector(const float* vector,
+                uint32_t vSize,
+                float absLimit,
+                float* outResult)
+{
+    for (uint32_t v = 0; v < vSize; v++)
+    {
+        *outResult++ = Clip(*vector++, absLimit);
+    }
+}
+
+void CopyVector(const float* vector,
+                uint32_t vSize,
+                float* outResult)
+{
+    memcpy(outResult, vector, vSize * sizeof(float));
+}
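A quick sanity check of the core helper's semantics (editor's illustration, not part of the patch; it assumes the definitions above are in scope). The matrix is row-major with mRows = nCell and mCols = nInput, and the results for each batch are laid out consecutively:

#include <cassert>

int main()
{
    // 2x3 row-major matrix: row 0 = {1, 2, 3}, row 1 = {4, 5, 6}.
    const float matrix[6]  = {1, 2, 3, 4, 5, 6};
    // Two batches of 3-element vectors.
    const float vectors[6] = {1, 1, 1,    // batch 0
                              0, 1, 0};   // batch 1
    float result[4] = {10, 10, 10, 10};   // pre-filled to show accumulation

    MatrixBatchVectorMultiplyAccumulate(matrix, 2, 3, vectors, 2, result);

    assert(result[0] == 16.0f); // batch 0, row 0: 10 + (1 + 2 + 3)
    assert(result[1] == 25.0f); // batch 0, row 1: 10 + (4 + 5 + 6)
    assert(result[2] == 12.0f); // batch 1, row 0: 10 + 2
    assert(result[3] == 15.0f); // batch 1, row 1: 10 + 5
}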
+
+void SetActivationParameters(uint32_t activation,
+                             armnn::ActivationFunction& outArmnnActivation,
+                             float& outA,
+                             float& outB)
+{
+    switch (activation)
+    {
+        case 0: // None
+            outA = 0;
+            outB = 0;
+            return;
+
+        case 1: // Relu
+            outArmnnActivation = armnn::ActivationFunction::ReLu;
+            outA = 0;
+            outB = 0;
+            return;
+
+        case 3: // Relu6
+            outArmnnActivation = armnn::ActivationFunction::BoundedReLu;
+            outA = 6;
+            outB = 0;
+            return;
+
+        case 4: // Tanh
+            outArmnnActivation = armnn::ActivationFunction::TanH;
+            outA = 1;
+            outB = 1;
+            return;
+
+        case 6: // Sigmoid
+            outArmnnActivation = armnn::ActivationFunction::Sigmoid;
+            outA = 0;
+            outB = 0;
+            return;
+
+        default:
+            throw armnn::Exception("Unsupported activation function: " + std::to_string(activation));
+    }
+}
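The integer codes handled here follow the Android NN / TensorFlow Lite fused-activation encoding (0 = none, 1 = relu, 3 = relu6, 4 = tanh, 6 = sigmoid; an observation from the NNAPI LSTM documentation, not something defined by this patch). An editor's usage sketch:

// Illustration only, not part of the patch:
armnn::ActivationFunction fn = armnn::ActivationFunction::Sigmoid;
float a = 0.0f;
float b = 0.0f;
SetActivationParameters(4, fn, a, b); // 4 == tanh in the TfLite encoding
// fn is now ActivationFunction::TanH with a == 1 and b == 1, i.e. plain tanh.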
+
+std::unique_ptr<armnn::ScopedCpuTensorHandle> AssignScopedCpuTensorHandle(const armnn::ConstCpuTensorHandle* ptr)
+{
+    if (!ptr)
+    {
+        return nullptr;
+    }
+
+    return std::make_unique<armnn::ScopedCpuTensorHandle>(*ptr);
+}
+
+} // anonymous namespace
 
 namespace armnn
 {
 
+RefLstmFloat32Workload::RefLstmFloat32Workload(const LstmQueueDescriptor &descriptor, const WorkloadInfo &info)
+    : Float32Workload<LstmQueueDescriptor>(descriptor, info)
+    , m_InputToInputWeightsTensor     (AssignScopedCpuTensorHandle(descriptor.m_InputToInputWeights))
+    , m_InputToForgetWeightsTensor    (AssignScopedCpuTensorHandle(descriptor.m_InputToForgetWeights))
+    , m_InputToCellWeightsTensor      (AssignScopedCpuTensorHandle(descriptor.m_InputToCellWeights))
+    , m_InputToOutputWeightsTensor    (AssignScopedCpuTensorHandle(descriptor.m_InputToOutputWeights))
+    , m_RecurrentToInputWeightsTensor (AssignScopedCpuTensorHandle(descriptor.m_RecurrentToInputWeights))
+    , m_RecurrentToForgetWeightsTensor(AssignScopedCpuTensorHandle(descriptor.m_RecurrentToForgetWeights))
+    , m_RecurrentToCellWeightsTensor  (AssignScopedCpuTensorHandle(descriptor.m_RecurrentToCellWeights))
+    , m_RecurrentToOutputWeightsTensor(AssignScopedCpuTensorHandle(descriptor.m_RecurrentToOutputWeights))
+    , m_CellToInputWeightsTensor      (AssignScopedCpuTensorHandle(descriptor.m_CellToInputWeights))
+    , m_CellToForgetWeightsTensor     (AssignScopedCpuTensorHandle(descriptor.m_CellToForgetWeights))
+    , m_CellToOutputWeightsTensor     (AssignScopedCpuTensorHandle(descriptor.m_CellToOutputWeights))
+    , m_InputGateBiasTensor           (AssignScopedCpuTensorHandle(descriptor.m_InputGateBias))
+    , m_ForgetGateBiasTensor          (AssignScopedCpuTensorHandle(descriptor.m_ForgetGateBias))
+    , m_CellBiasTensor                (AssignScopedCpuTensorHandle(descriptor.m_CellBias))
+    , m_OutputGateBiasTensor          (AssignScopedCpuTensorHandle(descriptor.m_OutputGateBias))
+    , m_ProjectionWeightsTensor       (AssignScopedCpuTensorHandle(descriptor.m_ProjectionWeights))
+    , m_ProjectionBiasTensor          (AssignScopedCpuTensorHandle(descriptor.m_ProjectionBias))
+{}
+
 void RefLstmFloat32Workload::Execute() const
 {
-    throw armnn::Exception("No implementation of Lstm in the Ref backend!");
+    // This is a port of the LSTM::Eval() method in the Android code base
+    // Refer to: android/frameworks/ml/nn/common/operations/LSTM.cpp
+
+    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
+    const TensorShape& inputShape = inputInfo.GetShape();
+
+    float* scratchBuffer  = GetOutputTensorDataFloat(0, m_Data);
+    float* outputStateOut = GetOutputTensorDataFloat(1, m_Data);
+    float* cellStateOut   = GetOutputTensorDataFloat(2, m_Data);
+    float* output         = GetOutputTensorDataFloat(3, m_Data);
+
+    const float* inputData     = GetInputTensorDataFloat(0, m_Data);
+    const float* outputStateIn = GetInputTensorDataFloat(1, m_Data);
+    const float* cellStateIn   = GetInputTensorDataFloat(2, m_Data);
+
+    const uint32_t nBatch = inputShape[0];
+    const uint32_t nInput = inputShape[1];
+
+    const uint32_t nCell   = m_InputToOutputWeightsTensor->GetShape()[0];
+    const uint32_t nOutput = m_RecurrentToOutputWeightsTensor->GetShape()[1];
+
+    const bool useCifg     = m_Data.m_Parameters.m_CifgEnabled;
+    const bool usePeephole = m_Data.m_Parameters.m_PeepholeEnabled;
+
+    // Index the scratch buffer pointers into the global scratch buffer.
+    float* inputGateScratch  = nullptr;
+    float* cellScratch       = nullptr;
+    float* forgetGateScratch = nullptr;
+    float* outputGateScratch = nullptr;
+
+    if (useCifg)
+    {
+        cellScratch       = scratchBuffer + 0 * nCell * nBatch;
+        forgetGateScratch = scratchBuffer + 1 * nCell * nBatch;
+        outputGateScratch = scratchBuffer + 2 * nCell * nBatch;
+    }
+    else
+    {
+        inputGateScratch  = scratchBuffer + 0 * nCell * nBatch;
+        cellScratch       = scratchBuffer + 1 * nCell * nBatch;
+        forgetGateScratch = scratchBuffer + 2 * nCell * nBatch;
+        outputGateScratch = scratchBuffer + 3 * nCell * nBatch;
+    }
+
+    // Initialize scratch buffers with bias.
+    if (!useCifg)
+    {
+        VectorBatchVectorAssign(m_InputGateBiasTensor->GetTensor<float>(),
+                                nCell, nBatch, inputGateScratch);
+    }
+    VectorBatchVectorAssign(m_ForgetGateBiasTensor->GetTensor<float>(),
+                            nCell, nBatch, forgetGateScratch);
+    VectorBatchVectorAssign(m_CellBiasTensor->GetTensor<float>(),
+                            nCell, nBatch, cellScratch);
+    VectorBatchVectorAssign(m_OutputGateBiasTensor->GetTensor<float>(),
+                            nCell, nBatch, outputGateScratch);
+
+    // For each batch and cell: compute input_weight * input.
+    if (!useCifg)
+    {
+        MatrixBatchVectorMultiplyAccumulate(m_InputToInputWeightsTensor->GetTensor<float>(),
+                                            nCell, nInput, inputData, nBatch, inputGateScratch);
+    }
+    MatrixBatchVectorMultiplyAccumulate(m_InputToForgetWeightsTensor->GetTensor<float>(),
+                                        nCell, nInput, inputData, nBatch, forgetGateScratch);
+    MatrixBatchVectorMultiplyAccumulate(m_InputToCellWeightsTensor->GetTensor<float>(),
+                                        nCell, nInput, inputData, nBatch, cellScratch);
+    MatrixBatchVectorMultiplyAccumulate(m_InputToOutputWeightsTensor->GetTensor<float>(),
+                                        nCell, nInput, inputData, nBatch, outputGateScratch);
+
+    // For each batch and cell: compute recurrent_weight * output_state.
+    if (!useCifg)
+    {
+        MatrixBatchVectorMultiplyAccumulate(m_RecurrentToInputWeightsTensor->GetTensor<float>(),
+                                            nCell, nOutput, outputStateIn, nBatch, inputGateScratch);
+    }
+    MatrixBatchVectorMultiplyAccumulate(m_RecurrentToForgetWeightsTensor->GetTensor<float>(),
+                                        nCell, nOutput, outputStateIn, nBatch, forgetGateScratch);
+    MatrixBatchVectorMultiplyAccumulate(m_RecurrentToCellWeightsTensor->GetTensor<float>(),
+                                        nCell, nOutput, outputStateIn, nBatch, cellScratch);
+    MatrixBatchVectorMultiplyAccumulate(m_RecurrentToOutputWeightsTensor->GetTensor<float>(),
+                                        nCell, nOutput, outputStateIn, nBatch, outputGateScratch);
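After the two accumulation passes above, each gate scratch buffer holds bias_g + W_g * x_t + R_g * h_(t-1) for every batch. The rest of Execute() applies the standard LSTM cell equations; as a reading aid (editor's summary in common LSTM notation, with (.) denoting element-wise multiplication):

//   i_t = sigmoid(gate_i + p_i (.) c_(t-1))      (skipped with CIFG)
//   f_t = sigmoid(gate_f + p_f (.) c_(t-1))
//   g_t = act(gate_c)                            (act set by m_ActivationFunc)
//   c_t = clip(f_t (.) c_(t-1) + i_t (.) g_t, m_ClippingThresCell)
//         with i_t = 1 - f_t when CIFG is enabled
//   o_t = sigmoid(gate_o + p_o (.) c_t)
//   h_t = o_t (.) act(c_t)
// where gate_* are the pre-activation scratch buffers and p_* the peephole weights.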
+
+    // For each batch and cell: update input gate.
+    if (!useCifg)
+    {
+        if (usePeephole)
+        {
+            VectorBatchVectorCwiseProductAccumulate(m_CellToInputWeightsTensor->GetTensor<float>(),
+                                                    nCell, cellStateIn, nBatch, inputGateScratch);
+        }
+        Activation(inputGateScratch, inputGateScratch,
+                   TensorInfo({nCell, nBatch}, DataType::Float32),
+                   ActivationFunction::Sigmoid, 0, 0);
+    }
+
+    // For each batch and cell: update forget gate.
+    if (usePeephole)
+    {
+        VectorBatchVectorCwiseProductAccumulate(m_CellToForgetWeightsTensor->GetTensor<float>(), nCell,
+                                                cellStateIn, nBatch, forgetGateScratch);
+    }
+    Activation(forgetGateScratch, forgetGateScratch,
+               TensorInfo({nCell, nBatch}, DataType::Float32),
+               ActivationFunction::Sigmoid, 0, 0);
+
+    // For each batch and cell: update the cell.
+    VectorVectorCwiseProduct(forgetGateScratch, cellStateIn, nBatch * nCell, cellStateOut);
+
+    ActivationFunction armnnActivationFunc = ActivationFunction::Sigmoid;
+    float a = 0;
+    float b = 0;
+    SetActivationParameters(m_Data.m_Parameters.m_ActivationFunc, armnnActivationFunc, a, b);
+
+    if (m_Data.m_Parameters.m_ActivationFunc > 0)
+    {
+        Activation(cellScratch, cellScratch,
+                   TensorInfo({nCell, nBatch}, DataType::Float32),
+                   armnnActivationFunc, a, b);
+    }
+    if (useCifg)
+    {
+        Sub1Vector(forgetGateScratch, nBatch * nCell, forgetGateScratch);
+        VectorVectorCwiseProductAccumulate(cellScratch, forgetGateScratch, nBatch * nCell, cellStateOut);
+    }
+    else
+    {
+        VectorVectorCwiseProductAccumulate(cellScratch, inputGateScratch, nBatch * nCell, cellStateOut);
+    }
+    if (m_Data.m_Parameters.m_ClippingThresCell > 0.0)
+    {
+        ClipVector(cellStateOut, nBatch * nCell, m_Data.m_Parameters.m_ClippingThresCell, cellStateOut);
+    }
+
+    // For each batch and cell: update the output gate.
+    if (usePeephole)
+    {
+        VectorBatchVectorCwiseProductAccumulate(m_CellToOutputWeightsTensor->GetTensor<float>(),
+                                                nCell, cellStateOut, nBatch, outputGateScratch);
+    }
+    Activation(outputGateScratch, outputGateScratch,
+               TensorInfo({nCell, nBatch}, DataType::Float32),
+               ActivationFunction::Sigmoid, 0, 0);
+
+    if (m_Data.m_Parameters.m_ActivationFunc > 0)
+    {
+        Activation(cellStateOut, cellScratch,
+                   TensorInfo({nCell, nBatch}, DataType::Float32),
+                   armnnActivationFunc, a, b);
+    }
+    VectorVectorCwiseProduct(outputGateScratch, cellScratch, nBatch * nCell, outputGateScratch);
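With projection enabled, the nCell-wide gated activation is mapped down to nOutput elements; either way the final output doubles as the next step's output state (editor's note, not part of the patch):

//   output = clip(W_proj * h_t + b_proj, m_ClippingThresProj)   if projection is enabled
//   output = h_t                                                otherwise (nOutput == nCell)
// The final CopyVector below makes output_state_out = output, i.e. the
// h_(t-1) fed back in on the next invocation.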
+
+    // For each batch: update the projection and output_state.
+    if (m_Data.m_Parameters.m_ProjectionEnabled)
+    {
+        if (m_ProjectionBiasTensor)
+        {
+            VectorBatchVectorAssign(m_ProjectionBiasTensor->GetTensor<float>(),
+                                    nOutput, nBatch, output);
+        }
+        MatrixBatchVectorMultiplyAccumulate(m_ProjectionWeightsTensor->GetTensor<float>(),
+                                            nOutput, nCell, outputGateScratch, nBatch, output);
+
+        if (m_Data.m_Parameters.m_ClippingThresProj > 0.0)
+        {
+            ClipVector(output, nBatch * nOutput, m_Data.m_Parameters.m_ClippingThresProj, output);
+        }
+    }
+    else
+    {
+        CopyVector(outputGateScratch, nBatch * nOutput, output);
+    }
+
+    CopyVector(output, nBatch * nOutput, outputStateOut);
 }
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefLstmFloat32Workload.hpp b/src/backends/reference/workloads/RefLstmFloat32Workload.hpp
index 1f634d3ca1..a2dead8b9c 100644
--- a/src/backends/reference/workloads/RefLstmFloat32Workload.hpp
+++ b/src/backends/reference/workloads/RefLstmFloat32Workload.hpp
@@ -5,6 +5,8 @@
 
 #pragma once
 
+#include <backendsCommon/CpuTensorHandle.hpp>
+
 #include <backendsCommon/Workload.hpp>
 #include <backendsCommon/WorkloadData.hpp>
 
@@ -14,8 +16,28 @@ namespace armnn
 class RefLstmFloat32Workload : public Float32Workload<LstmQueueDescriptor>
 {
 public:
-    using Float32Workload<LstmQueueDescriptor>::Float32Workload;
+    explicit RefLstmFloat32Workload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info);
+
     virtual void Execute() const override;
+
+private:
+    std::unique_ptr<ScopedCpuTensorHandle> m_InputToInputWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_InputToForgetWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_InputToCellWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_InputToOutputWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_RecurrentToInputWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_RecurrentToForgetWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_RecurrentToCellWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_RecurrentToOutputWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_CellToInputWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_CellToForgetWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_CellToOutputWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_InputGateBiasTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_ForgetGateBiasTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_CellBiasTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_OutputGateBiasTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_ProjectionWeightsTensor;
+    std::unique_ptr<ScopedCpuTensorHandle> m_ProjectionBiasTensor;
 };
 
 } //namespace armnn
-- 
cgit v1.2.1