author     Matteo Martincigh <matteo.martincigh@arm.com>   2018-11-14 12:39:55 +0000
committer  Matteo Martincigh <matteo.martincigh@arm.com>   2018-11-16 09:05:24 +0000
commit     a65b7aeafc0ef6acf40e4a8a6d36206bf53d717c
tree       d62257a911f3a4a4ed99243d4860a2453e95ec98
parent     74ba3dc7113e51cf11ab772ee1eb030c07a7dda5
download   armnn-a65b7aeafc0ef6acf40e4a8a6d36206bf53d717c.tar.gz
IVGCVSW-2092 Port LSTMCell::Eval to ArmNN
* Ported Google's LSTM implementation to RefLstmFloat32Workload
* Fixed the code throughout because of an error in the docs around the
  scratch buffer size
* Updated IsLstmSupported
* Added the unit tests

!android-nn-driver:127

Change-Id: I5577b7e39ca52df1a7f102a9b437df6aa99520b6
-rw-r--r--  src/armnn/layers/LstmLayer.cpp                                |   9
-rw-r--r--  src/armnn/test/CreateWorkload.hpp                             |   8
-rw-r--r--  src/armnn/test/OptimizerTests.cpp                             |   7
-rwxr-xr-x  src/backends/backendsCommon/test/LayerTests.cpp               |   8
-rw-r--r--  src/backends/backendsCommon/test/LstmTestImpl.hpp             |  23
-rw-r--r--  src/backends/cl/workloads/ClLstmFloatWorkload.cpp             |   8
-rw-r--r--  src/backends/neon/workloads/NeonLstmFloatWorkload.cpp         |   4
-rw-r--r--  src/backends/reference/RefLayerSupport.cpp                    |   7
-rw-r--r--  src/backends/reference/test/RefLayerTests.cpp                 |   8
-rw-r--r--  src/backends/reference/workloads/RefLstmFloat32Workload.cpp   | 365
-rw-r--r--  src/backends/reference/workloads/RefLstmFloat32Workload.hpp   |  24

11 files changed, 424 insertions(+), 47 deletions(-)
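The scratch buffer fix that runs through this patch follows one rule: the buffer holds one [batchSize, numUnits] slab per gate computed through it, giving numUnits * 4 without CIFG (input, cell, forget and output gates) and numUnits * 3 with CIFG, where the input gate slab is dropped. The docs had the two cases swapped. A minimal sketch of a sizing helper that captures the corrected rule (the helper name is illustrative, not part of the patch):

    #include <armnn/Tensor.hpp>

    // Hypothetical helper: one [batchSize, numUnits] slab per gate that passes
    // through the scratch buffer; with CIFG the input gate is derived from the
    // forget gate, so only three slabs are needed.
    armnn::TensorInfo MakeLstmScratchBufferInfo(unsigned int batchSize,
                                                unsigned int numUnits,
                                                bool cifgEnabled)
    {
        const unsigned int numGates = cifgEnabled ? 3u : 4u;
        return armnn::TensorInfo({ batchSize, numUnits * numGates },
                                 armnn::DataType::Float32);
    }

This matches the layout used in RefLstmFloat32Workload::Execute() below, where the per-gate scratch pointers are offset by multiples of nCell * nBatch.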
diff --git a/src/armnn/layers/LstmLayer.cpp b/src/armnn/layers/LstmLayer.cpp
index 866c837357..bd104d49fe 100644
--- a/src/armnn/layers/LstmLayer.cpp
+++ b/src/armnn/layers/LstmLayer.cpp
@@ -123,14 +123,7 @@ std::vector<TensorShape> LstmLayer::InferOutputShapes(const std::vector<TensorSh
unsigned int numUnits = inputShapes[2][1];
std::vector<TensorShape> outShapes;
- if (!m_Param.m_CifgEnabled)
- {
- outShapes.push_back(TensorShape({batchSize, numUnits*3}));
- }
- else
- {
- outShapes.push_back(TensorShape({batchSize, numUnits*4}));
- }
+ outShapes.push_back(TensorShape({batchSize, numUnits * (m_Param.m_CifgEnabled ? 3 : 4)}));
outShapes.push_back(TensorShape({batchSize, outputSize}));
outShapes.push_back(TensorShape({batchSize, numUnits}));
outShapes.push_back(TensorShape({batchSize, outputSize}));
diff --git a/src/armnn/test/CreateWorkload.hpp b/src/armnn/test/CreateWorkload.hpp
index 07f9079b5d..111df4b328 100644
--- a/src/armnn/test/CreateWorkload.hpp
+++ b/src/armnn/test/CreateWorkload.hpp
@@ -321,12 +321,8 @@ std::unique_ptr<LstmWorkload> CreateLstmWorkloadTest(armnn::IWorkloadFactory& fa
armnn::TensorInfo lstmTensorInfo1({ batchSize, inputSize }, DataType::Float32);
armnn::TensorInfo lstmTensorInfo2({ batchSize, numUnits}, DataType::Float32);
armnn::TensorInfo lstmTensorInfo3({ batchSize, outputSize }, DataType::Float32);
- armnn::TensorInfo lstmTensorInfoScratchBuff({ batchSize, numUnits*3 }, DataType::Float32);
- if (layerDesc.m_CifgEnabled)
- {
- lstmTensorInfoScratchBuff.SetShape({ batchSize, numUnits*4 });
- }
-
+ armnn::TensorInfo lstmTensorInfoScratchBuff({ batchSize, numUnits * (layerDesc.m_CifgEnabled ? 3 : 4) },
+ DataType::Float32);
Connect(input, layer, lstmTensorInfo1, 0, 0);
Connect(cellStateIn, layer, lstmTensorInfo2, 0, 1);
Connect(outputStateIn, layer, lstmTensorInfo3, 0, 2);
diff --git a/src/armnn/test/OptimizerTests.cpp b/src/armnn/test/OptimizerTests.cpp
index 8bd7d3dbee..30ca52092a 100644
--- a/src/armnn/test/OptimizerTests.cpp
+++ b/src/armnn/test/OptimizerTests.cpp
@@ -154,11 +154,8 @@ void CreateLSTMLayerHelper(Graph &graph, bool CifgEnabled)
armnn::TensorInfo lstmTensorInfo1({ batchSize, inputSize }, DataType::Float32);
armnn::TensorInfo lstmTensorInfo2({ batchSize, numUnits}, DataType::Float32);
armnn::TensorInfo lstmTensorInfo3({ batchSize, outputSize }, DataType::Float32);
- armnn::TensorInfo lstmTensorInfoScratchBuff({ batchSize, numUnits*3 }, DataType::Float32);
- if (layerDesc.m_CifgEnabled)
- {
- lstmTensorInfoScratchBuff.SetShape({ batchSize, numUnits*4 });
- }
+ armnn::TensorInfo lstmTensorInfoScratchBuff({ batchSize, numUnits * (layerDesc.m_CifgEnabled ? 3 : 4) },
+ DataType::Float32);
Connect(input, layer, lstmTensorInfo1, 0, 0);
Connect(cellStateIn, layer, lstmTensorInfo2, 0, 1);
diff --git a/src/backends/backendsCommon/test/LayerTests.cpp b/src/backends/backendsCommon/test/LayerTests.cpp
index dad13413b4..bd8b38da01 100755
--- a/src/backends/backendsCommon/test/LayerTests.cpp
+++ b/src/backends/backendsCommon/test/LayerTests.cpp
@@ -925,8 +925,7 @@ LayerTestResult<float, 2> LstmLayerFloat32NoCifgWithPeepholeWithProjectionTest(
-0.0186926f, 0.0193662f, -0.0115437f, 0.00422612f, -0.0345232f,
0.00223253f, -0.00957321f, 0.0210624f, 0.013331f, 0.0150954f,
0.02168f}));
- return LstmLayerFloat32NoCifgWithPeepholeWithProjectionTestImpl(
- workloadFactory, memoryManager, input, expectedOutput);
+ return LstmLayerNoCifgWithPeepholeWithProjectionTestImpl(workloadFactory, memoryManager, input, expectedOutput);
}
LayerTestResult<float, 2> LstmLayerFloat32NoCifgNoPeepholeNoProjectionTest(
@@ -6684,7 +6683,6 @@ LayerTestResult<uint8_t, 4> BatchToSpaceNdNhwcUintTest1(
std::vector<unsigned int> blockShape({2, 2});
std::vector<std::pair<unsigned int, unsigned int>> crops = {{0, 0}, {0, 0}};
- return BatchToSpaceNdHelper<uint8_t, 4, 4>(workloadFactory, memoryManager,
- armnn::DataLayout::NHWC, inputShape, input, blockShape,
- crops, outputShape, expectedOutput);
+ return BatchToSpaceNdHelper<uint8_t, 4, 4>(workloadFactory, memoryManager, armnn::DataLayout::NHWC, inputShape,
+ input, blockShape, crops, outputShape, expectedOutput);
}
diff --git a/src/backends/backendsCommon/test/LstmTestImpl.hpp b/src/backends/backendsCommon/test/LstmTestImpl.hpp
index dfe24aa541..56f40aba84 100644
--- a/src/backends/backendsCommon/test/LstmTestImpl.hpp
+++ b/src/backends/backendsCommon/test/LstmTestImpl.hpp
@@ -34,7 +34,7 @@ LayerTestResult<float, 2> LstmNoCifgNoPeepholeNoProjectionTestImpl(
armnn::TensorInfo outputStateInTensorInfo({batchSize , outputSize}, armnn::GetDataType<float>());
- armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 3}, armnn::GetDataType<float>());
+ armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 4}, armnn::GetDataType<float>());
armnn::TensorInfo cellStateOutTensorInfo({batchSize, numUnits}, armnn::GetDataType<float>());
armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
@@ -52,7 +52,7 @@ LayerTestResult<float, 2> LstmNoCifgNoPeepholeNoProjectionTestImpl(
std::vector<float> outputStateInVector(batchSize * outputSize, 0.f);
auto outputStateInTensor = MakeTensor<float,2>(outputStateInTensorInfo, outputStateInVector);
- std::vector<float> scratchBufferVector(batchSize * numUnits * 3, 0.f);
+ std::vector<float> scratchBufferVector(batchSize * numUnits * 4, 0.f);
auto scratchBufferTensor = MakeTensor<float,2>(scratchBufferTensorInfo, scratchBufferVector);
std::vector<float> outputStateOutVector(batchSize * outputSize, 0.f);
@@ -153,8 +153,8 @@ LayerTestResult<float, 2> LstmNoCifgNoPeepholeNoProjectionTestImpl(
armnn::ScopedCpuTensorHandle inputToForgetWeightsTensor(tensorInfo8);
armnn::ScopedCpuTensorHandle inputToCellWeightsTensor(tensorInfo8);
armnn::ScopedCpuTensorHandle inputToOutputWeightsTensor(tensorInfo8);
- armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfo16);
armnn::ScopedCpuTensorHandle recurrentToInputWeightsTensor(tensorInfo16);
+ armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfo16);
armnn::ScopedCpuTensorHandle recurrentToCellWeightsTensor(tensorInfo16);
armnn::ScopedCpuTensorHandle recurrentToOutputWeightsTensor(tensorInfo16);
armnn::ScopedCpuTensorHandle cellToInputWeightsTensor(tensorInfo4);
@@ -222,11 +222,10 @@ LayerTestResult<float, 2> LstmNoCifgNoPeepholeNoProjectionTestImpl(
LayerTestResult<float, 2>
-LstmLayerFloat32NoCifgWithPeepholeWithProjectionTestImpl(
- armnn::IWorkloadFactory& workloadFactory,
- const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
- const boost::multi_array<float, 2>& input,
- const boost::multi_array<float, 2>& outputExpected)
+LstmLayerNoCifgWithPeepholeWithProjectionTestImpl(armnn::IWorkloadFactory& workloadFactory,
+ const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+ const boost::multi_array<float, 2>& input,
+ const boost::multi_array<float, 2>& outputExpected)
{
unsigned int batchSize = 2;
unsigned int outputSize = 16;
@@ -237,8 +236,8 @@ LstmLayerFloat32NoCifgWithPeepholeWithProjectionTestImpl(
armnn::TensorInfo cellStateInTensorInfo({batchSize , numUnits}, armnn::GetDataType<float>());
armnn::TensorInfo outputStateInTensorInfo({batchSize , outputSize}, armnn::GetDataType<float>());
- // Scratch buffer size without CIFG [batchSize, numUnits * 3]
- armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 3}, armnn::GetDataType<float>());
+ // Scratch buffer size without CIFG [batchSize, numUnits * 4]
+ armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 4}, armnn::GetDataType<float>());
armnn::TensorInfo cellStateOutTensorInfo({batchSize, numUnits}, armnn::GetDataType<float>());
armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
@@ -255,7 +254,7 @@ LstmLayerFloat32NoCifgWithPeepholeWithProjectionTestImpl(
std::vector<float> outputStateInVector(batchSize * outputSize, 0.f);
auto outputStateInTensor = MakeTensor<float,2>(outputStateInTensorInfo, outputStateInVector);
- std::vector<float> scratchBufferVector(batchSize * numUnits * 3, 0.f);
+ std::vector<float> scratchBufferVector(batchSize * numUnits * 4, 0.f);
auto scratchBufferTensor = MakeTensor<float,2>(scratchBufferTensorInfo, scratchBufferVector);
std::vector<float> outputStateOutVector(batchSize * outputSize, 0.f);
@@ -955,7 +954,7 @@ LayerTestResult<float, 2> LstmLayerWithCifgWithPeepholeNoProjectionTestImpl(
armnn::TensorInfo outputStateInTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
armnn::TensorInfo cellStateInTensorInfo({batchSize, cellSize}, armnn::GetDataType<float>());
- unsigned int scratchBufferSize = cifgEnabled ? cellSize * 4 : cellSize * 3;
+ unsigned int scratchBufferSize = cifgEnabled ? cellSize * 3 : cellSize * 4;
armnn::TensorInfo scratchBufferTensorInfo({batchSize, scratchBufferSize}, armnn::GetDataType<float>());
armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
armnn::TensorInfo cellStateOutTensorInfo({batchSize, cellSize}, armnn::GetDataType<float>());
diff --git a/src/backends/cl/workloads/ClLstmFloatWorkload.cpp b/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
index 2a664454e1..f4d8974226 100644
--- a/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
@@ -116,14 +116,14 @@ ClLstmFloatWorkload::ClLstmFloatWorkload(const LstmQueueDescriptor &descriptor,
m_ScratchBuffer = std::make_unique<arm_compute::CLTensor>();
if (m_Data.m_Parameters.m_CifgEnabled)
{
- // 2D tensor with dimensions [num_units * 4, batch_size] with CIFG
- armnn::TensorInfo scratchBuffer1({ batch_size, num_units * 4 }, DataType::Float32);
+ // 2D tensor with dimensions [num_units * 3, batch_size] with CIFG
+ armnn::TensorInfo scratchBuffer1({ batch_size, num_units * 3 }, DataType::Float32);
BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer1);
}
else
{
- // scratch_buffer [num_units * 3, batch_size] without CIFG
- armnn::TensorInfo scratchBuffer2({ batch_size, num_units * 3 }, DataType::Float32);
+ // scratch_buffer [num_units * 4, batch_size] without CIFG
+ armnn::TensorInfo scratchBuffer2({ batch_size, num_units * 4 }, DataType::Float32);
BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer2);
}
diff --git a/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp b/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
index d03454b705..1ab269ff56 100644
--- a/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
@@ -114,13 +114,13 @@ NeonLstmFloatWorkload::NeonLstmFloatWorkload(const LstmQueueDescriptor &descript
if (m_Data.m_Parameters.m_CifgEnabled)
{
// 2D tensor with dimensions [num_units * 4, batch_size] with CIFG
- armnn::TensorInfo scratchBuffer1({ batch_size, num_units * 4 }, DataType::Float32);
+ armnn::TensorInfo scratchBuffer1({ batch_size, num_units * 3 }, DataType::Float32);
BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer1);
}
else
{
// scratch_buffer [num_units * 3, batch_size] without CIFG
- armnn::TensorInfo scratchBuffer2({ batch_size, num_units * 3 }, DataType::Float32);
+ armnn::TensorInfo scratchBuffer2({ batch_size, num_units * 4 }, DataType::Float32);
BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer2);
}
diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp
index d6c1e66626..167cba54e8 100644
--- a/src/backends/reference/RefLayerSupport.cpp
+++ b/src/backends/reference/RefLayerSupport.cpp
@@ -278,7 +278,6 @@ bool RefLayerSupport::IsLstmSupported(const TensorInfo& input,
const TensorInfo* cellToOutputWeights,
Optional<std::string&> reasonIfUnsupported) const
{
- ignore_unused(input);
ignore_unused(outputStateIn);
ignore_unused(cellStateIn);
ignore_unused(scratchBuffer);
@@ -303,8 +302,10 @@ bool RefLayerSupport::IsLstmSupported(const TensorInfo& input,
ignore_unused(projectionBias);
ignore_unused(cellToForgetWeights);
ignore_unused(cellToOutputWeights);
- ignore_unused(reasonIfUnsupported);
- return false;
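+    // The reference LSTM implementation is Float32-only: accept Float32 and
+    // keep rejecting the uint8 data type.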
+ return IsSupportedForDataTypeRef(reasonIfUnsupported,
+ input.GetDataType(),
+ &TrueFunc<>,
+ &FalseFuncU8<>);
}
bool RefLayerSupport::IsMeanSupported(const TensorInfo& input,
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index 4ff5cf2a2e..35981ea4b3 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -336,6 +336,14 @@ ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet1, PermuteFloat32ValueSet1Test)
ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet2, PermuteFloat32ValueSet2Test)
ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet3, PermuteFloat32ValueSet3Test)
+// Lstm
+ARMNN_AUTO_TEST_CASE(LstmLayerFloat32WithCifgWithPeepholeNoProjection,
+ LstmLayerFloat32WithCifgWithPeepholeNoProjectionTest)
+ARMNN_AUTO_TEST_CASE(LstmLayerFloat32NoCifgNoPeepholeNoProjection,
+ LstmLayerFloat32NoCifgNoPeepholeNoProjectionTest)
+ARMNN_AUTO_TEST_CASE(LstmLayerFloat32NoCifgWithPeepholeWithProjection,
+ LstmLayerFloat32NoCifgWithPeepholeWithProjectionTest)
+
// Convert from Float16 to Float32
ARMNN_AUTO_TEST_CASE(SimpleConvertFp16ToFp32, SimpleConvertFp16ToFp32Test)
// Convert from Float32 to Float16
diff --git a/src/backends/reference/workloads/RefLstmFloat32Workload.cpp b/src/backends/reference/workloads/RefLstmFloat32Workload.cpp
index 50ff605701..c697b66658 100644
--- a/src/backends/reference/workloads/RefLstmFloat32Workload.cpp
+++ b/src/backends/reference/workloads/RefLstmFloat32Workload.cpp
@@ -4,13 +4,376 @@
//
#include "RefLstmFloat32Workload.hpp"
+#include "RefWorkloadUtils.hpp"
+#include "Activation.hpp"
+
+namespace
+{
+
+// Helper functions ported from the Android code base
+// Refer to: android/external/tensorflow/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
+
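+// Computes outResult += matrix * vector for each batch: matrix is [mRows, mCols],
+// vector holds nBatch rows of mCols elements, and the nBatch * mRows results are
+// written resultStride elements apart.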
+void MatrixBatchVectorMultiplyAccumulate(const float* matrix,
+ uint32_t mRows,
+ uint32_t mCols,
+ const float* vector,
+ uint32_t nBatch,
+ float* outResult,
+ int resultStride = 1)
+{
+ float* resultInBatch = outResult;
+ for (uint32_t b = 0; b < nBatch; b++)
+ {
+ const float* matrixPtr = matrix;
+ for (uint32_t r = 0; r < mRows; r++)
+ {
+ const float* vectorInBatch = vector + b * mCols;
+ for (uint32_t c = 0; c < mCols; c++)
+ {
+ *resultInBatch += *matrixPtr++ * *vectorInBatch++;
+ }
+ resultInBatch += resultStride;
+ }
+ }
+}
+
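+// Tiles a single vector of vSize elements across the nBatch rows of outBatchVector.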
+void VectorBatchVectorAssign(const float* vector,
+ uint32_t vSize,
+ uint32_t nBatch,
+ float* outBatchVector)
+{
+ for (uint32_t b = 0; b < nBatch; b++)
+ {
+ memcpy(outBatchVector + b * vSize, vector, vSize * sizeof(float));
+ }
+}
+
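+// Accumulates the element-wise product of vector with each batch row of batchVector.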
+void VectorBatchVectorCwiseProductAccumulate(const float* vector,
+ uint32_t vSize,
+ const float* batchVector,
+ uint32_t nBatch,
+ float* outResult)
+{
+ for (uint32_t b = 0; b < nBatch; b++)
+ {
+ for (uint32_t v = 0; v < vSize; v++)
+ {
+ *outResult++ += vector[v] * *batchVector++;
+ }
+ }
+}
+
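+// Computes result = 1 - vector; used under CIFG to derive the input gate
+// from the forget gate.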
+void Sub1Vector(const float* vector,
+ uint32_t vSize,
+ float* result)
+{
+ for (uint32_t v = 0; v < vSize; v++)
+ {
+ *result++ = 1.0f - *vector++;
+ }
+}
+
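+// Element-wise product of two vectors of vSize elements (the accumulating
+// variant follows below).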
+void VectorVectorCwiseProduct(const float* vector1,
+ const float* vector2,
+ uint32_t vSize,
+ float* outResult)
+{
+ for (uint32_t v = 0; v < vSize; v++)
+ {
+ *outResult++ = *vector1++ * *vector2++;
+ }
+}
+
+void VectorVectorCwiseProductAccumulate(const float* vector1,
+ const float* vector2,
+ uint32_t vSize,
+ float* outResult)
+{
+ for (uint32_t v = 0; v < vSize; v++)
+ {
+ *outResult++ += *vector1++ * *vector2++;
+ }
+}
+
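+// Clamps f to the range [-absLimit, absLimit].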
+float Clip(float f,
+ float absLimit)
+{
+ float result = (absLimit < f) ? absLimit : f;
+ result = (-absLimit > result) ? -absLimit : result;
+ return result;
+}
+
+void ClipVector(const float* vector,
+ uint32_t vSize,
+ float absLimit,
+ float* outResult)
+{
+ for (uint32_t v = 0; v < vSize; v++)
+ {
+ *outResult++ = Clip(*vector++, absLimit);
+ }
+}
+
+void CopyVector(const float* vector,
+ uint32_t vSize,
+ float* outResult)
+{
+ memcpy(outResult, vector, vSize * sizeof(float));
+}
+
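+// Maps a fused activation code (TfLite/Android NN enum values) onto an ArmNN
+// ActivationFunction and its a/b parameters; for 0 ("None") the function is
+// left unset and the callers skip the activation step.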
+void SetActivationParameters(uint32_t activation,
+ armnn::ActivationFunction& outArmnnActivation,
+ float& outA,
+ float& outB)
+{
+ switch (activation)
+ {
+ case 0: // None
+ outA = 0;
+ outB = 0;
+ return;
+
+ case 1: // Relu
+ outArmnnActivation = armnn::ActivationFunction::ReLu;
+ outA = 0;
+ outB = 0;
+ return;
+
+ case 3: // Relu6
+ outArmnnActivation = armnn::ActivationFunction::BoundedReLu;
+ outA = 6;
+ outB = 0;
+ return;
+
+ case 4: // Tanh
+ outArmnnActivation = armnn::ActivationFunction::TanH;
+ outA = 1;
+ outB = 1;
+ return;
+
+ case 6: // Sigmoid
+ outArmnnActivation = armnn::ActivationFunction::Sigmoid;
+ outA = 0;
+ outB = 0;
+ return;
+
+ default:
+ throw armnn::Exception("Unsupported activation function: " + std::to_string(activation));
+ }
+}
+
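+// Wraps an optional constant tensor in a ScopedCpuTensorHandle, or returns
+// nullptr when the tensor is absent.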
+std::unique_ptr<armnn::ScopedCpuTensorHandle> AssignScopedCpuTensorHandle(const armnn::ConstCpuTensorHandle* ptr)
+{
+ if (!ptr)
+ {
+ return nullptr;
+ }
+
+ return std::make_unique<armnn::ScopedCpuTensorHandle>(*ptr);
+}
+
+} // anonymous namespace
namespace armnn
{
+RefLstmFloat32Workload::RefLstmFloat32Workload(const LstmQueueDescriptor &descriptor, const WorkloadInfo &info)
+ : Float32Workload<LstmQueueDescriptor>(descriptor, info)
+ , m_InputToInputWeightsTensor (AssignScopedCpuTensorHandle(descriptor.m_InputToInputWeights))
+ , m_InputToForgetWeightsTensor (AssignScopedCpuTensorHandle(descriptor.m_InputToForgetWeights))
+ , m_InputToCellWeightsTensor (AssignScopedCpuTensorHandle(descriptor.m_InputToCellWeights))
+ , m_InputToOutputWeightsTensor (AssignScopedCpuTensorHandle(descriptor.m_InputToOutputWeights))
+ , m_RecurrentToInputWeightsTensor (AssignScopedCpuTensorHandle(descriptor.m_RecurrentToInputWeights))
+ , m_RecurrentToForgetWeightsTensor(AssignScopedCpuTensorHandle(descriptor.m_RecurrentToForgetWeights))
+ , m_RecurrentToCellWeightsTensor (AssignScopedCpuTensorHandle(descriptor.m_RecurrentToCellWeights))
+ , m_RecurrentToOutputWeightsTensor(AssignScopedCpuTensorHandle(descriptor.m_RecurrentToOutputWeights))
+ , m_CellToInputWeightsTensor (AssignScopedCpuTensorHandle(descriptor.m_CellToInputWeights))
+ , m_CellToForgetWeightsTensor (AssignScopedCpuTensorHandle(descriptor.m_CellToForgetWeights))
+ , m_CellToOutputWeightsTensor (AssignScopedCpuTensorHandle(descriptor.m_CellToOutputWeights))
+ , m_InputGateBiasTensor (AssignScopedCpuTensorHandle(descriptor.m_InputGateBias))
+ , m_ForgetGateBiasTensor (AssignScopedCpuTensorHandle(descriptor.m_ForgetGateBias))
+ , m_CellBiasTensor (AssignScopedCpuTensorHandle(descriptor.m_CellBias))
+ , m_OutputGateBiasTensor (AssignScopedCpuTensorHandle(descriptor.m_OutputGateBias))
+ , m_ProjectionWeightsTensor (AssignScopedCpuTensorHandle(descriptor.m_ProjectionWeights))
+ , m_ProjectionBiasTensor (AssignScopedCpuTensorHandle(descriptor.m_ProjectionBias))
+{}
+
void RefLstmFloat32Workload::Execute() const
{
- throw armnn::Exception("No implementation of Lstm in the Ref backend!");
+    // This is a port of the LSTM::Eval() method from the Android code base
+ // Refer to: android/frameworks/ml/nn/common/operations/LSTM.cpp
+
+ const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
+ const TensorShape& inputShape = inputInfo.GetShape();
+
+ float* scratchBuffer = GetOutputTensorDataFloat(0, m_Data);
+ float* outputStateOut = GetOutputTensorDataFloat(1, m_Data);
+ float* cellStateOut = GetOutputTensorDataFloat(2, m_Data);
+ float* output = GetOutputTensorDataFloat(3, m_Data);
+
+ const float* inputData = GetInputTensorDataFloat(0, m_Data);
+ const float* outputStateIn = GetInputTensorDataFloat(1, m_Data);
+ const float* cellStateIn = GetInputTensorDataFloat(2, m_Data);
+
+ const uint32_t nBatch = inputShape[0];
+ const uint32_t nInput = inputShape[1];
+
+ const uint32_t nCell = m_InputToOutputWeightsTensor->GetShape()[0];
+ const uint32_t nOutput = m_RecurrentToOutputWeightsTensor->GetShape()[1];
+
+ const bool useCifg = m_Data.m_Parameters.m_CifgEnabled;
+ const bool usePeephole = m_Data.m_Parameters.m_PeepholeEnabled;
+
+    // Index the per-gate scratch pointers into the single shared scratch buffer.
+ float* inputGateScratch = nullptr;
+ float* cellScratch = nullptr;
+ float* forgetGateScratch = nullptr;
+ float* outputGateScratch = nullptr;
+
+ if (useCifg)
+ {
+ cellScratch = scratchBuffer + 0 * nCell * nBatch;
+ forgetGateScratch = scratchBuffer + 1 * nCell * nBatch;
+ outputGateScratch = scratchBuffer + 2 * nCell * nBatch;
+ }
+ else
+ {
+ inputGateScratch = scratchBuffer + 0 * nCell * nBatch;
+ cellScratch = scratchBuffer + 1 * nCell * nBatch;
+ forgetGateScratch = scratchBuffer + 2 * nCell * nBatch;
+ outputGateScratch = scratchBuffer + 3 * nCell * nBatch;
+ }
+
+ // Initialize scratch buffers with bias.
+ if (!useCifg)
+ {
+ VectorBatchVectorAssign(m_InputGateBiasTensor->GetTensor<float>(),
+ nCell, nBatch, inputGateScratch);
+ }
+ VectorBatchVectorAssign(m_ForgetGateBiasTensor->GetTensor<float>(),
+ nCell, nBatch, forgetGateScratch);
+ VectorBatchVectorAssign(m_CellBiasTensor->GetTensor<float>(),
+ nCell, nBatch, cellScratch);
+ VectorBatchVectorAssign(m_OutputGateBiasTensor->GetTensor<float>(),
+ nCell, nBatch, outputGateScratch);
+
+ // For each batch and cell: compute input_weight * input.
+ if (!useCifg)
+ {
+ MatrixBatchVectorMultiplyAccumulate(m_InputToInputWeightsTensor->GetTensor<float>(),
+ nCell, nInput, inputData, nBatch, inputGateScratch);
+ }
+ MatrixBatchVectorMultiplyAccumulate(m_InputToForgetWeightsTensor->GetTensor<float>(),
+ nCell, nInput, inputData, nBatch, forgetGateScratch);
+ MatrixBatchVectorMultiplyAccumulate(m_InputToCellWeightsTensor->GetTensor<float>(),
+ nCell, nInput, inputData, nBatch, cellScratch);
+ MatrixBatchVectorMultiplyAccumulate(m_InputToOutputWeightsTensor->GetTensor<float>(),
+ nCell, nInput, inputData, nBatch, outputGateScratch);
+
+ // For each batch and cell: compute recurrent_weight * output_state.
+ if (!useCifg)
+ {
+ MatrixBatchVectorMultiplyAccumulate(m_RecurrentToInputWeightsTensor->GetTensor<float>(),
+ nCell, nOutput, outputStateIn, nBatch, inputGateScratch);
+ }
+ MatrixBatchVectorMultiplyAccumulate(m_RecurrentToForgetWeightsTensor->GetTensor<float>(),
+ nCell, nOutput, outputStateIn, nBatch, forgetGateScratch);
+ MatrixBatchVectorMultiplyAccumulate(m_RecurrentToCellWeightsTensor->GetTensor<float>(),
+ nCell, nOutput, outputStateIn, nBatch, cellScratch);
+ MatrixBatchVectorMultiplyAccumulate(m_RecurrentToOutputWeightsTensor->GetTensor<float>(),
+ nCell, nOutput, outputStateIn, nBatch, outputGateScratch);
+
+ // For each batch and cell: update input gate.
+ if (!useCifg)
+ {
+ if (usePeephole)
+ {
+ VectorBatchVectorCwiseProductAccumulate(m_CellToInputWeightsTensor->GetTensor<float>(),
+ nCell, cellStateIn, nBatch, inputGateScratch);
+ }
+ Activation(inputGateScratch, inputGateScratch,
+ TensorInfo({nCell, nBatch}, DataType::Float32),
+ ActivationFunction::Sigmoid, 0, 0);
+ }
+
+ // For each batch and cell: update forget gate.
+ if (usePeephole)
+ {
+ VectorBatchVectorCwiseProductAccumulate(m_CellToForgetWeightsTensor->GetTensor<float>(), nCell,
+ cellStateIn, nBatch, forgetGateScratch);
+ }
+ Activation(forgetGateScratch, forgetGateScratch,
+ TensorInfo({nCell, nBatch}, DataType::Float32),
+ ActivationFunction::Sigmoid, 0, 0);
+
+ // For each batch and cell: update the cell.
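+    // cellStateOut = forgetGate .* cellStateIn + g(cellScratch) .* inputGate,
+    // with the input gate replaced by (1 - forgetGate) when CIFG is enabled.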
+ VectorVectorCwiseProduct(forgetGateScratch, cellStateIn, nBatch * nCell, cellStateOut);
+
+ ActivationFunction armnnActivationFunc = ActivationFunction::Sigmoid;
+ float a = 0;
+ float b = 0;
+ SetActivationParameters(m_Data.m_Parameters.m_ActivationFunc, armnnActivationFunc, a, b);
+
+ if (m_Data.m_Parameters.m_ActivationFunc > 0)
+ {
+ Activation(cellScratch, cellScratch,
+ TensorInfo({nCell, nBatch}, DataType::Float32),
+ armnnActivationFunc, a, b);
+ }
+ if (useCifg)
+ {
+ Sub1Vector(forgetGateScratch, nBatch * nCell, forgetGateScratch);
+ VectorVectorCwiseProductAccumulate(cellScratch, forgetGateScratch, nBatch * nCell, cellStateOut);
+ }
+ else
+ {
+ VectorVectorCwiseProductAccumulate(cellScratch, inputGateScratch, nBatch * nCell, cellStateOut);
+ }
+ if (m_Data.m_Parameters.m_ClippingThresCell > 0.0)
+ {
+ ClipVector(cellStateOut, nBatch * nCell, m_Data.m_Parameters.m_ClippingThresCell, cellStateOut);
+ }
+
+ // For each batch and cell: update the output gate.
+ if (usePeephole)
+ {
+ VectorBatchVectorCwiseProductAccumulate(m_CellToOutputWeightsTensor->GetTensor<float>(),
+ nCell, cellStateOut, nBatch, outputGateScratch);
+ }
+ Activation(outputGateScratch, outputGateScratch,
+ TensorInfo({nCell, nBatch}, DataType::Float32),
+ ActivationFunction::Sigmoid, 0, 0);
+
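+    // output = outputGate .* cellScratch, where cellScratch is set to
+    // g(cellStateOut) below when a cell activation function is configured.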
+ if (m_Data.m_Parameters.m_ActivationFunc > 0)
+ {
+ Activation(cellStateOut, cellScratch,
+ TensorInfo({nCell, nBatch}, DataType::Float32),
+ armnnActivationFunc, a, b);
+ }
+ VectorVectorCwiseProduct(outputGateScratch, cellScratch, nBatch * nCell, outputGateScratch);
+
+ // For each batch: update the projection and output_state.
+ if (m_Data.m_Parameters.m_ProjectionEnabled)
+ {
+ if (m_ProjectionBiasTensor)
+ {
+ VectorBatchVectorAssign(m_ProjectionBiasTensor->GetTensor<float>(),
+ nOutput, nBatch, output);
+ }
+ MatrixBatchVectorMultiplyAccumulate(m_ProjectionWeightsTensor->GetTensor<float>(),
+ nOutput, nCell, outputGateScratch, nBatch, output);
+
+ if (m_Data.m_Parameters.m_ClippingThresProj > 0.0)
+ {
+ ClipVector(output, nBatch * nOutput, m_Data.m_Parameters.m_ClippingThresProj, output);
+ }
+ }
+ else
+ {
+ CopyVector(outputGateScratch, nBatch * nOutput, output);
+ }
+
+ CopyVector(output, nBatch * nOutput, outputStateOut);
}
} //namespace armnn
diff --git a/src/backends/reference/workloads/RefLstmFloat32Workload.hpp b/src/backends/reference/workloads/RefLstmFloat32Workload.hpp
index 1f634d3ca1..a2dead8b9c 100644
--- a/src/backends/reference/workloads/RefLstmFloat32Workload.hpp
+++ b/src/backends/reference/workloads/RefLstmFloat32Workload.hpp
@@ -5,6 +5,8 @@
#pragma once
+#include <armnn/TypesUtils.hpp>
+
#include <backendsCommon/Workload.hpp>
#include <backendsCommon/WorkloadData.hpp>
@@ -14,8 +16,28 @@ namespace armnn
class RefLstmFloat32Workload : public Float32Workload<LstmQueueDescriptor>
{
public:
- using Float32Workload<LstmQueueDescriptor>::Float32Workload;
+ explicit RefLstmFloat32Workload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info);
+
virtual void Execute() const override;
+
+private:
+ std::unique_ptr<ScopedCpuTensorHandle> m_InputToInputWeightsTensor;
+ std::unique_ptr<ScopedCpuTensorHandle> m_InputToForgetWeightsTensor;
+ std::unique_ptr<ScopedCpuTensorHandle> m_InputToCellWeightsTensor;
+ std::unique_ptr<ScopedCpuTensorHandle> m_InputToOutputWeightsTensor;
+ std::unique_ptr<ScopedCpuTensorHandle> m_RecurrentToInputWeightsTensor;
+ std::unique_ptr<ScopedCpuTensorHandle> m_RecurrentToForgetWeightsTensor;
+ std::unique_ptr<ScopedCpuTensorHandle> m_RecurrentToCellWeightsTensor;
+ std::unique_ptr<ScopedCpuTensorHandle> m_RecurrentToOutputWeightsTensor;
+ std::unique_ptr<ScopedCpuTensorHandle> m_CellToInputWeightsTensor;
+ std::unique_ptr<ScopedCpuTensorHandle> m_CellToForgetWeightsTensor;
+ std::unique_ptr<ScopedCpuTensorHandle> m_CellToOutputWeightsTensor;
+ std::unique_ptr<ScopedCpuTensorHandle> m_InputGateBiasTensor;
+ std::unique_ptr<ScopedCpuTensorHandle> m_ForgetGateBiasTensor;
+ std::unique_ptr<ScopedCpuTensorHandle> m_CellBiasTensor;
+ std::unique_ptr<ScopedCpuTensorHandle> m_OutputGateBiasTensor;
+ std::unique_ptr<ScopedCpuTensorHandle> m_ProjectionWeightsTensor;
+ std::unique_ptr<ScopedCpuTensorHandle> m_ProjectionBiasTensor;
};
} //namespace armnn