RefQLstmWorkload.cpp
//
// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "RefQLstmWorkload.hpp"
#include "Activation.hpp"
#include "Encoders.hpp"
#include "Decoders.hpp"
#include "LstmUtils.hpp"
#include "RefWorkloadUtils.hpp"

namespace armnn
{

RefQLstmWorkload::RefQLstmWorkload(const QLstmQueueDescriptor& descriptor, const WorkloadInfo& info)
    : RefBaseWorkload<QLstmQueueDescriptor>(descriptor, info)
    , m_InputToInputWeightsTensor     (AssignScopedTensorHandle(descriptor.m_InputToInputWeights))
    , m_InputToForgetWeightsTensor    (AssignScopedTensorHandle(descriptor.m_InputToForgetWeights))
    , m_InputToCellWeightsTensor      (AssignScopedTensorHandle(descriptor.m_InputToCellWeights))
    , m_InputToOutputWeightsTensor    (AssignScopedTensorHandle(descriptor.m_InputToOutputWeights))

    , m_RecurrentToInputWeightsTensor (AssignScopedTensorHandle(descriptor.m_RecurrentToInputWeights))
    , m_RecurrentToForgetWeightsTensor(AssignScopedTensorHandle(descriptor.m_RecurrentToForgetWeights))
    , m_RecurrentToCellWeightsTensor  (AssignScopedTensorHandle(descriptor.m_RecurrentToCellWeights))
    , m_RecurrentToOutputWeightsTensor(AssignScopedTensorHandle(descriptor.m_RecurrentToOutputWeights))

    , m_CellToInputWeightsTensor      (AssignScopedTensorHandle(descriptor.m_CellToInputWeights))
    , m_CellToForgetWeightsTensor     (AssignScopedTensorHandle(descriptor.m_CellToForgetWeights))
    , m_CellToOutputWeightsTensor     (AssignScopedTensorHandle(descriptor.m_CellToOutputWeights))

    , m_InputGateBiasTensor           (AssignScopedTensorHandle(descriptor.m_InputGateBias))
    , m_ForgetGateBiasTensor          (AssignScopedTensorHandle(descriptor.m_ForgetGateBias))
    , m_CellBiasTensor                (AssignScopedTensorHandle(descriptor.m_CellBias))
    , m_OutputGateBiasTensor          (AssignScopedTensorHandle(descriptor.m_OutputGateBias))

    , m_ProjectionWeightsTensor       (AssignScopedTensorHandle(descriptor.m_ProjectionWeights))
    , m_ProjectionBiasTensor          (AssignScopedTensorHandle(descriptor.m_ProjectionBias))

    , m_InputLayerNormWeightsTensor   (AssignScopedTensorHandle(descriptor.m_InputLayerNormWeights))
    , m_ForgetLayerNormWeightsTensor  (AssignScopedTensorHandle(descriptor.m_ForgetLayerNormWeights))
    , m_CellLayerNormWeightsTensor    (AssignScopedTensorHandle(descriptor.m_CellLayerNormWeights))
    , m_OutputLayerNormWeightsTensor  (AssignScopedTensorHandle(descriptor.m_OutputLayerNormWeights))
{}

void RefQLstmWorkload::Execute() const
{
    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
}

void RefQLstmWorkload::ExecuteAsync(ExecutionData& executionData)
{
    WorkingMemDescriptor* workingMemDescriptor = static_cast<WorkingMemDescriptor*>(executionData.m_Data);
    Execute(workingMemDescriptor->m_Inputs, workingMemDescriptor->m_Outputs);
}

void RefQLstmWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
{
    // This is a port of the QLSTM::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs)
    // method in the Android code base.
    // Note: this implementation wraps the arithmetic functions of the LSTM cell in Quantize/Dequantize ops, so all
    // computation is done in the floating-point domain. Arithmetic functions are found in LstmUtils.cpp.
    // Refer to: android/frameworks/ml/nn/common/operations/QLSTM.cpp
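    //
    // For reference, the code below evaluates the standard LSTM cell equations on the dequantized values:
    //     i_t = sigmoid(W_xi * x_t + W_hi * h_{t-1} [+ w_ci . c_{t-1}] + b_i)   (input gate; 1 - f_t with CIFG)
    //     f_t = sigmoid(W_xf * x_t + W_hf * h_{t-1} [+ w_cf . c_{t-1}] + b_f)   (forget gate)
    //     g_t = tanh   (W_xg * x_t + W_hg * h_{t-1}                   + b_g)    (cell/modulation gate)
    //     c_t = f_t . c_{t-1} + i_t . g_t                                       (cell state, optionally clipped)
    //     o_t = sigmoid(W_xo * x_t + W_ho * h_{t-1} [+ w_co . c_t]    + b_o)    (output gate)
    //     h_t = o_t . tanh(c_t)                                                 (hidden state, optionally projected)
    // where "." is an element-wise product and [...] marks the optional peephole terms. Layer normalization,
    // when enabled, is applied to each gate's pre-activation before the bias is added.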
    const DataType& internalType = armnn::DataType::QSymmS16;

    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
    const TensorInfo& outputStateInInfo = GetTensorInfo(inputs[1]);
    const TensorInfo& cellStateInInfo = GetTensorInfo(inputs[2]);

    const TensorInfo& outputStateOutInfo = GetTensorInfo(outputs[0]);
    const TensorInfo& cellStateOutInfo = GetTensorInfo(outputs[1]);
    const TensorInfo& outputInfo = GetTensorInfo(outputs[2]);

    const TensorShape& inputShape = inputInfo.GetShape();
    const TensorShape& outputStateInShape = outputStateInInfo.GetShape();
    const TensorShape& cellStateInShape = cellStateInInfo.GetShape();

    // Infer numBatches, inputSize, outputSize and numUnits
    const uint32_t numBatches = inputShape[0];
    const uint32_t inputSize  = inputShape[1];
    const uint32_t outputSize = outputStateInShape[1];
    const uint32_t numUnits   = cellStateInShape[1];

    // Optional param settings
    const bool cifgEnabled       = m_Data.m_Parameters.m_CifgEnabled;
    const bool peepholeEnabled   = m_Data.m_Parameters.m_PeepholeEnabled;
    const bool projectionEnabled = m_Data.m_Parameters.m_ProjectionEnabled;
    const bool layerNormEnabled  = m_Data.m_Parameters.m_LayerNormEnabled;

    // Input decoders
    std::unique_ptr<Decoder<float>> inputDecoder =
            MakeDecoder<float>(inputInfo, inputs[0]->Map());
    std::unique_ptr<Decoder<float>> outputStateInDecoder =
            MakeDecoder<float>(outputStateInInfo, inputs[1]->Map());
    std::unique_ptr<Decoder<float>> cellStateInDecoder =
            MakeDecoder<float>(cellStateInInfo, inputs[2]->Map());

    // Output decoders
    std::unique_ptr<Decoder<float>> outputStateOutDecoder =
            MakeDecoder<float>(outputStateOutInfo, outputs[0]->Map());
    std::unique_ptr<Decoder<float>> cellStateOutDecoder =
            MakeDecoder<float>(cellStateOutInfo, outputs[1]->Map());
    std::unique_ptr<Decoder<float>> outputDecoder =
            MakeDecoder<float>(outputInfo, outputs[2]->Map());

    // Output encoders
    std::unique_ptr<Encoder<float>> outputStateOutEncoder =
            MakeEncoder<float>(outputStateOutInfo, outputs[0]->Map());
    std::unique_ptr<Encoder<float>> cellStateOutEncoder =
            MakeEncoder<float>(cellStateOutInfo, outputs[1]->Map());
    std::unique_ptr<Encoder<float>> outputEncoder =
            MakeEncoder<float>(outputInfo, outputs[2]->Map());

    // Weights decoders
    std::unique_ptr<Decoder<float>> inputToForgetWeightsDecoder = MakeDecoder<float>(
            m_InputToForgetWeightsTensor->GetTensorInfo(), m_InputToForgetWeightsTensor->GetConstTensor<void>());
    std::unique_ptr<Decoder<float>> inputToCellWeightsDecoder = MakeDecoder<float>(
            m_InputToCellWeightsTensor->GetTensorInfo(), m_InputToCellWeightsTensor->GetConstTensor<void>());
    std::unique_ptr<Decoder<float>> inputToOutputWeightsDecoder = MakeDecoder<float>(
            m_InputToOutputWeightsTensor->GetTensorInfo(), m_InputToOutputWeightsTensor->GetConstTensor<void>());

    std::unique_ptr<Decoder<float>> recurrentToForgetWeightsDecoder = MakeDecoder<float>(
            m_RecurrentToForgetWeightsTensor->GetTensorInfo(),
            m_RecurrentToForgetWeightsTensor->GetConstTensor<void>());
    std::unique_ptr<Decoder<float>> recurrentToCellWeightsDecoder = MakeDecoder<float>(
            m_RecurrentToCellWeightsTensor->GetTensorInfo(), m_RecurrentToCellWeightsTensor->GetConstTensor<void>());
    std::unique_ptr<Decoder<float>> recurrentToOutputWeightsDecoder = MakeDecoder<float>(
            m_RecurrentToOutputWeightsTensor->GetTensorInfo(),
            m_RecurrentToOutputWeightsTensor->GetConstTensor<void>());

    // Optional CIFG params
    std::unique_ptr<Decoder<float>> inputToInputWeightsDecoder;
    std::unique_ptr<Decoder<float>> recurrentToInputWeightsDecoder;
    std::unique_ptr<Decoder<float>> inputGateBiasDecoder;

    // Optional Peephole params
    std::unique_ptr<Decoder<float>> cellToInputWeightsDecoder;
    std::unique_ptr<Decoder<float>> cellToForgetWeightsDecoder;
    std::unique_ptr<Decoder<float>> cellToOutputWeightsDecoder;

    // Optional Projection params
    std::unique_ptr<Decoder<float>> projectionWeightsDecoder;
    std::unique_ptr<Decoder<float>> projectionBiasDecoder;

    // Optional Layer Norm params
    std::unique_ptr<Decoder<float>> inputLayerNormWeightsDecoder;
    std::unique_ptr<Decoder<float>> forgetLayerNormWeightsDecoder;
    std::unique_ptr<Decoder<float>> cellLayerNormWeightsDecoder;
    std::unique_ptr<Decoder<float>> outputLayerNormWeightsDecoder;

    // Biases are only used when Layer Norm is enabled. Scale is defined as (XLayerNormWeights Scale / 1024)
    std::unique_ptr<Decoder<float>> forgetGateBiasDecoder;
    std::unique_ptr<Decoder<float>> cellGateBiasDecoder;
    std::unique_ptr<Decoder<float>> outputGateBiasDecoder;

    // Int16 vectors for internal state data (to be decoded/encoded)
    const uint32_t stateTensorSize = numBatches * numUnits;
    std::vector<int16_t> inputGateData(stateTensorSize);
    std::vector<int16_t> cellGateData(stateTensorSize);
    std::vector<int16_t> forgetGateData(stateTensorSize);
    std::vector<int16_t> outputGateData(stateTensorSize);
    std::vector<int32_t> hiddenStateData(stateTensorSize);
    std::vector<int16_t> outputInt16Data(numBatches * outputSize);

    armnn::TensorInfo inputGateInfo(
            {numBatches, numUnits}, armnn::DataType::QSymmS16, m_Data.m_Parameters.m_InputIntermediateScale, 0);
    armnn::TensorInfo cellGateInfo(
            {numBatches, numUnits}, armnn::DataType::QSymmS16, m_Data.m_Parameters.m_CellIntermediateScale, 0);
    armnn::TensorInfo forgetGateInfo(
            {numBatches, numUnits}, armnn::DataType::QSymmS16, m_Data.m_Parameters.m_ForgetIntermediateScale, 0);
    armnn::TensorInfo outputGateInfo(
            {numBatches, numUnits}, armnn::DataType::QSymmS16, m_Data.m_Parameters.m_OutputIntermediateScale, 0);
    armnn::TensorInfo hiddenStateInfo({numBatches, numUnits},
                                      armnn::DataType::QAsymmS8,
                                      m_Data.m_Parameters.m_HiddenStateScale,
                                      m_Data.m_Parameters.m_HiddenStateZeroPoint);
    armnn::TensorInfo outputInt16Info({numBatches, outputSize},
                                      armnn::DataType::QSymmS16,
                                      outputInfo.GetQuantizationScale(),
                                      outputInfo.GetQuantizationOffset());
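
    // Note: these intermediate gate TensorInfos are deliberately mutable. The gate computation below
    // repeatedly rebinds their quantization scale (and re-creates the matching encoders/decoders) to move
    // the same int16 buffers between quantization domains.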

    // Decoders/Encoders for internal states
    std::unique_ptr<Decoder<float>> inputGateDecoder =
            MakeDecoder<float>(inputGateInfo, inputGateData.data());
    std::unique_ptr<Decoder<float>> cellGateDecoder =
            MakeDecoder<float>(cellGateInfo, cellGateData.data());
    std::unique_ptr<Decoder<float>> forgetGateDecoder =
            MakeDecoder<float>(forgetGateInfo, forgetGateData.data());
    std::unique_ptr<Decoder<float>> outputGateDecoder =
            MakeDecoder<float>(outputGateInfo, outputGateData.data());
    std::unique_ptr<Decoder<float>> hiddenStateDecoder =
            MakeDecoder<float>(hiddenStateInfo, hiddenStateData.data());

    std::unique_ptr<Encoder<float>> inputGateEncoder =
            MakeEncoder<float>(inputGateInfo, inputGateData.data());
    std::unique_ptr<Encoder<float>> cellGateEncoder =
            MakeEncoder<float>(cellGateInfo, cellGateData.data());
    std::unique_ptr<Encoder<float>> forgetGateEncoder =
            MakeEncoder<float>(forgetGateInfo, forgetGateData.data());
    std::unique_ptr<Encoder<float>> outputGateEncoder =
            MakeEncoder<float>(outputGateInfo, outputGateData.data());
    std::unique_ptr<Encoder<float>> hiddenStateEncoder =
            MakeEncoder<float>(hiddenStateInfo, hiddenStateData.data());

    // Int16 used to accumulate output to prevent overflowing (after Projection MatMul)
    std::unique_ptr<Decoder<float>> outputInt16Decoder =
            MakeDecoder<float>(outputInt16Info, outputInt16Data.data());
    std::unique_ptr<Encoder<float>> outputInt16Encoder =
            MakeEncoder<float>(outputInt16Info, outputInt16Data.data());

    // Create decoders for optional params if they are enabled
    if (!cifgEnabled)
    {
        inputToInputWeightsDecoder = MakeDecoder<float>(
                m_InputToInputWeightsTensor->GetTensorInfo(), m_InputToInputWeightsTensor->GetConstTensor<void>());
        recurrentToInputWeightsDecoder = MakeDecoder<float>(m_RecurrentToInputWeightsTensor->GetTensorInfo(),
                                                            m_RecurrentToInputWeightsTensor->GetConstTensor<void>());
    }

    if (peepholeEnabled)
    {
        if (!cifgEnabled)
        {
            cellToInputWeightsDecoder = MakeDecoder<float>(
                    m_CellToInputWeightsTensor->GetTensorInfo(), m_CellToInputWeightsTensor->GetConstTensor<void>());
        }
        cellToForgetWeightsDecoder = MakeDecoder<float>(
                m_CellToForgetWeightsTensor->GetTensorInfo(), m_CellToForgetWeightsTensor->GetConstTensor<void>());
        cellToOutputWeightsDecoder = MakeDecoder<float>(
                m_CellToOutputWeightsTensor->GetTensorInfo(), m_CellToOutputWeightsTensor->GetConstTensor<void>());
    }

    if (projectionEnabled)
    {
        projectionWeightsDecoder = MakeDecoder<float>(
                m_ProjectionWeightsTensor->GetTensorInfo(), m_ProjectionWeightsTensor->GetConstTensor<void>());
        if (m_ProjectionBiasTensor)
        {
            projectionBiasDecoder = MakeDecoder<float>(
                    m_ProjectionBiasTensor->GetTensorInfo(), m_ProjectionBiasTensor->GetConstTensor<void>());
        }
    }

    if (layerNormEnabled)
    {
        if (!cifgEnabled)
        {
            inputLayerNormWeightsDecoder = MakeDecoder<float>(m_InputLayerNormWeightsTensor->GetTensorInfo(),
                                                              m_InputLayerNormWeightsTensor->GetConstTensor<void>());

            // Bias only used if layer norm enabled
            armnn::TensorInfo inputGateBiasTensorInfo({outputSize}, armnn::DataType::Signed32,
                    m_InputLayerNormWeightsTensor->GetTensorInfo().GetQuantizationScale() / 1024, 0);
            inputGateBiasDecoder = MakeDecoder<float>(
                    inputGateBiasTensorInfo, m_InputGateBiasTensor->GetConstTensor<void>());
        }

        forgetLayerNormWeightsDecoder = MakeDecoder<float>(
                m_ForgetLayerNormWeightsTensor->GetTensorInfo(),
                m_ForgetLayerNormWeightsTensor->GetConstTensor<void>());
        cellLayerNormWeightsDecoder = MakeDecoder<float>(
                m_CellLayerNormWeightsTensor->GetTensorInfo(), m_CellLayerNormWeightsTensor->GetConstTensor<void>());
        outputLayerNormWeightsDecoder = MakeDecoder<float>(
                m_OutputLayerNormWeightsTensor->GetTensorInfo(),
                m_OutputLayerNormWeightsTensor->GetConstTensor<void>());

        // Bias only used if layer norm enabled
        armnn::TensorInfo forgetGateBiasTensorInfo({outputSize}, armnn::DataType::Signed32,
                m_ForgetLayerNormWeightsTensor->GetTensorInfo().GetQuantizationScale() / 1024, 0);
        forgetGateBiasDecoder = MakeDecoder<float>(
                forgetGateBiasTensorInfo, m_ForgetGateBiasTensor->GetConstTensor<void>());

        armnn::TensorInfo cellGateBiasTensorInfo({outputSize}, armnn::DataType::Signed32,
                m_CellLayerNormWeightsTensor->GetTensorInfo().GetQuantizationScale() / 1024, 0);
        cellGateBiasDecoder = MakeDecoder<float>(
                cellGateBiasTensorInfo, m_CellBiasTensor->GetConstTensor<void>());

        armnn::TensorInfo outputGateBiasTensorInfo({outputSize}, armnn::DataType::Signed32,
                m_OutputLayerNormWeightsTensor->GetTensorInfo().GetQuantizationScale() / 1024, 0);
        outputGateBiasDecoder = MakeDecoder<float>(
                outputGateBiasTensorInfo, m_OutputGateBiasTensor->GetConstTensor<void>());
    }

    // Initialize internal state tensors with zeroes.
    if (!cifgEnabled)
    {
        ZeroVector(*inputGateEncoder, stateTensorSize);
    }
    ZeroVector(*forgetGateEncoder, stateTensorSize);
    ZeroVector(*cellGateEncoder, stateTensorSize);
    ZeroVector(*outputGateEncoder, stateTensorSize);
    ZeroVector(*hiddenStateEncoder, stateTensorSize);

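    // Each gate's pre-activation is accumulated in two passes below: first the input contribution
    // (input weights * x_t), then the recurrent contribution (recurrent weights * h_{t-1}).
    // Peephole, layer normalization and bias terms are folded in afterwards, per gate.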
    // Input weights * Input
    if (!cifgEnabled)
    {
        MatrixBatchVectorMultiplyAccumulate(*inputToInputWeightsDecoder,
                                            numUnits, inputSize, *inputDecoder, numBatches, *inputGateEncoder);
    }

    MatrixBatchVectorMultiplyAccumulate(*inputToForgetWeightsDecoder,
                                        numUnits, inputSize, *inputDecoder, numBatches, *forgetGateEncoder);

    MatrixBatchVectorMultiplyAccumulate(*inputToCellWeightsDecoder,
                                        numUnits, inputSize, *inputDecoder, numBatches, *cellGateEncoder);

    MatrixBatchVectorMultiplyAccumulate(*inputToOutputWeightsDecoder,
                                        numUnits, inputSize, *inputDecoder, numBatches, *outputGateEncoder);

    // Recurrent weights * OutputStateIn
    if (!cifgEnabled)
    {
        MatrixBatchVectorMultiplyAccumulate(*recurrentToInputWeightsDecoder,
                                            numUnits, outputSize, *outputStateInDecoder, numBatches,
                                            *inputGateEncoder);
    }

    MatrixBatchVectorMultiplyAccumulate(*recurrentToForgetWeightsDecoder,
                                        numUnits, outputSize, *outputStateInDecoder, numBatches, *forgetGateEncoder);

    MatrixBatchVectorMultiplyAccumulate(*recurrentToCellWeightsDecoder,
                                        numUnits, outputSize, *outputStateInDecoder, numBatches, *cellGateEncoder);

    MatrixBatchVectorMultiplyAccumulate(*recurrentToOutputWeightsDecoder,
                                        numUnits, outputSize, *outputStateInDecoder, numBatches, *outputGateEncoder);

    // Input gate.
    if (!cifgEnabled)
    {
        if (peepholeEnabled)
        {
            VectorBatchVectorCwiseProductAccumulate(*cellToInputWeightsDecoder,
                                                    numUnits, *cellStateInDecoder, numBatches, *inputGateEncoder);
        }

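        // Requantization pattern used throughout this function: calling SetQuantizationScale() on a gate's
        // TensorInfo and then re-creating the encoder/decoder pair means subsequent writes quantize to, and
        // reads dequantize from, the new scale while reusing the same int16 buffer.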
        if (layerNormEnabled)
        {
            inputGateInfo.SetQuantizationScale(inputInfo.GetQuantizationScale() *
                                               m_InputLayerNormWeightsTensor->GetTensorInfo().GetQuantizationScale() *
                                               1024);
            inputGateEncoder = MakeEncoder<float>(inputGateInfo, inputGateData.data());

            MeanStddevNormalization(*inputGateDecoder,
                                    *inputGateEncoder, numUnits, numBatches, m_LayerNormEpsilon);

            inputGateDecoder = MakeDecoder<float>(inputGateInfo, inputGateData.data());

            VectorBatchVectorCwiseProduct(*inputLayerNormWeightsDecoder,
                                          numUnits, *inputGateDecoder, numBatches, *inputGateEncoder);

            inputGateInfo.SetQuantizationScale(1.f / 4096);
            inputGateEncoder = MakeEncoder<float>(inputGateInfo, inputGateData.data());

            VectorBatchVectorAdd(*inputGateBiasDecoder,
                                 numUnits, *inputGateDecoder, numBatches, *inputGateEncoder);

            inputGateDecoder = MakeDecoder<float>(inputGateInfo, inputGateData.data());
        }

        inputGateInfo.SetQuantizationScale(cellStateOutInfo.GetQuantizationScale());
        inputGateEncoder = MakeEncoder<float>(inputGateInfo, inputGateData.data());

        // Input gate sigmoid
        Activation(*inputGateDecoder, *inputGateEncoder,
                   TensorInfo({numUnits, numBatches}, internalType),
                   ActivationFunction::Sigmoid, 0, 0);

        inputGateDecoder = MakeDecoder<float>(inputGateInfo, inputGateData.data());
    }

    // Forget gate
    if (peepholeEnabled)
    {
        VectorBatchVectorCwiseProductAccumulate(*cellToForgetWeightsDecoder, numUnits,
                                                *cellStateInDecoder, numBatches, *forgetGateEncoder);
    }

    if (layerNormEnabled)
    {
        // Quantize layer norm output to Input Scale * m_ForgetLayerNormWeightsTensor * 1024
        forgetGateInfo.SetQuantizationScale(inputInfo.GetQuantizationScale() *
                                            m_ForgetLayerNormWeightsTensor->GetTensorInfo().GetQuantizationScale() *
                                            1024);
        forgetGateEncoder = MakeEncoder<float>(forgetGateInfo, forgetGateData.data());

        MeanStddevNormalization(*forgetGateDecoder,
                                *forgetGateEncoder, numUnits, numBatches, m_LayerNormEpsilon);

        forgetGateDecoder = MakeDecoder<float>(forgetGateInfo, forgetGateData.data());

        VectorBatchVectorCwiseProduct(*forgetLayerNormWeightsDecoder,
                                      numUnits, *forgetGateDecoder, numBatches, *forgetGateEncoder);

        // Dequantize layer norm output to (1 / 4096)
        forgetGateInfo.SetQuantizationScale(1.f / 4096);
        forgetGateEncoder = MakeEncoder<float>(forgetGateInfo, forgetGateData.data());

        VectorBatchVectorAdd(*forgetGateBiasDecoder,
                             numUnits, *forgetGateDecoder, numBatches, *forgetGateEncoder);

        forgetGateDecoder = MakeDecoder<float>(forgetGateInfo, forgetGateData.data());
    }

    forgetGateInfo.SetQuantizationScale(cellStateOutInfo.GetQuantizationScale());
    forgetGateEncoder = MakeEncoder<float>(forgetGateInfo, forgetGateData.data());

    // Forget gate sigmoid
    Activation(*forgetGateDecoder, *forgetGateEncoder,
               TensorInfo({numUnits, numBatches}, internalType),
               ActivationFunction::Sigmoid, 0, 0);

    forgetGateDecoder = MakeDecoder<float>(forgetGateInfo, forgetGateData.data());

    // Cell (Modulation) gate
    if (layerNormEnabled)
    {
        cellGateInfo.SetQuantizationScale(inputInfo.GetQuantizationScale() *
                                          m_CellLayerNormWeightsTensor->GetTensorInfo().GetQuantizationScale() *
                                          1024);
        cellGateEncoder = MakeEncoder<float>(cellGateInfo, cellGateData.data());

        MeanStddevNormalization(*cellGateDecoder, *cellGateEncoder, numUnits, numBatches, m_LayerNormEpsilon);

        cellGateDecoder = MakeDecoder<float>(cellGateInfo, cellGateData.data());

        VectorBatchVectorCwiseProduct(*cellLayerNormWeightsDecoder,
                                      numUnits, *cellGateDecoder, numBatches, *cellGateEncoder);

        cellGateInfo.SetQuantizationScale(1.f / 4096);
        cellGateEncoder = MakeEncoder<float>(cellGateInfo, cellGateData.data());

        VectorBatchVectorAdd(*cellGateBiasDecoder,
                             numUnits, *cellGateDecoder, numBatches, *cellGateEncoder);

        cellGateDecoder = MakeDecoder<float>(cellGateInfo, cellGateData.data());
    }

    cellGateInfo.SetQuantizationScale(cellStateOutInfo.GetQuantizationScale());
    cellGateEncoder = MakeEncoder<float>(cellGateInfo, cellGateData.data());

    // Cell (Modulation) gate tanH
    Activation(*cellGateDecoder, *cellGateEncoder,
               TensorInfo({numUnits, numBatches}, internalType),
               ActivationFunction::TanH, 1.0f, 1.0f);

    cellGateDecoder = MakeDecoder<float>(cellGateInfo, cellGateData.data());

    VectorVectorCwiseProduct(*forgetGateDecoder, *cellStateInDecoder, stateTensorSize, *cellStateOutEncoder);

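    // Cell state update: c_t = f_t . c_{t-1} + i_t . g_t. With CIFG the input gate is coupled to the
    // forget gate, so Sub1Vector() computes (1 - f_t) in place and that is used instead of i_t.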
    if (cifgEnabled)
    {
        Sub1Vector(*forgetGateDecoder, stateTensorSize, *forgetGateEncoder);
        VectorVectorCwiseProductAccumulate(
                *cellGateDecoder, *forgetGateDecoder, stateTensorSize, *cellStateOutEncoder);
    }
    else
    {
        VectorVectorCwiseProductAccumulate(
                *cellGateDecoder, *inputGateDecoder, stateTensorSize, *cellStateOutEncoder);
    }

    // Final cell state out calculated here
    if (m_Data.m_Parameters.m_CellClip > 0.0)
    {
        ClipVector(*cellStateOutDecoder, stateTensorSize, m_Data.m_Parameters.m_CellClip, *cellStateOutEncoder);
    }

    // Output gate.
    if (peepholeEnabled)
    {
        VectorBatchVectorCwiseProductAccumulate(*cellToOutputWeightsDecoder,
                                                numUnits, *cellStateOutDecoder, numBatches, *outputGateEncoder);
    }

    if (layerNormEnabled)
    {
        outputGateInfo.SetQuantizationScale(inputInfo.GetQuantizationScale() *
                                            m_OutputLayerNormWeightsTensor->GetTensorInfo().GetQuantizationScale() *
                                            1024);
        outputGateEncoder = MakeEncoder<float>(outputGateInfo, outputGateData.data());

        MeanStddevNormalization(*outputGateDecoder, *outputGateEncoder, numUnits, numBatches, m_LayerNormEpsilon);

        outputGateDecoder = MakeDecoder<float>(outputGateInfo, outputGateData.data());

        VectorBatchVectorCwiseProduct(*outputLayerNormWeightsDecoder, numUnits, *outputGateDecoder,
                                      numBatches, *outputGateEncoder);

        outputGateInfo.SetQuantizationScale(1.f / 4096);
        outputGateEncoder = MakeEncoder<float>(outputGateInfo, outputGateData.data());

        VectorBatchVectorAdd(*outputGateBiasDecoder, numUnits, *outputGateDecoder, numBatches, *outputGateEncoder);

        outputGateDecoder = MakeDecoder<float>(outputGateInfo, outputGateData.data());
    }

    outputGateInfo.SetQuantizationScale(cellStateOutInfo.GetQuantizationScale());
    outputGateEncoder = MakeEncoder<float>(outputGateInfo, outputGateData.data());

    // Output gate sigmoid
    Activation(*outputGateDecoder, *outputGateEncoder,
               TensorInfo({numUnits, numBatches}, internalType),
               ActivationFunction::Sigmoid, 0, 0);

    outputGateDecoder = MakeDecoder<float>(outputGateInfo, outputGateData.data());

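    // Hidden state: h_t = o_t . tanh(c_t). The tanh result is written through cellGateEncoder, reusing the
    // cell gate buffer as scratch before the element-wise product with the output gate.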
    // Hidden state tanH
    Activation(*cellStateOutDecoder, *cellGateEncoder,
               TensorInfo({numUnits, numBatches}, internalType),
               ActivationFunction::TanH, 1.0f, 1.0f);

    // Final hidden state output
    VectorVectorCwiseProduct(*outputGateDecoder, *cellGateDecoder, stateTensorSize, *hiddenStateEncoder);

    // Projection
    if (projectionEnabled)
    {
        if (m_ProjectionBiasTensor)
        {
            VectorBatchVectorAssign(*projectionBiasDecoder, outputSize, numBatches, *outputInt16Encoder);
        }

        MatrixBatchVectorMultiplyAccumulate(*projectionWeightsDecoder, outputSize, numUnits, *hiddenStateDecoder,
                                            numBatches, *outputInt16Encoder);

        CopyVector(*outputInt16Decoder, numBatches * outputSize, *outputEncoder);

        if (m_Data.m_Parameters.m_ProjectionClip > 0.0)
        {
            ClipVector(*outputDecoder, numBatches * outputSize, m_Data.m_Parameters.m_ProjectionClip, *outputEncoder);
        }
    }
    else
    {
        // Output has same quantization scale as hidden state if projection is disabled
        CopyVector(*hiddenStateDecoder, numBatches * outputSize, *outputEncoder);
    }

    // output == outputStateOut
    CopyVector(*outputDecoder, numBatches * outputSize, *outputStateOutEncoder);
}

} //namespace armnn