//
// Copyright © 2017 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "RefLstmFloat32Workload.hpp"
#include "RefWorkloadUtils.hpp"
#include "Activation.hpp"

#include <cstring>
#include <memory>
#include <string>

namespace
{

// Helper functions ported from the Android code base
// Refer to: android/external/tensorflow/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc

void MatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                         uint32_t mRows,
                                         uint32_t mCols,
                                         const float* vector,
                                         uint32_t nBatch,
                                         float* outResult,
                                         int resultStride = 1)
{
    float* resultInBatch = outResult;
    for (uint32_t b = 0; b < nBatch; b++)
    {
        const float* matrixPtr = matrix;
        for (uint32_t r = 0; r < mRows; r++)
        {
            const float* vectorInBatch = vector + b * mCols;
            for (uint32_t c = 0; c < mCols; c++)
            {
                *resultInBatch += *matrixPtr++ * *vectorInBatch++;
            }
            resultInBatch += resultStride;
        }
    }
}
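// Illustrative example (not from the original source): with mRows = 2,
// mCols = 3, nBatch = 1, a row-major matrix {1, 2, 3, 4, 5, 6} and
// vector {1, 1, 1}, the function above accumulates {6, 15} into outResult,
// i.e. one dot product per matrix row, repeated per batch, advancing
// outResult by resultStride after each row.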
void VectorBatchVectorAssign(const float* vector,
                             uint32_t vSize,
                             uint32_t nBatch,
                             float* outBatchVector)
{
    for (uint32_t b = 0; b < nBatch; b++)
    {
        memcpy(outBatchVector + b * vSize, vector, vSize * sizeof(float));
    }
}

void VectorBatchVectorCwiseProductAccumulate(const float* vector,
                                             uint32_t vSize,
                                             const float* batchVector,
                                             uint32_t nBatch,
                                             float* outResult)
{
    for (uint32_t b = 0; b < nBatch; b++)
    {
        for (uint32_t v = 0; v < vSize; v++)
        {
            *outResult++ += vector[v] * *batchVector++;
        }
    }
}

void Sub1Vector(const float* vector, uint32_t vSize, float* result)
{
    for (uint32_t v = 0; v < vSize; v++)
    {
        *result++ = 1.0f - *vector++;
    }
}

void VectorVectorCwiseProduct(const float* vector1, const float* vector2, uint32_t vSize, float* outResult)
{
    for (uint32_t v = 0; v < vSize; v++)
    {
        *outResult++ = *vector1++ * *vector2++;
    }
}

void VectorVectorCwiseProductAccumulate(const float* vector1, const float* vector2, uint32_t vSize, float* outResult)
{
    for (uint32_t v = 0; v < vSize; v++)
    {
        *outResult++ += *vector1++ * *vector2++;
    }
}

float Clip(float f, float absLimit)
{
    float result = (absLimit < f) ? absLimit : f;
    result = (-absLimit > result) ? -absLimit : result;
    return result;
}

void ClipVector(const float* vector, uint32_t vSize, float absLimit, float* outResult)
{
    for (uint32_t v = 0; v < vSize; v++)
    {
        *outResult++ = Clip(*vector++, absLimit);
    }
}

void CopyVector(const float* vector, uint32_t vSize, float* outResult)
{
    memcpy(outResult, vector, vSize * sizeof(float));
}

void SetActivationParameters(uint32_t activation,
                             armnn::ActivationFunction& outArmnnActivation,
                             float& outA,
                             float& outB)
{
    switch (activation)
    {
    case 0: // None
        outA = 0;
        outB = 0;
        return;

    case 1: // Relu
        outArmnnActivation = armnn::ActivationFunction::ReLu;
        outA = 0;
        outB = 0;
        return;

    case 3: // Relu6
        outArmnnActivation = armnn::ActivationFunction::BoundedReLu;
        outA = 6;
        outB = 0;
        return;

    case 4: // Tanh
        outArmnnActivation = armnn::ActivationFunction::TanH;
        outA = 1;
        outB = 1;
        return;

    case 6: // Sigmoid
        outArmnnActivation = armnn::ActivationFunction::Sigmoid;
        outA = 0;
        outB = 0;
        return;

    default:
        throw armnn::Exception("Unsupported activation function: " + std::to_string(activation));
    }
}

std::unique_ptr<armnn::ScopedCpuTensorHandle> AssignScopedCpuTensorHandle(const armnn::ConstCpuTensorHandle* ptr)
{
    if (!ptr)
    {
        return nullptr;
    }
    return std::make_unique<armnn::ScopedCpuTensorHandle>(*ptr);
}

} // anonymous namespace

namespace armnn
{

RefLstmFloat32Workload::RefLstmFloat32Workload(const LstmQueueDescriptor &descriptor, const WorkloadInfo &info)
    : Float32Workload<LstmQueueDescriptor>(descriptor, info)
    , m_InputToInputWeightsTensor     (AssignScopedCpuTensorHandle(descriptor.m_InputToInputWeights))
    , m_InputToForgetWeightsTensor    (AssignScopedCpuTensorHandle(descriptor.m_InputToForgetWeights))
    , m_InputToCellWeightsTensor      (AssignScopedCpuTensorHandle(descriptor.m_InputToCellWeights))
    , m_InputToOutputWeightsTensor    (AssignScopedCpuTensorHandle(descriptor.m_InputToOutputWeights))
    , m_RecurrentToInputWeightsTensor (AssignScopedCpuTensorHandle(descriptor.m_RecurrentToInputWeights))
    , m_RecurrentToForgetWeightsTensor(AssignScopedCpuTensorHandle(descriptor.m_RecurrentToForgetWeights))
    , m_RecurrentToCellWeightsTensor  (AssignScopedCpuTensorHandle(descriptor.m_RecurrentToCellWeights))
    , m_RecurrentToOutputWeightsTensor(AssignScopedCpuTensorHandle(descriptor.m_RecurrentToOutputWeights))
    , m_CellToInputWeightsTensor      (AssignScopedCpuTensorHandle(descriptor.m_CellToInputWeights))
    , m_CellToForgetWeightsTensor     (AssignScopedCpuTensorHandle(descriptor.m_CellToForgetWeights))
    , m_CellToOutputWeightsTensor     (AssignScopedCpuTensorHandle(descriptor.m_CellToOutputWeights))
    , m_InputGateBiasTensor           (AssignScopedCpuTensorHandle(descriptor.m_InputGateBias))
    , m_ForgetGateBiasTensor          (AssignScopedCpuTensorHandle(descriptor.m_ForgetGateBias))
    , m_CellBiasTensor                (AssignScopedCpuTensorHandle(descriptor.m_CellBias))
    , m_OutputGateBiasTensor          (AssignScopedCpuTensorHandle(descriptor.m_OutputGateBias))
    , m_ProjectionWeightsTensor       (AssignScopedCpuTensorHandle(descriptor.m_ProjectionWeights))
    , m_ProjectionBiasTensor          (AssignScopedCpuTensorHandle(descriptor.m_ProjectionBias))
{}
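// For reference, Execute() evaluates the usual peephole/projection LSTM cell
// formulation (sigma = logistic sigmoid, g = the configured m_ActivationFunc,
// "." = elementwise product; peephole terms apply only when enabled):
//
//   i_t = sigma(W_xi x_t + W_hi h_{t-1} + w_ci . c_{t-1} + b_i)   // input gate (skipped with CIFG)
//   f_t = sigma(W_xf x_t + W_hf h_{t-1} + w_cf . c_{t-1} + b_f)   // forget gate
//   c_t = f_t . c_{t-1} + i_t . g(W_xc x_t + W_hc h_{t-1} + b_c)  // cell state
//   o_t = sigma(W_xo x_t + W_ho h_{t-1} + w_co . c_t + b_o)       // output gate
//   h_t = o_t . g(c_t)                                            // output, optionally projected and clipped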
void RefLstmFloat32Workload::Execute() const
{
    // This is a port of the LSTM::Eval() method in the Android code base
    // Refer to: android/frameworks/ml/nn/common/operations/LSTM.cpp

    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
    const TensorShape& inputShape = inputInfo.GetShape();

    float* scratchBuffer  = GetOutputTensorDataFloat(0, m_Data);
    float* outputStateOut = GetOutputTensorDataFloat(1, m_Data);
    float* cellStateOut   = GetOutputTensorDataFloat(2, m_Data);
    float* output         = GetOutputTensorDataFloat(3, m_Data);

    const float* inputData     = GetInputTensorDataFloat(0, m_Data);
    const float* outputStateIn = GetInputTensorDataFloat(1, m_Data);
    const float* cellStateIn   = GetInputTensorDataFloat(2, m_Data);

    const uint32_t nBatch  = inputShape[0];
    const uint32_t nInput  = inputShape[1];
    const uint32_t nCell   = m_InputToOutputWeightsTensor->GetShape()[0];
    const uint32_t nOutput = m_RecurrentToOutputWeightsTensor->GetShape()[1];

    const bool useCifg     = m_Data.m_Parameters.m_CifgEnabled;
    const bool usePeephole = m_Data.m_Parameters.m_PeepholeEnabled;

    // Index the scratch buffers' pointers into the global scratch buffer.
    float* inputGateScratch  = nullptr;
    float* cellScratch       = nullptr;
    float* forgetGateScratch = nullptr;
    float* outputGateScratch = nullptr;

    if (useCifg)
    {
        cellScratch       = scratchBuffer + 0 * nCell * nBatch;
        forgetGateScratch = scratchBuffer + 1 * nCell * nBatch;
        outputGateScratch = scratchBuffer + 2 * nCell * nBatch;
    }
    else
    {
        inputGateScratch  = scratchBuffer + 0 * nCell * nBatch;
        cellScratch       = scratchBuffer + 1 * nCell * nBatch;
        forgetGateScratch = scratchBuffer + 2 * nCell * nBatch;
        outputGateScratch = scratchBuffer + 3 * nCell * nBatch;
    }

    // Initialize scratch buffers with bias.
    if (!useCifg)
    {
        VectorBatchVectorAssign(m_InputGateBiasTensor->GetTensor(),
                                nCell, nBatch, inputGateScratch);
    }
    VectorBatchVectorAssign(m_ForgetGateBiasTensor->GetTensor(),
                            nCell, nBatch, forgetGateScratch);
    VectorBatchVectorAssign(m_CellBiasTensor->GetTensor(),
                            nCell, nBatch, cellScratch);
    VectorBatchVectorAssign(m_OutputGateBiasTensor->GetTensor(),
                            nCell, nBatch, outputGateScratch);

    // For each batch and cell: compute input_weight * input.
    if (!useCifg)
    {
        MatrixBatchVectorMultiplyAccumulate(m_InputToInputWeightsTensor->GetTensor(),
                                            nCell, nInput, inputData, nBatch, inputGateScratch);
    }
    MatrixBatchVectorMultiplyAccumulate(m_InputToForgetWeightsTensor->GetTensor(),
                                        nCell, nInput, inputData, nBatch, forgetGateScratch);
    MatrixBatchVectorMultiplyAccumulate(m_InputToCellWeightsTensor->GetTensor(),
                                        nCell, nInput, inputData, nBatch, cellScratch);
    MatrixBatchVectorMultiplyAccumulate(m_InputToOutputWeightsTensor->GetTensor(),
                                        nCell, nInput, inputData, nBatch, outputGateScratch);

    // For each batch and cell: compute recurrent_weight * output_state.
    if (!useCifg)
    {
        MatrixBatchVectorMultiplyAccumulate(m_RecurrentToInputWeightsTensor->GetTensor(),
                                            nCell, nOutput, outputStateIn, nBatch, inputGateScratch);
    }
    MatrixBatchVectorMultiplyAccumulate(m_RecurrentToForgetWeightsTensor->GetTensor(),
                                        nCell, nOutput, outputStateIn, nBatch, forgetGateScratch);
    MatrixBatchVectorMultiplyAccumulate(m_RecurrentToCellWeightsTensor->GetTensor(),
                                        nCell, nOutput, outputStateIn, nBatch, cellScratch);
    MatrixBatchVectorMultiplyAccumulate(m_RecurrentToOutputWeightsTensor->GetTensor(),
                                        nCell, nOutput, outputStateIn, nBatch, outputGateScratch);

    // For each batch and cell: update the input gate.
    if (!useCifg)
    {
        if (usePeephole)
        {
            VectorBatchVectorCwiseProductAccumulate(m_CellToInputWeightsTensor->GetTensor(),
                                                    nCell, cellStateIn, nBatch, inputGateScratch);
        }
        Activation(inputGateScratch, inputGateScratch,
                   TensorInfo({nCell, nBatch}, DataType::Float32),
                   ActivationFunction::Sigmoid, 0, 0);
    }

    // For each batch and cell: update the forget gate.
    if (usePeephole)
    {
        VectorBatchVectorCwiseProductAccumulate(m_CellToForgetWeightsTensor->GetTensor(),
                                                nCell, cellStateIn, nBatch, forgetGateScratch);
    }
    Activation(forgetGateScratch, forgetGateScratch,
               TensorInfo({nCell, nBatch}, DataType::Float32),
               ActivationFunction::Sigmoid, 0, 0);
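    // Note on the CIFG variant: there is no separate input gate, so the cell
    // update below reuses the forget gate, taking i_t = 1 - f_t (computed in
    // place by Sub1Vector).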
    // For each batch and cell: update the cell.
    VectorVectorCwiseProduct(forgetGateScratch, cellStateIn, nBatch * nCell, cellStateOut);

    ActivationFunction armnnActivationFunc = ActivationFunction::Sigmoid;
    float a = 0;
    float b = 0;
    SetActivationParameters(m_Data.m_Parameters.m_ActivationFunc, armnnActivationFunc, a, b);

    if (m_Data.m_Parameters.m_ActivationFunc > 0)
    {
        Activation(cellScratch, cellScratch,
                   TensorInfo({nCell, nBatch}, DataType::Float32),
                   armnnActivationFunc, a, b);
    }
    if (useCifg)
    {
        Sub1Vector(forgetGateScratch, nBatch * nCell, forgetGateScratch);
        VectorVectorCwiseProductAccumulate(cellScratch, forgetGateScratch, nBatch * nCell, cellStateOut);
    }
    else
    {
        VectorVectorCwiseProductAccumulate(cellScratch, inputGateScratch, nBatch * nCell, cellStateOut);
    }
    if (m_Data.m_Parameters.m_ClippingThresCell > 0.0)
    {
        ClipVector(cellStateOut, nBatch * nCell, m_Data.m_Parameters.m_ClippingThresCell, cellStateOut);
    }

    // For each batch and cell: update the output gate.
    if (usePeephole)
    {
        VectorBatchVectorCwiseProductAccumulate(m_CellToOutputWeightsTensor->GetTensor(),
                                                nCell, cellStateOut, nBatch, outputGateScratch);
    }
    Activation(outputGateScratch, outputGateScratch,
               TensorInfo({nCell, nBatch}, DataType::Float32),
               ActivationFunction::Sigmoid, 0, 0);

    if (m_Data.m_Parameters.m_ActivationFunc > 0)
    {
        Activation(cellStateOut, cellScratch,
                   TensorInfo({nCell, nBatch}, DataType::Float32),
                   armnnActivationFunc, a, b);
    }
    VectorVectorCwiseProduct(outputGateScratch, cellScratch, nBatch * nCell, outputGateScratch);

    // For each batch: update the projection and output_state.
    if (m_Data.m_Parameters.m_ProjectionEnabled)
    {
        if (m_ProjectionBiasTensor)
        {
            VectorBatchVectorAssign(m_ProjectionBiasTensor->GetTensor(),
                                    nOutput, nBatch, output);
        }
        MatrixBatchVectorMultiplyAccumulate(m_ProjectionWeightsTensor->GetTensor(),
                                            nOutput, nCell, outputGateScratch, nBatch, output);

        if (m_Data.m_Parameters.m_ClippingThresProj > 0.0)
        {
            ClipVector(output, nBatch * nOutput, m_Data.m_Parameters.m_ClippingThresProj, output);
        }
    }
    else
    {
        CopyVector(outputGateScratch, nBatch * nOutput, output);
    }

    CopyVector(output, nBatch * nOutput, outputStateOut);
}

} //namespace armnn