ArmNN
 23.11
FuseBatchNorm.hpp
//
// Copyright © 2020,2022 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#pragma once

#include "Optimization.hpp"
#include <armnnUtils/DataLayoutIndexed.hpp>
#include <ResolveType.hpp>

namespace armnn
{
namespace optimizations
{

template<typename ConvLayer, armnn::DataType ArmnnType,
         typename T = armnn::ResolveType<ArmnnType>>
class FuseBatchNorm
{
public:
    /// Run for every exclusive connection between any base Convolution layer and a child BatchNorm layer, for
    /// non-quantized layers.
    /// The child will be removed, and the base will be removed if it is left unconnected. A new Convolution layer
    /// will be added; its weights and bias are calculated from the weights and bias of the base Convolution layer
    /// combined with the parameters of the child BatchNorm layer.
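    ///
    /// The fusion rests on the identity BatchNorm(Conv(x)) = gamma * (W*x + b - mean) / sqrt(variance + epsilon)
    /// + beta, which folds, per output channel, into a single convolution with weights
    /// W' = (gamma / sqrt(variance + epsilon)) * W and bias b' = (gamma * (b - mean)) / sqrt(variance + epsilon) + beta.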
    void Run(Graph& graph, InputSlot& connection) const
    {
        Layer& base  = connection.GetConnectedOutputSlot()->GetOwningLayer();
        Layer& child = connection.GetOwningLayer();

        bool depthwise = (base.GetType() == LayerType::DepthwiseConvolution2d);

        ARMNN_ASSERT(base.GetType() == LayerType::Convolution2d || depthwise);
        ARMNN_ASSERT(child.GetType() == LayerType::BatchNormalization);

        if (base.GetDataType() == ArmnnType && child.GetDataType() == ArmnnType)
        {
            OutputSlot* parentOut = base.GetInputSlot(0).GetConnectedOutputSlot();
            auto convLayer      = PolymorphicDowncast<ConvLayer*>(&base);
            auto batchNormLayer = PolymorphicDowncast<BatchNormalizationLayer*>(&child);

            // Read convolution and batch norm parameters
            BatchNormalizationDescriptor batchNormDescriptor = batchNormLayer->GetParameters();
            auto epsilon = batchNormDescriptor.m_Eps;
            IgnoreUnused(epsilon);

            ConstTensor betaTensor (batchNormLayer->m_Beta->GetTensorInfo(),     batchNormLayer->m_Beta->Map(true));
            ConstTensor gammaTensor(batchNormLayer->m_Gamma->GetTensorInfo(),    batchNormLayer->m_Gamma->Map(true));
            ConstTensor meanTensor (batchNormLayer->m_Mean->GetTensorInfo(),     batchNormLayer->m_Mean->Map(true));
            ConstTensor varTensor  (batchNormLayer->m_Variance->GetTensorInfo(), batchNormLayer->m_Variance->Map(true));

            auto convDescriptor = convLayer->GetParameters();
            ConstTensor weightsTensor;
            ARMNN_ASSERT_MSG(convLayer->GetInputSlots()[1].GetConnection() != nullptr,
                             "FuseBatchNorm: Weight data should not be null.");

            ConstantLayer* weightLayer = PolymorphicDowncast<ConstantLayer*>(
                &base.GetInputSlot(1).GetConnectedOutputSlot()->GetOwningLayer());

            weightsTensor = ConstTensor(weightLayer->m_LayerOutput->GetTensorInfo(),
                                        weightLayer->m_LayerOutput->Map(true));

            armnnUtils::DataLayoutIndexed dataLayout(convDescriptor.m_DataLayout);
            auto weightsShape = weightsTensor.GetInfo().GetShape();
            const unsigned int inputChannels   = parentOut->GetTensorInfo().GetShape()[dataLayout.GetChannelsIndex()];
            const unsigned int depthMultiplier = depthwise ? weightsShape[3] / inputChannels : 1;
            const unsigned int outputChannels  = depthwise ? weightsShape[3] : weightsShape[0];
            const unsigned int weightsHeight   = depthwise ? weightsShape[1] :
                                                 weightsShape[dataLayout.GetHeightIndex()];
            const unsigned int weightsWidth    = depthwise ? weightsShape[2] :
                                                 weightsShape[dataLayout.GetWidthIndex()];

            const auto* weightsBuffer = static_cast<const T*>(weightsTensor.GetMemoryArea());
            const auto* betaBuffer    = static_cast<const T*>(betaTensor.GetMemoryArea());
            const auto* gammaBuffer   = static_cast<const T*>(gammaTensor.GetMemoryArea());
            const auto* meanBuffer    = static_cast<const T*>(meanTensor.GetMemoryArea());
            const auto* varBuffer     = static_cast<const T*>(varTensor.GetMemoryArea());

            std::vector<T> weightsVector (weightsBuffer, weightsBuffer + weightsTensor.GetNumElements());
            std::vector<T> betaVector    (betaBuffer,    betaBuffer    + betaTensor.GetNumElements());
            std::vector<T> gammaVector   (gammaBuffer,   gammaBuffer   + gammaTensor.GetNumElements());
            std::vector<T> meanVector    (meanBuffer,    meanBuffer    + meanTensor.GetNumElements());
            std::vector<T> varianceVector(varBuffer,     varBuffer     + varTensor.GetNumElements());

            // fusedWeights = (gamma * weights) / sqrt(variance + epsilon)
            std::vector<T> fusedWeightsVector(weightsVector.size());
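
            // Weight layouts matched by the index arithmetic below: depthwise weights are
            // [1, H, W, Cout] with Cout = Cin * depthMultiplier; Conv2d weights are
            // [Cout, H, W, Cin] for NHWC and [Cout, Cin, H, W] for NCHW.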
            for (unsigned int cInput = 0; cInput < inputChannels; ++cInput)
            {
                for (unsigned int cOut = 0; cOut < outputChannels; ++cOut)
                {
                    T mult = gammaVector[cOut] / static_cast<T>(sqrtf(varianceVector[cOut] + epsilon));

                    for (unsigned int h = 0; h < weightsHeight; ++h)
                    {
                        for (unsigned int w = 0; w < weightsWidth; ++w)
                        {
                            unsigned int weightsIdx = 0;

                            if (depthwise)
                            {
                                cInput = cOut / depthMultiplier;
                                weightsIdx = w * outputChannels + cOut +
                                             h * weightsWidth * outputChannels;
                            }
                            else if (convDescriptor.m_DataLayout == DataLayout::NHWC)
                            {
                                weightsIdx = cOut * weightsHeight * weightsWidth * inputChannels +
                                             h * weightsWidth * inputChannels +
                                             w * inputChannels +
                                             cInput;
                            }
                            else
                            {
                                weightsIdx = cOut * weightsWidth * weightsHeight * inputChannels +
                                             cInput * weightsWidth * weightsHeight +
                                             h * weightsWidth +
                                             w;
                            }
                            fusedWeightsVector[weightsIdx] = mult * weightsVector[weightsIdx];
                        }
                    }
                }
            }
            ConstTensor fusedWeightsTensor(weightsTensor.GetInfo(), fusedWeightsVector);

            // fusedBias = (gamma * (bias - mean)) / sqrt(variance + epsilon) + beta
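            // (when the convolution has no bias, the same formula is applied with bias = 0,
            //  reducing to beta - (gamma * mean) / sqrt(variance + epsilon))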
            std::vector<T> fusedBiasVector(outputChannels);
            bool biasWasEnabledBeforeOpt = convDescriptor.m_BiasEnabled;
            if (biasWasEnabledBeforeOpt)
            {
                ConstTensor biasTensor;
                ARMNN_ASSERT_MSG(convLayer->GetInputSlots()[2].GetConnection() != nullptr,
                                 "FuseBatchNorm: Bias data should not be null if bias is enabled.");

                ConstantLayer* biasLayer = PolymorphicDowncast<ConstantLayer*>(
                    &base.GetInputSlot(2).GetConnectedOutputSlot()->GetOwningLayer());

                biasTensor = ConstTensor(biasLayer->m_LayerOutput->GetTensorInfo(),
                                         biasLayer->m_LayerOutput->Map(true));

                const auto* biasBuffer = static_cast<const T*>(biasTensor.GetMemoryArea());
                std::vector<T> biasVector(biasBuffer, biasBuffer + biasTensor.GetNumElements());

                for (unsigned int cOut = 0; cOut < outputChannels; ++cOut)
                {
                    fusedBiasVector[cOut] = ((gammaVector[cOut] * (biasVector[cOut] - meanVector[cOut])) /
                                             sqrtf(varianceVector[cOut] + epsilon)) + betaVector[cOut];
                }
            }
            else
            {
                convDescriptor.m_BiasEnabled = true;
                std::vector<T> biasVector(outputChannels, T(0));

                for (unsigned int cOut = 0; cOut < outputChannels; ++cOut)
                {
                    fusedBiasVector[cOut] = ((gammaVector[cOut] * (biasVector[cOut] - meanVector[cOut])) /
                                             sqrtf(varianceVector[cOut] + epsilon)) + betaVector[cOut];
                }
            }
            ConstTensor fusedBiasTensor(TensorInfo({outputChannels}, ArmnnType, 0.0f, 0, true), fusedBiasVector);

            // Insert the new convolution layer with the batch norm parameters fused in
            const std::string name = std::string("fused-") + child.GetName() + std::string("-into-") + base.GetName();
            auto& newConv2dLayer = *graph.InsertNewLayer<ConvLayer>(base.GetInputSlot(0),
                                                                    convDescriptor,
                                                                    name.c_str());

            // Connect the weights and bias from the old to the new Conv2d layer.
            // The base Conv2d layer of this optimization always has 3 input slots.
            if (newConv2dLayer.GetNumInputSlots() > 1)
            {
                // Remove the old connection and connect to the new Conv2d layer.
                weightLayer->GetOutputSlot(0).Disconnect(base.GetInputSlot(1));
                weightLayer->GetOutputSlot(0).Connect(newConv2dLayer.GetInputSlot(1));
                weightLayer->m_LayerOutput = std::make_unique<ScopedTensorHandle>(fusedWeightsTensor);

                // Move the bias constant layer as normal if bias was enabled before the optimisation
                ConstantLayer* biasLayer;
                if (biasWasEnabledBeforeOpt)
                {
                    biasLayer = PolymorphicDowncast<ConstantLayer*>(
                        &base.GetInputSlot(2).GetConnectedOutputSlot()->GetOwningLayer());
                    // Remove the old connection and connect to the new Conv2d layer.
                    biasLayer->GetOutputSlot(0).Disconnect(base.GetInputSlot(2));
                    biasLayer->GetOutputSlot(0).Connect(newConv2dLayer.GetInputSlot(2));
                }
                // Otherwise create a new bias layer and add it to the new convolution2d
                else
                {
                    // Add in a bias constant layer
                    biasLayer = graph.AddLayer<ConstantLayer>("Bias");
                    biasLayer->GetOutputSlot(0).SetTensorInfo(fusedBiasTensor.GetInfo());
                    biasLayer->GetOutputSlot(0).Connect(newConv2dLayer.GetInputSlot(2));
                }
                biasLayer->m_LayerOutput = std::make_unique<ScopedTensorHandle>(ConstTensor(fusedBiasTensor));
            }

            // Reconnects with original parent.
            newConv2dLayer.GetOutputSlot().MoveAllConnections(*parentOut);
            // Parent is now the new convolution2d layer.
            parentOut = &newConv2dLayer.GetOutputSlot();

            // Moves connections in child output to parent layer.
            // Child layer will be removed as it's left unconnected.
            // Base layer will be removed if left unconnected.
            child.GetOutputSlot().MoveAllConnections(*parentOut);
        }
    }

protected:
    FuseBatchNorm()  = default;
    ~FuseBatchNorm() = default;
};
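
// The protected constructor and destructor mean FuseBatchNorm is only instantiated through the
// OptimizeForExclusiveConnection wrappers below, which derive from the wrapped optimization class.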

using FuseBatchNormIntoConvolution2DFloat32 =
    OptimizeForExclusiveConnection<Convolution2dLayer,
                                   BatchNormalizationLayer,
                                   FuseBatchNorm<Convolution2dLayer, armnn::DataType::Float32>>;

using FuseBatchNormIntoConvolution2DFloat16 =
    OptimizeForExclusiveConnection<Convolution2dLayer,
                                   BatchNormalizationLayer,
                                   FuseBatchNorm<Convolution2dLayer, armnn::DataType::Float16>>;

using FuseBatchNormIntoDepthwiseConvolution2DFloat32 =
    OptimizeForExclusiveConnection<DepthwiseConvolution2dLayer,
                                   BatchNormalizationLayer,
                                   FuseBatchNorm<DepthwiseConvolution2dLayer, armnn::DataType::Float32>>;

using FuseBatchNormIntoDepthwiseConvolution2DFloat16 =
    OptimizeForExclusiveConnection<DepthwiseConvolution2dLayer,
                                   BatchNormalizationLayer,
                                   FuseBatchNorm<DepthwiseConvolution2dLayer, armnn::DataType::Float16>>;

} // namespace optimizations
} // namespace armnn
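
The fused parameters computed in Run follow from substituting the convolution output into the
batch normalization formula. Per output channel c, with BatchNorm parameters gamma, beta,
mean mu, variance sigma^2 and epsilon:

    \[
    \mathrm{BN}\bigl(\mathrm{Conv}(x)\bigr)_c
        = \gamma_c \,\frac{(W \ast x)_c + b_c - \mu_c}{\sqrt{\sigma_c^2 + \epsilon}} + \beta_c
        = (W' \ast x)_c + b'_c,
    \qquad
    W'_c = \frac{\gamma_c}{\sqrt{\sigma_c^2 + \epsilon}}\, W_c,
    \qquad
    b'_c = \frac{\gamma_c \,(b_c - \mu_c)}{\sqrt{\sigma_c^2 + \epsilon}} + \beta_c.
    \]

For context, a minimal sketch of applying one of these optimizations to a graph. It assumes the
Optimizer::Pass / MakeOptimizations pattern used elsewhere in ArmNN; the function name and the
surrounding setup are hypothetical, and the graph must already contain an exclusive
Convolution2d -> BatchNormalization connection in Float32 with constant weights.

    #include <Graph.hpp>
    #include <Optimizer.hpp>

    void FuseConvolutionBatchNorms(armnn::Graph& graph)
    {
        using namespace armnn::optimizations;
        // Rewrites every exclusive Convolution2d -> BatchNormalization pair in the graph
        // into a single Convolution2d whose weights and bias have the batch norm folded in.
        armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(FuseBatchNormIntoConvolution2DFloat32()));
    }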