14 namespace optimizations
40 auto convLayer = PolymorphicDowncast<ConvLayer*>(&base);
41 auto batchNormLayer = PolymorphicDowncast<BatchNormalizationLayer*>(&child);
45 auto epsilon = batchNormDescriptor.
m_Eps;
48 ConstTensor betaTensor(batchNormLayer->m_Beta->GetTensorInfo(), batchNormLayer->m_Beta->Map(
true));
49 ConstTensor gammaTensor(batchNormLayer->m_Gamma->GetTensorInfo(), batchNormLayer->m_Gamma->Map(
true));
50 ConstTensor meanTensor(batchNormLayer->m_Mean->GetTensorInfo(), batchNormLayer->m_Mean->Map(
true));
51 ConstTensor varTensor(batchNormLayer->m_Variance->GetTensorInfo(), batchNormLayer->m_Variance->Map(
true));
53 auto convDescriptor = convLayer->GetParameters();
56 "FuseBatchNorm: Weight data should not be null.");
58 ConstantLayer* weightLayer = PolymorphicDowncast<ConstantLayer*>(
61 weightsTensor =
ConstTensor(weightLayer->m_LayerOutput->GetTensorInfo(),
62 weightLayer->m_LayerOutput->Map(
true));
65 auto weightsShape = weightsTensor.GetInfo().GetShape();
66 const unsigned int inputChannels = parentOut->
GetTensorInfo().
GetShape()[dataLayout.GetChannelsIndex()];
67 const unsigned int depthMultiplier = depthwise ? weightsShape[3] / inputChannels : 1;
68 const unsigned int outputChannels = depthwise ? weightsShape[3] : weightsShape[0];
69 const unsigned int weightsHeight = depthwise ? weightsShape[1] :
70 weightsShape[dataLayout.GetHeightIndex()];
71 const unsigned int weightsWidth = depthwise ? weightsShape[2] :
72 weightsShape[dataLayout.GetWidthIndex()];
74 const auto* weightsBuffer =
static_cast<const T*
>(weightsTensor.GetMemoryArea());
75 const auto* betaBuffer =
static_cast<const T*
>(betaTensor.GetMemoryArea());
76 const auto* gammaBuffer =
static_cast<const T*
>(gammaTensor.GetMemoryArea());
77 const auto* meanBuffer =
static_cast<const T*
>(meanTensor.GetMemoryArea());
78 const auto* varBuffer =
static_cast<const T*
>(varTensor.GetMemoryArea());
80 std::vector<T> weightsVector (weightsBuffer, weightsBuffer + weightsTensor.GetNumElements());
81 std::vector<T> betaVector (betaBuffer, betaBuffer + betaTensor.GetNumElements());
82 std::vector<T> gammaVector (gammaBuffer, gammaBuffer + gammaTensor.GetNumElements());
83 std::vector<T> meanVector (meanBuffer, meanBuffer + meanTensor.GetNumElements());
84 std::vector<T> varianceVector(varBuffer, varBuffer + varTensor.GetNumElements());
87 std::vector<T> fusedWeightsVector(weightsVector.size());
89 for (
unsigned int cInput = 0; cInput < inputChannels; ++cInput)
91 for (
unsigned int cOut = 0; cOut < outputChannels; ++cOut)
93 T mult = gammaVector[cOut] /
static_cast<T
>(sqrtf(varianceVector[cOut] + epsilon));
95 for (
unsigned int h = 0; h < weightsHeight; ++h)
97 for (
unsigned int w = 0; w < weightsWidth; ++w)
99 unsigned int weightsIdx = 0;
103 cInput = cOut / depthMultiplier;
104 weightsIdx = w * outputChannels + cOut +
105 h * weightsWidth * outputChannels;
109 weightsIdx = cOut * weightsHeight * weightsWidth * inputChannels +
110 h * weightsWidth * inputChannels +
116 weightsIdx = cOut * weightsWidth * weightsHeight * inputChannels +
117 cInput * weightsWidth * weightsHeight +
121 fusedWeightsVector[weightsIdx] = mult * weightsVector[weightsIdx];
126 ConstTensor fusedWeightsTensor(weightsTensor.GetInfo(), fusedWeightsVector);
129 std::vector<T> fusedBiasVector(outputChannels);
130 bool biasWasEnabledBeforeOpt = convDescriptor.m_BiasEnabled;
131 if (biasWasEnabledBeforeOpt)
135 "FuseBatchNorm: Bias data should not be null if bias is enabled.");
137 ConstantLayer* biasLayer = PolymorphicDowncast<ConstantLayer*>(
140 biasTensor =
ConstTensor(biasLayer->m_LayerOutput->GetTensorInfo(),
141 biasLayer->m_LayerOutput->Map(
true));
143 const auto* biasBuffer =
static_cast<const T*
>(biasTensor.GetMemoryArea());
144 std::vector<T> biasVector(biasBuffer, biasBuffer + biasTensor.
GetNumElements());
146 for (
unsigned int cOut = 0; cOut < outputChannels; ++cOut)
148 fusedBiasVector[cOut] = ((gammaVector[cOut] * (biasVector[cOut] - meanVector[cOut])) /
149 sqrtf(varianceVector[cOut] + epsilon)) + betaVector[cOut];
154 convDescriptor.m_BiasEnabled =
true;
155 std::vector<T> biasVector(outputChannels, T(0));
157 for (
unsigned int cOut = 0; cOut < outputChannels; ++cOut)
159 fusedBiasVector[cOut] = ((gammaVector[cOut] * (biasVector[cOut] - meanVector[cOut])) /
160 sqrtf(varianceVector[cOut] + epsilon)) + betaVector[cOut];
163 ConstTensor fusedBiasTensor(
TensorInfo({outputChannels}, ArmnnType, 0.0f, 0,
true), fusedBiasVector);
166 const std::string name = std::string(
"fused-") + child.
GetName() + std::string(
"-into-") + base.
GetName();
170 newConv2dLayer.m_Weight = std::make_unique<ScopedTensorHandle>(fusedWeightsTensor);
171 newConv2dLayer.m_Bias = std::make_unique<ScopedTensorHandle>(
ConstTensor(fusedBiasTensor));
175 if (newConv2dLayer.GetNumInputSlots() > 1)
178 weightLayer->GetOutputSlot(0).Disconnect(base.
GetInputSlot(1));
179 weightLayer->GetOutputSlot(0).Connect(newConv2dLayer.GetInputSlot(1));
180 weightLayer->m_LayerOutput = newConv2dLayer.m_Weight;
184 if (biasWasEnabledBeforeOpt)
186 biasLayer = PolymorphicDowncast<ConstantLayer*>(
206 newConv2dLayer.GetOutputSlot().MoveAllConnections(*parentOut);
208 parentOut = &newConv2dLayer.GetOutputSlot();
228 BatchNormalizationLayer,
233 BatchNormalizationLayer,
238 BatchNormalizationLayer,
A layer that the constant data can be bound to.
This layer represents a batch normalization operation.
const TensorShape & GetShape() const
This layer represents a depthwise convolution 2d operation.
std::shared_ptr< ConstTensorHandle > m_LayerOutput
LayerT * AddLayer(Args &&... args)
Adds a new layer, of type LayerType, to the graph constructed with the arguments passed.
Layer & GetOwningLayer() const
int Connect(InputSlot &destination)
float m_Eps
Value to add to the variance. Used to avoid dividing by zero.
typename ResolveTypeImpl< DT >::Type ResolveType
unsigned int GetNumElements() const
Copyright (c) 2021 ARM Limited and Contributors.
void IgnoreUnused(Ts &&...)
void Disconnect(InputSlot &slot)
const InputSlot & GetInputSlot(unsigned int index) const override
Get a const input slot handle by slot index.
#define ARMNN_ASSERT_MSG(COND, MSG)
Provides access to the appropriate indexes for Channels, Height and Width based on DataLayout...
A tensor defined by a TensorInfo (shape and data type) and an immutable backing store.
LayerType GetType() const override
Returns the armnn::LayerType of this layer.
#define ARMNN_ASSERT(COND)
void Run(Graph &graph, InputSlot &connection) const
Run for every exclusive connection between any base Convolution layer and a child BatchNorm layer for...
const TensorInfo & GetInfo() const
void SetTensorInfo(const TensorInfo &tensorInfo) override
DataType GetDataType() const
const OutputSlot & GetOutputSlot(unsigned int index=0) const override
Get the const output slot handle by slot index.
const char * GetName() const override
Returns the name of the layer.
This layer represents a convolution 2d operation.
LayerT * InsertNewLayer(InputSlot &insertBefore, Args &&... args)
Inserts a new layer between the output slot currently connected to insertBefore and insertBefore itself.
const TensorInfo & GetTensorInfo() const override
void MoveAllConnections(OutputSlot &destination)
Moves all connections to another OutputSlot.
A BatchNormalizationDescriptor for the BatchNormalizationLayer.