From b9dcfe63b87f024c6f8c5f4b68447de04119dc19 Mon Sep 17 00:00:00 2001
From: Finn Williams
Date: Thu, 17 Sep 2020 15:58:31 +0100
Subject: IVGCVSW-5325 Speed up the reference backend

Change-Id: Id8bd0a0418be31d975b944b54bbacb25051ffb2e
Signed-off-by: Finn Williams
---
 src/backends/reference/workloads/BaseIterator.hpp  | 248 ++++++++++++++++++++-
 src/backends/reference/workloads/ConvImpl.cpp      |  88 +++++---
 .../reference/workloads/FullyConnected.cpp         |  19 +-
 .../reference/workloads/FullyConnected.hpp         |   1 +
 src/backends/reference/workloads/Pooling2d.cpp     |  79 +++++--
 .../workloads/RefFullyConnectedWorkload.cpp        |   1 +
 .../reference/workloads/TransposeConvolution2d.cpp |  90 +++++---
 7 files changed, 438 insertions(+), 88 deletions(-)

diff --git a/src/backends/reference/workloads/BaseIterator.hpp b/src/backends/reference/workloads/BaseIterator.hpp
index 1f4f2da717..0165ec7c7a 100644
--- a/src/backends/reference/workloads/BaseIterator.hpp
+++ b/src/backends/reference/workloads/BaseIterator.hpp
@@ -44,6 +44,10 @@ public:
     virtual void Reset(void*) = 0;
 
     virtual IType Get() const = 0;
+
+    virtual std::vector<float> DecodeTensor(uint32_t size,
+                                            uint32_t channelStep = 1,
+                                            uint32_t channelMultiplier = 1) = 0;
 };
 
 template<typename IType>
@@ -130,7 +134,24 @@ public:
         return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset);
     }
 
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            this->operator[](i);
+            decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scale, m_Offset));
+        }
+
+        return decodedTensor;
+    }
+
 private:
+
     const float m_Scale;
     const int32_t m_Offset;
 };
@@ -149,9 +170,26 @@ public:
         return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset);
     }
 
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            this->operator[](i);
+            decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scale, m_Offset));
+        }
+
+        return decodedTensor;
+    }
+
 private:
     const float m_Scale;
     const int32_t m_Offset;
+
 };
 
 class QSymmS8Decoder : public TypedIterator<const int8_t, Decoder<float>>
@@ -168,9 +206,26 @@ public:
         return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset);
     }
 
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            this->operator[](i);
+            decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scale, m_Offset));
+        }
+
+        return decodedTensor;
+    }
+
 private:
     const float m_Scale;
     const int32_t m_Offset;
+
 };
 
 class QSymm16Decoder : public TypedIterator<const int16_t, Decoder<float>>
@@ -187,9 +242,28 @@ public:
         return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset);
     }
 
+
+
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            this->operator[](i);
+            decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scale, m_Offset));
+        }
+
+        return decodedTensor;
+    }
+
 private:
     const float m_Scale;
     const int32_t m_Offset;
+
 };
 
 class BFloat16Decoder : public TypedIterator<const BFloat16, Decoder<float>>
@@ -207,6 +281,26 @@ public:
         armnnUtils::FloatingPointConverter::ConvertBFloat16ToFloat32(m_Iterator, 1, &val);
         return val;
     }
+
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            this->operator[](i);
+
+            float val = 0.f;
+            armnnUtils::FloatingPointConverter::ConvertBFloat16ToFloat32(m_Iterator, 1, &val);
+            decodedTensor.emplace_back(val);
+        }
+
+        return decodedTensor;
+    }
+
 };
 
 class Float16Decoder : public TypedIterator<const Half, Decoder<float>>
@@ -224,6 +318,26 @@ public:
         armnnUtils::FloatingPointConverter::ConvertFloat16To32(m_Iterator, 1, &val);
         return val;
     }
+
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            float val = 0.f;
+            this->operator[](i);
+            armnnUtils::FloatingPointConverter::ConvertFloat16To32(m_Iterator, 1, &val);
+            decodedTensor.emplace_back(val);
+        }
+
+        return decodedTensor;
+    }
+
+
 };
 
 class Float32Decoder : public TypedIterator<const float, Decoder<float>>
@@ -239,6 +353,16 @@ public:
     {
         return *m_Iterator;
     }
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+        std::vector<float> decodedTensor;
+
+        decodedTensor.reserve(size);
+        decodedTensor.assign(m_Start, m_Start + size);
+
+        return decodedTensor;
+    }
 };
 
 class ScaledInt32Decoder : public TypedIterator<const int32_t, Decoder<float>>
@@ -255,8 +379,25 @@ public:
         return static_cast<float>(*m_Iterator) * m_Scale;
     }
 
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            this->operator[](i);
+            decodedTensor.emplace_back(static_cast<float>(*m_Iterator) * m_Scale);
+        }
+
+        return decodedTensor;
+    }
+
 private:
     const float m_Scale;
+
 };
 
 class Int32Decoder : public TypedIterator<const int32_t, Decoder<float>>
@@ -272,6 +413,22 @@ public:
     {
         return static_cast<float>(*m_Iterator);
     }
+
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            this->operator[](i);
+            decodedTensor.emplace_back(static_cast<float>(*m_Iterator));
+        }
+
+        return decodedTensor;
+    }
 };
 
 class Int32ToInt32tDecoder : public TypedIterator<const int32_t, Decoder<int32_t>>
@@ -287,6 +444,22 @@ public:
     {
         return *m_Iterator;
     }
+
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            this->operator[](i);
+            decodedTensor.emplace_back(*m_Iterator);
+        }
+
+        return decodedTensor;
+    }
 };
 
 class BooleanDecoder : public TypedIterator<const uint8_t, Decoder<float>>
@@ -303,6 +476,21 @@ public:
         return *m_Iterator;
     }
 
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            this->operator[](i);
+            decodedTensor.emplace_back(*m_Iterator);
+        }
+
+        return decodedTensor;
+    }
 };
 
 class QASymm8Encoder : public TypedIterator<uint8_t, Encoder<float>>
@@ -530,7 +718,7 @@ template<typename T, typename Base>
 class PerAxisIterator : public Base
 {
 public:
-    // axisFactor is used to calculate axisIndex
+    // axisFactor is used to calculate channelStep
     PerAxisIterator(T* data = nullptr, unsigned int axisFactor = 0)
         : m_Iterator(data), m_Start(data), m_AxisIndex(0), m_AxisFactor(axisFactor)
     {}
@@ -607,6 +795,35 @@ public:
         return m_Scale[m_AxisIndex];
     }
 
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        uint32_t channels = static_cast<uint32_t>(m_Scale.size());
+        uint32_t channelSteps = size / (channelStepSize * channelMultiplier);
+        uint32_t scale;
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        // channelMultiplier is only used in depthwise convolutions and in other cases will cancel out
+        // channelStepSize is the length of a contiguous section of a channel within a tensor
+        // channelSteps is the number of those steps/blocks in the tensor
+        for (uint32_t mult = 0; mult < channelMultiplier; ++mult)
+        {
+            for (uint32_t channelStep = 0; channelStep < channelSteps; ++channelStep)
+            {
+                scale = (channelMultiplier * channelStep + mult) % channels;
+                for (uint32_t i = 0; i < channelStepSize; ++i)
+                {
+                    unsigned int index = mult * channelStepSize * channelMultiplier +
+                                         channelStep * channelStepSize + i;
+                    this->operator[](index);
+                    decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scale[scale], 0));
+                }
+            }
+        }
+        return decodedTensor;
+    }
+
 private:
     std::vector<float> m_Scale;
 };
@@ -654,6 +871,35 @@ public:
         return m_Scales[m_AxisIndex];
     }
 
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        uint32_t channels = static_cast<uint32_t>(m_Scales.size());
+        uint32_t channelSteps = size / (channelStepSize * channelMultiplier);
+        uint32_t scale;
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        // channelMultiplier is only used in depthwise convolutions and in other cases will cancel out
+        // channelStepSize is the length of a contiguous section of a channel within a tensor
+        // channelSteps is the number of those steps/blocks in the tensor
+        for (uint32_t mult = 0; mult < channelMultiplier; ++mult)
+        {
+            for (uint32_t channelStep = 0; channelStep < channelSteps; ++channelStep)
+            {
+                scale = (channelMultiplier * channelStep + mult) % channels;
+                for (uint32_t i = 0; i < channelStepSize; ++i)
+                {
+                    unsigned int index = mult * channelStepSize * channelMultiplier +
+                                         channelStep * channelStepSize + i;
+                    this->operator[](index);
+                    decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scales[scale], 0));
+                }
+            }
+        }
+        return decodedTensor;
+    }
+
 private:
     std::vector<float> m_Scales;
 };
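Every concrete decoder above now overrides DecodeTensor, so a workload pays one virtual call per tensor instead of two per element (operator[] followed by Get()). A minimal standalone sketch of that idea, with hypothetical names, assuming the usual affine quantization scheme real = scale * (quantized - offset), which is the formula armnn::Dequantize applies:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Bulk dequantization: pay the dispatch/setup cost once, then let the
    // hot loops read plain floats out of a contiguous buffer.
    std::vector<float> DequantizeAll(const uint8_t* data, size_t size,
                                     float scale, int32_t offset)
    {
        std::vector<float> decoded;
        decoded.reserve(size);
        for (size_t i = 0; i < size; ++i)
        {
            decoded.push_back(scale * (static_cast<int32_t>(data[i]) - offset));
        }
        return decoded;
    }

The per-axis overloads at the end of the file do the same, except the scale index is derived from the element's position: channelStepSize consecutive elements share one scale, and channelMultiplier accounts for the [M, C, H, W] depthwise filter layout.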
diff --git a/src/backends/reference/workloads/ConvImpl.cpp b/src/backends/reference/workloads/ConvImpl.cpp
index 7e8b8fffb6..f11c351c61 100644
--- a/src/backends/reference/workloads/ConvImpl.cpp
+++ b/src/backends/reference/workloads/ConvImpl.cpp
@@ -95,19 +95,45 @@ void Convolve(const TensorShape& rInputShape,
     const unsigned int heightIndex   = dataLayoutIndexed.GetHeightIndex();
     const unsigned int widthIndex    = dataLayoutIndexed.GetWidthIndex();
 
-    unsigned int depthMultiplier = depthwise ? rFilterShape[0] : 1;
-    unsigned int inputChannels   = depthwise ? rFilterShape[1] : rFilterShape[channelsIndex];
-    unsigned int outputChannels  = depthwise ? inputChannels * depthMultiplier : rFilterShape[0];
+    const unsigned int depthMultiplier = depthwise ? rFilterShape[0] : 1;
+    const unsigned int inputChannels   = depthwise ? rFilterShape[1] : rFilterShape[channelsIndex];
+    const unsigned int outputChannels  = depthwise ? inputChannels * depthMultiplier : rFilterShape[0];
 
-    unsigned int batchSize    = rOutputShape[0];
-    unsigned int outputHeight = rOutputShape[heightIndex];
-    unsigned int outputWidth  = rOutputShape[widthIndex];
-    unsigned int inputHeight  = rInputShape[heightIndex];
-    unsigned int inputWidth   = rInputShape[widthIndex];
+    const unsigned int batchSize    = rOutputShape[0];
+    const unsigned int outputHeight = rOutputShape[heightIndex];
+    const unsigned int outputWidth  = rOutputShape[widthIndex];
+    const unsigned int inputHeight  = rInputShape[heightIndex];
+    const unsigned int inputWidth   = rInputShape[widthIndex];
 
-    unsigned int filterHeight = depthwise ? rFilterShape[2] : rFilterShape[heightIndex];
-    unsigned int filterWidth  = depthwise ? rFilterShape[3] : rFilterShape[widthIndex];
+    const unsigned int filterHeight = depthwise ? rFilterShape[2] : rFilterShape[heightIndex];
+    const unsigned int filterWidth  = depthwise ? rFilterShape[3] : rFilterShape[widthIndex];
 
+    const std::vector<float> inputVec = rInputDecoder.DecodeTensor(rInputShape.GetNumElements());
+
+    uint32_t channelStepSize;
+    if (depthwise)
+    {
+        channelStepSize = filterHeight * filterWidth;
+    }
+    else
+    {
+        if (dataLayoutIndexed.GetDataLayout() == DataLayout::NHWC)
+        {
+            channelStepSize = rFilterShape[3];
+        }
+        else
+        {
+            channelStepSize = rFilterShape[1] * rFilterShape[2] * rFilterShape[3];
+        }
+    }
+
+    const std::vector<float> filterVec = rFilterDecoder.DecodeTensor(rFilterShape.GetNumElements(),
+                                                                     channelStepSize,
+                                                                     depthMultiplier);
+    const std::vector<float> biasVec = biasEnabled ?
+        pBiasDecoder->DecodeTensor(outputChannels) : std::vector<float>();
+
+    unsigned int depthwiseMultiplierIdx = 0;
     for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
     {
         for (unsigned int cOutput = 0; cOutput < outputChannels; cOutput++)
@@ -117,15 +143,15 @@ void Convolve(const TensorShape& rInputShape,
                 for (unsigned int xOutput = 0; xOutput < outputWidth; xOutput++)
                 {
                     // This loop goes over each output element.
-                    float sum =  0.0f;
+                    float sum = 0.0f;
 
                     // For depthwise, each output channel corresponds to exactly one input channel.
                     // For normal, must loop over each input channel.
                     for (unsigned int cInput = 0; cInput < (depthwise ? 1 : inputChannels); cInput++)
                     {
-                        unsigned int depthwiseMultiplierIdx = 0;
                         if (depthwise)
                         {
+                            depthwiseMultiplierIdx = 0;
                             cInput = cOutput / depthMultiplier;
                             depthwiseMultiplierIdx = cOutput % depthMultiplier;
                         }
@@ -149,7 +175,7 @@ void Convolve(const TensorShape& rInputShape,
                                 {
                                     // Keep this implementation, as using DataLayoutIndexed::GetIndex causes great
                                     // performance regression.
-                                    if (dataLayout == DataLayout::NHWC)
+                                    if (dataLayoutIndexed.GetDataLayout() == DataLayout::NHWC)
                                     {
                                         filterIndex = cOutput * filterHeight * filterWidth * inputChannels +
                                                       yFilter * filterWidth * inputChannels +
@@ -159,15 +185,12 @@ void Convolve(const TensorShape& rInputShape,
                                     else
                                     {
                                         filterIndex = cOutput * filterWidth * filterHeight * inputChannels +
-                                                      cInput * filterWidth * filterHeight  +
+                                                      cInput * filterWidth * filterHeight +
                                                       yFilter * filterWidth +
                                                       xFilter;
                                     }
                                 }
 
-                                rFilterDecoder.SetIndex(filterIndex, cOutput);
-                                float filterValue = rFilterDecoder.Get();
-
                                 unsigned int yInput = yOutput * yStride + yFilter * yDilation;
                                 unsigned int xInput = xOutput * xStride + xFilter * xDilation;
 
@@ -175,7 +198,7 @@ void Convolve(const TensorShape& rInputShape,
 
                                 // Check if we're in the padding.
                                 if (yInput < paddingTop || yInput >= inputHeight + paddingTop ||
-                                    xInput < paddingLeft || xInput >= inputWidth + paddingLeft )
+                                    xInput < paddingLeft || xInput >= inputWidth + paddingLeft)
                                 {
                                     inputValue = 0.0f;
                                 }
@@ -185,9 +208,9 @@ void Convolve(const TensorShape& rInputShape,
 
                                     // Keep this implementation, as using DataLayoutIndexed::GetIndex causes great
                                     // performance regression.
-                                    if (dataLayout == DataLayout::NHWC)
+                                    if (dataLayoutIndexed.GetDataLayout() == DataLayout::NHWC)
                                     {
-                                        inputIndex = batchIdx * inputHeight * inputWidth * inputChannels +
+                                        inputIndex = batchIdx * inputHeight * inputWidth * inputChannels +
                                                      (yInput - paddingTop) * inputWidth * inputChannels +
                                                      (xInput - paddingLeft) * inputChannels +
                                                      cInput;
@@ -199,23 +222,34 @@ void Convolve(const TensorShape& rInputShape,
                                                      inputWidth * (yInput - paddingTop) +
                                                      xInput - paddingLeft;
                                     }
-
-                                    rInputDecoder[inputIndex];
-                                    inputValue = rInputDecoder.Get();
+                                    inputValue = inputVec[inputIndex];
                                 }
 
-                                sum += filterValue * inputValue;
+                                sum += filterVec[filterIndex] * inputValue;
                             }
                         }
                     }
 
                     if (biasEnabled)
                     {
-                        (*pBiasDecoder).SetIndex(cOutput, cOutput);
-                        sum += pBiasDecoder->Get();
+                        sum += biasVec[cOutput];
                     }
 
-                    unsigned int outIdx = dataLayoutIndexed.GetIndex(rOutputShape, batchIdx, cOutput, yOutput, xOutput);
+                    unsigned int outIdx;
+                    if (dataLayoutIndexed.GetDataLayout() == DataLayout::NHWC)
+                    {
+                        outIdx = batchIdx * outputHeight * outputWidth * outputChannels +
+                                 yOutput * outputWidth * outputChannels +
+                                 xOutput * outputChannels +
+                                 cOutput;
+                    }
+                    else
+                    {
+                        outIdx = batchIdx * outputHeight * outputWidth * outputChannels +
+                                 cOutput * outputHeight * outputWidth +
+                                 yOutput * outputWidth +
+                                 xOutput;
+                    }
 
                     rOutputEncoder[outIdx];
                     rOutputEncoder.Set(sum);
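Convolve now decodes the input, filter, and bias tensors once, before the seven nested loops, and the inner loop indexes the resulting std::vector<float> buffers directly; per the comments in the hunk, the flat offsets stay inlined because DataLayoutIndexed::GetIndex was a measured regression. The two layouts reduce to the following arithmetic (illustrative helper names, not part of the patch):

    // Flat offset of element (n, h, w, c) in a dense NHWC tensor.
    inline unsigned int NhwcIndex(unsigned int n, unsigned int h, unsigned int w, unsigned int c,
                                  unsigned int height, unsigned int width, unsigned int channels)
    {
        return n * height * width * channels + h * width * channels + w * channels + c;
    }

    // Flat offset of element (n, c, h, w) in a dense NCHW tensor.
    inline unsigned int NchwIndex(unsigned int n, unsigned int c, unsigned int h, unsigned int w,
                                  unsigned int height, unsigned int width, unsigned int channels)
    {
        return n * channels * height * width + c * height * width + h * width + w;
    }

For example, NhwcIndex(batchIdx, yOutput, xOutput, cOutput, outputHeight, outputWidth, outputChannels) is exactly the NHWC branch of the outIdx computation above.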
diff --git a/src/backends/reference/workloads/FullyConnected.cpp b/src/backends/reference/workloads/FullyConnected.cpp
index 8016c1b628..61c8e88bce 100644
--- a/src/backends/reference/workloads/FullyConnected.cpp
+++ b/src/backends/reference/workloads/FullyConnected.cpp
@@ -14,6 +14,7 @@ void FullyConnected(const TensorShape& rInputShape,
                     Decoder<float>& rInputDecoder,
                     const TensorShape& rOutputShape,
                     Encoder<float>& rOutputEncoder,
+                    const TensorShape& rWeightsShape,
                     Decoder<float>& rWeightDecoder,
                     Decoder<float>& rBiasDecoder,
                     const bool biasEnabled,
@@ -23,6 +24,12 @@ void FullyConnected(const TensorShape& rInputShape,
     // Perform FullyConnected implementation
     unsigned int outputSize = rOutputShape[1];
 
+    const std::vector<float> decodedInputs  = rInputDecoder.DecodeTensor(rInputShape.GetNumElements());
+    const std::vector<float> decodedWeights = rWeightDecoder.DecodeTensor(rWeightsShape.GetNumElements());
+    const std::vector<float> decodedBiases  = biasEnabled ?
+        rBiasDecoder.DecodeTensor(outputSize) : std::vector<float>();
+
+
     for (unsigned int n = 0; n < rInputShape[0]; n++)
     {
         for (unsigned int channelOutput = 0; channelOutput < outputSize; channelOutput++)
@@ -34,23 +41,19 @@ void FullyConnected(const TensorShape& rInputShape,
                 float weight;
                 if (transposeWeights)
                 {
-                    rWeightDecoder[channelOutput * K + channelInput];
-                    weight = rWeightDecoder.Get();
+                    weight = decodedWeights[channelOutput * K + channelInput];
                 }
                 else
                 {
-                    rWeightDecoder[channelInput * outputSize + channelOutput];
-                    weight = rWeightDecoder.Get();
+                    weight = decodedWeights[channelInput * outputSize + channelOutput];
                 }
 
-                rInputDecoder[n * K + channelInput];
-                outval += weight * rInputDecoder.Get();
+                outval += weight * decodedInputs[n * K + channelInput];
             }
 
             if (biasEnabled)
             {
-                rBiasDecoder[channelOutput];
-                outval += rBiasDecoder.Get();
+                outval += decodedBiases[channelOutput];
             }
 
             rOutputEncoder[n * outputSize + channelOutput];
diff --git a/src/backends/reference/workloads/FullyConnected.hpp b/src/backends/reference/workloads/FullyConnected.hpp
index 6f9559db24..e33060631b 100644
--- a/src/backends/reference/workloads/FullyConnected.hpp
+++ b/src/backends/reference/workloads/FullyConnected.hpp
@@ -19,6 +19,7 @@ void FullyConnected(const TensorShape& rInputShape,
                     Decoder<float>& rInputDecoder,
                     const TensorShape& rOutputShape,
                     Encoder<float>& rOutputEncoder,
+                    const TensorShape& rWeightsShape,
                     Decoder<float>& rWeightDecoder,
                     Decoder<float>& rBiasDecoder,
                     bool biasEnabled,
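FullyConnected gains an rWeightsShape parameter solely so it can size the one-shot DecodeTensor call; the inner product itself is unchanged, just redirected at pre-decoded buffers. A reduced sketch of that inner loop over plain vectors, assuming weights are laid out [outputSize x K] when transposeWeights is true and [K x outputSize] otherwise (matching the index math above):

    #include <vector>

    // One output neuron's dot product over a pre-decoded input row.
    float DotRow(const std::vector<float>& input,    // length K, batch row n already offset away
                 const std::vector<float>& weights,
                 unsigned int channelOutput,
                 unsigned int K,
                 unsigned int outputSize,
                 bool transposeWeights)
    {
        float outval = 0.0f;
        for (unsigned int channelInput = 0; channelInput < K; channelInput++)
        {
            const float weight = transposeWeights
                               ? weights[channelOutput * K + channelInput]
                               : weights[channelInput * outputSize + channelOutput];
            outval += weight * input[channelInput];
        }
        return outval;
    }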
    if (params.m_PaddingMethod != PaddingMethod::Exclude &&
@@ -183,6 +180,8 @@ void Pooling2d(Decoder<float>& rInputDecoder,
         throw armnn::InvalidArgumentException("Unsupported padding type");
     }
 
+    const std::vector<float> decodedInputVec = rInputDecoder.DecodeTensor(inputInfo.GetNumElements());
+
     for (int n = 0; n < batchSize; n++)
     {
         for (int c = 0; c < channels; c++)
@@ -221,12 +220,24 @@ void Pooling2d(Decoder<float>& rInputDecoder,
                     {
                         result = 0.0f;
 
-                        unsigned int outputIndex = dataLayout.GetIndex(outputShape,
-                                                                       armnn::numeric_cast<unsigned int>(n),
-                                                                       armnn::numeric_cast<unsigned int>(c),
-                                                                       armnn::numeric_cast<unsigned int>(yOutput),
-                                                                       armnn::numeric_cast<unsigned int>(xOutput));
-                        rOutputEncoder[outputIndex];
+                        int outputIndex;
+
+                        if(dataLayout.GetDataLayout() == DataLayout::NHWC)
+                        {
+                            outputIndex = n * heightOutput * widthOutput * channels +
+                                          yOutput * widthOutput * channels +
+                                          xOutput * channels +
+                                          c;
+                        }
+                        else
+                        {
+                            outputIndex = n * heightOutput * widthOutput * channels +
+                                          c * heightOutput * widthOutput +
+                                          yOutput * widthOutput +
+                                          xOutput;
+                        }
+
+                        rOutputEncoder[static_cast<unsigned int>(outputIndex)];
                         rOutputEncoder.Set(result);
                         continue;
                     }
@@ -244,28 +255,48 @@ void Pooling2d(Decoder<float>& rInputDecoder,
                     {
                         for (auto xInput = wstart; xInput < wend; xInput++)
                         {
-                            unsigned int inputIndex = dataLayout.GetIndex(inputShape,
-                                                                          armnn::numeric_cast<unsigned int>(n),
-                                                                          armnn::numeric_cast<unsigned int>(c),
-                                                                          armnn::numeric_cast<unsigned int>(yInput),
-                                                                          armnn::numeric_cast<unsigned int>(xInput));
-
-                            rInputDecoder[inputIndex];
-                            float inval = rInputDecoder.Get();
-
-                            accumulate(result, inval);
+                            int inputIndex;
+                            if(dataLayout.GetDataLayout() == DataLayout::NHWC)
+                            {
+                                inputIndex = n * heightInput * widthInput * channels +
+                                             yInput * widthInput * channels +
+                                             xInput * channels +
+                                             c;
+
+                            }
+                            else
+                            {
+                                inputIndex = n * heightInput * widthInput * channels +
+                                             c * heightInput * widthInput +
+                                             yInput * widthInput +
+                                             xInput;
+                            }
+
+                            accumulate(result, decodedInputVec[static_cast<unsigned int>(inputIndex)]);
                         }
                     }
 
                     execute(result, poolAreaSize);
 
-                    unsigned int outputIndex = dataLayout.GetIndex(outputShape,
-                                                                   armnn::numeric_cast<unsigned int>(n),
-                                                                   armnn::numeric_cast<unsigned int>(c),
-                                                                   armnn::numeric_cast<unsigned int>(yOutput),
-                                                                   armnn::numeric_cast<unsigned int>(xOutput));
+                    int outputIndex;
+
+                    if(dataLayout.GetDataLayout() == DataLayout::NHWC)
+                    {
+                        outputIndex = n * heightOutput * widthOutput * channels +
+                                      yOutput * widthOutput * channels +
+                                      xOutput * channels +
+                                      c;
+                    }
+                    else
+                    {
+                        outputIndex = n * heightOutput * widthOutput * channels +
+                                      c * heightOutput * widthOutput +
+                                      yOutput * widthOutput +
+                                      xOutput;
+                    }
 
-                    rOutputEncoder[outputIndex];
+                    rOutputEncoder[static_cast<unsigned int>(outputIndex)];
                     rOutputEncoder.Set(result);
                 }
             }
diff --git a/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp b/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
index 32c65d3ebd..9acca219b5 100644
--- a/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
+++ b/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
@@ -58,6 +58,7 @@ void RefFullyConnectedWorkload::Execute() const
                    *m_InputDecoder,
                    m_OutputShape,
                    *m_OutputEncoder,
+                   m_WeightShape,
                    *m_WeightDecoder,
                    *m_BiasDecoder,
                    m_Data.m_Parameters.m_BiasEnabled,
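Pooling2d follows the same recipe: decode the whole input up front and inline the NHWC/NCHW offset arithmetic. The indices are kept as int because the window bounds are computed with signed arithmetic that can go negative before clamping; the cast back to unsigned happens only at the final, in-range lookup. The accumulate/execute pair it calls is chosen once per pool type; a sketch of that functor pattern with hypothetical lambdas (GetAccumulator and GetExecutor are the file's factories, whose bodies are not shown in this patch):

    #include <algorithm>
    #include <functional>

    using Accumulator = std::function<void(float&, float)>;
    using Executor    = std::function<void(float&, float)>;

    // Max pooling: keep the running maximum; nothing to do at the end.
    const Accumulator maxAccumulate = [](float& acc, float value) { acc = std::max(acc, value); };
    const Executor    maxExecute    = [](float&, float) {};

    // Average pooling: sum, then divide by the pool area once per window.
    const Accumulator avgAccumulate = [](float& acc, float value) { acc += value; };
    const Executor    avgExecute    = [](float& acc, float poolArea) { acc /= poolArea; };

The one-line RefFullyConnectedWorkload change simply threads the stored m_WeightShape through to the new FullyConnected signature.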
diff --git a/src/backends/reference/workloads/TransposeConvolution2d.cpp b/src/backends/reference/workloads/TransposeConvolution2d.cpp
index 5698014181..c34a309806 100644
--- a/src/backends/reference/workloads/TransposeConvolution2d.cpp
+++ b/src/backends/reference/workloads/TransposeConvolution2d.cpp
@@ -30,27 +30,35 @@ void TransposeConvolution2dImpl(const TransposeConvolution2dDescriptor& descriptor,
     const unsigned int heightIndex   = dataLayoutIndexed.GetHeightIndex();
     const unsigned int widthIndex    = dataLayoutIndexed.GetWidthIndex();
 
-    unsigned int numBatches = inputShape[0];
+    const unsigned int numBatches = inputShape[0];
 
-    unsigned int inputWidth  = inputShape[widthIndex];
-    unsigned int inputHeight = inputShape[heightIndex];
-    unsigned int inputDepth  = inputShape[channelsIndex];
+    const unsigned int inputWidth  = inputShape[widthIndex];
+    const unsigned int inputHeight = inputShape[heightIndex];
+    const unsigned int inputDepth  = inputShape[channelsIndex];
 
-    unsigned int weightsHeight = weightsShape[heightIndex];
-    unsigned int weightsWidth  = weightsShape[widthIndex];
+    const unsigned int weightsHeight = weightsShape[heightIndex];
+    const unsigned int weightsWidth  = weightsShape[widthIndex];
+    const unsigned int weightsDepth  = weightsShape[channelsIndex];
 
-    unsigned int outputHeight = outputShape[heightIndex];
-    unsigned int outputWidth  = outputShape[widthIndex];
-    unsigned int outputDepth  = outputShape[channelsIndex];
+    const unsigned int outputHeight = outputShape[heightIndex];
+    const unsigned int outputWidth  = outputShape[widthIndex];
+    const unsigned int outputDepth  = outputShape[channelsIndex];
 
-    unsigned int paddingLeft = descriptor.m_PadLeft;
-    unsigned int paddingTop  = descriptor.m_PadTop;
+    const unsigned int paddingLeft = descriptor.m_PadLeft;
+    const unsigned int paddingTop  = descriptor.m_PadTop;
 
-    unsigned int strideX = descriptor.m_StrideX;
-    unsigned int strideY = descriptor.m_StrideY;
+    const unsigned int strideX = descriptor.m_StrideX;
+    const unsigned int strideY = descriptor.m_StrideY;
 
     std::vector<float> outputBuffer(outputShape.GetNumElements(), 0);
 
+    const std::vector<float> inputVec = inputDecoder.DecodeTensor(inputShape.GetNumElements());
+
+    const unsigned channelStep = weightsWidth * weightsHeight * weightsDepth;
+
+    const std::vector<float> filterVec =
+        weightsDecoder.DecodeTensor(weightsShape.GetNumElements(), channelStep);
+
     for (unsigned int batch = 0u; batch < numBatches; ++batch)
     {
         for (unsigned int yInput = 0u; yInput < inputHeight; ++yInput)
@@ -73,25 +81,51 @@ void TransposeConvolution2dImpl(const TransposeConvolution2dDescriptor& descriptor,
                         {
                             for (unsigned int dInput = 0u; dInput < inputDepth; dInput++)
                             {
-                                const unsigned int inputIndex =
-                                    dataLayoutIndexed.GetIndex(inputShape, batch, dInput, yInput, xInput);
-                                inputDecoder[inputIndex];
-
-                                const unsigned int weightsIndex =
-                                    dataLayoutIndexed.GetIndex(weightsShape, dOutput, dInput, yWeights, xWeights);
-                                weightsDecoder.SetIndex(weightsIndex, dOutput);
-
-                                const unsigned int outputIndex =
-                                    dataLayoutIndexed.GetIndex(outputShape, batch, dOutput, yOutput, xOutput);
-                                outputEncoder[outputIndex];
-
-                                float output  = outputBuffer[outputIndex];
-                                output       += inputDecoder.Get() * weightsDecoder.Get();
-                                outputBuffer[outputIndex] = output;
+                                unsigned int inputIndex;
+                                unsigned int outputIndex;
+                                unsigned int weightsIndex;
+
+                                if(descriptor.m_DataLayout == armnn::DataLayout::NHWC)
+                                {
+                                    inputIndex = batch * inputHeight * inputWidth * inputDepth +
+                                                 yInput * inputWidth * inputDepth +
+                                                 xInput * inputDepth +
+                                                 dInput;
+
+                                    weightsIndex = dOutput * weightsHeight * weightsWidth * weightsDepth +
+                                                   yWeights * weightsWidth * weightsDepth +
+                                                   xWeights * weightsDepth +
+                                                   dInput;
+
+                                    outputIndex = batch * outputHeight * outputWidth * outputDepth +
+                                                  yOutput * outputWidth * outputDepth +
+                                                  xOutput * outputDepth +
+                                                  dOutput;
+                                }
+                                else
+                                {
+                                    inputIndex = batch * inputDepth * inputHeight * inputWidth +
+                                                 dInput * inputHeight * inputWidth +
+                                                 yInput * inputWidth +
+                                                 xInput;
+
+                                    weightsIndex = dOutput * weightsDepth * weightsHeight * weightsWidth +
+                                                   dInput * weightsHeight * weightsWidth +
+                                                   yWeights * weightsWidth +
+                                                   xWeights;
+
+                                    outputIndex = batch * outputDepth * outputHeight * outputWidth +
+                                                  dOutput * outputHeight * outputWidth +
+                                                  yOutput * outputWidth +
+                                                  xOutput;
+                                }
+
+                                outputBuffer[outputIndex] += inputVec[inputIndex] * filterVec[weightsIndex];
                             }
                         }
                     }
                 }
+
             }
         }
     }
-- 
cgit v1.2.1
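Taken together, the pattern throughout this patch is: hoist every Decoder<float> read out of the element loops into a single DecodeTensor call, and keep only the Encoder writes per element. The refactor's correctness rests on bulk decode being element-for-element identical to the old operator[]/Get() path, which is easy to spot-check in isolation; a self-contained sanity check under that assumption, with made-up values and a local Dequantize mirroring the affine formula:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    float Dequantize(uint8_t value, float scale, int32_t offset)
    {
        return scale * (static_cast<int32_t>(value) - offset);
    }

    int main()
    {
        const float scale    = 0.5f;
        const int32_t offset = 128;
        const std::vector<uint8_t> quantized = { 0, 64, 128, 192, 255 };

        // DecodeTensor-style pass: decode everything once.
        std::vector<float> bulk;
        bulk.reserve(quantized.size());
        for (uint8_t q : quantized)
        {
            bulk.push_back(Dequantize(q, scale, offset));
        }

        // Get()-style pass: decode one element at a time and compare.
        for (size_t i = 0; i < quantized.size(); ++i)
        {
            assert(bulk[i] == Dequantize(quantized[i], scale, offset));
        }
        return 0;
    }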