From b9dcfe63b87f024c6f8c5f4b68447de04119dc19 Mon Sep 17 00:00:00 2001
From: Finn Williams
Date: Thu, 17 Sep 2020 15:58:31 +0100
Subject: IVGCVSW-5325 Speed up the reference backend

Change-Id: Id8bd0a0418be31d975b944b54bbacb25051ffb2e
Signed-off-by: Finn Williams
---
 src/backends/reference/workloads/BaseIterator.hpp  | 248 ++++++++++++++++++++-
 src/backends/reference/workloads/ConvImpl.cpp      |  88 +++++---
 .../reference/workloads/FullyConnected.cpp         |  19 +-
 .../reference/workloads/FullyConnected.hpp         |   1 +
 src/backends/reference/workloads/Pooling2d.cpp     |  79 +++++--
 .../workloads/RefFullyConnectedWorkload.cpp        |   1 +
 .../reference/workloads/TransposeConvolution2d.cpp |  90 +++++---
 7 files changed, 438 insertions(+), 88 deletions(-)

diff --git a/src/backends/reference/workloads/BaseIterator.hpp b/src/backends/reference/workloads/BaseIterator.hpp
index 1f4f2da717..0165ec7c7a 100644
--- a/src/backends/reference/workloads/BaseIterator.hpp
+++ b/src/backends/reference/workloads/BaseIterator.hpp
@@ -44,6 +44,10 @@ public:
     virtual void Reset(void*) = 0;
 
     virtual IType Get() const = 0;
+
+    virtual std::vector<float> DecodeTensor(uint32_t size,
+                                            uint32_t channelStep = 1,
+                                            uint32_t channelMultiplier = 1) = 0;
 };
 
 template<typename IType>
@@ -130,7 +134,24 @@ public:
         return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset);
     }
 
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            this->operator[](i);
+            decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scale, m_Offset));
+        }
+
+        return decodedTensor;
+    }
+
 private:
+
     const float m_Scale;
     const int32_t m_Offset;
 };
@@ -149,9 +170,26 @@ public:
         return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset);
     }
 
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            this->operator[](i);
+            decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scale, m_Offset));
+        }
+
+        return decodedTensor;
+    }
+
 private:
     const float m_Scale;
     const int32_t m_Offset;
+
 };
 
 class QSymmS8Decoder : public TypedIterator<const int8_t, Decoder<float>>
@@ -168,9 +206,26 @@ public:
         return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset);
     }
 
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            this->operator[](i);
+            decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scale, m_Offset));
+        }
+
+        return decodedTensor;
+    }
+
 private:
     const float m_Scale;
     const int32_t m_Offset;
+
 };
 
 class QSymm16Decoder : public TypedIterator<const int16_t, Decoder<float>>
@@ -187,9 +242,28 @@ public:
         return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset);
     }
 
+
+
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            this->operator[](i);
+            decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scale, m_Offset));
+        }
+
+        return decodedTensor;
+    }
+
 private:
     const float m_Scale;
     const int32_t m_Offset;
+
 };
 
 class BFloat16Decoder : public TypedIterator<const BFloat16, Decoder<float>>
@@ -207,6 +281,26 @@ public:
         armnnUtils::FloatingPointConverter::ConvertBFloat16ToFloat32(m_Iterator, 1, &val);
         return val;
     }
+
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            this->operator[](i);
+
+            float val = 0.f;
+            armnnUtils::FloatingPointConverter::ConvertBFloat16ToFloat32(m_Iterator, 1, &val);
+            decodedTensor.emplace_back(val);
+        }
+
+        return decodedTensor;
+    }
+
 };
 
 class Float16Decoder : public TypedIterator<const Half, Decoder<float>>
@@ -224,6 +318,26 @@ public:
         armnnUtils::FloatingPointConverter::ConvertFloat16To32(m_Iterator, 1, &val);
         return val;
     }
+
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            float val = 0.f;
+            this->operator[](i);
+            armnnUtils::FloatingPointConverter::ConvertFloat16To32(m_Iterator, 1, &val);
+            decodedTensor.emplace_back(val);
+        }
+
+        return decodedTensor;
+    }
+
+
 };
 
 class Float32Decoder : public TypedIterator<const float, Decoder<float>>
@@ -239,6 +353,16 @@ public:
     {
         return *m_Iterator;
     }
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+        std::vector<float> decodedTensor;
+
+        decodedTensor.reserve(size);
+        decodedTensor.assign(m_Start, m_Start + size);
+
+        return decodedTensor;
+    }
 };
 
 class ScaledInt32Decoder : public TypedIterator<const int32_t, Decoder<float>>
@@ -255,8 +379,25 @@ public:
         return static_cast<float>(*m_Iterator) * m_Scale;
     }
 
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            this->operator[](i);
+            decodedTensor.emplace_back(static_cast<float>(*m_Iterator) * m_Scale);
+        }
+
+        return decodedTensor;
+    }
+
 private:
     const float m_Scale;
+
 };
 
 class Int32Decoder : public TypedIterator<const int32_t, Decoder<float>>
@@ -272,6 +413,22 @@ public:
     {
         return static_cast<float>(*m_Iterator);
     }
+
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            this->operator[](i);
+            decodedTensor.emplace_back(static_cast<float>(*m_Iterator));
+        }
+
+        return decodedTensor;
+    }
 };
 
 class Int32ToInt32tDecoder : public TypedIterator<const int32_t, Decoder<int32_t>>
@@ -287,6 +444,22 @@ public:
     {
         return *m_Iterator;
     }
+
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            this->operator[](i);
+            decodedTensor.emplace_back(*m_Iterator);
+        }
+
+        return decodedTensor;
+    }
 };
 
 class BooleanDecoder : public TypedIterator<const uint8_t, Decoder<float>>
@@ -303,6 +476,21 @@ public:
         return *m_Iterator;
     }
 
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        IgnoreUnused(channelStepSize, channelMultiplier);
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        for (uint32_t i = 0; i < size; ++i)
+        {
+            this->operator[](i);
+            decodedTensor.emplace_back(*m_Iterator);
+        }
+
+        return decodedTensor;
+    }
 };
 
 class QASymm8Encoder : public TypedIterator<uint8_t, Encoder<float>>
@@ -530,7 +718,7 @@ template<typename T, typename Base>
 class PerAxisIterator : public Base
 {
 public:
-    // axisFactor is used to calculate axisIndex
+    // axisFactor is used to calculate channelStep
     PerAxisIterator(T* data = nullptr, unsigned int axisFactor = 0)
         : m_Iterator(data), m_Start(data), m_AxisIndex(0), m_AxisFactor(axisFactor)
     {}
@@ -607,6 +795,35 @@ public:
         return m_Scale[m_AxisIndex];
     }
 
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        uint32_t channels = static_cast<uint32_t>(m_Scale.size());
+        uint32_t channelSteps = size / (channelStepSize * channelMultiplier);
+        uint32_t scale;
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        // channelMultiplier is only used in depthwise convolutions and in other cases will cancel out
+        // channelStepSize is the length of a contiguous section of a channel within a tensor
+        // channelSteps is the number of those steps/blocks in the tensor
+        for (uint32_t mult = 0; mult < channelMultiplier; ++mult)
+        {
+            for (uint32_t channelStep = 0; channelStep < channelSteps; ++channelStep)
+            {
+                scale = (channelMultiplier * channelStep + mult) % channels;
+                for (uint32_t i = 0; i < channelStepSize; ++i)
+                {
+                    unsigned int index = mult * channelStepSize * channelMultiplier +
+                                         channelStep * channelStepSize + i;
+                    this->operator[](index);
+                    decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scale[scale], 0));
+                }
+            }
+        }
+        return decodedTensor;
+    }
+
 private:
     std::vector<float> m_Scale;
 };
@@ -654,6 +871,35 @@ public:
         return m_Scales[m_AxisIndex];
     }
 
+    std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+    {
+        uint32_t channels = static_cast<uint32_t>(m_Scales.size());
+        uint32_t channelSteps = size / (channelStepSize * channelMultiplier);
+        uint32_t scale;
+
+        std::vector<float> decodedTensor;
+        decodedTensor.reserve(size);
+
+        // channelMultiplier is only used in depthwise convolutions and in other cases will cancel out
+        // channelStepSize is the length of a contiguous section of a channel within a tensor
+        // channelSteps is the number of those steps/blocks in the tensor
+        for (uint32_t mult = 0; mult < channelMultiplier; ++mult)
+        {
+            for (uint32_t channelStep = 0; channelStep < channelSteps; ++channelStep)
+            {
+                scale = (channelMultiplier * channelStep + mult) % channels;
+                for (uint32_t i = 0; i < channelStepSize; ++i)
+                {
+                    unsigned int index = mult * channelStepSize * channelMultiplier +
+                                         channelStep * channelStepSize + i;
+                    this->operator[](index);
+                    decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scales[scale], 0));
+                }
+            }
+        }
+        return decodedTensor;
+    }
+
 private:
     std::vector<float> m_Scales;
 };
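Every concrete decoder above now overrides DecodeTensor, so a workload pays one virtual call per tensor instead of two per element (operator[] followed by Get()). A minimal standalone sketch of that idea, with hypothetical names, assuming the usual affine quantization scheme real = scale * (quantized - offset), which is the formula armnn::Dequantize applies:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Bulk dequantization: pay the dispatch/setup cost once, then let the
    // hot loops read plain floats out of a contiguous buffer.
    std::vector<float> DequantizeAll(const uint8_t* data, size_t size,
                                     float scale, int32_t offset)
    {
        std::vector<float> decoded;
        decoded.reserve(size);
        for (size_t i = 0; i < size; ++i)
        {
            decoded.push_back(scale * (static_cast<int32_t>(data[i]) - offset));
        }
        return decoded;
    }

The per-axis overloads at the end of the file do the same, except the scale index is derived from the element's position: channelStepSize consecutive elements share one scale, and channelMultiplier accounts for the [M, C, H, W] depthwise filter layout.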
diff --git a/src/backends/reference/workloads/ConvImpl.cpp b/src/backends/reference/workloads/ConvImpl.cpp
index 7e8b8fffb6..f11c351c61 100644
--- a/src/backends/reference/workloads/ConvImpl.cpp
+++ b/src/backends/reference/workloads/ConvImpl.cpp
@@ -95,19 +95,45 @@ void Convolve(const TensorShape& rInputShape,
     const unsigned int heightIndex   = dataLayoutIndexed.GetHeightIndex();
     const unsigned int widthIndex    = dataLayoutIndexed.GetWidthIndex();
 
-    unsigned int depthMultiplier = depthwise ? rFilterShape[0] : 1;
-    unsigned int inputChannels   = depthwise ? rFilterShape[1] : rFilterShape[channelsIndex];
-    unsigned int outputChannels  = depthwise ? inputChannels * depthMultiplier : rFilterShape[0];
+    const unsigned int depthMultiplier = depthwise ? rFilterShape[0] : 1;
+    const unsigned int inputChannels   = depthwise ? rFilterShape[1] : rFilterShape[channelsIndex];
+    const unsigned int outputChannels  = depthwise ? inputChannels * depthMultiplier : rFilterShape[0];
 
-    unsigned int batchSize    = rOutputShape[0];
-    unsigned int outputHeight = rOutputShape[heightIndex];
-    unsigned int outputWidth  = rOutputShape[widthIndex];
-    unsigned int inputHeight  = rInputShape[heightIndex];
-    unsigned int inputWidth   = rInputShape[widthIndex];
+    const unsigned int batchSize    = rOutputShape[0];
+    const unsigned int outputHeight = rOutputShape[heightIndex];
+    const unsigned int outputWidth  = rOutputShape[widthIndex];
+    const unsigned int inputHeight  = rInputShape[heightIndex];
+    const unsigned int inputWidth   = rInputShape[widthIndex];
 
-    unsigned int filterHeight = depthwise ? rFilterShape[2] : rFilterShape[heightIndex];
-    unsigned int filterWidth  = depthwise ? rFilterShape[3] : rFilterShape[widthIndex];
+    const unsigned int filterHeight = depthwise ? rFilterShape[2] : rFilterShape[heightIndex];
+    const unsigned int filterWidth  = depthwise ? rFilterShape[3] : rFilterShape[widthIndex];
 
+    const std::vector<float> inputVec = rInputDecoder.DecodeTensor(rInputShape.GetNumElements());
+
+    uint32_t channelStepSize;
+    if (depthwise)
+    {
+        channelStepSize = filterHeight * filterWidth;
+    }
+    else
+    {
+        if (dataLayoutIndexed.GetDataLayout() == DataLayout::NHWC)
+        {
+            channelStepSize = rFilterShape[3];
+        }
+        else
+        {
+            channelStepSize = rFilterShape[1] * rFilterShape[2] * rFilterShape[3];
+        }
+    }
+
+    const std::vector<float> filterVec = rFilterDecoder.DecodeTensor(rFilterShape.GetNumElements(),
+                                                                     channelStepSize,
+                                                                     depthMultiplier);
+    const std::vector<float> biasVec = biasEnabled ?
+        pBiasDecoder->DecodeTensor(outputChannels) : std::vector<float>();
+
+    unsigned int depthwiseMultiplierIdx = 0;
     for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
     {
         for (unsigned int cOutput = 0; cOutput < outputChannels; cOutput++)
@@ -117,15 +143,15 @@ void Convolve(const TensorShape& rInputShape,
                 for (unsigned int xOutput = 0; xOutput < outputWidth; xOutput++)
                 {
                     // This loop goes over each output element.
-                    float sum =  0.0f;
+                    float sum = 0.0f;
 
                     // For depthwise, each output channel corresponds to exactly one input channel.
                     // For normal, must loop over each input channel.
                     for (unsigned int cInput = 0; cInput < (depthwise ? 1 : inputChannels); cInput++)
                     {
-                        unsigned int depthwiseMultiplierIdx = 0;
                         if (depthwise)
                         {
+                            depthwiseMultiplierIdx = 0;
                             cInput = cOutput / depthMultiplier;
                             depthwiseMultiplierIdx = cOutput % depthMultiplier;
                         }
@@ -149,7 +175,7 @@ void Convolve(const TensorShape& rInputShape,
                                 {
                                     // Keep this implementation, as using DataLayoutIndexed::GetIndex causes great
                                     // performance regression.
-                                    if (dataLayout == DataLayout::NHWC)
+                                    if (dataLayoutIndexed.GetDataLayout() == DataLayout::NHWC)
                                     {
                                         filterIndex = cOutput * filterHeight * filterWidth * inputChannels +
                                                       yFilter * filterWidth * inputChannels +
@@ -159,15 +185,12 @@ void Convolve(const TensorShape& rInputShape,
                                     else
                                     {
                                         filterIndex = cOutput * filterWidth * filterHeight * inputChannels +
-                                                      cInput * filterWidth * filterHeight  +
+                                                      cInput * filterWidth * filterHeight +
                                                       yFilter * filterWidth +
                                                       xFilter;
                                     }
                                 }
 
-                                rFilterDecoder.SetIndex(filterIndex, cOutput);
-                                float filterValue = rFilterDecoder.Get();
-
                                 unsigned int yInput = yOutput * yStride + yFilter * yDilation;
                                 unsigned int xInput = xOutput * xStride + xFilter * xDilation;
 
@@ -175,7 +198,7 @@ void Convolve(const TensorShape& rInputShape,
 
                                 // Check if we're in the padding.
                                 if (yInput < paddingTop || yInput >= inputHeight + paddingTop ||
-                                    xInput < paddingLeft || xInput >= inputWidth + paddingLeft )
+                                    xInput < paddingLeft || xInput >= inputWidth + paddingLeft)
                                 {
                                     inputValue = 0.0f;
                                 }
@@ -185,9 +208,9 @@ void Convolve(const TensorShape& rInputShape,
 
                                     // Keep this implementation, as using DataLayoutIndexed::GetIndex causes great
                                     // performance regression.
-                                    if (dataLayout == DataLayout::NHWC)
+                                    if (dataLayoutIndexed.GetDataLayout() == DataLayout::NHWC)
                                     {
-                                        inputIndex = batchIdx * inputHeight * inputWidth * inputChannels +
+                                        inputIndex = batchIdx * inputHeight * inputWidth * inputChannels +
                                                      (yInput - paddingTop) * inputWidth * inputChannels +
                                                      (xInput - paddingLeft) * inputChannels +
                                                      cInput;
@@ -199,23 +222,34 @@ void Convolve(const TensorShape& rInputShape,
                                                      inputWidth * (yInput - paddingTop) +
                                                      xInput - paddingLeft;
                                     }
-
-                                    rInputDecoder[inputIndex];
-                                    inputValue = rInputDecoder.Get();
+                                    inputValue = inputVec[inputIndex];
                                 }
 
-                                sum += filterValue * inputValue;
+                                sum += filterVec[filterIndex] * inputValue;
                             }
                         }
                     }
 
                     if (biasEnabled)
                     {
-                        (*pBiasDecoder).SetIndex(cOutput, cOutput);
-                        sum += pBiasDecoder->Get();
+                        sum += biasVec[cOutput];
                     }
 
-                    unsigned int outIdx = dataLayoutIndexed.GetIndex(rOutputShape, batchIdx, cOutput, yOutput, xOutput);
+                    unsigned int outIdx;
+                    if (dataLayoutIndexed.GetDataLayout() == DataLayout::NHWC)
+                    {
+                        outIdx = batchIdx * outputHeight * outputWidth * outputChannels +
+                                 yOutput * outputWidth * outputChannels +
+                                 xOutput * outputChannels +
+                                 cOutput;
+                    }
+                    else
+                    {
+                        outIdx = batchIdx * outputHeight * outputWidth * outputChannels +
+                                 cOutput * outputHeight * outputWidth +
+                                 yOutput * outputWidth +
+                                 xOutput;
+                    }
 
                     rOutputEncoder[outIdx];
                     rOutputEncoder.Set(sum);
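Convolve now decodes the input, filter, and bias tensors once, before the seven nested loops, and the inner loop indexes the resulting std::vector<float> buffers directly; per the comments in the hunk, the flat offsets stay inlined because DataLayoutIndexed::GetIndex was a measured regression. The two layouts reduce to the following arithmetic (illustrative helper names, not part of the patch):

    // Flat offset of element (n, h, w, c) in a dense NHWC tensor.
    inline unsigned int NhwcIndex(unsigned int n, unsigned int h, unsigned int w, unsigned int c,
                                  unsigned int height, unsigned int width, unsigned int channels)
    {
        return n * height * width * channels + h * width * channels + w * channels + c;
    }

    // Flat offset of element (n, c, h, w) in a dense NCHW tensor.
    inline unsigned int NchwIndex(unsigned int n, unsigned int c, unsigned int h, unsigned int w,
                                  unsigned int height, unsigned int width, unsigned int channels)
    {
        return n * channels * height * width + c * height * width + h * width + w;
    }

For example, NhwcIndex(batchIdx, yOutput, xOutput, cOutput, outputHeight, outputWidth, outputChannels) is exactly the NHWC branch of the outIdx computation above.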
diff --git a/src/backends/reference/workloads/FullyConnected.cpp b/src/backends/reference/workloads/FullyConnected.cpp
index 8016c1b628..61c8e88bce 100644
--- a/src/backends/reference/workloads/FullyConnected.cpp
+++ b/src/backends/reference/workloads/FullyConnected.cpp
@@ -14,6 +14,7 @@ void FullyConnected(const TensorShape& rInputShape,
                     Decoder<float>& rInputDecoder,
                     const TensorShape& rOutputShape,
                     Encoder<float>& rOutputEncoder,
+                    const TensorShape& rWeightsShape,
                     Decoder<float>& rWeightDecoder,
                     Decoder<float>& rBiasDecoder,
                     const bool biasEnabled,
@@ -23,6 +24,12 @@ void FullyConnected(const TensorShape& rInputShape,
     // Perform FullyConnected implementation
     unsigned int outputSize = rOutputShape[1];
 
+    const std::vector<float> decodedInputs  = rInputDecoder.DecodeTensor(rInputShape.GetNumElements());
+    const std::vector<float> decodedWeights = rWeightDecoder.DecodeTensor(rWeightsShape.GetNumElements());
+    const std::vector<float> decodedBiases  = biasEnabled ?
+        rBiasDecoder.DecodeTensor(outputSize) : std::vector<float>();
+
+
     for (unsigned int n = 0; n < rInputShape[0]; n++)
     {
         for (unsigned int channelOutput = 0; channelOutput < outputSize; channelOutput++)
@@ -34,23 +41,19 @@ void FullyConnected(const TensorShape& rInputShape,
                 float weight;
                 if (transposeWeights)
                 {
-                    rWeightDecoder[channelOutput * K + channelInput];
-                    weight = rWeightDecoder.Get();
+                    weight = decodedWeights[channelOutput * K + channelInput];
                 }
                 else
                 {
-                    rWeightDecoder[channelInput * outputSize + channelOutput];
-                    weight = rWeightDecoder.Get();
+                    weight = decodedWeights[channelInput * outputSize + channelOutput];
                 }
 
-                rInputDecoder[n * K + channelInput];
-                outval += weight * rInputDecoder.Get();
+                outval += weight * decodedInputs[n * K + channelInput];
             }
 
             if (biasEnabled)
             {
-                rBiasDecoder[channelOutput];
-                outval += rBiasDecoder.Get();
+                outval += decodedBiases[channelOutput];
             }
 
             rOutputEncoder[n * outputSize + channelOutput];
diff --git a/src/backends/reference/workloads/FullyConnected.hpp b/src/backends/reference/workloads/FullyConnected.hpp
index 6f9559db24..e33060631b 100644
--- a/src/backends/reference/workloads/FullyConnected.hpp
+++ b/src/backends/reference/workloads/FullyConnected.hpp
@@ -19,6 +19,7 @@ void FullyConnected(const TensorShape& rInputShape,
                     Decoder<float>& rInputDecoder,
                     const TensorShape& rOutputShape,
                     Encoder<float>& rOutputEncoder,
+                    const TensorShape& rWeightsShape,
                     Decoder<float>& rWeightDecoder,
                     Decoder<float>& rBiasDecoder,
                     bool biasEnabled,
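FullyConnected gains an rWeightsShape parameter solely so it can size the one-shot DecodeTensor call; the inner product itself is unchanged, just redirected at pre-decoded buffers. A reduced sketch of that inner loop over plain vectors, assuming weights are laid out [outputSize x K] when transposeWeights is true and [K x outputSize] otherwise (matching the index math above):

    #include <vector>

    // One output neuron's dot product over a pre-decoded input row.
    float DotRow(const std::vector<float>& input,    // length K, batch row n already offset away
                 const std::vector<float>& weights,
                 unsigned int channelOutput,
                 unsigned int K,
                 unsigned int outputSize,
                 bool transposeWeights)
    {
        float outval = 0.0f;
        for (unsigned int channelInput = 0; channelInput < K; channelInput++)
        {
            const float weight = transposeWeights
                               ? weights[channelOutput * K + channelInput]
                               : weights[channelInput * outputSize + channelOutput];
            outval += weight * input[channelInput];
        }
        return outval;
    }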
    if (params.m_PaddingMethod != PaddingMethod::Exclude &&
@@ -183,6 +180,8 @@ void Pooling2d(Decoder<float>& rInputDecoder,
         throw armnn::InvalidArgumentException("Unsupported padding type");
     }
 
+    const std::vector<float> decodedInputVec = rInputDecoder.DecodeTensor(inputInfo.GetNumElements());
+
     for (int n = 0; n < batchSize; n++)
     {
         for (int c = 0; c < channels; c++)
@@ -221,12 +220,24 @@ void Pooling2d(Decoder<float>& rInputDecoder,
                     {
                         result = 0.0f;
 
-                        unsigned int outputIndex = dataLayout.GetIndex(outputShape,
-                                                                       armnn::numeric_cast<unsigned int>(n),
-                                                                       armnn::numeric_cast<unsigned int>(c),
-                                                                       armnn::numeric_cast<unsigned int>(yOutput),
-                                                                       armnn::numeric_cast<unsigned int>(xOutput));
-                        rOutputEncoder[outputIndex];
+                        int outputIndex;
+
+                        if(dataLayout.GetDataLayout() == DataLayout::NHWC)
+                        {
+                            outputIndex = n * heightOutput * widthOutput * channels +
+                                          yOutput * widthOutput * channels +
+                                          xOutput * channels +
+                                          c;
+                        }
+                        else
+                        {
+                            outputIndex = n * heightOutput * widthOutput * channels +
+                                          c * heightOutput * widthOutput +
+                                          yOutput * widthOutput +
+                                          xOutput;
+                        }
+
+                        rOutputEncoder[static_cast<unsigned int>(outputIndex)];
                         rOutputEncoder.Set(result);
                         continue;
                     }
@@ -244,28 +255,48 @@ void Pooling2d(Decoder<float>& rInputDecoder,
                     {
                         for (auto xInput = wstart; xInput < wend; xInput++)
                         {
-                            unsigned int inputIndex = dataLayout.GetIndex(inputShape,
-                                                                          armnn::numeric_cast<unsigned int>(n),
-                                                                          armnn::numeric_cast<unsigned int>(c),
-                                                                          armnn::numeric_cast<unsigned int>(yInput),
-                                                                          armnn::numeric_cast<unsigned int>(xInput));
-
-                            rInputDecoder[inputIndex];
-                            float inval = rInputDecoder.Get();
-
-                            accumulate(result, inval);
+                            int inputIndex;
+                            if(dataLayout.GetDataLayout() == DataLayout::NHWC)
+                            {
+                                inputIndex = n * heightInput * widthInput * channels +
+                                             yInput * widthInput * channels +
+                                             xInput * channels +
+                                             c;
+
+                            }
+                            else
+                            {
+                                inputIndex = n * heightInput * widthInput * channels +
+                                             c * heightInput * widthInput +
+                                             yInput * widthInput +
+                                             xInput;
+                            }
+
+                            accumulate(result, decodedInputVec[static_cast<unsigned int>(inputIndex)]);
                         }
                     }
 
                     execute(result, poolAreaSize);
 
-                    unsigned int outputIndex = dataLayout.GetIndex(outputShape,
-                                                                   armnn::numeric_cast<unsigned int>(n),
-                                                                   armnn::numeric_cast<unsigned int>(c),
-                                                                   armnn::numeric_cast<unsigned int>(yOutput),
-                                                                   armnn::numeric_cast<unsigned int>(xOutput));
+                    int outputIndex;
+
+                    if(dataLayout.GetDataLayout() == DataLayout::NHWC)
+                    {
+                        outputIndex = n * heightOutput * widthOutput * channels +
+                                      yOutput * widthOutput * channels +
+                                      xOutput * channels +
+                                      c;
+                    }
+                    else
+                    {
+                        outputIndex = n * heightOutput * widthOutput * channels +
+                                      c * heightOutput * widthOutput +
+                                      yOutput * widthOutput +
+                                      xOutput;
+                    }
 
-                    rOutputEncoder[outputIndex];
+                    rOutputEncoder[static_cast<unsigned int>(outputIndex)];
                     rOutputEncoder.Set(result);
                 }
             }
diff --git a/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp b/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
index 32c65d3ebd..9acca219b5 100644
--- a/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
+++ b/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
@@ -58,6 +58,7 @@ void RefFullyConnectedWorkload::Execute() const
                    *m_InputDecoder,
                    m_OutputShape,
                    *m_OutputEncoder,
+                   m_WeightShape,
                    *m_WeightDecoder,
                    *m_BiasDecoder,
                    m_Data.m_Parameters.m_BiasEnabled,
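Pooling2d follows the same recipe: decode the whole input up front and inline the NHWC/NCHW offset arithmetic. The indices are kept as int because the window bounds are computed with signed arithmetic that can go negative before clamping; the cast back to unsigned happens only at the final, in-range lookup. The accumulate/execute pair it calls is chosen once per pool type; a sketch of that functor pattern with hypothetical lambdas (GetAccumulator and GetExecutor are the file's factories, whose bodies are not shown in this patch):

    #include <algorithm>
    #include <functional>

    using Accumulator = std::function<void(float&, float)>;
    using Executor    = std::function<void(float&, float)>;

    // Max pooling: keep the running maximum; nothing to do at the end.
    const Accumulator maxAccumulate = [](float& acc, float value) { acc = std::max(acc, value); };
    const Executor    maxExecute    = [](float&, float) {};

    // Average pooling: sum, then divide by the pool area once per window.
    const Accumulator avgAccumulate = [](float& acc, float value) { acc += value; };
    const Executor    avgExecute    = [](float& acc, float poolArea) { acc /= poolArea; };

The one-line RefFullyConnectedWorkload change simply threads the stored m_WeightShape through to the new FullyConnected signature.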
diff --git a/src/backends/reference/workloads/TransposeConvolution2d.cpp b/src/backends/reference/workloads/TransposeConvolution2d.cpp
index 5698014181..c34a309806 100644
--- a/src/backends/reference/workloads/TransposeConvolution2d.cpp
+++ b/src/backends/reference/workloads/TransposeConvolution2d.cpp
@@ -30,27 +30,35 @@ void TransposeConvolution2dImpl(const TransposeConvolution2dDescriptor& descriptor,
     const unsigned int heightIndex   = dataLayoutIndexed.GetHeightIndex();
     const unsigned int widthIndex    = dataLayoutIndexed.GetWidthIndex();
 
-    unsigned int numBatches = inputShape[0];
+    const unsigned int numBatches = inputShape[0];
 
-    unsigned int inputWidth  = inputShape[widthIndex];
-    unsigned int inputHeight = inputShape[heightIndex];
-    unsigned int inputDepth  = inputShape[channelsIndex];
+    const unsigned int inputWidth  = inputShape[widthIndex];
+    const unsigned int inputHeight = inputShape[heightIndex];
+    const unsigned int inputDepth  = inputShape[channelsIndex];
 
-    unsigned int weightsHeight = weightsShape[heightIndex];
-    unsigned int weightsWidth  = weightsShape[widthIndex];
+    const unsigned int weightsHeight = weightsShape[heightIndex];
+    const unsigned int weightsWidth  = weightsShape[widthIndex];
+    const unsigned int weightsDepth  = weightsShape[channelsIndex];
 
-    unsigned int outputHeight = outputShape[heightIndex];
-    unsigned int outputWidth  = outputShape[widthIndex];
-    unsigned int outputDepth  = outputShape[channelsIndex];
+    const unsigned int outputHeight = outputShape[heightIndex];
+    const unsigned int outputWidth  = outputShape[widthIndex];
+    const unsigned int outputDepth  = outputShape[channelsIndex];
 
-    unsigned int paddingLeft = descriptor.m_PadLeft;
-    unsigned int paddingTop  = descriptor.m_PadTop;
+    const unsigned int paddingLeft = descriptor.m_PadLeft;
+    const unsigned int paddingTop  = descriptor.m_PadTop;
 
-    unsigned int strideX = descriptor.m_StrideX;
-    unsigned int strideY = descriptor.m_StrideY;
+    const unsigned int strideX = descriptor.m_StrideX;
+    const unsigned int strideY = descriptor.m_StrideY;
 
     std::vector<float> outputBuffer(outputShape.GetNumElements(), 0);
 
+    const std::vector<float> inputVec = inputDecoder.DecodeTensor(inputShape.GetNumElements());
+
+    const unsigned channelStep = weightsWidth * weightsHeight * weightsDepth;
+
+    const std::vector<float> filterVec =
+        weightsDecoder.DecodeTensor(weightsShape.GetNumElements(), channelStep);
+
     for (unsigned int batch = 0u; batch < numBatches; ++batch)
     {
         for (unsigned int yInput = 0u; yInput < inputHeight; ++yInput)
@@ -73,25 +81,51 @@ void TransposeConvolution2dImpl(const TransposeConvolution2dDescriptor& descriptor,
                         {
                             for (unsigned int dInput = 0u; dInput < inputDepth; dInput++)
                             {
-                                const unsigned int inputIndex =
-                                    dataLayoutIndexed.GetIndex(inputShape, batch, dInput, yInput, xInput);
-                                inputDecoder[inputIndex];
-
-                                const unsigned int weightsIndex =
-                                    dataLayoutIndexed.GetIndex(weightsShape, dOutput, dInput, yWeights, xWeights);
-                                weightsDecoder.SetIndex(weightsIndex, dOutput);
-
-                                const unsigned int outputIndex =
-                                    dataLayoutIndexed.GetIndex(outputShape, batch, dOutput, yOutput, xOutput);
-                                outputEncoder[outputIndex];
-
-                                float output  = outputBuffer[outputIndex];
-                                output       += inputDecoder.Get() * weightsDecoder.Get();
-                                outputBuffer[outputIndex] = output;
+                                unsigned int inputIndex;
+                                unsigned int outputIndex;
+                                unsigned int weightsIndex;
+
+                                if(descriptor.m_DataLayout == armnn::DataLayout::NHWC)
+                                {
+                                    inputIndex = batch * inputHeight * inputWidth * inputDepth +
+                                                 yInput * inputWidth * inputDepth +
+                                                 xInput * inputDepth +
+                                                 dInput;
+
+                                    weightsIndex = dOutput * weightsHeight * weightsWidth * weightsDepth +
+                                                   yWeights * weightsWidth * weightsDepth +
+                                                   xWeights * weightsDepth +
+                                                   dInput;
+
+                                    outputIndex = batch * outputHeight * outputWidth * outputDepth +
+                                                  yOutput * outputWidth * outputDepth +
+                                                  xOutput * outputDepth +
+                                                  dOutput;
+                                }
+                                else
+                                {
+                                    inputIndex = batch * inputDepth * inputHeight * inputWidth +
+                                                 dInput * inputHeight * inputWidth +
+                                                 yInput * inputWidth +
+                                                 xInput;
+
+                                    weightsIndex = dOutput * weightsDepth * weightsHeight * weightsWidth +
+                                                   dInput * weightsHeight * weightsWidth +
+                                                   yWeights * weightsWidth +
+                                                   xWeights;
+
+                                    outputIndex = batch * outputDepth * outputHeight * outputWidth +
+                                                  dOutput * outputHeight * outputWidth +
+                                                  yOutput * outputWidth +
+                                                  xOutput;
+                                }
+
+                                outputBuffer[outputIndex] += inputVec[inputIndex] * filterVec[weightsIndex];
                             }
                         }
                     }
                 }
+
             }
         }
     }
-- 
cgit v1.2.1
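Taken together, the pattern throughout this patch is: hoist every Decoder<float> read out of the element loops into a single DecodeTensor call, and keep only the Encoder writes per element. The refactor's correctness rests on bulk decode being element-for-element identical to the old operator[]/Get() path, which is easy to spot-check in isolation; a self-contained sanity check under that assumption, with made-up values and a local Dequantize mirroring the affine formula:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    float Dequantize(uint8_t value, float scale, int32_t offset)
    {
        return scale * (static_cast<int32_t>(value) - offset);
    }

    int main()
    {
        const float scale    = 0.5f;
        const int32_t offset = 128;
        const std::vector<uint8_t> quantized = { 0, 64, 128, 192, 255 };

        // DecodeTensor-style pass: decode everything once.
        std::vector<float> bulk;
        bulk.reserve(quantized.size());
        for (uint8_t q : quantized)
        {
            bulk.push_back(Dequantize(q, scale, offset));
        }

        // Get()-style pass: decode one element at a time and compare.
        for (size_t i = 0; i < quantized.size(); ++i)
        {
            assert(bulk[i] == Dequantize(quantized[i], scale, offset));
        }
        return 0;
    }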