From f2aaab3a06024b5d5c538cc42799fb2c91b4ca2b Mon Sep 17 00:00:00 2001 From: Matteo Martincigh Date: Thu, 6 Jun 2019 15:46:22 +0100 Subject: IVGCVSW-3223 Fix ref convolution performance regression * Do not use DataLayoutIndexed::GetIndex for weights and inputs, as it causes a large regression in performance * It turned out that the calculation of the indexes for the weights and inputs was the way it was because of an optimization done many months ago * Reverted the relevant hunks and added some comments so we won't make the same mistake again in the future * Made the GetIndex function inline to speed up other usages Change-Id: I343b2ef0446993086f58b9dea1f0de0ba2d92216 Signed-off-by: Matteo Martincigh --- src/armnnUtils/DataLayoutIndexed.cpp | 37 ---------------------- src/armnnUtils/DataLayoutIndexed.hpp | 39 ++++++++++++++++++++++-- src/backends/reference/workloads/ConvImpl.cpp | 44 +++++++++++++++++++++------ 3 files changed, 70 insertions(+), 50 deletions(-) diff --git a/src/armnnUtils/DataLayoutIndexed.cpp b/src/armnnUtils/DataLayoutIndexed.cpp index b02f07ec85..02f1e816ac 100644 --- a/src/armnnUtils/DataLayoutIndexed.cpp +++ b/src/armnnUtils/DataLayoutIndexed.cpp @@ -4,9 +4,6 @@ // #include "DataLayoutIndexed.hpp" - -#include - using namespace armnn; namespace armnnUtils @@ -33,40 +30,6 @@ DataLayoutIndexed::DataLayoutIndexed(armnn::DataLayout dataLayout) } } -unsigned int DataLayoutIndexed::GetIndex(const TensorShape& shape, - unsigned int batchIndex, unsigned int channelIndex, - unsigned int heightIndex, unsigned int widthIndex) const -{ - BOOST_ASSERT( batchIndex < shape[0] || ( shape[0] == 0 && batchIndex == 0 ) ); - BOOST_ASSERT( channelIndex < shape[m_ChannelsIndex] || - ( shape[m_ChannelsIndex] == 0 && channelIndex == 0) ); - BOOST_ASSERT( heightIndex < shape[m_HeightIndex] || - ( shape[m_HeightIndex] == 0 && heightIndex == 0) ); - BOOST_ASSERT( widthIndex < shape[m_WidthIndex] || - ( shape[m_WidthIndex] == 0 && widthIndex == 0) ); - - // Offset the 
given indices appropriately depending on the data layout - switch (m_DataLayout) - { - case DataLayout::NHWC: - batchIndex *= shape[1] * shape[2] * shape[3]; // batchIndex *= heightIndex * widthIndex * channelIndex - heightIndex *= shape[m_WidthIndex] * shape[m_ChannelsIndex]; - widthIndex *= shape[m_ChannelsIndex]; - // channelIndex stays unchanged - break; - case DataLayout::NCHW: - default: - batchIndex *= shape[1] * shape[2] * shape[3]; // batchIndex *= heightIndex * widthIndex * channelIndex - channelIndex *= shape[m_HeightIndex] * shape[m_WidthIndex]; - heightIndex *= shape[m_WidthIndex]; - // widthIndex stays unchanged - break; - } - - // Get the value using the correct offset - return batchIndex + channelIndex + heightIndex + widthIndex; -} - bool operator==(const DataLayout& dataLayout, const DataLayoutIndexed& indexed) { return dataLayout == indexed.GetDataLayout(); diff --git a/src/armnnUtils/DataLayoutIndexed.hpp b/src/armnnUtils/DataLayoutIndexed.hpp index 5bb8e0d93f..8bd9701a5e 100644 --- a/src/armnnUtils/DataLayoutIndexed.hpp +++ b/src/armnnUtils/DataLayoutIndexed.hpp @@ -8,6 +8,8 @@ #include #include +#include + namespace armnnUtils { @@ -21,9 +23,40 @@ public: unsigned int GetChannelsIndex() const { return m_ChannelsIndex; } unsigned int GetHeightIndex() const { return m_HeightIndex; } unsigned int GetWidthIndex() const { return m_WidthIndex; } - unsigned int GetIndex(const armnn::TensorShape& shape, - unsigned int batchIndex, unsigned int channelIndex, - unsigned int heightIndex, unsigned int widthIndex) const; + + inline unsigned int GetIndex(const armnn::TensorShape& shape, + unsigned int batchIndex, unsigned int channelIndex, + unsigned int heightIndex, unsigned int widthIndex) const + { + BOOST_ASSERT( batchIndex < shape[0] || ( shape[0] == 0 && batchIndex == 0 ) ); + BOOST_ASSERT( channelIndex < shape[m_ChannelsIndex] || + ( shape[m_ChannelsIndex] == 0 && channelIndex == 0) ); + BOOST_ASSERT( heightIndex < shape[m_HeightIndex] || + ( 
shape[m_HeightIndex] == 0 && heightIndex == 0) ); + BOOST_ASSERT( widthIndex < shape[m_WidthIndex] || + ( shape[m_WidthIndex] == 0 && widthIndex == 0) ); + + // Offset the given indices appropriately depending on the data layout + switch (m_DataLayout) + { + case armnn::DataLayout::NHWC: + batchIndex *= shape[1] * shape[2] * shape[3]; // batchIndex *= product of the three non-batch dimension sizes + heightIndex *= shape[m_WidthIndex] * shape[m_ChannelsIndex]; + widthIndex *= shape[m_ChannelsIndex]; + // channelIndex stays unchanged + break; + case armnn::DataLayout::NCHW: + default: + batchIndex *= shape[1] * shape[2] * shape[3]; // batchIndex *= product of the three non-batch dimension sizes + channelIndex *= shape[m_HeightIndex] * shape[m_WidthIndex]; + heightIndex *= shape[m_WidthIndex]; + // widthIndex stays unchanged + break; + } + + // Get the value using the correct offset + return batchIndex + channelIndex + heightIndex + widthIndex; + } private: armnn::DataLayout m_DataLayout; diff --git a/src/backends/reference/workloads/ConvImpl.cpp b/src/backends/reference/workloads/ConvImpl.cpp index 801a29af1a..92e3b2d7dd 100644 --- a/src/backends/reference/workloads/ConvImpl.cpp +++ b/src/backends/reference/workloads/ConvImpl.cpp @@ -147,11 +147,22 @@ void Convolve(const TensorShape& rInputShape, } else { - filterIndex = dataLayoutIndexed.GetIndex(rFilterShape, - cOutput, - cInput, - yFilter, - xFilter); + // Keep this implementation, as using DataLayoutIndexed::GetIndex causes a large + // performance regression. 
+ if (dataLayout == DataLayout::NHWC) + { + filterIndex = cOutput * filterHeight * filterWidth * inputChannels + + yFilter * filterWidth * inputChannels + + xFilter * inputChannels + + cInput; + } + else + { + filterIndex = cOutput * filterWidth * filterHeight * inputChannels + + cInput * filterWidth * filterHeight + + yFilter * filterWidth + + xFilter; + } } rFilterDecoder[filterIndex]; @@ -170,11 +181,24 @@ void Convolve(const TensorShape& rInputShape, } else { - unsigned int inputIndex = dataLayoutIndexed.GetIndex(rInputShape, - batchIdx, - cInput, - yInput - paddingTop, - xInput - paddingLeft); + unsigned int inputIndex = 0; + + // Keep this implementation, as using DataLayoutIndexed::GetIndex causes a large + // performance regression. + if (dataLayout == DataLayout::NHWC) + { + inputIndex = batchIdx * inputHeight * inputWidth * inputChannels + + (yInput - paddingTop) * inputWidth * inputChannels + + (xInput - paddingLeft) * inputChannels + + cInput; + } + else + { + inputIndex = batchIdx * inputWidth * inputHeight * inputChannels + + inputWidth * inputHeight * cInput + + inputWidth * (yInput - paddingTop) + + xInput - paddingLeft; + } rInputDecoder[inputIndex]; inputValue = rInputDecoder.Get(); -- cgit v1.2.1