aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Finn Williams <Finn.Williams@arm.com> 2020-09-17 15:58:31 +0100
committer finn.williams <finn.williams@arm.com> 2020-09-28 09:01:58 +0000
commit b9dcfe63b87f024c6f8c5f4b68447de04119dc19 (patch)
tree 0c58376c59190ecbc8df0dd2abedbf85983d5256 /src
parent be727becad9fe048480ab53a0281b46594f95ca7 (diff)
download armnn-b9dcfe63b87f024c6f8c5f4b68447de04119dc19.tar.gz
IVGCVSW-5325 Speed up the reference backend
Change-Id: Id8bd0a0418be31d975b944b54bbacb25051ffb2e Signed-off-by: Finn Williams <Finn.Williams@arm.com>
Diffstat (limited to 'src')
-rw-r--r-- src/backends/reference/workloads/BaseIterator.hpp | 248
-rw-r--r-- src/backends/reference/workloads/ConvImpl.cpp | 88
-rw-r--r-- src/backends/reference/workloads/FullyConnected.cpp | 19
-rw-r--r-- src/backends/reference/workloads/FullyConnected.hpp | 1
-rw-r--r-- src/backends/reference/workloads/Pooling2d.cpp | 79
-rw-r--r-- src/backends/reference/workloads/RefFullyConnectedWorkload.cpp | 1
-rw-r--r-- src/backends/reference/workloads/TransposeConvolution2d.cpp | 90
7 files changed, 438 insertions, 88 deletions
diff --git a/src/backends/reference/workloads/BaseIterator.hpp b/src/backends/reference/workloads/BaseIterator.hpp
index 1f4f2da717..0165ec7c7a 100644
--- a/src/backends/reference/workloads/BaseIterator.hpp
+++ b/src/backends/reference/workloads/BaseIterator.hpp
@@ -44,6 +44,10 @@ public:
virtual void Reset(void*) = 0;
virtual IType Get() const = 0;
+
+ virtual std::vector<float> DecodeTensor(uint32_t size, // number of elements to decode into the returned vector
+ uint32_t channelStep = 1, // length of a contiguous per-channel run; only the per-axis decoders read it
+ uint32_t channelMultiplier = 1) = 0; // depthwise channel multiplier; only the per-axis decoders read it
};
template<typename IType>
@@ -130,7 +134,24 @@ public:
return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset);
}
+ std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+ {
+     // Dequantizes the first `size` elements with the single scale/offset pair of this decoder.
+     // The per-channel arguments exist only to satisfy the common interface.
+     IgnoreUnused(channelStepSize, channelMultiplier);
+
+     std::vector<float> dequantized;
+     dequantized.reserve(size);
+
+     uint32_t pos = 0;
+     while (pos < size)
+     {
+         // operator[] is relied on to point m_Iterator at element `pos` before it is dereferenced.
+         (*this)[pos];
+         dequantized.push_back(armnn::Dequantize(*m_Iterator, m_Scale, m_Offset));
+         ++pos;
+     }
+
+     return dequantized;
+ }
+
private:
+
const float m_Scale;
const int32_t m_Offset;
};
@@ -149,9 +170,26 @@ public:
return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset);
}
+ std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+ {
+     // One scale/offset pair applies to the whole tensor, so the channel arguments are unused.
+     IgnoreUnused(channelStepSize, channelMultiplier);
+
+     std::vector<float> floats;
+     floats.reserve(size);
+
+     for (uint32_t element = 0; element != size; ++element)
+     {
+         // Select the element first, then dequantize whatever m_Iterator now refers to.
+         (*this)[element];
+         const float value = armnn::Dequantize(*m_Iterator, m_Scale, m_Offset);
+         floats.push_back(value);
+     }
+
+     return floats;
+ }
+
private:
const float m_Scale;
const int32_t m_Offset;
+
};
class QSymmS8Decoder : public TypedIterator<const int8_t, Decoder<float>>
@@ -168,9 +206,26 @@ public:
return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset);
}
+ std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+ {
+     // Returns `size` dequantized values in element order; channel arguments are ignored here.
+     IgnoreUnused(channelStepSize, channelMultiplier);
+
+     std::vector<float> output;
+     output.reserve(size);
+
+     uint32_t remaining = size;
+     uint32_t cursor = 0;
+     while (remaining != 0)
+     {
+         (*this)[cursor];
+         output.push_back(armnn::Dequantize(*m_Iterator, m_Scale, m_Offset));
+         ++cursor;
+         --remaining;
+     }
+
+     return output;
+ }
+
private:
const float m_Scale;
const int32_t m_Offset;
+
};
class QSymm16Decoder : public TypedIterator<const int16_t, Decoder<float>>
@@ -187,9 +242,28 @@ public:
return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset);
}
+
+
+ std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+ {
+     // Whole-tensor dequantization with a single scale/offset; the channel parameters play no part.
+     IgnoreUnused(channelStepSize, channelMultiplier);
+
+     std::vector<float> result;
+     result.reserve(size);
+
+     for (uint32_t idx = 0; idx < size; idx++)
+     {
+         // Move to element `idx`, then convert the value under m_Iterator.
+         (*this)[idx];
+         const float real = armnn::Dequantize(*m_Iterator, m_Scale, m_Offset);
+         result.push_back(real);
+     }
+
+     return result;
+ }
+
private:
const float m_Scale;
const int32_t m_Offset;
+
};
class BFloat16Decoder : public TypedIterator<const BFloat16, Decoder<float>>
@@ -207,6 +281,26 @@ public:
armnnUtils::FloatingPointConverter::ConvertBFloat16ToFloat32(m_Iterator, 1, &val);
return val;
}
+
+ std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+ {
+     // Converts the first `size` BFloat16 elements to float, one element per converter call.
+     IgnoreUnused(channelStepSize, channelMultiplier);
+
+     std::vector<float> converted;
+     converted.reserve(size);
+
+     uint32_t pos = 0;
+     while (pos < size)
+     {
+         (*this)[pos];
+
+         // The converter writes through the output pointer; start from zero as a defined default.
+         float asFloat = 0.f;
+         armnnUtils::FloatingPointConverter::ConvertBFloat16ToFloat32(m_Iterator, 1, &asFloat);
+         converted.push_back(asFloat);
+         ++pos;
+     }
+
+     return converted;
+ }
+
};
class Float16Decoder : public TypedIterator<const Half, Decoder<float>>
@@ -224,6 +318,26 @@ public:
armnnUtils::FloatingPointConverter::ConvertFloat16To32(m_Iterator, 1, &val);
return val;
}
+
+ std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+ {
+     // Converts the first `size` half-precision elements to float, one element per converter call.
+     IgnoreUnused(channelStepSize, channelMultiplier);
+
+     std::vector<float> widened;
+     widened.reserve(size);
+
+     for (uint32_t element = 0; element != size; ++element)
+     {
+         float single = 0.f;
+         (*this)[element];
+         armnnUtils::FloatingPointConverter::ConvertFloat16To32(m_Iterator, 1, &single);
+         widened.push_back(single);
+     }
+
+     return widened;
+ }
+
+
};
class Float32Decoder : public TypedIterator<const float, Decoder<float>>
@@ -239,6 +353,16 @@ public:
{
return *m_Iterator;
}
+ std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+ {
+     // The data is already float, so the result is a straight copy of the `size` elements
+     // starting at m_Start; the channel parameters are irrelevant.
+     IgnoreUnused(channelStepSize, channelMultiplier);
+
+     return std::vector<float>(m_Start, m_Start + size);
+ }
};
class ScaledInt32Decoder : public TypedIterator<const int32_t, Decoder<float>>
@@ -255,8 +379,25 @@ public:
return static_cast<float>(*m_Iterator) * m_Scale;
}
+ std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+ {
+     // Scales each of the first `size` int32 elements by m_Scale; channel parameters are unused.
+     IgnoreUnused(channelStepSize, channelMultiplier);
+
+     std::vector<float> scaled;
+     scaled.reserve(size);
+
+     uint32_t pos = 0;
+     while (pos < size)
+     {
+         (*this)[pos];
+         const float raw = static_cast<float>(*m_Iterator);
+         scaled.push_back(raw * m_Scale);
+         ++pos;
+     }
+
+     return scaled;
+ }
+
private:
const float m_Scale;
+
};
class Int32Decoder : public TypedIterator<const int32_t, Decoder<float>>
@@ -272,6 +413,22 @@ public:
{
return static_cast<float>(*m_Iterator);
}
+
+ std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+ {
+     // Casts each of the first `size` int32 elements to float; no scaling is applied.
+     IgnoreUnused(channelStepSize, channelMultiplier);
+
+     std::vector<float> values;
+     values.reserve(size);
+
+     for (uint32_t element = 0; element != size; ++element)
+     {
+         (*this)[element];
+         const float asFloat = static_cast<float>(*m_Iterator);
+         values.push_back(asFloat);
+     }
+
+     return values;
+ }
};
class Int32ToInt32tDecoder : public TypedIterator<const int32_t, Decoder<int32_t>>
@@ -287,6 +444,22 @@ public:
{
return *m_Iterator;
}
+
+ std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+ {
+     // The interface returns floats even for this int32 decoder, so each element is converted
+     // to float on the way out.
+     IgnoreUnused(channelStepSize, channelMultiplier);
+
+     std::vector<float> values;
+     values.reserve(size);
+
+     uint32_t pos = 0;
+     while (pos < size)
+     {
+         (*this)[pos];
+         values.push_back(static_cast<float>(*m_Iterator));
+         ++pos;
+     }
+
+     return values;
+ }
};
class BooleanDecoder : public TypedIterator<const uint8_t, Decoder<float>>
@@ -303,6 +476,21 @@ public:
return *m_Iterator;
}
+ std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+ {
+     // Copies the first `size` stored bytes out as floats; channel parameters are unused.
+     IgnoreUnused(channelStepSize, channelMultiplier);
+
+     std::vector<float> flags;
+     flags.reserve(size);
+
+     for (uint32_t element = 0; element != size; ++element)
+     {
+         (*this)[element];
+         flags.push_back(static_cast<float>(*m_Iterator));
+     }
+
+     return flags;
+ }
};
class QASymm8Encoder : public TypedIterator<uint8_t, Encoder<float>>
@@ -530,7 +718,7 @@ template<typename T, typename Base>
class PerAxisIterator : public Base
{
public:
- // axisFactor is used to calculate axisIndex
+ // axisFactor is used to calculate channelStep
PerAxisIterator(T* data = nullptr, unsigned int axisFactor = 0)
: m_Iterator(data), m_Start(data), m_AxisIndex(0), m_AxisFactor(axisFactor)
{}
@@ -607,6 +795,35 @@ public:
return m_Scale[m_AxisIndex];
}
+ std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+ {
+     // Dequantizes `size` elements using one scale per channel (m_Scale) and a zero offset.
+     // The result is in storage order, so it can be indexed exactly like the encoded tensor.
+     //
+     // channelMultiplier is only used in depthwise convolutions and in other cases will cancel out
+     // channelStepSize is the length of a contiguous section of a channel within a tensor
+     // channelSteps is the number of those steps/blocks in the tensor
+     //
+     // channelStepSize * channelMultiplier is used as a divisor and must be non-zero.
+     const uint32_t channels = static_cast<uint32_t>(m_Scale.size());
+     const uint32_t channelSteps = size / (channelStepSize * channelMultiplier);
+
+     std::vector<float> decodedTensor;
+     decodedTensor.reserve(size);
+
+     for (uint32_t mult = 0; mult < channelMultiplier; ++mult)
+     {
+         for (uint32_t channelStep = 0; channelStep < channelSteps; ++channelStep)
+         {
+             const uint32_t scale = (channelMultiplier * channelStep + mult) % channels;
+
+             // Every multiplier slice holds channelSteps blocks of channelStepSize elements, so the
+             // stride between slices is channelSteps * channelStepSize. Striding by channelMultiplier
+             // instead decodes the wrong elements whenever channelMultiplier != channelSteps and
+             // reads past the tensor when channelMultiplier > channelSteps.
+             const uint32_t blockStart = (mult * channelSteps + channelStep) * channelStepSize;
+
+             for (uint32_t i = 0; i < channelStepSize; ++i)
+             {
+                 this->operator[](blockStart + i);
+                 decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scale[scale], 0));
+             }
+         }
+     }
+     return decodedTensor;
+ }
+
private:
std::vector<float> m_Scale;
};
@@ -654,6 +871,35 @@ public:
return m_Scales[m_AxisIndex];
}
+ std::vector<float> DecodeTensor(uint32_t size, uint32_t channelStepSize, uint32_t channelMultiplier) override
+ {
+     // Dequantizes `size` elements using one scale per channel (m_Scales) and a zero offset.
+     // The result is in storage order, so it can be indexed exactly like the encoded tensor.
+     //
+     // channelMultiplier is only used in depthwise convolutions and in other cases will cancel out
+     // channelStepSize is the length of a contiguous section of a channel within a tensor
+     // channelSteps is the number of those steps/blocks in the tensor
+     //
+     // channelStepSize * channelMultiplier is used as a divisor and must be non-zero.
+     const uint32_t channels = static_cast<uint32_t>(m_Scales.size());
+     const uint32_t channelSteps = size / (channelStepSize * channelMultiplier);
+
+     std::vector<float> decodedTensor;
+     decodedTensor.reserve(size);
+
+     for (uint32_t mult = 0; mult < channelMultiplier; ++mult)
+     {
+         for (uint32_t channelStep = 0; channelStep < channelSteps; ++channelStep)
+         {
+             const uint32_t scale = (channelMultiplier * channelStep + mult) % channels;
+
+             // Every multiplier slice holds channelSteps blocks of channelStepSize elements, so the
+             // stride between slices is channelSteps * channelStepSize. Striding by channelMultiplier
+             // instead decodes the wrong elements whenever channelMultiplier != channelSteps and
+             // reads past the tensor when channelMultiplier > channelSteps.
+             const uint32_t blockStart = (mult * channelSteps + channelStep) * channelStepSize;
+
+             for (uint32_t i = 0; i < channelStepSize; ++i)
+             {
+                 this->operator[](blockStart + i);
+                 decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scales[scale], 0));
+             }
+         }
+     }
+     return decodedTensor;
+ }
+
private:
std::vector<float> m_Scales;
};
diff --git a/src/backends/reference/workloads/ConvImpl.cpp b/src/backends/reference/workloads/ConvImpl.cpp
index 7e8b8fffb6..f11c351c61 100644
--- a/src/backends/reference/workloads/ConvImpl.cpp
+++ b/src/backends/reference/workloads/ConvImpl.cpp
@@ -95,19 +95,45 @@ void Convolve(const TensorShape& rInputShape,
const unsigned int heightIndex = dataLayoutIndexed.GetHeightIndex();
const unsigned int widthIndex = dataLayoutIndexed.GetWidthIndex();
- unsigned int depthMultiplier = depthwise ? rFilterShape[0] : 1;
- unsigned int inputChannels = depthwise ? rFilterShape[1] : rFilterShape[channelsIndex];
- unsigned int outputChannels = depthwise ? inputChannels * depthMultiplier : rFilterShape[0];
+ const unsigned int depthMultiplier = depthwise ? rFilterShape[0] : 1;
+ const unsigned int inputChannels = depthwise ? rFilterShape[1] : rFilterShape[channelsIndex];
+ const unsigned int outputChannels = depthwise ? inputChannels * depthMultiplier : rFilterShape[0];
- unsigned int batchSize = rOutputShape[0];
- unsigned int outputHeight = rOutputShape[heightIndex];
- unsigned int outputWidth = rOutputShape[widthIndex];
- unsigned int inputHeight = rInputShape[heightIndex];
- unsigned int inputWidth = rInputShape[widthIndex];
+ const unsigned int batchSize = rOutputShape[0];
+ const unsigned int outputHeight = rOutputShape[heightIndex];
+ const unsigned int outputWidth = rOutputShape[widthIndex];
+ const unsigned int inputHeight = rInputShape[heightIndex];
+ const unsigned int inputWidth = rInputShape[widthIndex];
- unsigned int filterHeight = depthwise ? rFilterShape[2] : rFilterShape[heightIndex];
- unsigned int filterWidth = depthwise ? rFilterShape[3] : rFilterShape[widthIndex];
+ const unsigned int filterHeight = depthwise ? rFilterShape[2] : rFilterShape[heightIndex];
+ const unsigned int filterWidth = depthwise ? rFilterShape[3] : rFilterShape[widthIndex];
+ const std::vector<float> inputVec = rInputDecoder.DecodeTensor(rInputShape.GetNumElements());
+
+ // channelStepSize is the number of consecutive filter elements that share one quantization scale.
+ // Depthwise filters are read as [multiplier, inputChannel, height, width] above, so a scale covers
+ // a single height x width plane.
+ // Otherwise dimension 0 is the output channel and, as the filterIndex arithmetic below shows for
+ // both NHWC and NCHW, all weights of one output channel form a single contiguous run of
+ // filterHeight * filterWidth * inputChannels elements. That is the product of the remaining three
+ // dimensions whatever the layout. Using only rFilterShape[3] for NHWC would pair weights with the
+ // wrong per-channel scale for any kernel larger than 1x1.
+ uint32_t channelStepSize;
+ if (depthwise)
+ {
+     channelStepSize = filterHeight * filterWidth;
+ }
+ else
+ {
+     channelStepSize = rFilterShape[1] * rFilterShape[2] * rFilterShape[3];
+ }
+
+ const std::vector<float> filterVec = rFilterDecoder.DecodeTensor(rFilterShape.GetNumElements(),
+ channelStepSize,
+ depthMultiplier);
+ const std::vector<float> biasVec = biasEnabled ?
+ pBiasDecoder->DecodeTensor(outputChannels) : std::vector<float>();
+
+ unsigned int depthwiseMultiplierIdx = 0;
for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
{
for (unsigned int cOutput = 0; cOutput < outputChannels; cOutput++)
@@ -117,15 +143,15 @@ void Convolve(const TensorShape& rInputShape,
for (unsigned int xOutput = 0; xOutput < outputWidth; xOutput++)
{
// This loop goes over each output element.
- float sum = 0.0f;
+ float sum = 0.0f;
// For depthwise, each output channel corresponds to exactly one input channel.
// For normal, must loop over each input channel.
for (unsigned int cInput = 0; cInput < (depthwise ? 1 : inputChannels); cInput++)
{
- unsigned int depthwiseMultiplierIdx = 0;
if (depthwise)
{
+ depthwiseMultiplierIdx = 0;
cInput = cOutput / depthMultiplier;
depthwiseMultiplierIdx = cOutput % depthMultiplier;
}
@@ -149,7 +175,7 @@ void Convolve(const TensorShape& rInputShape,
{
// Keep this implementation, as using DataLayoutIndexed::GetIndex causes great
// performance regression.
- if (dataLayout == DataLayout::NHWC)
+ if (dataLayoutIndexed.GetDataLayout() == DataLayout::NHWC)
{
filterIndex = cOutput * filterHeight * filterWidth * inputChannels +
yFilter * filterWidth * inputChannels +
@@ -159,15 +185,12 @@ void Convolve(const TensorShape& rInputShape,
else
{
filterIndex = cOutput * filterWidth * filterHeight * inputChannels +
- cInput * filterWidth * filterHeight +
+ cInput * filterWidth * filterHeight +
yFilter * filterWidth +
xFilter;
}
}
- rFilterDecoder.SetIndex(filterIndex, cOutput);
- float filterValue = rFilterDecoder.Get();
-
unsigned int yInput = yOutput * yStride + yFilter * yDilation;
unsigned int xInput = xOutput * xStride + xFilter * xDilation;
@@ -175,7 +198,7 @@ void Convolve(const TensorShape& rInputShape,
// Check if we're in the padding.
if (yInput < paddingTop || yInput >= inputHeight + paddingTop ||
- xInput < paddingLeft || xInput >= inputWidth + paddingLeft )
+ xInput < paddingLeft || xInput >= inputWidth + paddingLeft)
{
inputValue = 0.0f;
}
@@ -185,9 +208,9 @@ void Convolve(const TensorShape& rInputShape,
// Keep this implementation, as using DataLayoutIndexed::GetIndex causes great
// performance regression.
- if (dataLayout == DataLayout::NHWC)
+ if (dataLayoutIndexed.GetDataLayout() == DataLayout::NHWC)
{
- inputIndex = batchIdx * inputHeight * inputWidth * inputChannels +
+ inputIndex = batchIdx * inputHeight * inputWidth * inputChannels +
(yInput - paddingTop) * inputWidth * inputChannels +
(xInput - paddingLeft) * inputChannels +
cInput;
@@ -199,23 +222,34 @@ void Convolve(const TensorShape& rInputShape,
inputWidth * (yInput - paddingTop) +
xInput - paddingLeft;
}
-
- rInputDecoder[inputIndex];
- inputValue = rInputDecoder.Get();
+ inputValue = inputVec[inputIndex];
}
- sum += filterValue * inputValue;
+ sum += filterVec[filterIndex] * inputValue;
}
}
}
if (biasEnabled)
{
- (*pBiasDecoder).SetIndex(cOutput, cOutput);
- sum += pBiasDecoder->Get();
+ sum += biasVec[cOutput];
}
- unsigned int outIdx = dataLayoutIndexed.GetIndex(rOutputShape, batchIdx, cOutput, yOutput, xOutput);
+ unsigned int outIdx;
+ if (dataLayoutIndexed.GetDataLayout() == DataLayout::NHWC)
+ {
+ outIdx = batchIdx * outputHeight * outputWidth * outputChannels +
+ yOutput * outputWidth * outputChannels +
+ xOutput * outputChannels +
+ cOutput;
+ }
+ else
+ {
+ outIdx = batchIdx * outputHeight * outputWidth * outputChannels +
+ cOutput * outputHeight * outputWidth +
+ yOutput * outputWidth +
+ xOutput;
+ }
rOutputEncoder[outIdx];
rOutputEncoder.Set(sum);
diff --git a/src/backends/reference/workloads/FullyConnected.cpp b/src/backends/reference/workloads/FullyConnected.cpp
index 8016c1b628..61c8e88bce 100644
--- a/src/backends/reference/workloads/FullyConnected.cpp
+++ b/src/backends/reference/workloads/FullyConnected.cpp
@@ -14,6 +14,7 @@ void FullyConnected(const TensorShape& rInputShape,
Decoder<float>& rInputDecoder,
const TensorShape& rOutputShape,
Encoder<float>& rOutputEncoder,
+ const TensorShape& rWeightsShape,
Decoder<float>& rWeightDecoder,
Decoder<float>& rBiasDecoder,
const bool biasEnabled,
@@ -23,6 +24,12 @@ void FullyConnected(const TensorShape& rInputShape,
// Perform FullyConnected implementation
unsigned int outputSize = rOutputShape[1];
+ const std::vector<float> decodedInputs = rInputDecoder.DecodeTensor(rInputShape.GetNumElements());
+ const std::vector<float> decodedWeights = rWeightDecoder.DecodeTensor(rWeightsShape.GetNumElements());
+ const std::vector<float> decodedBiases = biasEnabled ?
+ rBiasDecoder.DecodeTensor(outputSize) : std::vector<float>();
+
+
for (unsigned int n = 0; n < rInputShape[0]; n++)
{
for (unsigned int channelOutput = 0; channelOutput < outputSize; channelOutput++)
@@ -34,23 +41,19 @@ void FullyConnected(const TensorShape& rInputShape,
float weight;
if (transposeWeights)
{
- rWeightDecoder[channelOutput * K + channelInput];
- weight = rWeightDecoder.Get();
+ weight = decodedWeights[channelOutput * K + channelInput];
}
else
{
- rWeightDecoder[channelInput * outputSize + channelOutput];
- weight = rWeightDecoder.Get();
+ weight = decodedWeights[channelInput * outputSize + channelOutput];
}
- rInputDecoder[n * K + channelInput];
- outval += weight * rInputDecoder.Get();
+ outval += weight * decodedInputs[n * K + channelInput];
}
if (biasEnabled)
{
- rBiasDecoder[channelOutput];
- outval += rBiasDecoder.Get();
+ outval += decodedBiases[channelOutput];
}
rOutputEncoder[n * outputSize + channelOutput];
diff --git a/src/backends/reference/workloads/FullyConnected.hpp b/src/backends/reference/workloads/FullyConnected.hpp
index 6f9559db24..e33060631b 100644
--- a/src/backends/reference/workloads/FullyConnected.hpp
+++ b/src/backends/reference/workloads/FullyConnected.hpp
@@ -19,6 +19,7 @@ void FullyConnected(const TensorShape& rInputShape,
Decoder<float>& rInputDecoder,
const TensorShape& rOutputShape,
Encoder<float>& rOutputEncoder,
+ const TensorShape& rWeightsShape,
Decoder<float>& rWeightDecoder,
Decoder<float>& rBiasDecoder,
bool biasEnabled,
diff --git a/src/backends/reference/workloads/Pooling2d.cpp b/src/backends/reference/workloads/Pooling2d.cpp
index 435671ffad..2bc3b4f213 100644
--- a/src/backends/reference/workloads/Pooling2d.cpp
+++ b/src/backends/reference/workloads/Pooling2d.cpp
@@ -172,9 +172,6 @@ void Pooling2d(Decoder<float>& rInputDecoder,
Accumulator accumulate = GetAccumulator(params.m_PoolType);
Executor execute = GetExecutor(params.m_PoolType);
- TensorShape outputShape = outputInfo.GetShape();
- TensorShape inputShape = inputInfo.GetShape();
-
// Check supported padding methods outside the loop to simplify
// the inner loop.
if (params.m_PaddingMethod != PaddingMethod::Exclude &&
@@ -183,6 +180,8 @@ void Pooling2d(Decoder<float>& rInputDecoder,
throw armnn::InvalidArgumentException("Unsupported padding type");
}
+ const std::vector<float> decodedInputVec = rInputDecoder.DecodeTensor(inputInfo.GetNumElements());
+
for (int n = 0; n < batchSize; n++)
{
for (int c = 0; c < channels; c++)
@@ -221,12 +220,24 @@ void Pooling2d(Decoder<float>& rInputDecoder,
{
result = 0.0f;
- unsigned int outputIndex = dataLayout.GetIndex(outputShape,
- armnn::numeric_cast<unsigned int>(n),
- armnn::numeric_cast<unsigned int>(c),
- armnn::numeric_cast<unsigned int>(yOutput),
- armnn::numeric_cast<unsigned int>(xOutput));
- rOutputEncoder[outputIndex];
+ int outputIndex;
+
+ if(dataLayout.GetDataLayout() == DataLayout::NHWC)
+ {
+ outputIndex = n * heightOutput * widthOutput * channels +
+ yOutput * widthOutput * channels +
+ xOutput * channels +
+ c;
+ }
+ else
+ {
+ outputIndex = n * heightOutput * widthOutput * channels +
+ c * heightOutput * widthOutput +
+ yOutput * widthOutput +
+ xOutput;
+ }
+
+ rOutputEncoder[static_cast<unsigned int>(outputIndex)];
rOutputEncoder.Set(result);
continue;
}
@@ -244,28 +255,48 @@ void Pooling2d(Decoder<float>& rInputDecoder,
{
for (auto xInput = wstart; xInput < wend; xInput++)
{
- unsigned int inputIndex = dataLayout.GetIndex(inputShape,
- armnn::numeric_cast<unsigned int>(n),
- armnn::numeric_cast<unsigned int>(c),
- armnn::numeric_cast<unsigned int>(yInput),
- armnn::numeric_cast<unsigned int>(xInput));
-
- rInputDecoder[inputIndex];
- float inval = rInputDecoder.Get();
- accumulate(result, inval);
+ int inputIndex;
+ if(dataLayout.GetDataLayout() == DataLayout::NHWC)
+ {
+ inputIndex = n * heightInput * widthInput * channels +
+ yInput * widthInput * channels +
+ xInput * channels +
+ c;
+
+ }
+ else
+ {
+ inputIndex = n * heightInput * widthInput * channels +
+ c * heightInput * widthInput +
+ yInput * widthInput +
+ xInput;
+ }
+
+ accumulate(result, decodedInputVec[static_cast<unsigned int>(inputIndex)]);
}
}
execute(result, poolAreaSize);
- unsigned int outputIndex = dataLayout.GetIndex(outputShape,
- armnn::numeric_cast<unsigned int>(n),
- armnn::numeric_cast<unsigned int>(c),
- armnn::numeric_cast<unsigned int>(yOutput),
- armnn::numeric_cast<unsigned int>(xOutput));
+ int outputIndex;
+
+ if(dataLayout.GetDataLayout() == DataLayout::NHWC)
+ {
+ outputIndex = n * heightOutput * widthOutput * channels +
+ yOutput * widthOutput * channels +
+ xOutput * channels +
+ c;
+ }
+ else
+ {
+ outputIndex = n * heightOutput * widthOutput * channels +
+ c * heightOutput * widthOutput +
+ yOutput * widthOutput +
+ xOutput;
+ }
- rOutputEncoder[outputIndex];
+ rOutputEncoder[static_cast<unsigned int>(outputIndex)];
rOutputEncoder.Set(result);
}
}
diff --git a/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp b/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
index 32c65d3ebd..9acca219b5 100644
--- a/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
+++ b/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
@@ -58,6 +58,7 @@ void RefFullyConnectedWorkload::Execute() const
*m_InputDecoder,
m_OutputShape,
*m_OutputEncoder,
+ m_WeightShape,
*m_WeightDecoder,
*m_BiasDecoder,
m_Data.m_Parameters.m_BiasEnabled,
diff --git a/src/backends/reference/workloads/TransposeConvolution2d.cpp b/src/backends/reference/workloads/TransposeConvolution2d.cpp
index 5698014181..c34a309806 100644
--- a/src/backends/reference/workloads/TransposeConvolution2d.cpp
+++ b/src/backends/reference/workloads/TransposeConvolution2d.cpp
@@ -30,27 +30,35 @@ void TransposeConvolution2dImpl(const TransposeConvolution2dDescriptor& descript
const unsigned int heightIndex = dataLayoutIndexed.GetHeightIndex();
const unsigned int widthIndex = dataLayoutIndexed.GetWidthIndex();
- unsigned int numBatches = inputShape[0];
+ const unsigned int numBatches = inputShape[0];
- unsigned int inputWidth = inputShape[widthIndex];
- unsigned int inputHeight = inputShape[heightIndex];
- unsigned int inputDepth = inputShape[channelsIndex];
+ const unsigned int inputWidth = inputShape[widthIndex];
+ const unsigned int inputHeight = inputShape[heightIndex];
+ const unsigned int inputDepth = inputShape[channelsIndex];
- unsigned int weightsHeight = weightsShape[heightIndex];
- unsigned int weightsWidth = weightsShape[widthIndex];
+ const unsigned int weightsHeight = weightsShape[heightIndex];
+ const unsigned int weightsWidth = weightsShape[widthIndex];
+ const unsigned int weightsDepth = weightsShape[channelsIndex];
- unsigned int outputHeight = outputShape[heightIndex];
- unsigned int outputWidth = outputShape[widthIndex];
- unsigned int outputDepth = outputShape[channelsIndex];
+ const unsigned int outputHeight = outputShape[heightIndex];
+ const unsigned int outputWidth = outputShape[widthIndex];
+ const unsigned int outputDepth = outputShape[channelsIndex];
- unsigned int paddingLeft = descriptor.m_PadLeft;
- unsigned int paddingTop = descriptor.m_PadTop;
+ const unsigned int paddingLeft = descriptor.m_PadLeft;
+ const unsigned int paddingTop = descriptor.m_PadTop;
- unsigned int strideX = descriptor.m_StrideX;
- unsigned int strideY = descriptor.m_StrideY;
+ const unsigned int strideX = descriptor.m_StrideX;
+ const unsigned int strideY = descriptor.m_StrideY;
std::vector<float> outputBuffer(outputShape.GetNumElements(), 0);
+ const std::vector<float> inputVec = inputDecoder.DecodeTensor(inputShape.GetNumElements());
+
+ const unsigned channelStep = weightsWidth * weightsHeight * weightsDepth;
+
+ const std::vector<float> filterVec =
+ weightsDecoder.DecodeTensor(weightsShape.GetNumElements(), channelStep);
+
for (unsigned int batch = 0u; batch < numBatches; ++batch)
{
for (unsigned int yInput = 0u; yInput < inputHeight; ++yInput)
@@ -73,25 +81,51 @@ void TransposeConvolution2dImpl(const TransposeConvolution2dDescriptor& descript
{
for (unsigned int dInput = 0u; dInput < inputDepth; dInput++)
{
- const unsigned int inputIndex =
- dataLayoutIndexed.GetIndex(inputShape, batch, dInput, yInput, xInput);
- inputDecoder[inputIndex];
-
- const unsigned int weightsIndex =
- dataLayoutIndexed.GetIndex(weightsShape, dOutput, dInput, yWeights, xWeights);
- weightsDecoder.SetIndex(weightsIndex, dOutput);
-
- const unsigned int outputIndex =
- dataLayoutIndexed.GetIndex(outputShape, batch, dOutput, yOutput, xOutput);
- outputEncoder[outputIndex];
-
- float output = outputBuffer[outputIndex];
- output += inputDecoder.Get() * weightsDecoder.Get();
- outputBuffer[outputIndex] = output;
+ unsigned int inputIndex;
+ unsigned int outputIndex;
+ unsigned int weightsIndex;
+
+ if(descriptor.m_DataLayout == armnn::DataLayout::NHWC)
+ {
+ inputIndex = batch * inputHeight * inputWidth * inputDepth +
+ yInput * inputWidth * inputDepth +
+ xInput * inputDepth +
+ dInput;
+
+ weightsIndex = dOutput * weightsHeight * weightsWidth * weightsDepth +
+ yWeights * weightsWidth * weightsDepth +
+ xWeights * weightsDepth +
+ dInput;
+
+ outputIndex = batch * outputHeight * outputWidth * outputDepth +
+ yOutput * outputWidth * outputDepth +
+ xOutput * outputDepth +
+ dOutput;
+ }
+ else
+ {
+ inputIndex = batch * inputDepth * inputHeight * inputWidth +
+ dInput * inputHeight * inputWidth +
+ yInput * inputWidth +
+ xInput;
+
+ weightsIndex = dOutput * weightsDepth * weightsHeight * weightsWidth +
+ dInput * weightsHeight * weightsWidth +
+ yWeights * weightsWidth +
+ xWeights;
+
+ outputIndex = batch * outputDepth * outputHeight * outputWidth +
+ dOutput * outputHeight * outputWidth +
+ yOutput * outputWidth +
+ xOutput;
+ }
+
+ outputBuffer[outputIndex] += inputVec[inputIndex] * filterVec[weightsIndex];
}
}
}
}
+
}
}
}