From 10b4dfd8e9ccd7a03df7bb053ee1c644cb37f8ab Mon Sep 17 00:00:00 2001
From: David Beck
Date: Wed, 19 Sep 2018 12:03:20 +0100
Subject: IVGCVSW-1897 : build infrastructure for the src/backends folder

Change-Id: I7ebafb675ccc77ad54d1deb01412a8379a5356bb
---
 src/backends/RefWorkloads/ConvImpl.hpp | 187 +++++++++++++++++++++++++++++++++
 1 file changed, 187 insertions(+)
 create mode 100644 src/backends/RefWorkloads/ConvImpl.hpp

diff --git a/src/backends/RefWorkloads/ConvImpl.hpp b/src/backends/RefWorkloads/ConvImpl.hpp
new file mode 100644
index 0000000000..4c9ab2a644
--- /dev/null
+++ b/src/backends/RefWorkloads/ConvImpl.hpp
@@ -0,0 +1,187 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "RefWorkloadUtils.hpp"
+
+#include <armnn/Tensor.hpp>
+
+#include <boost/assert.hpp>
+#include <boost/numeric/conversion/cast.hpp>
+
+#include <cmath>
+#include <limits>
+
+namespace armnn
+{
+
+/// Performs multiplication of an integer with a multiplier which is less than one,
+/// using quantized integer arithmetic which is consistent with AndroidNN's CPU executor.
+struct QuantizedMultiplierSmallerThanOne
+{
+public:
+    /// Constructs a QuantizedMultiplierSmallerThanOne which will multiply by the given multiplier.
+    /// This stores the appropriate integer quantities (derived from the given multiplier) for later use.
+    /// The implementation of this function is adapted from Android NN's QuantizeMultiplierSmallerThanOne().
+    QuantizedMultiplierSmallerThanOne(float multiplier);
+
+    /// The implementation of this function is adapted from Android NN's MultiplyByQuantizedMultiplierSmallerThanOne().
+    int32_t operator*(int32_t rhs) const;
+
+private:
+    /// The implementation of this function is adapted from gemmlowp's SaturatingRoundingDoublingHighMul().
+    static int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b);
+
+    /// The implementation of this function is adapted from gemmlowp's RoundingDivideByPOT().
+    static int32_t RoundingDivideByPOT(int32_t x, int exponent);
+
+    int32_t m_Multiplier;
+    int32_t m_RightShift;
+};
+
+/// An implementation shared by normal and depthwise convolution.
+template<typename ConvData, typename InputType, typename BiasType, typename AccumulatorType>
+static void ConvImpl(ConvData data,
+                     const InputType* inputData,
+                     float inputScale,
+                     int32_t inputOffset,
+                     const InputType* filterData,
+                     float filterScale,
+                     int32_t filterOffset,
+                     const BiasType* biasData,
+                     InputType* outputData,
+                     float outputScale,
+                     int32_t outputOffset,
+                     const TensorInfo& filterInfo,
+                     bool depthwise = false)
+{
+    if (data.m_Parameters.m_BiasEnabled && !biasData)
+    {
+        throw InvalidArgumentException("Bias is enabled but the bias data is invalid");
+    }
+
+    const TensorInfo& inputInfo0  = GetTensorInfo(data.m_Inputs[0]);
+    const TensorInfo& outputInfo0 = GetTensorInfo(data.m_Outputs[0]);
+
+    unsigned int depthMult      = depthwise ? filterInfo.GetShape()[0] : 1;
+    unsigned int channelsInput  = filterInfo.GetShape()[1];
+    unsigned int channelsOutput = depthwise ? channelsInput * depthMult : filterInfo.GetShape()[0];
+
+    unsigned int batchSize    = outputInfo0.GetShape()[0];
+    unsigned int heightOutput = outputInfo0.GetShape()[2];
+    unsigned int widthOutput  = outputInfo0.GetShape()[3];
+    unsigned int heightInput  = inputInfo0.GetShape()[2];
+    unsigned int widthInput   = inputInfo0.GetShape()[3];
+
+    unsigned int heightFilter = filterInfo.GetShape()[2];
+    unsigned int widthFilter  = filterInfo.GetShape()[3];
+
+    unsigned int paddingTop  = data.m_Parameters.m_PadTop;
+    unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
+    unsigned int hStride     = data.m_Parameters.m_StrideY;
+    unsigned int xStride     = data.m_Parameters.m_StrideX;
+
+    // The world's least efficient convolution.
+    for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
+    {
+        for (unsigned int cOutput = 0; cOutput < channelsOutput; cOutput++)
+        {
+            for (unsigned int yOutput = 0; yOutput < heightOutput; yOutput++)
+            {
+                for (unsigned int xOutput = 0; xOutput < widthOutput; xOutput++)
+                {
+                    // This loop goes over each output element.
+                    AccumulatorType sum = AccumulatorType();
+
+                    // For depthwise, each output channel corresponds to exactly one input channel.
+                    // For normal, must loop over each input channel.
+                    for (unsigned int cInput = 0; cInput < (depthwise ? 1 : channelsInput); cInput++)
+                    {
+                        unsigned int depthwiseMultiplierIdx = 0;
+                        if (depthwise)
+                        {
+                            cInput = cOutput / depthMult;
+                            depthwiseMultiplierIdx = cOutput % depthMult;
+                        }
+
+                        for (unsigned int yFilter = 0; yFilter < heightFilter; yFilter++)
+                        {
+                            for (unsigned int xFilter = 0; xFilter < widthFilter; xFilter++)
+                            {
+                                // This loop goes over each input element for each output element.
+
+                                unsigned int filterIndex;
+
+                                // Since dimensionality of kernel depends on depthwiseness, so does index.
+                                if (depthwise)
+                                {
+                                    filterIndex = depthwiseMultiplierIdx * widthFilter * heightFilter * channelsInput +
+                                                  cInput * widthFilter * heightFilter +
+                                                  yFilter * widthFilter +
+                                                  xFilter;
+                                }
+                                else
+                                {
+                                    filterIndex = cOutput * widthFilter * heightFilter * channelsInput +
+                                                  cInput * widthFilter * heightFilter +
+                                                  yFilter * widthFilter +
+                                                  xFilter;
+                                }
+                                AccumulatorType filterValue = filterData[filterIndex] -
+                                    boost::numeric_cast<AccumulatorType>(filterOffset);
+
+                                unsigned int yInput = yOutput * hStride + yFilter;
+                                unsigned int xInput = xOutput * xStride + xFilter;
+
+                                AccumulatorType inputValue;
+
+                                // Check if we're in the padding.
+                                if (yInput < paddingTop || yInput >= heightInput + paddingTop ||
+                                    xInput < paddingLeft || xInput >= widthInput + paddingLeft )
+                                {
+                                    inputValue = AccumulatorType();
+                                }
+                                else
+                                {
+                                    inputValue = inputData[batchIdx * widthInput * heightInput * channelsInput +
+                                                           widthInput * heightInput * cInput +
+                                                           widthInput * (yInput - paddingTop) +
+                                                           xInput - paddingLeft] -
+                                        boost::numeric_cast<AccumulatorType>(inputOffset);
+                                }
+                                sum += filterValue * inputValue;
+                            }
+                        }
+                    }
+
+                    if (data.m_Parameters.m_BiasEnabled)
+                    {
+                        sum += biasData[cOutput];
+                    }
+
+                    if (outputScale != 0.0f)
+                    {
+                        float multiplier = (inputScale * filterScale) / outputScale;
+                        // Apply the multiplier to sum, but do so using some quantized arithmetic which is consistent
+                        // with the AndroidNN CPU implementation. This should be (roughly) equivalent to:
+                        //  sum = std::round(multiplier * sum + outputOffset);
+                        sum = boost::numeric_cast<AccumulatorType>(
+                            QuantizedMultiplierSmallerThanOne(multiplier) * boost::numeric_cast<int32_t>(sum))
+                            + boost::numeric_cast<AccumulatorType>(outputOffset);
+                        sum = std::min<AccumulatorType>(std::max<AccumulatorType>(sum, 0), 255);
+                    }
+
+                    outputData[batchIdx * widthOutput * heightOutput * channelsOutput +
+                               widthOutput * heightOutput * cOutput +
+                               widthOutput * yOutput +
+                               xOutput] = boost::numeric_cast<InputType>(sum);
+                }
+            }
+        }
+    }
+}
+
+} //namespace armnn
-- 
cgit v1.2.1
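
A note on the quantized requantization step above: QuantizedMultiplierSmallerThanOne is only declared in this header; its definitions live in ConvImpl.cpp, which this patch does not include. As a rough sketch of what those definitions do, based on the Android NN and gemmlowp routines named in the doc comments rather than on the actual ArmNN sources, they could look like this:

// Illustrative sketch only: follows Android NN's QuantizeMultiplierSmallerThanOne()
// and gemmlowp's SaturatingRoundingDoublingHighMul() / RoundingDivideByPOT().
// The real definitions are in ConvImpl.cpp, which this change does not touch.
#include "ConvImpl.hpp"

#include <cmath>
#include <cstdint>
#include <limits>

namespace armnn
{

QuantizedMultiplierSmallerThanOne::QuantizedMultiplierSmallerThanOne(float multiplier)
{
    // Split the multiplier (assumed to be in [0, 1)) into a Q0.31 fixed-point significand
    // and a right shift, so that multiplier ~= m_Multiplier * 2^-31 * 2^-m_RightShift.
    int exponent = 0;
    const double significand = std::frexp(static_cast<double>(multiplier), &exponent); // in [0.5, 1) for multiplier > 0
    int64_t fixed = static_cast<int64_t>(std::round(significand * (1ll << 31)));
    if (fixed == (1ll << 31)) // rounding may push the significand up to exactly 1.0
    {
        fixed /= 2;
        ++exponent;
    }
    m_Multiplier = static_cast<int32_t>(fixed);
    m_RightShift = -exponent;
}

int32_t QuantizedMultiplierSmallerThanOne::operator*(int32_t rhs) const
{
    // (rhs * m_Multiplier) / 2^(31 + m_RightShift), rounding to nearest at each step.
    return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(rhs, m_Multiplier), m_RightShift);
}

int32_t QuantizedMultiplierSmallerThanOne::SaturatingRoundingDoublingHighMul(int32_t a, int32_t b)
{
    // Only INT32_MIN * INT32_MIN overflows the doubled high product; saturate that case.
    if (a == std::numeric_limits<int32_t>::min() && b == std::numeric_limits<int32_t>::min())
    {
        return std::numeric_limits<int32_t>::max();
    }
    const int64_t ab = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
    return static_cast<int32_t>((ab + nudge) / (1ll << 31));
}

int32_t QuantizedMultiplierSmallerThanOne::RoundingDivideByPOT(int32_t x, int exponent)
{
    // Arithmetic shift right by 'exponent' with round-half-away-from-zero.
    const int32_t mask = static_cast<int32_t>((1ll << exponent) - 1);
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

} // namespace armnn

With definitions along these lines, a multiplier of 0.5f would store m_Multiplier = 1 << 30 and m_RightShift = 0, so QuantizedMultiplierSmallerThanOne(0.5f) * 100 evaluates to 50, matching the float expression std::round(0.5f * 100) that the comment inside ConvImpl() describes.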