//
// Copyright © 2017 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//

#pragma once

#include "RefWorkloadUtils.hpp"
#include "TensorBufferArrayView.hpp"
#include "BaseIterator.hpp"
#include "Decoders.hpp"
#include "Encoders.hpp"

#include <armnn/Tensor.hpp>

#include <armnnUtils/DataLayoutIndexed.hpp>

#include <boost/assert.hpp>
#include <boost/numeric/conversion/cast.hpp>

#include <algorithm>
#include <cmath>
#include <limits>

namespace armnn
{

/// Performs multiplication of an integer with a multiplier which is less than one,
/// using quantized integer arithmetic which is consistent with AndroidNN's CPU executor.
struct QuantizedMultiplierSmallerThanOne
{
public:
    /// Constructs a QuantizedMultiplierSmallerThanOne which will multiply by the given multiplier.
    /// This stores the appropriate integer quantities (derived from the given multiplier) for later use.
    /// The implementation of this function is adapted from Android NN's QuantizeMultiplierSmallerThanOne().
    QuantizedMultiplierSmallerThanOne(float multiplier);

    /// The implementation of this function is adapted from Android NN's MultiplyByQuantizedMultiplierSmallerThanOne().
    int32_t operator*(int32_t rhs) const;

private:
    /// The implementation of this function is adapted from gemmlowp's SaturatingRoundingDoublingHighMul().
    static int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b);

    /// The implementation of this function is adapted from gemmlowp's RoundingDivideByPOT().
    static int32_t RoundingDivideByPOT(int32_t x, int exponent);

    int32_t m_Multiplier;
    int32_t m_RightShift;
};
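
// Illustrative sketch only (the member definitions live in the corresponding .cpp file):
// requantizing an int32 accumulator by a real-valued multiplier of 0.5f looks roughly like
//
//     QuantizedMultiplierSmallerThanOne quantizedMultiplier(0.5f);
//     int32_t requantized = quantizedMultiplier * 1024; // ~512, rounded to nearest
//
// which is (roughly) equivalent to std::round(0.5f * 1024), but uses only integer arithmetic.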

/// An implementation shared by normal and depthwise convolution.
template<typename ConvData, typename InputType, typename BiasType, typename AccumulatorType>
static void ConvImpl(ConvData data,
                     const InputType* inputData,
                     float inputScale,
                     int32_t inputOffset,
                     const InputType* filterData,
                     float filterScale,
                     int32_t filterOffset,
                     const BiasType* biasData,
                     float outputScale,
                     int32_t outputOffset,
                     const TensorInfo& filterInfo,
                     bool depthwise = false)
{
    if (data.m_Parameters.m_BiasEnabled && !biasData)
    {
        throw InvalidArgumentException("Bias is enabled but the bias data is invalid");
    }

    const TensorInfo& inputInfo  = GetTensorInfo(data.m_Inputs[0]);
    const TensorInfo& outputInfo = GetTensorInfo(data.m_Outputs[0]);

    TensorBufferArrayView<InputType> output(outputInfo.GetShape(),
                                            GetOutputTensorData<InputType>(0, data),
                                            data.m_Parameters.m_DataLayout);

    const armnnUtils::DataLayoutIndexed dataLayoutIndexed(data.m_Parameters.m_DataLayout);

    const unsigned int channelsIndex = dataLayoutIndexed.GetChannelsIndex();
    const unsigned int heightIndex   = dataLayoutIndexed.GetHeightIndex();
    const unsigned int widthIndex    = dataLayoutIndexed.GetWidthIndex();

    unsigned int depthMultiplier = depthwise ? filterInfo.GetShape()[0] : 1;
    unsigned int inputChannels   = depthwise ? filterInfo.GetShape()[1] : filterInfo.GetShape()[channelsIndex];
    unsigned int outputChannels  = depthwise ? inputChannels * depthMultiplier : filterInfo.GetShape()[0];

    unsigned int batchSize    = outputInfo.GetShape()[0];
    unsigned int outputHeight = outputInfo.GetShape()[heightIndex];
    unsigned int outputWidth  = outputInfo.GetShape()[widthIndex];
    unsigned int inputHeight  = inputInfo.GetShape()[heightIndex];
    unsigned int inputWidth   = inputInfo.GetShape()[widthIndex];

    unsigned int filterHeight = depthwise ? filterInfo.GetShape()[2] : filterInfo.GetShape()[heightIndex];
    unsigned int filterWidth  = depthwise ? filterInfo.GetShape()[3] : filterInfo.GetShape()[widthIndex];

    unsigned int paddingTop  = data.m_Parameters.m_PadTop;
    unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
    unsigned int xStride     = data.m_Parameters.m_StrideX;
    unsigned int yStride     = data.m_Parameters.m_StrideY;
    unsigned int xDilation   = data.m_Parameters.m_DilationX;
    unsigned int yDilation   = data.m_Parameters.m_DilationY;

    // The world's least efficient convolution.
    for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
    {
        for (unsigned int cOutput = 0; cOutput < outputChannels; cOutput++)
        {
            for (unsigned int yOutput = 0; yOutput < outputHeight; yOutput++)
            {
                for (unsigned int xOutput = 0; xOutput < outputWidth; xOutput++)
                {
                    // This loop goes over each output element.
                    AccumulatorType sum = AccumulatorType();

                    // For depthwise, each output channel corresponds to exactly one input channel.
                    // For normal, must loop over each input channel.
                    for (unsigned int cInput = 0; cInput < (depthwise ? 1 : inputChannels); cInput++)
                    {
                        unsigned int depthwiseMultiplierIdx = 0;
                        if (depthwise)
                        {
                            cInput = cOutput / depthMultiplier;
                            depthwiseMultiplierIdx = cOutput % depthMultiplier;
                        }

                        for (unsigned int yFilter = 0; yFilter < filterHeight; yFilter++)
                        {
                            for (unsigned int xFilter = 0; xFilter < filterWidth; xFilter++)
                            {
                                // This loop goes over each input element for each output element.
                                unsigned int filterIndex = 0;

                                // Since the dimensionality of the kernel depends on whether the convolution
                                // is depthwise, so does the index into it.
                                if (depthwise)
                                {
                                    filterIndex = depthwiseMultiplierIdx * filterWidth * filterHeight * inputChannels +
                                                  cInput * filterWidth * filterHeight +
                                                  yFilter * filterWidth +
                                                  xFilter;
                                }
                                else
                                {
                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                    {
                                        filterIndex = cOutput * filterHeight * filterWidth * inputChannels +
                                                      yFilter * filterWidth * inputChannels +
                                                      xFilter * inputChannels +
                                                      cInput;
                                    }
                                    else
                                    {
                                        filterIndex = cOutput * filterWidth * filterHeight * inputChannels +
                                                      cInput * filterWidth * filterHeight +
                                                      yFilter * filterWidth +
                                                      xFilter;
                                    }
                                }

                                AccumulatorType filterValue = filterData[filterIndex] -
                                    boost::numeric_cast<AccumulatorType>(filterOffset);

                                unsigned int yInput = yOutput * yStride + yFilter * yDilation;
                                unsigned int xInput = xOutput * xStride + xFilter * xDilation;

                                AccumulatorType inputValue;

                                // Check if we're in the padding.
                                if (yInput < paddingTop || yInput >= inputHeight + paddingTop ||
                                    xInput < paddingLeft || xInput >= inputWidth + paddingLeft)
                                {
                                    inputValue = AccumulatorType();
                                }
                                else
                                {
                                    unsigned int inputIndex;

                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                    {
                                        inputIndex = batchIdx * inputHeight * inputWidth * inputChannels +
                                                     (yInput - paddingTop) * inputWidth * inputChannels +
                                                     (xInput - paddingLeft) * inputChannels +
                                                     cInput;
                                    }
                                    else
                                    {
                                        inputIndex = batchIdx * inputWidth * inputHeight * inputChannels +
                                                     inputWidth * inputHeight * cInput +
                                                     inputWidth * (yInput - paddingTop) +
                                                     xInput - paddingLeft;
                                    }
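
                                    // For intuition (hypothetical sizes, not taken from this header): with NHWC,
                                    // inputChannels = 3, inputWidth = 4, inputHeight = 2 and zero padding, the
                                    // element (batchIdx 0, yInput 1, xInput 2, cInput 1) lands at flat index
                                    // 0*2*4*3 + 1*4*3 + 2*3 + 1 = 19.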

                                    inputValue = inputData[inputIndex] -
                                        boost::numeric_cast<AccumulatorType>(inputOffset);
                                }

                                sum += filterValue * inputValue;
                            }
                        }
                    }

                    if (data.m_Parameters.m_BiasEnabled)
                    {
                        sum += biasData[cOutput];
                    }

                    if (outputScale != 0.0f)
                    {
                        float multiplier = (inputScale * filterScale) / outputScale;
                        // Apply the multiplier to sum, but do so using some quantized arithmetic which is consistent
                        // with the AndroidNN CPU implementation. This should be (roughly) equivalent to:
                        //     sum = std::round(multiplier * sum + outputOffset);
                        sum = boost::numeric_cast<AccumulatorType>(
                                  QuantizedMultiplierSmallerThanOne(multiplier) * boost::numeric_cast<int32_t>(sum))
                              + boost::numeric_cast<AccumulatorType>(outputOffset);
                        sum = std::min<AccumulatorType>(std::max<AccumulatorType>(sum, 0), 255);
                    }

                    output.Get(batchIdx, cOutput, yOutput, xOutput) = boost::numeric_cast<InputType>(sum);
                }
            }
        }
    }
}

void Convolve(const TensorShape& rInputShape,
              Decoder<float>& rInputDecoder,
              const TensorShape& rOutputShape,
              Encoder<float>& rOutputEncoder,
              const TensorShape& rFilterShape,
              Decoder<float>& rFilterDecoder,
              bool biasEnabled,
              Decoder<float>* pBiasDecoder,
              DataLayout dataLayout,
              unsigned int paddingTop,
              unsigned int paddingLeft,
              unsigned int xStride,
              unsigned int yStride,
              unsigned int xDilation,
              unsigned int yDilation,
              bool depthwise = false);

} //namespace armnn
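
// Illustrative sketch only, not part of this header: a quantized (uint8) reference workload
// could instantiate ConvImpl along these lines, with InputType = uint8_t and BiasType =
// AccumulatorType = int32_t, so that accumulation happens in int32 before requantization.
// The surrounding variable names (inputData, weightData, biasData, and the *Info objects)
// are assumptions for the example:
//
//     ConvImpl<Convolution2dQueueDescriptor, uint8_t, int32_t, int32_t>(
//         data,
//         inputData,  inputInfo.GetQuantizationScale(),  inputInfo.GetQuantizationOffset(),
//         weightData, weightInfo.GetQuantizationScale(), weightInfo.GetQuantizationOffset(),
//         biasData,
//         outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(),
//         weightInfo);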