plain/22.08/_ethosn_ref_conv_impl_8hpp_source.xhtml

 //
 // Copyright © 2017 Arm Ltd. All rights reserved.
 // SPDX-License-Identifier: MIT
 //

 #pragma once

 #include "EthosnRefWorkloadUtils.hpp"
 #include "TensorBufferArrayView.hpp"

 #include <armnn/Tensor.hpp>

 #include <armnnUtils/DataLayoutIndexed.hpp>

 #include <armnn/utility/NumericCast.hpp>

 #include <cmath>
 #include <limits>

 namespace armnn
 {

 /// Performs multiplication of an integer with a multiplier which is less than one,
 /// using quantized integer arithmetic which is consistent with AndroidNN's CPU executor.
 ///
 /// Only the declarations live in this header: the constructor and both operator* overloads
 /// are declared here without a body and are defined elsewhere.
 struct EthosnRefQuantizedMultiplierSmallerThanOne
 {
 public:
     /// Constructs a EthosnRefQuantizedMultiplierSmallerThanOne which will multiply by the given multiplier.
     /// This stores the appropriate integer quantities (derived from the given multiplier) for later use.
     /// The implementation of this function is adapted from Android NN's QuantizeMultiplierSmallerThanOne().
     /// NOTE(review): the constructor is not `explicit`, so a float converts implicitly to this type.
     EthosnRefQuantizedMultiplierSmallerThanOne(float multiplier);

     /// Multiplies rhs by the stored multiplier; provided for 32-bit and 64-bit operands.
     /// The implementation of this function is adapted from Android NN's MultiplyByEthosnRefQuantizedMultiplierSmallerThanOne().
     /// NOTE(review): the Android NN function name above looks like the result of a bulk rename;
     /// presumably the upstream name is MultiplyByQuantizedMultiplierSmallerThanOne() - confirm.
     int32_t operator*(int32_t rhs) const;
     int64_t operator*(int64_t rhs) const;

 private:
     // Integer quantities derived from the float multiplier by the constructor.
     // Presumably a fixed-point multiplier and the right shift applied after multiplying;
     // confirm against the out-of-line definitions.
     int32_t m_Multiplier;
     int32_t m_RightShift;
 };

 /// An implementation shared by normal and depthwise convolution.
 template<typename ConvData, typename InputType, typename WeightType, typename BiasType, typename AccumulatorType>
 static void EthosnRefConvImpl(ConvData data,
                      const InputType* inputData,
                      float inputScale,
                      int32_t inputOffset,
                      const WeightType* filterData,
                      float filterScale,
                      int32_t filterOffset,
                      const BiasType* biasData,
                      float outputScale,
                      int32_t outputOffset,
                      const TensorInfo& filterInfo,
                      bool depthwise = false)
 {
     if (data.m_Parameters.m_BiasEnabled && !biasData)
     {
         throw InvalidArgumentException("Bias is enabled but the bias data is invalid");
     }

     const TensorInfo& inputInfo  = armnn::ethosnref::GetTensorInfo(data.m_Inputs[0]);
     const TensorInfo& outputInfo = armnn::ethosnref::GetTensorInfo(data.m_Outputs[0]);

     TensorBufferArrayView<InputType> output(outputInfo.GetShape(),
                                             armnn::ethosnref::GetOutputTensorData<InputType>(0, data),
                                             data.m_Parameters.m_DataLayout);

     const armnnUtils::DataLayoutIndexed dataLayoutIndexed(data.m_Parameters.m_DataLayout);

     const unsigned int channelsIndex = dataLayoutIndexed.GetChannelsIndex();
     const unsigned int heightIndex   = dataLayoutIndexed.GetHeightIndex();
     const unsigned int widthIndex    = dataLayoutIndexed.GetWidthIndex();

     unsigned int depthMultiplier = depthwise ? filterInfo.GetShape()[0] : 1;
     unsigned int inputChannels   = depthwise ? filterInfo.GetShape()[1] : filterInfo.GetShape()[channelsIndex];
     unsigned int outputChannels  = depthwise ? inputChannels * depthMultiplier : filterInfo.GetShape()[0];

     unsigned int batchSize    = outputInfo.GetShape()[0];
     unsigned int outputHeight = outputInfo.GetShape()[heightIndex];
     unsigned int outputWidth  = outputInfo.GetShape()[widthIndex];
     unsigned int inputHeight  = inputInfo.GetShape()[heightIndex];
     unsigned int inputWidth   = inputInfo.GetShape()[widthIndex];

     unsigned int filterHeight = depthwise ? filterInfo.GetShape()[2] : filterInfo.GetShape()[heightIndex];
     unsigned int filterWidth  = depthwise ? filterInfo.GetShape()[3] : filterInfo.GetShape()[widthIndex];

     unsigned int paddingTop  = data.m_Parameters.m_PadTop;
     unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
     unsigned int xStride     = data.m_Parameters.m_StrideX;
     unsigned int yStride     = data.m_Parameters.m_StrideY;
     unsigned int xDilation   = data.m_Parameters.m_DilationX;
     unsigned int yDilation   = data.m_Parameters.m_DilationY;

     // The world's least efficient convolution.
     for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
     {
         for (unsigned int cOutput = 0; cOutput < outputChannels; cOutput++)
         {
             for (unsigned int yOutput = 0; yOutput < outputHeight; yOutput++)
             {
                 for (unsigned int xOutput = 0; xOutput < outputWidth; xOutput++)
                 {
                     // This loop goes over each output element.
                     AccumulatorType sum = AccumulatorType();

                     // For depthwise, each output channel corresponds to exactly one input channel.
                     // For normal, must loop over each input channel.
                     for (unsigned int cInput = 0; cInput < (depthwise ? 1 : inputChannels); cInput++)
                     {
                         unsigned int depthwiseMultiplierIdx = 0;
                         if (depthwise)
                         {
                             cInput = cOutput / depthMultiplier;
                             depthwiseMultiplierIdx = cOutput % depthMultiplier;
                         }

                         for (unsigned int yFilter = 0; yFilter < filterHeight; yFilter++)
                         {
                             for (unsigned int xFilter = 0; xFilter < filterWidth; xFilter++)
                             {
                                 // This loop goes over each input element for each output element.

                                 unsigned int filterIndex = 0;

                                 // Since dimensionality of kernel depends on depthwiseness, so does index.
                                 if (depthwise)
                                 {
                                     filterIndex = depthwiseMultiplierIdx * filterWidth * filterHeight * inputChannels +
                                                   cInput * filterWidth * filterHeight +
                                                   yFilter * filterWidth +
                                                   xFilter;
                                 }
                                 else
                                 {
                                     if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                     {
                                         filterIndex = cOutput * filterHeight * filterWidth * inputChannels +
                                                       yFilter * filterWidth * inputChannels +
                                                       xFilter * inputChannels +
                                                       cInput;
                                     }
                                     else
                                     {
                                         filterIndex = cOutput * filterWidth * filterHeight * inputChannels +
                                                       cInput  * filterWidth * filterHeight +
                                                       yFilter * filterWidth +
                                                       xFilter;
                                     }
                                 }

                                 AccumulatorType filterValue = filterData[filterIndex] -
                                     numeric_cast<AccumulatorType>(filterOffset);

                                 unsigned int yInput = yOutput * yStride + yFilter * yDilation;
                                 unsigned int xInput = xOutput * xStride + xFilter * xDilation;

                                 AccumulatorType inputValue;

                                 // Check if we're in the padding.
                                 if (yInput < paddingTop || yInput >= inputHeight + paddingTop ||
                                     xInput < paddingLeft || xInput >= inputWidth + paddingLeft )
                                 {
                                     inputValue = AccumulatorType();
                                 }
                                 else
                                 {
                                     unsigned int inputIndex;

                                     if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                     {
                                         inputIndex = batchIdx * inputHeight * inputWidth  * inputChannels +
                                                      (yInput - paddingTop) * inputWidth * inputChannels +
                                                      (xInput - paddingLeft) * inputChannels +
                                                      cInput;

                                     }
                                     else
                                     {
                                         inputIndex = batchIdx * inputWidth * inputHeight * inputChannels +
                                                      inputWidth * inputHeight * cInput +
                                                      inputWidth * (yInput - paddingTop) +
                                                      xInput - paddingLeft;
                                     }

                                     inputValue = inputData[inputIndex] -
                                                     numeric_cast<AccumulatorType>(inputOffset);

                                 }
                                 sum += filterValue * inputValue;
                             }
                         }
                     }

                     if (data.m_Parameters.m_BiasEnabled)
                     {
                         sum += biasData[cOutput];
                     }

                     if (outputScale != 0.0f)
                     {
                         float multiplier = (inputScale * filterScale) / outputScale;

                         sum = numeric_cast<AccumulatorType>(
                                 EthosnRefQuantizedMultiplierSmallerThanOne(multiplier) * sum)
                             + numeric_cast<AccumulatorType>(outputOffset);
                         sum = std::min<AccumulatorType>(std::max<AccumulatorType>(sum, std::numeric_limits<InputType>::min()),
                                 std::numeric_limits<InputType>::max());
                     }

                     output.Get(batchIdx, cOutput, yOutput, xOutput) = numeric_cast<InputType>(sum);
                 }
             }
         }
     }
 }

 /// Sign / mantissa / exponent triple produced by convert_to_S12E8M() and convert_to_S13E8M().
 struct hw_float
 {
     // True when the sign bit of the converted value was set.
     bool        sign;
     // Magnitude after it has been shifted right by `exp` bits and rounded (see wtfp_round_mantissa).
     uint32_t    man;
     // Number of bits the magnitude was shifted right; consumers shift the product back left by it.
     uint32_t    exp;
 };

 /// Builds a mask with the lowest `size` bits set; a size of 32 or more gives all ones.
 inline uint32_t one_mask(const uint8_t size)
 {
     // A shift by the full word width is not allowed, hence the separate all-ones case.
     return (size >= 32) ? 0xFFFFFFFF : ((1u << size) - 1);
 }

 /// Tells whether bit number `pos` (0 = least significant) of `val` is set.
 inline bool bit_extract(
     const uint8_t pos,
     const uint32_t val)
 {
     const uint32_t shifted = val >> pos;
     return (shifted & 1u) != 0u;
 }

 /// Extracts the inclusive bit range [high_bit : low_bit] of `val`, right-aligned.
 ///
 /// @param high_bit Index of the most significant bit of the field.
 /// @param low_bit  Index of the least significant bit of the field.
 /// @param val      Word to read the field from.
 /// @return The field, shifted down so that low_bit lands at bit 0.
 inline uint32_t bit_field(
     const uint8_t high_bit,
     const uint8_t low_bit,
     const uint32_t val)
 {
     // The width is computed in int (integer promotion) and then deliberately reduced to 8 bits.
     // Converting by value keeps the low byte on every host; copying the first byte of the int
     // in memory would only do so on little-endian machines.
     const uint8_t size = static_cast<uint8_t>(high_bit + 1 - low_bit);

     return ((val >> low_bit) & one_mask(size));
 }

 /// Drops the `exp` lowest bits of a mantissa, rounding to nearest on the highest dropped bit.
 ///
 /// @param unshifted_mantissa Mantissa before any shift.
 /// @param exp                Number of low bits to drop.
 /// @return The mantissa unchanged when exp is 0; 0xFF when the shifted mantissa is already 0xFF
 ///         (so rounding cannot carry past it); otherwise the shifted mantissa plus the round bit.
 inline uint32_t wtfp_round_mantissa(
     const uint32_t unshifted_mantissa,
     const uint32_t exp)
 {
     // Nothing is shifted out, so there is nothing to round.
     if (exp == 0)
     {
         return unshifted_mantissa;
     }

     const uint32_t truncated = unshifted_mantissa >> exp;

     // Already at 0xFF: adding a round bit would carry out, so saturate instead.
     if (truncated == 0xff)
     {
         return 0xFF;
     }

     // The most significant discarded bit decides whether to round up.
     const uint32_t carry = (unshifted_mantissa >> (exp - 1)) & 1;
     return truncated + carry;
 }

 /// Converts an integer into the sign / exponent / mantissa triple used for the input operand
 /// of the Winograd multiply.
 ///
 /// Bit 10 of `val` is taken as the sign. The magnitude is the low 10 bits of `val`, or of its
 /// two's-complement negation when the sign bit is set. The exponent is 2 when bit 9 of the
 /// magnitude is set, 1 when bit 8 is set (and bit 9 is not), otherwise 0; the mantissa is the
 /// magnitude shifted right by that exponent and rounded by wtfp_round_mantissa().
 ///
 /// @param val Value to convert; only its low 11 bits take part in the conversion.
 /// @return The converted triple.
 inline struct hw_float convert_to_S12E8M(
     const int32_t val)
 {
     // Work on the two's-complement bit pattern in unsigned arithmetic: negating the signed value
     // directly (~val + 1) overflows for INT32_MIN, whereas unsigned arithmetic simply wraps.
     const uint32_t unsigned_val     = static_cast<uint32_t>(val);
     const uint32_t unsigned_not_val = ~unsigned_val + 1u;

     struct hw_float ret_fp;
     ret_fp.sign = bit_extract(10, unsigned_val);
     // Convert to unsigned mantissa
     if (ret_fp.sign)
     {
         ret_fp.man = bit_field(9, 0, unsigned_not_val);
     }
     else
     {
         ret_fp.man = bit_field(9, 0, unsigned_val);
     }
     // Use leading 1 to extract the exponent and mantissa
     ret_fp.exp = 0;
     if (bit_extract(9, ret_fp.man))
     {
         ret_fp.exp = 2;
     }
     else if (bit_extract(8, ret_fp.man))
     {
         ret_fp.exp = 1;
     }
     ret_fp.man = wtfp_round_mantissa(bit_field(10, 0, ret_fp.man), ret_fp.exp);
     return ret_fp;
 }

 /// Converts an integer into the sign / exponent / mantissa triple used for the weight operand
 /// of the Winograd multiply.
 ///
 /// Bit 12 of `val` is taken as the sign. The magnitude is the low 12 bits of `val`, or of its
 /// two's-complement negation when the sign bit is set. The exponent is chosen from the highest
 /// set bit among bits 11..8 of the magnitude (4, 3, 2 or 1 respectively, else 0); the mantissa
 /// is the magnitude shifted right by that exponent and rounded by wtfp_round_mantissa().
 ///
 /// @param val Value to convert; only its low 13 bits take part in the conversion.
 /// @return The converted triple.
 inline struct hw_float convert_to_S13E8M(
     const int32_t val)
 {
     // Work on the two's-complement bit pattern in unsigned arithmetic: negating the signed value
     // directly (~val + 1) overflows for INT32_MIN, whereas unsigned arithmetic simply wraps.
     const uint32_t unsigned_val     = static_cast<uint32_t>(val);
     const uint32_t unsigned_not_val = ~unsigned_val + 1u;

     struct hw_float ret_fp;
     ret_fp.sign = bit_extract(12, unsigned_val);
     // Convert to unsigned mantissa
     if (ret_fp.sign)
     {
         ret_fp.man = bit_field(11, 0, unsigned_not_val);
     }
     else
     {
         ret_fp.man = bit_field(11, 0, unsigned_val);
     }
     // Use leading 1 to extract the exponent and mantissa
     ret_fp.exp = 0;
     if (bit_extract(11, ret_fp.man))
     {
         ret_fp.exp = 4;
     }
     else if (bit_extract(10, ret_fp.man))
     {
         ret_fp.exp = 3;
     }
     else if (bit_extract(9, ret_fp.man))
     {
         ret_fp.exp = 2;
     }
     else if (bit_extract(8, ret_fp.man))
     {
         ret_fp.exp = 1;
     }
     ret_fp.man = wtfp_round_mantissa(bit_field(11, 0, ret_fp.man), ret_fp.exp);
     return ret_fp;
 }

 /// Halves `value` with a right shift by one bit.
 ///
 /// @param value Value to halve.
 /// @param round When true, the bit shifted out is added back to the result; when false
 ///              (the default) it is discarded.
 /// @return (value >> 1), plus the dropped bit if `round` is set.
 inline int64_t ShiftRound (int64_t value, bool round=false)
 {
     int64_t halved = value >> 1;
     if (round)
     {
         halved += (value & 0x1);
     }
     return halved;
 }

 /// Convolution computed tile by tile through filter/input/output transforms, with the
 /// element-wise products formed on values converted by convert_to_S13E8M() (weights) and
 /// convert_to_S12E8M() (inputs).
 ///
 /// Processing per (batch, output channel):
 ///  - the output is walked in tiles of h_stride x w_stride elements (2x2 in general, 4 rows x 2
 ///    columns when the filter height is 1, 2 rows x 4 columns when the filter width is 1);
 ///  - the filter is walked in sub-kernels of decomposedHeight x decomposedWidth (3 per dimension,
 ///    or 1 for a dimension of size 1); taps beyond the real filter extent are taken as 0;
 ///  - for every input channel and sub-kernel, the transformed filter and the transformed 4x4
 ///    input patch are multiplied element by element and accumulated into a 4x4 int64 array;
 ///  - the accumulated array is reduced to the output tile, clamped to the int32 range, the bias
 ///    is added and, when outputScale is non-zero, the value is rescaled, offset and clamped to
 ///    the range of InputType.
 ///
 /// @param data         Workload data; supplies the descriptor (m_Parameters) and the tensor handles.
 /// @param inputData    Raw input values, indexed according to the descriptor's data layout.
 /// @param inputScale   Quantization scale of the input.
 /// @param inputOffset  Value subtracted from every input element.
 /// @param filterData   Raw filter values.
 /// @param filterScale  Quantization scale of the filter.
 /// @param filterOffset Value subtracted from every filter element.
 /// @param biasData     One bias value per output channel; must be non-null when bias is enabled.
 /// @param outputScale  Quantization scale of the output; 0 disables the rescale/clamp step.
 /// @param outputOffset Value added to the rescaled value.
 /// @param filterInfo   Filter tensor info; output channels are read from dimension 0, the other
 ///                     dimensions follow the data layout.
 /// @throws InvalidArgumentException if bias is enabled but biasData is null.
 ///
 /// NOTE(review): this function uses std::vector, memcpy and std::min/std::max, while the includes
 /// visible at the top of this file do not list <vector>, <cstring> or <algorithm>; presumably they
 /// arrive through other headers - confirm.
 /// NOTE(review): several std::vector temporaries are constructed inside the innermost loops.
 template<typename ConvData, typename InputType, typename WeightType, typename BiasType>
 static void EthosnRefWinogradConvImpl(ConvData data,
                      const InputType* inputData,
                      float inputScale,
                      int32_t inputOffset,
                      const WeightType* filterData,
                      float filterScale,
                      int32_t filterOffset,
                      const BiasType* biasData,
                      float outputScale,
                      int32_t outputOffset,
                      const TensorInfo& filterInfo)
 {
     if (data.m_Parameters.m_BiasEnabled && !biasData)
     {
         throw InvalidArgumentException("Bias is enabled but the bias data is invalid");
     }

     const TensorInfo& inputInfo  = armnn::ethosnref::GetTensorInfo(data.m_Inputs[0]);
     const TensorInfo& outputInfo = armnn::ethosnref::GetTensorInfo(data.m_Outputs[0]);

     TensorBufferArrayView<InputType> output(outputInfo.GetShape(),
                                             armnn::ethosnref::GetOutputTensorData<InputType>(0, data),
                                             data.m_Parameters.m_DataLayout);

     const armnnUtils::DataLayoutIndexed dataLayoutIndexed(data.m_Parameters.m_DataLayout);

     const unsigned int channelsIndex = dataLayoutIndexed.GetChannelsIndex();
     const unsigned int heightIndex   = dataLayoutIndexed.GetHeightIndex();
     const unsigned int widthIndex    = dataLayoutIndexed.GetWidthIndex();

     unsigned int inputChannels   = filterInfo.GetShape()[channelsIndex];
     unsigned int outputChannels  = filterInfo.GetShape()[0];

     unsigned int batchSize    = outputInfo.GetShape()[0];
     unsigned int outputHeight = outputInfo.GetShape()[heightIndex];
     unsigned int outputWidth  = outputInfo.GetShape()[widthIndex];
     unsigned int inputHeight  = inputInfo.GetShape()[heightIndex];
     unsigned int inputWidth   = inputInfo.GetShape()[widthIndex];

     unsigned int paddingTop  = data.m_Parameters.m_PadTop;
     unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
     unsigned int xStride     = data.m_Parameters.m_StrideX;
     unsigned int yStride     = data.m_Parameters.m_StrideY;
     unsigned int xDilation   = data.m_Parameters.m_DilationX;
     unsigned int yDilation   = data.m_Parameters.m_DilationY;

     unsigned int filterHeight = filterInfo.GetShape()[heightIndex];
     unsigned int filterWidth  = filterInfo.GetShape()[widthIndex];

     // figure out size of Kernel and how many kernel patches
     // A dimension of size 1 stays 1; any other size is rounded up to a multiple of 3 and is
     // processed in sub-kernels of 3 along that dimension.
     unsigned int filterHeight_roundup = (filterHeight == 1)? 1 : 3 * armnn::ethosnref::DivideRoundUp(filterHeight, 3);
     unsigned int decomposedHeight     = (filterHeight == 1)? 1 : 3;
     unsigned int filterWidth_roundup  = (filterWidth == 1) ? 1 : 3 * armnn::ethosnref::DivideRoundUp(filterWidth, 3);
     unsigned int decomposedWidth      = (filterWidth == 1) ? 1 : 3;

     // h_dim / w_dim select the transform variant (1 = that dimension is not transformed);
     // h_stride / w_stride are the output tile extents produced per iteration of the output loops.
     unsigned int h_dim, w_dim, w_stride, h_stride;
     if (filterHeight == 1) {
         h_dim = 1;
         w_dim = 4;
         h_stride = 4;
         w_stride = 2;
     } else if (filterWidth == 1) {
         h_dim = 4;
         w_dim = 1;
         h_stride = 2;
         w_stride = 4;
     } else {
         h_dim = 4;
         w_dim = 4;
         h_stride = 2;
         w_stride = 2;
     }

     for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
     {
         for (unsigned int cOutput = 0; cOutput < outputChannels; cOutput++)
         {
             for (unsigned int yOutput = 0; yOutput < outputHeight; yOutput=yOutput+h_stride)
             {
                 for (unsigned int xOutput = 0; xOutput < outputWidth; xOutput=xOutput+w_stride)
                 {
                    // This loop goes over each output element - per winograd output patches (h_stride x w_stride).
                    // MAC accumulates the 4x4 element-wise products over all input channels and sub-kernels.
                    std::vector<int64_t> MAC(4*4, 0);

                    for (unsigned int cInput = 0; cInput < inputChannels; cInput++)
                    {
                       // This loop goes over each filter element - per winograd filter patches (decomposedHeight x decomposedWidth).
                       for (unsigned int yFilter= 0; yFilter < filterHeight_roundup; yFilter += decomposedHeight) {
                           for (unsigned int xFilter= 0; xFilter < filterWidth_roundup; xFilter += decomposedWidth) {
                               // Create transformFilter
                               std::vector<int32_t> filterPreData(decomposedHeight*decomposedWidth, 0);

                               // Get Filter indexes
                               for (unsigned int xIncrFilter= 0; xIncrFilter < decomposedWidth; xIncrFilter++) {
                                 for (unsigned int yIncrFilter= 0; yIncrFilter < decomposedHeight; yIncrFilter++) {

                                   unsigned int filterIndex;
                                   // Taps that lie in the rounded-up area outside the real filter are read as 0.
                                   bool validFilterIndex = ((yFilter + yIncrFilter) < filterHeight) &&  ((xFilter + xIncrFilter) < filterWidth);

                                   if (data.m_Parameters.m_DataLayout == DataLayout::NHWC) {
                                     filterIndex = cOutput * filterHeight * filterWidth * inputChannels +
                                                   (yFilter + yIncrFilter)* filterWidth * inputChannels +
                                                   (xFilter + xIncrFilter)* inputChannels +
                                                   cInput;
                                   } else {
                                     filterIndex = cOutput * filterWidth * filterHeight * inputChannels +
                                                   cInput  * filterWidth * filterHeight +
                                                   (yFilter + yIncrFilter) * filterWidth +
                                                   xFilter + xIncrFilter;
                                   }

                                   // filterData is only dereferenced when the index is valid.
                                   filterPreData[xIncrFilter + decomposedWidth*yIncrFilter] = validFilterIndex ? filterData[filterIndex] - numeric_cast<int32_t>(filterOffset) : 0;
                                 }
                               }

                               // transform Filters
                               // Each transformed dimension maps taps (g0, g1, g2) to
                               // (2*g0, g0+g1+g2, g0-g1+g2, 2*g2), which keeps the values integral.
                               std::vector<int32_t> transformedFilter(4*4, 0);

                               if (h_dim == 1) { // Winograd 1x3
                                 for (unsigned int h = 0; h < 4; h++) {
                                   transformedFilter[h + 4*0] = 2*filterPreData[0];
                                   transformedFilter[h + 4*1] = filterPreData[0] + filterPreData[1] + filterPreData[2];
                                   transformedFilter[h + 4*2] = filterPreData[0] - filterPreData[1] + filterPreData[2];
                                   transformedFilter[h + 4*3] = 2*filterPreData[2];
                                 }
                               } else if (w_dim == 1) { // Winograd 3x1
                                 for (unsigned int w = 0; w < 4; w++) {
                                   transformedFilter[w*4  + 0] = 2*filterPreData[0];
                                   transformedFilter[w*4  + 1] = filterPreData[0] + filterPreData[1] + filterPreData[2];
                                   transformedFilter[w*4  + 2] = filterPreData[0] - filterPreData[1] + filterPreData[2];
                                   transformedFilter[w*4  + 3] = 2*filterPreData[2];
                                 }
                               } else { // Winograd 3x3
                                 // Two passes: first along the height (into tmpFilter), then along the width.
                                 std::vector<int32_t> tmpFilter(h_dim*decomposedWidth, 0);

                                 for (unsigned int w_filter=0; w_filter < decomposedWidth; w_filter++) {
                                   tmpFilter[w_filter + 0*decomposedWidth] = 2*filterPreData[w_filter];
                                   tmpFilter[w_filter + 1*decomposedWidth] = filterPreData[w_filter] + filterPreData[1*decomposedWidth+w_filter] + filterPreData[2*decomposedWidth+w_filter];
                                   tmpFilter[w_filter + 2*decomposedWidth] = filterPreData[w_filter] - filterPreData[1*decomposedWidth+w_filter] + filterPreData[2*decomposedWidth+w_filter];
                                   tmpFilter[w_filter + 3*decomposedWidth] = 2*filterPreData[2*decomposedWidth+w_filter];
                                 }

                                 for (unsigned int h_filter=0; h_filter < h_dim; h_filter++) {
                                   transformedFilter[h_filter*w_dim + 0] = 2*tmpFilter[h_filter*decomposedWidth];
                                   transformedFilter[h_filter*w_dim + 1] = tmpFilter[h_filter*decomposedWidth] + tmpFilter[1+h_filter*decomposedWidth] + tmpFilter[2+h_filter*decomposedWidth];
                                   transformedFilter[h_filter*w_dim + 2] = tmpFilter[h_filter*decomposedWidth] - tmpFilter[1+h_filter*decomposedWidth] + tmpFilter[2+h_filter*decomposedWidth];
                                   transformedFilter[h_filter*w_dim + 3] = 2*tmpFilter[2+h_filter*decomposedWidth];
                                 }
                               }

                               // Create transformInput
                               std::vector<int32_t> Inputs(4*4, 0);

                               // Gathering 4x4 input patch
                               for (unsigned int yIncr= 0; yIncr < 4; yIncr++)
                               {
                                 for (unsigned int xIncr = 0; xIncr < 4; xIncr++)
                                 {
                                   unsigned int yInput = yOutput * yStride + (yFilter + yIncr)*yDilation;
                                   unsigned int xInput = xOutput * xStride + (xFilter + xIncr)*xDilation;

                                   // Positions in the padding (or past the input) contribute 0.
                                   bool validIndex = (xInput  >= paddingLeft) && (xInput < inputWidth + paddingLeft) &&
                                                     (yInput >= paddingTop) && (yInput < inputHeight + paddingTop);

                                   // The index is computed unconditionally (unsigned arithmetic may wrap for
                                   // invalid positions) but inputData is only dereferenced when validIndex is true.
                                   unsigned int inputIndex;
                                   if (data.m_Parameters.m_DataLayout == DataLayout::NHWC) {
                                     inputIndex =  batchIdx * inputHeight * inputWidth  * inputChannels +
                                                   (yInput - paddingTop) * inputWidth * inputChannels +
                                                   (xInput - paddingLeft) * inputChannels +
                                                   cInput;

                                   } else {
                                       inputIndex =  batchIdx * inputWidth * inputHeight * inputChannels +
                                                     inputWidth * inputHeight * cInput +
                                                     inputWidth * (yInput - paddingTop) +
                                                     xInput - paddingLeft;
                                   }
                                   // For the 1x3 variant the patch is stored transposed.
                                   if (h_dim ==1) Inputs[yIncr + 4*xIncr] = validIndex? inputData[inputIndex] - numeric_cast<int32_t>(inputOffset) : 0;
                                   else Inputs[xIncr + 4*yIncr] = validIndex? inputData[inputIndex] - numeric_cast<int32_t>(inputOffset) : 0;
                                 }
                               }

                               std::vector<int32_t> transformedInput(4*4, 0);
                               std::vector<int32_t> tmpInput(4*4, 0);

                               // Input transform, first pass; a dimension of size 1 is copied (transposed) instead.
                               // Transformed values are (d0-d2, d1+d2, d2-d1, d1-d3).
                               for (unsigned int in_w=0; in_w < 4; in_w++) {
                                   tmpInput[in_w + 0*4] = (w_dim == 4)?  Inputs[in_w + 4*0] - Inputs[in_w + 4*2] : Inputs[in_w*4 + 0];
                                   tmpInput[in_w + 1*4] = (w_dim == 4)?  Inputs[in_w + 4*1] + Inputs[in_w + 4*2] : Inputs[in_w*4 + 1];
                                   tmpInput[in_w + 2*4] = (w_dim == 4)? -Inputs[in_w + 4*1] + Inputs[in_w + 4*2] : Inputs[in_w*4 + 2];
                                   tmpInput[in_w + 3*4] = (w_dim == 4)?  Inputs[in_w + 4*1] - Inputs[in_w + 4*3] : Inputs[in_w*4 + 3];
                               }

                               // Input transform, second pass; plain copy when h_dim is 1.
                               for (unsigned int in_h=0; in_h < 4; in_h++) {
                                   transformedInput[in_h*4 + 0] = (h_dim == 4)?  tmpInput[0 + 4*in_h] - tmpInput[2 + 4*in_h] : tmpInput[in_h*4 + 0];
                                   transformedInput[in_h*4 + 1] = (h_dim == 4)?  tmpInput[1 + 4*in_h] + tmpInput[2 + 4*in_h] : tmpInput[in_h*4 + 1];
                                   transformedInput[in_h*4 + 2] = (h_dim == 4)? -tmpInput[1 + 4*in_h] + tmpInput[2 + 4*in_h] : tmpInput[in_h*4 + 2];
                                   transformedInput[in_h*4 + 3] = (h_dim == 4)?  tmpInput[1 + 4*in_h] - tmpInput[3 + 4*in_h] : tmpInput[in_h*4 + 3];
                               }

                               // Convert in WTFP format
                               // and MAC units WTFP domain

                               for (unsigned int y= 0; y < 4 ; y++) {
                                   for (unsigned int x = 0; x < 4; x++) {
                                       const struct hw_float weight_fp = convert_to_S13E8M(numeric_cast<int32_t>(transformedFilter[x+4*y]));
                                       const struct hw_float ifm_fp = convert_to_S12E8M(numeric_cast<int32_t>(transformedInput[x+4*y]));
                                       // Product is negative when exactly one operand is negative.
                                       const bool sign = (weight_fp.sign != ifm_fp.sign);
                                       // Multiply the mantissas, then restore the magnitude by shifting left by both exponents.
                                       uint64_t unsigned_ofm_value = numeric_cast<uint64_t>(ifm_fp.man) * numeric_cast<uint64_t>(weight_fp.man);
                                       uint64_t total_shift = numeric_cast<uint64_t>(weight_fp.exp) + numeric_cast<uint64_t>(ifm_fp.exp);
                                       unsigned_ofm_value <<= total_shift;
                                       if (sign)
                                       {
                                         // Two's-complement negation carried out on the unsigned value.
                                         unsigned_ofm_value = ~unsigned_ofm_value;
                                         unsigned_ofm_value += 1;
                                       }
                                       // Reinterpret the bit pattern as a signed 64-bit value.
                                       int64_t ofm_value = 0;
                                       memcpy(&ofm_value, &unsigned_ofm_value, sizeof(int64_t));
                                       MAC[x + 4*y] += ofm_value;
                                 }
                               }
                            }
                         }
                      }

                      // Create transformedFinal
                      // Output transform: per transformed dimension the four accumulators (m0..m3) reduce to
                      // (m0+m1+m2, m1-m2-m3). ShiftRound is applied once per transformed dimension with its
                      // default round=false, i.e. a plain halving - apparently undoing the factor 2 carried
                      // by the filter transform.
                      std::vector<int64_t> transformedFinal(h_stride*w_stride, 0);

                      if (h_dim == 1) { // Winograd 1x3
                          transformedFinal[0] += ShiftRound(MAC[0] + MAC[4] + MAC[8]);
                          transformedFinal[1] += ShiftRound(MAC[4] - MAC[8] - MAC[12]);
                          transformedFinal[2] += ShiftRound(MAC[1] + MAC[5] + MAC[9]);
                          transformedFinal[3] += ShiftRound(MAC[5] - MAC[9] - MAC[13]);
                          transformedFinal[4] += ShiftRound(MAC[2] + MAC[6] + MAC[10]);
                          transformedFinal[5] += ShiftRound(MAC[6] - MAC[10] - MAC[14]);
                          transformedFinal[6] += ShiftRound(MAC[3] + MAC[7] + MAC[11]);
                          transformedFinal[7] += ShiftRound(MAC[7] - MAC[11] - MAC[15]);
                      } else if (w_dim == 1) { // Winograd 3x1
                          transformedFinal[0] += ShiftRound(MAC[0] + MAC[1] + MAC[2]);
                          transformedFinal[1] += ShiftRound(MAC[4] + MAC[5] + MAC[6]);
                          transformedFinal[2] += ShiftRound(MAC[8] + MAC[9] + MAC[10]);
                          transformedFinal[3] += ShiftRound(MAC[12] + MAC[13] + MAC[14]);
                          transformedFinal[4] += ShiftRound(MAC[1] - MAC[2] - MAC[3]);
                          transformedFinal[5] += ShiftRound(MAC[5] - MAC[6] - MAC[7]);
                          transformedFinal[6] += ShiftRound(MAC[9] - MAC[10] - MAC[11]);
                          transformedFinal[7] += ShiftRound(MAC[13] - MAC[14] - MAC[15]);
                      } else { // Winograd 3x3
                          transformedFinal[0] += ShiftRound(   ShiftRound(MAC[0] + MAC[4]  + MAC[8])
                                                             + ShiftRound(MAC[1] + MAC[5]  + MAC[9])
                                                             + ShiftRound(MAC[2] + MAC[6]  + MAC[10])  );
                          transformedFinal[1] += ShiftRound(   ShiftRound(MAC[1] + MAC[5]  + MAC[9])
                                                             - ShiftRound(MAC[2] + MAC[6]  + MAC[10])
                                                             - ShiftRound(MAC[3] + MAC[7]  + MAC[11])  );
                          transformedFinal[2] += ShiftRound(   ShiftRound(MAC[4] - MAC[8]  - MAC[12])
                                                             + ShiftRound(MAC[5] - MAC[9]  - MAC[13])
                                                             + ShiftRound(MAC[6] - MAC[10] - MAC[14])  );
                          transformedFinal[3] += ShiftRound(   ShiftRound(MAC[5] - MAC[9]  - MAC[13])
                                                             - ShiftRound(MAC[6] - MAC[10] - MAC[14])
                                                             - ShiftRound(MAC[7] - MAC[11] - MAC[15])  );
                      }

                     // Create output patches
                     for (unsigned int yOutIncr= 0; yOutIncr < h_stride; yOutIncr++)
                     {
                       for (unsigned int xOutIncr = 0; xOutIncr < w_stride; xOutIncr++)
                       {
                         unsigned int xOut = xOutput + xOutIncr;
                         unsigned int yOut = yOutput + yOutIncr;

                         unsigned int sumIndex = xOutIncr + w_stride*yOutIncr;
                         // The tile value is saturated to the int32 range before the bias is added.
                         int64_t value = std::min<int64_t>(std::max<int64_t>(transformedFinal[sumIndex], std::numeric_limits<int32_t>::min()),
                                           std::numeric_limits<int32_t>::max());

                         if (data.m_Parameters.m_BiasEnabled)
                         {
                           value += biasData[cOutput];
                         }
                         if (outputScale != 0.0f)
                         {
                           float multiplier = (inputScale*filterScale) / outputScale;

                           value = numeric_cast<int64_t>(
                                     EthosnRefQuantizedMultiplierSmallerThanOne(multiplier) * numeric_cast<int64_t>(value))
                                   + numeric_cast<int64_t>(outputOffset);
                           value = std::min<int64_t>(std::max<int64_t>(value, std::numeric_limits<InputType>::min()),
                                     std::numeric_limits<InputType>::max());
                         }

                         // Tiles may overhang the output; only elements inside it are written.
                         if (yOut < outputHeight && xOut < outputWidth) output.Get(batchIdx, cOutput, yOut, xOut) = numeric_cast<InputType>(value);
                      }
                   }
                }
             }
          }
       }
    }
 } //namespace armnn
armnn::EthosnRefQuantizedMultiplierSmallerThanOne::operator*
int32_t operator*(int32_t rhs) const
The implementation of this function is adapted from Android NN's MultiplyByEthosnRefQuantizedMultipli...
Definition: EthosnRefConvImpl.cpp:48

armnn::convert_to_S12E8M
struct hw_float convert_to_S12E8M(const int32_t val)
Definition: EthosnRefConvImpl.hpp:285

DataLayoutIndexed.hpp

armnn::bit_field
uint32_t bit_field(const uint8_t high_bit, const uint8_t low_bit, const uint32_t val)
Definition: EthosnRefConvImpl.hpp:246

armnnUtils::DataLayoutIndexed::GetWidthIndex
unsigned int GetWidthIndex() const
Definition: DataLayoutIndexed.hpp:25

armnn::one_mask
uint32_t one_mask(const uint8_t size)
Definition: EthosnRefConvImpl.hpp:225

Tensor.hpp

armnn::TensorInfo::GetShape
const TensorShape & GetShape() const
Definition: Tensor.hpp:191

armnn::TensorInfo
Definition: Tensor.hpp:152

EthosnRefWorkloadUtils.hpp

armnn::wtfp_round_mantissa
uint32_t wtfp_round_mantissa(const uint32_t unshifted_mantissa, const uint32_t exp)
Definition: EthosnRefConvImpl.hpp:259

armnn::EthosnRefQuantizedMultiplierSmallerThanOne
Performs multiplication of an integer with a multiplier which is less than one, using quantized integer arithmetic which is consistent with AndroidNN's CPU executor.
Definition: EthosnRefConvImpl.hpp:25

armnn::hw_float::exp
uint32_t exp
Definition: EthosnRefConvImpl.hpp:222

armnn
Copyright (c) 2021 ARM Limited and Contributors.
Definition: 01_00_quick_start.dox:6

TensorBufferArrayView.hpp

armnnUtils::DataLayoutIndexed::GetHeightIndex
unsigned int GetHeightIndex() const
Definition: DataLayoutIndexed.hpp:24

NumericCast.hpp

armnn::ethosnref::DivideRoundUp
uint32_t DivideRoundUp(uint32_t numerator, uint32_t denominator)
Definition: EthosnRefWorkloadUtils.hpp:51

armnn::TensorBufferArrayView::Get
DataType & Get(unsigned int b, unsigned int c, unsigned int h, unsigned int w) const
Definition: TensorBufferArrayView.hpp:31

armnn::ethosnref::GetTensorInfo
const TensorInfo & GetTensorInfo(const ITensorHandle *tensorHandle)
Definition: EthosnRefWorkloadUtils.hpp:29

armnn::TensorBufferArrayView
Definition: TensorBufferArrayView.hpp:19

armnnUtils::DataLayoutIndexed
Provides access to the appropriate indexes for Channels, Height and Width based on DataLayout...
Definition: DataLayoutIndexed.hpp:17

armnn::bit_extract
bool bit_extract(const uint8_t pos, const uint32_t val)
Definition: EthosnRefConvImpl.hpp:239

armnn::convert_to_S13E8M
struct hw_float convert_to_S13E8M(const int32_t val)
Definition: EthosnRefConvImpl.hpp:320

armnn::exp
Definition: Exp.hpp:13

armnn::InvalidArgumentException
Definition: Exceptions.hpp:80

armnn::ShiftRound
int64_t ShiftRound(int64_t value, bool round=false)
Definition: EthosnRefConvImpl.hpp:363

armnn::EthosnRefQuantizedMultiplierSmallerThanOne::EthosnRefQuantizedMultiplierSmallerThanOne
EthosnRefQuantizedMultiplierSmallerThanOne(float multiplier)
Constructs an EthosnRefQuantizedMultiplierSmallerThanOne which will multiply by the given multiplier...
Definition: EthosnRefConvImpl.cpp:16

armnn::hw_float::man
uint32_t man
Definition: EthosnRefConvImpl.hpp:221

armnn::numeric_cast
std::enable_if_t< std::is_unsigned< Source >::value &&std::is_unsigned< Dest >::value, Dest > numeric_cast(Source source)
Definition: NumericCast.hpp:35

armnn::hw_float::sign
bool sign
Definition: EthosnRefConvImpl.hpp:220

armnn::hw_float
Definition: EthosnRefConvImpl.hpp:218

armnnUtils::DataLayoutIndexed::GetChannelsIndex
unsigned int GetChannelsIndex() const
Definition: DataLayoutIndexed.hpp:23

armnn::DataLayout::NHWC