ArmNN 22.08
EthosnRefConvImpl.hpp
//
// Copyright © 2017 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//

#pragma once

#include <armnn/Tensor.hpp>

#include <algorithm>
#include <cmath>
#include <cstring>
#include <limits>
#include <vector>

namespace armnn
{

/// Performs multiplication of an integer with a multiplier which is less than one,
/// using quantized integer arithmetic which is consistent with AndroidNN's CPU executor.
class EthosnRefQuantizedMultiplierSmallerThanOne
{
public:
    /// Constructs an EthosnRefQuantizedMultiplierSmallerThanOne which will multiply by the given multiplier.
    /// This stores the appropriate integer quantities (derived from the given multiplier) for later use.
    /// The implementation of this function is adapted from Android NN's QuantizeMultiplierSmallerThanOne().
    EthosnRefQuantizedMultiplierSmallerThanOne(float multiplier);

    /// The implementation of this function is adapted from Android NN's MultiplyByQuantizedMultiplierSmallerThanOne().
    int32_t operator*(int32_t rhs) const;
    int64_t operator*(int64_t rhs) const;

private:
    int32_t m_Multiplier;
    int32_t m_RightShift;
};

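// Illustrative sketch (not part of the original file): how a real multiplier in [0, 1) is
// typically split into the Q0.31 fixed-point multiplier and right shift stored by the class
// above, following Android NN's QuantizeMultiplierSmallerThanOne() scheme. The helper name
// and out-parameters are hypothetical; the real constructor is declared above.
inline void SketchQuantizeMultiplierSmallerThanOne(float multiplier,
                                                   int32_t& quantizedMultiplier,
                                                   int32_t& rightShift)
{
    // Write multiplier as q * 2^shift with q in [0.5, 1), then store q in Q0.31.
    int shift = 0;
    const double q = std::frexp(static_cast<double>(multiplier), &shift);
    int64_t qFixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
    if (qFixed == (1ll << 31))
    {
        // Rounding can push q up to exactly 1.0; renormalise.
        qFixed /= 2;
        ++shift;
    }
    quantizedMultiplier = static_cast<int32_t>(qFixed);
    rightShift = -shift; // >= 0 because multiplier < 1
    // operator* above can then compute (rhs * quantizedMultiplier) >> (31 + rightShift)
    // with rounding, staying in integer arithmetic throughout.
}
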
/// An implementation shared by normal and depthwise convolution.
template<typename ConvData, typename InputType, typename WeightType, typename BiasType, typename AccumulatorType>
static void EthosnRefConvImpl(ConvData data,
                              const InputType* inputData,
                              float inputScale,
                              int32_t inputOffset,
                              const WeightType* filterData,
                              float filterScale,
                              int32_t filterOffset,
                              const BiasType* biasData,
                              float outputScale,
                              int32_t outputOffset,
                              const TensorInfo& filterInfo,
                              bool depthwise = false)
{
    if (data.m_Parameters.m_BiasEnabled && !biasData)
    {
        throw InvalidArgumentException("Bias is enabled but the bias data is invalid");
    }

    const TensorInfo& inputInfo = armnn::ethosnref::GetTensorInfo(data.m_Inputs[0]);
    const TensorInfo& outputInfo = armnn::ethosnref::GetTensorInfo(data.m_Outputs[0]);

    TensorBufferArrayView<InputType> output(outputInfo.GetShape(),
                                            armnn::ethosnref::GetOutputTensorData<InputType>(0, data),
                                            data.m_Parameters.m_DataLayout);

    const armnnUtils::DataLayoutIndexed dataLayoutIndexed(data.m_Parameters.m_DataLayout);

    const unsigned int channelsIndex = dataLayoutIndexed.GetChannelsIndex();
    const unsigned int heightIndex = dataLayoutIndexed.GetHeightIndex();
    const unsigned int widthIndex = dataLayoutIndexed.GetWidthIndex();

    unsigned int depthMultiplier = depthwise ? filterInfo.GetShape()[0] : 1;
    unsigned int inputChannels = depthwise ? filterInfo.GetShape()[1] : filterInfo.GetShape()[channelsIndex];
    unsigned int outputChannels = depthwise ? inputChannels * depthMultiplier : filterInfo.GetShape()[0];

    unsigned int batchSize = outputInfo.GetShape()[0];
    unsigned int outputHeight = outputInfo.GetShape()[heightIndex];
    unsigned int outputWidth = outputInfo.GetShape()[widthIndex];
    unsigned int inputHeight = inputInfo.GetShape()[heightIndex];
    unsigned int inputWidth = inputInfo.GetShape()[widthIndex];

    unsigned int filterHeight = depthwise ? filterInfo.GetShape()[2] : filterInfo.GetShape()[heightIndex];
    unsigned int filterWidth = depthwise ? filterInfo.GetShape()[3] : filterInfo.GetShape()[widthIndex];

    unsigned int paddingTop = data.m_Parameters.m_PadTop;
    unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
    unsigned int xStride = data.m_Parameters.m_StrideX;
    unsigned int yStride = data.m_Parameters.m_StrideY;
    unsigned int xDilation = data.m_Parameters.m_DilationX;
    unsigned int yDilation = data.m_Parameters.m_DilationY;

    // The world's least efficient convolution.
    for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
    {
        for (unsigned int cOutput = 0; cOutput < outputChannels; cOutput++)
        {
            for (unsigned int yOutput = 0; yOutput < outputHeight; yOutput++)
            {
                for (unsigned int xOutput = 0; xOutput < outputWidth; xOutput++)
                {
                    // This loop goes over each output element.
                    AccumulatorType sum = AccumulatorType();

                    // For depthwise, each output channel corresponds to exactly one input channel.
                    // For normal convolution, we must loop over each input channel.
                    for (unsigned int cInput = 0; cInput < (depthwise ? 1 : inputChannels); cInput++)
                    {
                        unsigned int depthwiseMultiplierIdx = 0;
                        if (depthwise)
                        {
                            // e.g. with depthMultiplier == 2, output channels {0, 1} both read input channel 0.
                            cInput = cOutput / depthMultiplier;
                            depthwiseMultiplierIdx = cOutput % depthMultiplier;
                        }

                        for (unsigned int yFilter = 0; yFilter < filterHeight; yFilter++)
                        {
                            for (unsigned int xFilter = 0; xFilter < filterWidth; xFilter++)
                            {
                                // This loop goes over each input element for each output element.

                                unsigned int filterIndex = 0;

                                // Since the dimensionality of the kernel depends on whether the
                                // convolution is depthwise, so does the filter index.
                                if (depthwise)
                                {
                                    filterIndex = depthwiseMultiplierIdx * filterWidth * filterHeight * inputChannels +
                                                  cInput * filterWidth * filterHeight +
                                                  yFilter * filterWidth +
                                                  xFilter;
                                }
                                else
                                {
                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                    {
                                        filterIndex = cOutput * filterHeight * filterWidth * inputChannels +
                                                      yFilter * filterWidth * inputChannels +
                                                      xFilter * inputChannels +
                                                      cInput;
                                    }
                                    else
                                    {
                                        filterIndex = cOutput * filterWidth * filterHeight * inputChannels +
                                                      cInput * filterWidth * filterHeight +
                                                      yFilter * filterWidth +
                                                      xFilter;
                                    }
                                }

                                AccumulatorType filterValue = filterData[filterIndex] -
                                                              numeric_cast<AccumulatorType>(filterOffset);

                                unsigned int yInput = yOutput * yStride + yFilter * yDilation;
                                unsigned int xInput = xOutput * xStride + xFilter * xDilation;

                                AccumulatorType inputValue;

                                // Check if we're in the padding.
                                if (yInput < paddingTop || yInput >= inputHeight + paddingTop ||
                                    xInput < paddingLeft || xInput >= inputWidth + paddingLeft)
                                {
                                    inputValue = AccumulatorType();
                                }
                                else
                                {
                                    unsigned int inputIndex;

                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                    {
                                        inputIndex = batchIdx * inputHeight * inputWidth * inputChannels +
                                                     (yInput - paddingTop) * inputWidth * inputChannels +
                                                     (xInput - paddingLeft) * inputChannels +
                                                     cInput;
                                    }
                                    else
                                    {
                                        inputIndex = batchIdx * inputWidth * inputHeight * inputChannels +
                                                     inputWidth * inputHeight * cInput +
                                                     inputWidth * (yInput - paddingTop) +
                                                     xInput - paddingLeft;
                                    }

                                    inputValue = inputData[inputIndex] -
                                                 numeric_cast<AccumulatorType>(inputOffset);
                                }
                                sum += filterValue * inputValue;
                            }
                        }
                    }

                    if (data.m_Parameters.m_BiasEnabled)
                    {
                        sum += biasData[cOutput];
                    }

                    if (outputScale != 0.0f)
                    {
                        float multiplier = (inputScale * filterScale) / outputScale;

                        sum = numeric_cast<AccumulatorType>(
                                  EthosnRefQuantizedMultiplierSmallerThanOne(multiplier) * numeric_cast<AccumulatorType>(sum))
                              + numeric_cast<AccumulatorType>(outputOffset);
                        sum = std::min<AccumulatorType>(std::max<AccumulatorType>(sum, std::numeric_limits<InputType>::min()),
                                                        std::numeric_limits<InputType>::max());
                    }

                    output.Get(batchIdx, cOutput, yOutput, xOutput) = numeric_cast<InputType>(sum);
                }
            }
        }
    }
}

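// Worked example (added for exposition, not in the original file): with inputScale = 0.5f,
// filterScale = 0.25f and outputScale = 0.5f, the requantisation multiplier above is
// (0.5 * 0.25) / 0.5 = 0.25. An accumulator value of sum = 100 therefore becomes
// round(100 * 0.25) + outputOffset = 25 + outputOffset, clamped to InputType's range
// (e.g. [-128, 127] for int8). EthosnRefQuantizedMultiplierSmallerThanOne computes the
// same result using only integer multiplies and shifts in the inner loop.
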
struct hw_float
{
    bool sign;
    uint32_t man;
    uint32_t exp;
};

inline uint32_t one_mask(const uint8_t size)
{
    uint32_t mask = 0;
    if (size >= 32)
    {
        mask = 0xFFFFFFFF;
    }
    else
    {
        mask = (1u << size) - 1;
    }
    return mask;
}

inline bool bit_extract(
    const uint8_t pos,
    const uint32_t val)
{
    return (((val >> pos) & 1u) == 1u);
}

inline uint32_t bit_field(
    const uint8_t high_bit,
    const uint8_t low_bit,
    const uint32_t val)
{
    // Narrow the field width to uint8_t via memcpy to avoid implicit-conversion warnings;
    // note this copies the first byte of the int, which is the low byte on little-endian targets.
    uint8_t size;
    int src = high_bit + 1 - low_bit;
    memcpy(&size, &src, sizeof(uint8_t));

    //return ((val >> low_bit) & one_mask(high_bit + 1 - low_bit));
    return ((val >> low_bit) & one_mask(size));
}

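// Example (added for exposition): bit_field(9, 0, 0x2A5) keeps the low ten bits and returns
// 0x2A5, bit_extract(9, 0x2A5) tests bit 9 (true here), and one_mask(10) == 0x3FF.
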
inline uint32_t wtfp_round_mantissa(
    const uint32_t unshifted_mantissa,
    const uint32_t exp)
{
    uint32_t rounded_mantissa = 0;
    // No shifting means no rounding
    if (exp == 0)
    {
        rounded_mantissa = unshifted_mantissa;
    }
    else if ((unshifted_mantissa >> exp) == 0xff)
    {
        // Rounding overflow case: the shifted mantissa is already at its maximum,
        // so don't round up past 8 bits.
        rounded_mantissa = 0xFF;
    }
    else
    {
        // Pull off the most significant bit that's being 'dropped off'
        // and use that to round to nearest.
        uint32_t round_bit = (unshifted_mantissa >> (exp - 1)) & 1;
        rounded_mantissa = (unshifted_mantissa >> exp) + round_bit;
    }

    return rounded_mantissa;
}

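// Worked examples (added for exposition): wtfp_round_mantissa(600, 2) drops the low two bits
// with round-to-nearest, giving (600 >> 2) + 0 == 150. wtfp_round_mantissa(0x1FD, 1) gives
// (0x1FD >> 1) + 1 == 0xFF, while wtfp_round_mantissa(0x1FF, 1) hits the overflow guard and
// is clamped to 0xFF rather than rounding up to 0x100.
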
inline struct hw_float convert_to_S12E8M(
    const int32_t val)
{
    uint32_t unsigned_val;
    uint32_t unsigned_not_val;
    int32_t not_val = ~val + 1;

    memcpy(&unsigned_val, &val, sizeof(uint32_t));
    memcpy(&unsigned_not_val, &not_val, sizeof(uint32_t));

    struct hw_float ret_fp;
    ret_fp.sign = bit_extract(10, unsigned_val);
    // Convert to unsigned mantissa
    if (ret_fp.sign)
    {
        ret_fp.man = bit_field(9, 0, unsigned_not_val);
    }
    else
    {
        ret_fp.man = bit_field(9, 0, unsigned_val);
    }
    // Use the leading 1 to extract the exponent and mantissa
    ret_fp.exp = 0;
    if (bit_extract(9, ret_fp.man))
    {
        ret_fp.exp = 2;
    }
    else if (bit_extract(8, ret_fp.man))
    {
        ret_fp.exp = 1;
    }
    ret_fp.man = wtfp_round_mantissa(bit_field(10, 0, ret_fp.man), ret_fp.exp);
    return ret_fp;
}

inline struct hw_float convert_to_S13E8M(
    const int32_t val)
{
    uint32_t unsigned_val;
    uint32_t unsigned_not_val;
    int32_t not_val = ~val + 1;

    memcpy(&unsigned_val, &val, sizeof(uint32_t));
    memcpy(&unsigned_not_val, &not_val, sizeof(uint32_t));

    struct hw_float ret_fp;
    ret_fp.sign = bit_extract(12, unsigned_val);
    // Convert to unsigned mantissa
    if (ret_fp.sign)
    {
        ret_fp.man = bit_field(11, 0, unsigned_not_val);
    }
    else
    {
        ret_fp.man = bit_field(11, 0, unsigned_val);
    }
    // Use the leading 1 to extract the exponent and mantissa
    ret_fp.exp = 0;
    if (bit_extract(11, ret_fp.man))
    {
        ret_fp.exp = 4;
    }
    else if (bit_extract(10, ret_fp.man))
    {
        ret_fp.exp = 3;
    }
    else if (bit_extract(9, ret_fp.man))
    {
        ret_fp.exp = 2;
    }
    else if (bit_extract(8, ret_fp.man))
    {
        ret_fp.exp = 1;
    }
    ret_fp.man = wtfp_round_mantissa(bit_field(11, 0, ret_fp.man), ret_fp.exp);
    return ret_fp;
}

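// Worked example (added for exposition): convert_to_S12E8M(600) yields sign = false and,
// because bit 9 of the magnitude is set, exp = 2; the mantissa is rounded right by two
// places to man = 150, so the encoding represents 150 << 2 == 600 exactly.
// convert_to_S12E8M(-3) yields sign = true, exp = 0, man = 3. convert_to_S13E8M() follows
// the same scheme for a wider 13-bit signed input, with exponents up to 4.
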
// Arithmetic shift right by one; when 'round' is set, the dropped bit is added back
// to round to nearest.
inline int64_t ShiftRound(int64_t value, bool round = false)
{
    int64_t round_bit = round ? value & 0x1 : 0;
    int64_t out = (value >> 1) + round_bit;
    return out;
}

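// Example (added for exposition): ShiftRound(5) == 2 (plain arithmetic halving), while
// ShiftRound(5, true) == 3 and ShiftRound(-3, true) == -1: the shift floors, then the
// dropped bit rounds back up.
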
template<typename ConvData, typename InputType, typename WeightType, typename BiasType>
static void EthosnRefWinogradConvImpl(ConvData data,
                                      const InputType* inputData,
                                      float inputScale,
                                      int32_t inputOffset,
                                      const WeightType* filterData,
                                      float filterScale,
                                      int32_t filterOffset,
                                      const BiasType* biasData,
                                      float outputScale,
                                      int32_t outputOffset,
                                      const TensorInfo& filterInfo)
{
    if (data.m_Parameters.m_BiasEnabled && !biasData)
    {
        throw InvalidArgumentException("Bias is enabled but the bias data is invalid");
    }

    const TensorInfo& inputInfo = armnn::ethosnref::GetTensorInfo(data.m_Inputs[0]);
    const TensorInfo& outputInfo = armnn::ethosnref::GetTensorInfo(data.m_Outputs[0]);

    TensorBufferArrayView<InputType> output(outputInfo.GetShape(),
                                            armnn::ethosnref::GetOutputTensorData<InputType>(0, data),
                                            data.m_Parameters.m_DataLayout);

    const armnnUtils::DataLayoutIndexed dataLayoutIndexed(data.m_Parameters.m_DataLayout);

    const unsigned int channelsIndex = dataLayoutIndexed.GetChannelsIndex();
    const unsigned int heightIndex = dataLayoutIndexed.GetHeightIndex();
    const unsigned int widthIndex = dataLayoutIndexed.GetWidthIndex();

    unsigned int inputChannels = filterInfo.GetShape()[channelsIndex];
    unsigned int outputChannels = filterInfo.GetShape()[0];

    unsigned int batchSize = outputInfo.GetShape()[0];
    unsigned int outputHeight = outputInfo.GetShape()[heightIndex];
    unsigned int outputWidth = outputInfo.GetShape()[widthIndex];
    unsigned int inputHeight = inputInfo.GetShape()[heightIndex];
    unsigned int inputWidth = inputInfo.GetShape()[widthIndex];

    unsigned int paddingTop = data.m_Parameters.m_PadTop;
    unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
    unsigned int xStride = data.m_Parameters.m_StrideX;
    unsigned int yStride = data.m_Parameters.m_StrideY;
    unsigned int xDilation = data.m_Parameters.m_DilationX;
    unsigned int yDilation = data.m_Parameters.m_DilationY;

    unsigned int filterHeight = filterInfo.GetShape()[heightIndex];
    unsigned int filterWidth = filterInfo.GetShape()[widthIndex];

    // Figure out the rounded-up kernel size and how many 3-tap kernel patches it decomposes into.
    unsigned int filterHeight_roundup = (filterHeight == 1) ? 1 : 3 * armnn::ethosnref::DivideRoundUp(filterHeight, 3);
    unsigned int decomposedHeight = (filterHeight == 1) ? 1 : 3;
    unsigned int filterWidth_roundup = (filterWidth == 1) ? 1 : 3 * armnn::ethosnref::DivideRoundUp(filterWidth, 3);
    unsigned int decomposedWidth = (filterWidth == 1) ? 1 : 3;

    unsigned int h_dim, w_dim, w_stride, h_stride;
    if (filterHeight == 1) {
        h_dim = 1;
        w_dim = 4;
        h_stride = 4;
        w_stride = 2;
    } else if (filterWidth == 1) {
        h_dim = 4;
        w_dim = 1;
        h_stride = 2;
        w_stride = 4;
    } else {
        h_dim = 4;
        w_dim = 4;
        h_stride = 2;
        w_stride = 2;
    }

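// Patch geometry implied above (comment added for exposition): a 3x3 kernel produces a 2x2
// output patch from each 4x4 transformed tile (Winograd F(2x2, 3x3)); 1x3 and 3x1 kernels
// reuse the same 4x4 tile to produce 4x2 and 2x4 output patches respectively, with
// h_dim/w_dim giving the transformed-tile extents used below.
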
    for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
    {
        for (unsigned int cOutput = 0; cOutput < outputChannels; cOutput++)
        {
            for (unsigned int yOutput = 0; yOutput < outputHeight; yOutput = yOutput + h_stride)
            {
                for (unsigned int xOutput = 0; xOutput < outputWidth; xOutput = xOutput + w_stride)
                {
                    // This loop goes over each Winograd output patch (h_stride x w_stride).
                    std::vector<int64_t> MAC(4*4, 0);

                    for (unsigned int cInput = 0; cInput < inputChannels; cInput++)
                    {
                        // This loop goes over each Winograd filter patch (decomposedHeight x decomposedWidth).
                        for (unsigned int yFilter = 0; yFilter < filterHeight_roundup; yFilter += decomposedHeight) {
                            for (unsigned int xFilter = 0; xFilter < filterWidth_roundup; xFilter += decomposedWidth) {
                                // Create transformFilter
                                std::vector<int32_t> filterPreData(decomposedHeight*decomposedWidth, 0);

                                // Gather the filter values for this patch, zero-padding taps outside the kernel
                                for (unsigned int xIncrFilter = 0; xIncrFilter < decomposedWidth; xIncrFilter++) {
                                    for (unsigned int yIncrFilter = 0; yIncrFilter < decomposedHeight; yIncrFilter++) {

                                        unsigned int filterIndex;
                                        bool validFilterIndex = ((yFilter + yIncrFilter) < filterHeight) && ((xFilter + xIncrFilter) < filterWidth);

                                        if (data.m_Parameters.m_DataLayout == DataLayout::NHWC) {
                                            filterIndex = cOutput * filterHeight * filterWidth * inputChannels +
                                                          (yFilter + yIncrFilter) * filterWidth * inputChannels +
                                                          (xFilter + xIncrFilter) * inputChannels +
                                                          cInput;
                                        } else {
                                            filterIndex = cOutput * filterWidth * filterHeight * inputChannels +
                                                          cInput * filterWidth * filterHeight +
                                                          (yFilter + yIncrFilter) * filterWidth +
                                                          xFilter + xIncrFilter;
                                        }

                                        filterPreData[xIncrFilter + decomposedWidth*yIncrFilter] =
                                            validFilterIndex ? filterData[filterIndex] - numeric_cast<int32_t>(filterOffset) : 0;
                                    }
                                }

                                // Transform the filter patch (applying G scaled by 2 on each
                                // transform dimension, to stay in integer arithmetic).
                                std::vector<int32_t> transformedFilter(4*4, 0);

                                if (h_dim == 1) { // Winograd 1x3
                                    for (unsigned int h = 0; h < 4; h++) {
                                        transformedFilter[h + 4*0] = 2 * filterPreData[0];
                                        transformedFilter[h + 4*1] = filterPreData[0] + filterPreData[1] + filterPreData[2];
                                        transformedFilter[h + 4*2] = filterPreData[0] - filterPreData[1] + filterPreData[2];
                                        transformedFilter[h + 4*3] = 2 * filterPreData[2];
                                    }
                                } else if (w_dim == 1) { // Winograd 3x1
                                    for (unsigned int w = 0; w < 4; w++) {
                                        transformedFilter[w*4 + 0] = 2 * filterPreData[0];
                                        transformedFilter[w*4 + 1] = filterPreData[0] + filterPreData[1] + filterPreData[2];
                                        transformedFilter[w*4 + 2] = filterPreData[0] - filterPreData[1] + filterPreData[2];
                                        transformedFilter[w*4 + 3] = 2 * filterPreData[2];
                                    }
                                } else { // Winograd 3x3
                                    std::vector<int32_t> tmpFilter(h_dim*decomposedWidth, 0);

                                    for (unsigned int w_filter = 0; w_filter < decomposedWidth; w_filter++) {
                                        tmpFilter[w_filter + 0*decomposedWidth] = 2 * filterPreData[w_filter];
                                        tmpFilter[w_filter + 1*decomposedWidth] = filterPreData[w_filter] + filterPreData[1*decomposedWidth + w_filter] + filterPreData[2*decomposedWidth + w_filter];
                                        tmpFilter[w_filter + 2*decomposedWidth] = filterPreData[w_filter] - filterPreData[1*decomposedWidth + w_filter] + filterPreData[2*decomposedWidth + w_filter];
                                        tmpFilter[w_filter + 3*decomposedWidth] = 2 * filterPreData[2*decomposedWidth + w_filter];
                                    }

                                    for (unsigned int h_filter = 0; h_filter < h_dim; h_filter++) {
                                        transformedFilter[h_filter*w_dim + 0] = 2 * tmpFilter[h_filter*decomposedWidth];
                                        transformedFilter[h_filter*w_dim + 1] = tmpFilter[h_filter*decomposedWidth] + tmpFilter[1 + h_filter*decomposedWidth] + tmpFilter[2 + h_filter*decomposedWidth];
                                        transformedFilter[h_filter*w_dim + 2] = tmpFilter[h_filter*decomposedWidth] - tmpFilter[1 + h_filter*decomposedWidth] + tmpFilter[2 + h_filter*decomposedWidth];
                                        transformedFilter[h_filter*w_dim + 3] = 2 * tmpFilter[2 + h_filter*decomposedWidth];
                                    }
                                }

                                // Create transformInput
                                std::vector<int32_t> Inputs(4*4, 0);

                                // Gathering the 4x4 input patch
                                for (unsigned int yIncr = 0; yIncr < 4; yIncr++)
                                {
                                    for (unsigned int xIncr = 0; xIncr < 4; xIncr++)
                                    {
                                        unsigned int yInput = yOutput * yStride + (yFilter + yIncr) * yDilation;
                                        unsigned int xInput = xOutput * xStride + (xFilter + xIncr) * xDilation;

                                        bool validIndex = (xInput >= paddingLeft) && (xInput < inputWidth + paddingLeft) &&
                                                          (yInput >= paddingTop) && (yInput < inputHeight + paddingTop);

                                        unsigned int inputIndex;
                                        if (data.m_Parameters.m_DataLayout == DataLayout::NHWC) {
                                            inputIndex = batchIdx * inputHeight * inputWidth * inputChannels +
                                                         (yInput - paddingTop) * inputWidth * inputChannels +
                                                         (xInput - paddingLeft) * inputChannels +
                                                         cInput;
                                        } else {
                                            inputIndex = batchIdx * inputWidth * inputHeight * inputChannels +
                                                         inputWidth * inputHeight * cInput +
                                                         inputWidth * (yInput - paddingTop) +
                                                         xInput - paddingLeft;
                                        }
                                        // For 1x3 kernels the patch is gathered transposed, so the width
                                        // transform below operates on the x dimension.
                                        if (h_dim == 1) Inputs[yIncr + 4*xIncr] = validIndex ? inputData[inputIndex] - numeric_cast<int32_t>(inputOffset) : 0;
                                        else            Inputs[xIncr + 4*yIncr] = validIndex ? inputData[inputIndex] - numeric_cast<int32_t>(inputOffset) : 0;
                                    }
                                }

                                // Transform the input tile: B^T is applied along each dimension
                                // with a 4-point transform; a degenerate dimension is passed
                                // through (possibly transposed).
                                std::vector<int32_t> transformedInput(4*4, 0);
                                std::vector<int32_t> tmpInput(4*4, 0);

                                for (unsigned int in_w = 0; in_w < 4; in_w++) {
                                    tmpInput[in_w + 0*4] = (w_dim == 4) ? Inputs[in_w + 4*0] - Inputs[in_w + 4*2] : Inputs[in_w*4 + 0];
                                    tmpInput[in_w + 1*4] = (w_dim == 4) ? Inputs[in_w + 4*1] + Inputs[in_w + 4*2] : Inputs[in_w*4 + 1];
                                    tmpInput[in_w + 2*4] = (w_dim == 4) ? -Inputs[in_w + 4*1] + Inputs[in_w + 4*2] : Inputs[in_w*4 + 2];
                                    tmpInput[in_w + 3*4] = (w_dim == 4) ? Inputs[in_w + 4*1] - Inputs[in_w + 4*3] : Inputs[in_w*4 + 3];
                                }

                                for (unsigned int in_h = 0; in_h < 4; in_h++) {
                                    transformedInput[in_h*4 + 0] = (h_dim == 4) ? tmpInput[0 + 4*in_h] - tmpInput[2 + 4*in_h] : tmpInput[in_h*4 + 0];
                                    transformedInput[in_h*4 + 1] = (h_dim == 4) ? tmpInput[1 + 4*in_h] + tmpInput[2 + 4*in_h] : tmpInput[in_h*4 + 1];
                                    transformedInput[in_h*4 + 2] = (h_dim == 4) ? -tmpInput[1 + 4*in_h] + tmpInput[2 + 4*in_h] : tmpInput[in_h*4 + 2];
                                    transformedInput[in_h*4 + 3] = (h_dim == 4) ? tmpInput[1 + 4*in_h] - tmpInput[3 + 4*in_h] : tmpInput[in_h*4 + 3];
                                }

                                // Convert to the WTFP format and multiply-accumulate
                                // in the WTFP domain.
                                for (unsigned int y = 0; y < 4; y++) {
                                    for (unsigned int x = 0; x < 4; x++) {
                                        const struct hw_float weight_fp = convert_to_S13E8M(numeric_cast<int32_t>(transformedFilter[x + 4*y]));
                                        const struct hw_float ifm_fp = convert_to_S12E8M(numeric_cast<int32_t>(transformedInput[x + 4*y]));
                                        const bool sign = (weight_fp.sign != ifm_fp.sign);
                                        uint64_t unsigned_ofm_value = numeric_cast<uint64_t>(ifm_fp.man) * numeric_cast<uint64_t>(weight_fp.man);
                                        uint64_t total_shift = numeric_cast<uint64_t>(weight_fp.exp) + numeric_cast<uint64_t>(ifm_fp.exp);
                                        unsigned_ofm_value <<= total_shift;
                                        if (sign)
                                        {
                                            // Two's-complement negation of the magnitude.
                                            unsigned_ofm_value = ~unsigned_ofm_value;
                                            unsigned_ofm_value += 1;
                                        }
                                        int64_t ofm_value = 0;
                                        memcpy(&ofm_value, &unsigned_ofm_value, sizeof(int64_t));
                                        MAC[x + 4*y] += ofm_value;
                                    }
                                }
                            }
                        }
                    }

                    // Create transformedFinal: apply A^T on each transform dimension, with
                    // ShiftRound() undoing the x2 filter scaling per dimension.
                    std::vector<int64_t> transformedFinal(h_stride*w_stride, 0);

                    if (h_dim == 1) { // Winograd 1x3
                        transformedFinal[0] += ShiftRound(MAC[0] + MAC[4] + MAC[8]);
                        transformedFinal[1] += ShiftRound(MAC[4] - MAC[8] - MAC[12]);
                        transformedFinal[2] += ShiftRound(MAC[1] + MAC[5] + MAC[9]);
                        transformedFinal[3] += ShiftRound(MAC[5] - MAC[9] - MAC[13]);
                        transformedFinal[4] += ShiftRound(MAC[2] + MAC[6] + MAC[10]);
                        transformedFinal[5] += ShiftRound(MAC[6] - MAC[10] - MAC[14]);
                        transformedFinal[6] += ShiftRound(MAC[3] + MAC[7] + MAC[11]);
                        transformedFinal[7] += ShiftRound(MAC[7] - MAC[11] - MAC[15]);
                    } else if (w_dim == 1) { // Winograd 3x1
                        transformedFinal[0] += ShiftRound(MAC[0] + MAC[1] + MAC[2]);
                        transformedFinal[1] += ShiftRound(MAC[4] + MAC[5] + MAC[6]);
                        transformedFinal[2] += ShiftRound(MAC[8] + MAC[9] + MAC[10]);
                        transformedFinal[3] += ShiftRound(MAC[12] + MAC[13] + MAC[14]);
                        transformedFinal[4] += ShiftRound(MAC[1] - MAC[2] - MAC[3]);
                        transformedFinal[5] += ShiftRound(MAC[5] - MAC[6] - MAC[7]);
                        transformedFinal[6] += ShiftRound(MAC[9] - MAC[10] - MAC[11]);
                        transformedFinal[7] += ShiftRound(MAC[13] - MAC[14] - MAC[15]);
                    } else { // Winograd 3x3
                        transformedFinal[0] += ShiftRound( ShiftRound(MAC[0] + MAC[4] + MAC[8])
                                                         + ShiftRound(MAC[1] + MAC[5] + MAC[9])
                                                         + ShiftRound(MAC[2] + MAC[6] + MAC[10]) );
                        transformedFinal[1] += ShiftRound( ShiftRound(MAC[1] + MAC[5] + MAC[9])
                                                         - ShiftRound(MAC[2] + MAC[6] + MAC[10])
                                                         - ShiftRound(MAC[3] + MAC[7] + MAC[11]) );
                        transformedFinal[2] += ShiftRound( ShiftRound(MAC[4] - MAC[8] - MAC[12])
                                                         + ShiftRound(MAC[5] - MAC[9] - MAC[13])
                                                         + ShiftRound(MAC[6] - MAC[10] - MAC[14]) );
                        transformedFinal[3] += ShiftRound( ShiftRound(MAC[5] - MAC[9] - MAC[13])
                                                         - ShiftRound(MAC[6] - MAC[10] - MAC[14])
                                                         - ShiftRound(MAC[7] - MAC[11] - MAC[15]) );
                    }

                    // Create output patches
                    for (unsigned int yOutIncr = 0; yOutIncr < h_stride; yOutIncr++)
                    {
                        for (unsigned int xOutIncr = 0; xOutIncr < w_stride; xOutIncr++)
                        {
                            unsigned int xOut = xOutput + xOutIncr;
                            unsigned int yOut = yOutput + yOutIncr;

                            unsigned int sumIndex = xOutIncr + w_stride*yOutIncr;
                            int64_t value = std::min<int64_t>(std::max<int64_t>(transformedFinal[sumIndex], std::numeric_limits<int32_t>::min()),
                                                              std::numeric_limits<int32_t>::max());

                            if (data.m_Parameters.m_BiasEnabled)
                            {
                                value += biasData[cOutput];
                            }
                            if (outputScale != 0.0f)
                            {
                                float multiplier = (inputScale * filterScale) / outputScale;

                                value = numeric_cast<int64_t>(
                                            EthosnRefQuantizedMultiplierSmallerThanOne(multiplier) * numeric_cast<int64_t>(value))
                                        + numeric_cast<int64_t>(outputOffset);
                                value = std::min<int64_t>(std::max<int64_t>(value, std::numeric_limits<InputType>::min()),
                                                          std::numeric_limits<InputType>::max());
                            }

                            if (yOut < outputHeight && xOut < outputWidth)
                            {
                                output.Get(batchIdx, cOutput, yOut, xOut) = numeric_cast<InputType>(value);
                            }
                        }
                    }
                }
            }
        }
    }
}

} // namespace armnn
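
// Illustrative sketch (not part of the original file): the 1-D Winograd F(2,3) identity that
// the transforms above implement in integer arithmetic, computing two outputs of a 3-tap
// convolution with 4 multiplies instead of 6. The filter transform uses 2*G, as in
// transformedFilter above, so each output is halved once at the end (cf. ShiftRound()).
// The function name and test values are hypothetical.
#include <cassert>

inline void WinogradF23Sketch()
{
    const int d[4] = {1, 2, 3, 4}; // input tile
    const int g[3] = {5, 6, 7};    // 3-tap filter

    // Input transform B^T d: the +/- pattern used by tmpInput/transformedInput above.
    const int td[4] = {d[0] - d[2], d[1] + d[2], -d[1] + d[2], d[1] - d[3]};
    // Filter transform (2*G) g: the pattern used by transformedFilter above.
    const int tg[4] = {2 * g[0], g[0] + g[1] + g[2], g[0] - g[1] + g[2], 2 * g[2]};

    // Element-wise products: what the MAC accumulation above computes per tile element.
    int m[4];
    for (int i = 0; i < 4; ++i)
    {
        m[i] = td[i] * tg[i];
    }

    // Output transform A^T m, halved once to undo the 2*G scaling (cf. ShiftRound).
    const int y0 = (m[0] + m[1] + m[2]) / 2;
    const int y1 = (m[1] - m[2] - m[3]) / 2;

    // Direct 3-tap convolution for comparison: both asserts hold (y0 == 38, y1 == 56).
    assert(y0 == d[0] * g[0] + d[1] * g[1] + d[2] * g[2]);
    assert(y1 == d[1] * g[0] + d[2] * g[1] + d[3] * g[2]);
    (void)y0; (void)y1; // silence unused warnings when NDEBUG is defined
}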