// Reference direct convolution (optionally depthwise) that accumulates in AccumulatorType and
// writes one InputType value per output element.
// NOTE(review): this listing is fragmentary - the leading numbers are the original file's line
// numbers, and several original lines (braces, branch conditions, the trailing term of some
// expressions, some parameters) are not present here. Comments below describe only what is visible.
//
// Visible parameters:
//   data       - workload descriptor; m_Parameters supplies bias-enabled flag, data layout,
//                padding, strides and dilations.
//   inputData  - flat input buffer, indexed by the inputIndex formulas below.
//   filterData - flat weight buffer, indexed by the filterIndex formulas below.
//   biasData   - per-output-channel bias, read as biasData[cOutput] when bias is enabled.
//   depthwise  - selects depthwise shape interpretation and indexing (default false).
43 template<
typename ConvData,
typename InputType,
typename WeightType,
typename BiasType,
typename AccumulatorType>
44 static void EthosnRefConvImpl(ConvData data,
45 const InputType* inputData,
48 const WeightType* filterData,
51 const BiasType* biasData,
55 bool depthwise =
false)
// Bias enabled but no bias buffer supplied; how this is handled is on lines not shown here.
57 if (data.m_Parameters.m_BiasEnabled && !biasData)
// Arguments of a call not shown in full (presumably the construction of `output`): output tensor 0
// of `data` and the configured data layout.
66 armnn::ethosnref::GetOutputTensorData<InputType>(0, data),
67 data.m_Parameters.m_DataLayout);
// Height/width positions inside a tensor shape depend on the data layout.
72 const unsigned int heightIndex = dataLayoutIndexed.
GetHeightIndex();
73 const unsigned int widthIndex = dataLayoutIndexed.
GetWidthIndex();
// Depthwise filters are read as [depthMultiplier, inputChannels, height, width]; otherwise
// dimension 0 is the output-channel count and the remaining extents follow the data layout.
75 unsigned int depthMultiplier = depthwise ? filterInfo.
GetShape()[0] : 1;
76 unsigned int inputChannels = depthwise ? filterInfo.
GetShape()[1] : filterInfo.
GetShape()[channelsIndex];
77 unsigned int outputChannels = depthwise ? inputChannels * depthMultiplier : filterInfo.
GetShape()[0];
79 unsigned int batchSize = outputInfo.
GetShape()[0];
80 unsigned int outputHeight = outputInfo.
GetShape()[heightIndex];
81 unsigned int outputWidth = outputInfo.
GetShape()[widthIndex];
82 unsigned int inputHeight = inputInfo.
GetShape()[heightIndex];
83 unsigned int inputWidth = inputInfo.
GetShape()[widthIndex];
85 unsigned int filterHeight = depthwise ? filterInfo.
GetShape()[2] : filterInfo.
GetShape()[heightIndex];
86 unsigned int filterWidth = depthwise ? filterInfo.
GetShape()[3] : filterInfo.
GetShape()[widthIndex];
88 unsigned int paddingTop = data.m_Parameters.m_PadTop;
89 unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
90 unsigned int xStride = data.m_Parameters.m_StrideX;
91 unsigned int yStride = data.m_Parameters.m_StrideY;
92 unsigned int xDilation = data.m_Parameters.m_DilationX;
93 unsigned int yDilation = data.m_Parameters.m_DilationY;
// Visit every output element: batch, output channel, output row, output column.
96 for (
unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
98 for (
unsigned int cOutput = 0; cOutput < outputChannels; cOutput++)
100 for (
unsigned int yOutput = 0; yOutput < outputHeight; yOutput++)
102 for (
unsigned int xOutput = 0; xOutput < outputWidth; xOutput++)
// Fresh, value-initialised accumulator for this output element.
105 AccumulatorType sum = AccumulatorType();
// In depthwise mode the loop bound is 1; cInput is then overwritten from cOutput below, so a
// single input channel feeds each output channel.
109 for (
unsigned int cInput = 0; cInput < (depthwise ? 1 : inputChannels); cInput++)
111 unsigned int depthwiseMultiplierIdx = 0;
// The guard around the next two assignments is not shown (presumably `if (depthwise)`).
114 cInput = cOutput / depthMultiplier;
115 depthwiseMultiplierIdx = cOutput % depthMultiplier;
118 for (
unsigned int yFilter = 0; yFilter < filterHeight; yFilter++)
120 for (
unsigned int xFilter = 0; xFilter < filterWidth; xFilter++)
// Three alternative flat-index formulas follow. The conditions selecting between them and the
// final term of each sum are on lines not shown here.
124 unsigned int filterIndex = 0;
// Variant 1: multiplier-major, then input channel, then row (uses depthwiseMultiplierIdx).
129 filterIndex = depthwiseMultiplierIdx * filterWidth * filterHeight * inputChannels +
130 cInput * filterWidth * filterHeight +
131 yFilter * filterWidth +
// Variant 2: output channel, then row, then column, with input channels innermost.
138 filterIndex = cOutput * filterHeight * filterWidth * inputChannels +
139 yFilter * filterWidth * inputChannels +
140 xFilter * inputChannels +
// Variant 3: output channel, then input channel, then row.
145 filterIndex = cOutput * filterWidth * filterHeight * inputChannels +
146 cInput * filterWidth * filterHeight +
147 yFilter * filterWidth +
// Weight minus a subtrahend that is cut off here (presumably the filter zero point - confirm).
152 AccumulatorType filterValue = filterData[filterIndex] -
// Coordinates in the padded input: output position scaled by stride plus dilated filter tap.
155 unsigned int yInput = yOutput * yStride + yFilter * yDilation;
156 unsigned int xInput = xOutput * xStride + xFilter * xDilation;
158 AccumulatorType inputValue;
// Taps that land in the padding border use a value-initialised AccumulatorType instead of reading
// inputData.
161 if (yInput < paddingTop || yInput >= inputHeight + paddingTop ||
162 xInput < paddingLeft || xInput >= inputWidth + paddingLeft )
164 inputValue = AccumulatorType();
// Two alternative flat-index formulas; the selecting condition is not shown (presumably the data
// layout). Padding is subtracted to map back to unpadded coordinates.
168 unsigned int inputIndex;
// Channels-innermost form (final term cut off here).
172 inputIndex = batchIdx * inputHeight * inputWidth * inputChannels +
173 (yInput - paddingTop) * inputWidth * inputChannels +
174 (xInput - paddingLeft) * inputChannels +
// Channels-outermost form (per batch).
180 inputIndex = batchIdx * inputWidth * inputHeight * inputChannels +
181 inputWidth * inputHeight * cInput +
182 inputWidth * (yInput - paddingTop) +
183 xInput - paddingLeft;
// Input minus a subtrahend that is cut off here (presumably the input zero point - confirm).
186 inputValue = inputData[inputIndex] -
190 sum += filterValue * inputValue;
// Bias is applied once per output element, indexed by output channel.
195 if (data.m_Parameters.m_BiasEnabled)
197 sum += biasData[cOutput];
// Requantisation is skipped when outputScale is zero. The statement applying `multiplier` is not
// shown; only its tail, which adds outputOffset, is visible.
200 if (outputScale != 0.0f)
202 float multiplier = (inputScale * filterScale) / outputScale;
206 + numeric_cast<AccumulatorType>(outputOffset);
// Saturate to the representable range of InputType before the narrowing store.
207 sum = std::min<AccumulatorType>(std::max<AccumulatorType>(sum, std::numeric_limits<InputType>::min()),
208 std::numeric_limits<InputType>::max());
211 output.
Get(batchIdx, cOutput, yOutput, xOutput) =
numeric_cast<InputType>(sum);
// Fragment (presumably the body of one_mask): sets the low `size` bits of mask.
// NOTE(review): `1u << size` is undefined when size is >= the bit width of unsigned int; lines not
// shown here may guard that case - confirm.
234 mask = (1u << size) - 1;
// Fragment (presumably the body of bit_extract): true when bit `pos` of `val` is set.
243 return (((val >> pos) & 1u) == 1u);
// Fragment (presumably bit_field): returns bits [high_bit:low_bit] of val, shifted down to bit 0.
// The function header and the declaration of `size` are on lines not shown here.
247 const uint8_t high_bit,
248 const uint8_t low_bit,
// Field width in bits, inclusive of both ends.
252 int src = high_bit + 1 - low_bit;
// NOTE(review): copies only the first byte of `src` in memory into `size`; that is the
// low-order byte on little-endian targets only - confirm intended platforms.
253 memcpy(&size, &src,
sizeof(uint8_t));
256 return ((val >> low_bit) &
one_mask(size));
// Fragment (presumably wtfp_round_mantissa): derives a rounded mantissa from unshifted_mantissa
// and a shift amount `exp`. The header and the first branch condition are on lines not shown here.
260 const uint32_t unshifted_mantissa,
263 uint32_t rounded_mantissa = 0;
// First branch (condition not shown): the mantissa is passed through unchanged.
267 rounded_mantissa = unshifted_mantissa;
// Shifted mantissa is already 0xff: result is pinned at 0xFF and no round bit is added.
269 else if ((unshifted_mantissa >> exp) == 0xff)
272 rounded_mantissa = 0xFF;
// Otherwise add the bit just below the kept part (bit exp-1) to the shifted mantissa.
// NOTE(review): `exp - 1` assumes exp > 0 here; presumably the first branch covers exp == 0.
278 uint32_t round_bit = (unshifted_mantissa >> (exp - 1)) & 1;
279 rounded_mantissa = (unshifted_mantissa >> exp) + round_bit;
282 return rounded_mantissa;
// Fragment (presumably from convert_to_S12E8M): obtains the unsigned bit patterns of `val` and of
// its negation. The function header and the rest of the body are not shown here.
288 uint32_t unsigned_val;
289 uint32_t unsigned_not_val;
// Two's-complement negation of val.
// NOTE(review): `~val + 1` overflows a signed 32-bit value when val is the minimum int32_t -
// confirm callers exclude it.
290 int32_t not_val = ~val + 1;
// memcpy copies the bit patterns into the unsigned variables without a value conversion.
292 memcpy(&unsigned_val, &val,
sizeof(uint32_t));
// NOTE(review): "¬_val" below looks like an HTML-entity mangling of "&not_val" (address of the
// variable declared above) - restore before compiling.
293 memcpy(&unsigned_not_val, ¬_val,
sizeof(uint32_t));
// Fragment (presumably from convert_to_S13E8M): obtains the unsigned bit patterns of `val` and of
// its negation. The function header and the rest of the body are not shown here.
323 uint32_t unsigned_val;
324 uint32_t unsigned_not_val;
// Two's-complement negation of val.
// NOTE(review): `~val + 1` overflows a signed 32-bit value when val is the minimum int32_t -
// confirm callers exclude it.
325 int32_t not_val = ~val + 1;
// memcpy copies the bit patterns into the unsigned variables without a value conversion.
327 memcpy(&unsigned_val, &val,
sizeof(uint32_t));
// NOTE(review): "¬_val" below looks like an HTML-entity mangling of "&not_val" (address of the
// variable declared above) - restore before compiling.
328 memcpy(&unsigned_not_val, ¬_val,
sizeof(uint32_t));
// Halves `value` using a right shift by one bit.
//   value - signed 64-bit quantity to halve.
//   round - when true, the dropped least-significant bit is added back to the shifted result, so
//           odd values round up; when false (default) the bit is simply discarded.
// The return statement and closing brace are on lines not shown here (presumably returns `out`).
363 inline int64_t
ShiftRound (int64_t value,
bool round=
false) {
// The bit that the shift below discards, or 0 when rounding is off.
365 int64_t round_bit = round? value & 0x1 : 0;
366 int64_t out = (value >> 1) + round_bit;
// Reference convolution computed tile-by-tile via filter/input/output transforms (the function
// name and the transform coefficients indicate a Winograd-style scheme), accumulating in int64_t.
// NOTE(review): this listing is fragmentary - the leading numbers are the original file's line
// numbers, and several original lines (braces, branch conditions, some parameters, the 2-D output
// transform) are not present here. Comments below describe only what is visible.
//
// Visible parameters:
//   data         - workload descriptor; m_Parameters supplies bias flag, layout, padding, strides,
//                  dilations.
//   inputData    - flat input buffer.
//   filterData   - flat weight buffer.
//   filterOffset - subtracted from every weight that is read.
//   biasData     - per-output-channel bias, read as biasData[cOutput] when bias is enabled.
//   outputOffset - output offset; its use is on lines not shown here.
370 template<
typename ConvData,
typename InputType,
typename WeightType,
typename BiasType>
371 static void EthosnRefWinogradConvImpl(ConvData data,
372 const InputType* inputData,
375 const WeightType* filterData,
377 int32_t filterOffset,
378 const BiasType* biasData,
380 int32_t outputOffset,
// Bias enabled but no bias buffer supplied; how this is handled is on lines not shown here.
383 if (data.m_Parameters.m_BiasEnabled && !biasData)
// Arguments of a call not shown in full (presumably the construction of `output`).
392 armnn::ethosnref::GetOutputTensorData<InputType>(0, data),
393 data.m_Parameters.m_DataLayout);
398 const unsigned int heightIndex = dataLayoutIndexed.
GetHeightIndex();
399 const unsigned int widthIndex = dataLayoutIndexed.
GetWidthIndex();
401 unsigned int inputChannels = filterInfo.
GetShape()[channelsIndex];
402 unsigned int outputChannels = filterInfo.
GetShape()[0];
404 unsigned int batchSize = outputInfo.
GetShape()[0];
405 unsigned int outputHeight = outputInfo.
GetShape()[heightIndex];
406 unsigned int outputWidth = outputInfo.
GetShape()[widthIndex];
407 unsigned int inputHeight = inputInfo.
GetShape()[heightIndex];
408 unsigned int inputWidth = inputInfo.
GetShape()[widthIndex];
410 unsigned int paddingTop = data.m_Parameters.m_PadTop;
411 unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
412 unsigned int xStride = data.m_Parameters.m_StrideX;
413 unsigned int yStride = data.m_Parameters.m_StrideY;
414 unsigned int xDilation = data.m_Parameters.m_DilationX;
415 unsigned int yDilation = data.m_Parameters.m_DilationY;
417 unsigned int filterHeight = filterInfo.
GetShape()[heightIndex];
418 unsigned int filterWidth = filterInfo.
GetShape()[widthIndex];
// The filter is processed in sub-kernels of extent 3 per dimension, or 1 when that filter
// dimension is exactly 1.
422 unsigned int decomposedHeight = (filterHeight == 1)? 1 : 3;
424 unsigned int decomposedWidth = (filterWidth == 1) ? 1 : 3;
// Tile geometry; the assignments inside the branches below are on lines not shown here.
426 unsigned int h_dim, w_dim, w_stride, h_stride;
427 if (filterHeight == 1) {
432 }
else if (filterWidth == 1) {
444 for (
unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
446 for (
unsigned int cOutput = 0; cOutput < outputChannels; cOutput++)
// Output is walked in tiles of h_stride x w_stride elements.
448 for (
unsigned int yOutput = 0; yOutput < outputHeight; yOutput=yOutput+h_stride)
450 for (
unsigned int xOutput = 0; xOutput < outputWidth; xOutput=xOutput+w_stride)
// 4x4 accumulators in the transformed domain, zeroed for each output tile.
453 std::vector<int64_t> MAC(4*4, 0);
455 for (
unsigned int cInput = 0; cInput < inputChannels; cInput++)
// Step through the filter one sub-kernel at a time. filterHeight_roundup / filterWidth_roundup are
// defined on lines not shown here.
458 for (
unsigned int yFilter= 0; yFilter < filterHeight_roundup; yFilter += decomposedHeight) {
459 for (
unsigned int xFilter= 0; xFilter < filterWidth_roundup; xFilter += decomposedWidth) {
// Offset-corrected weights of the current sub-kernel, row-major with row length decomposedWidth.
461 std::vector<int32_t> filterPreData(decomposedHeight*decomposedWidth, 0);
464 for (
unsigned int xIncrFilter= 0; xIncrFilter < decomposedWidth; xIncrFilter++) {
465 for (
unsigned int yIncrFilter= 0; yIncrFilter < decomposedHeight; yIncrFilter++) {
467 unsigned int filterIndex;
// Taps past the real filter extent (from rounding the extent up) are stored as 0 below.
468 bool validFilterIndex = ((yFilter + yIncrFilter) < filterHeight) && ((xFilter + xIncrFilter) < filterWidth);
// Two alternative flat-index formulas; the selecting condition is not shown (presumably the data
// layout). The first has input channels innermost and its final term is cut off here.
471 filterIndex = cOutput * filterHeight * filterWidth * inputChannels +
472 (yFilter + yIncrFilter)* filterWidth * inputChannels +
473 (xFilter + xIncrFilter)* inputChannels +
476 filterIndex = cOutput * filterWidth * filterHeight * inputChannels +
477 cInput * filterWidth * filterHeight +
478 (yFilter + yIncrFilter) * filterWidth +
479 xFilter + xIncrFilter;
482 filterPreData[xIncrFilter + decomposedWidth*yIncrFilter] = validFilterIndex ? filterData[filterIndex] -
numeric_cast<int32_t>(filterOffset) : 0;
// Filter transform. For taps (a, b, c) the 1-D transform yields (2a, a+b+c, a-b+c, 2c), which
// appears to be the Winograd F(2,3) filter transform scaled by 2 - confirm.
487 std::vector<int32_t> transformedFilter(4*4, 0);
// First 1-D case (its condition is not shown): the same four transformed values are written for
// every h.
490 for (
unsigned int h = 0; h < 4; h++) {
491 transformedFilter[h + 4*0] = 2*filterPreData[0];
492 transformedFilter[h + 4*1] = filterPreData[0] + filterPreData[1] + filterPreData[2];
493 transformedFilter[h + 4*2] = filterPreData[0] - filterPreData[1] + filterPreData[2];
494 transformedFilter[h + 4*3] = 2*filterPreData[2];
496 }
else if (w_dim == 1) {
// Second 1-D case: the same four transformed values are written for every w.
497 for (
unsigned int w = 0; w < 4; w++) {
498 transformedFilter[w*4 + 0] = 2*filterPreData[0];
499 transformedFilter[w*4 + 1] = filterPreData[0] + filterPreData[1] + filterPreData[2];
500 transformedFilter[w*4 + 2] = filterPreData[0] - filterPreData[1] + filterPreData[2];
501 transformedFilter[w*4 + 3] = 2*filterPreData[2];
// 2-D case: apply the 1-D transform down each column into tmpFilter, then along each row.
504 std::vector<int32_t> tmpFilter(h_dim*decomposedWidth, 0);
506 for (
unsigned int w_filter=0; w_filter < decomposedWidth; w_filter++) {
507 tmpFilter[w_filter + 0*decomposedWidth] = 2*filterPreData[w_filter];
508 tmpFilter[w_filter + 1*decomposedWidth] = filterPreData[w_filter] + filterPreData[1*decomposedWidth+w_filter] + filterPreData[2*decomposedWidth+w_filter];
509 tmpFilter[w_filter + 2*decomposedWidth] = filterPreData[w_filter] - filterPreData[1*decomposedWidth+w_filter] + filterPreData[2*decomposedWidth+w_filter];
510 tmpFilter[w_filter + 3*decomposedWidth] = 2*filterPreData[2*decomposedWidth+w_filter];
513 for (
unsigned int h_filter=0; h_filter < h_dim; h_filter++) {
514 transformedFilter[h_filter*w_dim + 0] = 2*tmpFilter[h_filter*decomposedWidth];
515 transformedFilter[h_filter*w_dim + 1] = tmpFilter[h_filter*decomposedWidth] + tmpFilter[1+h_filter*decomposedWidth] + tmpFilter[2+h_filter*decomposedWidth];
516 transformedFilter[h_filter*w_dim + 2] = tmpFilter[h_filter*decomposedWidth] - tmpFilter[1+h_filter*decomposedWidth] + tmpFilter[2+h_filter*decomposedWidth];
517 transformedFilter[h_filter*w_dim + 3] = 2*tmpFilter[2+h_filter*decomposedWidth];
// Gather the 4x4 input patch that feeds this output tile and sub-kernel.
522 std::vector<int32_t> Inputs(4*4, 0);
525 for (
unsigned int yIncr= 0; yIncr < 4; yIncr++)
527 for (
unsigned int xIncr = 0; xIncr < 4; xIncr++)
// Coordinates in the padded input.
529 unsigned int yInput = yOutput * yStride + (yFilter + yIncr)*yDilation;
530 unsigned int xInput = xOutput * xStride + (xFilter + xIncr)*xDilation;
// Positions outside the real input (padding border or beyond) contribute 0 below.
532 bool validIndex = (xInput >= paddingLeft) && (xInput < inputWidth + paddingLeft) &&
533 (yInput >= paddingTop) && (yInput < inputHeight + paddingTop);
// Two alternative flat-index formulas; the selecting condition is not shown (presumably the data
// layout). The first has channels innermost and its final term is cut off here.
535 unsigned int inputIndex;
537 inputIndex = batchIdx * inputHeight * inputWidth * inputChannels +
538 (yInput - paddingTop) * inputWidth * inputChannels +
539 (xInput - paddingLeft) * inputChannels +
543 inputIndex = batchIdx * inputWidth * inputHeight * inputChannels +
544 inputWidth * inputHeight * cInput +
545 inputWidth * (yInput - paddingTop) +
546 xInput - paddingLeft;
// When h_dim == 1 the patch is stored transposed (x-major); otherwise row-major.
548 if (h_dim ==1) Inputs[yIncr + 4*xIncr] = validIndex? inputData[inputIndex] -
numeric_cast<int32_t>(inputOffset) : 0;
549 else Inputs[xIncr + 4*yIncr] = validIndex? inputData[inputIndex] -
numeric_cast<int32_t>(inputOffset) : 0;
// Input transform in two passes. Where the dimension flag equals 4, a run d0..d3 becomes
// (d0-d2, d1+d2, d2-d1, d1-d3); otherwise values are copied (the first pass reads transposed).
553 std::vector<int32_t> transformedInput(4*4, 0);
554 std::vector<int32_t> tmpInput(4*4, 0);
556 for (
unsigned int in_w=0; in_w < 4; in_w++) {
557 tmpInput[in_w + 0*4] = (w_dim == 4)? Inputs[in_w + 4*0] - Inputs[in_w + 4*2] : Inputs[in_w*4 + 0];
558 tmpInput[in_w + 1*4] = (w_dim == 4)? Inputs[in_w + 4*1] + Inputs[in_w + 4*2] : Inputs[in_w*4 + 1];
559 tmpInput[in_w + 2*4] = (w_dim == 4)? -Inputs[in_w + 4*1] + Inputs[in_w + 4*2] : Inputs[in_w*4 + 2];
560 tmpInput[in_w + 3*4] = (w_dim == 4)? Inputs[in_w + 4*1] - Inputs[in_w + 4*3] : Inputs[in_w*4 + 3];
563 for (
unsigned int in_h=0; in_h < 4; in_h++) {
564 transformedInput[in_h*4 + 0] = (h_dim == 4)? tmpInput[0 + 4*in_h] - tmpInput[2 + 4*in_h] : tmpInput[in_h*4 + 0];
565 transformedInput[in_h*4 + 1] = (h_dim == 4)? tmpInput[1 + 4*in_h] + tmpInput[2 + 4*in_h] : tmpInput[in_h*4 + 1];
566 transformedInput[in_h*4 + 2] = (h_dim == 4)? -tmpInput[1 + 4*in_h] + tmpInput[2 + 4*in_h] : tmpInput[in_h*4 + 2];
567 transformedInput[in_h*4 + 3] = (h_dim == 4)? tmpInput[1 + 4*in_h] - tmpInput[3 + 4*in_h] : tmpInput[in_h*4 + 3];
// Element-wise multiply-accumulate in the transformed domain. weight_fp and ifm_fp are defined on
// lines not shown here (presumably sign/exponent/mantissa forms of the transformed filter and
// input values).
573 for (
unsigned int y= 0; y < 4 ; y++) {
574 for (
unsigned int x = 0; x < 4; x++) {
// Product is negative exactly when the operand signs differ.
577 const bool sign = (weight_fp.
sign != ifm_fp.
sign);
// Magnitude: product of the mantissas, shifted left by the sum of the exponents.
578 uint64_t unsigned_ofm_value =
numeric_cast<uint64_t>(ifm_fp.
man) * numeric_cast<uint64_t>(weight_fp.
man);
579 uint64_t total_shift =
numeric_cast<uint64_t>(weight_fp.
exp) + numeric_cast<uint64_t>(ifm_fp.
exp);
580 unsigned_ofm_value <<= total_shift;
// Two's-complement negation of the magnitude; the guarding condition is not shown (presumably
// `if (sign)`).
583 unsigned_ofm_value = ~unsigned_ofm_value;
584 unsigned_ofm_value += 1;
// Copy the bit pattern into a signed 64-bit value and accumulate.
586 int64_t ofm_value = 0;
587 memcpy(&ofm_value, &unsigned_ofm_value,
sizeof(int64_t));
588 MAC[x + 4*y] += ofm_value;
// Output transform: a run m0..m3 collapses to (m0+m1+m2, m1-m2-m3), each passed through ShiftRound
// with rounding off (default argument).
596 std::vector<int64_t> transformedFinal(h_stride*w_stride, 0);
// First 1-D case (its condition is not shown): collapse along y for each x (MAC index step 4).
599 transformedFinal[0] +=
ShiftRound(MAC[0] + MAC[4] + MAC[8]);
600 transformedFinal[1] +=
ShiftRound(MAC[4] - MAC[8] - MAC[12]);
601 transformedFinal[2] +=
ShiftRound(MAC[1] + MAC[5] + MAC[9]);
602 transformedFinal[3] +=
ShiftRound(MAC[5] - MAC[9] - MAC[13]);
603 transformedFinal[4] +=
ShiftRound(MAC[2] + MAC[6] + MAC[10]);
604 transformedFinal[5] +=
ShiftRound(MAC[6] - MAC[10] - MAC[14]);
605 transformedFinal[6] +=
ShiftRound(MAC[3] + MAC[7] + MAC[11]);
606 transformedFinal[7] +=
ShiftRound(MAC[7] - MAC[11] - MAC[15]);
607 }
else if (w_dim == 1) {
// Second 1-D case: collapse along x for each y (MAC index step 1). The remaining (2-D) case is on
// lines not shown here.
608 transformedFinal[0] +=
ShiftRound(MAC[0] + MAC[1] + MAC[2]);
609 transformedFinal[1] +=
ShiftRound(MAC[4] + MAC[5] + MAC[6]);
610 transformedFinal[2] +=
ShiftRound(MAC[8] + MAC[9] + MAC[10]);
611 transformedFinal[3] +=
ShiftRound(MAC[12] + MAC[13] + MAC[14]);
612 transformedFinal[4] +=
ShiftRound(MAC[1] - MAC[2] - MAC[3]);
613 transformedFinal[5] +=
ShiftRound(MAC[5] - MAC[6] - MAC[7]);
614 transformedFinal[6] +=
ShiftRound(MAC[9] - MAC[10] - MAC[11]);
615 transformedFinal[7] +=
ShiftRound(MAC[13] - MAC[14] - MAC[15]);
// Write the finished tile to the output tensor.
632 for (
unsigned int yOutIncr= 0; yOutIncr < h_stride; yOutIncr++)
634 for (
unsigned int xOutIncr = 0; xOutIncr < w_stride; xOutIncr++)
636 unsigned int xOut = xOutput + xOutIncr;
637 unsigned int yOut = yOutput + yOutIncr;
639 unsigned int sumIndex = xOutIncr + w_stride*yOutIncr;
// Saturate the accumulator to the int32_t range before bias is added.
640 int64_t value = std::min<int64_t>(std::max<int64_t>(transformedFinal[sumIndex], std::numeric_limits<int32_t>::min()),
641 std::numeric_limits<int32_t>::max());
643 if (data.m_Parameters.m_BiasEnabled)
645 value += biasData[cOutput];
// Requantisation is skipped when outputScale is zero; the statements applying `multiplier` are
// on lines not shown here.
647 if (outputScale != 0.0f)
649 float multiplier = (inputScale*filterScale) / outputScale;
// Saturate to the representable range of InputType before the narrowing store.
654 value = std::min<int64_t>(std::max<int64_t>(value, std::numeric_limits<InputType>::min()),
655 std::numeric_limits<InputType>::max());
// Tiles may overhang the output edge, so only in-range positions are stored.
658 if (yOut < outputHeight && xOut < outputWidth) output.
Get(batchIdx, cOutput, yOut, xOut) =
numeric_cast<InputType>(value);
int32_t operator*(int32_t rhs) const
The implementation of this function is adapted from Android NN's MultiplyByEthosnRefQuantizedMultipli...
struct hw_float convert_to_S12E8M(const int32_t val)
uint32_t bit_field(const uint8_t high_bit, const uint8_t low_bit, const uint32_t val)
unsigned int GetWidthIndex() const
uint32_t one_mask(const uint8_t size)
const TensorShape & GetShape() const
uint32_t wtfp_round_mantissa(const uint32_t unshifted_mantissa, const uint32_t exp)
Performs multiplication of an integer with a multiplier which is less than one, using quantized integ...
Copyright (c) 2021 ARM Limited and Contributors.
unsigned int GetHeightIndex() const
uint32_t DivideRoundUp(uint32_t numerator, uint32_t denominator)
DataType & Get(unsigned int b, unsigned int c, unsigned int h, unsigned int w) const
const TensorInfo & GetTensorInfo(const ITensorHandle *tensorHandle)
Provides access to the appropriate indexes for Channels, Height and Width based on DataLayout...
bool bit_extract(const uint8_t pos, const uint32_t val)
struct hw_float convert_to_S13E8M(const int32_t val)
int64_t ShiftRound(int64_t value, bool round=false)
EthosnRefQuantizedMultiplierSmallerThanOne(float multiplier)
Constructs a EthosnRefQuantizedMultiplierSmallerThanOne which will multiply by the given multiplier...
std::enable_if_t< std::is_unsigned< Source >::value &&std::is_unsigned< Dest >::value, Dest > numeric_cast(Source source)
unsigned int GetChannelsIndex() const