From 3ff7a4aed60fefababdced93a5aba91409a4050c Mon Sep 17 00:00:00 2001
From: Fredrik Svedberg <fredrik.svedberg@arm.com>
Date: Wed, 29 Sep 2021 10:08:04 +0200
Subject: MLBEDSW-5013 Output diff for u55-bring-up tests, int16

Fixed an output diff on some architectures caused by an incorrect IFM buffer
size calculation when using NearestNeighbour upscaling.

Signed-off-by: Fredrik Svedberg <fredrik.svedberg@arm.com>
Change-Id: I0d6d1efc606603cdd6188ae282e7f6babfd7e24e
---
 ethosu/vela/architecture_allocator.py | 32 ++++++++++++++++++++++----------
 ethosu/vela/scheduler.py              |  3 +--
 2 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/ethosu/vela/architecture_allocator.py b/ethosu/vela/architecture_allocator.py
index 32502e33..30e1c872 100644
--- a/ethosu/vela/architecture_allocator.py
+++ b/ethosu/vela/architecture_allocator.py
@@ -157,6 +157,10 @@ def _acc_type(npu_op_type: NpuBlockType, ifm_bits: int, scaled: bool) -> int:
     return acc_type
 
 
+def is_nearest(ifm_resampling: resampling_mode) -> bool:
+    return ifm_resampling == resampling_mode.NEAREST
+
+
 def to_upscale(ifm_resampling: resampling_mode) -> int:
     # Upscaling depending on resampling mode
     return 1 if ifm_resampling == resampling_mode.NONE else 2
@@ -170,26 +174,32 @@ def _ifm_blockdepth(arch, ifm_shape: Shape4D, ifm_bits: int, is_partkernel: bool
     return ifm_blockdepth
 
 
-def _required_size(value: int, stride: int, border: int, upscale: int) -> int:
-    return int(math.ceil(((value - 1) * stride + border) / upscale))
+def _required_size(value: int, stride: int, border: int, upscale: int, nearest: bool) -> int:
+    return int(math.ceil(((value - 1) * stride + border + nearest) / upscale))
 
 
-def get_ifm_area_required(ofm_shape: Shape4D, kernel: Kernel, upscale: int) -> Tuple[int, int]:
-    h1 = _required_size(ofm_shape.height, kernel.stride.y, kernel.area_height(), upscale)
-    w1 = _required_size(ofm_shape.width, kernel.stride.x, kernel.area_width(), upscale)
+def get_ifm_area_required(ofm_shape: Shape4D, kernel: Kernel, resampling_mode: resampling_mode) -> Tuple[int, int]:
+    upscale = to_upscale(resampling_mode)
+    nearest = is_nearest(resampling_mode)
+    h1 = _required_size(ofm_shape.height, kernel.stride.y, kernel.area_height(), upscale, nearest)
+    w1 = _required_size(ofm_shape.width, kernel.stride.x, kernel.area_width(), upscale, nearest)
     return (w1, h1)
 
 
 def _get_ifm_blocksize(
-    ofm_block: Shape4D, kernel: Kernel, ublock: Block, subkernel_limit: Block, upscale: int
+    ofm_block: Shape4D, kernel: Kernel, ublock: Block, subkernel_limit: Block, upscale: int, nearest: bool
 ) -> Shape4D:
     # IFM block height
-    h1 = _required_size(ofm_block.height, kernel.stride.y, min(kernel.area_height(), subkernel_limit.height), upscale)
+    h1 = _required_size(
+        ofm_block.height, kernel.stride.y, min(kernel.area_height(), subkernel_limit.height), upscale, nearest
+    )
     h2 = h1
     height = round_up(min(h1, h2), ublock.height)
 
     # IFM block width
-    w1 = _required_size(ofm_block.width, kernel.stride.x, min(kernel.area_width(), subkernel_limit.width), upscale)
+    w1 = _required_size(
+        ofm_block.width, kernel.stride.x, min(kernel.area_width(), subkernel_limit.width), upscale, nearest
+    )
     w2 = w1
     width = round_up(min(w1, w2), ublock.width)
 
@@ -248,6 +258,7 @@ def find_block_config(
         ifm_granule = arch.ifm_bank_granules[ifm_bits]
     lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
     upscale = to_upscale(ifm_resampling)
+    nearest = is_nearest(ifm_resampling)
 
     # Subkernel repeats of the IFM
     ifm_repeats = round_up_divide(kernel.area_width(), arch.SubKernelMax.width) * round_up_divide(
@@ -279,7 +290,7 @@ def find_block_config(
 
                 # Calculate the IFM block dimensions required to feed this OFM block
                 ofm_block = Shape4D(1, height, width, depth)
-                ifm_block = _get_ifm_blocksize(ofm_block, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale)
+                ifm_block = _get_ifm_blocksize(ofm_block, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
                 if not is_equal_depth_op:
                     ifm_block = ifm_block.with_depth(ifm_blockdepth)
 
@@ -396,8 +407,9 @@ def try_block_config(
         ifm_granule = arch.ifm_bank_granules[ifm_bits]
     lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
     upscale = to_upscale(ifm_resampling)
+    nearest = is_nearest(ifm_resampling)
     ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, is_partkernel)
-    ifm_block = _get_ifm_blocksize(block_config, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale)
+    ifm_block = _get_ifm_blocksize(block_config, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
     if not is_equal_depth_op:
         ifm_block = ifm_block.with_depth(ifm_blockdepth)
 
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index e4543e3a..044b246c 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -32,7 +32,6 @@ from . import weight_compressor
 from .architecture_allocator import ArchitectureBlockConfig
 from .architecture_allocator import find_block_config
 from .architecture_allocator import get_ifm_area_required
-from .architecture_allocator import to_upscale
 from .architecture_features import ArchitectureFeatures
 from .architecture_features import Block
 from .cascade_builder import CascadeBuilder
@@ -269,7 +268,7 @@ class SchedulerOperation:
         """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
         ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())
 
-        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, to_upscale(self.resampling_mode))
+        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)
 
     def _calculate_min_stripe_input(self) -> Shape4D:
         # Calculate the input volume required height and width for the smallest possible stripe (h,w = 1,1)
-- 
cgit v1.2.1