about summary refs log tree commit diff
diff options
context:
space:
mode:
author Fredrik Svedberg <fredrik.svedberg@arm.com> 2021-09-29 10:08:04 +0200
committer Fredrik Svedberg <fredrik.svedberg@arm.com> 2021-10-01 07:56:18 +0000
commit 3ff7a4aed60fefababdced93a5aba91409a4050c (patch)
tree 1dfd999beb51fec093d6ce4eb6f68f7bca41b32d
parent 008cd10f6f5b8838de1f37c8c42899ace1c1cf0d (diff)
download ethos-u-vela-3ff7a4aed60fefababdced93a5aba91409a4050c.tar.gz
MLBEDSW-5013 Output diff for u55-bring-up tests, int16
Fixed output diff for some architectures due to incorrect IFM buffer size calculation when using NearestNeighbour upscaling.

Signed-off-by: Fredrik Svedberg <fredrik.svedberg@arm.com>
Change-Id: I0d6d1efc606603cdd6188ae282e7f6babfd7e24e
-rw-r--r-- ethosu/vela/architecture_allocator.py 32
-rw-r--r-- ethosu/vela/scheduler.py 3
2 files changed, 23 insertions, 12 deletions
diff --git a/ethosu/vela/architecture_allocator.py b/ethosu/vela/architecture_allocator.py
index 32502e3..30e1c87 100644
--- a/ethosu/vela/architecture_allocator.py
+++ b/ethosu/vela/architecture_allocator.py
@@ -157,6 +157,10 @@ def _acc_type(npu_op_type: NpuBlockType, ifm_bits: int, scaled: bool) -> int:
return acc_type
+def is_nearest(ifm_resampling: resampling_mode) -> bool:
+ return ifm_resampling == resampling_mode.NEAREST
+
+
def to_upscale(ifm_resampling: resampling_mode) -> int:
# Upscaling depending on resampling mode
return 1 if ifm_resampling == resampling_mode.NONE else 2
@@ -170,26 +174,32 @@ def _ifm_blockdepth(arch, ifm_shape: Shape4D, ifm_bits: int, is_partkernel: bool
return ifm_blockdepth
-def _required_size(value: int, stride: int, border: int, upscale: int) -> int:
- return int(math.ceil(((value - 1) * stride + border) / upscale))
+def _required_size(value: int, stride: int, border: int, upscale: int, nearest: bool) -> int:
+ return int(math.ceil(((value - 1) * stride + border + nearest) / upscale))
-def get_ifm_area_required(ofm_shape: Shape4D, kernel: Kernel, upscale: int) -> Tuple[int, int]:
- h1 = _required_size(ofm_shape.height, kernel.stride.y, kernel.area_height(), upscale)
- w1 = _required_size(ofm_shape.width, kernel.stride.x, kernel.area_width(), upscale)
+def get_ifm_area_required(ofm_shape: Shape4D, kernel: Kernel, resampling_mode: resampling_mode) -> Tuple[int, int]:
+ upscale = to_upscale(resampling_mode)
+ nearest = is_nearest(resampling_mode)
+ h1 = _required_size(ofm_shape.height, kernel.stride.y, kernel.area_height(), upscale, nearest)
+ w1 = _required_size(ofm_shape.width, kernel.stride.x, kernel.area_width(), upscale, nearest)
return (w1, h1)
def _get_ifm_blocksize(
- ofm_block: Shape4D, kernel: Kernel, ublock: Block, subkernel_limit: Block, upscale: int
+ ofm_block: Shape4D, kernel: Kernel, ublock: Block, subkernel_limit: Block, upscale: int, nearest: bool
) -> Shape4D:
# IFM block height
- h1 = _required_size(ofm_block.height, kernel.stride.y, min(kernel.area_height(), subkernel_limit.height), upscale)
+ h1 = _required_size(
+ ofm_block.height, kernel.stride.y, min(kernel.area_height(), subkernel_limit.height), upscale, nearest
+ )
h2 = h1
height = round_up(min(h1, h2), ublock.height)
# IFM block width
- w1 = _required_size(ofm_block.width, kernel.stride.x, min(kernel.area_width(), subkernel_limit.width), upscale)
+ w1 = _required_size(
+ ofm_block.width, kernel.stride.x, min(kernel.area_width(), subkernel_limit.width), upscale, nearest
+ )
w2 = w1
width = round_up(min(w1, w2), ublock.width)
@@ -248,6 +258,7 @@ def find_block_config(
ifm_granule = arch.ifm_bank_granules[ifm_bits]
lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
upscale = to_upscale(ifm_resampling)
+ nearest = is_nearest(ifm_resampling)
# Subkernel repeats of the IFM
ifm_repeats = round_up_divide(kernel.area_width(), arch.SubKernelMax.width) * round_up_divide(
@@ -279,7 +290,7 @@ def find_block_config(
# Calculate the IFM block dimensions required to feed this OFM block
ofm_block = Shape4D(1, height, width, depth)
- ifm_block = _get_ifm_blocksize(ofm_block, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale)
+ ifm_block = _get_ifm_blocksize(ofm_block, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
if not is_equal_depth_op:
ifm_block = ifm_block.with_depth(ifm_blockdepth)
@@ -396,8 +407,9 @@ def try_block_config(
ifm_granule = arch.ifm_bank_granules[ifm_bits]
lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
upscale = to_upscale(ifm_resampling)
+ nearest = is_nearest(ifm_resampling)
ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, is_partkernel)
- ifm_block = _get_ifm_blocksize(block_config, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale)
+ ifm_block = _get_ifm_blocksize(block_config, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
if not is_equal_depth_op:
ifm_block = ifm_block.with_depth(ifm_blockdepth)
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index e4543e3..044b246 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -32,7 +32,6 @@ from . import weight_compressor
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_allocator import find_block_config
from .architecture_allocator import get_ifm_area_required
-from .architecture_allocator import to_upscale
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .cascade_builder import CascadeBuilder
@@ -269,7 +268,7 @@ class SchedulerOperation:
"""Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())
- return get_ifm_area_required(ofm_shape_to_produce, self.kernel, to_upscale(self.resampling_mode))
+ return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)
def _calculate_min_stripe_input(self) -> Shape4D:
# Calculate the input volume required height and width for the smallest possible stripe (h,w = 1,1)