From b8060f51dcd6c128629e34855fdf46865ec165f5 Mon Sep 17 00:00:00 2001
From: Jacob Bohlin
Date: Mon, 9 Aug 2021 12:22:51 +0100
Subject: MLBEDSW-4738 Reinstate find_block_config from v2.1

Reinstated the v2.1.0 functionality for find_block_configs(). This is
used exclusively by the external API.

Signed-off-by: Jacob Bohlin
Change-Id: I6977f13866957edb083769658cc8c57c2b3556fb
---
 ethosu/vela/api.py                               | 84 +++++++++++++++++++++++-
 ethosu/vela/architecture_allocator.py            |  1 +
 ethosu/vela/register_command_stream_generator.py | 17 -----
 3 files changed, 82 insertions(+), 20 deletions(-)

diff --git a/ethosu/vela/api.py b/ethosu/vela/api.py
index e91c0bdb..e31c373a 100644
--- a/ethosu/vela/api.py
+++ b/ethosu/vela/api.py
@@ -447,9 +447,87 @@ def npu_find_block_configs(npu_op: NpuOperation, accelerator: NpuAccelerator) ->
     This function can be used to find a valid value for npu_op.block_config.
     The block config is the unit of work in which the NPU generates the OFM.
     """
-    from . import register_command_stream_generator
-
-    return register_command_stream_generator.find_block_configs(npu_op, accelerator)
+    from .architecture_features import Accelerator
+    from .architecture_features import ArchitectureFeatures
+    from .architecture_features import Block
+    from .architecture_features import create_default_arch
+    from .architecture_allocator import try_block_config
+    from .register_command_stream_generator import resampling_mode_map
+    from .register_command_stream_util import to_kernel
+    from .operation import NpuBlockType
+
+    is_partkernel = False
+    if isinstance(npu_op, NpuConv2DOperation):
+        block_type = NpuBlockType.ConvolutionMxN
+        is_partkernel = npu_op.block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST
+    elif isinstance(npu_op, NpuConvDepthWiseOperation):
+        block_type = NpuBlockType.ConvolutionDepthWise
+    elif isinstance(npu_op, NpuPoolingOperation):
+        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
+    elif isinstance(npu_op, NpuElementWiseOperation):
+        block_type = NpuBlockType.ElementWise
+    else:
+        assert 0, "Unsupported operation"
+
+    ifm_shape = Block(npu_op.ifm.shape.width, npu_op.ifm.shape.height, npu_op.ifm.shape.depth)
+    ifm2_shape = None
+    if npu_op.ifm2:
+        ifm2_shape = Block(npu_op.ifm2.shape.width, npu_op.ifm2.shape.height, npu_op.ifm2.shape.depth)
+    ofm_shape = Block(npu_op.ofm.shape.width, npu_op.ofm.shape.height, npu_op.ofm.shape.depth)
+
+    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
+    ifm_bits = npu_op.ifm.data_type.size_in_bits()
+    kernel = to_kernel(npu_op.kernel)
+    lut_banks = 0
+    if npu_op.activation:
+        lut_banks = 2 if npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP else 0
+
+    has_scaling = True
+    for tensor in [npu_op.ifm, npu_op.ifm2, npu_op.ofm]:
+        if tensor and tensor.quantization is None:
+            has_scaling = False
+            break
+
+    arch = create_default_arch(Accelerator.from_npu_accelerator(accelerator))
+
+    max_block_width = min(arch.ofm_block_max.width, ofm_shape.width)
+    max_block_height = min(arch.ofm_block_max.height, ofm_shape.height)
+    max_block_depth = min(arch.ofm_block_max.depth, ofm_shape.depth)
+
+    min_block_height = max(arch.ofm_ublock.height, 2 if ifm_resampling_mode != NpuResamplingMode.NONE else 1)
+    min_block_width = max(arch.ofm_ublock.width, 2 if ifm_resampling_mode != NpuResamplingMode.NONE else 1)
+
+    valid_block_configs = []
+    for w in range(min_block_width, max_block_width + min_block_width, min_block_width):
+        for h in range(min_block_height, max_block_height + min_block_height, min_block_height):
+            # Try valid OFM block depths
+            for c in range(arch.ofm_ublock.depth, max_block_depth + arch.ofm_ublock.depth, arch.ofm_ublock.depth):
+                # OFM block depth has the constraint that if it causes the OFM to be
+                # split, it must be a multiple of the OFM split size
+                if (c >= max_block_depth) or (c < max_block_depth and (c % ArchitectureFeatures.OFMSplitDepth) == 0):
+                    block = Block(w, h, c)
+                    config = try_block_config(
+                        block,
+                        arch,
+                        block_type,
+                        ofm_shape,
+                        ifm_shape,
+                        ifm2_shape,
+                        npu_op.ifm2_scalar is not None,
+                        ifm_bits,
+                        is_partkernel,
+                        kernel,
+                        lut_banks,
+                        has_scaling,
+                        ifm_resampling_mode,
+                    )
+
+                    if config:
+                        ofm_block = config.ofm_block
+                        valid_block_configs.append(NpuShape3D(ofm_block.height, ofm_block.width, ofm_block.depth))
+
+    assert len(valid_block_configs) > 0
+    return valid_block_configs
 
 
 def npu_generate_register_command_stream(npu_op_list: List[NpuOperation], accelerator: NpuAccelerator) -> List[int]:
diff --git a/ethosu/vela/architecture_allocator.py b/ethosu/vela/architecture_allocator.py
index 3c49eb13..32502e33 100644
--- a/ethosu/vela/architecture_allocator.py
+++ b/ethosu/vela/architecture_allocator.py
@@ -412,4 +412,5 @@ def try_block_config(
     config.layout = layout
     config.bank_size = arch.shram_bank_size
     config.ifm_block = ifm_block
+    config.ofm_block = block_config
     return config
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index b4a633e9..d61e5717 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -1025,23 +1025,6 @@ def generate_command_stream(
     return res
 
 
-# -------------------------------------------------------------------
-# EXTERNAL API
-# -------------------------------------------------------------------
-
-
-def find_block_configs(npu_op: NpuOperation, npu_accelerator: NpuAccelerator) -> List[NpuShape3D]:
-    """
-    Internal implementation of the public facing API for finding block configs.
-    """
-    if isinstance(npu_op, NpuBlockOperation):
-        # TODO: implement this function
-        arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
-        block = arch.ofm_ublock
-        return [NpuShape3D(height=block.height, width=block.width, depth=block.depth)]
-    return []
-
-
 def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
     """
     Internal implementation of the public facing API for generating an Ethos-U register command stream.
--
cgit v1.2.1
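
Usage note: the reinstated npu_find_block_configs() enumerates every OFM block
shape that is valid for the given operation on the given accelerator, so a
caller of the external API can pick one and assign it to npu_op.block_config
before generating the register command stream. The following is a minimal
sketch of that flow; the make_fm() helper and the concrete shapes, data type,
kernel and accelerator variant are illustrative assumptions, not part of this
commit.

    from ethosu.vela.api import (
        NpuAccelerator,
        NpuBlockTraversal,
        NpuConv2DOperation,
        NpuDataType,
        NpuFeatureMap,
        NpuKernel,
        NpuQuantization,
        NpuShape3D,
        npu_find_block_configs,
    )

    def make_fm(height: int, width: int, depth: int) -> NpuFeatureMap:
        # Hypothetical helper: fills in only the feature map fields that the
        # block config search reads (shape, element type and quantization).
        fm = NpuFeatureMap()
        fm.shape = NpuShape3D(height=height, width=width, depth=depth)
        fm.data_type = NpuDataType.INT8
        fm.quantization = NpuQuantization(scale_f32=1.0, zero_point=0)
        return fm

    # Assumed example: a 3x3 convolution over a 16x16x32 IFM producing a
    # 16x16x16 OFM.
    op = NpuConv2DOperation()
    op.ifm = make_fm(16, 16, 32)
    op.ofm = make_fm(16, 16, 16)
    op.kernel = NpuKernel(w=3, h=3)
    op.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST

    # Each returned NpuShape3D is a valid unit of OFM work; any one of them
    # may be assigned to op.block_config.
    configs = npu_find_block_configs(op, NpuAccelerator.Ethos_U55_128)
    op.block_config = configs[0]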