From 09387e207aa736c464cf95c8a57609aa21b65d44 Mon Sep 17 00:00:00 2001 From: Diqing Zhong Date: Mon, 28 Sep 2020 18:46:22 +0200 Subject: MLBEDSW-3146: Cycle estimation for conv/pooling ops Signed-off-by: Diqing Zhong Change-Id: Ic6ae795a1626d1cdf63a69d2ff86f7cd898f3134 --- ethosu/vela/npu_performance.py | 174 ++++++++++++++++++++++++++------ ethosu/vela/shared_buffer_allocation.py | 12 ++- 2 files changed, 150 insertions(+), 36 deletions(-) diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index 24b4c68a..4d221bea 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -24,13 +24,14 @@ import enum import numpy as np from . import numeric_util +from .architecture_features import Accelerator from .architecture_features import Block -from .architecture_features import SHRAMElements from .data_type import DataType from .nn_graph import PassPlacement from .nn_graph import SchedulerRewrite from .operation import NpuBlockType from .operation import Op +from .shared_buffer_allocation import is_acc_40bits_used from .tensor import MemArea from .tensor import shape_num_elements from .tensor import TensorBlockTraversal @@ -212,22 +213,20 @@ def get_n_blocks_and_area( return total_blocks, total_area, block_setup -def get_output_cycle_estimate(arch, ps): - primary_op = ps.primary_op - assert primary_op - npu_block_type = primary_op.type.npu_block_type +def get_output_cycle_estimate( + arch, npu_block_type, primary_op, num_elems, ifm_tensor, ofm_tensor, ifm2_tensor, use_acc_40bits=False +): faf = primary_op.activation - - if npu_block_type == NpuBlockType.ElementWise and ps.ifm_tensor.dtype == DataType.int32: - if ps.ifm2_tensor is None: + if npu_block_type == NpuBlockType.ElementWise and ifm_tensor.dtype == DataType.int32: + if ifm2_tensor is None: # Unary op output_perf_index = 0 else: # Binary op output_perf_index = 1 - elif ps.primary_op.type == Op.Mul and ps.ofm_tensor.dtype == DataType.int32: + elif primary_op.type == Op.Mul and ofm_tensor.dtype == DataType.int32: output_perf_index = 2 - elif ps.primary_op.type == Op.Mul or ( + elif primary_op.type == Op.Mul or ( npu_block_type in ( NpuBlockType.ConvolutionMxN, @@ -236,13 +235,13 @@ def get_output_cycle_estimate(arch, ps): NpuBlockType.ReduceSum, NpuBlockType.VectorProduct, ) - and ps.shared_buffer.use_accumulator_element == SHRAMElements.Acc40 + and use_acc_40bits ): output_perf_index = 3 - elif ps.primary_op.type in (Op.Add, Op.Sub): - input_scale = ps.ifm_tensor.quantization.scale_f32 - input2_scale = ps.ifm2_tensor.quantization.scale_f32 - output_scale = ps.ofm_tensor.quantization.scale_f32 + elif primary_op.type in (Op.Add, Op.Sub): + input_scale = ifm_tensor.quantization.scale_f32 + input2_scale = ifm2_tensor.quantization.scale_f32 + output_scale = ofm_tensor.quantization.scale_f32 if "resizebilinear" in primary_op.attrs: output_scale = input2_scale @@ -253,7 +252,7 @@ def get_output_cycle_estimate(arch, ps): else: # Advanced Add/Sub output_perf_index = 5 - elif ps.primary_op.type.is_maxpool_op(): + elif primary_op.type.is_maxpool_op(): output_perf_index = 6 else: output_perf_index = 7 @@ -265,13 +264,95 @@ def get_output_cycle_estimate(arch, ps): else: activation_perf_index = 2 - num_elems = ps.outputs[0].elements() cycle_per_elem = max( arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index] ) return num_elems * cycle_per_elem +def get_conv_pooling_cycle_estimate( + arch, npu_block_type, primary_op, block_config: Block, block_traversal, 
kernel_dims, ifm_tensor, ofm_tensor +): + num_ublk = ( + (block_config.width // arch.config.ofm_ublock.width) + * (block_config.height // arch.config.ofm_ublock.height) + * (block_config.depth // arch.config.ofm_ublock.depth) + ) + num_ofm_blk = 0 + total_cycles = 0 + num_elems_blk = block_config.width * block_config.height * block_config.depth + ifm_tens_shape = numeric_util.full_shape(4, ifm_tensor.shape, 1) + ofm_tens_shape = numeric_util.full_shape(4, ofm_tensor.shape, 1) + use_acc_40bits = is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor) + + sub_kernel_limits = arch.sub_kernel_limits[npu_block_type] + n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0]) + n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1]) + sub_kernel_x = [ + min((kernel_dims[1] - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x) + ] + sub_kernel_y = [ + min((kernel_dims[0] - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y) + ] + sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x) + + ifm_blk_depth = 0 + if npu_block_type != NpuBlockType.Pooling: + if ifm_tensor.dtype.size_in_bits() == 16 or block_traversal == TensorBlockTraversal.PartKernelFirst: + ifm_blk_depth = 16 + elif ifm_tensor.dtype.size_in_bits() == 8: + ifm_blk_depth = 32 + else: + ifm_blk_depth = 8 + + cycles_dpu_blk = 0 + + for num_kernel_elems in sub_kernel_size: + if npu_block_type == NpuBlockType.Pooling: + cycles = max(4, num_kernel_elems) * num_ublk + if ifm_tensor.dtype.size_in_bits() == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32: + cycles *= 2 + elif npu_block_type == NpuBlockType.ConvolutionDepthWise: + cycles = 4 * numeric_util.round_up_divide(num_kernel_elems, 4) * num_ublk + if ifm_tensor.dtype.size_in_bits() == 16: + cycles *= 2 + elif ( + (npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal != TensorBlockTraversal.PartKernelFirst) + or npu_block_type == NpuBlockType.VectorProduct + or npu_block_type == NpuBlockType.ReduceSum + ): + cycles = 4 * num_kernel_elems * num_ublk * numeric_util.round_up_divide(ifm_tens_shape[3], ifm_blk_depth) + else: + assert block_traversal == TensorBlockTraversal.PartKernelFirst + divider = 2 if ifm_tensor.dtype.size_in_bits() == 16 else 4 + cycles = 4 * ( + numeric_util.round_up_divide(num_kernel_elems, divider) + * numeric_util.round_up_divide(ifm_blk_depth, 8) + * num_ublk + * numeric_util.round_up_divide(ifm_tens_shape[3], ifm_blk_depth) + ) + cycles_dpu_blk += cycles + + cycles_dpu_blk /= arch.ncores + + num_ofm_blk = ( + numeric_util.round_up_divide(ofm_tens_shape[1], block_config.height) + * numeric_util.round_up_divide(ofm_tens_shape[2], block_config.width) + * numeric_util.round_up_divide(ofm_tens_shape[3], block_config.depth) + ) + + cycles_output_blk = get_output_cycle_estimate( + arch, npu_block_type, primary_op, num_elems_blk, ifm_tensor, ofm_tensor, None, use_acc_40bits + ) + + if cycles_dpu_blk > cycles_output_blk: + total_cycles = cycles_dpu_blk * num_ofm_blk + cycles_output_blk + else: + total_cycles = cycles_output_blk * num_ofm_blk + cycles_dpu_blk + + return total_cycles + + def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], force_outputs_to_fast_storage=False): if block_config is None: block_config = ps.block_config @@ -302,7 +383,12 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f ifm_tensor, _, weight_tensor, ofm_tensor = 
ps.get_primary_op_ifm_ifm2_weights_ofm() if npu_block_type in set( - (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling) + ( + NpuBlockType.ConvolutionMxN, + NpuBlockType.ConvolutionDepthWise, + NpuBlockType.Pooling, + NpuBlockType.ReduceSum, + ) ): # extent the ifm to full dimension ifm_tensor_brick_size = tuple(numeric_util.full_shape(4, list(ifm_tensor.brick_size), 1)) @@ -316,12 +402,22 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f ifm_tensor_shape[1] += explicit_padding[0] + explicit_padding[2] # height += top and bottom ifm_tensor_shape[2] += explicit_padding[1] + explicit_padding[3] # width += left and right + block_traversal = TensorBlockTraversal.Default + strides = primary_op.attrs["strides"] if npu_block_type != NpuBlockType.Pooling: - weight_tensor_shape = weight_tensor.shape - weight_tensor_bandwidth_shape = weight_tensor.bandwidth_shape - weight_tensor_element_size = weight_tensor.element_size() - weight_tensor_bandwidth_compression_scale = weight_tensor.bandwidth_compression_scale + if npu_block_type == NpuBlockType.ReduceSum: + block_traversal = TensorBlockTraversal.DepthFirst + weight_tensor_shape = [1, 1, ifm_tensor.shape[3], ofm_tensor.shape[3]] + weight_tensor_bandwidth_shape = [0] * 4 + weight_tensor_element_size = 0 + weight_tensor_bandwidth_compression_scale = 0.0 + else: + block_traversal = weight_tensor.block_traversal + weight_tensor_shape = weight_tensor.shape + weight_tensor_bandwidth_shape = weight_tensor.bandwidth_shape + weight_tensor_element_size = weight_tensor.element_size() + weight_tensor_bandwidth_compression_scale = weight_tensor.bandwidth_compression_scale nn_ops = ( int(ofm_tensor.shape[0]) * int(ofm_tensor.shape[1]) @@ -394,7 +490,7 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f n_kernel_xy = kernel_dims[0] * kernel_dims[1] n_input_channels_at_a_time = block_config[2] - if npu_block_type == NpuBlockType.Pooling or weight_tensor.block_traversal in set( + if npu_block_type == NpuBlockType.Pooling or block_traversal in set( (TensorBlockTraversal.PartKernelFirst, TensorBlockTraversal.DepthWise) ): n_input_channels_at_a_time = numeric_util.round_up_divide(n_input_channels_at_a_time, 4) @@ -416,14 +512,18 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f * n_kernel_xy ) - if npu_block_type == NpuBlockType.Pooling: - # TODO: improve pooling estimation - cycles[PassCycles.Dpu] = num_mac_ops / arch.num_macs_per_cycle / 2 - else: - cycles[PassCycles.Dpu] = num_mac_ops / arch.num_macs_per_cycle macs[MacCount.NeuralNetworkMacs] += nn_ops macs[MacCount.HardwareMacs] += num_mac_ops - + cycles[PassCycles.Dpu] = get_conv_pooling_cycle_estimate( + arch, + npu_block_type, + primary_op, + Block(block_config[1], block_config[0], block_config[3]), + block_traversal, + kernel_dims, + ifm_tensor, + ofm_tensor, + ) elif npu_block_type == NpuBlockType.VectorProduct: nn_macs = ( ifm_tensor.shape[0] @@ -432,7 +532,16 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f ) num_mac_ops = nn_macs - cycles[PassCycles.Dpu] = num_mac_ops / arch.num_macs_per_cycle + cycles[PassCycles.Dpu] = get_conv_pooling_cycle_estimate( + arch, + npu_block_type, + primary_op, + Block(block_config[1], block_config[0], block_config[3]), + weight_tensor.block_traversal, + [1, 1], + ifm_tensor, + ofm_tensor, + ) macs[MacCount.NeuralNetworkMacs] += nn_macs macs[MacCount.HardwareMacs] += num_mac_ops @@ -449,8 +558,9 @@ def 
performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f weight_read_multiple = non_zero_fraction elif npu_block_type == NpuBlockType.ElementWise: # Work out how many elements we have and calculate performance. - cycles[PassCycles.ElementWise] = get_output_cycle_estimate(arch, ps) - + cycles[PassCycles.ElementWise] = get_output_cycle_estimate( + arch, npu_block_type, primary_op, ofm_tensor.elements(), ps.ifm_tensor, ps.ofm_tensor, ps.ifm2_tensor + ) # apply the desired rewrites for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list: if ps != ps_to_rewrite: diff --git a/ethosu/vela/shared_buffer_allocation.py b/ethosu/vela/shared_buffer_allocation.py index 484c34b0..51fb1683 100644 --- a/ethosu/vela/shared_buffer_allocation.py +++ b/ethosu/vela/shared_buffer_allocation.py @@ -37,9 +37,6 @@ class SharedBufferAllocation: self.banks_required = np.zeros(SharedBufferArea.Size) ifm_tensor, ifm2_tensor, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm() - tensors = [t for t in (ifm_tensor, ifm2_tensor, ofm_tensor) if t is not None] - scales = [t.quantization.scale_f32 for t in tensors if t.quantization is not None] - has_scale = len(tensors) == len(scales) and None not in scales self.kernel = Kernel(1, 1) self.is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise @@ -81,7 +78,7 @@ class SharedBufferAllocation: self.ifm_count = 1 if self.ifm_bits == 16: - if ps.npu_block_type != NpuBlockType.Pooling and has_scale: + if is_acc_40bits_used(ps.npu_block_type, ifm_tensor, ofm_tensor, ifm2_tensor): self.use_accumulator_element = SHRAMElements.Acc40 self.use_ifm_element = self.use_ifm_element + 1 assert (self.use_ifm_element == SHRAMElements.IFM16) or ( @@ -171,6 +168,13 @@ class SharedBufferAllocation: ) +def is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor, ifm2_tensor=None): + tensors = [t for t in (ifm_tensor, ifm2_tensor, ofm_tensor) if t is not None] + scales = [t.quantization.scale_f32 for t in tensors if t.quantization is not None] + has_scale = len(tensors) == len(scales) and None not in scales + return npu_block_type != NpuBlockType.Pooling and has_scale + + def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config): alloc = SharedBufferAllocation(arch, ps) assert (alloc.ifm_block_depth == block_config[2]) or alloc.is_equal_depth_op -- cgit v1.2.1
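
Note (illustrative, not part of the patch): a minimal standalone sketch of the sub-kernel decomposition done at the top of the new get_conv_pooling_cycle_estimate(). The kernel is tiled against the per-block-type sub-kernel limits and each tile contributes its element count to the per-block cycle sum. The helper name and the (3, 3) limit below are assumptions for the example; in the patch the limits come from arch.sub_kernel_limits[npu_block_type].

    # Sketch only: mirrors the sub_kernel_x / sub_kernel_y / sub_kernel_size logic above.
    from math import ceil

    def split_sub_kernels(kernel_h, kernel_w, limit_h=3, limit_w=3):
        # Number of sub-kernels in each direction, rounded up
        n_y = ceil(kernel_h / limit_h)
        n_x = ceil(kernel_w / limit_w)
        sub_y = [min(kernel_h - i * limit_h, limit_h) for i in range(n_y)]
        sub_x = [min(kernel_w - i * limit_w, limit_w) for i in range(n_x)]
        # One entry per sub-kernel: its element count, as consumed by the
        # per-block cycle loop over sub_kernel_size
        return [y * x for y in sub_y for x in sub_x]

    # A 5x5 kernel with a (3, 3) limit splits into 3x3, 3x2, 2x3 and 2x2 pieces:
    print(split_sub_kernels(5, 5))  # [9, 6, 6, 4] -> 25 kernel elements in total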
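
Note (illustrative, not part of the patch): a minimal sketch of how the per-OFM-block estimates are combined into the pass total at the end of get_conv_pooling_cycle_estimate(). The slower of the per-block MAC (DPU) cycles and the per-block output/activation cycles is assumed to dominate across all OFM blocks, while the faster stage only adds a single block of latency (pipeline fill/drain). The helper name and the numbers in the example are assumptions for illustration.

    # Sketch only: same combination rule as the final if/else in the patch.
    def combine_block_cycles(cycles_dpu_blk, cycles_output_blk, num_ofm_blk):
        if cycles_dpu_blk > cycles_output_blk:
            return cycles_dpu_blk * num_ofm_blk + cycles_output_blk
        return cycles_output_blk * num_ofm_blk + cycles_dpu_blk

    # Example: 1200 DPU cycles and 400 output cycles per block over 8 OFM blocks
    # gives 1200 * 8 + 400 = 10000 cycles for the pass.
    print(combine_block_cycles(1200, 400, 8))  # 10000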