# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description: Architecture SHRAM allocator
import enum
import math
from typing import Dict
from typing import Optional
from typing import Tuple
from typing import Union

from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import SHRAMConfig
from .architecture_features import SHRAMElements
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .numeric_util import round_up
from .numeric_util import round_up_divide
from .operation import Kernel
from .operation import NpuBlockType
from .range_set import MemoryRangeSet
from .shape4d import Shape4D
from .tensor import MemArea


class SHRAMLayout:
    def __init__(self):
        self.ib_start = 0
        self.ib_end = 0
        self.ib_start2 = 0
        self.ab_start = 0
        self.lut_start = 0


class ArchitectureBlockConfig:
    def __init__(self):
        self.layout = SHRAMLayout()
        self.ifm_block = Shape4D()
        self.ofm_block = Shape4D()  # non-1D-optimised block
        self.acc_type = SHRAMElements.Acc32
        self.is_partkernel = False
        self.bank_size = 0

    def get_shram_memory_access_range(self):
        # Returns the SHRAM memory access range used by this shared buffer,
        # excluding access to LUT
        return MemoryRangeSet(MemArea.Shram, 0, self.layout.lut_start * self.bank_size)

    def old_style_representation(self):
        return [self.ofm_block.height, self.ofm_block.width, self.ifm_block.depth, self.ofm_block.depth]

    def __str__(self):
        return str(self.old_style_representation())


_AccumulatorBits = {SHRAMElements.Acc16: 16, SHRAMElements.Acc32: 32, SHRAMElements.Acc40: 40}


class ElementwiseUsage(enum.IntEnum):
    No = 0
    Full = 1
    Scalar = 2


def _try_block_config(
    shram: SHRAMConfig,
    ew_usage: ElementwiseUsage,
    ofm_block: Union[Shape4D, Block],
    ifm_block: Union[Shape4D, Block],
    ifm_bits: int,
    ifm_granule: int,
    acc_bits: int,
    acc_granule: int,
    lut_banks: int,
) -> Optional[SHRAMLayout]:
    assert (acc_bits > 0) and (acc_granule > 0)
    assert (ifm_bits >= 8) and ((ifm_bits % 8) == 0) and (ifm_granule > 0)

    # Always need IFM space
    ifm_bytes = ifm_block.elements_wh() * round_up((ifm_block.depth * ifm_bits) / 8, 8)
    ifm_banks = round_up_divide(ifm_bytes, shram.bank_size_bytes) * 2
    ifm_banks = round_up(ifm_banks, ifm_granule)

    # Calculate SHRAM boundaries of the IFM and Accumulators
    lut_start = shram.total_banks - lut_banks
    ifm_end = shram.reserved_output_banks + ifm_banks
    ifm2_start = ifm_end
    acc_start = lut_start

    # If not elementwise then we need accumulator space
    if ew_usage == ElementwiseUsage.No:
        acc_bytes = (ofm_block.elements_wh() * round_up(ofm_block.depth, 8) * acc_bits) // 8
        acc_banks = round_up_divide(acc_bytes, shram.bank_size_bytes) * 2
        acc_banks = round_up(acc_banks, acc_granule)
        acc_start = acc_start - acc_banks
    else:
        ifm2_banks = ifm_banks if ew_usage == ElementwiseUsage.Full else 0
        if ifm2_start + ifm2_banks > acc_start:
            return None
        ifm_end = acc_start

    # IFM must still fit before accumulators
    if ifm_end > acc_start:
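        # Only reachable on the non-elementwise path (the elementwise branch
        # above already set ifm_end = acc_start): the double-buffered IFM
        # banks alone have run past the start of the accumulator region, so
        # this block config cannot be allocated.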
        return None

    # Should all fit, so return this layout
    layout = SHRAMLayout()
    layout.ib_start = shram.reserved_output_banks
    layout.ib_start2 = ifm2_start
    layout.ib_end = ifm_end
    layout.ab_start = acc_start
    layout.lut_start = lut_start
    return layout


def _choose_kernel_method(ifm_shape: Shape4D, ifm_bits: int, kernel: Kernel) -> bool:
    if ifm_shape.depth <= 8:
        return True

    # Compare part-kernel to depth-kernel and choose the one with the best utilisation
    kernel_elements = kernel.elements_wh()
    depth_utilisation = ifm_shape.depth / round_up(ifm_shape.depth, 32 if ifm_bits == 8 else 16)
    part_utilisation = (
        ifm_shape.depth
        * kernel_elements
        / (round_up(ifm_shape.depth, 8) * round_up(kernel_elements, 4 if ifm_bits == 8 else 2))
    )
    return part_utilisation > depth_utilisation


def _ew_usage(npu_op_type: NpuBlockType, uses_scalar: bool) -> ElementwiseUsage:
    ew_usage = ElementwiseUsage.No
    if npu_op_type == NpuBlockType.ElementWise:
        ew_usage = ElementwiseUsage.Full
        if uses_scalar:
            ew_usage = ElementwiseUsage.Scalar
    return ew_usage


def _acc_type(npu_op_type: NpuBlockType, ifm_bits: int, scaled: bool) -> int:
    """Returns accumulator type"""
    acc_type = SHRAMElements.Acc32
    if (ifm_bits == 16) and npu_op_type != NpuBlockType.Pooling and scaled:
        acc_type = SHRAMElements.Acc40
    return acc_type


def is_nearest(ifm_resampling: resampling_mode) -> bool:
    return ifm_resampling == resampling_mode.NEAREST


def to_upscale(ifm_resampling: resampling_mode) -> int:
    # Upscaling depending on resampling mode
    return 1 if ifm_resampling == resampling_mode.NONE else 2


def _ifm_blockdepth(arch, ifm_shape: Union[Shape4D, Block], ifm_bits: int, is_partkernel: bool):
    if ifm_bits == 16:
        ifm_blockdepth = round_up(min(ifm_shape.depth, 16), 4)
    else:
        ifm_blockdepth = round_up(min(ifm_shape.depth, 16 if is_partkernel else 32), arch.ifm_ublock.depth)
    return ifm_blockdepth


def _required_size(value: int, stride: int, border: int, upscale: int, nearest: bool) -> int:
    return int(math.ceil(((value - 1) * stride + border + nearest) / upscale))


def get_ifm_area_required(
    ofm_shape: Union[Shape4D, Block], kernel: Kernel, resampling_mode: resampling_mode
) -> Tuple[int, int]:
    upscale = to_upscale(resampling_mode)
    nearest = is_nearest(resampling_mode)
    h1 = _required_size(ofm_shape.height, kernel.stride.y, kernel.area_height(), upscale, nearest)
    w1 = _required_size(ofm_shape.width, kernel.stride.x, kernel.area_width(), upscale, nearest)
    return (w1, h1)


def _get_ifm_blocksize(
    ofm_block: Union[Shape4D, Block], kernel: Kernel, ublock: Block, subkernel_limit: Block, upscale: int, nearest: bool
) -> Shape4D:
    # IFM block height
    h1 = _required_size(
        ofm_block.height, kernel.stride.y, min(kernel.area_height(), subkernel_limit.height), upscale, nearest
    )
    h2 = h1
    height = round_up(min(h1, h2), ublock.height)

    # IFM block width
    w1 = _required_size(
        ofm_block.width, kernel.stride.x, min(kernel.area_width(), subkernel_limit.width), upscale, nearest
    )
    w2 = w1
    width = round_up(min(w1, w2), ublock.width)

    return Shape4D(1, height, width, ofm_block.depth)


def fit_block_for_ofm(
    arch: ArchitectureFeatures, ofm_shape: Union[Shape4D, Block], kernel: Kernel, block: Union[Shape4D, Block]
):
    # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes). This is a specific
    # interpretation of a more general constraint that can't be applied because the
    # find_block_config function must return block configs that can be applied to any OFM shape.
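    # Illustrative case (hypothetical shapes): a 1xWxC OFM computed with a
    # 1-high kernel on a 2-high OFM ublock would otherwise occupy a 2-high
    # block; clamping the block height to the OFM height frees SHRAM for
    # wider or deeper blocks.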
    if (ofm_shape.height == 1) and (kernel.height == 1) and (arch.ofm_ublock.height == 2):
        return Shape4D(1, min(block.height, ofm_shape.height), block.width, block.depth)
    return block


def find_block_config(
    arch: ArchitectureFeatures,
    npu_op_type: NpuBlockType,
    ofm_shape: Shape4D,
    ifm_shape: Shape4D,
    ifm2_shape: Optional[Shape4D],
    uses_scalar: bool,
    ifm_bits: int,
    kernel: Kernel,
    lut_banks: int,
    scaled: bool,
    ifm_resampling: resampling_mode,
) -> Optional[ArchitectureBlockConfig]:
    SplitDepth = ArchitectureFeatures.OFMSplitDepth

    # Elementwise larger-volume correction
    if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
        ifm_shape = ifm2_shape

    # Figure out if SHRAM should be partitioned for elementwise
    ew_usage = _ew_usage(npu_op_type, uses_scalar)

    # Operator typing help
    is_pooling = npu_op_type == NpuBlockType.Pooling
    is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
    is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise
    is_convolution = (npu_op_type == NpuBlockType.ConvolutionMxN) or is_depthwise

    # Block config to be returned
    config = ArchitectureBlockConfig()
    config.is_partkernel = is_convolution and _choose_kernel_method(ifm_shape, ifm_bits, kernel)

    # Accumulator & granule settings
    config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

    # Memory rounding granules
    acc_granule = arch.accumulator_granules[config.acc_type]
    acc_bits = _AccumulatorBits[config.acc_type]
    if ew_usage != ElementwiseUsage.No:
        ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
    else:
        ifm_granule = arch.ifm_bank_granules[ifm_bits]
    lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
    upscale = to_upscale(ifm_resampling)
    nearest = is_nearest(ifm_resampling)

    # Subkernel repeats of the IFM
    ifm_repeats = round_up_divide(kernel.area_width(), arch.SubKernelMax.width) * round_up_divide(
        kernel.area_height(), arch.SubKernelMax.height
    )
    ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, config.is_partkernel)

    # Weights fetch (for operators that have them)
    weight_fetch_wh = (kernel.area_width() * kernel.area_height()) if is_convolution else 0

    search_space = Shape4D.min(ofm_shape, Shape4D(arch.ofm_block_max.to_hwc()))
    search_space = Shape4D.round_up(search_space, Shape4D(arch.ofm_ublock.to_hwc()))

    # Block WHC search, loops across the search space looking for best efficiency
    best_cost = math.inf
    best_coverage = math.inf
    depth = max(arch.ofm_ublock.depth, min(search_space.depth, SplitDepth))
    if depth < ofm_shape.depth:
        depth = round_up(depth, SplitDepth)

    while depth <= search_space.depth:
        wont_fit: Dict[Tuple[int, int], bool] = {}
        for height in range(arch.ofm_ublock.height, search_space.height + 1, arch.ofm_ublock.height):
            for width in range(arch.ofm_ublock.width, search_space.width + 1, arch.ofm_ublock.width):
                # Avoid checking W/H transposed blocks that already didn't fit, i.e. if
                # 8x4x16 didn't fit then 4x8x16 won't either.
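                # (Failed candidates are recorded under the transposed key
                # (width, height) further down, so this lookup rejects the
                # mirrored candidate.)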
                if wont_fit.get((height, width), False):
                    continue

                # Calculate the IFM block dimensions required to feed this OFM block
                ofm_block = Shape4D(1, height, width, depth)
                ifm_block = _get_ifm_blocksize(ofm_block, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
                if not is_equal_depth_op:
                    ifm_block = ifm_block.with_depth(ifm_blockdepth)

                # Test if the IFM/OFM blocks fit into SHRAM
                ofm_block = fit_block_for_ofm(arch, ofm_shape, kernel, ofm_block)
                layout = _try_block_config(
                    arch.shram,
                    ew_usage,
                    Block(ofm_block.width, ofm_block.height, ofm_block.depth),
                    Block(ifm_block.width, ifm_block.height, ifm_block.depth),
                    ifm_bits,
                    ifm_granule,
                    acc_bits,
                    acc_granule,
                    lut_banks,
                )

                if layout:
                    full_blocks = Shape4D.div_round_up(ofm_shape, ofm_block)
                    blocks = ofm_shape / ofm_block

                    # Weights fetching
                    weight_fetch = weight_fetch_wh * ifm_shape.depth * full_blocks.elements_wh()
                    if not is_depthwise:
                        weight_fetch *= ofm_block.depth * blocks.depth

                    # IFM fetching
                    ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth * ifm_repeats * blocks.elements_wh()
                    if not is_equal_depth_op:
                        ifm_fetch *= full_blocks.depth

                    # Scale relative to every output OFM element
                    if npu_op_type == NpuBlockType.ElementWise:
                        relative_cost = ofm_shape.elements() / (height * width * depth)
                    else:
                        relative_cost = (ifm_fetch + weight_fetch) / ofm_shape.elements()

                    # If the entire IFM can be encompassed by both buffers, bias to prefer this configuration
                    if ifm_shape.elements() < ifm_block.elements() * 2:
                        relative_cost = relative_cost / 2

                    # Choose based on relative minimum cost or larger IFM area (if equal cost)
                    if relative_cost <= best_cost:
                        choose_this = False
                        # Check IFM coverage only when the cost equals best_cost and the OFM block is small
                        if relative_cost == best_cost:
                            coverage_shape = Shape4D.min(ifm_shape, ifm_block)
                            coverage = ifm_shape.elements_wh() / coverage_shape.elements_wh()
                            # Small 4x4 IFM constraint found through analysis of networks
                            if coverage <= best_coverage and (height <= 4 and width <= 4):
                                best_coverage = coverage
                                choose_this = True
                        else:
                            best_coverage = math.inf
                            choose_this = True

                        if choose_this:
                            best_cost = relative_cost
                            config.layout = layout
                            config.bank_size = arch.shram_bank_size
                            config.ifm_block = ifm_block
                            config.ofm_block = Shape4D(1, height, width, depth)
                else:
                    wont_fit[(width, height)] = True

        depth = depth + arch.ofm_ublock.depth
        if depth < ofm_shape.depth:
            depth = round_up(depth, SplitDepth)

    if best_cost != math.inf:
        return config

    return None


def try_block_config(
    block_config: Block,
    arch: ArchitectureFeatures,
    npu_op_type: NpuBlockType,
    ofm_shape: Union[Shape4D, Block],
    ifm_shape: Union[Shape4D, Block],
    ifm2_shape: Optional[Union[Shape4D, Block]],
    uses_scalar: bool,
    ifm_bits: int,
    is_partkernel: bool,
    kernel: Kernel,
    lut_banks: int,
    scaled: bool,
    ifm_resampling: resampling_mode,
) -> Optional[ArchitectureBlockConfig]:
    """
    Given a block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit or is invalid.
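
    Hypothetical usage sketch (argument values are illustrative only; real
    callers pass values derived from the operator being scheduled):

        config = try_block_config(
            Block(16, 8, 32),  # candidate OFM block (width, height, depth)
            arch,
            NpuBlockType.ConvolutionMxN,
            ofm_shape,
            ifm_shape,
            None,  # no IFM2
            uses_scalar=False,
            ifm_bits=8,
            is_partkernel=False,
            kernel=kernel,
            lut_banks=0,
            scaled=True,
            ifm_resampling=resampling_mode.NONE,
        )
        if config is None:
            pass  # candidate block does not fit in SHRAM; try another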
""" # Check block config validity if not all( blk > 0 and blk <= blk_max and blk % ublk == 0 for blk, blk_max, ublk in zip(block_config.as_list(), arch.ofm_block_max.as_list(), arch.ofm_ublock.as_list()) ): return None # Elementwise larger-volume correction if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements(): ifm_shape = ifm2_shape ew_usage = _ew_usage(npu_op_type, uses_scalar) # Operator typing help is_pooling = npu_op_type == NpuBlockType.Pooling is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise # Block config to be returned config = ArchitectureBlockConfig() config.is_partkernel = is_partkernel # Accumulator & granule settings config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled) # Memory rounding granules acc_granule = arch.accumulator_granules[config.acc_type] acc_bits = _AccumulatorBits[config.acc_type] if ew_usage != ElementwiseUsage.No: ifm_granule = arch.ifm_ew_bank_granules[ifm_bits] else: ifm_granule = arch.ifm_bank_granules[ifm_bits] lut_banks = max(lut_banks, arch.shram.reserved_end_banks) upscale = to_upscale(ifm_resampling) nearest = is_nearest(ifm_resampling) ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, is_partkernel) ifm_block = _get_ifm_blocksize(block_config, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest) if not is_equal_depth_op: ifm_block = ifm_block.with_depth(ifm_blockdepth) # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes) block_config_opt = fit_block_for_ofm(arch, ofm_shape, kernel, block_config) layout = _try_block_config( arch.shram, ew_usage, block_config_opt, ifm_block, ifm_bits, ifm_granule, acc_bits, acc_granule, lut_banks ) if layout is None: return None config.layout = layout config.bank_size = arch.shram_bank_size config.ifm_block = ifm_block config.ofm_block = block_config return config