# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Contains external APIs
from enum import auto
from enum import Enum
from typing import List
from typing import NamedTuple
from typing import Optional
from typing import Tuple

import numpy

API_VERSION_MAJOR = 1
API_VERSION_MINOR = 5
API_VERSION = f"{API_VERSION_MAJOR}.{API_VERSION_MINOR}"


class NpuAccelerator(Enum):
    """
    Supported accelerators
    """

    Ethos_U55_32 = auto()
    Ethos_U55_64 = auto()
    Ethos_U55_128 = auto()
    Ethos_U55_256 = auto()
    Ethos_U65_256 = auto()
    Ethos_U65_512 = auto()


class NpuElementWiseOp(Enum):
    """
    Elementwise operation
    """

    ADD = auto()
    SUB = auto()
    MUL = auto()
    ABS = auto()
    MIN = auto()
    MAX = auto()
    LRELU = auto()  # Leaky relu
    CLZ = auto()  # Number of leading zeros
    SHR = auto()  # Rounded right-shift
    SHL = auto()  # Bitwise shift-left


class NpuPoolingOp(Enum):
    """
    Pooling operation
    """

    MAX = auto()
    AVERAGE = auto()
    REDUCE_SUM = auto()


class NpuActivationOp(Enum):
    """
    Activation function
    """

    NONE_OR_RELU = auto()  # Clamps output using min/max
    TANH = auto()
    SIGMOID = auto()
    TABLE_LOOKUP = auto()  # Performs table look-up, using the provided table lookup index


class NpuRoundingMode(Enum):
    """
    Available rounding modes
    """

    TFL = auto()  # TensorFlow Lite rounding
    TRUNCATE = auto()  # Truncate towards zero
    NATURAL = auto()  # Round to nearest with x.5 rounded up, towards +infinity


class NpuLayout(Enum):
    """
    Tensor layout of feature maps
    """

    NHWC = auto()
    NHCWB16 = auto()

    def __str__(self):
        return self.name


class NpuResamplingMode(Enum):
    """
    Resampling mode
    """

    NONE = auto()  # No resampling is performed
    NEAREST = auto()  # 2x2 insert nearest
    TRANSPOSE = auto()  # 2x2 transpose


class NpuBlockTraversal(Enum):
    """
    Block-traversal of weights
    """

    DEPTH_FIRST = auto()
    PART_KERNEL_FIRST = auto()


class NpuDataType(Enum):
    """
    Supported data types in feature maps
    """

    UINT8 = 8, False, auto()
    INT8 = 8, True, auto()
    UINT16 = 16, False, auto()
    INT16 = 16, True, auto()
    INT32 = 32, True, auto()

    def is_signed(self) -> bool:
        """Checks if this data type is signed or unsigned"""
        return self.value[1]

    def size_in_bits(self) -> int:
        """Size of the data type in bits"""
        return self.value[0]

    def size_in_bytes(self) -> int:
        """Size of the data type in bytes"""
        return self.value[0] // 8

    def min_value(self) -> int:
        """Minimum value of this type"""
        if self.is_signed():
            return -(1 << (self.size_in_bits() - 1))
        else:
            return 0

    def max_value(self) -> int:
        """Maximum value of this type"""
        if self.is_signed():
            return (1 << (self.size_in_bits() - 1)) - 1
        else:
            return (1 << self.size_in_bits()) - 1

    def __str__(self):
        return self.name

    __repr__ = __str__


class NpuAddressRange(NamedTuple):
    """
    Address range
    """

    region: int  # Memory region, a value between 0 and 7
    address: int  # Address, offset from the region's base address
    length: int  # The length of the range, in bytes

    def __str__(self):
        return f"(region={self.region}, address={hex(self.address)}, length={self.length})"
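

# Illustrative sketch, not part of the API: the NpuDataType helpers derive their
# ranges from the (size_in_bits, is_signed, ...) tuples above, e.g.:
#
#     NpuDataType.INT8.min_value()       # -128  == -(1 << 7)
#     NpuDataType.INT8.max_value()       # 127   == (1 << 7) - 1
#     NpuDataType.UINT16.max_value()     # 65535 == (1 << 16) - 1
#     NpuDataType.INT16.size_in_bytes()  # 2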


class NpuTileBox(NamedTuple):
    """
    Specifies the addresses and dimensions of the tiles of a feature map.
    A feature map can use 1 to 4 tiles
    """

    height_0: int  # The height of tile 0
    height_1: int  # The height of tile 1, 0 if unused
    width_0: int  # The width of tile 0, and tile 2 (if used)
    addresses: List[int]  # A list of 4 addresses, set unused addresses to 0


class NpuShape3D(NamedTuple):
    """
    Shape of (part of) a feature map
    """

    height: int
    width: int
    depth: int


class NpuQuantization(NamedTuple):
    """
    Quantization parameters
    """

    scale_f32: Optional[float]
    zero_point: int


class NpuPadding(NamedTuple):
    """
    Padding to be applied to a convolution operation
    """

    top: int
    left: int
    bottom: int
    right: int


class NpuActivation:
    """
    Activation function, fused with NPU operations
    """

    def __init__(self, op_type: NpuActivationOp):
        self.op_type = op_type  # The activation operation to be performed
        # min/max are optional
        self.min: Optional[float] = None  # E.g. set to 0.0 for RELU
        self.max: Optional[float] = None  # E.g. set to 6.0 for RELU6
        # Table lookup index, only applicable for TABLE_LOOKUP activation, 0-7
        self.lookup_table_index: int = 0


class NpuFeatureMap:
    """
    Basic information about IFM, IFM2, OFM
    """

    def __init__(self):
        self.data_type: NpuDataType = NpuDataType.UINT8
        # The memory region, a value 0-7
        self.region: int = 0
        # Shape of the feature map
        self.shape: NpuShape3D = NpuShape3D(height=0, width=0, depth=0)
        # The tiles that comprise the feature map. In the normal case when only 1 tile is used,
        # height_0 == self.shape.height, height_1 is 0, width_0 == self.shape.width, addresses[1:] are set to 0
        self.tiles: NpuTileBox = NpuTileBox(height_0=0, height_1=0, width_0=0, addresses=[0, 0, 0, 0])
        self.quantization: Optional[NpuQuantization] = None
        self.layout: NpuLayout = NpuLayout.NHWC
        # x/y/c strides used by the NPU when traversing the feature map, if None, vela will use default strides
        self.strides: Optional[NpuShape3D] = None
        # Used for debug
        self.name: Optional[str] = None


class NpuKernel:
    """
    Kernel information for NPU operations
    """

    def __init__(self, w: int, h: int, stride_x: int = 1, stride_y: int = 1, dilation_x: int = 1, dilation_y: int = 1):
        assert stride_x > 0 and stride_y > 0
        assert dilation_x > 0 and dilation_y > 0
        self.width = w
        self.height = h
        self.stride_x = stride_x
        self.stride_y = stride_y
        self.dilation_x = dilation_x
        self.dilation_y = dilation_y


class NpuAccumulatorType(Enum):
    """
    Accumulator dtype of NPU operation
    """

    Default = auto()
    Int32 = auto()
    Int40 = auto()


class NpuOperationType(Enum):
    """
    Type of NPU operation
    """

    Dma = auto()
    Conv2D = auto()
    ConvDepthWise = auto()
    Pooling = auto()
    ElementWise = auto()


class NpuOperation:
    """
    Base class for all NPU operations
    """

    def __init__(self, op_type: NpuOperationType):
        self.op_type = op_type
        # Used for debug
        self.name: Optional[str] = None


class NpuDmaOperation(NpuOperation):
    """
    DMA operation
    """

    def __init__(self, src: NpuAddressRange, dest: NpuAddressRange):
        super().__init__(NpuOperationType.Dma)
        self.src = src
        self.dest = dest
        # DMA channel, usually 0 (user channel)
        self.channel: int = 0
        # Channel mode, 0 = external, 1 = internal (should usually be 0)
        self.mode: int = 0
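

# Illustrative sketch with assumed example values: a single-tile feature map
# follows the "normal case" described for NpuFeatureMap.tiles, with height_0 and
# width_0 equal to the full shape and the unused tile addresses set to 0:
#
#     fm = NpuFeatureMap()
#     fm.data_type = NpuDataType.INT8
#     fm.region = 1
#     fm.shape = NpuShape3D(height=8, width=8, depth=16)
#     fm.tiles = NpuTileBox(height_0=8, height_1=0, width_0=8, addresses=[0x100, 0, 0, 0])
#     fm.quantization = NpuQuantization(scale_f32=0.25, zero_point=0)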


class NpuBlockOperation(NpuOperation):
    """
    Base class for operations which produce an OFM
    """

    def __init__(self, op_type: NpuOperationType):
        super().__init__(op_type)
        self.ifm: Optional[NpuFeatureMap] = None
        self.ifm2: Optional[NpuFeatureMap] = None
        # The non-quantized scalar value in a binary elementwise operation. Only set if IFM2 is scalar
        self.ifm2_scalar: Optional[float] = None
        self.ofm: Optional[NpuFeatureMap] = None
        self.kernel: Optional[NpuKernel] = None
        # Weights, one element for each NPU core, empty if no weights are used.
        # Must have been compressed using npu_encode_weights()
        self.weights: List[NpuAddressRange] = []
        # Biases, one element for each NPU core, empty if no bias is used.
        # Must have been encoded using npu_encode_bias()
        self.biases: List[NpuAddressRange] = []
        self.padding: Optional[NpuPadding] = None
        # Optional activation function to be applied
        self.activation: Optional[NpuActivation] = None
        # The block config to be used, which must be valid for the given operation.
        # See also npu_find_block_configs.
        # If the operation has weights, the depth of the block config must be the same as
        # the ofm depth used in the call to npu_encode_weights()
        self.block_config: NpuShape3D
        self.rounding_mode: NpuRoundingMode = NpuRoundingMode.TFL
        # Set to True if the operation is fused with a Quantize operation (affects scaling)
        self.fused_quantize: bool = False
        # IFM upscaling to be applied
        self.ifm_upscale: NpuResamplingMode = NpuResamplingMode.NONE
        self.accumulator_type: NpuAccumulatorType = NpuAccumulatorType.Default


class NpuConv2DOperation(NpuBlockOperation):
    """
    NPU_OP_CONV operation
    """

    def __init__(self):
        super().__init__(NpuOperationType.Conv2D)
        # Block traversal must be consistent with the block_traversal parameter specified in
        # weight_compressor.encode_weights()
        self.block_traversal: NpuBlockTraversal = NpuBlockTraversal.PART_KERNEL_FIRST


class NpuConvDepthWiseOperation(NpuBlockOperation):
    """
    NPU_OP_DEPTHWISE operation
    """

    def __init__(self):
        super().__init__(NpuOperationType.ConvDepthWise)


class NpuPoolingOperation(NpuBlockOperation):
    """
    NPU_OP_POOL operation
    """

    def __init__(self, pooling_op_type: NpuPoolingOp):
        super().__init__(NpuOperationType.Pooling)
        self.sub_op_type: NpuPoolingOp = pooling_op_type
        # Set to a float value for ResizeBilinear/NearestNeighbor operations (affects scaling), else to None
        self.rescale: Optional[float] = None


class NpuElementWiseOperation(NpuBlockOperation):
    """
    NPU_OP_ELEMENTWISE operation
    """

    def __init__(self, elementwise_op_type: NpuElementWiseOp):
        super().__init__(NpuOperationType.ElementWise)
        self.sub_op_type: NpuElementWiseOp = elementwise_op_type
        # Set to True for binary operators where IFM2 should be used as first operand
        self.reversed_operands: bool = False
        # Set to a tuple (scale, shift) for explicit rescale, else to None
        self.rescale: Optional[Tuple] = None


def npu_get_api_version():
    """
    Public facing API to get the API version

    :return: int, the 16 most significant bits, corresponding to major version
        the 16 least significant bits, corresponding to minor version
    """
    version = (API_VERSION_MAJOR << 16) | (API_VERSION_MINOR & 0xFFFF)
    return version
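

# Illustrative sketch, not part of the API: the packed version integer can be
# split back into its components with simple shifts and masks:
#
#     version = npu_get_api_version()
#     major, minor = version >> 16, version & 0xFFFF  # (1, 5) for API version 1.5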


def npu_encode_weights(
    accelerator: NpuAccelerator,
    weights_volume: numpy.ndarray,
    dilation_xy: Tuple[int, int],
    ifm_bitdepth: int,
    ofm_block_depth: int,
    is_depthwise: bool,
    block_traversal: NpuBlockTraversal,
):
    """
    Public facing API to use the Ethos-U weight encoding.

    :param accelerator: NpuAccelerator enum to pick the correct accelerator
    :param weights_volume: a 4-dimensional numpy.ndarray in OHWI layout
    :param dilation_xy: a two-element tuple of dilation values in the x and y dimensions
    :param ifm_bitdepth: the bitdepth of the input feature map
    :param ofm_block_depth: the depth of blocks for processing
    :param is_depthwise: a boolean indicating these weights are used for a depthwise traversal
    :param block_traversal: indicates how these weights are traversed on a sub-kernel basis
    :return: a bytearray of encoded weights
    """
    from .architecture_features import Accelerator
    from . import weight_compressor

    acc = Accelerator.from_npu_accelerator(accelerator)
    encoded_weights, _ = weight_compressor.encode_weights(
        acc, weights_volume, dilation_xy, ifm_bitdepth, ofm_block_depth, is_depthwise, block_traversal
    )
    return encoded_weights


def npu_encode_bias(bias: numpy.int64, scale: int, shift: int):
    """
    Public facing API to pack bias and scale values as required by the hardware

    :param bias: 64-bit signed number that includes a 40-bit signed bias
    :param scale: 32-bit scale value
    :param shift: 6-bit shift value
    :return: packed 80-bit [0(2-bits),shift(6-bits),scale(32-bits),bias(40-bits)]
    """
    from . import weight_compressor

    return weight_compressor.encode_bias(bias, scale, shift)
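

# Illustrative sketch with assumed addresses and a hypothetical weights_volume
# array: the encoded weight byte stream is placed in memory by the caller, then
# referenced from an NpuBlockOperation via an NpuAddressRange entry:
#
#     encoded = npu_encode_weights(
#         NpuAccelerator.Ethos_U55_128,
#         weights_volume,  # 4-dimensional numpy.ndarray in OHWI layout
#         dilation_xy=(1, 1),
#         ifm_bitdepth=8,
#         ofm_block_depth=16,  # must match the depth of the block config used later
#         is_depthwise=False,
#         block_traversal=NpuBlockTraversal.PART_KERNEL_FIRST,
#     )
#     op.weights = [NpuAddressRange(region=0, address=0x800, length=len(encoded))]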


def npu_find_block_configs(npu_op: NpuOperation, accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """
    Public facing API that returns a list of block configs that are valid for the given operation.
    This function can be used to find a valid value for npu_op.block_config.
    The block config is the unit of work in which the NPU generates the OFM.
    """
    from .architecture_features import Accelerator
    from .architecture_features import ArchitectureFeatures
    from .architecture_features import Block
    from .architecture_features import create_default_arch
    from .architecture_allocator import try_block_config
    from .register_command_stream_generator import resampling_mode_map
    from .register_command_stream_util import to_kernel
    from .operation import NpuBlockType

    is_partkernel = False
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
        is_partkernel = npu_op.block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"

    ifm_shape = Block(npu_op.ifm.shape.width, npu_op.ifm.shape.height, npu_op.ifm.shape.depth)
    ifm2_shape = None
    if npu_op.ifm2:
        ifm2_shape = Block(npu_op.ifm2.shape.width, npu_op.ifm2.shape.height, npu_op.ifm2.shape.depth)
    ofm_shape = Block(npu_op.ofm.shape.width, npu_op.ofm.shape.height, npu_op.ofm.shape.depth)

    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    kernel = to_kernel(npu_op.kernel)
    lut_banks = 0
    if npu_op.activation:
        lut_banks = 2 if npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP else 0
    has_scaling = True
    for tensor in [npu_op.ifm, npu_op.ifm2, npu_op.ofm]:
        if tensor and tensor.quantization is None:
            has_scaling = False
            break

    arch = create_default_arch(Accelerator.from_npu_accelerator(accelerator))

    max_block_width = min(arch.ofm_block_max.width, ofm_shape.width)
    max_block_height = min(arch.ofm_block_max.height, ofm_shape.height)
    max_block_depth = min(arch.ofm_block_max.depth, ofm_shape.depth)
    min_block_height = max(arch.ofm_ublock.height, 2 if ifm_resampling_mode != NpuResamplingMode.NONE else 1)
    min_block_width = max(arch.ofm_ublock.width, 2 if ifm_resampling_mode != NpuResamplingMode.NONE else 1)

    valid_block_configs = []
    for w in range(min_block_width, max_block_width + min_block_width, min_block_width):
        for h in range(min_block_height, max_block_height + min_block_height, min_block_height):
            # Try valid OFM block depths
            for c in range(arch.ofm_ublock.depth, max_block_depth + arch.ofm_ublock.depth, arch.ofm_ublock.depth):
                # OFM block depth has the constraint that if it causes the OFM to be
                # split, it must be a multiple of the OFM split size
                if (c >= max_block_depth) or (c < max_block_depth and (c % ArchitectureFeatures.OFMSplitDepth) == 0):
                    block = Block(w, h, c)
                    config = try_block_config(
                        block,
                        arch,
                        block_type,
                        ofm_shape,
                        ifm_shape,
                        ifm2_shape,
                        npu_op.ifm2_scalar is not None,
                        ifm_bits,
                        is_partkernel,
                        kernel,
                        lut_banks,
                        has_scaling,
                        ifm_resampling_mode,
                    )
                    if config:
                        ofm_block = config.ofm_block
                        valid_block_configs.append(NpuShape3D(ofm_block.height, ofm_block.width, ofm_block.depth))

    assert len(valid_block_configs) > 0
    return valid_block_configs


def npu_generate_register_command_stream(npu_op_list: List[NpuOperation], accelerator: NpuAccelerator) -> List[int]:
    """
    Public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param accelerator: NpuAccelerator enum to pick the correct accelerator
    :return: register commands, as a list of 32-bit integers
    """
    from . import register_command_stream_generator

    return register_command_stream_generator.generate_register_command_stream(npu_op_list, accelerator)


def npu_create_driver_payload(register_command_stream: List[int], accelerator: NpuAccelerator) -> bytes:
    """
    Public facing API for generating a driver payload, containing a driver header
    and the given Ethos-U register command stream.
    Returns the payload, in little endian format, which must be placed in memory on a 16-byte aligned address.

    :param register_command_stream: List[int] register commands, as a list of 32-bit integers
    :param accelerator: NpuAccelerator enum to pick the correct accelerator
    :return: driver payload, as a byte array
    """
    from . import driver_actions

    return driver_actions.npu_create_driver_payload(register_command_stream, accelerator)
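

if __name__ == "__main__":
    # Illustrative end-to-end sketch, not part of the API: builds a minimal
    # elementwise ADD operation and drives it through the public entry points.
    # All shapes, regions and addresses are arbitrary example values, and the
    # module must be run from within the package (e.g. with python -m) for the
    # relative imports used by the entry points to resolve.
    def _example_fm(address: int) -> NpuFeatureMap:
        fm = NpuFeatureMap()
        fm.data_type = NpuDataType.INT8
        fm.region = 1
        fm.shape = NpuShape3D(height=16, width=16, depth=8)
        # Single tile covering the whole feature map, unused addresses set to 0
        fm.tiles = NpuTileBox(height_0=16, height_1=0, width_0=16, addresses=[address, 0, 0, 0])
        fm.quantization = NpuQuantization(scale_f32=1.0, zero_point=0)
        return fm

    acc = NpuAccelerator.Ethos_U55_128
    op = NpuElementWiseOperation(NpuElementWiseOp.ADD)
    op.ifm = _example_fm(address=0x0)
    op.ifm2 = _example_fm(address=0x1000)
    op.ofm = _example_fm(address=0x2000)
    # Pick any valid block config for this operation and accelerator
    op.block_config = npu_find_block_configs(op, acc)[0]

    cmds = npu_generate_register_command_stream([op], acc)
    payload = npu_create_driver_payload(cmds, acc)
    print(f"{len(cmds)} register commands, {len(payload)} byte driver payload")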