# SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Common functions and definitions used during the graph optimization.
from typing import Tuple

import numpy as np

from .architecture_features import Accelerator
from .data_type import DataType
from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .errors import VelaError
from .operation import Op
from .operation import Operation
from .operation_util import create_avgpool_nop
from .shape4d import Shape4D
from .tensor import Tensor

memory_only_ops = (
    Op.Reshape,
    Op.QuantizedReshape,
    Op.Squeeze,
    Op.ExpandDims,
    Op.Identity,
)


def _avoid_nhcwb16_for_concat(tens):
    # If axis corresponds to C-dimension, NHCWB16 can only be used in the output if all the concat_start's are a
    # multiple of 16. This as, it is only then the address offset for the ofm, for all operations, will be 16 byte
    # aligned. For other values of axis the address offsets will be 16 byte aligned, as they are all based on c = 0
    # and those addresses are always 16 byte aligned due to the NHCWB16 format.
    return any(op.write_offset.depth % 16 != 0 for op in tens.ops if op.write_offset is not None)


def _avoid_nhcwb16_for_split(tens):
    # If read offset is not a multiple of 16 in the C-dimension, NHCWB16 need to be avoided in the input

    # Return True if NHCWB16 needs to be avoided
    def offset_not_aligned(read_offset):
        return read_offset is not None and (read_offset.depth % 16) != 0

    for cons_op in tens.consumer_list:
        if cons_op.ifm == tens:
            if offset_not_aligned(cons_op.read_offsets[0]):
                return True
        if cons_op.ifm2 is not None and cons_op.ifm2 == tens:
            if offset_not_aligned(cons_op.read_offsets[1]):
                return True
    return False


def _avoid_nhcwb16_for_shapes(tens):
    # check all producers/consumers to see if any op shape is preventing NHCWB16
    for cons_op in tens.consumer_list:
        if cons_op.ifm == tens:
            cons_op_shape = cons_op.ifm_shapes[0]
        elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == tens:
            cons_op_shape = cons_op.ifm_shapes[1]
        else:
            assert False
        if Shape4D(tens.shape) != cons_op_shape:
            return True

    for prod_op in tens.ops:
        if Shape4D(tens.shape) != prod_op.ofm_shapes[0]:
            return True

    return False


def _avoid_nhcwb16_for_memory_only(tens):
    # check all producers/consumers to see if any op is preventing NHCWB16
    return any(op.type == Op.Memcpy for op in (tens.consumer_list + tens.ops))


# Check if non linear format can be used
def check_format_restrictions(tens: Tensor, arch):
    if tens.force_linear_format:
        return
    if len(tens.ops) < 1:
        return
    if tens.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const) or any(
        cons is None for cons in tens.consumer_list
    ):
        return

    # Writing to the buffer of a variable tensor needs to be linear format
    if tens.ops[0].memory_function == Op.VariableTensorWrite:
        return

    # Check if any of the producers/consumers is run on CPU
    if not all(cons.run_on_npu for cons in tens.consumer_list):
        return
    if not all(prod.run_on_npu for prod in tens.ops):
        return

    # "Concat" ofm exception:
    if _avoid_nhcwb16_for_concat(tens):
        return

    # "Split" ifm exception:
    if _avoid_nhcwb16_for_split(tens):
        return

    # Shapes checking: check all producers/consumers are NHCWB16 compatible with tens.shape
    if _avoid_nhcwb16_for_shapes(tens):
        return

    # Memory only ifm/ofm exception: DMA ops must use NHCW
    if _avoid_nhcwb16_for_memory_only(tens):
        return

    # Resize bilinear half pixel center implementation requires OFM with linear format to
    # allow stride modification in H/W dimensions.
    for op in tens.ops:
        if op.original_type == Op.ResizeBilinear and op.type == Op.DepthwiseConv2DBias:
            return

    for op in tens.consumer_list:
        if op.type == Op.ReduceSum and (
            tens.dtype == DataType.int32 or arch.accelerator_config == Accelerator.Ethos_U65_512
        ):
            # ReduceSum requires NHWC input
            return
        if op.type == Op.Reshape:
            # Using NHCWB16 format for a no-op reshape is only an option if subsequent
            # consumers do not also need to perform a reshape or if the OFM is going to
            # be processed by CPU operations. No-op reshape consumers with empty lists
            # (those that have no consumers, or null-consumers used as list terminators)
            # must use normal NHWC output.

            def incompatible_consumers(oper):
                if oper and oper.type == Op.Reshape:
                    for consumer in oper.outputs[0].consumer_list:
                        yield from incompatible_consumers(consumer)
                yield not oper or not oper.run_on_npu

            if not any(incompatible_consumers(op)):

                def get_rewrites(oper):
                    if oper and oper.type == Op.Reshape:
                        for consumer in oper.outputs[0].consumer_list:
                            yield from get_rewrites(consumer)
                        yield oper

                # Detect no-op reshapes by comparing their full input and output tensor shapes.
                inshape = op.ifm_shapes[0]
                compatible_shape = [(inshape == oper.ofm_shapes[0]) for oper in get_rewrites(op)]
                if not (compatible_shape and all(compatible_shape)):
                    return
            else:
                return

    tens.force_linear_format = False


def calc_explicit_padding(input_size, stride, filter_size, pad_before, pad_after) -> Tuple[int, int]:
    """
    Based on explicit padding provided in a PAD operation, returns the corresponding hardware padding
    that provides equivalent results.
    """
    total_padding = needed_total_padding(input_size, stride, filter_size)

    # The bottom/right padding might need downward adjustment depending on stride/input size
    total_minus_before = total_padding - pad_before
    output_pad_after = pad_after
    while output_pad_after > 0 and output_pad_after % stride != total_minus_before % stride:
        output_pad_after -= 1
    return pad_before, output_pad_after


def needed_total_padding(input_size, stride, filter_size):
    """Compute hardware padding."""
    if input_size % stride == 0:
        return max(filter_size - stride, 0)

    return max(filter_size - (input_size % stride), 0)


def set_tensor_equivalence(op: Operation, arch, nng) -> Operation:
    """Set input/output tensor equivalence to the same id for memory operations."""
    if op.type in memory_only_ops:
        eid = op.outputs[0].equivalence_id
        for inp in op.inputs:
            inp.equivalence_id = eid
    return op


def set_ifm_ofm_op_shapes(op, arch, nng):
    if op.run_on_npu and op.type.needs_shapes():
        if op.ifm_shapes or op.ofm_shapes:
            # Shapes already set
            return op
        op.set_ifm_ofm_shapes()
    return op


def check_splitsliceread_to_consumer_shape(op, cons_op):
    assert op.type == Op.SplitSliceRead
    # SplitSliceRead ofm shape must fit within the consumer ifm shape
    if cons_op.ifm == op.ofm:
        cons_shape = cons_op.ifm_shapes[0].as_list()
        read_shape = op.ofm_shapes[0].as_list()
    elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == op.ofm:
        cons_shape = cons_op.ifm_shapes[1].as_list()
        read_shape = op.ofm_shapes[0].as_list()
    else:
        return False

    # All read shape values <= consumer shape values
    return all(read_shape[idx] <= x for idx, x in enumerate(cons_shape))


def move_splitsliceread_to_consumer(op, cons_op):
    assert op.type == Op.SplitSliceRead

    if cons_op.ifm == op.ofm:
        cons_op.read_offsets[0] = op.read_offsets[0]
        cons_op.read_shapes[0] = op.read_shapes[0]
        cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[0])
        cons_op.ifm_shapes[0] = op.ifm_shapes[0]
    elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == op.ofm:
        cons_op.read_offsets[1] = op.read_offsets[0]
        cons_op.read_shapes[1] = op.read_shapes[0]
        cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[1])
        cons_op.ifm_shapes[1] = op.ifm_shapes[0]
    op.ofm.consumer_list.remove(cons_op)
    op.ofm.ops = []
    if op in op.ifm.consumer_list:
        op.ifm.consumer_list.remove(op)


def check_memory_only_removed(op, arch):
    if op.run_on_npu and op.type in memory_only_ops:
        # Memory only operators should have been removed
        raise VelaError(f"Memory only {op.type} op {op} expected to have been removed, still remains")


def record_optimised(op, arch):
    if op.type not in (Op.Const, Op.Placeholder):
        DebugDatabase.add_optimised(op, op)


def bypass_memory_only_ops(op, arch, nng):
    if not op.run_on_npu or op.type not in memory_only_ops:
        return op

    # Memory only operators can be completely removed if there is a one to one
    # connection. The reshape OFM can be connected to the previous op.
    #
    #                Bypassed to
    #                    --->
    #       1x6x6x10             1x6x6x10
    #         ADD                  ADD
    #          |          ------->  |
    #       1x6x6x10      |      1x20x3x6
    #        RESHAPE      |        MEAN
    #          | ---------|
    #       1x20x3x10
    #         MEAN
    #
    # In the above the ADD OFM = RESHAPE IFM is removed and replaced by
    # the RESHAPE OFM.
    #
    # Then there are two cases when bypassing is not possible. One is when
    # the IFM is produced by the CPU. This tensor must be preserved. It
    # cannot be removed from the graph. The other case is when the IFM has
    # multiple consumers, then it is not possible to just bypass the op and
    # there is a need for a DMA (nop).
    #
    #                Converts to
    #                    --->
    #       1x6x6x10                1x6x6x10
    #    -----ADD-----           -----ADD-----
    #    |           |           |           |
    # 1x6x6x10    1x6x6x10   1x6x6x10    1x6x6x10
    #  RESHAPE      MEAN       DMA OP      MEAN
    #    |                       |
    # 1x20x3x6               1x20x3x6
    #   MEAN                   MEAN
    #
    # If the DMA IFM and DMA OFM ends up in the same memory area
    # the DMA op will be removed when the cmd stream is generated.

    ifm_has_multiple_cons = len(op.ifm.consumer_list) > 1
    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)

    if ifm_has_multiple_cons or ifm_is_cpu_produced:
        # Convert to a memcpy op
        op.type = Op.Memcpy
        DebugDatabase.add_optimised(op, op)
    else:
        # Bypass op
        ofm = op.ofm
        ifm = op.ifm
        ofm.ops = []
        for prev_op in ifm.ops:
            prev_op.outputs = [ofm]
            ofm.ops.append(prev_op)

    return op


def convert_depthwise_to_conv(op: Operation, arch, nng) -> Operation:
    """Convert DepthwiseConv2DBias to Conv2D to allow support for DepthwiseConv2DBias ops with 'depth multiplier' > 1,
    as long as IFM depth = 1 and OFM depth is equal to the depth multiplier.
    """
    if op.type == Op.DepthwiseConv2DBias and (op.attrs["depth_multiplier"] != 1):
        ifm_shape = op.ifm_shapes[0]
        weight_tensor = op.inputs[1]
        ofm_shape = op.ofm_shapes[0]
        # Depthwise is equivalent to a single conv2d if the ifm depth is 1 and
        # the ofm depth equals the depth multipler.
        if (ifm_shape.depth == 1) and (ofm_shape.depth == op.attrs["depth_multiplier"]):
            # Change op type to Conv2d
            op.type = Op.Conv2DBias
            del op.attrs["channel_multiplier"]
            del op.attrs["depth_multiplier"]

            weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
            DebugDatabase.add_optimised(op, op)
        else:
            raise UnsupportedFeatureError(
                f"Unsupported 'DEPTHWISE_CONV_2D' with depth_multiplier = {op.attrs['depth_multiplier']},"
                f" ifm channels = {ifm_shape.depth}, ofm channels = {ofm_shape.depth}"
            )
    return op


def create_avg_pool_for_concat(concat_op, name, ifm, ifm_shape: Shape4D, write_offset: Shape4D):
    """Creates an average pool for the given concat op/input feature map"""
    ofm = concat_op.ofm
    avgpool_op = create_avgpool_nop(name)
    avgpool_op.inputs = [ifm]
    avgpool_op.outputs = [ofm]

    avgpool_op.write_offset = write_offset
    avgpool_op.write_shape = ifm_shape
    ofm.ops.append(avgpool_op)
    avgpool_op.ifm_shapes.append(ifm_shape)
    avgpool_op.ofm_shapes.append(concat_op.ofm_shapes[0])
    avgpool_op.memory_function = Op.ConcatSliceWrite
    DebugDatabase.add_optimised(concat_op, avgpool_op)
    return avgpool_op