# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Vela separates CPU operations and NPU operations into separate internal subgraphs. The CPU operations are left
# untouched in the final output.
#
# Vela does this by identifying NPU passes and pulling them out from the main CPU graph into separate subgraphs, invoked
# by NpuOp operations. Later, Vela generates command streams and compressed weight streams for the NPU subgraphs and
# attaches them to the NpuOp. This encapsulates everything the NPU subgraph is supposed to do.
import numpy as np

from .nn_graph import Pass
from .nn_graph import PassPlacement
from .nn_graph import Subgraph
from .operation import CustomType
from .operation import NpuBlockType
from .operation import Op
from .operation import Operation


def make_npu_call_op_pass(npu_subgraph):
    op = Operation(Op.CustomNpuOp, "call_" + npu_subgraph.name)
    op.attrs["subgraph"] = npu_subgraph
    op.attrs["custom_type"] = CustomType.NpuOp
    ps = Pass(op.name, PassPlacement.MemoryOnly, False, NpuBlockType.Default)
    ps.ops = [op]
    ps.primary_op = op
    op.scheduled_pass = ps

    # Inputs and outputs filled in later as we cut the graphs
    return ps


def switch_tensor_for_op(op, orig_tens, new_tens):

    op.inputs = [new_tens if tens == orig_tens else tens for tens in op.inputs]
    op.outputs = [new_tens if tens == orig_tens else tens for tens in op.outputs]

    ps = op.scheduled_pass
    if ps is None:
        return

    ps.inputs = [new_tens if tens == orig_tens else tens for tens in ps.inputs]
    ps.outputs = [new_tens if tens == orig_tens else tens for tens in ps.outputs]

    if ps.ifm_tensor == orig_tens:
        ps.ifm_tensor = new_tens
    if ps.ifm2_tensor == orig_tens:
        ps.ifm2_tensor = new_tens
    if ps.ofm_tensor == orig_tens:
        ps.ofm_tensor = new_tens
    if ps.weight_tensor == orig_tens:
        ps.weight_tensor = new_tens
    if ps.scale_tensor == orig_tens:
        ps.scale_tensor = new_tens


def rewrite_tensor_cpu_producer_npu_consumers(
    orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
):
    is_const = orig_tens.ops[0].type == Op.Const
    new_tens = orig_tens.clone("_npu")

    op_type = Op.SubgraphInput
    if is_const:
        op_type = Op.Const
    op = Operation(op_type, orig_tens.name + "_input")
    op.scheduled_pass = startup_init_ps
    op.set_output_tensor(new_tens)
    startup_init_ps.ops.append(op)
    startup_init_ps.outputs.append(new_tens)

    if not is_const:
        call_ps.inputs.append(orig_tens)
        call_ps.primary_op.inputs.append(orig_tens)

    # Elementwise op can not overwrite ifm if input is used by many consumers
    if orig_tens in cpu_subgraph.input_tensors and len(orig_tens.consumers()) > 1:
        new_tens.ifm_write_protected = True

    # Elementwise op can not overwrite ifm if tensor is used as output from sub graph
    if orig_tens in cpu_subgraph.output_tensors:
        new_tens.ifm_write_protected = True

    for op in list(orig_tens.consumers()):
        if op is None:
            continue  # Subgraph consumers handled separately.
        ps = op.scheduled_pass
        if subgraph_for_pass[ps] == npu_subgraph:
            switch_tensor_for_op(op, orig_tens, new_tens)
            orig_tens.consumer_list.remove(op)
            new_tens.consumer_list.append(op)

    # Deal with output tensors for the NPU graph. These are special.
    npu_subgraph.output_tensors = [new_tens if tens == orig_tens else tens for tens in npu_subgraph.output_tensors]


def rewrite_tensor_npu_producer_cpu_consumers(
    orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
):

    new_tens = orig_tens.clone("")
    orig_tens.name = orig_tens.name + "_cpu"
    npu_subgraph.output_tensors.append(orig_tens)

    call_ps.outputs.append(new_tens)
    call_ps.primary_op.outputs.append(new_tens)
    new_tens.ops = [call_ps.primary_op]

    for op in list(orig_tens.consumers()):
        if op is None:
            continue  # Subgraph consumers handled separately.
        ps = op.scheduled_pass
        if subgraph_for_pass[ps] != npu_subgraph:
            switch_tensor_for_op(op, orig_tens, new_tens)
            orig_tens.consumer_list.remove(op)
            new_tens.consumer_list.append(op)

    # Deal with output tensors for the CPU graph. These are special.
    cpu_subgraph.output_tensors = [new_tens if tens == orig_tens else tens for tens in cpu_subgraph.output_tensors]


def extract_subgraph(nng, orig_sg, arch):
    assert orig_sg.placement == PassPlacement.Cpu

    passes = list(orig_sg.passes)
    place_vec = np.array([ps.placement for ps in passes])
    place_vec[
        place_vec == PassPlacement.StartupInit
    ] = PassPlacement.Cpu  # Keep the startup init pass on the CPU, we'll make new ones to move onto NPU.

    # MemoryOnly passes that are either squeezed between NPU passes or on the boundary of NPU and CPU
    # passes should be assigned to the NPU, unless they are assigned to run on CPU explicitly.

    # Forward, then backwards
    for is_reversed in range(2):
        last_place = PassPlacement.Cpu
        seq = enumerate(place_vec)
        if is_reversed:
            seq = reversed(list(seq))
        for idx, place in seq:
            if place == PassPlacement.MemoryOnly and passes[idx].ops[0].run_on_npu:
                if last_place == PassPlacement.Npu:
                    place = PassPlacement.Npu
                    place_vec[idx] = place

            if place != PassPlacement.MemoryOnly:
                last_place = place

    # Anything left, assign to the CPU.
    place_vec[place_vec == PassPlacement.MemoryOnly] = PassPlacement.Cpu

    if np.all(place_vec == PassPlacement.Cpu):
        return []  # Nothing to do

    # Create the subgraphs and split passes between them

    new_subgraphs = []
    split_count = 0
    subgraph_for_pass = {}
    orig_sg.passes = []
    call_pass = {}
    startup_init_passes = {}

    last_place = PassPlacement.Cpu
    curr_sg = orig_sg

    for idx, place in enumerate(place_vec):
        if place != last_place:
            if place == PassPlacement.Npu:
                split_count += 1
                curr_sg = Subgraph("%s_split_%d" % (orig_sg.name, split_count), PassPlacement.Npu)
                new_subgraphs.append(curr_sg)
                call_ps = make_npu_call_op_pass(curr_sg)
                subgraph_for_pass[call_ps] = orig_sg
                orig_sg.passes.append(call_ps)
                call_pass[curr_sg] = call_ps

                startup_init_ps = Pass(
                    curr_sg.name + "_startup_init", PassPlacement.StartupInit, False, NpuBlockType.Default
                )
                curr_sg.passes.append(startup_init_ps)
                startup_init_passes[curr_sg] = startup_init_ps
                subgraph_for_pass[startup_init_ps] = curr_sg

            else:
                curr_sg = orig_sg
            last_place = place
        ps = passes[idx]
        subgraph_for_pass[ps] = curr_sg
        curr_sg.passes.append(ps)

    # Rewrite tensors to fix up graphs.

    for curr_sg in new_subgraphs:
        for ps in curr_sg.passes:
            for tens in ps.inputs:
                source_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.ops]
                assert len(source_sgs) >= 0
                producer_sg = source_sgs[0]
                for sg in source_sgs:
                    assert sg == producer_sg  # All need to be the same.

                if producer_sg != curr_sg:
                    assert (
                        producer_sg == orig_sg
                    )  # Because we go in-order, all the producers must be the original graph.
                    rewrite_tensor_cpu_producer_npu_consumers(
                        tens, call_pass[curr_sg], startup_init_passes[curr_sg], curr_sg, orig_sg, subgraph_for_pass
                    )

            for tens in ps.outputs:

                dest_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.consumers() if op is not None]
                need_rewrite = False
                for sg in dest_sgs:
                    if sg != curr_sg:
                        need_rewrite = True
                        break
                if tens in orig_sg.output_tensors:
                    need_rewrite = True

                if need_rewrite:
                    rewrite_tensor_npu_producer_cpu_consumers(
                        tens, call_pass[curr_sg], startup_init_passes[curr_sg], curr_sg, orig_sg, subgraph_for_pass
                    )

        for tens in curr_sg.output_tensors:
            # ofm can depend on multiple ops. These ops can be divided into different NPU
            # nodes due to CPU nodes. If that is the case the ofm must be NHWC.
            tens.needs_linear_format = True

    return new_subgraphs


def extract_npu_subgraphs(nng, arch):

    nng.refresh_after_modification()

    for sg in list(nng.subgraphs):
        if sg.placement == PassPlacement.Cpu:
            new_subgraphs = extract_subgraph(nng, sg, arch)
            nng.subgraphs += new_subgraphs

    nng.refresh_after_modification()
    nng.prune_startup_init_pass()

    for sg in nng.subgraphs:
        sg.build_pass_links()