/*
 * Copyright (c) 2022-2024 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"

#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
#include "support/Cast.h"

#include <algorithm>

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
namespace
{
/** Holder of any auxiliary @ref CLTensor required by a @ref GpuWorkloadSourceCode.
 *
 * @note The tensors are not allocated by default, and require the user to explicitly allocate them using the associated @ref TensorInfo and @ref AuxMemoryInfo
 *
 * @note This data holder must remain valid until the @ref ClWorkloadRuntime that uses it is out of scope
 */
class ClAuxTensors
{
public:
    /** A view of a single auxiliary tensor and its associated @ref TensorInfo and @ref AuxMemoryInfo */
    struct DataView
    {
        DataView() = default;
        DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
            : tensor{tensor}, tensor_info{tensor_info}, memory_info{memory_info}
        {
        }
        ~DataView()                                = default;
        DataView(const DataView &other)            = default;
        DataView &operator=(const DataView &other) = default;
        DataView(DataView &&other)                 = default;
        DataView &operator=(DataView &&other)      = default;

        CLTensor     *tensor{};      /**< Pointer to the auxiliary tensor */
        TensorInfo    tensor_info{}; /**< Associated tensor info */
        AuxMemoryInfo memory_info{}; /**< Memory requirement */
    };

    /** Get views of all auxiliary tensors. This is mainly used for allocating the auxiliary tensors. */
    std::vector<DataView> get_tensors()
    {
        return _tensors;
    }
    std::vector<DataView> get_tensors() const
    {
        return _tensors;
    }

    friend Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code);

private:
    /** Add auxiliary tensor.
     *
     * @param[in] tensor_info @ref ITensorInfo of the auxiliary tensor
     * @param[in] memory_info Memory requirements of the auxiliary tensor
     *
     * @return CLTensor* Pointer to the corresponding tensor if successfully added, otherwise nullptr
     */
    CLTensor *add_aux_tensor(const ITensorInfo &tensor_info, const AuxMemoryInfo &aux_memory_info)
    {
        const auto t_id             = tensor_info.id();
        auto       find_tensor_pair = _owned_tensors.find(t_id);
        if (find_tensor_pair != _owned_tensors.end())
        {
            return find_tensor_pair->second.get();
        }
        else
        {
            auto tensor        = std::make_unique<CLTensor>();
            auto inserted_pair = _owned_tensors.emplace(t_id, std::move(tensor)).first;
            auto new_tensor    = inserted_pair->second.get();
            _tensors.emplace_back(new_tensor, tensor_info, aux_memory_info);
            return new_tensor;
        }
    }

    std::map<ITensorInfo::Id, std::unique_ptr<CLTensor>> _owned_tensors{};
    std::vector<DataView>                                _tensors{};
};
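
// A minimal allocation sketch (illustrative only; `runtime` stands for an already
// configured ClWorkloadRuntime). Each auxiliary tensor is initialized from its
// associated TensorInfo and AuxMemoryInfo before being allocated:
//
//   for (auto &data : runtime.get_auxiliary_tensors())
//   {
//       CLTensor     *tensor      = std::get<0>(data);
//       TensorInfo    info        = std::get<1>(data);
//       AuxMemoryInfo aux_mem_req = std::get<2>(data);
//       tensor->allocator()->init(info, aux_mem_req.alignment);
//       tensor->allocator()->allocate();
//   }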

/** Construct auxiliary tensors required by @ref GpuWorkloadSourceCode
 *
 * @note This is the only recommended way for users to create @ref ClAuxTensors
 *
 * @param[out] aux_tensors Auxiliary tensors required by the workload code
 * @param[in]  code        @ref GpuWorkloadSourceCode which all tensors bind to
 *
 * @return Status
 */
Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code)
{
    for (auto t_id : code.tensors())
    {
        // Get tensor object
        const auto workload_arg  = code.query_tensor(t_id);
        ICLTensor *tensor_object = nullptr;
        if (workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary)
        {
            // Create aux tensor CLTensor object
            const TensorInfo tensor_info = *workload_arg->tensor_info();
            ARM_COMPUTE_ERROR_ON(tensor_info.id() != t_id);
            const auto aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info;
            tensor_object              = aux_tensors->add_aux_tensor(tensor_info, aux_memory_info);

            if (tensor_object == nullptr)
            {
                return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Failed to construct an auxiliary tensor");
            }
        }
    }
    return Status{};
}

/** A fast tensor lookup table for runtime tensor object retrieval */
class ClTensorLUT
{
public:
    /** Find the tensor pack associated with the @ref UnitWorkloadId @p uwk_id
     *
     * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
     *
     * @return ITensorPack* Tensor pack if found, otherwise nullptr
     */
    ITensorPack *find_tensor_pack(UnitWorkloadId uwk_id)
    {
        auto tensor_pack = _tensor_packs.find(uwk_id);
        if (tensor_pack != _tensor_packs.end())
        {
            return &(tensor_pack->second);
        }
        return nullptr;
    }

    /** Get the tensor pack associated with @p uwk_id. Throws an exception if it cannot be found.
     *
     * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
     *
     * @return ITensorPack& Tensor pack associated with @p uwk_id
     */
    ITensorPack &get_tensor_pack(UnitWorkloadId uwk_id)
    {
        return _tensor_packs.at(uwk_id);
    }

    friend Status create_tensor_lut(ClTensorLUT                   *tensor_lut,
                                    const GpuWorkloadSourceCode   &code,
                                    const std::vector<CLTensor *> &user_tensors,
                                    const ClAuxTensors            &aux_tensors);

private:
    /** Add a tensor pack and associate it with @ref UnitWorkloadId @p uwk_id
     *
     * @param[in] uwk_id      @ref UnitWorkloadId associated with the tensor pack
     * @param[in] tensor_pack Tensor pack to be added
     */
    void add_tensor_pack(UnitWorkloadId uwk_id, const ITensorPack &tensor_pack)
    {
        _tensor_packs[uwk_id] = tensor_pack;
    }

    std::map<UnitWorkloadId, ITensorPack> _tensor_packs{};
};

/** Create a fast tensor lookup table for runtime tensor retrieval
 *
 * @param[out] tensor_lut   @ref ClTensorLUT used by the runtime to feed tensor memories to underlying kernels
 * @param[in]  code         @ref GpuWorkloadSourceCode which all tensors bind to
 * @param[in]  user_tensors User tensors
 * @param[in]  aux_tensors  Auxiliary tensors required by the workload code
 *
 * @return Status
 */
Status create_tensor_lut(ClTensorLUT                   *tensor_lut,
                         const GpuWorkloadSourceCode   &code,
                         const std::vector<CLTensor *> &user_tensors,
                         const ClAuxTensors            &aux_tensors)
{
    // Combine user tensors and aux tensors
    std::map<ITensorInfo::Id, CLTensor *> tensor_map;
    for (auto tensor : user_tensors)
    {
        const auto t_id = tensor->info()->id();

        if (tensor_map.find(t_id) != tensor_map.end())
        {
            // In case of elementwise in-place: give another Id to the In/Out tensor when passed again
            std::vector<ITensorInfo::Id> ids;
            for (auto &t : tensor_map)
            {
                ids.push_back(t.first);
            }
            const ITensorInfo::Id new_id = *std::max_element(ids.begin(), ids.end()) + 1;
            tensor_map[new_id]           = tensor;
        }
        else
        {
            tensor_map[t_id] = tensor;
        }
    }
    for (const auto &data : aux_tensors.get_tensors())
    {
        const auto t_id   = data.tensor_info.id();
        const auto tensor = data.tensor;
        if (tensor_map.find(t_id) != tensor_map.end())
        {
            return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
        }
        tensor_map[t_id] = tensor;
    }

    // Add tensor objects into corresponding tensor packs
    for (auto id_tensor : tensor_map)
    {
        const auto t_id          = id_tensor.first;
        const auto tensor_object = id_tensor.second;
        if (tensor_object == nullptr)
        {
            return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs");
        }
        if (tensor_object->allocator()->info().total_size() == 0U)
        {
            return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "No allocated memory found in tensor");
        }

        for (auto uwk_id : code.get_unit_workloads_from_tensor(t_id))
        {
            ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id);
            if (tensor_pack == nullptr)
            {
                tensor_lut->add_tensor_pack(uwk_id, ITensorPack{{t_id, tensor_object}});
            }
            else
            {
                tensor_pack->add_tensor(t_id, tensor_object);
            }
        }
    }

    return Status{};
}
} // namespace

struct ClWorkloadRuntime::Implementation
{
    std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels{};
    std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels_prep{};
    bool                                                       _is_configured{false};
    bool                                                       _is_prepared{false};
    ClTensorLUT                                                _tensor_lut{};
    ClAuxTensors                                               _aux_tensors{};
    GpuWorkloadSourceCode                                      _source_code{};
};

ClWorkloadRuntime::ClWorkloadRuntime() : _impl{std::make_unique<Implementation>()}
{
}

ClWorkloadRuntime::~ClWorkloadRuntime() = default;

ClWorkloadRuntime::ClWorkloadRuntime(ClWorkloadRuntime &&)            = default;
ClWorkloadRuntime &ClWorkloadRuntime::operator=(ClWorkloadRuntime &&) = default;
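
// Typical end-to-end call sequence (a sketch; the sketch construction and tensor
// names below are illustrative, not part of this file):
//
//   GpuWorkloadSketch sketch{&context};
//   // ... record fused operators into the sketch ...
//   ClWorkloadRuntime runtime;
//   runtime.configure(sketch);
//   // ... allocate user tensors and the auxiliary tensors (see above) ...
//   runtime.run({&t_in, &t_weight, &t_out});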

Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch)
{
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(_impl->_is_configured, "ClWorkloadRuntime cannot be re-configured");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL,
                                    "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch");
    // Generate source code
    _impl->_source_code = sketch.implementation().generate_source_code();
    // Configure unit workload from source code
    for (auto uwk_id : _impl->_source_code.unit_workloads())
    {
        const auto work  = _impl->_source_code.query_unit_workload(uwk_id);
        const auto stage = work.stage().stage;
        auto       k     = std::make_unique<ClKernelRuntime>();
        k->configure(*sketch.gpu_context()->cl_compile_context(), work.code());

        switch (stage)
        {
            case UnitWorkloadStage::Stage::Run:
            {
                _impl->_kernels.emplace(work.id(), std::move(k));
                break;
            }
            case UnitWorkloadStage::Stage::Prepare:
            {
                _impl->_kernels_prep.emplace(work.id(), std::move(k));
                break;
            }
            default:
            {
                ARM_COMPUTE_ERROR("Invalid unit workload stage");
            }
        }
    }
    // Create auxiliary tensor objects and propagate any failure to the caller
    const auto aux_st = create_aux_tensors(&_impl->_aux_tensors, _impl->_source_code);
    ARM_COMPUTE_RETURN_ON_ERROR(aux_st);

    _impl->_is_configured = true;
    return Status{};
}
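
// Note: kernels in the Prepare stage are enqueued exactly once (guarded by
// _is_prepared below), typically for one-off work such as tensor transformations,
// whereas kernels in the Run stage are enqueued on every run().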
void ClWorkloadRuntime::prepare()
{
    if (!_impl->_is_prepared)
    {
        for (auto &id_kernel_pair : _impl->_kernels_prep)
        {
            const bool flush_queue = false;
            const auto uwk_id      = id_kernel_pair.first;
            auto       kernel      = id_kernel_pair.second.get();
            CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
        }

        _impl->_is_prepared = true;
    }
}

Status ClWorkloadRuntime::run(const std::vector<CLTensor *> &tensors)
{
    // Need to create the tensor lut in every run, unless the user can guarantee the binding remains fixed,
    // in which case the lut can be cached during prepare
    const auto st = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors);
    ARM_COMPUTE_RETURN_ON_ERROR(st);
    prepare();
    for (auto &id_kernel_pair : _impl->_kernels)
    {
        // The queue is not flushed between kernels; flushing on the last kernel could be enabled via flush_queue
        const bool flush_queue = false;
        const auto uwk_id      = id_kernel_pair.first;
        auto       kernel      = id_kernel_pair.second.get();
        CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
    }
    return Status{};
}

std::vector<std::tuple<CLTensor *, TensorInfo, AuxMemoryInfo>> ClWorkloadRuntime::get_auxiliary_tensors()
{
    std::vector<std::tuple<CLTensor *, TensorInfo, AuxMemoryInfo>> aux_tensors;
    for (const auto &data : _impl->_aux_tensors.get_tensors())
    {
        aux_tensors.emplace_back(data.tensor, data.tensor_info, data.memory_info);
    }
    return aux_tensors;
}
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute