/**
 * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "scheduler.h"
#include "optimizer/analysis/alias_analysis.h"
#include "optimizer/ir/basicblock.h"
#include "compiler_logger.h"

namespace panda::compiler {
/* Instruction scheduling.
 * Current decisions/limitations
 *
 * 1. Scheduler pass is placed immediately before register allocator.
 * 2. It rearranges instructions only inside the basic block, but not between them.
 * 3. No liveness analysis, only calculating dependencies using barrier/users/alias information.
 * 4. No CPU pipeline/resource modeling, only having dependency costs.
 * 5. Forward list scheduling algorithm with standard critical-path-based priority.
 */
bool Scheduler::RunImpl()
{
    COMPILER_LOG(DEBUG, SCHEDULER) << "Run " << GetPassName();
    bool result = false;
    for (auto bb : GetGraph()->GetBlocksRPO()) {
        if (!bb->IsEmpty() && !bb->IsStartBlock()) {
            result |= ScheduleBasicBlock(bb);
        }
    }
    COMPILER_LOG(DEBUG, SCHEDULER) << GetPassName() << " complete";
#ifndef NDEBUG
    GetGraph()->SetSchedulerComplete();
#endif  // NDEBUG
    return result;
}

// Dependency helper function
void Scheduler::AddDep(uint32_t *prio, Inst *from, Inst *to, uint32_t latency, Inst *barrier)
{
    if (from == to) {
        return;
    }
    COMPILER_LOG(DEBUG, SCHEDULER) << "Found dependency " << from->GetId() << " -> " << to->GetId()
                                   << " latency " << latency;
    // Estimate old cycle (without scheduling)
    ocycle_[from] = std::max(ocycle_[from], latency + ocycle_[to]);

    // Update instruction priority - "how high instruction is in dependency tree"
    *prio = std::max(*prio, latency + prio_[to]);

    // Do not add cross-barrier dependencies into deps_
    if (barrier == nullptr || old_[to] > old_[barrier]) {
        if (deps_.at(from).count(to) == 1) {
            uint32_t old_latency = deps_.at(from).at(to);
            if (old_latency >= latency) {
                return;
            }
        } else {
            num_deps_[to]++;
        }
        deps_.at(from)[to] = latency;
    }
}

// Calculate priority and dependencies
bool Scheduler::BuildAllDeps(BasicBlock *bb)
{
    auto marker_holder = MarkerHolder(GetGraph());
    auto mrk = marker_holder.GetMarker();

    oprev_ = 0;
    num_barriers_ = 0;
    max_prio_ = 0;

    static constexpr uint32_t TOO_LONG_BB = 256;
    uint32_t num_inst = 0;
    uint32_t num_between = 0;
    uint32_t num_special = 0;
    Inst *last_barrier = nullptr;
    for (auto inst : bb->InstsSafeReverse()) {
        ProcessInst(inst, mrk, &num_inst, &num_between, &num_special, &last_barrier);
        if (num_special > TOO_LONG_BB || num_between > TOO_LONG_BB) {
            COMPILER_LOG(DEBUG, SCHEDULER) << "Block " << bb->GetId() << " seems too big for scheduling, skipping";
            Cleanup();
            return false;
        }
    }
    return true;
}
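/* Illustrative note (the instructions and latencies below are hypothetical, not taken from a real graph):
 * the block is traversed in reverse, so by the time an instruction is processed all of its users inside
 * the block already have a priority. For a chain v1 -> v2 -> v3 with Latency(v1) == 2 and Latency(v2) == 3,
 * v3 is processed first with prio_[v3] == 0, then AddDep(&prio, v2, v3, 3, ...) gives prio_[v2] == 3, and
 * AddDep(&prio, v1, v2, 2, ...) gives prio_[v1] == 5. The priority is thus the latency length of the
 * critical path from the instruction to the end of its barrier-delimited interval.
 */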
// One instruction deps
void Scheduler::ProcessInst(Inst *inst, Marker mrk, uint32_t *num_inst, uint32_t *num_between,
                            uint32_t *num_special, Inst **last_barrier)
{
    uint32_t prio = 0;
    uint32_t inst_latency = inst->Latency();
    bool barrier = inst->IsBarrier();

    (*num_between)++;
    old_.insert({inst, (*num_inst)++});
    ocycle_.insert({inst, ++oprev_});
    num_deps_.insert({inst, 0U});
    deps_.emplace(inst, GetGraph()->GetLocalAllocator()->Adapter());

    // Dependency to the barrier
    if (*last_barrier != nullptr) {
        AddDep(&prio, inst, *last_barrier, 1U, *last_barrier);
    }

    // Dependency from barrier
    if (barrier) {
        Inst *old_last_barrier = *last_barrier;
        *last_barrier = inst;
        num_barriers_++;
        *num_between = 0;
        for (auto user = inst->GetNext(); user != old_last_barrier; user = user->GetNext()) {
            AddDep(&prio, inst, user, 1U, *last_barrier);
        }
    }

    // Users
    for (auto &user_item : inst->GetUsers()) {
        auto user = user_item.GetInst();
        if (user->IsMarked(mrk)) {
            AddDep(&prio, inst, user, inst_latency, *last_barrier);
        }
    }

    if (inst->IsMemory() || inst->IsRefSpecial()) {
        ProcessMemory(inst, &prio, *last_barrier);
        (*num_special)++;
    }

    if (inst->CanThrow() || inst->IsRuntimeCall() || inst->IsSaveState()) {
        ProcessSpecial(inst, &prio, *last_barrier);
        (*num_special)++;
    }

    inst->SetMarker(mrk);
    prio_.insert({inst, prio});
    max_prio_ = std::max(max_prio_, prio);
    oprev_ = ocycle_[inst];
}

// Memory
void Scheduler::ProcessMemory(Inst *inst, uint32_t *prio, Inst *last_barrier)
{
    if (inst->IsRefSpecial()) {
        loads_.push_back(inst);
        return;
    }
    for (auto mem : stores_) {
        if (GetGraph()->CheckInstAlias(inst, mem) != AliasType::NO_ALIAS) {
            AddDep(prio, inst, mem, 1U, last_barrier);
        }
    }
    if (inst->IsStore()) {
        for (auto mem : loads_) {
            if (mem->IsLoad() && GetGraph()->CheckInstAlias(inst, mem) != AliasType::NO_ALIAS) {
                AddDep(prio, inst, mem, 1U, last_barrier);
            }
        }
        for (auto ct : special_) {
            AddDep(prio, inst, ct, 1U, last_barrier);
        }
        stores_.push_back(inst);
    } else {  // means inst->IsLoad()
        loads_.push_back(inst);
    }
}

// CanThrow or SaveState can't be rearranged, and stores can't be moved over them
void Scheduler::ProcessSpecial(Inst *inst, uint32_t *prio, Inst *last_barrier)
{
    for (auto mem : stores_) {
        AddDep(prio, inst, mem, 1U, last_barrier);
    }
    for (auto ct : special_) {
        AddDep(prio, inst, ct, 1U, last_barrier);
    }
    // 1. SafePoint also has this flag
    // 2. GC triggered inside can poison loaded reference value
    if (inst->IsRuntimeCall()) {
        for (auto mem : loads_) {
            if (mem->GetType() == DataType::REFERENCE) {
                AddDep(prio, inst, mem, 1U, last_barrier);
            }
        }
    }
    // We have to "restore" the BoundsCheckI -> LoadArrayI dependency
    if (inst->GetOpcode() == Opcode::BoundsCheckI) {
        auto value = inst->CastToBoundsCheckI()->GetImm();
        // Add dependencies to loads with the same immediate. There is no easy way to check that the arrays are the same.
        for (auto load : loads_) {
            if (load->GetOpcode() == Opcode::LoadArrayPairI) {
                auto imm = load->CastToLoadArrayPairI()->GetImm();
                if (imm == value || imm + 1 == value) {
                    AddDep(prio, inst, load, 1U, last_barrier);
                }
            } else if (load->GetOpcode() == Opcode::LoadArrayI && load->CastToLoadArrayI()->GetImm() == value) {
                AddDep(prio, inst, load, 1U, last_barrier);
            }
        }
    }
    special_.push_back(inst);
}
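/* Illustrative note (instructions and alias results below are hypothetical):
 * for an interval  s1: StoreArray a, i;  l1: LoadArray b, j;  s2: StoreArray a, k  the reverse traversal
 * sees s2 first. When alias analysis cannot prove NO_ALIAS between s1 and s2, ProcessMemory records the
 * dependency s1 -> s2, so s2 is never scheduled before s1; likewise s1 -> l1 when that pair may alias.
 * Only if NO_ALIAS is proven for l1 and s2 does that pair stay unordered and free to be swapped.
 * CanThrow/SaveState/runtime-call instructions handled by ProcessSpecial additionally get dependencies
 * to every later store (and, for runtime calls, every later reference load), so those cannot be hoisted
 * above them.
 */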
// Rearranges instructions in the basic block using list scheduling algorithm.
bool Scheduler::ScheduleBasicBlock(BasicBlock *bb)
{
    COMPILER_LOG(DEBUG, SCHEDULER) << "Schedule BB " << bb->GetId();
    if (!BuildAllDeps(bb)) {
        return false;
    }

    // Schedule intervals between barriers
    uint32_t cycle = 0;
    uint32_t num_inst = 0;
    Inst *first = nullptr;
    for (auto inst = bb->GetFirstInst(); inst != nullptr; inst = inst->GetNext()) {
        bool barrier = inst->IsBarrier();
        num_inst++;
        inst->ClearMarkers();
        if (first == nullptr) {
            first = inst;
        }
        if (barrier || inst == bb->GetLastInst()) {
            Inst *last = nullptr;
            if (barrier) {
                last = inst->GetPrev();
                num_inst--;
            } else {
                last = inst;
            }
            if (num_inst > 1) {
                cycle += ScheduleInstsBetweenBarriers(first, last);
            } else if (num_inst == 1) {
                ASSERT(first->GetOpcode() != Opcode::LoadPairPart);
                sched_.push_back(first);
                cycle++;
            }
            if (barrier) {
                ASSERT(inst->GetOpcode() != Opcode::LoadPairPart);
                sched_.push_back(inst);
                cycle++;
            }
            num_inst = 0;
            first = nullptr;
        }
    }
    return FinalizeBB(bb, cycle);
}

bool Scheduler::FinalizeBB(BasicBlock *bb, uint32_t cycle)
{
    // Rearrange instructions in basic block according to schedule
    bool result = false;
    bool has_prev = false;
    uint32_t prev;
    for (auto inst : sched_) {
        auto cur = old_[inst];
        if (has_prev && prev - 1 != cur) {
            result = true;
        }
        prev = cur;
        has_prev = true;

        bb->EraseInst(inst);
        bb->AppendInst(inst);
    }

    if (result) {
        COMPILER_LOG(DEBUG, SCHEDULER) << "Stats for block " << bb->GetId() << ": old cycles = " << oprev_
                                       << ", num barriers = " << num_barriers_ << ", critical path = " << max_prio_
                                       << ", scheduled = " << cycle;
        GetGraph()->GetEventWriter().EventScheduler(bb->GetId(), bb->GetGuestPc(), oprev_, num_barriers_, max_prio_,
                                                    cycle);
    }

    Cleanup();
    return result;
}

void Scheduler::Cleanup()
{
    sched_.clear();
    loads_.clear();
    stores_.clear();
    special_.clear();
    old_.clear();
    ocycle_.clear();
    num_deps_.clear();
    prio_.clear();
    deps_.clear();
}
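/* Illustrative walk-through of the list scheduling loop below (values are hypothetical):
 * with three independent single-cycle instructions a, b, c and prio_[a] > prio_[b] > prio_[c],
 * all three enter 'waiting' with asap == 1, move to 'ready' at cycle 1, and are emitted in
 * priority order a, b, c at cycles 1, 2, 3. An instruction whose producers are still in flight
 * keeps a larger asap value and stays in 'waiting' until the cycle counter reaches it.
 */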
// Rearranges instructions between [first..last] inclusive, none of them are barriers.
uint32_t Scheduler::ScheduleInstsBetweenBarriers(Inst *first, Inst *last)
{
    COMPILER_LOG(DEBUG, SCHEDULER) << "SchedBetween " << first->GetId() << " and " << last->GetId();

    // Compare function for 'waiting' queue
    auto cmp_asap = [this](Inst *left, Inst *right) {
        return asap_[left] > asap_[right] || (asap_[left] == asap_[right] && old_[left] < old_[right]);
    };
    // Queue of instructions whose dependencies are already scheduled but which are not finished yet
    SchedulerPriorityQueue waiting(cmp_asap, GetGraph()->GetLocalAllocator()->Adapter());

    // Compare function for 'ready' queue
    auto cmp_prio = [this](Inst *left, Inst *right) {
        return prio_[left] < prio_[right] || (prio_[left] == prio_[right] && old_[left] < old_[right]);
    };
    // Queue of ready instructions
    SchedulerPriorityQueue ready(cmp_prio, GetGraph()->GetLocalAllocator()->Adapter());

    // Initialization, add leaves into the 'waiting' queue
    uint32_t num_inst = 0;
    for (auto inst = first; inst != last->GetNext(); inst = inst->GetNext()) {
        asap_.insert({inst, 1U});
        if (num_deps_[inst] == 0) {
            COMPILER_LOG(DEBUG, SCHEDULER) << "Queue wait add " << inst->GetId();
            waiting.push(inst);
        }
        num_inst++;
    }

    // Scheduling
    uint32_t cycle = 1;
    while (num_inst > 0) {
        if (ready.empty()) {
            ASSERT(!waiting.empty());
            uint32_t nearest = asap_[waiting.top()];
            // Skipping cycles where we can't schedule any instruction
            if (nearest > cycle) {
                cycle = nearest;
            }
        }

        // Move from 'waiting' to 'ready'
        while (!waiting.empty()) {
            Inst *soonest = waiting.top();
            if (asap_[soonest] <= cycle) {
                waiting.pop();
                COMPILER_LOG(DEBUG, SCHEDULER) << "Queue ready moving " << soonest->GetId();
                ready.push(soonest);
            } else {
                break;
            }
        }
        ASSERT(!ready.empty());

        // Schedule top 'ready' instruction (together with glued ones, when necessary)
        num_inst -= SchedWithGlued(ready.top(), &waiting, cycle++);
        ready.pop();
    }

    // Cleanup
    asap_.clear();
    return cycle;
}

uint32_t Scheduler::SchedWithGlued(Inst *inst, SchedulerPriorityQueue *waiting, uint32_t cycle)
{
    uint32_t amount = 0;
    // Compare function for 'now' queue
    auto cmp_old = [this](Inst *left, Inst *right) { return old_[left] < old_[right]; };
    // Queue of instructions to schedule at the current cycle
    SchedulerPriorityQueue now(cmp_old, GetGraph()->GetLocalAllocator()->Adapter());
    // Add inst into 'now'
    ASSERT(now.empty());
    now.push(inst);

    // Add glued instructions into 'now'
    if (inst->GetOpcode() == Opcode::LoadArrayPair || inst->GetOpcode() == Opcode::LoadArrayPairI) {
        for (auto &user_item : inst->GetUsers()) {
            auto user = user_item.GetInst();
            ASSERT(user->GetOpcode() == Opcode::LoadPairPart);
            now.push(user);
        }
    }

    [[maybe_unused]] constexpr auto MAX_NOW_SIZE = 3;
    ASSERT(now.size() <= MAX_NOW_SIZE);

    // Schedule them
    while (!now.empty()) {
        auto cur = now.top();
        now.pop();

        COMPILER_LOG(DEBUG, SCHEDULER) << "Scheduling " << cur->GetId() << " at cycle " << cycle;

        // Adjust all dependent instructions
        for (auto pair : deps_.at(cur)) {
            // Adjust asap
            uint32_t asap = asap_[pair.first];
            asap = std::max(asap, cycle + pair.second);
            asap_[pair.first] = asap;

            // Adjust num_deps
            uint32_t num_deps = num_deps_[pair.first];
            ASSERT(num_deps > 0);
            num_deps--;
            num_deps_[pair.first] = num_deps;

            // Glued instructions are scheduled immediately, so they should not go into the queue
            if (num_deps == 0 && pair.first->GetOpcode() != Opcode::LoadPairPart) {
                COMPILER_LOG(DEBUG, SCHEDULER) << "Queue wait add " << pair.first->GetId();
                waiting->push(pair.first);
            }
        }

        // Add into schedule
        sched_.push_back(cur);
        amount++;
    }

    return amount;
}
}  // namespace panda::compiler