/*
 * Copyright (C) 2020 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"
#include "bi_print.h"
#include "bi_generated_pack.h"

#define RETURN_PACKED(str) { \
        uint64_t temp = 0; \
        memcpy(&temp, &str, sizeof(str)); \
        return temp; \
}

/* This file contains the final passes of the compiler. Running after
 * scheduling and RA, the IR is now finalized, so we need to emit it to actual
 * bits on the wire (as well as fix up branches) */

static uint64_t
bi_pack_header(bi_clause *clause, bi_clause *next_1, bi_clause *next_2, bool tdd)
{
        /* next_dependencies are the union of the dependencies of successors'
         * dependencies */

        unsigned dependency_wait = next_1 ? next_1->dependencies : 0;
        dependency_wait |= next_2 ? next_2->dependencies : 0;

        struct bifrost_header header = {
                .flow_control =
                        (next_1 == NULL) ? BIFROST_FLOW_END :
                        clause->flow_control,
                .terminate_discarded_threads = tdd,
                .next_clause_prefetch = clause->next_clause_prefetch,
                .staging_barrier = clause->staging_barrier,
                .staging_register = clause->staging_register,
                .dependency_wait = dependency_wait,
                .dependency_slot = clause->scoreboard_id,
                .message_type = clause->message_type,
                .next_message_type = next_1 ? next_1->message_type : 0,
                .suppress_inf = true,
                .suppress_nan = true,
        };

        uint64_t u = 0;
        memcpy(&u, &header, sizeof(header));
        return u;
}

/* The uniform/constant slot allows loading a contiguous 64-bit immediate or
 * pushed uniform per bundle. Figure out which one we need in the bundle (the
 * scheduler needs to ensure we only have one type per bundle), validate
 * everything, and rewrite away the register/uniform indices to use 3-bit
 * sources directly. */
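/* Illustrative example (not in the original source): suppose a clause
 * constant is 0x1122334455667788. An instruction immediate of 0x5566778F
 * matches the bottom word, since the low nibble is carried in the tuple's
 * fau_idx field rather than in the constant itself, and is read as FAU_LO.
 * An immediate of exactly 0x11223344 matches the top word and is read as
 * FAU_HI. bi_lookup_constant below implements exactly this matching. */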
static unsigned
bi_lookup_constant(bi_clause *clause, uint32_t cons, bool *hi)
{
        for (unsigned i = 0; i < clause->constant_count; ++i) {
                /* Try to apply to top or to bottom */
                uint64_t top = clause->constants[i];

                if (cons == ((uint32_t) top | (cons & 0xF)))
                        return i;

                if (cons == (top >> 32ul)) {
                        *hi = true;
                        return i;
                }
        }

        unreachable("Invalid constant accessed");
}

static unsigned
bi_constant_field(unsigned idx)
{
        assert(idx <= 5);

        const unsigned values[] = {
                4, 5, 6, 7, 2, 3
        };

        return values[idx] << 4;
}

static bool
bi_assign_fau_idx_single(bi_registers *regs,
                         bi_clause *clause,
                         bi_instruction *ins,
                         bool assigned,
                         bool fast_zero)
{
        if (!ins)
                return assigned;

        if (ins->type == BI_BRANCH && clause->branch_constant) {
                /* By convention branch constant is last */
                unsigned idx = clause->constant_count - 1;

                /* We can only jump to clauses which are qword aligned so the
                 * bottom 4-bits of the offset are necessarily 0 */
                unsigned lo = 0;

                /* Build the constant */
                unsigned C = bi_constant_field(idx) | lo;

                if (assigned && regs->fau_idx != C)
                        unreachable("Mismatched fau_idx: branch");

                regs->fau_idx = C;
                return true;
        }

        bi_foreach_src(ins, s) {
                if (s == 0 && (ins->type == BI_LOAD_VAR_ADDRESS || ins->type == BI_LOAD_ATTR)) continue;
                if (s == 1 && (ins->type == BI_BRANCH)) continue;

                if (ins->src[s] & BIR_INDEX_CONSTANT) {
                        /* Let direct addresses through */
                        if (ins->type == BI_LOAD_VAR)
                                continue;

                        bool hi = false;
                        uint32_t cons = bi_get_immediate(ins, s);
                        unsigned idx = bi_lookup_constant(clause, cons, &hi);
                        unsigned lo = clause->constants[idx] & 0xF;
                        unsigned f = bi_constant_field(idx) | lo;

                        if (assigned && regs->fau_idx != f)
                                unreachable("Mismatched uniform/const field: imm");

                        regs->fau_idx = f;
                        ins->src[s] = BIR_INDEX_PASS | (hi ? BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO);
                        assigned = true;
                } else if (ins->src[s] & BIR_INDEX_ZERO && (ins->type == BI_LOAD_UNIFORM || ins->type == BI_LOAD_VAR)) {
                        /* XXX: HACK UNTIL WE HAVE HI MATCHING DUE TO OVERFLOW XXX */
                        ins->src[s] = BIR_INDEX_PASS | BIFROST_SRC_FAU_HI;
                } else if (ins->src[s] & BIR_INDEX_ZERO && !fast_zero) {
                        /* FMAs have a fast zero slot, ADD needs to use the
                         * uniform/const slot's special 0 mode handled here */
                        unsigned f = 0;

                        if (assigned && regs->fau_idx != f)
                                unreachable("Mismatched uniform/const field: 0");

                        regs->fau_idx = f;
                        ins->src[s] = BIR_INDEX_PASS | BIFROST_SRC_FAU_LO;
                        assigned = true;
                } else if (ins->src[s] & BIR_INDEX_ZERO && fast_zero) {
                        ins->src[s] = BIR_INDEX_PASS | BIFROST_SRC_STAGE;
                } else if (ins->src[s] & BIR_INDEX_BLEND) {
                        unsigned rt = ins->blend_location;

                        assert(rt <= 7);
                        assert((ins->src[s] & ~BIR_SPECIAL) == BIFROST_SRC_FAU_HI ||
                               (ins->src[s] & ~BIR_SPECIAL) == BIFROST_SRC_FAU_LO);
                        ins->src[s] = BIR_INDEX_PASS | (ins->src[s] & ~BIR_SPECIAL);

                        if (assigned && regs->fau_idx != (8 | rt))
                                unreachable("Mismatched FAU index");

                        regs->fau_idx = 8 | rt;
                        assigned = true;
                } else if (ins->src[s] & BIR_INDEX_UNIFORM) {
                        unreachable("Push uniforms not implemented yet");
                }
        }

        return assigned;
}

static void
bi_assign_fau_idx(bi_clause *clause, bi_bundle *bundle)
{
        bool assigned =
                bi_assign_fau_idx_single(&bundle->regs, clause, bundle->fma, false, true);

        bi_assign_fau_idx_single(&bundle->regs, clause, bundle->add, assigned, false);
}

/* Assigns a slot for reading, before anything is written */

static void
bi_assign_slot_read(bi_registers *regs, unsigned src)
{
        /* We only assign for registers */
        if (!(src & BIR_INDEX_REGISTER))
                return;

        unsigned reg = src & ~BIR_INDEX_REGISTER;

        /* Check if we already assigned the slot */
        for (unsigned i = 0; i <= 1; ++i) {
                if (regs->slot[i] == reg && regs->enabled[i])
                        return;
        }

        if (regs->slot[2] == reg && regs->slot23.slot2 == BIFROST_OP_READ)
                return;

        /* Assign it now */

        for (unsigned i = 0; i <= 1; ++i) {
                if (!regs->enabled[i]) {
                        regs->slot[i] = reg;
                        regs->enabled[i] = true;
                        return;
                }
        }

        if (!regs->slot23.slot3) {
                regs->slot[2] = reg;
                regs->slot23.slot2 = BIFROST_OP_READ;
                return;
        }

        bi_print_slots(regs, stderr);
        unreachable("Failed to find a free slot for src");
}
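/* Worked example (illustrative, not from the original source): a bundle
 * whose ADD op reads r4 and r5 gets slot[0] = 4 and slot[1] = 5 via
 * bi_assign_slot_read above; if the previous bundle's FMA wrote r6, that
 * write lands in slot 3 (or slot 2 when slot 3 is taken) in
 * bi_assign_slots below. Slots 0/1 only ever read; slots 2/3 may read or
 * write, as tracked in slot23. */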
static bi_registers
bi_assign_slots(bi_bundle *now, bi_bundle *prev)
{
        /* We assign slots for the main register mechanism. Special ops
         * use the data registers, which has its own mechanism entirely
         * and thus gets skipped over here. */

        unsigned read_dreg = now->add &&
                bi_class_props[now->add->type] & BI_DATA_REG_SRC;

        unsigned write_dreg = prev->add &&
                bi_class_props[prev->add->type] & BI_DATA_REG_DEST;

        /* First, assign reads */

        if (now->fma)
                bi_foreach_src(now->fma, src)
                        bi_assign_slot_read(&now->regs, now->fma->src[src]);

        if (now->add) {
                bi_foreach_src(now->add, src) {
                        if (!(src == 0 && read_dreg))
                                bi_assign_slot_read(&now->regs, now->add->src[src]);
                }
        }

        /* Next, assign writes */

        if (prev->add && prev->add->dest & BIR_INDEX_REGISTER && !write_dreg) {
                now->regs.slot[3] = prev->add->dest & ~BIR_INDEX_REGISTER;
                now->regs.slot23.slot3 = BIFROST_OP_WRITE;
        }

        if (prev->fma && prev->fma->dest & BIR_INDEX_REGISTER) {
                unsigned r = prev->fma->dest & ~BIR_INDEX_REGISTER;

                if (now->regs.slot23.slot3) {
                        /* Scheduler constraint: cannot read 3 and write 2 */
                        assert(!now->regs.slot23.slot2);
                        now->regs.slot[2] = r;
                        now->regs.slot23.slot2 = BIFROST_OP_WRITE;
                } else {
                        now->regs.slot[3] = r;
                        now->regs.slot23.slot3 = BIFROST_OP_WRITE;
                        now->regs.slot23.slot3_fma = true;
                }
        }

        return now->regs;
}

static enum bifrost_reg_mode
bi_pack_register_mode(bi_registers r)
{
        /* Handle idle special case for first instructions */
        if (r.first_instruction && !(r.slot23.slot2 | r.slot23.slot3))
                return BIFROST_IDLE_1;

        /* Otherwise, use the LUT */
        for (unsigned i = 0; i < ARRAY_SIZE(bifrost_reg_ctrl_lut); ++i) {
                if (memcmp(bifrost_reg_ctrl_lut + i, &r.slot23, sizeof(r.slot23)) == 0)
                        return i;
        }

        bi_print_slots(&r, stderr);
        unreachable("Invalid slot assignment");
}

static uint64_t
bi_pack_registers(bi_registers regs)
{
        enum bifrost_reg_mode mode = bi_pack_register_mode(regs);
        struct bifrost_regs s = { 0 };
        uint64_t packed = 0;

        /* Need to pack the 5-bit mode into a 4-bit field. The decoder moves
         * bit 3 to bit 4 for the first instruction and adds 16 when
         * reg 2 == reg 3 */

        unsigned ctrl;
        bool r2_equals_r3 = false;

        if (regs.first_instruction) {
                /* Bit 3 implicitly must be clear for first instructions.
                 * The affected patterns all write both ADD/FMA, but that
                 * is forbidden for the first instruction, so this does
                 * not add additional encoding constraints */
                assert(!(mode & 0x8));

                /* Move bit 4 to bit 3, since bit 3 is clear */
                ctrl = (mode & 0x7) | ((mode & 0x10) >> 1);

                /* If r2 and r3 are permitted to be equal, we must force them
                 * equal, or the hardware raises INSTR_INVALID_ENC (it's
                 * unclear why). */
                if (!(regs.slot23.slot2 && regs.slot23.slot3))
                        r2_equals_r3 = true;
        } else {
                /* We force r2=r3 or not for the upper bit */
                ctrl = (mode & 0xF);
                r2_equals_r3 = (mode & 0x10);
        }

        if (regs.enabled[1]) {
                /* Gotta save that bit!~ Required by the 63-x trick */
                assert(regs.slot[1] > regs.slot[0]);
                assert(regs.enabled[0]);

                /* Do the 63-x trick, see docs/disasm */
                if (regs.slot[0] > 31) {
                        regs.slot[0] = 63 - regs.slot[0];
                        regs.slot[1] = 63 - regs.slot[1];
                }

                assert(regs.slot[0] <= 31);
                assert(regs.slot[1] <= 63);

                s.ctrl = ctrl;
                s.reg1 = regs.slot[1];
                s.reg0 = regs.slot[0];
        } else {
                /* Slot 1 disabled, so set to zero and use slot 1 for ctrl */
                s.ctrl = 0;
                s.reg1 = ctrl << 2;

                if (regs.enabled[0]) {
                        /* Bit 0 upper bit of slot 0 */
                        s.reg1 |= (regs.slot[0] >> 5);

                        /* Rest of slot 0 in usual spot */
                        s.reg0 = (regs.slot[0] & 0b11111);
                } else {
                        /* Bit 1 set if slot 0 also disabled */
                        s.reg1 |= (1 << 1);
                }
        }

        /* Force r2 =/!= r3 as needed */
        if (r2_equals_r3) {
                assert(regs.slot[3] == regs.slot[2] || !(regs.slot23.slot2 && regs.slot23.slot3));

                if (regs.slot23.slot2)
                        regs.slot[3] = regs.slot[2];
                else
                        regs.slot[2] = regs.slot[3];
        } else if (!regs.first_instruction) {
                /* Enforced by the encoding anyway */
                assert(regs.slot[2] != regs.slot[3]);
        }

        s.reg2 = regs.slot[2];
        s.reg3 = regs.slot[3];
        s.fau_idx = regs.fau_idx;

        memcpy(&packed, &s, sizeof(s));

        return packed;
}
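/* Worked example of the 63-x trick (illustrative, not from the original
 * source): for reads of r45 and r60, slot[0] = 45 exceeds the 5-bit reg0
 * field, so both slots are rewritten as 63 - x, giving reg0 = 18 and
 * reg1 = 3. Because the slots are kept sorted (slot[1] > slot[0]) before
 * the flip, observing reg1 < reg0 in the encoding presumably lets the
 * decoder undo the transformation; see docs/disasm for the authoritative
 * description. */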
static unsigned
bi_pack_fma_special(bi_clause *clause, bi_instruction *ins, bi_registers *regs)
{
        switch (ins->op.special) {
        case BI_SPECIAL_CUBEFACE1:
                return pan_pack_fma_cubeface1(clause, ins, regs);
        default:
                unreachable("Unknown special op");
        }
}

#define BI_PACK_SHIFT(name) \
static unsigned \
bi_pack_fma_ ## name(bi_clause *clause, bi_instruction *ins, bi_registers *regs) \
{ \
        switch (nir_alu_type_get_type_size(ins->dest_type)) { \
        case 32: \
                return pan_pack_fma_ ## name ## _i32(clause, ins, regs); \
        case 16: \
                return pan_pack_fma_ ## name ## _v2i16(clause, ins, regs); \
        case 8: \
                return pan_pack_fma_ ## name ## _v4i8(clause, ins, regs); \
        default: \
                unreachable("Invalid dest size"); \
        } \
}

BI_PACK_SHIFT(rshift_and)
BI_PACK_SHIFT(lshift_and)
BI_PACK_SHIFT(rshift_or)
BI_PACK_SHIFT(lshift_or)
BI_PACK_SHIFT(rshift_xor)
BI_PACK_SHIFT(lshift_xor)
BI_PACK_SHIFT(arshift)

static unsigned
bi_pack_fma_bitwise(bi_clause *clause, bi_instruction *ins, bi_registers *regs)
{
        switch (ins->op.bitwise) {
        case BI_BITWISE_AND:
                return ins->bitwise.rshift ?
                        bi_pack_fma_rshift_and(clause, ins, regs) :
                        bi_pack_fma_lshift_and(clause, ins, regs);
        case BI_BITWISE_OR:
                return ins->bitwise.rshift ?
                        bi_pack_fma_rshift_or(clause, ins, regs) :
                        bi_pack_fma_lshift_or(clause, ins, regs);
        case BI_BITWISE_XOR:
                return ins->bitwise.rshift ?
                        bi_pack_fma_rshift_xor(clause, ins, regs) :
                        bi_pack_fma_lshift_xor(clause, ins, regs);
        case BI_BITWISE_ARSHIFT:
                assert(ins->bitwise.rshift);
                return bi_pack_fma_arshift(clause, ins, regs);
        default:
                unreachable("Invalid bitwise op");
        }
}
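/* For reference (illustrative): BI_PACK_SHIFT(rshift_and) above expands to
 * a dispatcher bi_pack_fma_rshift_and() selecting between
 * pan_pack_fma_rshift_and_i32, _v2i16 and _v4i8 based on destination size,
 * so each shift-combined bitwise op gets one generated wrapper instead of a
 * hand-written switch. */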
static unsigned
bi_pack_fma(bi_clause *clause, bi_bundle bundle, bi_registers *regs)
{
        if (!bundle.fma)
                return pan_pack_fma_nop_i32(clause, NULL, regs);

        bool f16 = bundle.fma->dest_type == nir_type_float16;
        bool f32 = bundle.fma->dest_type == nir_type_float32;
        bool u32 = bundle.fma->dest_type == nir_type_uint32 ||
                bundle.fma->dest_type == nir_type_bool32;
        bool u16 = bundle.fma->dest_type == nir_type_uint16;
        bool s32 = bundle.fma->dest_type == nir_type_int32;
        bool s16 = bundle.fma->dest_type == nir_type_int16;

        bool src0_f16 = bundle.fma->src_types[0] == nir_type_float16;
        bool src0_f32 = bundle.fma->src_types[0] == nir_type_float32;
        bool src0_u16 = bundle.fma->src_types[0] == nir_type_uint16;
        bool src0_s16 = bundle.fma->src_types[0] == nir_type_int16;
        bool src0_s8 = bundle.fma->src_types[0] == nir_type_int8;
        bool src0_u8 = bundle.fma->src_types[0] == nir_type_uint8;

        enum bi_cond cond = bundle.fma->cond;
        bool typeless_cond = (cond == BI_COND_EQ) || (cond == BI_COND_NE);

        switch (bundle.fma->type) {
        case BI_ADD:
                if (bundle.fma->dest_type == nir_type_float32)
                        return pan_pack_fma_fadd_f32(clause, bundle.fma, regs);
                else if (bundle.fma->dest_type == nir_type_float16)
                        return pan_pack_fma_fadd_v2f16(clause, bundle.fma, regs);

                unreachable("TODO");
        case BI_CMP:
                assert(src0_f16 || src0_f32);

                if (src0_f32)
                        return pan_pack_fma_fcmp_f32(clause, bundle.fma, regs);
                else
                        return pan_pack_fma_fcmp_v2f16(clause, bundle.fma, regs);
        case BI_BITWISE:
                return bi_pack_fma_bitwise(clause, bundle.fma, regs);
        case BI_CONVERT:
                if (src0_s8) {
                        assert(s32);
                        return pan_pack_fma_s8_to_s32(clause, bundle.fma, regs);
                } else if (src0_u8) {
                        assert(u32);
                        return pan_pack_fma_u8_to_u32(clause, bundle.fma, regs);
                } else if (src0_s16) {
                        assert(s32);
                        return pan_pack_fma_s16_to_s32(clause, bundle.fma, regs);
                } else if (src0_u16) {
                        assert(u32);
                        return pan_pack_fma_u16_to_u32(clause, bundle.fma, regs);
                } else if (src0_f16) {
                        assert(f32);
                        return pan_pack_fma_f16_to_f32(clause, bundle.fma, regs);
                } else if (src0_f32) {
                        assert(f16);
                        return pan_pack_fma_v2f32_to_v2f16(clause, bundle.fma, regs);
                }

                unreachable("Invalid FMA convert");
        case BI_CSEL:
                if (f32)
                        return pan_pack_fma_csel_f32(clause, bundle.fma, regs);
                else if (f16)
                        return pan_pack_fma_csel_v2f16(clause, bundle.fma, regs);
                else if ((u32 || s32) && typeless_cond)
                        return pan_pack_fma_csel_i32(clause, bundle.fma, regs);
                else if ((u16 || s16) && typeless_cond)
                        return pan_pack_fma_csel_v2i16(clause, bundle.fma, regs);
                else if (u32)
                        return pan_pack_fma_csel_u32(clause, bundle.fma, regs);
                else if (u16)
                        return pan_pack_fma_csel_v2u16(clause, bundle.fma, regs);
                else if (s32)
                        return pan_pack_fma_csel_s32(clause, bundle.fma, regs);
                else if (s16)
                        return pan_pack_fma_csel_v2s16(clause, bundle.fma, regs);
                else
                        unreachable("Invalid csel type");
        case BI_FMA:
                if (bundle.fma->dest_type == nir_type_float32) {
                        if (bundle.fma->op.mscale)
                                return pan_pack_fma_fma_rscale_f32(clause, bundle.fma, regs);
                        else
                                return pan_pack_fma_fma_f32(clause, bundle.fma, regs);
                } else {
                        assert(bundle.fma->dest_type == nir_type_float16);

                        if (bundle.fma->op.mscale)
                                return pan_pack_fma_fma_rscale_v2f16(clause, bundle.fma, regs);
                        else
                                return pan_pack_fma_fma_v2f16(clause, bundle.fma, regs);
                }
        case BI_FREXP:
                assert(src0_f32 || src0_f16);

                if (src0_f32)
                        return pan_pack_fma_frexpe_f32(clause, bundle.fma, regs);
                else
                        return pan_pack_fma_frexpe_v2f16(clause, bundle.fma, regs);
        case BI_IMATH:
                /* XXX: Only 32-bit, with carries/borrows forced */
                assert(s32 || u32);

                if (bundle.fma->op.imath == BI_IMATH_ADD)
                        return pan_pack_fma_iaddc_i32(clause, bundle.fma, regs);
                else
                        return pan_pack_fma_isubb_i32(clause, bundle.fma, regs);
        case BI_MOV:
                return pan_pack_fma_mov_i32(clause, bundle.fma, regs);
        case BI_SELECT:
                if (nir_alu_type_get_type_size(bundle.fma->src_types[0]) == 16) {
                        return pan_pack_fma_mkvec_v2i16(clause, bundle.fma, regs);
                } else {
                        assert(nir_alu_type_get_type_size(bundle.fma->src_types[0]) == 8);
                        return pan_pack_fma_mkvec_v4i8(clause, bundle.fma, regs);
                }
        case BI_ROUND:
                assert(f16 || f32);

                if (f16)
                        return pan_pack_fma_fround_v2f16(clause, bundle.fma, regs);
                else
                        return pan_pack_fma_fround_f32(clause, bundle.fma, regs);
        case BI_REDUCE_FMA:
                assert(src0_f32 && f32);
                return pan_pack_fma_fadd_lscale_f32(clause, bundle.fma, regs);
        case BI_IMUL:
                return pan_pack_fma_imul_i32(clause, bundle.fma, regs);
        case BI_SPECIAL_FMA:
                return bi_pack_fma_special(clause, bundle.fma, regs);
        default:
                unreachable("Cannot encode class as FMA");
        }
}
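/* Note (illustrative): in the unconditional case below, the 4-bit BR_ALWAYS
 * encoding appears to be split across two struct fields, the low three bits
 * going to .cond and the remaining bit to .size, which is why the packing
 * writes (BR_ALWAYS & 0x7) and (BR_ALWAYS >> 3) rather than a single
 * field. */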
static unsigned
bi_pack_add_branch_cond(bi_instruction *ins, bi_registers *regs)
{
        assert(ins->cond == BI_COND_EQ);
        assert(ins->src[1] == BIR_INDEX_ZERO);

        unsigned zero_ctrl = 0;
        unsigned size = nir_alu_type_get_type_size(ins->src_types[0]);

        if (size == 16) {
                /* See BR_SIZE_ZERO swizzle disassembly */
                zero_ctrl = ins->swizzle[0][0] ? 1 : 2;
        } else {
                assert(size == 32);
        }

        /* EQ swap to NE */
        bool slot_swapped = false;

        struct bifrost_branch pack = {
                .src0 = bi_get_src(ins, regs, 0),
                .src1 = (zero_ctrl << 1) | !slot_swapped,
                .cond = BR_COND_EQ,
                .size = BR_SIZE_ZERO,
                .op = BIFROST_ADD_OP_BRANCH
        };

        if (ins->branch_target) {
                /* We assigned the constant slot to fetch the branch offset so
                 * we can just passthrough here. We put in the HI slot to match
                 * the blob since that's where the magic flags end up */
                assert(!ins->src[2]);
                pack.src2 = BIFROST_SRC_FAU_HI;
        } else {
                pack.src2 = bi_get_src(ins, regs, 2);
        }

        RETURN_PACKED(pack);
}

static unsigned
bi_pack_add_branch_uncond(bi_instruction *ins, bi_registers *regs)
{
        struct bifrost_branch pack = {
                /* It's unclear what these bits actually mean */
                .src0 = BIFROST_SRC_FAU_LO,
                .src1 = BIFROST_SRC_PASS_FMA,

                /* All ones in fact */
                .cond = (BR_ALWAYS & 0x7),
                .size = (BR_ALWAYS >> 3),
                .op = BIFROST_ADD_OP_BRANCH
        };

        if (ins->branch_target) {
                /* Offset is passed as a PC-relative offset through an
                 * embedded constant. */
                assert(!ins->src[2]);
                pack.src2 = BIFROST_SRC_FAU_HI;
        } else {
                pack.src2 = bi_get_src(ins, regs, 2);
        }

        RETURN_PACKED(pack);
}

static unsigned
bi_pack_add_branch(bi_instruction *ins, bi_registers *regs)
{
        if (ins->cond == BI_COND_ALWAYS)
                return bi_pack_add_branch_uncond(ins, regs);
        else
                return bi_pack_add_branch_cond(ins, regs);
}

static unsigned
bi_pack_add_special(bi_clause *clause, bi_instruction *ins, bi_registers *regs)
{
        bool f16 = ins->dest_type == nir_type_float16;

        switch (ins->op.special) {
        case BI_SPECIAL_FRCP:
                return f16 ?
                        pan_pack_add_frcp_f16(clause, ins, regs) :
                        pan_pack_add_frcp_f32(clause, ins, regs);
        case BI_SPECIAL_FRSQ:
                return f16 ?
                        pan_pack_add_frsq_f16(clause, ins, regs) :
                        pan_pack_add_frsq_f32(clause, ins, regs);
        case BI_SPECIAL_EXP2_LOW:
                assert(!f16);
                return pan_pack_add_fexp_f32(clause, ins, regs);
        case BI_SPECIAL_IABS:
                assert(ins->src_types[0] == nir_type_int32);
                return pan_pack_add_iabs_s32(clause, ins, regs);
        case BI_SPECIAL_CUBEFACE2:
                return pan_pack_add_cubeface2(clause, ins, regs);
        case BI_SPECIAL_CUBE_SSEL:
                return pan_pack_add_cube_ssel(clause, ins, regs);
        case BI_SPECIAL_CUBE_TSEL:
                return pan_pack_add_cube_tsel(clause, ins, regs);
        default:
                unreachable("Unknown special op");
        }
}
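/* Example of the dispatch below (illustrative): a BI_ADD whose dest_type is
 * nir_type_float16 is routed to pan_pack_add_fadd_v2f16(), since 16-bit ops
 * operate on two-wide vectors; the f16/f32/u32/... booleans computed up
 * front just cache these nir_alu_type comparisons for the big switch. */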
static unsigned
bi_pack_add(bi_clause *clause, bi_bundle bundle, bi_registers *regs, gl_shader_stage stage)
{
        if (!bundle.add)
                return pan_pack_add_nop_i32(clause, NULL, regs);

        bool f16 = bundle.add->dest_type == nir_type_float16;
        bool f32 = bundle.add->dest_type == nir_type_float32;
        bool u32 = bundle.add->dest_type == nir_type_uint32 ||
                bundle.add->dest_type == nir_type_bool32;
        bool u16 = bundle.add->dest_type == nir_type_uint16;
        bool s32 = bundle.add->dest_type == nir_type_int32;
        bool s16 = bundle.add->dest_type == nir_type_int16;

        bool src0_f16 = bundle.add->src_types[0] == nir_type_float16;
        bool src0_f32 = bundle.add->src_types[0] == nir_type_float32;
        bool src0_u32 = bundle.add->src_types[0] == nir_type_uint32;
        bool src0_u16 = bundle.add->src_types[0] == nir_type_uint16;
        bool src0_u8 = bundle.add->src_types[0] == nir_type_uint8;
        bool src0_s32 = bundle.add->src_types[0] == nir_type_int32;
        bool src0_s16 = bundle.add->src_types[0] == nir_type_int16;
        bool src0_s8 = bundle.add->src_types[0] == nir_type_int8;

        unsigned sz = nir_alu_type_get_type_size(bundle.add->dest_type);
        enum bi_cond cond = bundle.add->cond;
        bool typeless_cond = (cond == BI_COND_EQ) || (cond == BI_COND_NE);

        switch (bundle.add->type) {
        case BI_ADD:
                if (bundle.add->dest_type == nir_type_float32)
                        return pan_pack_add_fadd_f32(clause, bundle.add, regs);
                else if (bundle.add->dest_type == nir_type_float16)
                        return pan_pack_add_fadd_v2f16(clause, bundle.add, regs);

                unreachable("TODO");
        case BI_ATEST:
                return pan_pack_add_atest(clause, bundle.add, regs);
        case BI_BRANCH:
                return bi_pack_add_branch(bundle.add, regs);
        case BI_CMP:
                if (src0_f32)
                        return pan_pack_add_fcmp_f32(clause, bundle.add, regs);
                else if (src0_f16)
                        return pan_pack_add_fcmp_v2f16(clause, bundle.add, regs);
                else if ((src0_u32 || src0_s32) && typeless_cond)
                        return pan_pack_add_icmp_i32(clause, bundle.add, regs);
                else if ((src0_u16 || src0_s16) && typeless_cond)
                        return pan_pack_add_icmp_v2i16(clause, bundle.add, regs);
                else if ((src0_u8 || src0_s8) && typeless_cond)
                        return pan_pack_add_icmp_v4i8(clause, bundle.add, regs);
                else if (src0_u32)
                        return pan_pack_add_icmp_u32(clause, bundle.add, regs);
                else if (src0_u16)
                        return pan_pack_add_icmp_v2u16(clause, bundle.add, regs);
                else if (src0_u8)
                        return pan_pack_add_icmp_v4u8(clause, bundle.add, regs);
                else if (src0_s32)
                        return pan_pack_add_icmp_s32(clause, bundle.add, regs);
                else if (src0_s16)
                        return pan_pack_add_icmp_v2s16(clause, bundle.add, regs);
                else if (src0_s8)
                        return pan_pack_add_icmp_v4s8(clause, bundle.add, regs);
                else
                        unreachable("Invalid cmp type");
        case BI_BLEND:
                return pan_pack_add_blend(clause, bundle.add, regs);
        case BI_BITWISE:
                unreachable("Packing todo");
        case BI_CONVERT:
                if (src0_f16 && s16)
                        return pan_pack_add_v2f16_to_v2s16(clause, bundle.add, regs);
                else if (src0_f16 && u16)
                        return pan_pack_add_v2f16_to_v2u16(clause, bundle.add, regs);
                else if (src0_f16 && s32)
                        return pan_pack_add_f16_to_s32(clause, bundle.add, regs);
                else if (src0_f16 && u32)
                        return pan_pack_add_f16_to_u32(clause, bundle.add, regs);
                else if (src0_s16 && f16)
                        return pan_pack_add_v2s16_to_v2f16(clause, bundle.add, regs);
                else if (src0_u16 && f16)
                        return pan_pack_add_v2u16_to_v2f16(clause, bundle.add, regs);
                else if (src0_s8 && s16)
                        return pan_pack_add_v2s8_to_v2s16(clause, bundle.add, regs);
                else if (src0_u8 && u16)
                        return pan_pack_add_v2u8_to_v2u16(clause, bundle.add, regs);
                else if (src0_s8 && f16)
                        return pan_pack_add_v2s8_to_v2f16(clause, bundle.add, regs);
                else if (src0_u8 && f16)
                        return pan_pack_add_v2u8_to_v2f16(clause, bundle.add, regs);
                else if (src0_f32 && s32)
                        return pan_pack_add_f32_to_s32(clause, bundle.add, regs);
                else if (src0_f32 && u32)
                        return pan_pack_add_f32_to_u32(clause, bundle.add, regs);
                else if (src0_s8 && s32)
                        return pan_pack_add_s8_to_s32(clause, bundle.add, regs);
                else if (src0_u8 && u32)
                        return pan_pack_add_u8_to_u32(clause, bundle.add, regs);
                else if (src0_s8 && f32)
                        return pan_pack_add_s8_to_f32(clause, bundle.add, regs);
                else if (src0_u8 && f32)
                        return pan_pack_add_u8_to_f32(clause, bundle.add, regs);
                else if (src0_s32 && f32)
                        return pan_pack_add_s32_to_f32(clause, bundle.add, regs);
                else if (src0_u32 && f32)
                        return pan_pack_add_u32_to_f32(clause, bundle.add, regs);
                else if (src0_s16 && s32)
                        return pan_pack_add_s16_to_s32(clause, bundle.add, regs);
                else if (src0_u16 && u32)
                        return pan_pack_add_u16_to_u32(clause, bundle.add, regs);
                else if (src0_s16 && f32)
                        return pan_pack_add_s16_to_f32(clause, bundle.add, regs);
                else if (src0_u16 && f32)
                        return pan_pack_add_u16_to_f32(clause, bundle.add, regs);
                else if (src0_f16 && f32)
                        return pan_pack_add_f16_to_f32(clause, bundle.add, regs);
                else if (src0_f32 && f16)
                        return pan_pack_add_v2f32_to_v2f16(clause, bundle.add, regs);
                else
                        unreachable("Invalid ADD convert");
        case BI_DISCARD:
                return pan_pack_add_discard_f32(clause, bundle.add, regs);
        case BI_FREXP:
                unreachable("Packing todo");
        case BI_IMATH:
                assert(sz == 8 || sz == 16 || sz == 32);

                if (bundle.add->op.imath == BI_IMATH_ADD) {
                        return (sz == 8) ? pan_pack_add_iadd_v4s8(clause, bundle.add, regs) :
                               (sz == 16) ? pan_pack_add_iadd_v2s16(clause, bundle.add, regs) :
                               pan_pack_add_iadd_s32(clause, bundle.add, regs);
                } else {
                        return (sz == 8) ? pan_pack_add_isub_v4s8(clause, bundle.add, regs) :
                               (sz == 16) ? pan_pack_add_isub_v2s16(clause, bundle.add, regs) :
                               pan_pack_add_isub_s32(clause, bundle.add, regs);
                }
        case BI_LOAD_ATTR:
                return pan_pack_add_ld_attr_imm(clause, bundle.add, regs);
        case BI_LOAD:
        case BI_LOAD_UNIFORM:
                assert(u32 || s32 || f32);

                switch (bundle.add->vector_channels) {
                case 1: return pan_pack_add_load_i32(clause, bundle.add, regs);
                case 2: return pan_pack_add_load_i64(clause, bundle.add, regs);
                case 3: return pan_pack_add_load_i96(clause, bundle.add, regs);
                case 4: return pan_pack_add_load_i128(clause, bundle.add, regs);
                default: unreachable("Invalid channel count");
                }
        case BI_LOAD_VAR:
                if (bundle.add->src[0] & BIR_INDEX_CONSTANT) {
                        if (bi_get_immediate(bundle.add, 0) >= 20)
                                return pan_pack_add_ld_var_special(clause, bundle.add, regs);
                        else if (bundle.add->load_vary.flat)
                                return pan_pack_add_ld_var_flat_imm(clause, bundle.add, regs);
                        else
                                return pan_pack_add_ld_var_imm(clause, bundle.add, regs);
                } else {
                        if (bundle.add->load_vary.flat)
                                return pan_pack_add_ld_var_flat(clause, bundle.add, regs);
                        else
                                return pan_pack_add_ld_var(clause, bundle.add, regs);
                }
        case BI_LOAD_VAR_ADDRESS:
                return pan_pack_add_lea_attr_imm(clause, bundle.add, regs);
        case BI_LOAD_TILE:
                return pan_pack_add_ld_tile(clause, bundle.add, regs);
        case BI_MINMAX:
                if (bundle.add->op.minmax == BI_MINMAX_MIN) {
                        if (bundle.add->dest_type == nir_type_float32)
                                return pan_pack_add_fmin_f32(clause, bundle.add, regs);
                        else if (bundle.add->dest_type == nir_type_float16)
                                return pan_pack_add_fmin_v2f16(clause, bundle.add, regs);

                        unreachable("TODO");
                } else {
                        if (bundle.add->dest_type == nir_type_float32)
                                return pan_pack_add_fmax_f32(clause, bundle.add, regs);
                        else if (bundle.add->dest_type == nir_type_float16)
                                return pan_pack_add_fmax_v2f16(clause, bundle.add, regs);

                        unreachable("TODO");
                }
        case BI_MOV:
                unreachable("Packing todo");
        case BI_STORE:
                assert(src0_u32 || src0_s32 || src0_f32);

                switch (bundle.add->vector_channels) {
                case 1: return pan_pack_add_store_i32(clause, bundle.add, regs);
                case 2: return pan_pack_add_store_i64(clause, bundle.add, regs);
                case 3: return pan_pack_add_store_i96(clause, bundle.add, regs);
                case 4: return pan_pack_add_store_i128(clause, bundle.add, regs);
                default: unreachable("Invalid channel count");
                }
        case BI_STORE_VAR:
                return pan_pack_add_st_cvt(clause, bundle.add, regs);
        case BI_SPECIAL_ADD:
                return bi_pack_add_special(clause, bundle.add, regs);
        case BI_TABLE:
                assert(bundle.add->dest_type == nir_type_float32);
                return pan_pack_add_flogd_f32(clause, bundle.add, regs);
        case BI_SELECT:
                assert(nir_alu_type_get_type_size(bundle.add->src_types[0]) == 16);
                return pan_pack_add_mkvec_v2i16(clause, bundle.add, regs);
        case BI_TEXC:
                return pan_pack_add_texc(clause, bundle.add, regs);
        case BI_TEXC_DUAL:
                unreachable("Packing todo");
        case BI_TEXS:
                assert(f16 || f32);

                if (f16)
                        return pan_pack_add_texs_2d_f16(clause, bundle.add, regs);
                else
                        return pan_pack_add_texs_2d_f32(clause, bundle.add, regs);
        case BI_ROUND:
                unreachable("Packing todo");
        case BI_ZS_EMIT:
                return pan_pack_add_zs_emit(clause, bundle.add, regs);
        default:
                unreachable("Cannot encode class as ADD");
        }
}

struct bi_packed_bundle {
        uint64_t lo;
        uint64_t hi;
};

/* We must ensure slot 1 > slot 0 for the 63-x trick to function, so we fix
 * this up at pack time. (Scheduling doesn't care.) */

static void
bi_flip_slots(bi_registers *regs)
{
        if (regs->enabled[0] && regs->enabled[1] && regs->slot[1] < regs->slot[0]) {
                unsigned temp = regs->slot[0];
                regs->slot[0] = regs->slot[1];
                regs->slot[1] = temp;
        }
}
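/* For instance (illustrative): if RA produced reads of r7 in slot 0 and r2
 * in slot 1, bi_flip_slots above swaps them to (2, 7) so the
 * slot[1] > slot[0] invariant asserted in bi_pack_registers holds. */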
/* Lower CUBEFACE2 to a CUBEFACE1/CUBEFACE2 pair. This is a hack so the
 * scheduler doesn't have to worry about this while we're just packing
 * singletons */

static void
bi_lower_cubeface2(bi_context *ctx, bi_bundle *bundle)
{
        /* Filter for +CUBEFACE2 */
        if (!bundle->add || bundle->add->type != BI_SPECIAL_ADD ||
            bundle->add->op.special != BI_SPECIAL_CUBEFACE2) {
                return;
        }

        /* This won't be used once we emit non-singletons, for now this is just
         * a fact of our scheduler and allows us to clobber FMA */
        assert(!bundle->fma);

        /* Construct an FMA op */
        bi_instruction cubeface1 = {
                .type = BI_SPECIAL_FMA,
                .op.special = BI_SPECIAL_CUBEFACE1,

                /* no dest; the result is passed through to the CUBEFACE2 */
                .dest_type = nir_type_float32,
                .src_types = { nir_type_float32, nir_type_float32, nir_type_float32 },
        };

        /* Copy over the register-allocated sources (coordinates). */
        memcpy(&cubeface1.src, bundle->add->src, sizeof(cubeface1.src));

        /* Zeroed by RA since this is all 32-bit */
        for (unsigned i = 0; i < 3; ++i)
                assert(bundle->add->swizzle[i][0] == 0);

        /* Emit the instruction */
        bundle->fma = bi_emit_before(ctx, bundle->add, cubeface1);

        /* Now replace the sources of the CUBEFACE2 with a single passthrough
         * from the CUBEFACE1 (and a side-channel) */
        bundle->add->src[0] = BIR_INDEX_PASS | BIFROST_SRC_STAGE;
        bundle->add->src[1] = bundle->add->src[2] = 0;
}

static struct bi_packed_bundle
bi_pack_bundle(bi_clause *clause, bi_bundle bundle, bi_bundle prev, bool first_bundle, gl_shader_stage stage)
{
        bi_assign_slots(&bundle, &prev);
        bi_assign_fau_idx(clause, &bundle);
        bundle.regs.first_instruction = first_bundle;

        bi_flip_slots(&bundle.regs);

        uint64_t reg = bi_pack_registers(bundle.regs);
        uint64_t fma = bi_pack_fma(clause, bundle, &bundle.regs);
        uint64_t add = bi_pack_add(clause, bundle, &bundle.regs, stage);

        struct bi_packed_bundle packed = {
                .lo = reg | (fma << 35) | ((add & 0b111111) << 58),
                .hi = add >> 6
        };

        return packed;
}
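/* Layout recap (illustrative, derived from the shifts above): a packed
 * bundle spans 78 bits before clause framing. Bits 0-34 hold the
 * register/control block, bits 35-57 the 23-bit FMA encoding, and bits
 * 58-63 the low six bits of the ADD encoding; the remaining 14 ADD bits
 * spill into .hi, which the clause format then splits into ins_2/ins_0
 * (see bi_pack_clause). */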
/* Packs the next two constants as a dedicated constant quadword at the end of
 * the clause, returning the number packed. There are two cases to consider:
 *
 * Case #1: Branching is not used. For a single constant, copy the upper
 * nibble over, easy.
 *
 * Case #2: Branching is used. For a single constant, it suffices to set the
 * upper nibble to 4 and leave the latter constant 0, which matches what the
 * blob does.
 *
 * Extending to multiple constants is considerably more tricky and left for
 * future work. */

static unsigned
bi_pack_constants(bi_context *ctx, bi_clause *clause,
                  unsigned index,
                  struct util_dynarray *emission)
{
        /* After these two, are we done? Determines tag */
        bool done = clause->constant_count <= (index + 2);
        ASSERTED bool only = clause->constant_count <= (index + 1);

        /* Is the constant we're packing for a branch? */
        bool branches = clause->branch_constant && done;

        /* TODO: Pos */
        assert(index == 0 && clause->bundle_count == 1);
        assert(only);

        /* Compute branch offset instead of a dummy 0 */
        if (branches) {
                bi_instruction *br = clause->bundles[clause->bundle_count - 1].add;
                assert(br && br->type == BI_BRANCH && br->branch_target);

                /* Put it in the high place */
                int32_t qwords = bi_block_offset(ctx, clause, br->branch_target);
                int32_t bytes = qwords * 16;

                /* Copy so we get proper sign behaviour */
                uint32_t raw = 0;
                memcpy(&raw, &bytes, sizeof(raw));

                /* Clear off top bits for the magic bits */
                raw &= ~0xF0000000;

                /* Put in top 32-bits */
                clause->constants[index + 0] = ((uint64_t) raw) << 32ull;
        }

        uint64_t hi = clause->constants[index + 0] >> 60ull;

        struct bifrost_fmt_constant quad = {
                .pos = 0, /* TODO */
                .tag = done ? BIFROST_FMTC_FINAL : BIFROST_FMTC_CONSTANTS,
                .imm_1 = clause->constants[index + 0] >> 4,
                .imm_2 = ((hi < 8) ? (hi << 60ull) : 0) >> 4,
        };

        if (branches) {
                /* Branch offsets are less than 60-bits so this should work at
                 * least for now */
                quad.imm_1 |= (4ull << 60ull) >> 4;
                assert(hi == 0);
        }

        /* XXX: On G71, Connor observed that the difference of the top 4 bits
         * of the second constant with the first must be less than 8, otherwise
         * we have to swap them. On G52, I'm able to reproduce a similar issue
         * but with a different workaround (modeled above with a single
         * constant; it's unclear how to work around the multiple-constant
         * case). Further investigation needed. Possibly an errata. XXX */

        util_dynarray_append(emission, struct bifrost_fmt_constant, quad);

        return 2;
}

static void
bi_pack_clause(bi_context *ctx, bi_clause *clause,
               bi_clause *next_1, bi_clause *next_2,
               struct util_dynarray *emission, gl_shader_stage stage,
               bool tdd)
{
        /* Lower CUBEFACE2 to a CUBEFACE1/CUBEFACE2 pair before packing */
        bi_lower_cubeface2(ctx, &clause->bundles[0]);

        struct bi_packed_bundle ins_1 = bi_pack_bundle(clause, clause->bundles[0], clause->bundles[0], true, stage);
        assert(clause->bundle_count == 1);

        /* State for packing constants throughout */
        unsigned constant_index = 0;

        struct bifrost_fmt1 quad_1 = {
                .tag = clause->constant_count ? BIFROST_FMT1_CONSTANTS : BIFROST_FMT1_FINAL,
                .header = bi_pack_header(clause, next_1, next_2, tdd),
                .ins_1 = ins_1.lo,
                .ins_2 = ins_1.hi & ((1 << 11) - 1),
                .ins_0 = (ins_1.hi >> 11) & 0b111,
        };

        util_dynarray_append(emission, struct bifrost_fmt1, quad_1);

        /* Pack the remaining constants */

        while (constant_index < clause->constant_count) {
                constant_index += bi_pack_constants(ctx, clause,
                                                    constant_index, emission);
        }
}

static bi_clause *
bi_next_clause(bi_context *ctx, pan_block *block, bi_clause *clause)
{
        /* Try the first clause in this block if we're starting from scratch */
        if (!clause && !list_is_empty(&((bi_block *) block)->clauses))
                return list_first_entry(&((bi_block *) block)->clauses, bi_clause, link);

        /* Try the next clause in this block */
        if (clause && clause->link.next != &((bi_block *) block)->clauses)
                return list_first_entry(&(clause->link), bi_clause, link);

        /* Try the next block, or the one after that if it's empty, etc. */
        pan_block *next_block = pan_next_block(block);

        bi_foreach_block_from(ctx, next_block, block) {
                bi_block *blk = (bi_block *) block;

                if (!list_is_empty(&blk->clauses))
                        return list_first_entry(&(blk->clauses), bi_clause, link);
        }

        return NULL;
}

/* We should terminate discarded threads if there may be discarded threads (a
 * fragment shader) and helper invocations are not used. Further logic may be
 * required for future discard/demote differentiation */

static bool
bi_terminate_discarded_threads(bi_context *ctx)
{
        if (ctx->stage == MESA_SHADER_FRAGMENT)
                return !ctx->nir->info.fs.needs_helper_invocations;
        else
                return false;
}
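/* Note (illustrative): the return offsets recorded below are byte positions
 * in the emission stream, indexed by render target; presumably the driver
 * uses them to resume the fragment shader after a blend shader returns. The
 * final assert checks that clause boundaries keep these offsets 8-byte
 * aligned. */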
static void
bi_collect_blend_ret_addr(bi_context *ctx, struct util_dynarray *emission,
                          const bi_clause *clause)
{
        /* No need to collect return addresses when we're in a blend shader. */
        if (ctx->is_blend)
                return;

        const bi_bundle *bundle = &clause->bundles[clause->bundle_count - 1];
        const bi_instruction *ins = bundle->add;

        if (!ins || ins->type != BI_BLEND)
                return;

        /* We don't support non-terminal blend instructions yet.
         * That would require fixing blend shaders to restore the registers
         * they use before jumping back to the fragment shader, which is
         * currently not supported.
         */
        assert(0);

        assert(ins->blend_location < ARRAY_SIZE(ctx->blend_ret_offsets));
        assert(!ctx->blend_ret_offsets[ins->blend_location]);
        ctx->blend_ret_offsets[ins->blend_location] =
                util_dynarray_num_elements(emission, uint8_t);
        assert(!(ctx->blend_ret_offsets[ins->blend_location] & 0x7));
}

void
bi_pack(bi_context *ctx, struct util_dynarray *emission)
{
        bool tdd = bi_terminate_discarded_threads(ctx);

        bi_foreach_block(ctx, _block) {
                bi_block *block = (bi_block *) _block;

                /* Pass through the first clause of where we're branching to
                 * for the last clause of the block (the clause with the
                 * branch) */

                bi_clause *succ_clause = block->base.successors[1] ?
                        bi_next_clause(ctx, block->base.successors[0], NULL) : NULL;

                bi_foreach_clause_in_block(block, clause) {
                        bool is_last = clause->link.next == &block->clauses;

                        bi_clause *next = bi_next_clause(ctx, _block, clause);
                        bi_clause *next_2 = is_last ? succ_clause : NULL;

                        bi_pack_clause(ctx, clause, next, next_2,
                                       emission, ctx->stage, tdd);

                        if (!is_last)
                                bi_collect_blend_ret_addr(ctx, emission, clause);
                }
        }
}