/*
 * Copyright (C) 2022 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#pragma once

#if !defined(PAN_ARCH) || PAN_ARCH < 10
#error "cs_builder.h requires PAN_ARCH >= 10"
#endif

#include "gen_macros.h"

#include "util/bitset.h"
#include "util/u_dynarray.h"

/*
 * cs_builder implements a builder for CSF command streams. It manages the
 * allocation and overflow behaviour of queues and provides helpers for
 * emitting commands to run on the CSF pipe.
 *
 * Users are responsible for the CS buffer allocation and must initialize the
 * command stream with an initial buffer using cs_builder_init(). The CS can
 * be extended with new buffers allocated with cs_builder_conf::alloc_buffer()
 * if the builder runs out of memory.
 */

struct cs_buffer {
   /* CPU pointer */
   uint64_t *cpu;

   /* GPU pointer */
   uint64_t gpu;

   /* Capacity in number of 64-bit instructions */
   uint32_t capacity;
};

/**
 * This is used to check that:
 * 1. registers are not used as a source after being loaded without a
 *    WAIT() in the middle
 * 2. registers are not reused (used as a destination) after they served as a
 *    STORE() source without a WAIT() in the middle
 */
struct cs_load_store_tracker {
   BITSET_DECLARE(pending_loads, 256);
   BITSET_DECLARE(pending_stores, 256);
   uint8_t sb_slot;
};

/**
 * This is used to determine which registers have been written to (a.k.a. used
 * as an instruction's destination).
 */
struct cs_dirty_tracker {
   BITSET_DECLARE(regs, 256);
};

enum cs_reg_perm {
   CS_REG_NO_ACCESS = 0,
   CS_REG_RD = BITFIELD_BIT(1),
   CS_REG_WR = BITFIELD_BIT(2),
   CS_REG_RW = CS_REG_RD | CS_REG_WR,
};

struct cs_builder;

typedef enum cs_reg_perm (*reg_perm_cb_t)(struct cs_builder *b, unsigned reg);

struct cs_builder_conf {
   /* Number of 32-bit registers in the hardware register file */
   uint8_t nr_registers;

   /* Number of 32-bit registers used by the kernel at submission time */
   uint8_t nr_kernel_registers;

   /* CS buffer allocator */
   struct cs_buffer (*alloc_buffer)(void *cookie);

   /* Optional load/store tracker. */
   struct cs_load_store_tracker *ls_tracker;

   /* Optional dirty registers tracker. */
   struct cs_dirty_tracker *dirty_tracker;

   /* Optional register access checker. */
   reg_perm_cb_t reg_perm;

   /* Cookie passed back to alloc_buffer() */
   void *cookie;
};

/* The CS is formed of one or more CS chunks linked with JUMP instructions.
 * The builder keeps track of the current chunk and the position inside this
 * chunk, so it can emit new instructions, and decide when a new chunk needs
 * to be allocated.
 */
struct cs_chunk {
   /* CS buffer object backing this chunk */
   struct cs_buffer buffer;

   union {
      /* Current position in the buffer object when the chunk is active. */
      uint32_t pos;

      /* Chunk size when the chunk was wrapped. */
      uint32_t size;
   };
};

/* Monolithic sequence of instructions. Must live in a virtually contiguous
 * portion of code.
 */
struct cs_block {
   /* Used to insert the block in the block stack. */
   struct cs_block *next;
};

#define CS_LABEL_INVALID_POS ~0u

/* Labels can only be used inside a cs_block. They can be defined and
 * referenced before they are set to point to a specific position
 * in the block.
 */
struct cs_label {
   /* The last reference we have seen pointing to this block before
    * it was set. If set to CS_LABEL_INVALID_POS, no forward reference
    * pointing to this label exists.
    */
   uint32_t last_forward_ref;

   /* The label target. If set to CS_LABEL_INVALID_POS, the label has
    * not been set yet.
    */
   uint32_t target;
};

/* CS if/else block. */
struct cs_if_else {
   struct cs_block block;
   struct cs_label end_label;
};

struct cs_builder {
   /* CS builder configuration */
   struct cs_builder_conf conf;

   /* True if an allocation failed, making the whole CS invalid. */
   bool invalid;

   /* Initial (root) CS chunk. */
   struct cs_chunk root_chunk;

   /* Current CS chunk. */
   struct cs_chunk cur_chunk;

   /* Temporary storage for inner blocks that need to be built
    * and copied in one monolithic sequence of instructions with no
    * jump in the middle.
    */
   struct {
      struct cs_block *stack;
      struct util_dynarray instrs;
      struct cs_if_else pending_if;
      unsigned last_load_ip_target;
   } blocks;

   /* Move immediate instruction at the end of the last CS chunk that needs to
    * be patched with the final length of the current CS chunk in order to
    * facilitate correct overflow behaviour.
    */
   uint32_t *length_patch;

   /* Used as temporary storage when the allocator couldn't allocate a new
    * CS chunk.
    */
   uint64_t discard_instr_slot;
};

static inline void
cs_builder_init(struct cs_builder *b, const struct cs_builder_conf *conf,
                struct cs_buffer root_buffer)
{
   *b = (struct cs_builder){
      .conf = *conf,
      .root_chunk.buffer = root_buffer,
      .cur_chunk.buffer = root_buffer,
   };

   /* We need at least 3 registers for CS chunk linking. Assume the kernel
    * needs at least that too.
    */
   b->conf.nr_kernel_registers = MAX2(b->conf.nr_kernel_registers, 3);

   util_dynarray_init(&b->blocks.instrs, NULL);
}

static inline bool
cs_is_valid(struct cs_builder *b)
{
   return !b->invalid;
}

static inline bool
cs_is_empty(struct cs_builder *b)
{
   return b->cur_chunk.pos == 0 &&
          b->root_chunk.buffer.gpu == b->cur_chunk.buffer.gpu;
}

static inline uint64_t
cs_root_chunk_gpu_addr(struct cs_builder *b)
{
   return b->root_chunk.buffer.gpu;
}

static inline uint32_t
cs_root_chunk_size(struct cs_builder *b)
{
   /* Make sure cs_finish() was called. */
   assert(!memcmp(&b->cur_chunk, &(struct cs_chunk){0}, sizeof(b->cur_chunk)));

   return b->root_chunk.size * sizeof(uint64_t);
}

/*
 * Wrap the current queue. External users shouldn't call this function
 * directly, they should call cs_finish() when they are done building
 * the command stream, which will in turn call cs_wrap_chunk().
 *
 * Internally, this is also used to finalize internal CS chunks when
 * allocating new sub-chunks. See cs_reserve_instrs() for details.
 *
 * This notably requires patching the previous chunk with the length
 * we ended up emitting for this chunk.
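 *
 * For context, the expected external flow is roughly the following sketch
 * (illustrative only; my_alloc_buffer(), my_cookie, my_root_buffer and
 * submit() are placeholders for caller-provided allocation hooks, buffers
 * and submission path, and the register counts are made up):
 *
 *    struct cs_builder_conf conf = {
 *       .nr_registers = 96,
 *       .nr_kernel_registers = 4,
 *       .alloc_buffer = my_alloc_buffer,
 *       .cookie = my_cookie,
 *    };
 *    struct cs_builder b;
 *
 *    cs_builder_init(&b, &conf, my_root_buffer);
 *    cs_move32_to(&b, cs_reg32(&b, 0), 42);
 *    cs_finish(&b);
 *
 *    submit(cs_root_chunk_gpu_addr(&b), cs_root_chunk_size(&b));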
*/ static inline void cs_wrap_chunk(struct cs_builder *b) { if (!cs_is_valid(b)) return; if (b->length_patch) { *b->length_patch = (b->cur_chunk.pos * 8); b->length_patch = NULL; } if (b->root_chunk.buffer.gpu == b->cur_chunk.buffer.gpu) b->root_chunk.size = b->cur_chunk.size; } enum cs_index_type { CS_INDEX_REGISTER = 0, CS_INDEX_UNDEF, }; struct cs_index { enum cs_index_type type; /* Number of 32-bit words in the index, must be nonzero */ uint8_t size; union { uint64_t imm; uint8_t reg; }; }; static inline struct cs_index cs_undef(void) { return (struct cs_index){ .type = CS_INDEX_UNDEF, }; } static inline uint8_t cs_to_reg_tuple(struct cs_index idx, ASSERTED unsigned expected_size) { assert(idx.type == CS_INDEX_REGISTER); assert(idx.size == expected_size); return idx.reg; } static inline unsigned cs_src_tuple(struct cs_builder *b, struct cs_index src, ASSERTED unsigned count, uint16_t mask) { unsigned reg = cs_to_reg_tuple(src, count); if (unlikely(b->conf.reg_perm)) { for (unsigned i = reg; i < reg + count; i++) { if (mask & BITFIELD_BIT(i - reg)) { assert((b->conf.reg_perm(b, i) & CS_REG_RD) || !"Trying to read a restricted register"); } } } struct cs_load_store_tracker *ls_tracker = b->conf.ls_tracker; if (unlikely(ls_tracker)) { for (unsigned i = reg; i < reg + count; i++) { if ((mask & BITFIELD_BIT(i - reg)) && BITSET_TEST(ls_tracker->pending_loads, i)) assert(!"register used as a source before flushing loads\n"); } } return reg; } static inline unsigned cs_src32(struct cs_builder *b, struct cs_index src) { return cs_src_tuple(b, src, 1, BITFIELD_MASK(1)); } static inline unsigned cs_src64(struct cs_builder *b, struct cs_index src) { return cs_src_tuple(b, src, 2, BITFIELD_MASK(2)); } static inline unsigned cs_dst_tuple(struct cs_builder *b, struct cs_index dst, ASSERTED unsigned count, uint16_t mask) { unsigned reg = cs_to_reg_tuple(dst, count); if (unlikely(b->conf.reg_perm)) { for (unsigned i = reg; i < reg + count; i++) { if (mask & BITFIELD_BIT(i - reg)) { assert((b->conf.reg_perm(b, i) & CS_REG_WR) || !"Trying to write a restricted register"); } } } struct cs_load_store_tracker *ls_tracker = b->conf.ls_tracker; if (unlikely(ls_tracker)) { for (unsigned i = reg; i < reg + count; i++) { if ((mask & BITFIELD_BIT(i - reg)) && BITSET_TEST(ls_tracker->pending_stores, i)) assert( !"register reused as a destination before flushing stores\n"); } } if (unlikely(b->conf.dirty_tracker)) { for (unsigned i = reg; i < reg + count; i++) { if (mask & BITFIELD_BIT(i - reg)) BITSET_SET(b->conf.dirty_tracker->regs, i); } } return reg; } static inline unsigned cs_dst32(struct cs_builder *b, struct cs_index dst) { return cs_dst_tuple(b, dst, 1, BITFIELD_MASK(1)); } static inline unsigned cs_dst64(struct cs_builder *b, struct cs_index dst) { return cs_dst_tuple(b, dst, 2, BITFIELD_MASK(2)); } static inline struct cs_index cs_reg_tuple(ASSERTED struct cs_builder *b, unsigned reg, unsigned size) { assert(reg + size <= b->conf.nr_registers - b->conf.nr_kernel_registers && "overflowed register file"); assert(size <= 16 && "unsupported"); return (struct cs_index){ .type = CS_INDEX_REGISTER, .size = size, .reg = reg, }; } static inline struct cs_index cs_reg32(struct cs_builder *b, unsigned reg) { return cs_reg_tuple(b, reg, 1); } static inline struct cs_index cs_reg64(struct cs_builder *b, unsigned reg) { assert((reg % 2) == 0 && "unaligned 64-bit reg"); return cs_reg_tuple(b, reg, 2); } /* * The top of the register file is reserved for cs_builder internal use. 
We * need 3 spare registers for handling command queue overflow. These are * available here. */ static inline uint8_t cs_overflow_address_reg(struct cs_builder *b) { return b->conf.nr_registers - 2; } static inline uint8_t cs_overflow_length_reg(struct cs_builder *b) { return b->conf.nr_registers - 3; } static inline struct cs_index cs_extract32(struct cs_builder *b, struct cs_index idx, unsigned word) { assert(idx.type == CS_INDEX_REGISTER && "unsupported"); assert(word < idx.size && "overrun"); return cs_reg32(b, idx.reg + word); } static inline struct cs_block * cs_cur_block(struct cs_builder *b) { return b->blocks.stack; } #define JUMP_SEQ_INSTR_COUNT 4 static inline bool cs_reserve_instrs(struct cs_builder *b, uint32_t num_instrs) { /* Don't call this function with num_instrs=0. */ assert(num_instrs > 0); assert(cs_cur_block(b) == NULL); /* If an allocation failure happened before, we just discard all following * instructions. */ if (unlikely(!cs_is_valid(b))) return false; /* Lazy root chunk allocation. */ if (unlikely(!b->root_chunk.buffer.cpu)) { b->root_chunk.buffer = b->conf.alloc_buffer(b->conf.cookie); b->cur_chunk.buffer = b->root_chunk.buffer; if (!b->cur_chunk.buffer.cpu) { b->invalid = true; return false; } } /* Make sure the instruction sequence fits in a single chunk. */ assert(b->cur_chunk.buffer.capacity >= num_instrs); /* If the current chunk runs out of space, allocate a new one and jump to it. * We actually do this a few instructions before running out, because the * sequence to jump to a new queue takes multiple instructions. */ if (unlikely((b->cur_chunk.size + num_instrs + JUMP_SEQ_INSTR_COUNT) > b->cur_chunk.buffer.capacity)) { /* Now, allocate a new chunk */ struct cs_buffer newbuf = b->conf.alloc_buffer(b->conf.cookie); /* Allocation failure, from now on, all new instructions will be * discarded. */ if (unlikely(!newbuf.cpu)) { b->invalid = true; return false; } uint64_t *ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++); pan_cast_and_pack(ptr, CS_MOVE, I) { I.destination = cs_overflow_address_reg(b); I.immediate = newbuf.gpu; } ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++); pan_cast_and_pack(ptr, CS_MOVE32, I) { I.destination = cs_overflow_length_reg(b); } /* The length will be patched in later */ uint32_t *length_patch = (uint32_t *)ptr; ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++); pan_cast_and_pack(ptr, CS_JUMP, I) { I.length = cs_overflow_length_reg(b); I.address = cs_overflow_address_reg(b); } /* Now that we've emitted everything, finish up the previous queue */ cs_wrap_chunk(b); /* And make this one current */ b->length_patch = length_patch; b->cur_chunk.buffer = newbuf; b->cur_chunk.pos = 0; } return true; } static inline void * cs_alloc_ins_block(struct cs_builder *b, uint32_t num_instrs) { if (cs_cur_block(b)) return util_dynarray_grow(&b->blocks.instrs, uint64_t, num_instrs); if (!cs_reserve_instrs(b, num_instrs)) return NULL; assert(b->cur_chunk.size + num_instrs - 1 < b->cur_chunk.buffer.capacity); uint32_t pos = b->cur_chunk.pos; b->cur_chunk.pos += num_instrs; return b->cur_chunk.buffer.cpu + pos; } static inline void cs_flush_block_instrs(struct cs_builder *b) { if (cs_cur_block(b) != NULL) return; uint32_t num_instrs = util_dynarray_num_elements(&b->blocks.instrs, uint64_t); if (!num_instrs) return; /* If LOAD_IP is the last instruction in the block, we reserve one more * slot to make sure the next instruction won't point to a CS chunk linking * sequence. 
 */
   if (unlikely(b->blocks.last_load_ip_target >= num_instrs)) {
      if (!cs_reserve_instrs(b, num_instrs + 1))
         return;
   }

   void *buffer = cs_alloc_ins_block(b, num_instrs);

   if (likely(buffer != NULL)) {
      /* If we have a LOAD_IP chain, we need to patch each LOAD_IP
       * instruction before we copy the block to the final memory
       * region.
       */
      while (unlikely(b->blocks.last_load_ip_target)) {
         uint64_t *instr = util_dynarray_element(
            &b->blocks.instrs, uint64_t, b->blocks.last_load_ip_target - 1);
         unsigned prev_load_ip_target = *instr & BITFIELD_MASK(32);
         uint64_t ip = b->cur_chunk.buffer.gpu +
                       ((b->cur_chunk.pos - num_instrs +
                         b->blocks.last_load_ip_target) *
                        sizeof(uint64_t));

         /* Drop the prev_load_ip_target value and replace it by the final
          * IP.
          */
         *instr &= ~BITFIELD64_MASK(32);
         *instr |= ip;
         b->blocks.last_load_ip_target = prev_load_ip_target;
      }

      memcpy(buffer, b->blocks.instrs.data, b->blocks.instrs.size);
   }

   util_dynarray_clear(&b->blocks.instrs);
}

static inline uint32_t
cs_block_next_pos(struct cs_builder *b)
{
   assert(cs_cur_block(b) != NULL);

   return util_dynarray_num_elements(&b->blocks.instrs, uint64_t);
}

static inline void
cs_label_init(struct cs_label *label)
{
   label->last_forward_ref = CS_LABEL_INVALID_POS;
   label->target = CS_LABEL_INVALID_POS;
}

static inline void
cs_set_label(struct cs_builder *b, struct cs_label *label)
{
   assert(label->target == CS_LABEL_INVALID_POS);
   label->target = cs_block_next_pos(b);

   for (uint32_t next_forward_ref, forward_ref = label->last_forward_ref;
        forward_ref != CS_LABEL_INVALID_POS; forward_ref = next_forward_ref) {
      uint64_t *ins =
         util_dynarray_element(&b->blocks.instrs, uint64_t, forward_ref);

      assert(forward_ref < label->target);
      assert(label->target - forward_ref <= INT16_MAX);

      /* Save the next forward reference to this target before overwriting
       * it with the final offset.
       */
      int16_t offset = *ins & BITFIELD64_MASK(16);

      next_forward_ref =
         offset > 0 ? forward_ref - offset : CS_LABEL_INVALID_POS;

      assert(next_forward_ref == CS_LABEL_INVALID_POS ||
             next_forward_ref < forward_ref);

      *ins &= ~BITFIELD64_MASK(16);
      *ins |= label->target - forward_ref - 1;
   }
}

static inline void
cs_flush_pending_if(struct cs_builder *b)
{
   if (likely(cs_cur_block(b) != &b->blocks.pending_if.block))
      return;

   cs_set_label(b, &b->blocks.pending_if.end_label);
   b->blocks.stack = b->blocks.pending_if.block.next;
   cs_flush_block_instrs(b);
}

static inline void *
cs_alloc_ins(struct cs_builder *b)
{
   /* If an instruction is emitted after an if_end(), it flushes the pending
    * if, causing further cs_else_start() instructions to be invalid.
    */
   cs_flush_pending_if(b);

   return cs_alloc_ins_block(b, 1) ?: &b->discard_instr_slot;
}

/* Call this when you are done building a command stream and want to prepare
 * it for submission.
 */
static inline void
cs_finish(struct cs_builder *b)
{
   if (!cs_is_valid(b))
      return;

   cs_flush_pending_if(b);
   cs_wrap_chunk(b);

   /* This prevents adding instructions after that point. */
   memset(&b->cur_chunk, 0, sizeof(b->cur_chunk));

   util_dynarray_fini(&b->blocks.instrs);
}

/*
 * Helper to emit a new instruction into the command queue. The allocation
 * needs to be separated out since pan_pack can evaluate its argument multiple
 * times, yet cs_alloc_ins() has side effects.
 */
#define cs_emit(b, T, cfg) pan_cast_and_pack(cs_alloc_ins(b), CS_##T, cfg)

/* Asynchronous operations take a mask of scoreboard slots to wait on
 * before executing the instruction, and signal a scoreboard slot when
 * the operation is complete.
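 *
 * As a small sketch (the slot numbers are made up), an operation that must
 * wait for scoreboard slot 2 and that signals slot 4 on completion would be
 * emitted with:
 *
 *    struct cs_async_op async = cs_defer(BITFIELD_BIT(2), 4);
 *
 * while cs_now() requests immediate execution.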
* A wait_mask of zero means the operation is synchronous, and signal_slot * is ignored in that case. */ struct cs_async_op { uint16_t wait_mask; uint8_t signal_slot; }; static inline struct cs_async_op cs_defer(unsigned wait_mask, unsigned signal_slot) { /* The scoreboard slot to signal is incremented before the wait operation, * waiting on it would cause an infinite wait. */ assert(!(wait_mask & BITFIELD_BIT(signal_slot))); return (struct cs_async_op){ .wait_mask = wait_mask, .signal_slot = signal_slot, }; } static inline struct cs_async_op cs_now(void) { return (struct cs_async_op){ .wait_mask = 0, .signal_slot = ~0, }; } static inline bool cs_instr_is_asynchronous(enum mali_cs_opcode opcode, uint16_t wait_mask) { switch (opcode) { case MALI_CS_OPCODE_FLUSH_CACHE2: case MALI_CS_OPCODE_FINISH_TILING: case MALI_CS_OPCODE_LOAD_MULTIPLE: case MALI_CS_OPCODE_STORE_MULTIPLE: case MALI_CS_OPCODE_RUN_COMPUTE: case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: case MALI_CS_OPCODE_RUN_FRAGMENT: case MALI_CS_OPCODE_RUN_FULLSCREEN: case MALI_CS_OPCODE_RUN_IDVS: case MALI_CS_OPCODE_RUN_TILING: /* Always asynchronous. */ return true; case MALI_CS_OPCODE_FINISH_FRAGMENT: case MALI_CS_OPCODE_SYNC_ADD32: case MALI_CS_OPCODE_SYNC_SET32: case MALI_CS_OPCODE_SYNC_ADD64: case MALI_CS_OPCODE_SYNC_SET64: case MALI_CS_OPCODE_STORE_STATE: case MALI_CS_OPCODE_TRACE_POINT: case MALI_CS_OPCODE_HEAP_OPERATION: /* Asynchronous only if wait_mask != 0. */ return wait_mask != 0; default: return false; } } #define cs_apply_async(I, async) \ do { \ I.wait_mask = async.wait_mask; \ I.signal_slot = cs_instr_is_asynchronous(I.opcode, I.wait_mask) \ ? async.signal_slot \ : 0; \ assert(I.signal_slot != ~0 || \ !"Can't use cs_now() on pure async instructions"); \ } while (0) static inline void cs_move32_to(struct cs_builder *b, struct cs_index dest, unsigned imm) { cs_emit(b, MOVE32, I) { I.destination = cs_dst32(b, dest); I.immediate = imm; } } static inline void cs_move48_to(struct cs_builder *b, struct cs_index dest, uint64_t imm) { cs_emit(b, MOVE, I) { I.destination = cs_dst64(b, dest); I.immediate = imm; } } static inline void cs_load_ip_to(struct cs_builder *b, struct cs_index dest) { /* If a load_ip instruction is emitted after an if_end(), it flushes the * pending if, causing further cs_else_start() instructions to be invalid. */ cs_flush_pending_if(b); if (likely(cs_cur_block(b) == NULL)) { if (!cs_reserve_instrs(b, 2)) return; /* We make IP point to the instruction right after our MOVE. 
*/ uint64_t ip = b->cur_chunk.buffer.gpu + (sizeof(uint64_t) * (b->cur_chunk.pos + 1)); cs_move48_to(b, dest, ip); } else { cs_move48_to(b, dest, b->blocks.last_load_ip_target); b->blocks.last_load_ip_target = util_dynarray_num_elements(&b->blocks.instrs, uint64_t); } } static inline void cs_block_start(struct cs_builder *b, struct cs_block *block) { cs_flush_pending_if(b); block->next = b->blocks.stack; b->blocks.stack = block; } static inline void cs_block_end(struct cs_builder *b, struct cs_block *block) { cs_flush_pending_if(b); assert(cs_cur_block(b) == block); b->blocks.stack = block->next; cs_flush_block_instrs(b); } static inline void cs_branch(struct cs_builder *b, int offset, enum mali_cs_condition cond, struct cs_index val) { cs_emit(b, BRANCH, I) { I.offset = offset; I.condition = cond; I.value = cs_src32(b, val); } } static inline void cs_branch_label(struct cs_builder *b, struct cs_label *label, enum mali_cs_condition cond, struct cs_index val) { assert(cs_cur_block(b) != NULL); if (label->target == CS_LABEL_INVALID_POS) { uint32_t branch_ins_pos = cs_block_next_pos(b); /* Instead of emitting a BRANCH with the final offset, we record the * diff between the current branch, and the previous branch that was * referencing this unset label. This way we build a single link list * that can be walked when the label is set with cs_set_label(). * We use -1 as the end-of-list marker. */ int16_t offset = -1; if (label->last_forward_ref != CS_LABEL_INVALID_POS) { assert(label->last_forward_ref < branch_ins_pos); assert(branch_ins_pos - label->last_forward_ref <= INT16_MAX); offset = branch_ins_pos - label->last_forward_ref; } cs_emit(b, BRANCH, I) { I.offset = offset; I.condition = cond; I.value = cond != MALI_CS_CONDITION_ALWAYS ? cs_src32(b, val) : 0; } label->last_forward_ref = branch_ins_pos; } else { int32_t offset = label->target - cs_block_next_pos(b) - 1; /* The branch target is encoded in a 16-bit signed integer, make sure we * don't underflow. */ assert(offset >= INT16_MIN); /* Backward references are easy, we can emit them immediately. */ cs_emit(b, BRANCH, I) { I.offset = offset; I.condition = cond; I.value = cond != MALI_CS_CONDITION_ALWAYS ? 
cs_src32(b, val) : 0; } } } static inline enum mali_cs_condition cs_invert_cond(enum mali_cs_condition cond) { switch (cond) { case MALI_CS_CONDITION_LEQUAL: return MALI_CS_CONDITION_GREATER; case MALI_CS_CONDITION_EQUAL: return MALI_CS_CONDITION_NEQUAL; case MALI_CS_CONDITION_LESS: return MALI_CS_CONDITION_GEQUAL; case MALI_CS_CONDITION_GREATER: return MALI_CS_CONDITION_LEQUAL; case MALI_CS_CONDITION_NEQUAL: return MALI_CS_CONDITION_EQUAL; case MALI_CS_CONDITION_GEQUAL: return MALI_CS_CONDITION_LESS; case MALI_CS_CONDITION_ALWAYS: unreachable("cannot invert ALWAYS"); default: unreachable("invalid cond"); } } static inline struct cs_if_else * cs_if_start(struct cs_builder *b, struct cs_if_else *if_else, enum mali_cs_condition cond, struct cs_index val) { cs_block_start(b, &if_else->block); cs_label_init(&if_else->end_label); cs_branch_label(b, &if_else->end_label, cs_invert_cond(cond), val); return if_else; } static inline void cs_if_end(struct cs_builder *b, struct cs_if_else *if_else) { assert(cs_cur_block(b) == &if_else->block); b->blocks.pending_if.block.next = if_else->block.next; b->blocks.stack = &b->blocks.pending_if.block; b->blocks.pending_if.end_label = if_else->end_label; } static inline struct cs_if_else * cs_else_start(struct cs_builder *b, struct cs_if_else *if_else) { assert(cs_cur_block(b) == &b->blocks.pending_if.block); if_else->block.next = b->blocks.pending_if.block.next; b->blocks.stack = &if_else->block; cs_label_init(&if_else->end_label); cs_branch_label(b, &if_else->end_label, MALI_CS_CONDITION_ALWAYS, cs_undef()); cs_set_label(b, &b->blocks.pending_if.end_label); cs_label_init(&b->blocks.pending_if.end_label); return if_else; } static inline void cs_else_end(struct cs_builder *b, struct cs_if_else *if_else) { cs_set_label(b, &if_else->end_label); cs_block_end(b, &if_else->block); } #define cs_if(__b, __cond, __val) \ for (struct cs_if_else __storage, \ *__if_else = cs_if_start(__b, &__storage, __cond, __val); \ __if_else != NULL; cs_if_end(__b, __if_else), __if_else = NULL) #define cs_else(__b) \ for (struct cs_if_else __storage, \ *__if_else = cs_else_start(__b, &__storage); \ __if_else != NULL; cs_else_end(__b, __if_else), __if_else = NULL) struct cs_loop { struct cs_label start, end; struct cs_block block; enum mali_cs_condition cond; struct cs_index val; struct cs_load_store_tracker *orig_ls_state; struct cs_load_store_tracker ls_state; }; static inline void cs_loop_diverge_ls_update(struct cs_builder *b, struct cs_loop *loop) { if (likely(!b->conf.ls_tracker)) return; if (!loop->orig_ls_state) { loop->orig_ls_state = b->conf.ls_tracker; loop->ls_state = *loop->orig_ls_state; b->conf.ls_tracker = &loop->ls_state; } else { BITSET_OR(loop->orig_ls_state->pending_loads, loop->orig_ls_state->pending_loads, loop->ls_state.pending_loads); BITSET_OR(loop->orig_ls_state->pending_stores, loop->orig_ls_state->pending_stores, loop->ls_state.pending_stores); } } static inline struct cs_loop * cs_do_while_start(struct cs_builder *b, struct cs_loop *loop, enum mali_cs_condition cond, struct cs_index val) { *loop = (struct cs_loop){ .cond = cond, .val = val, }; cs_block_start(b, &loop->block); cs_label_init(&loop->start); cs_label_init(&loop->end); cs_set_label(b, &loop->start); return loop; } static inline struct cs_loop * cs_while_start(struct cs_builder *b, struct cs_loop *loop, enum mali_cs_condition cond, struct cs_index val) { cs_do_while_start(b, loop, cond, val); /* Do an initial check on the condition, and if it's false, jump to * the end of the loop block. 
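 * Callers normally go through the cs_while() macro; a sketch (with a made-up
 * 32-bit counter register) looks like:
 *
 *    cs_while(b, MALI_CS_CONDITION_GREATER, counter) {
 *       cs_add32(b, counter, counter, -1);
 *    }
 *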
For 'while(true)' loops, skip the * conditional branch. */ if (cond != MALI_CS_CONDITION_ALWAYS) { cs_branch_label(b, &loop->end, cs_invert_cond(cond), val); cs_loop_diverge_ls_update(b, loop); } return loop; } static inline void cs_loop_conditional_continue(struct cs_builder *b, struct cs_loop *loop, enum mali_cs_condition cond, struct cs_index val) { cs_flush_pending_if(b); cs_branch_label(b, &loop->start, cond, val); cs_loop_diverge_ls_update(b, loop); } static inline void cs_loop_conditional_break(struct cs_builder *b, struct cs_loop *loop, enum mali_cs_condition cond, struct cs_index val) { cs_flush_pending_if(b); cs_branch_label(b, &loop->end, cond, val); cs_loop_diverge_ls_update(b, loop); } static inline void cs_while_end(struct cs_builder *b, struct cs_loop *loop) { cs_flush_pending_if(b); cs_branch_label(b, &loop->start, loop->cond, loop->val); cs_set_label(b, &loop->end); cs_block_end(b, &loop->block); if (unlikely(loop->orig_ls_state)) { BITSET_OR(loop->orig_ls_state->pending_loads, loop->orig_ls_state->pending_loads, loop->ls_state.pending_loads); BITSET_OR(loop->orig_ls_state->pending_stores, loop->orig_ls_state->pending_stores, loop->ls_state.pending_stores); b->conf.ls_tracker = loop->orig_ls_state; } } #define cs_while(__b, __cond, __val) \ for (struct cs_loop __loop_storage, \ *__loop = cs_while_start(__b, &__loop_storage, __cond, __val); \ __loop != NULL; cs_while_end(__b, __loop), __loop = NULL) #define cs_continue(__b) \ cs_loop_conditional_continue(__b, __loop, MALI_CS_CONDITION_ALWAYS, \ cs_undef()) #define cs_break(__b) \ cs_loop_conditional_break(__b, __loop, MALI_CS_CONDITION_ALWAYS, cs_undef()) /* Pseudoinstructions follow */ static inline void cs_move64_to(struct cs_builder *b, struct cs_index dest, uint64_t imm) { if (imm < (1ull << 48)) { /* Zero extends */ cs_move48_to(b, dest, imm); } else { cs_move32_to(b, cs_extract32(b, dest, 0), imm); cs_move32_to(b, cs_extract32(b, dest, 1), imm >> 32); } } static inline void cs_wait_slots(struct cs_builder *b, unsigned wait_mask, bool progress_inc) { struct cs_load_store_tracker *ls_tracker = b->conf.ls_tracker; cs_emit(b, WAIT, I) { I.wait_mask = wait_mask; I.progress_increment = progress_inc; } /* We don't do advanced tracking of cs_defer(), and assume that * load/store will be flushed with an explicit wait on the load/store * scoreboard. 
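 * In other words, the builder expects a sequence like the following sketch
 * (reg, addr and ls_slot are placeholders; ls_slot must match
 * conf.ls_tracker->sb_slot):
 *
 *    cs_load32_to(b, reg, addr, 0);
 *    cs_wait_slot(b, ls_slot, false);
 *    ...   reg may now be used as a source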
*/ if (unlikely(ls_tracker) && (wait_mask & BITFIELD_BIT(ls_tracker->sb_slot))) { BITSET_CLEAR_RANGE(ls_tracker->pending_loads, 0, 255); BITSET_CLEAR_RANGE(ls_tracker->pending_stores, 0, 255); } } static inline void cs_wait_slot(struct cs_builder *b, unsigned slot, bool progress_inc) { assert(slot < 8 && "invalid slot"); cs_wait_slots(b, BITFIELD_BIT(slot), progress_inc); } struct cs_shader_res_sel { uint8_t srt, fau, spd, tsd; }; static inline struct cs_shader_res_sel cs_shader_res_sel(unsigned srt, unsigned fau, unsigned spd, unsigned tsd) { return (struct cs_shader_res_sel){ .srt = srt, .fau = fau, .spd = spd, .tsd = tsd, }; } static inline void cs_run_compute(struct cs_builder *b, unsigned task_increment, enum mali_task_axis task_axis, bool progress_inc, struct cs_shader_res_sel res_sel) { cs_emit(b, RUN_COMPUTE, I) { I.task_increment = task_increment; I.task_axis = task_axis; I.progress_increment = progress_inc; I.srt_select = res_sel.srt; I.spd_select = res_sel.spd; I.tsd_select = res_sel.tsd; I.fau_select = res_sel.fau; } } static inline void cs_run_tiling(struct cs_builder *b, uint32_t flags_override, bool progress_inc, struct cs_shader_res_sel res_sel) { cs_emit(b, RUN_TILING, I) { I.flags_override = flags_override; I.progress_increment = progress_inc; I.srt_select = res_sel.srt; I.spd_select = res_sel.spd; I.tsd_select = res_sel.tsd; I.fau_select = res_sel.fau; } } static inline void cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool progress_inc, bool malloc_enable, struct cs_shader_res_sel varying_sel, struct cs_shader_res_sel frag_sel, struct cs_index draw_id) { cs_emit(b, RUN_IDVS, I) { I.flags_override = flags_override; I.progress_increment = progress_inc; I.malloc_enable = malloc_enable; if (draw_id.type == CS_INDEX_UNDEF) { I.draw_id_register_enable = false; } else { I.draw_id_register_enable = true; I.draw_id = cs_src32(b, draw_id); } assert(varying_sel.spd == 1); assert(varying_sel.fau == 0 || varying_sel.fau == 1); assert(varying_sel.srt == 0 || varying_sel.srt == 1); assert(varying_sel.tsd == 0 || varying_sel.tsd == 1); I.varying_fau_select = varying_sel.fau == 1; I.varying_srt_select = varying_sel.srt == 1; I.varying_tsd_select = varying_sel.tsd == 1; assert(frag_sel.spd == 2); assert(frag_sel.fau == 2); assert(frag_sel.srt == 2 || frag_sel.srt == 0); assert(frag_sel.tsd == 2 || frag_sel.tsd == 0); I.fragment_srt_select = frag_sel.srt == 2; I.fragment_tsd_select = frag_sel.tsd == 2; } } static inline void cs_run_fragment(struct cs_builder *b, bool enable_tem, enum mali_tile_render_order tile_order, bool progress_inc) { cs_emit(b, RUN_FRAGMENT, I) { I.enable_tem = enable_tem; I.tile_order = tile_order; I.progress_increment = progress_inc; } } static inline void cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override, bool progress_inc, struct cs_index dcd) { cs_emit(b, RUN_FULLSCREEN, I) { I.flags_override = flags_override; I.progress_increment = progress_inc; I.dcd = cs_src64(b, dcd); } } static inline void cs_finish_tiling(struct cs_builder *b, bool progress_inc) { cs_emit(b, FINISH_TILING, I) I.progress_increment = progress_inc; } static inline void cs_finish_fragment(struct cs_builder *b, bool increment_frag_completed, struct cs_index first_free_heap_chunk, struct cs_index last_free_heap_chunk, struct cs_async_op async) { cs_emit(b, FINISH_FRAGMENT, I) { I.increment_fragment_completed = increment_frag_completed; cs_apply_async(I, async); I.first_heap_chunk = cs_src64(b, first_free_heap_chunk); I.last_heap_chunk = cs_src64(b, 
last_free_heap_chunk); } } static inline void cs_add32(struct cs_builder *b, struct cs_index dest, struct cs_index src, unsigned imm) { cs_emit(b, ADD_IMMEDIATE32, I) { I.destination = cs_dst32(b, dest); I.source = cs_src32(b, src); I.immediate = imm; } } static inline void cs_add64(struct cs_builder *b, struct cs_index dest, struct cs_index src, unsigned imm) { cs_emit(b, ADD_IMMEDIATE64, I) { I.destination = cs_dst64(b, dest); I.source = cs_src64(b, src); I.immediate = imm; } } static inline void cs_umin32(struct cs_builder *b, struct cs_index dest, struct cs_index src1, struct cs_index src2) { cs_emit(b, UMIN32, I) { I.destination = cs_dst32(b, dest); I.source_1 = cs_src32(b, src1); I.source_2 = cs_src32(b, src2); } } static inline void cs_load_to(struct cs_builder *b, struct cs_index dest, struct cs_index address, unsigned mask, int offset) { unsigned count = util_last_bit(mask); unsigned base_reg = cs_dst_tuple(b, dest, count, mask); cs_emit(b, LOAD_MULTIPLE, I) { I.base_register = base_reg; I.address = cs_src64(b, address); I.mask = mask; I.offset = offset; } if (unlikely(b->conf.ls_tracker)) { for (unsigned i = 0; i < count; i++) { if (mask & BITFIELD_BIT(i)) BITSET_SET(b->conf.ls_tracker->pending_loads, base_reg + i); } } } static inline void cs_load32_to(struct cs_builder *b, struct cs_index dest, struct cs_index address, int offset) { cs_load_to(b, dest, address, BITFIELD_MASK(1), offset); } static inline void cs_load64_to(struct cs_builder *b, struct cs_index dest, struct cs_index address, int offset) { cs_load_to(b, dest, address, BITFIELD_MASK(2), offset); } static inline void cs_store(struct cs_builder *b, struct cs_index data, struct cs_index address, unsigned mask, int offset) { unsigned count = util_last_bit(mask); unsigned base_reg = cs_src_tuple(b, data, count, mask); cs_emit(b, STORE_MULTIPLE, I) { I.base_register = base_reg; I.address = cs_src64(b, address); I.mask = mask; I.offset = offset; } if (unlikely(b->conf.ls_tracker)) { for (unsigned i = 0; i < count; i++) { if (mask & BITFIELD_BIT(i)) BITSET_SET(b->conf.ls_tracker->pending_stores, base_reg + i); } } } static inline void cs_store32(struct cs_builder *b, struct cs_index data, struct cs_index address, int offset) { cs_store(b, data, address, BITFIELD_MASK(1), offset); } static inline void cs_store64(struct cs_builder *b, struct cs_index data, struct cs_index address, int offset) { cs_store(b, data, address, BITFIELD_MASK(2), offset); } /* * Select which scoreboard entry will track endpoint tasks and other tasks * respectively. Pass to cs_wait to wait later. */ static inline void cs_set_scoreboard_entry(struct cs_builder *b, unsigned ep, unsigned other) { assert(ep < 8 && "invalid slot"); assert(other < 8 && "invalid slot"); cs_emit(b, SET_SB_ENTRY, I) { I.endpoint_entry = ep; I.other_entry = other; } /* We assume the load/store scoreboard entry is static to keep things * simple. 
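 * E.g. with conf.ls_tracker->sb_slot == 2, every call in the stream is
 * expected to keep passing the same 'other' slot (values are illustrative):
 *
 *    cs_set_scoreboard_entry(b, ep, 2);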
*/ if (unlikely(b->conf.ls_tracker)) assert(b->conf.ls_tracker->sb_slot == other); } static inline void cs_progress_wait(struct cs_builder *b, unsigned queue, struct cs_index ref) { cs_emit(b, PROGRESS_WAIT, I) { I.source = cs_src64(b, ref); I.queue = queue; } } static inline void cs_set_exception_handler(struct cs_builder *b, enum mali_cs_exception_type exception_type, struct cs_index address, struct cs_index length) { cs_emit(b, SET_EXCEPTION_HANDLER, I) { I.exception_type = exception_type; I.address = cs_src64(b, address); I.length = cs_src32(b, length); } } static inline void cs_call(struct cs_builder *b, struct cs_index address, struct cs_index length) { cs_emit(b, CALL, I) { I.address = cs_src64(b, address); I.length = cs_src32(b, length); } } static inline void cs_jump(struct cs_builder *b, struct cs_index address, struct cs_index length) { cs_emit(b, JUMP, I) { I.address = cs_src64(b, address); I.length = cs_src32(b, length); } } enum cs_res_id { CS_COMPUTE_RES = BITFIELD_BIT(0), CS_FRAG_RES = BITFIELD_BIT(1), CS_TILER_RES = BITFIELD_BIT(2), CS_IDVS_RES = BITFIELD_BIT(3), }; static inline void cs_req_res(struct cs_builder *b, uint32_t res_mask) { cs_emit(b, REQ_RESOURCE, I) { I.compute = res_mask & CS_COMPUTE_RES; I.tiler = res_mask & CS_TILER_RES; I.idvs = res_mask & CS_IDVS_RES; I.fragment = res_mask & CS_FRAG_RES; } } static inline void cs_flush_caches(struct cs_builder *b, enum mali_cs_flush_mode l2, enum mali_cs_flush_mode lsc, bool other_inv, struct cs_index flush_id, struct cs_async_op async) { cs_emit(b, FLUSH_CACHE2, I) { I.l2_flush_mode = l2; I.lsc_flush_mode = lsc; I.other_invalidate = other_inv; I.latest_flush_id = cs_src32(b, flush_id); cs_apply_async(I, async); } } #define CS_SYNC_OPS(__cnt_width) \ static inline void cs_sync##__cnt_width##_set( \ struct cs_builder *b, bool propagate_error, \ enum mali_cs_sync_scope scope, struct cs_index val, \ struct cs_index addr, struct cs_async_op async) \ { \ cs_emit(b, SYNC_SET##__cnt_width, I) { \ I.error_propagate = propagate_error; \ I.scope = scope; \ I.data = cs_src##__cnt_width(b, val); \ I.address = cs_src64(b, addr); \ cs_apply_async(I, async); \ } \ } \ \ static inline void cs_sync##__cnt_width##_add( \ struct cs_builder *b, bool propagate_error, \ enum mali_cs_sync_scope scope, struct cs_index val, \ struct cs_index addr, struct cs_async_op async) \ { \ cs_emit(b, SYNC_ADD##__cnt_width, I) { \ I.error_propagate = propagate_error; \ I.scope = scope; \ I.data = cs_src##__cnt_width(b, val); \ I.address = cs_src64(b, addr); \ cs_apply_async(I, async); \ } \ } \ \ static inline void cs_sync##__cnt_width##_wait( \ struct cs_builder *b, bool reject_error, enum mali_cs_condition cond, \ struct cs_index ref, struct cs_index addr) \ { \ assert(cond == MALI_CS_CONDITION_LEQUAL || \ cond == MALI_CS_CONDITION_GREATER); \ cs_emit(b, SYNC_WAIT##__cnt_width, I) { \ I.error_reject = reject_error; \ I.condition = cond; \ I.data = cs_src##__cnt_width(b, ref); \ I.address = cs_src64(b, addr); \ } \ } CS_SYNC_OPS(32) CS_SYNC_OPS(64) static inline void cs_store_state(struct cs_builder *b, struct cs_index address, int offset, enum mali_cs_state state, struct cs_async_op async) { cs_emit(b, STORE_STATE, I) { I.offset = offset; I.state = state; I.address = cs_src64(b, address); cs_apply_async(I, async); } } static inline void cs_prot_region(struct cs_builder *b, unsigned size) { cs_emit(b, PROT_REGION, I) { I.size = size; } } static inline void cs_progress_store(struct cs_builder *b, struct cs_index src) { cs_emit(b, PROGRESS_STORE, I) 
I.source = cs_src64(b, src); } static inline void cs_progress_load(struct cs_builder *b, struct cs_index dst) { cs_emit(b, PROGRESS_LOAD, I) I.destination = cs_dst64(b, dst); } static inline void cs_run_compute_indirect(struct cs_builder *b, unsigned wg_per_task, bool progress_inc, struct cs_shader_res_sel res_sel) { cs_emit(b, RUN_COMPUTE_INDIRECT, I) { I.workgroups_per_task = wg_per_task; I.progress_increment = progress_inc; I.srt_select = res_sel.srt; I.spd_select = res_sel.spd; I.tsd_select = res_sel.tsd; I.fau_select = res_sel.fau; } } static inline void cs_error_barrier(struct cs_builder *b) { cs_emit(b, ERROR_BARRIER, _) ; } static inline void cs_heap_set(struct cs_builder *b, struct cs_index address) { cs_emit(b, HEAP_SET, I) { I.address = cs_src64(b, address); } } static inline void cs_heap_operation(struct cs_builder *b, enum mali_cs_heap_operation operation, struct cs_async_op async) { cs_emit(b, HEAP_OPERATION, I) { I.operation = operation; cs_apply_async(I, async); } } static inline void cs_vt_start(struct cs_builder *b, struct cs_async_op async) { cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_STARTED, async); } static inline void cs_vt_end(struct cs_builder *b, struct cs_async_op async) { cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_COMPLETED, async); } static inline void cs_frag_end(struct cs_builder *b, struct cs_async_op async) { cs_heap_operation(b, MALI_CS_HEAP_OPERATION_FRAGMENT_COMPLETED, async); } static inline void cs_trace_point(struct cs_builder *b, struct cs_index regs, struct cs_async_op async) { cs_emit(b, TRACE_POINT, I) { I.base_register = cs_src_tuple(b, regs, regs.size, BITFIELD_MASK(regs.size)); I.register_count = regs.size; cs_apply_async(I, async); } } struct cs_match { struct cs_block block; struct cs_label break_label; struct cs_block case_block; struct cs_label next_case_label; struct cs_index val; struct cs_index scratch_reg; struct cs_load_store_tracker case_ls_state; struct cs_load_store_tracker ls_state; struct cs_load_store_tracker *orig_ls_state; bool default_emitted; }; static inline struct cs_match * cs_match_start(struct cs_builder *b, struct cs_match *match, struct cs_index val, struct cs_index scratch_reg) { *match = (struct cs_match){ .val = val, .scratch_reg = scratch_reg, .orig_ls_state = b->conf.ls_tracker, }; cs_block_start(b, &match->block); cs_label_init(&match->break_label); cs_label_init(&match->next_case_label); return match; } static inline void cs_match_case_ls_set(struct cs_builder *b, struct cs_match *match) { if (unlikely(match->orig_ls_state)) { match->case_ls_state = *match->orig_ls_state; b->conf.ls_tracker = &match->case_ls_state; } } static inline void cs_match_case_ls_get(struct cs_match *match) { if (unlikely(match->orig_ls_state)) { BITSET_OR(match->ls_state.pending_loads, match->case_ls_state.pending_loads, match->ls_state.pending_loads); BITSET_OR(match->ls_state.pending_stores, match->case_ls_state.pending_stores, match->ls_state.pending_stores); } } static inline void cs_match_case(struct cs_builder *b, struct cs_match *match, uint32_t id) { assert(!match->default_emitted || !"default case must be last"); if (match->next_case_label.last_forward_ref != CS_LABEL_INVALID_POS) { cs_branch_label(b, &match->break_label, MALI_CS_CONDITION_ALWAYS, cs_undef()); cs_block_end(b, &match->case_block); cs_match_case_ls_get(match); cs_set_label(b, &match->next_case_label); cs_label_init(&match->next_case_label); } if (id) cs_add32(b, match->scratch_reg, match->val, -id); cs_branch_label(b, 
&match->next_case_label, MALI_CS_CONDITION_NEQUAL,
                   id ? match->scratch_reg : match->val);

   cs_match_case_ls_set(b, match);
   cs_block_start(b, &match->case_block);
}

static inline void
cs_match_default(struct cs_builder *b, struct cs_match *match)
{
   assert(match->next_case_label.last_forward_ref != CS_LABEL_INVALID_POS ||
          !"default case requires at least one other case");

   cs_branch_label(b, &match->break_label, MALI_CS_CONDITION_ALWAYS,
                   cs_undef());

   if (cs_cur_block(b) == &match->case_block) {
      cs_block_end(b, &match->case_block);
      cs_match_case_ls_get(match);
   }

   cs_set_label(b, &match->next_case_label);
   cs_label_init(&match->next_case_label);
   cs_match_case_ls_set(b, match);
   cs_block_start(b, &match->case_block);
   match->default_emitted = true;
}

static inline void
cs_match_end(struct cs_builder *b, struct cs_match *match)
{
   if (cs_cur_block(b) == &match->case_block) {
      cs_match_case_ls_get(match);
      cs_block_end(b, &match->case_block);
   }

   if (unlikely(match->orig_ls_state)) {
      if (!match->default_emitted) {
         /* If we don't have a default, assume we don't handle all possible
          * cases and merge the match load/store state into the original
          * load/store state.
          */
         BITSET_OR(match->orig_ls_state->pending_loads,
                   match->ls_state.pending_loads,
                   match->orig_ls_state->pending_loads);
         BITSET_OR(match->orig_ls_state->pending_stores,
                   match->ls_state.pending_stores,
                   match->orig_ls_state->pending_stores);
      } else {
         *match->orig_ls_state = match->ls_state;
      }

      b->conf.ls_tracker = match->orig_ls_state;
   }

   cs_set_label(b, &match->next_case_label);
   cs_set_label(b, &match->break_label);
   cs_block_end(b, &match->block);
}

#define cs_match(__b, __val, __scratch)                                        \
   for (struct cs_match __match_storage,                                      \
        *__match = cs_match_start(__b, &__match_storage, __val, __scratch);   \
        __match != NULL; cs_match_end(__b, &__match_storage), __match = NULL)

#define cs_case(__b, __ref)                                                    \
   for (bool __case_defined = ({                                              \
           cs_match_case(__b, __match, __ref);                                \
           false;                                                             \
        });                                                                   \
        !__case_defined; __case_defined = true)

#define cs_default(__b)                                                        \
   for (bool __default_defined = ({                                           \
           cs_match_default(__b, __match);                                    \
           false;                                                             \
        });                                                                   \
        !__default_defined; __default_defined = true)

static inline void
cs_nop(struct cs_builder *b)
{
   cs_emit(b, NOP, I) {};
}

struct cs_exception_handler_ctx {
   struct cs_index ctx_reg;
   unsigned dump_addr_offset;
   uint8_t ls_sb_slot;
};

struct cs_exception_handler {
   struct cs_block block;
   struct cs_dirty_tracker dirty;
   struct cs_exception_handler_ctx ctx;
   unsigned dump_size;
   uint64_t address;
   uint32_t length;
};

static inline struct cs_exception_handler *
cs_exception_handler_start(struct cs_builder *b,
                           struct cs_exception_handler *handler,
                           struct cs_exception_handler_ctx ctx)
{
   assert(cs_cur_block(b) == NULL);
   assert(b->conf.dirty_tracker == NULL);

   *handler = (struct cs_exception_handler){
      .ctx = ctx,
   };

   cs_block_start(b, &handler->block);
   b->conf.dirty_tracker = &handler->dirty;

   return handler;
}

#define SAVE_RESTORE_MAX_OPS (256 / 16)

static inline void
cs_exception_handler_end(struct cs_builder *b,
                         struct cs_exception_handler *handler)
{
   struct cs_index ranges[SAVE_RESTORE_MAX_OPS];
   uint16_t masks[SAVE_RESTORE_MAX_OPS];
   unsigned num_ranges = 0;
   uint32_t num_instrs =
      util_dynarray_num_elements(&b->blocks.instrs, uint64_t);
   struct cs_index addr_reg = {
      .type = CS_INDEX_REGISTER,
      .size = 2,
      .reg = b->conf.nr_registers - 2,
   };

   /* Manual cs_block_end() without an instruction flush. We do that to insert
    * the preamble without having to move memory in b->blocks.instrs. The
    * flush will be done after the preamble has been emitted.
*/ assert(cs_cur_block(b) == &handler->block); assert(handler->block.next == NULL); b->blocks.stack = NULL; if (!num_instrs) return; /* Try to minimize number of load/store by grouping them */ unsigned nregs = b->conf.nr_registers - b->conf.nr_kernel_registers; unsigned pos, last = 0; BITSET_FOREACH_SET(pos, handler->dirty.regs, nregs) { unsigned range = MIN2(nregs - pos, 16); unsigned word = BITSET_BITWORD(pos); unsigned bit = pos % BITSET_WORDBITS; unsigned remaining_bits = BITSET_WORDBITS - bit; if (pos < last) continue; masks[num_ranges] = handler->dirty.regs[word] >> bit; if (remaining_bits < range) masks[num_ranges] |= handler->dirty.regs[word + 1] << remaining_bits; masks[num_ranges] &= BITFIELD_MASK(range); ranges[num_ranges] = cs_reg_tuple(b, pos, util_last_bit(masks[num_ranges])); num_ranges++; last = pos + range; } handler->dump_size = BITSET_COUNT(handler->dirty.regs) * sizeof(uint32_t); /* Make sure the current chunk is able to accommodate the block * instructions as well as the preamble and postamble. * Adding 4 instructions (2x wait_slot and the move for the address) as * the move might actually be translated to two MOVE32 instructions. */ num_instrs += (num_ranges * 2) + 4; /* Align things on a cache-line in case the buffer contains more than one * exception handler (64 bytes = 8 instructions). */ uint32_t padded_num_instrs = ALIGN_POT(num_instrs, 8); if (!cs_reserve_instrs(b, padded_num_instrs)) return; handler->address = b->cur_chunk.buffer.gpu + (b->cur_chunk.pos * sizeof(uint64_t)); /* Preamble: backup modified registers */ if (num_ranges > 0) { unsigned offset = 0; cs_load64_to(b, addr_reg, handler->ctx.ctx_reg, handler->ctx.dump_addr_offset); cs_wait_slot(b, handler->ctx.ls_sb_slot, false); for (unsigned i = 0; i < num_ranges; ++i) { unsigned reg_count = util_bitcount(masks[i]); cs_store(b, ranges[i], addr_reg, masks[i], offset); offset += reg_count * 4; } cs_wait_slot(b, handler->ctx.ls_sb_slot, false); } /* Now that the preamble is emitted, we can flush the instructions we have in * our exception handler block. */ cs_flush_block_instrs(b); /* Postamble: restore modified registers */ if (num_ranges > 0) { unsigned offset = 0; cs_load64_to(b, addr_reg, handler->ctx.ctx_reg, handler->ctx.dump_addr_offset); cs_wait_slot(b, handler->ctx.ls_sb_slot, false); for (unsigned i = 0; i < num_ranges; ++i) { unsigned reg_count = util_bitcount(masks[i]); cs_load_to(b, ranges[i], addr_reg, masks[i], offset); offset += reg_count * 4; } cs_wait_slot(b, handler->ctx.ls_sb_slot, false); } /* Fill the rest of the buffer with NOPs. */ for (; num_instrs < padded_num_instrs; num_instrs++) cs_nop(b); handler->length = padded_num_instrs; } #define cs_exception_handler_def(__b, __handler, __ctx) \ for (struct cs_exception_handler *__ehandler = \ cs_exception_handler_start(__b, __handler, __ctx); \ __ehandler != NULL; \ cs_exception_handler_end(__b, __handler), __ehandler = NULL) struct cs_tracing_ctx { bool enabled; struct cs_index ctx_reg; unsigned tracebuf_addr_offset; uint8_t ls_sb_slot; }; static inline void cs_trace_preamble(struct cs_builder *b, const struct cs_tracing_ctx *ctx, struct cs_index scratch_regs, unsigned trace_size) { assert(trace_size > 0 && ALIGN_POT(trace_size, 64) == trace_size && trace_size < INT16_MAX); assert(scratch_regs.size >= 4 && !(scratch_regs.reg & 1)); struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg); /* We always update the tracebuf position first, so we can easily detect OOB * access. 
Use cs_trace_field_offset() to get an offset taking this * pre-increment into account. */ cs_load64_to(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset); cs_wait_slot(b, ctx->ls_sb_slot, false); cs_add64(b, tracebuf_addr, tracebuf_addr, trace_size); cs_store64(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset); cs_wait_slot(b, ctx->ls_sb_slot, false); } #define cs_trace_field_offset(__type, __field) \ (int16_t)(offsetof(struct cs_##__type##_trace, __field) - \ sizeof(struct cs_##__type##_trace)) struct cs_run_fragment_trace { uint64_t ip; uint32_t sr[7]; } __attribute__((aligned(64))); static inline void cs_trace_run_fragment(struct cs_builder *b, const struct cs_tracing_ctx *ctx, struct cs_index scratch_regs, bool enable_tem, enum mali_tile_render_order tile_order, bool progress_inc) { if (likely(!ctx->enabled)) { cs_run_fragment(b, enable_tem, tile_order, progress_inc); return; } struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg); struct cs_index data = cs_reg64(b, scratch_regs.reg + 2); cs_trace_preamble(b, ctx, scratch_regs, sizeof(struct cs_run_fragment_trace)); /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP * won't point to the right instruction. */ cs_load_ip_to(b, data); cs_run_fragment(b, enable_tem, tile_order, progress_inc); cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_fragment, ip)); cs_store(b, cs_reg_tuple(b, 40, 7), tracebuf_addr, BITFIELD_MASK(7), cs_trace_field_offset(run_fragment, sr)); cs_wait_slot(b, ctx->ls_sb_slot, false); } struct cs_run_idvs_trace { uint64_t ip; uint32_t draw_id; uint32_t pad; uint32_t sr[61]; } __attribute__((aligned(64))); static inline void cs_trace_run_idvs(struct cs_builder *b, const struct cs_tracing_ctx *ctx, struct cs_index scratch_regs, uint32_t flags_override, bool progress_inc, bool malloc_enable, struct cs_shader_res_sel varying_sel, struct cs_shader_res_sel frag_sel, struct cs_index draw_id) { if (likely(!ctx->enabled)) { cs_run_idvs(b, flags_override, progress_inc, malloc_enable, varying_sel, frag_sel, draw_id); return; } struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg); struct cs_index data = cs_reg64(b, scratch_regs.reg + 2); cs_trace_preamble(b, ctx, scratch_regs, sizeof(struct cs_run_idvs_trace)); /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP * won't point to the right instruction. 
*/ cs_load_ip_to(b, data); cs_run_idvs(b, flags_override, progress_inc, malloc_enable, varying_sel, frag_sel, draw_id); cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_idvs, ip)); if (draw_id.type != CS_INDEX_UNDEF) cs_store32(b, draw_id, tracebuf_addr, cs_trace_field_offset(run_idvs, draw_id)); for (unsigned i = 0; i < 48; i += 16) cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16), cs_trace_field_offset(run_idvs, sr[i])); cs_store(b, cs_reg_tuple(b, 48, 13), tracebuf_addr, BITFIELD_MASK(13), cs_trace_field_offset(run_idvs, sr[48])); cs_wait_slot(b, ctx->ls_sb_slot, false); } struct cs_run_compute_trace { uint64_t ip; uint32_t sr[40]; } __attribute__((aligned(64))); static inline void cs_trace_run_compute(struct cs_builder *b, const struct cs_tracing_ctx *ctx, struct cs_index scratch_regs, unsigned task_increment, enum mali_task_axis task_axis, bool progress_inc, struct cs_shader_res_sel res_sel) { if (likely(!ctx->enabled)) { cs_run_compute(b, task_increment, task_axis, progress_inc, res_sel); return; } struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg); struct cs_index data = cs_reg64(b, scratch_regs.reg + 2); cs_trace_preamble(b, ctx, scratch_regs, sizeof(struct cs_run_compute_trace)); /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP * won't point to the right instruction. */ cs_load_ip_to(b, data); cs_run_compute(b, task_increment, task_axis, progress_inc, res_sel); cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_compute, ip)); for (unsigned i = 0; i < 32; i += 16) cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16), cs_trace_field_offset(run_compute, sr[i])); cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8), cs_trace_field_offset(run_compute, sr[32])); cs_wait_slot(b, ctx->ls_sb_slot, false); } static inline void cs_trace_run_compute_indirect(struct cs_builder *b, const struct cs_tracing_ctx *ctx, struct cs_index scratch_regs, unsigned wg_per_task, bool progress_inc, struct cs_shader_res_sel res_sel) { if (likely(!ctx->enabled)) { cs_run_compute_indirect(b, wg_per_task, progress_inc, res_sel); return; } struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg); struct cs_index data = cs_reg64(b, scratch_regs.reg + 2); cs_trace_preamble(b, ctx, scratch_regs, sizeof(struct cs_run_compute_trace)); /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP * won't point to the right instruction. */ cs_load_ip_to(b, data); cs_run_compute_indirect(b, wg_per_task, progress_inc, res_sel); cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_compute, ip)); for (unsigned i = 0; i < 32; i += 16) cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16), cs_trace_field_offset(run_compute, sr[i])); cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8), cs_trace_field_offset(run_compute, sr[32])); cs_wait_slot(b, ctx->ls_sb_slot, false); }
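
/*
 * Usage sketch for the structured control-flow helpers above (illustrative
 * only; the register indices, conditions and case values are made up):
 *
 *    struct cs_index x = cs_reg32(b, 10);
 *    struct cs_index scratch = cs_reg32(b, 11);
 *
 *    cs_if(b, MALI_CS_CONDITION_GREATER, x) {
 *       cs_add32(b, x, x, -1);
 *    }
 *    cs_else(b) {
 *       cs_move32_to(b, x, 0);
 *    }
 *
 *    cs_match(b, x, scratch) {
 *       cs_case(b, 0) {
 *          cs_move32_to(b, x, 1);
 *       }
 *       cs_default(b) {
 *          cs_move32_to(b, x, 2);
 *       }
 *    }
 */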