/*
 * Copyright © 2014-2015 Broadcom
 * SPDX-License-Identifier: MIT
 */

#include "nir_to_rc.h"
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_deref.h"
#include "compiler/nir/nir_legacy.h"
#include "compiler/nir/nir_worklist.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_from_mesa.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_ureg.h"
#include "tgsi/tgsi_util.h"
#include "util/u_debug.h"
#include "util/u_dynarray.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "r300_nir.h"
#include "r300_screen.h"

struct ntr_insn {
   enum tgsi_opcode opcode;
   struct ureg_dst dst[2];
   struct ureg_src src[4];
   enum tgsi_texture_type tex_target;
   enum tgsi_return_type tex_return_type;
   struct tgsi_texture_offset tex_offset[4];

   unsigned mem_qualifier;
   enum pipe_format mem_format;

   bool is_tex : 1;
   bool precise : 1;
};

struct ntr_block {
   /* Array of struct ntr_insn */
   struct util_dynarray insns;
   int start_ip;
   int end_ip;
};

struct ntr_reg_interval {
   uint32_t start, end;
};

struct ntr_compile {
   nir_shader *s;
   nir_function_impl *impl;
   struct pipe_screen *screen;
   struct ureg_program *ureg;

   /* Options */
   bool lower_fabs;

   bool addr_declared[3];
   struct ureg_dst addr_reg[3];

   /* if condition set up at the end of a block, for ntr_emit_if(). */
   struct ureg_src if_cond;

   /* TGSI temps for our NIR SSA and register values. */
   struct ureg_dst *reg_temp;
   struct ureg_src *ssa_temp;

   struct ntr_reg_interval *liveness;

   /* Map from nir_block to ntr_block */
   struct hash_table *blocks;
   struct ntr_block *cur_block;
   unsigned current_if_else;
   unsigned cf_label;

   /* Whether we're currently emitting instructions for a precise NIR instruction. */
   bool precise;

   unsigned num_temps;
   unsigned first_non_array_temp;

   /* Mappings from driver_location to TGSI input/output number.
    *
    * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
    * their numbers assigned incrementally, unlike inputs or constants.
    */
   struct ureg_src *input_index_map;
   uint64_t centroid_inputs;

   uint32_t first_ubo;
};

static struct ureg_dst
ntr_temp(struct ntr_compile *c)
{
   return ureg_dst_register(TGSI_FILE_TEMPORARY, c->num_temps++);
}

static struct ntr_block *
ntr_block_from_nir(struct ntr_compile *c, struct nir_block *block)
{
   struct hash_entry *entry = _mesa_hash_table_search(c->blocks, block);
   return entry->data;
}

static void ntr_emit_cf_list(struct ntr_compile *c, struct exec_list *list);
static void ntr_emit_cf_list_ureg(struct ntr_compile *c, struct exec_list *list);

static struct ntr_insn *
ntr_insn(struct ntr_compile *c, enum tgsi_opcode opcode, struct ureg_dst dst,
         struct ureg_src src0, struct ureg_src src1, struct ureg_src src2, struct ureg_src src3)
{
   struct ntr_insn insn = {
      .opcode = opcode,
      .dst = {dst, ureg_dst_undef()},
      .src = {src0, src1, src2, src3},
      .precise = c->precise,
   };
   util_dynarray_append(&c->cur_block->insns, struct ntr_insn, insn);
   return util_dynarray_top_ptr(&c->cur_block->insns, struct ntr_insn);
}

#define OP00(op)                                                                          \
   static inline void ntr_##op(struct ntr_compile *c)                                     \
   {                                                                                      \
      ntr_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), ureg_src_undef(), ureg_src_undef(), \
               ureg_src_undef(), ureg_src_undef());                                       \
   }

#define OP01(op)                                                                                 \
   static inline void ntr_##op(struct ntr_compile *c, struct ureg_src src0)                     \
   {                                                                                            \
      ntr_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), src0, ureg_src_undef(), ureg_src_undef(), \
               ureg_src_undef());                                                               \
   }

#define OP10(op)                                                                               \
   static inline void ntr_##op(struct ntr_compile *c, struct ureg_dst dst)                    \
   {                                                                                          \
      ntr_insn(c, TGSI_OPCODE_##op, dst, ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), \
               ureg_src_undef());                                                             \
   }

#define OP11(op)                                                                                 \
   static inline void ntr_##op(struct ntr_compile *c, struct ureg_dst dst, struct ureg_src src0) \
   {                                                                                            \
      ntr_insn(c, TGSI_OPCODE_##op, dst, src0, ureg_src_undef(), ureg_src_undef(),              \
               ureg_src_undef());                                                               \
   }

#define OP12(op)                                                                                 \
   static inline void ntr_##op(struct ntr_compile *c, struct ureg_dst dst, struct ureg_src src0, \
                               struct ureg_src src1)                                            \
   {                                                                                            \
      ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, ureg_src_undef(), ureg_src_undef());       \
   }

#define OP13(op)                                                                                 \
   static inline void ntr_##op(struct ntr_compile *c, struct ureg_dst dst, struct ureg_src src0, \
                               struct ureg_src src1, struct ureg_src src2)                      \
   {                                                                                            \
      ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, ureg_src_undef());                   \
   }

#define OP14(op)                                                                                 \
   static inline void ntr_##op(struct ntr_compile *c, struct ureg_dst dst, struct ureg_src src0, \
                               struct ureg_src src1, struct ureg_src src2, struct ureg_src src3) \
   {                                                                                            \
      ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, src3);                               \
   }

/* We hand-craft our tex instructions */
#define OP12_TEX(op)
#define OP14_TEX(op)

/* Use a template include to generate a correctly-typed ntr_OP()
 * function for each TGSI opcode:
 */
#include "gallium/auxiliary/tgsi/tgsi_opcode_tmp.h"
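
/* Editor's note (illustrative, not part of the original driver): the template
 * include above expands OP11/OP12/etc. into one typed emitter per TGSI opcode.
 * A minimal sketch of how they're used, assuming a compile context "c":
 *
 *    struct ureg_dst t = ntr_temp(c);            // fresh virtual TEMP
 *    ntr_MOV(c, t, ureg_imm1f(c->ureg, 1.0f));   // queues a struct ntr_insn
 *
 * This only appends to c->cur_block->insns; actual ureg tokens are produced
 * later by ntr_emit_block_ureg().
 */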
/**
 * Interprets a nir_load_const used as a NIR src as a uint.
 *
 * For non-native-integer drivers, nir_load_const_instrs used by an integer ALU
 * instruction (or in a phi-web used by an integer ALU instruction) were
 * converted to floats and the ALU instruction swapped to the float equivalent.
 * However, this means that integer load_consts used by intrinsics (which don't
 * normally get that conversion) may have been reformatted to be floats.  Given
 * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
 * we can just look and see if they look like floats and convert them back to
 * ints.
 */
static uint32_t
ntr_src_as_uint(struct ntr_compile *c, nir_src src)
{
   uint32_t val = nir_src_as_uint(src);
   if (val >= fui(1.0))
      val = (uint32_t)uif(val);
   return val;
}

/* Per-channel masks of def/use within the block, and the per-channel
 * livein/liveout for the block as a whole.
 */
struct ntr_live_reg_block_state {
   uint8_t *def, *use, *livein, *liveout, *defin, *defout;
};

struct ntr_live_reg_state {
   unsigned bitset_words;

   struct ntr_reg_interval *regs;

   /* Used in propagate_across_edge() */
   BITSET_WORD *tmp_live;

   struct ntr_live_reg_block_state *blocks;

   nir_block_worklist worklist;
};

static void
ntr_live_reg_mark_use(struct ntr_compile *c, struct ntr_live_reg_block_state *bs, int ip,
                      unsigned index, unsigned used_mask)
{
   bs->use[index] |= used_mask & ~bs->def[index];

   c->liveness[index].start = MIN2(c->liveness[index].start, ip);
   c->liveness[index].end = MAX2(c->liveness[index].end, ip);
}

static void
ntr_live_reg_setup_def_use(struct ntr_compile *c, nir_function_impl *impl,
                           struct ntr_live_reg_state *state)
{
   for (int i = 0; i < impl->num_blocks; i++) {
      state->blocks[i].def = rzalloc_array(state->blocks, uint8_t, c->num_temps);
      state->blocks[i].defin = rzalloc_array(state->blocks, uint8_t, c->num_temps);
      state->blocks[i].defout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
      state->blocks[i].use = rzalloc_array(state->blocks, uint8_t, c->num_temps);
      state->blocks[i].livein = rzalloc_array(state->blocks, uint8_t, c->num_temps);
      state->blocks[i].liveout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
   }

   int ip = 0;
   nir_foreach_block (block, impl) {
      struct ntr_live_reg_block_state *bs = &state->blocks[block->index];
      struct ntr_block *ntr_block = ntr_block_from_nir(c, block);

      ntr_block->start_ip = ip;

      util_dynarray_foreach (&ntr_block->insns, struct ntr_insn, insn) {
         const struct tgsi_opcode_info *opcode_info = tgsi_get_opcode_info(insn->opcode);

         /* Set up use[] for the srcs.
          *
          * Uses are the channels of the reg read in the block that don't have a
          * preceding def to screen them off.  Note that we don't do per-element
          * tracking of array regs, so they're never screened off.
          */
         for (int i = 0; i < opcode_info->num_src; i++) {
            if (insn->src[i].File != TGSI_FILE_TEMPORARY)
               continue;
            int index = insn->src[i].Index;

            uint32_t used_mask = tgsi_util_get_src_usage_mask(
               insn->opcode, i, insn->dst->WriteMask, insn->src[i].SwizzleX,
               insn->src[i].SwizzleY, insn->src[i].SwizzleZ, insn->src[i].SwizzleW,
               insn->tex_target, insn->tex_target);

            assert(!insn->src[i].Indirect || index < c->first_non_array_temp);

            ntr_live_reg_mark_use(c, bs, ip, index, used_mask);
         }

         if (insn->is_tex) {
            for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
               if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY)
                  ntr_live_reg_mark_use(c, bs, ip, insn->tex_offset[i].Index, 0xf);
            }
         }

         /* Set up def[] for the dsts.
          *
          * Defs are the unconditionally-written (not R/M/W) channels of the reg in
          * the block that don't have a preceding use.
          */
         for (int i = 0; i < opcode_info->num_dst; i++) {
            if (insn->dst[i].File != TGSI_FILE_TEMPORARY)
               continue;
            int index = insn->dst[i].Index;
            uint32_t writemask = insn->dst[i].WriteMask;

            bs->def[index] |= writemask & ~bs->use[index];
            bs->defout[index] |= writemask;

            assert(!insn->dst[i].Indirect || index < c->first_non_array_temp);

            c->liveness[index].start = MIN2(c->liveness[index].start, ip);
            c->liveness[index].end = MAX2(c->liveness[index].end, ip);
         }
         ip++;
      }

      ntr_block->end_ip = ip;
   }
}
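
/* Editor's sketch (hypothetical register numbers): for a block containing
 *
 *    MOV TEMP[2].xy, TEMP[3].xyyy
 *    MOV TEMP[3].x,  TEMP[2].xxxx
 *
 * the loops above compute use[3] = 0x3 (read with no preceding def),
 * def[3] = 0 (its .x write is screened off by the earlier read), and
 * def[2] = 0x3 with use[2] = 0 (only read after being fully written).
 */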
static void
ntr_live_regs(struct ntr_compile *c, nir_function_impl *impl)
{
   nir_metadata_require(impl, nir_metadata_block_index);

   c->liveness = rzalloc_array(c, struct ntr_reg_interval, c->num_temps);

   struct ntr_live_reg_state state = {
      .blocks = rzalloc_array(impl, struct ntr_live_reg_block_state, impl->num_blocks),
   };

   /* The intervals start out with start > end (indicating unused) */
   for (int i = 0; i < c->num_temps; i++)
      c->liveness[i].start = ~0;

   ntr_live_reg_setup_def_use(c, impl, &state);

   /* Make a forward-order worklist of all the blocks. */
   nir_block_worklist_init(&state.worklist, impl->num_blocks, NULL);
   nir_foreach_block (block, impl) {
      nir_block_worklist_push_tail(&state.worklist, block);
   }

   /* Propagate defin/defout down the CFG to calculate the live variables
    * potentially defined along any possible control flow path.  We'll use this
    * to keep things like conditional defs of the reg (or array regs where we
    * don't track defs!) from making the reg's live range extend back to the
    * start of the program.
    */
   while (!nir_block_worklist_is_empty(&state.worklist)) {
      nir_block *block = nir_block_worklist_pop_head(&state.worklist);
      for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
         nir_block *succ = block->successors[j];
         if (!succ || succ->index == impl->num_blocks)
            continue;

         for (int i = 0; i < c->num_temps; i++) {
            uint8_t new_def =
               state.blocks[block->index].defout[i] & ~state.blocks[succ->index].defin[i];

            if (new_def) {
               state.blocks[succ->index].defin[i] |= new_def;
               state.blocks[succ->index].defout[i] |= new_def;
               nir_block_worklist_push_tail(&state.worklist, succ);
            }
         }
      }
   }

   /* Make a reverse-order worklist of all the blocks. */
   nir_foreach_block (block, impl) {
      nir_block_worklist_push_head(&state.worklist, block);
   }

   /* We're now ready to work through the worklist and update the liveness sets
    * of each of the blocks.  As long as we keep the worklist up-to-date as we
    * go, everything will get covered.
    */
   while (!nir_block_worklist_is_empty(&state.worklist)) {
      /* We pop them off in the reverse order we pushed them on.  This way
       * the first walk of the instructions is backwards so we only walk
       * once in the case of no control flow.
       */
      nir_block *block = nir_block_worklist_pop_head(&state.worklist);
      struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
      struct ntr_live_reg_block_state *bs = &state.blocks[block->index];

      for (int i = 0; i < c->num_temps; i++) {
         /* Collect livein from our successors to include in our liveout. */
         for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
            nir_block *succ = block->successors[j];
            if (!succ || succ->index == impl->num_blocks)
               continue;
            struct ntr_live_reg_block_state *sbs = &state.blocks[succ->index];

            uint8_t new_liveout = sbs->livein[i] & ~bs->liveout[i];
            if (new_liveout) {
               if (state.blocks[block->index].defout[i])
                  c->liveness[i].end = MAX2(c->liveness[i].end, ntr_block->end_ip);
               bs->liveout[i] |= sbs->livein[i];
            }
         }

         /* Propagate use requests from either our block's uses or our
          * non-screened-off liveout up to our predecessors.
          */
         uint8_t new_livein = ((bs->use[i] | (bs->liveout[i] & ~bs->def[i])) & ~bs->livein[i]);
         if (new_livein) {
            bs->livein[i] |= new_livein;
            set_foreach (block->predecessors, entry) {
               nir_block *pred = (void *)entry->key;
               nir_block_worklist_push_tail(&state.worklist, pred);
            }

            if (new_livein & state.blocks[block->index].defin[i])
               c->liveness[i].start = MIN2(c->liveness[i].start, ntr_block->start_ip);
         }
      }
   }

   ralloc_free(state.blocks);
   nir_block_worklist_fini(&state.worklist);
}

static void
ntr_ra_check(struct ntr_compile *c, unsigned *ra_map, BITSET_WORD *released, int ip,
             unsigned index)
{
   if (index < c->first_non_array_temp)
      return;

   if (c->liveness[index].start == ip && ra_map[index] == ~0)
      ra_map[index] = ureg_DECL_temporary(c->ureg).Index;

   if (c->liveness[index].end == ip && !BITSET_TEST(released, index)) {
      ureg_release_temporary(c->ureg, ureg_dst_register(TGSI_FILE_TEMPORARY, ra_map[index]));
      BITSET_SET(released, index);
   }
}

static void
ntr_allocate_regs(struct ntr_compile *c, nir_function_impl *impl)
{
   ntr_live_regs(c, impl);

   unsigned *ra_map = ralloc_array(c, unsigned, c->num_temps);
   unsigned *released = rzalloc_array(c, BITSET_WORD, BITSET_WORDS(c->num_temps));

   /* No RA on NIR array regs */
   for (int i = 0; i < c->first_non_array_temp; i++)
      ra_map[i] = i;

   for (int i = c->first_non_array_temp; i < c->num_temps; i++)
      ra_map[i] = ~0;

   int ip = 0;
   nir_foreach_block (block, impl) {
      struct ntr_block *ntr_block = ntr_block_from_nir(c, block);

      for (int i = 0; i < c->num_temps; i++)
         ntr_ra_check(c, ra_map, released, ip, i);

      util_dynarray_foreach (&ntr_block->insns, struct ntr_insn, insn) {
         const struct tgsi_opcode_info *opcode_info = tgsi_get_opcode_info(insn->opcode);

         for (int i = 0; i < opcode_info->num_src; i++) {
            if (insn->src[i].File == TGSI_FILE_TEMPORARY) {
               ntr_ra_check(c, ra_map, released, ip, insn->src[i].Index);
               insn->src[i].Index = ra_map[insn->src[i].Index];
            }
         }

         if (insn->is_tex) {
            for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
               if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY) {
                  ntr_ra_check(c, ra_map, released, ip, insn->tex_offset[i].Index);
                  insn->tex_offset[i].Index = ra_map[insn->tex_offset[i].Index];
               }
            }
         }

         for (int i = 0; i < opcode_info->num_dst; i++) {
            if (insn->dst[i].File == TGSI_FILE_TEMPORARY) {
               ntr_ra_check(c, ra_map, released, ip, insn->dst[i].Index);
               insn->dst[i].Index = ra_map[insn->dst[i].Index];
            }
         }
         ip++;
      }

      for (int i = 0; i < c->num_temps; i++)
         ntr_ra_check(c, ra_map, released, ip, i);
   }
}

static void
ntr_allocate_regs_unoptimized(struct ntr_compile *c, nir_function_impl *impl)
{
   for (int i = c->first_non_array_temp; i < c->num_temps; i++)
      ureg_DECL_temporary(c->ureg);
}
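
/* Editor's sketch (hypothetical intervals): given the liveness computed above,
 * ntr_allocate_regs() renames virtual temps onto a small set of ureg
 * temporaries, e.g.
 *
 *    TEMP[5] live at ip [2,4], TEMP[9] live at ip [6,8]  ->  both get TEMP[0],
 *
 * since TEMP[5] is released at its last use before TEMP[9] is first declared,
 * and ureg_DECL_temporary() reuses released slots.
 */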
/* TGSI varying declarations have a component usage mask associated (used by
 * r600 and svga).
 */
static uint32_t
ntr_tgsi_var_usage_mask(const struct nir_variable *var)
{
   const struct glsl_type *type_without_array = glsl_without_array(var->type);
   unsigned num_components = glsl_get_vector_elements(type_without_array);
   if (num_components == 0) /* structs */
      num_components = 4;

   return u_bit_consecutive(var->data.location_frac, num_components);
}

static struct ureg_dst
ntr_output_decl(struct ntr_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
{
   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
   int base = nir_intrinsic_base(instr);
   *frac = nir_intrinsic_component(instr);

   struct ureg_dst out;
   if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
      unsigned semantic_name, semantic_index;
      tgsi_get_gl_frag_result_semantic(semantics.location, &semantic_name, &semantic_index);
      semantic_index += semantics.dual_source_blend_index;

      switch (semantics.location) {
      case FRAG_RESULT_DEPTH:
         *frac = 2; /* z write is to the .z channel in TGSI */
         break;
      case FRAG_RESULT_STENCIL:
         *frac = 1;
         break;
      default:
         break;
      }

      out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
   } else {
      unsigned semantic_name, semantic_index;
      tgsi_get_gl_varying_semantic(semantics.location, true, &semantic_name, &semantic_index);

      uint32_t usage_mask = u_bit_consecutive(*frac, instr->num_components);
      uint32_t gs_streams = semantics.gs_streams;
      for (int i = 0; i < 4; i++) {
         if (!(usage_mask & (1 << i)))
            gs_streams &= ~(0x3 << 2 * i);
      }

      /* No driver appears to use array_id of outputs. */
      unsigned array_id = 0;

      /* This bit is lost in the i/o semantics, but it's unused in in-tree
       * drivers.
       */
      bool invariant = semantics.invariant;

      out = ureg_DECL_output_layout(c->ureg, semantic_name, semantic_index, gs_streams, base,
                                    usage_mask, array_id, semantics.num_slots, invariant);
   }

   unsigned write_mask;
   if (nir_intrinsic_has_write_mask(instr))
      write_mask = nir_intrinsic_write_mask(instr);
   else
      write_mask = (1 << instr->num_components) - 1;

   write_mask = write_mask << *frac;
   return ureg_writemask(out, write_mask);
}

static bool
ntr_try_store_in_tgsi_output_with_use(struct ntr_compile *c, struct ureg_dst *dst, nir_src *src)
{
   *dst = ureg_dst_undef();

   if (nir_src_is_if(src))
      return false;

   if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(nir_src_parent_instr(src));
   if (intr->intrinsic != nir_intrinsic_store_output || !nir_src_is_const(intr->src[1])) {
      return false;
   }

   uint32_t frac;
   *dst = ntr_output_decl(c, intr, &frac);
   dst->Index += ntr_src_as_uint(c, intr->src[1]);

   return frac == 0;
}
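
/* Editor's sketch (register numbers hypothetical): for a fragment shader
 * writing gl_FragDepth, ntr_output_decl() forces frac to 2, so the store
 * lands in the .z channel of the TGSI POSITION output, roughly:
 *
 *    DCL OUT[0], POSITION
 *    MOV OUT[0].z, TEMP[0].xxxx
 */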
/* If this reg is used only for storing an output, then in the simple
 * cases we can write directly to the TGSI output instead of having
 * store_output emit its own MOV.
 */
static bool
ntr_try_store_reg_in_tgsi_output(struct ntr_compile *c, struct ureg_dst *dst,
                                 nir_intrinsic_instr *reg_decl)
{
   assert(reg_decl->intrinsic == nir_intrinsic_decl_reg);

   *dst = ureg_dst_undef();

   /* Look for a single use for try_store_in_tgsi_output */
   nir_src *use = NULL;
   nir_foreach_reg_load (src, reg_decl) {
      nir_intrinsic_instr *load = nir_instr_as_intrinsic(nir_src_parent_instr(src));
      nir_foreach_use_including_if (load_use, &load->def) {
         /* We can only have one use */
         if (use != NULL)
            return false;

         use = load_use;
      }
   }

   if (use == NULL)
      return false;

   return ntr_try_store_in_tgsi_output_with_use(c, dst, use);
}

/* If this SSA def is used only for storing an output, then in the simple
 * cases we can write directly to the TGSI output instead of having
 * store_output emit its own MOV.
 */
static bool
ntr_try_store_ssa_in_tgsi_output(struct ntr_compile *c, struct ureg_dst *dst, nir_def *def)
{
   *dst = ureg_dst_undef();

   if (!list_is_singular(&def->uses))
      return false;

   nir_foreach_use_including_if (use, def) {
      return ntr_try_store_in_tgsi_output_with_use(c, dst, use);
   }

   unreachable("We have one use");
}

static void
ntr_setup_inputs(struct ntr_compile *c)
{
   if (c->s->info.stage != MESA_SHADER_FRAGMENT)
      return;

   unsigned num_inputs = 0;
   int num_input_arrays = 0;

   nir_foreach_shader_in_variable (var, c->s) {
      const struct glsl_type *type = var->type;
      unsigned array_len = glsl_count_attribute_slots(type, false);

      num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
   }

   c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);

   nir_foreach_shader_in_variable (var, c->s) {
      const struct glsl_type *type = var->type;
      unsigned array_len = glsl_count_attribute_slots(type, false);

      unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
      unsigned sample_loc;
      struct ureg_src decl;

      if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
         interpolation = tgsi_get_interp_mode(
            var->data.interpolation,
            var->data.location == VARYING_SLOT_COL0 || var->data.location == VARYING_SLOT_COL1);

         if (var->data.location == VARYING_SLOT_POS)
            interpolation = TGSI_INTERPOLATE_LINEAR;
      }

      unsigned semantic_name, semantic_index;
      tgsi_get_gl_varying_semantic(var->data.location, true, &semantic_name, &semantic_index);

      if (var->data.sample) {
         sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
      } else if (var->data.centroid) {
         sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
         c->centroid_inputs |= (BITSET_MASK(array_len) << var->data.driver_location);
      } else {
         sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
      }

      unsigned array_id = 0;
      if (glsl_type_is_array(type))
         array_id = ++num_input_arrays;

      uint32_t usage_mask = ntr_tgsi_var_usage_mask(var);

      decl = ureg_DECL_fs_input_centroid_layout(c->ureg, semantic_name, semantic_index,
                                                interpolation, sample_loc,
                                                var->data.driver_location, usage_mask, array_id,
                                                array_len);

      if (semantic_name == TGSI_SEMANTIC_FACE) {
         struct ureg_dst temp = ntr_temp(c);
         /* tgsi docs say that floating point FACE will be positive for
          * frontface and negative for backface, but realistically
          * GLSL-to-TGSI had been doing MOV_SAT to turn it into 0.0 vs 1.0.
          * Copy that behavior, since some drivers (r300) have been doing a
          * 0.0 vs 1.0 backface (and I don't think anybody has a non-1.0
          * front face).
          */
         temp.Saturate = true;
         ntr_MOV(c, temp, decl);
         decl = ureg_src(temp);
      }

      for (unsigned i = 0; i < array_len; i++) {
         c->input_index_map[var->data.driver_location + i] = decl;
         c->input_index_map[var->data.driver_location + i].Index += i;
      }
   }
}

static int
ntr_sort_by_location(const nir_variable *a, const nir_variable *b)
{
   return a->data.location - b->data.location;
}

/**
 * Workaround for virglrenderer requiring that TGSI FS output color variables
 * are declared in order.  Besides, it's a lot nicer to read the TGSI this way.
 */
static void
ntr_setup_outputs(struct ntr_compile *c)
{
   if (c->s->info.stage != MESA_SHADER_FRAGMENT)
      return;

   nir_sort_variables_with_modes(c->s, ntr_sort_by_location, nir_var_shader_out);

   nir_foreach_shader_out_variable (var, c->s) {
      if (var->data.location == FRAG_RESULT_COLOR)
         ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);

      unsigned semantic_name, semantic_index;
      tgsi_get_gl_frag_result_semantic(var->data.location, &semantic_name, &semantic_index);

      (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index);
   }
}

static enum tgsi_texture_type
tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow)
{
   switch (dim) {
   case GLSL_SAMPLER_DIM_1D:
      if (is_shadow)
         return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D;
      else
         return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
   case GLSL_SAMPLER_DIM_2D:
   case GLSL_SAMPLER_DIM_EXTERNAL:
      if (is_shadow)
         return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D;
      else
         return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
   case GLSL_SAMPLER_DIM_3D:
      return TGSI_TEXTURE_3D;
   case GLSL_SAMPLER_DIM_CUBE:
      if (is_shadow)
         return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE;
      else
         return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
   case GLSL_SAMPLER_DIM_RECT:
      if (is_shadow)
         return TGSI_TEXTURE_SHADOWRECT;
      else
         return TGSI_TEXTURE_RECT;
   case GLSL_SAMPLER_DIM_MS:
      return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
   case GLSL_SAMPLER_DIM_BUF:
      return TGSI_TEXTURE_BUFFER;
   default:
      unreachable("unknown sampler dim");
   }
}

static enum tgsi_return_type
tgsi_return_type_from_base_type(enum glsl_base_type type)
{
   switch (type) {
   case GLSL_TYPE_INT:
      return TGSI_RETURN_TYPE_SINT;
   case GLSL_TYPE_UINT:
      return TGSI_RETURN_TYPE_UINT;
   case GLSL_TYPE_FLOAT:
      return TGSI_RETURN_TYPE_FLOAT;
   default:
      unreachable("unexpected texture type");
   }
}

static void
ntr_setup_uniforms(struct ntr_compile *c)
{
   nir_foreach_uniform_variable (var, c->s) {
      if (glsl_type_is_sampler(glsl_without_array(var->type)) ||
          glsl_type_is_texture(glsl_without_array(var->type))) {
         /* Don't use this size for the check for samplers -- arrays of structs
          * containing samplers should be ignored, and just the separate lowered
          * sampler uniform decl used.
          */
         int size =
            glsl_type_get_sampler_count(var->type) + glsl_type_get_texture_count(var->type);

         const struct glsl_type *stype = glsl_without_array(var->type);
         enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(
            glsl_get_sampler_dim(stype), glsl_sampler_type_is_array(stype),
            glsl_sampler_type_is_shadow(stype));
         enum tgsi_return_type ret_type =
            tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype));
         for (int i = 0; i < size; i++) {
            ureg_DECL_sampler_view(c->ureg, var->data.binding + i, target, ret_type, ret_type,
                                   ret_type, ret_type);
            ureg_DECL_sampler(c->ureg, var->data.binding + i);
         }

         /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0
          * size declaration happens with other UBOs below.
          */
      }
   }

   c->first_ubo = ~0;

   unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0};
   nir_foreach_variable_with_modes (var, c->s, nir_var_mem_ubo) {
      int ubo = var->data.driver_location;
      if (ubo == -1)
         continue;

      if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo))
         c->first_ubo = MIN2(c->first_ubo, ubo);

      unsigned size = glsl_get_explicit_size(var->interface_type, false);
      ubo_sizes[ubo] = size;
   }

   for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) {
      if (ubo_sizes[i])
         ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i);
   }
}

static void
ntr_setup_registers(struct ntr_compile *c)
{
   assert(c->num_temps == 0);

   nir_foreach_reg_decl_safe (nir_reg, nir_shader_get_entrypoint(c->s)) {
      /* Permanently allocate all the array regs at the start. */
      unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg);
      unsigned index = nir_reg->def.index;

      if (num_array_elems != 0) {
         struct ureg_dst decl = ureg_DECL_array_temporary(c->ureg, num_array_elems, true);
         c->reg_temp[index] = decl;
         assert(c->num_temps == decl.Index);
         c->num_temps += num_array_elems;
      }
   }
   c->first_non_array_temp = c->num_temps;

   /* After that, allocate non-array regs in our virtual space that we'll
    * register-allocate before ureg emit.
    */
   nir_foreach_reg_decl_safe (nir_reg, nir_shader_get_entrypoint(c->s)) {
      unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg);
      unsigned num_components = nir_intrinsic_num_components(nir_reg);
      unsigned index = nir_reg->def.index;

      /* We already handled arrays */
      if (num_array_elems == 0) {
         struct ureg_dst decl;
         uint32_t write_mask = BITFIELD_MASK(num_components);

         if (!ntr_try_store_reg_in_tgsi_output(c, &decl, nir_reg)) {
            decl = ureg_writemask(ntr_temp(c), write_mask);
         }
         c->reg_temp[index] = decl;
      }
   }
}

static struct ureg_src
ntr_get_load_const_src(struct ntr_compile *c, nir_load_const_instr *instr)
{
   int num_components = instr->def.num_components;

   float values[4];
   assert(instr->def.bit_size == 32);
   for (int i = 0; i < num_components; i++)
      values[i] = uif(instr->value[i].u32);

   return ureg_DECL_immediate(c->ureg, values, num_components);
}

static struct ureg_src
ntr_reladdr(struct ntr_compile *c, struct ureg_src addr, int addr_index)
{
   assert(addr_index < ARRAY_SIZE(c->addr_reg));

   for (int i = 0; i <= addr_index; i++) {
      if (!c->addr_declared[i]) {
         c->addr_reg[i] = ureg_writemask(ureg_DECL_address(c->ureg), TGSI_WRITEMASK_X);
         c->addr_declared[i] = true;
      }
   }

   ntr_ARL(c, c->addr_reg[addr_index], addr);
   return ureg_scalar(ureg_src(c->addr_reg[addr_index]), 0);
}

/* Forward declare for recursion with indirects */
static struct ureg_src ntr_get_src(struct ntr_compile *c, nir_src src);

static struct ureg_src
ntr_get_chased_src(struct ntr_compile *c, nir_legacy_src *src)
{
   if (src->is_ssa) {
      if (src->ssa->parent_instr->type == nir_instr_type_load_const)
         return ntr_get_load_const_src(c, nir_instr_as_load_const(src->ssa->parent_instr));

      return c->ssa_temp[src->ssa->index];
   } else {
      struct ureg_dst reg_temp = c->reg_temp[src->reg.handle->index];
      reg_temp.Index += src->reg.base_offset;

      if (src->reg.indirect) {
         struct ureg_src offset = ntr_get_src(c, nir_src_for_ssa(src->reg.indirect));
         return ureg_src_indirect(ureg_src(reg_temp), ntr_reladdr(c, offset, 0));
      } else {
         return ureg_src(reg_temp);
      }
   }
}

static struct ureg_src
ntr_get_src(struct ntr_compile *c, nir_src src)
{
   nir_legacy_src chased = nir_legacy_chase_src(&src);
   return ntr_get_chased_src(c, &chased);
}

static struct ureg_src
ntr_get_alu_src(struct ntr_compile *c, nir_alu_instr *instr, int i)
{
   /* We only support 32-bit float modifiers.  The only other modifier type
    * officially supported by TGSI is 32-bit integer negates, but even those are
    * broken on virglrenderer, so skip lowering all integer and f64 float mods.
    *
    * The lower_fabs option requests that we not have native source modifiers
    * for fabs, and instead emit MAX(a, -a) for nir_op_fabs.
    */
   nir_legacy_alu_src src = nir_legacy_chase_alu_src(&instr->src[i], !c->lower_fabs);
   struct ureg_src usrc = ntr_get_chased_src(c, &src.src);

   usrc = ureg_swizzle(usrc, src.swizzle[0], src.swizzle[1], src.swizzle[2], src.swizzle[3]);

   if (src.fabs)
      usrc = ureg_abs(usrc);
   if (src.fneg)
      usrc = ureg_negate(usrc);

   return usrc;
}
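
/* Editor's sketch: because nir_legacy folds float modifiers into the source,
 * something like "fmul dst, fneg(a), fabs(b)" can translate to a single TGSI
 * instruction (operands hypothetical):
 *
 *    MUL TEMP[0], -TEMP[1], |TEMP[2]|
 *
 * When lower_fabs is set, the fabs case is instead emitted as MAX(b, -b) in
 * ntr_emit_alu() below.
 */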
/* Reswizzles a source so that the unset channels in the write mask still refer
 * to one of the channels present in the write mask.
 */
static struct ureg_src
ntr_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
{
   assert(write_mask);
   int first_chan = ffs(write_mask) - 1;
   return ureg_swizzle(src, (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan,
                       (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan,
                       (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan,
                       (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
}

static struct ureg_dst
ntr_get_ssa_def_decl(struct ntr_compile *c, nir_def *ssa)
{
   uint32_t writemask;
   /* Fix writemask for nir_intrinsic_load_ubo_vec4 according to uses. */
   if (ssa->parent_instr->type == nir_instr_type_intrinsic &&
       nir_instr_as_intrinsic(ssa->parent_instr)->intrinsic == nir_intrinsic_load_ubo_vec4)
      writemask = nir_def_components_read(ssa);
   else
      writemask = BITSET_MASK(ssa->num_components);

   struct ureg_dst dst;
   if (!ntr_try_store_ssa_in_tgsi_output(c, &dst, ssa))
      dst = ntr_temp(c);

   c->ssa_temp[ssa->index] = ntr_swizzle_for_write_mask(ureg_src(dst), writemask);

   return ureg_writemask(dst, writemask);
}

static struct ureg_dst
ntr_get_chased_dest_decl(struct ntr_compile *c, nir_legacy_dest *dest)
{
   if (dest->is_ssa)
      return ntr_get_ssa_def_decl(c, dest->ssa);
   else
      return c->reg_temp[dest->reg.handle->index];
}

static struct ureg_dst
ntr_get_chased_dest(struct ntr_compile *c, nir_legacy_dest *dest)
{
   struct ureg_dst dst = ntr_get_chased_dest_decl(c, dest);

   if (!dest->is_ssa) {
      dst.Index += dest->reg.base_offset;

      if (dest->reg.indirect) {
         struct ureg_src offset = ntr_get_src(c, nir_src_for_ssa(dest->reg.indirect));
         dst = ureg_dst_indirect(dst, ntr_reladdr(c, offset, 0));
      }
   }

   return dst;
}

static struct ureg_dst
ntr_get_dest(struct ntr_compile *c, nir_def *def)
{
   nir_legacy_dest chased = nir_legacy_chase_dest(def);
   return ntr_get_chased_dest(c, &chased);
}

static struct ureg_dst
ntr_get_alu_dest(struct ntr_compile *c, nir_def *def)
{
   nir_legacy_alu_dest chased = nir_legacy_chase_alu_dest(def);
   struct ureg_dst dst = ntr_get_chased_dest(c, &chased.dest);

   if (chased.fsat)
      dst.Saturate = true;

   /* Only registers get write masks */
   if (chased.dest.is_ssa)
      return dst;

   return ureg_writemask(dst, chased.write_mask);
}

/* For an SSA dest being populated by a constant src, replace the storage with
 * a copy of the ureg_src.
 */
static void
ntr_store_def(struct ntr_compile *c, nir_def *def, struct ureg_src src)
{
   if (!src.Indirect && !src.DimIndirect) {
      switch (src.File) {
      case TGSI_FILE_IMMEDIATE:
      case TGSI_FILE_INPUT:
      case TGSI_FILE_CONSTANT:
      case TGSI_FILE_SYSTEM_VALUE:
         c->ssa_temp[def->index] = src;
         return;
      }
   }

   ntr_MOV(c, ntr_get_ssa_def_decl(c, def), src);
}

static void
ntr_store(struct ntr_compile *c, nir_def *def, struct ureg_src src)
{
   nir_legacy_dest chased = nir_legacy_chase_dest(def);

   if (chased.is_ssa)
      ntr_store_def(c, chased.ssa, src);
   else {
      struct ureg_dst dst = ntr_get_chased_dest(c, &chased);
      ntr_MOV(c, dst, src);
   }
}
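
/* Editor's sketch (hypothetical indices): thanks to ntr_store_def() above, a
 * NIR sequence like
 *
 *    ssa_1 = load_const (0.5)
 *    ssa_2 = fmul ssa_0, ssa_1
 *
 * emits no MOV for ssa_1: c->ssa_temp[1] simply aliases IMM[0], and the MUL
 * reads it directly, as in "MUL TEMP[0], TEMP[1], IMM[0].xxxx".
 */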
static void
ntr_emit_scalar(struct ntr_compile *c, unsigned tgsi_op, struct ureg_dst dst,
                struct ureg_src src0, struct ureg_src src1)
{
   unsigned i;

   /* POW is the only 2-operand scalar op. */
   if (tgsi_op != TGSI_OPCODE_POW)
      src1 = src0;

   for (i = 0; i < 4; i++) {
      if (dst.WriteMask & (1 << i)) {
         ntr_insn(c, tgsi_op, ureg_writemask(dst, 1 << i), ureg_scalar(src0, i),
                  ureg_scalar(src1, i), ureg_src_undef(), ureg_src_undef());
      }
   }
}

static void
ntr_emit_alu(struct ntr_compile *c, nir_alu_instr *instr)
{
   struct ureg_src src[4];
   struct ureg_dst dst;
   unsigned i;
   int num_srcs = nir_op_infos[instr->op].num_inputs;

   /* Don't try to translate folded fsat since their source won't be valid */
   if (instr->op == nir_op_fsat && nir_legacy_fsat_folds(instr))
      return;

   c->precise = instr->exact;

   assert(num_srcs <= ARRAY_SIZE(src));
   for (i = 0; i < num_srcs; i++)
      src[i] = ntr_get_alu_src(c, instr, i);
   for (; i < ARRAY_SIZE(src); i++)
      src[i] = ureg_src_undef();

   dst = ntr_get_alu_dest(c, &instr->def);

   static enum tgsi_opcode op_map[] = {
      [nir_op_mov] = TGSI_OPCODE_MOV,

      [nir_op_fdot2_replicated] = TGSI_OPCODE_DP2,
      [nir_op_fdot3_replicated] = TGSI_OPCODE_DP3,
      [nir_op_fdot4_replicated] = TGSI_OPCODE_DP4,
      [nir_op_ffloor] = TGSI_OPCODE_FLR,
      [nir_op_ffract] = TGSI_OPCODE_FRC,
      [nir_op_fceil] = TGSI_OPCODE_CEIL,
      [nir_op_fround_even] = TGSI_OPCODE_ROUND,

      [nir_op_slt] = TGSI_OPCODE_SLT,
      [nir_op_sge] = TGSI_OPCODE_SGE,
      [nir_op_seq] = TGSI_OPCODE_SEQ,
      [nir_op_sne] = TGSI_OPCODE_SNE,

      [nir_op_ftrunc] = TGSI_OPCODE_TRUNC,
      [nir_op_fadd] = TGSI_OPCODE_ADD,
      [nir_op_fmul] = TGSI_OPCODE_MUL,

      [nir_op_fmin] = TGSI_OPCODE_MIN,
      [nir_op_fmax] = TGSI_OPCODE_MAX,
      [nir_op_ffma] = TGSI_OPCODE_MAD,
   };

   if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op] > 0) {
      /* The normal path for NIR to TGSI ALU op translation */
      ntr_insn(c, op_map[instr->op], dst, src[0], src[1], src[2], src[3]);
   } else {
      /* Special cases for NIR to TGSI ALU op translation. */

      /* TODO: Use something like the ntr_store() path for the MOV calls so we
       * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
       */

      switch (instr->op) {
      case nir_op_fabs:
         /* Try to eliminate */
         if (!c->lower_fabs && nir_legacy_float_mod_folds(instr))
            break;

         if (c->lower_fabs)
            ntr_MAX(c, dst, src[0], ureg_negate(src[0]));
         else
            ntr_MOV(c, dst, ureg_abs(src[0]));
         break;

      case nir_op_fsat:
         ntr_MOV(c, ureg_saturate(dst), src[0]);
         break;

      case nir_op_fneg:
         /* Try to eliminate */
         if (nir_legacy_float_mod_folds(instr))
            break;

         ntr_MOV(c, dst, ureg_negate(src[0]));
         break;

      /* NOTE: TGSI 32-bit math ops have the old "one source channel
       * replicated to all dst channels" behavior, while 64 is normal mapping
       * of src channels to dst.
       */
      case nir_op_frcp:
         ntr_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], ureg_src_undef());
         break;

      case nir_op_frsq:
         ntr_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], ureg_src_undef());
         break;

      case nir_op_fexp2:
         ntr_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], ureg_src_undef());
         break;

      case nir_op_flog2:
         ntr_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], ureg_src_undef());
         break;

      case nir_op_fsin:
         ntr_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], ureg_src_undef());
         break;

      case nir_op_fcos:
         ntr_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], ureg_src_undef());
         break;

      case nir_op_fsub:
         ntr_ADD(c, dst, src[0], ureg_negate(src[1]));
         break;

      case nir_op_fmod:
         unreachable("should be handled by .lower_fmod = true");
         break;

      case nir_op_fpow:
         ntr_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
         break;

      case nir_op_flrp:
         ntr_LRP(c, dst, src[2], src[1], src[0]);
         break;

      case nir_op_fcsel:
         /* Implement this as CMP(-abs(src0), src1, src2). */
         ntr_CMP(c, dst, ureg_negate(ureg_abs(src[0])), src[1], src[2]);
         break;

      case nir_op_fcsel_gt:
         ntr_CMP(c, dst, ureg_negate(src[0]), src[1], src[2]);
         break;

      case nir_op_fcsel_ge:
         /* Implement this as if !(src0 < 0.0) was identical to src0 >= 0.0. */
         ntr_CMP(c, dst, src[0], src[2], src[1]);
         break;

      case nir_op_vec4:
      case nir_op_vec3:
      case nir_op_vec2:
         unreachable("covered by nir_lower_vec_to_movs()");

      default:
         fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
         unreachable("Unknown NIR opcode");
      }
   }

   c->precise = false;
}
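
/* Editor's sketch: ntr_emit_scalar() above unrolls the replicated scalar ops
 * per written channel, so an frcp on a .xy destination becomes roughly:
 *
 *    RCP TEMP[0].x, TEMP[1].xxxx
 *    RCP TEMP[0].y, TEMP[1].yyyy
 */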
static struct ureg_src
ntr_ureg_src_indirect(struct ntr_compile *c, struct ureg_src usrc, nir_src src, int addr_reg)
{
   if (nir_src_is_const(src)) {
      usrc.Index += ntr_src_as_uint(c, src);
      return usrc;
   } else {
      return ureg_src_indirect(usrc, ntr_reladdr(c, ntr_get_src(c, src), addr_reg));
   }
}

static struct ureg_dst
ntr_ureg_dst_indirect(struct ntr_compile *c, struct ureg_dst dst, nir_src src)
{
   if (nir_src_is_const(src)) {
      dst.Index += ntr_src_as_uint(c, src);
      return dst;
   } else {
      return ureg_dst_indirect(dst, ntr_reladdr(c, ntr_get_src(c, src), 0));
   }
}

static struct ureg_dst
ntr_ureg_dst_dimension_indirect(struct ntr_compile *c, struct ureg_dst udst, nir_src src)
{
   if (nir_src_is_const(src)) {
      return ureg_dst_dimension(udst, ntr_src_as_uint(c, src));
   } else {
      return ureg_dst_dimension_indirect(udst, ntr_reladdr(c, ntr_get_src(c, src), 1), 0);
   }
}

/* Some load operations in NIR will have a fractional offset that we need to
 * swizzle down before storing to the result register.
 */
static struct ureg_src
ntr_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
{
   return ureg_swizzle(src, frac, frac + MIN2(num_components - 1, 1),
                       frac + MIN2(num_components - 1, 2), frac + MIN2(num_components - 1, 3));
}
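
/* Editor's note: e.g. a 2-component load at component 1 (frac = 1) yields
 * ureg_swizzle(src, 1, 2, 2, 2), i.e. .yzzz, so the .xy channels of the
 * destination see channels .yz of the source.
 */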
static void
ntr_emit_load_ubo(struct ntr_compile *c, nir_intrinsic_instr *instr)
{
   struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);

   struct ureg_dst addr_temp = ureg_dst_undef();

   if (nir_src_is_const(instr->src[0])) {
      src = ureg_src_dimension(src, ntr_src_as_uint(c, instr->src[0]));
   } else {
      /* virglrenderer requires that indirect UBO references have the UBO
       * array's base index in the Index field, not added to the indirect
       * address.
       *
       * Many nir intrinsics have a base address const value for the start of
       * their array indirection, but load_ubo doesn't.  We fake it by
       * subtracting it off here.
       */
      addr_temp = ntr_temp(c);
      ntr_UADD(c, addr_temp, ntr_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
      src = ureg_src_dimension_indirect(src, ntr_reladdr(c, ureg_src(addr_temp), 1),
                                        c->first_ubo);
   }

   /* !pipe_caps.load_constbuf: Just emit it as a vec4 reference to the const
    * file.
    */
   src.Index = nir_intrinsic_base(instr);

   if (nir_src_is_const(instr->src[1])) {
      src.Index += ntr_src_as_uint(c, instr->src[1]);
   } else {
      src = ureg_src_indirect(src, ntr_reladdr(c, ntr_get_src(c, instr->src[1]), 0));
   }

   int start_component = nir_intrinsic_component(instr);

   src = ntr_shift_by_frac(src, start_component, instr->num_components);

   ntr_store(c, &instr->def, src);
}

static void
ntr_emit_load_input(struct ntr_compile *c, nir_intrinsic_instr *instr)
{
   uint32_t frac = nir_intrinsic_component(instr);
   uint32_t num_components = instr->num_components;
   unsigned base = nir_intrinsic_base(instr);
   struct ureg_src input;
   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);

   if (c->s->info.stage == MESA_SHADER_VERTEX) {
      input = ureg_DECL_vs_input(c->ureg, base);
      for (int i = 1; i < semantics.num_slots; i++)
         ureg_DECL_vs_input(c->ureg, base + i);
   } else {
      input = c->input_index_map[base];
   }

   input = ntr_shift_by_frac(input, frac, num_components);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_input:
      input = ntr_ureg_src_indirect(c, input, instr->src[0], 0);
      ntr_store(c, &instr->def, input);
      break;

   case nir_intrinsic_load_interpolated_input: {
      input = ntr_ureg_src_indirect(c, input, instr->src[1], 0);

      nir_intrinsic_instr *bary_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);

      switch (bary_instr->intrinsic) {
      case nir_intrinsic_load_barycentric_pixel:
      case nir_intrinsic_load_barycentric_sample:
         /* For these, we know that the barycentric load matches the
          * interpolation on the input declaration, so we can use it directly.
          */
         ntr_store(c, &instr->def, input);
         break;

      case nir_intrinsic_load_barycentric_centroid:
         /* If the input was declared centroid, then there's no need to
          * emit the extra TGSI interp instruction, we can just read the
          * input.
          */
         if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
            ntr_store(c, &instr->def, input);
         } else {
            ntr_INTERP_CENTROID(c, ntr_get_dest(c, &instr->def), input);
         }
         break;

      case nir_intrinsic_load_barycentric_at_sample:
         /* We stored the sample in the fake "bary" dest. */
         ntr_INTERP_SAMPLE(c, ntr_get_dest(c, &instr->def), input,
                           ntr_get_src(c, instr->src[0]));
         break;

      case nir_intrinsic_load_barycentric_at_offset:
         /* We stored the offset in the fake "bary" dest. */
         ntr_INTERP_OFFSET(c, ntr_get_dest(c, &instr->def), input,
                           ntr_get_src(c, instr->src[0]));
         break;

      default:
         unreachable("bad barycentric interp intrinsic\n");
      }
      break;
   }

   default:
      unreachable("bad load input intrinsic\n");
   }
}
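
/* Editor's sketch (buffer/slot numbers hypothetical): a constant-offset UBO
 * load typically becomes a plain 2D constant-file reference such as
 *
 *    MOV TEMP[0], CONST[2][5]
 *
 * (buffer 2, vec4 slot 5); a non-constant buffer index instead goes through
 * ARL into ADDR[1] with c->first_ubo folded into the Index, as above.
 */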
static void
ntr_emit_store_output(struct ntr_compile *c, nir_intrinsic_instr *instr)
{
   struct ureg_src src = ntr_get_src(c, instr->src[0]);

   if (src.File == TGSI_FILE_OUTPUT) {
      /* If our src is the output file, that's an indication that we were able
       * to emit the output stores in the generating instructions and we have
       * nothing to do here.
       */
      return;
   }

   uint32_t frac;
   struct ureg_dst out = ntr_output_decl(c, instr, &frac);

   if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
      out = ntr_ureg_dst_indirect(c, out, instr->src[2]);
      out = ntr_ureg_dst_dimension_indirect(c, out, instr->src[1]);
   } else {
      out = ntr_ureg_dst_indirect(c, out, instr->src[1]);
   }

   uint8_t swizzle[4] = {0, 0, 0, 0};
   for (int i = frac; i < 4; i++) {
      if (out.WriteMask & (1 << i))
         swizzle[i] = i - frac;
   }

   src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);

   ntr_MOV(c, out, src);
}

static void
ntr_emit_load_output(struct ntr_compile *c, nir_intrinsic_instr *instr)
{
   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);

   /* ntr_try_store_in_tgsi_output() optimization is not valid if normal
    * load_output is present.
    */
   assert(c->s->info.stage != MESA_SHADER_VERTEX &&
          (c->s->info.stage != MESA_SHADER_FRAGMENT || semantics.fb_fetch_output));

   uint32_t frac;
   struct ureg_dst out = ntr_output_decl(c, instr, &frac);

   if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) {
      out = ntr_ureg_dst_indirect(c, out, instr->src[1]);
      out = ntr_ureg_dst_dimension_indirect(c, out, instr->src[0]);
   } else {
      out = ntr_ureg_dst_indirect(c, out, instr->src[0]);
   }

   struct ureg_dst dst = ntr_get_dest(c, &instr->def);
   struct ureg_src out_src = ureg_src(out);

   /* Don't swizzle in unavailable channels of the output for the
    * writemasked-out components.  Avoids compile failures in virglrenderer
    * with TESS_LEVEL_INNER.
    */
   int fill_channel = ffs(dst.WriteMask) - 1;
   uint8_t swizzles[4] = {0, 1, 2, 3};
   for (int i = 0; i < 4; i++)
      if (!(dst.WriteMask & (1 << i)))
         swizzles[i] = fill_channel;
   out_src = ureg_swizzle(out_src, swizzles[0], swizzles[1], swizzles[2], swizzles[3]);

   if (semantics.fb_fetch_output)
      ntr_FBFETCH(c, dst, out_src);
   else
      ntr_MOV(c, dst, out_src);
}

static void
ntr_emit_load_sysval(struct ntr_compile *c, nir_intrinsic_instr *instr)
{
   gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
   enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
   struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);

   /* virglrenderer doesn't like references to channels of the sysval that
    * aren't defined, even if they aren't really read.  (GLSL compile fails on
    * gl_NumWorkGroups.w, for example).
    */
   uint32_t write_mask = BITSET_MASK(instr->def.num_components);
   sv = ntr_swizzle_for_write_mask(sv, write_mask);

   /* TGSI and NIR define these intrinsics as always loading ints, but they can
    * still appear on hardware with non-native-integers fragment shaders using
    * the draw path (i915g).  In that case, having called nir_lower_int_to_float
    * means that we actually want floats instead.
    */
   switch (instr->intrinsic) {
   case nir_intrinsic_load_vertex_id:
   case nir_intrinsic_load_instance_id:
      ntr_U2F(c, ntr_get_dest(c, &instr->def), sv);
      return;

   default:
      break;
   }

   ntr_store(c, &instr->def, sv);
}

static void
ntr_emit_intrinsic(struct ntr_compile *c, nir_intrinsic_instr *instr)
{
   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
      ntr_emit_load_ubo(c, instr);
      break;

   /* Vertex */
   case nir_intrinsic_load_draw_id:
   case nir_intrinsic_load_invocation_id:
   case nir_intrinsic_load_frag_coord:
   case nir_intrinsic_load_point_coord:
   case nir_intrinsic_load_front_face:
      ntr_emit_load_sysval(c, instr);
      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
   case nir_intrinsic_load_interpolated_input:
      ntr_emit_load_input(c, instr);
      break;

   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output:
      ntr_emit_store_output(c, instr);
      break;

   case nir_intrinsic_load_output:
   case nir_intrinsic_load_per_vertex_output:
      ntr_emit_load_output(c, instr);
      break;

   case nir_intrinsic_terminate:
      ntr_KILL(c);
      break;

   case nir_intrinsic_terminate_if: {
      struct ureg_src cond = ureg_scalar(ntr_get_src(c, instr->src[0]), 0);
      /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
      ntr_KILL_IF(c, ureg_negate(cond));
      break;
   }

   /* In TGSI we don't actually generate the barycentric coords, and emit
    * interp intrinsics later.  However, we do need to store the
    * load_barycentric_at_* argument so that we can use it at that point.
    */
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_centroid:
   case nir_intrinsic_load_barycentric_sample:
      break;

   case nir_intrinsic_load_barycentric_at_sample:
   case nir_intrinsic_load_barycentric_at_offset:
      ntr_store(c, &instr->def, ntr_get_src(c, instr->src[0]));
      break;

   case nir_intrinsic_ddx:
   case nir_intrinsic_ddx_coarse:
      ntr_DDX(c, ntr_get_dest(c, &instr->def), ntr_get_src(c, instr->src[0]));
      return;

   case nir_intrinsic_ddy:
   case nir_intrinsic_ddy_coarse:
      ntr_DDY(c, ntr_get_dest(c, &instr->def), ntr_get_src(c, instr->src[0]));
      return;

   case nir_intrinsic_decl_reg:
   case nir_intrinsic_load_reg:
   case nir_intrinsic_load_reg_indirect:
   case nir_intrinsic_store_reg:
   case nir_intrinsic_store_reg_indirect:
      /* fully consumed */
      break;

   default:
      fprintf(stderr, "Unknown intrinsic: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
      break;
   }
}
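
/* Editor's note: for nir_intrinsic_terminate_if above, booleans were already
 * lowered to 0.0/1.0, and TGSI KILL_IF kills on "src < 0".  Negating the
 * condition makes a true (1.0) condition -1.0 (kills) and a false one -0.0
 * (does not kill), e.g. "KILL_IF -TEMP[0].xxxx".
 */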
struct ntr_tex_operand_state {
   struct ureg_src srcs[4];
   unsigned i;
};

static void
ntr_push_tex_arg(struct ntr_compile *c, nir_tex_instr *instr, nir_tex_src_type tex_src_type,
                 struct ntr_tex_operand_state *s)
{
   int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
   if (tex_src < 0)
      return;

   nir_src *src = &instr->src[tex_src].src;
   s->srcs[s->i++] = ntr_get_src(c, *src);
}

static void
ntr_emit_texture(struct ntr_compile *c, nir_tex_instr *instr)
{
   struct ureg_dst dst = ntr_get_dest(c, &instr->def);
   enum tgsi_texture_type target =
      tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow);
   unsigned tex_opcode;

   int tex_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_texture_handle);
   int sampler_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_handle);

   struct ureg_src sampler;
   if (tex_handle_src >= 0 && sampler_handle_src >= 0) {
      /* It seems we can't get separate tex/sampler on GL, just use one of the handles */
      sampler = ntr_get_src(c, instr->src[tex_handle_src].src);
      assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
   } else {
      assert(tex_handle_src == -1 && sampler_handle_src == -1);
      sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
      int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
      if (sampler_src >= 0) {
         struct ureg_src reladdr = ntr_get_src(c, instr->src[sampler_src].src);
         sampler = ureg_src_indirect(sampler, ntr_reladdr(c, reladdr, 2));
      }
   }

   switch (instr->op) {
   case nir_texop_tex:
      if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) >
          MAX2(instr->coord_components, 2) + instr->is_shadow)
         tex_opcode = TGSI_OPCODE_TXP;
      else
         tex_opcode = TGSI_OPCODE_TEX;
      break;
   case nir_texop_txl:
      tex_opcode = TGSI_OPCODE_TXL;
      break;
   case nir_texop_txb:
      tex_opcode = TGSI_OPCODE_TXB;
      break;
   case nir_texop_txd:
      tex_opcode = TGSI_OPCODE_TXD;
      break;
   case nir_texop_txs:
      tex_opcode = TGSI_OPCODE_TXQ;
      break;
   case nir_texop_tg4:
      tex_opcode = TGSI_OPCODE_TG4;
      break;
   case nir_texop_query_levels:
      tex_opcode = TGSI_OPCODE_TXQ;
      break;
   case nir_texop_lod:
      tex_opcode = TGSI_OPCODE_LODQ;
      break;
   case nir_texop_texture_samples:
      tex_opcode = TGSI_OPCODE_TXQS;
      break;
   default:
      unreachable("unsupported tex op");
   }

   struct ntr_tex_operand_state s = {.i = 0};
   ntr_push_tex_arg(c, instr, nir_tex_src_backend1, &s);
   ntr_push_tex_arg(c, instr, nir_tex_src_backend2, &s);

   /* non-coord arg for TXQ */
   if (tex_opcode == TGSI_OPCODE_TXQ) {
      ntr_push_tex_arg(c, instr, nir_tex_src_lod, &s);
      /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's
       * scalar
       */
      s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0);
   }

   if (s.i > 1) {
      if (tex_opcode == TGSI_OPCODE_TEX)
         tex_opcode = TGSI_OPCODE_TEX2;
      if (tex_opcode == TGSI_OPCODE_TXB)
         tex_opcode = TGSI_OPCODE_TXB2;
      if (tex_opcode == TGSI_OPCODE_TXL)
         tex_opcode = TGSI_OPCODE_TXL2;
   }

   if (instr->op == nir_texop_txd) {
      /* Derivs appear in their own src args */
      int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
      int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
      s.srcs[s.i++] = ntr_get_src(c, instr->src[ddx].src);
      s.srcs[s.i++] = ntr_get_src(c, instr->src[ddy].src);
   }

   if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
      if (c->screen->caps.tgsi_tg4_component_in_swizzle) {
         sampler = ureg_scalar(sampler, instr->component);
         s.srcs[s.i++] = ureg_src_undef();
      } else {
         s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
      }
   }

   s.srcs[s.i++] = sampler;

   enum tgsi_return_type tex_type;
   switch (instr->dest_type) {
   case nir_type_float32:
      tex_type = TGSI_RETURN_TYPE_FLOAT;
      break;
   case nir_type_int32:
      tex_type = TGSI_RETURN_TYPE_SINT;
      break;
   case nir_type_uint32:
      tex_type = TGSI_RETURN_TYPE_UINT;
      break;
   default:
      unreachable("unknown texture type");
   }

   struct ureg_dst tex_dst;
   if (instr->op == nir_texop_query_levels)
      tex_dst = ureg_writemask(ntr_temp(c), TGSI_WRITEMASK_W);
   else
      tex_dst = dst;

   while (s.i < 4)
      s.srcs[s.i++] = ureg_src_undef();

   struct ntr_insn *insn =
      ntr_insn(c, tex_opcode, tex_dst, s.srcs[0], s.srcs[1], s.srcs[2], s.srcs[3]);
   insn->tex_target = target;
   insn->tex_return_type = tex_type;
   insn->is_tex = true;

   int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
   if (tex_offset_src >= 0) {
      struct ureg_src offset = ntr_get_src(c, instr->src[tex_offset_src].src);

      insn->tex_offset[0].File = offset.File;
      insn->tex_offset[0].Index = offset.Index;
      insn->tex_offset[0].SwizzleX = offset.SwizzleX;
      insn->tex_offset[0].SwizzleY = offset.SwizzleY;
      insn->tex_offset[0].SwizzleZ = offset.SwizzleZ;
      insn->tex_offset[0].Padding = 0;
   }

   if (nir_tex_instr_has_explicit_tg4_offsets(instr)) {
      for (uint8_t i = 0; i < 4; ++i) {
         struct ureg_src imm =
            ureg_imm2i(c->ureg, instr->tg4_offsets[i][0], instr->tg4_offsets[i][1]);
         insn->tex_offset[i].File = imm.File;
         insn->tex_offset[i].Index = imm.Index;
         insn->tex_offset[i].SwizzleX = imm.SwizzleX;
         insn->tex_offset[i].SwizzleY = imm.SwizzleY;
         insn->tex_offset[i].SwizzleZ = imm.SwizzleZ;
      }
   }

   if (instr->op == nir_texop_query_levels)
      ntr_MOV(c, dst, ureg_scalar(ureg_src(tex_dst), 3));
}
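
/* Editor's sketch: when the packed tex arguments spill past one vec4 (s.i > 1
 * above), the second backend vec4 becomes the extra operand of TEX2/TXB2/TXL2.
 * E.g. a shadow cube-array sample might look roughly like
 *
 *    TEX2 TEMP[0], TEMP[1], TEMP[2], SAMP[0], SHADOWCUBE_ARRAY
 *
 * with the comparator carried in TEMP[2].x (exact dump syntax varies).
 */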
static void
ntr_emit_jump(struct ntr_compile *c, nir_jump_instr *jump)
{
   switch (jump->type) {
   case nir_jump_break:
      ntr_BRK(c);
      break;

   case nir_jump_continue:
      ntr_CONT(c);
      break;

   default:
      fprintf(stderr, "Unknown jump instruction: ");
      nir_print_instr(&jump->instr, stderr);
      fprintf(stderr, "\n");
      abort();
   }
}

static void
ntr_emit_ssa_undef(struct ntr_compile *c, nir_undef_instr *instr)
{
   /* Nothing to do but make sure that we have some storage to deref. */
   (void)ntr_get_ssa_def_decl(c, &instr->def);
}

static void
ntr_emit_instr(struct ntr_compile *c, nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_deref:
      /* ignored, will be walked by nir_intrinsic_image_*_deref. */
      break;

   case nir_instr_type_alu:
      ntr_emit_alu(c, nir_instr_as_alu(instr));
      break;

   case nir_instr_type_intrinsic:
      ntr_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
      break;

   case nir_instr_type_load_const:
      /* Nothing to do here, as load consts are done directly from
       * ntr_get_src() (since many constant NIR srcs will often get folded
       * directly into a register file index instead of as a TGSI src).
       */
      break;

   case nir_instr_type_tex:
      ntr_emit_texture(c, nir_instr_as_tex(instr));
      break;

   case nir_instr_type_jump:
      ntr_emit_jump(c, nir_instr_as_jump(instr));
      break;

   case nir_instr_type_undef:
      ntr_emit_ssa_undef(c, nir_instr_as_undef(instr));
      break;

   default:
      fprintf(stderr, "Unknown NIR instr type: ");
      nir_print_instr(instr, stderr);
      fprintf(stderr, "\n");
      abort();
   }
}

static void
ntr_emit_if(struct ntr_compile *c, nir_if *if_stmt)
{
   ntr_IF(c, c->if_cond);

   ntr_emit_cf_list(c, &if_stmt->then_list);

   if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
      ntr_ELSE(c);
      ntr_emit_cf_list(c, &if_stmt->else_list);
   }

   ntr_ENDIF(c);
}

static void
ntr_emit_loop(struct ntr_compile *c, nir_loop *loop)
{
   assert(!nir_loop_has_continue_construct(loop));
   ntr_BGNLOOP(c);
   ntr_emit_cf_list(c, &loop->body);
   ntr_ENDLOOP(c);
}

static void
ntr_emit_block(struct ntr_compile *c, nir_block *block)
{
   struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
   c->cur_block = ntr_block;

   nir_foreach_instr (instr, block) {
      ntr_emit_instr(c, instr);

      /* Sanity check that we didn't accidentally ureg_OPCODE() instead of ntr_OPCODE(). */
      if (ureg_get_instruction_number(c->ureg) != 0) {
         fprintf(stderr, "Emitted ureg insn during: ");
         nir_print_instr(instr, stderr);
         fprintf(stderr, "\n");
         unreachable("emitted ureg insn");
      }
   }

   /* Set up the if condition for ntr_emit_if(), which we have to do before
    * freeing up the temps (the "if" is treated as inside the block for liveness
    * purposes, despite not being an instruction)
    *
    * Note that, while IF and UIF are supposed to look at only .x, virglrenderer
    * looks at all of .xyzw.  No harm in working around the bug.
    */
   nir_if *nif = nir_block_get_following_if(block);
   if (nif)
      c->if_cond = ureg_scalar(ntr_get_src(c, nif->condition), TGSI_SWIZZLE_X);
}

static void
ntr_emit_cf_list(struct ntr_compile *c, struct exec_list *list)
{
   foreach_list_typed (nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block:
         ntr_emit_block(c, nir_cf_node_as_block(node));
         break;

      case nir_cf_node_if:
         ntr_emit_if(c, nir_cf_node_as_if(node));
         break;

      case nir_cf_node_loop:
         ntr_emit_loop(c, nir_cf_node_as_loop(node));
         break;

      default:
         unreachable("unknown CF type");
      }
   }
}

static void
ntr_emit_block_ureg(struct ntr_compile *c, struct nir_block *block)
{
   struct ntr_block *ntr_block = ntr_block_from_nir(c, block);

   /* Emit the ntr insns to tgsi_ureg. */
   util_dynarray_foreach (&ntr_block->insns, struct ntr_insn, insn) {
      const struct tgsi_opcode_info *opcode_info = tgsi_get_opcode_info(insn->opcode);

      switch (insn->opcode) {
      case TGSI_OPCODE_IF:
         ureg_IF(c->ureg, insn->src[0], &c->cf_label);
         break;

      case TGSI_OPCODE_ELSE:
         ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
         ureg_ELSE(c->ureg, &c->cf_label);
         c->current_if_else = c->cf_label;
         break;

      case TGSI_OPCODE_ENDIF:
         ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
         ureg_ENDIF(c->ureg);
         break;

      case TGSI_OPCODE_BGNLOOP:
         /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
          * does reference BGNLOOP's.  Follow the former behavior unless something comes up
          * with a need.
          */
         ureg_BGNLOOP(c->ureg, &c->cf_label);
         break;

      case TGSI_OPCODE_ENDLOOP:
         ureg_ENDLOOP(c->ureg, &c->cf_label);
         break;

      default:
         if (insn->is_tex) {
            int num_offsets = 0;
            for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
               if (insn->tex_offset[i].File != TGSI_FILE_NULL)
                  num_offsets = i + 1;
            }
            ureg_tex_insn(c->ureg, insn->opcode, insn->dst, opcode_info->num_dst,
                          insn->tex_target, insn->tex_return_type, insn->tex_offset,
                          num_offsets, insn->src, opcode_info->num_src);
         } else {
            ureg_insn(c->ureg, insn->opcode, insn->dst, opcode_info->num_dst, insn->src,
                      opcode_info->num_src, insn->precise);
         }
      }
   }
}
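
/* Editor's sketch (instruction numbers hypothetical): the IF/ELSE handling
 * above relies on ureg's label fixups, producing TGSI like
 *
 *    0: IF TEMP[0].xxxx :3
 *    ...
 *    3: ELSE :5
 *    ...
 *    5: ENDIF
 *
 * where each ureg_fixup_label() call patches the previously recorded label to
 * the current instruction number.
 */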
static void
ntr_emit_if_ureg(struct ntr_compile *c, nir_if *if_stmt)
{
   /* Note: the last block emitted our IF opcode. */

   int if_stack = c->current_if_else;
   c->current_if_else = c->cf_label;

   /* Either the then or else block includes the ENDIF, which will fix up the
    * IF(/ELSE)'s label for jumping
    */
   ntr_emit_cf_list_ureg(c, &if_stmt->then_list);
   ntr_emit_cf_list_ureg(c, &if_stmt->else_list);

   c->current_if_else = if_stack;
}

static void
ntr_emit_cf_list_ureg(struct ntr_compile *c, struct exec_list *list)
{
   foreach_list_typed (nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block:
         ntr_emit_block_ureg(c, nir_cf_node_as_block(node));
         break;

      case nir_cf_node_if:
         ntr_emit_if_ureg(c, nir_cf_node_as_if(node));
         break;

      case nir_cf_node_loop:
         /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
          * does reference BGNLOOP's.  Follow the former behavior unless something comes up
          * with a need.
          */
         ntr_emit_cf_list_ureg(c, &nir_cf_node_as_loop(node)->body);
         break;

      default:
         unreachable("unknown CF type");
      }
   }
}

static void
ntr_emit_impl(struct ntr_compile *c, nir_function_impl *impl)
{
   c->impl = impl;

   c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc);
   c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc);

   /* Set up the struct ntr_blocks to put insns in */
   c->blocks = _mesa_pointer_hash_table_create(c);
   nir_foreach_block (block, impl) {
      struct ntr_block *ntr_block = rzalloc(c->blocks, struct ntr_block);
      util_dynarray_init(&ntr_block->insns, ntr_block);
      _mesa_hash_table_insert(c->blocks, block, ntr_block);
   }

   ntr_setup_registers(c);

   c->cur_block = ntr_block_from_nir(c, nir_start_block(impl));
   ntr_setup_inputs(c);
   ntr_setup_outputs(c);
   ntr_setup_uniforms(c);

   /* Emit the ntr insns */
   ntr_emit_cf_list(c, &impl->body);

   if (c->s->info.stage == MESA_SHADER_FRAGMENT)
      ntr_allocate_regs(c, impl);
   else
      ntr_allocate_regs_unoptimized(c, impl);

   /* Turn the ntr insns into actual TGSI tokens */
   ntr_emit_cf_list_ureg(c, &impl->body);

   ralloc_free(c->liveness);
   c->liveness = NULL;
}

static int
type_size(const struct glsl_type *type, bool bindless)
{
   return glsl_count_attribute_slots(type, false);
}

/* Allow vectorizing of ALU instructions. */
static uint8_t
ntr_should_vectorize_instr(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_alu)
      return 0;

   return 4;
}

static bool
ntr_should_vectorize_io(unsigned align, unsigned bit_size, unsigned num_components,
                        unsigned high_offset, nir_intrinsic_instr *low,
                        nir_intrinsic_instr *high, void *data)
{
   if (bit_size != 32)
      return false;

   /* Our offset alignment should always be at least 4 bytes */
   if (align < 4)
      return false;

   /* No wrapping off the end of a TGSI reg.  We could do a bit better by
    * looking at low's actual offset.  XXX: With LOAD_CONSTBUF maybe we don't
    * need this restriction.
    */
   unsigned worst_start_component = align == 4 ? 3 : align / 4;
   if (worst_start_component + num_components > 4)
      return false;

   return true;
}

static nir_variable_mode
ntr_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
{
   unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
   unsigned indirect_mask = nir_var_shader_in | nir_var_shader_out;

   if (!screen->get_shader_param(screen, pipe_stage, PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
      indirect_mask |= nir_var_function_temp;
   }

   return indirect_mask;
}

struct ntr_lower_tex_state {
   nir_scalar channels[8];
   unsigned i;
};

static void
nir_to_rc_lower_tex_instr_arg(nir_builder *b, nir_tex_instr *instr,
                              nir_tex_src_type tex_src_type, struct ntr_lower_tex_state *s)
{
   int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
   if (tex_src < 0)
      return;

   nir_def *def = instr->src[tex_src].src.ssa;
   for (int i = 0; i < def->num_components; i++) {
      s->channels[s->i++] = nir_get_scalar(def, i);
   }

   nir_tex_instr_remove_src(instr, tex_src);
}
/**
 * Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex
 * src.  This lets NIR handle the coalescing of the vec4 rather than trying to
 * manage it on our own, and may lead to more vectorization.
 */
static bool
nir_to_rc_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_tex)
      return false;

   nir_tex_instr *tex = nir_instr_as_tex(instr);

   if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0)
      return false;

   b->cursor = nir_before_instr(instr);

   struct ntr_lower_tex_state s = {0};

   nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s);
   /* We always have at least two slots for the coordinate, even on 1D. */
   s.i = MAX2(s.i, 2);

   nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s);
   s.i = MAX2(s.i, 3);

   nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s);

   /* XXX: LZ */
   nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s);
   nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s);
   nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s);

   /* No need to pack undefs in unused channels of the tex instr. */
   while (!s.channels[s.i - 1].def)
      s.i--;

   /* Instead of putting undefs in the unused slots of the vecs, just put in
    * another used channel.  Otherwise, we'll get unnecessary moves into
    * registers.
    */
   assert(s.channels[0].def != NULL);
   for (int i = 1; i < s.i; i++) {
      if (!s.channels[i].def)
         s.channels[i] = s.channels[0];
   }

   nir_tex_instr_add_src(tex, nir_tex_src_backend1,
                         nir_vec_scalars(b, s.channels, MIN2(s.i, 4)));
   if (s.i > 4)
      nir_tex_instr_add_src(tex, nir_tex_src_backend2,
                            nir_vec_scalars(b, &s.channels[4], s.i - 4));

   return true;
}

static bool
nir_to_rc_lower_tex(nir_shader *s)
{
   return nir_shader_instructions_pass(s, nir_to_rc_lower_tex_instr,
                                       nir_metadata_control_flow, NULL);
}

/* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. */
static void
nir_to_rc_lower_txp(nir_shader *s)
{
   nir_lower_tex_options lower_tex_options = {
      .lower_txp = 0,
   };

   nir_foreach_block (block, nir_shader_get_entrypoint(s)) {
      nir_foreach_instr (instr, block) {
         if (instr->type != nir_instr_type_tex)
            continue;
         nir_tex_instr *tex = nir_instr_as_tex(instr);

         if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0)
            continue;

         bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0;
         bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 ||
                        s->info.stage != MESA_SHADER_FRAGMENT;
         bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0;

         /* We can do TXP for any tex (not txg) where we can fit all the
          * coordinates and comparator and projector in one vec4 without any
          * other modifiers to add on.
          *
          * nir_lower_tex() only handles the lowering on a sampler-dim basis,
          * so if we get any funny projectors then we just blow them all away.
          */
         if (tex->op != nir_texop_tex || has_lod || has_offset ||
             (tex->coord_components >= 3 && has_compare))
            lower_tex_options.lower_txp |= 1 << tex->sampler_dim;
      }
   }

   /* nir_lower_tex must be run even if no options are set, because we need
    * the LOD to be set for query_levels and for non-fragment shaders.
    */
   NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
}
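/* To illustrate the fit rule above (a worked example, not new policy): a
 * plain texture2DProj packs as vec4(s, t, _, q), and a projected 2D shadow
 * sample as vec4(s, t, compare, q), so both can stay TXP.  With a
 * three-component coordinate plus a comparator, coordinate + compare +
 * projector needs five scalars, so that case sets lower_txp and the
 * projector division happens in NIR instead.
 */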
/**
 * Translates the NIR shader to TGSI.
 *
 * This requires some lowering of the NIR shader to prepare it for translation.
 * We take ownership of the NIR shader passed, returning a reference to the new
 * TGSI tokens instead.  If you need to keep the NIR, then pass us a clone.
 */
const void *
nir_to_rc(struct nir_shader *s, struct pipe_screen *screen)
{
   struct ntr_compile *c;
   const void *tgsi_tokens;
   bool is_r500 = r300_screen(screen)->caps.is_r500;

   c = rzalloc(NULL, struct ntr_compile);
   c->screen = screen;
   c->lower_fabs = !is_r500 && s->info.stage == MESA_SHADER_VERTEX;

   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      if (is_r500) {
         NIR_PASS_V(s, r300_transform_fs_trig_input);
      }
   } else if (r300_screen(screen)->caps.has_tcl) {
      if (is_r500) {
         /* Only Nine should set both the TTN shader name and
          * use_legacy_math_rules, and D3D9 already mandates the proper range
          * for the trigonometric inputs.
          */
         if (!s->info.use_legacy_math_rules ||
             !(s->info.name && !strcmp("TTN", s->info.name))) {
            NIR_PASS_V(s, r300_transform_vs_trig_input);
         }
      } else {
         if (r300_screen(screen)->caps.is_r400) {
            NIR_PASS_V(s, r300_transform_vs_trig_input);
         }
      }
   }

   /* Lower array indexing on FS inputs.  Since we don't set
    * ureg->supports_any_inout_decl_range, the TGSI input decls will be split
    * to elements by ureg, and so dynamically indexing them would be invalid.
    * Ideally we would set that ureg flag based on
    * PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE, but can't due to mesa/st
    * splitting NIR VS outputs to elements even if the FS doesn't get the
    * corresponding splitting, and virgl depends on TGSI across link
    * boundaries having matching declarations.
    */
   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
      NIR_PASS_V(s, nir_remove_dead_variables, nir_var_shader_in, NULL);
   }

   NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, type_size,
              nir_lower_io_use_interpolated_input_intrinsics);

   nir_to_rc_lower_txp(s);
   NIR_PASS_V(s, nir_to_rc_lower_tex);

   bool progress;
   do {
      progress = false;
      NIR_PASS(progress, s, nir_opt_algebraic);
      NIR_PASS(progress, s, nir_opt_constant_folding);
   } while (progress);

   do {
      progress = false;
      NIR_PASS(progress, s, nir_opt_algebraic_late);
      if (progress) {
         NIR_PASS_V(s, nir_copy_prop);
         NIR_PASS_V(s, nir_opt_dce);
         NIR_PASS_V(s, nir_opt_cse);
      }
   } while (progress);

   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS_V(s, r300_nir_prepare_presubtract);
   }

   NIR_PASS_V(s, nir_lower_int_to_float);
   NIR_PASS_V(s, nir_copy_prop);
   NIR_PASS_V(s, r300_nir_post_integer_lowering);
   NIR_PASS_V(s, nir_lower_bool_to_float, is_r500 || s->info.stage == MESA_SHADER_FRAGMENT);
   /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
   NIR_PASS_V(s, nir_copy_prop);
   /* CSE cleanup after late ftrunc lowering. */
   NIR_PASS_V(s, nir_opt_cse);
   /* At this point we need to clean up:
    *  a) fcsel_gt instructions that come from the ftrunc lowering on R300,
    *  b) all flavours of fcsel that read three different temp sources on R500.
    */
   if (s->info.stage == MESA_SHADER_VERTEX) {
      if (is_r500)
         NIR_PASS_V(s, r300_nir_lower_fcsel_r500);
      else
         NIR_PASS_V(s, r300_nir_lower_fcsel_r300);
      NIR_PASS_V(s, r300_nir_lower_flrp);
   } else {
      NIR_PASS_V(s, r300_nir_lower_comparison_fs);
   }
   NIR_PASS_V(s, r300_nir_opt_algebraic_late);
   NIR_PASS_V(s, nir_opt_dce);

   nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
                               nir_move_load_input | nir_move_comparisons |
                               nir_move_copies | nir_move_load_ssbo;
   NIR_PASS_V(s, nir_opt_move, move_all);
   NIR_PASS_V(s, nir_move_vec_src_uses_to_dest, true);
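   /* A sketch of why the two moves above help (invented shader fragment, not
    * from a testcase): for "vec4 r = vec4(a + b, c + d, e, f);",
    * nir_move_vec_src_uses_to_dest lets the adds write straight into the
    * vec's channels so no extra MOVs are emitted, while nir_opt_move shortens
    * live ranges by sinking instructions toward their first use.
    */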
   /* Late vectorizing after nir_move_vec_src_uses_to_dest helps the
    * instruction count but increases register usage.  Testing shows this is
    * beneficial only in VS.
    */
   if (s->info.stage == MESA_SHADER_VERTEX)
      NIR_PASS_V(s, nir_opt_vectorize, ntr_should_vectorize_instr, NULL);

   NIR_PASS_V(s, nir_convert_from_ssa, true);
   NIR_PASS_V(s, nir_lower_vec_to_regs, NULL, NULL);

   /* locals_to_reg_intrinsics will leave dead derefs that are good to clean
    * up.
    */
   NIR_PASS_V(s, nir_lower_locals_to_regs, 32);
   NIR_PASS_V(s, nir_opt_dce);

   /* See comment in ntr_get_alu_src for supported modifiers. */
   NIR_PASS_V(s, nir_legacy_trivialize, !c->lower_fabs);

   if (NIR_DEBUG(TGSI)) {
      fprintf(stderr, "NIR before translation to TGSI:\n");
      nir_print_shader(s, stderr);
   }

   c->s = s;
   c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
   ureg_setup_shader_info(c->ureg, &s->info);
   if (s->info.use_legacy_math_rules && screen->caps.legacy_math_rules)
      ureg_property(c->ureg, TGSI_PROPERTY_LEGACY_MATH_RULES, 1);

   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      /* The draw module's polygon stipple layer doesn't respect the chosen
       * coordinate mode, so leave it as unspecified unless we're actually
       * reading the position in the shader already.  See
       * gl-2.1-polygon-stipple-fs on softpipe.
       */
      if ((s->info.inputs_read & VARYING_BIT_POS) ||
          BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
                       s->info.fs.origin_upper_left ? TGSI_FS_COORD_ORIGIN_UPPER_LEFT
                                                    : TGSI_FS_COORD_ORIGIN_LOWER_LEFT);

         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
                       s->info.fs.pixel_center_integer ? TGSI_FS_COORD_PIXEL_CENTER_INTEGER
                                                       : TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
      }
   }

   /* Emit the main function. */
   nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
   ntr_emit_impl(c, impl);
   ureg_END(c->ureg);

   tgsi_tokens = ureg_get_tokens(c->ureg, NULL);

   if (NIR_DEBUG(TGSI)) {
      fprintf(stderr, "TGSI after translation from NIR:\n");
      tgsi_dump(tgsi_tokens, 0);
   }

   ureg_destroy(c->ureg);

   ralloc_free(c);
   ralloc_free(s);

   return tgsi_tokens;
}
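/* Usage sketch (a hypothetical caller, shown for orientation rather than
 * copied from the driver): the NIR shader is consumed and freed, so clone it
 * first if it is still needed, and release the tokens with ureg_free_tokens()
 * when done:
 *
 *    const struct tgsi_token *tokens =
 *       nir_to_rc(nir_shader_clone(NULL, s), screen);
 *    ...hand the tokens to the r300 compiler...
 *    ureg_free_tokens(tokens);
 */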