/*
 * Copyright © 2024 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_cmd_buffer.h"
#include "nvk_device.h"
#include "nvk_entrypoints.h"
#include "nvk_indirect_execution_set.h"
#include "nvk_mme.h"
#include "nvk_physical_device.h"
#include "nvk_shader.h"

#include "nir_builder.h"
#include "vk_pipeline.h"

#include "nv_push.h"
#include "nv_push_cl9097.h"
#include "nv_push_cl906f.h"
#include "nv_push_cla0c0.h"
#include "nv_push_clb1c0.h"
#include "nv_push_clc6c0.h"

struct nvk_indirect_commands_layout {
   struct vk_object_base base;

   VkShaderStageFlags stages;

   /* Stages set by the first TOKEN_TYPE_EXECUTION_SET */
   VkShaderStageFlags set_stages;

   uint32_t cmd_seq_stride_B;
   uint32_t qmd_size_per_seq_B;

   struct nvk_shader *init;
   struct nvk_shader *process;
};

VK_DEFINE_NONDISP_HANDLE_CASTS(nvk_indirect_commands_layout, base,
                               VkIndirectCommandsLayoutEXT,
                               VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_EXT);

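/* Push constants consumed by the init and process meta-shaders below.
 * process_cmd_push is the CPU-side view and process_cmd_in holds the
 * corresponding nir_defs; the field offsets have to line up with the std140
 * interface built in load_process_cmd_in().
 */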
struct process_cmd_push {
   uint64_t in_addr;
   uint64_t out_addr;
   uint64_t qmd_pool_addr;
   uint64_t count_addr;
   uint32_t max_seq_count;
   uint32_t ies_stride;
   uint64_t ies_addr;
   uint64_t root_addr;
};

struct process_cmd_in {
   nir_def *in_addr;
   nir_def *out_addr;
   nir_def *qmd_pool_addr;
   nir_def *count_addr;
   nir_def *max_seq_count;
   nir_def *ies_stride;
   nir_def *ies_addr;
   nir_def *root_addr;
};

static nir_def *
load_struct_var(nir_builder *b, nir_variable *var, uint32_t field)
{
   nir_deref_instr *deref =
      nir_build_deref_struct(b, nir_build_deref_var(b, var), field);
   return nir_load_deref(b, deref);
}

static struct process_cmd_in
load_process_cmd_in(nir_builder *b)
{
   struct glsl_struct_field push_fields[] = {
      { .type = glsl_uint64_t_type(), .name = "in_addr", .offset = 0x00 },
      { .type = glsl_uint64_t_type(), .name = "out_addr", .offset = 0x08 },
      { .type = glsl_uint64_t_type(), .name = "qmd_pool_addr", .offset = 0x10 },
      { .type = glsl_uint64_t_type(), .name = "count_addr", .offset = 0x18 },
      { .type = glsl_uint_type(), .name = "max_seq_count", .offset = 0x20 },
      { .type = glsl_uint_type(), .name = "ies_stride", .offset = 0x24 },
      { .type = glsl_uint64_t_type(), .name = "ies_addr", .offset = 0x28 },
      { .type = glsl_uint64_t_type(), .name = "root_addr", .offset = 0x30 },
   };
   const struct glsl_type *push_iface_type =
      glsl_interface_type(push_fields, ARRAY_SIZE(push_fields),
                          GLSL_INTERFACE_PACKING_STD140,
                          false /* row_major */, "push");
   nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const,
                                            push_iface_type, "push");

   return (struct process_cmd_in) {
      .in_addr = load_struct_var(b, push, 0),
      .out_addr = load_struct_var(b, push, 1),
      .qmd_pool_addr = load_struct_var(b, push, 2),
      .count_addr = load_struct_var(b, push, 3),
      .max_seq_count = load_struct_var(b, push, 4),
      .ies_stride = load_struct_var(b, push, 5),
      .ies_addr = load_struct_var(b, push, 6),
      .root_addr = load_struct_var(b, push, 7),
   };
}

static nir_def *
build_exec_set_addr(nir_builder *b, struct process_cmd_in *in, nir_def *idx)
{
   nir_def *offset = nir_imul_2x32_64(b, in->ies_stride, idx);
   return nir_iadd(b, in->ies_addr, offset);
}

static nir_def *
load_global_dw(nir_builder *b, nir_def *addr, uint32_t offset_dw)
{
   return nir_load_global(b, nir_iadd_imm(b, addr, offset_dw * 4), 4, 1, 32);
}

static void
store_global_dw(nir_builder *b, nir_def *addr, uint32_t offset_dw,
                nir_def *data)
{
   assert(data->bit_size == 32 && data->num_components == 1);
   nir_store_global(b, nir_iadd_imm(b, addr, offset_dw * 4), 4, data, 0x1);
}

static void
nir_iadd_to_var(nir_builder *b, nir_variable *x_var, nir_def *y)
{
   nir_def *x = nir_load_var(b, x_var);
   x = nir_iadd(b, x, y);
   nir_store_var(b, x_var, x, 0x1);
}

static void
nir_iadd_to_var_imm(nir_builder *b, nir_variable *x_var, uint64_t y)
{
   nir_def *x = nir_load_var(b, x_var);
   x = nir_iadd_imm(b, x, y);
   nir_store_var(b, x_var, x, 0x1);
}

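/* Small helper for emitting a pushbuf from NIR: addr is the current write
 * cursor, dw_count is the number of dwords emitted so far at runtime, and
 * max_dw_count is the compile-time worst case used to pad every sequence to
 * a fixed stride.
 */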
struct nvk_nir_push {
   nir_variable *addr;
   nir_variable *dw_count;
   unsigned max_dw_count;
};

static void
nvk_nir_push_start(nir_builder *b, struct nvk_nir_push *p, nir_def *addr)
{
   p->addr = nir_local_variable_create(b->impl, glsl_uint64_t_type(),
                                       "nvk_nir_push::addr");
   nir_store_var(b, p->addr, addr, 0x1);
   p->dw_count = nir_local_variable_create(b->impl, glsl_uint_type(),
                                           "nvk_nir_push::dw_count");
   nir_store_var(b, p->dw_count, nir_imm_int(b, 0), 0x1);
   p->max_dw_count = 0;
}

static inline void
nvk_nir_push_dw(nir_builder *b, struct nvk_nir_push *p, nir_def *dw)
{
   store_global_dw(b, nir_load_var(b, p->addr), 0, dw);
   nir_iadd_to_var_imm(b, p->addr, 4);
   nir_iadd_to_var_imm(b, p->dw_count, 1);
   p->max_dw_count++;
}

static inline void
nvk_nir_push_copy_dws(nir_builder *b, struct nvk_nir_push *p,
                      nir_def *src_dw_addr_in, nir_def *dw_count,
                      uint32_t max_dw_count)
{
   nir_variable *i = nir_local_variable_create(b->impl, glsl_uint_type(), "i");
   nir_store_var(b, i, nir_imm_int(b, 0), 0x1);

   nir_variable *src_dw_addr =
      nir_local_variable_create(b->impl, glsl_uint64_t_type(), "src_dw_addr");
   nir_store_var(b, src_dw_addr, src_dw_addr_in, 0x1);

   nir_push_loop(b);
   {
      nir_push_if(b, nir_uge(b, nir_load_var(b, i), dw_count));
      {
         nir_jump(b, nir_jump_break);
      }
      nir_pop_if(b, NULL);

      nir_def *dw = load_global_dw(b, nir_load_var(b, src_dw_addr), 0);
      store_global_dw(b, nir_load_var(b, p->addr), 0, dw);

      nir_iadd_to_var_imm(b, i, 1);
      nir_iadd_to_var_imm(b, p->addr, 4);
      nir_iadd_to_var_imm(b, src_dw_addr, 4);
   }
   nir_pop_loop(b, NULL);

   nir_iadd_to_var(b, p->dw_count, dw_count);
   p->max_dw_count += max_dw_count;
}

static inline void
nvk_nir_build_p_1inc(nir_builder *b, struct nvk_nir_push *p,
                     int subc, int mthd, unsigned dw_count)
{
   uint32_t hdr = NVC0_FIFO_PKHDR_1I(subc, mthd, dw_count);
   nvk_nir_push_dw(b, p, nir_imm_int(b, hdr));
}
#define nvk_nir_P_1INC(b, p, class, mthd, size_dw) \
   nvk_nir_build_p_1inc((b), (p), SUBC_##class, class##_##mthd, (size_dw))

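/* Pad the remainder of the sequence with NO_OPERATION so that every sequence
 * ends up exactly max_dw_count dwords long.
 */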
static void
nvk_nir_build_pad_NOP(nir_builder *b, struct nvk_nir_push *p, uint32_t nop)
{
   nir_push_loop(b);
   {
      nir_push_if(b, nir_uge_imm(b, nir_load_var(b, p->dw_count),
                                 p->max_dw_count));
      {
         nir_jump(b, nir_jump_break);
      }
      nir_pop_if(b, NULL);

      store_global_dw(b, nir_load_var(b, p->addr), 0, nir_imm_int(b, nop));
      nir_iadd_to_var_imm(b, p->addr, 4);
      nir_iadd_to_var_imm(b, p->dw_count, 1);
   }
   nir_pop_loop(b, NULL);
}
#define nvk_nir_pad_NOP(b, p, class) \
   nvk_nir_build_pad_NOP((b), (p), \
      NVC0_FIFO_PKHDR_IL(SUBC_##class, class##_NO_OPERATION, 0))

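/* Layout of the QMD pool that lives after the generated pushbufs in the
 * preprocess buffer: the first dword is a bump allocator counted in units of
 * QMD_ALIGN, and each dispatch allocates a QMD followed by a patched root
 * descriptor table.
 */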
#define QMD_ALIGN 0x100
#define QMD_ALLOC_SIZE QMD_ALIGN
#define QMD_ROOT_SIZE (sizeof(struct nvk_ies_cs_qmd) + \
                       sizeof(struct nvk_root_descriptor_table))

static_assert(sizeof(struct nvk_ies_cs_qmd) % QMD_ALIGN == 0,
              "QMD size is not properly aligned");
static_assert(sizeof(struct nvk_root_descriptor_table) % QMD_ALIGN == 0,
              "Root descriptor table size is not aligned");

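/* Copy dw_count dwords from src_addr to dst_addr, substituting any dword for
 * which repl_dw[] provides a replacement value.
 */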
static void
copy_repl_global_dw(nir_builder *b, nir_def *dst_addr, nir_def *src_addr,
                    nir_def **repl_dw, uint32_t dw_count)
{
   for (uint32_t i = 0; i < dw_count; i++) {
      nir_def *dw;
      if (repl_dw[i] == NULL)
         dw = load_global_dw(b, src_addr, i);
      else
         dw = repl_dw[i];
      store_global_dw(b, dst_addr, i, dw);
   }
}

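/* Emit the compute commands for a single sequence: allocate a QMD plus root
 * descriptor table from the QMD pool, patch in the dispatch size and any
 * push constants, and push SEND_PCAS to launch it.
 */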
static void
build_process_cs_cmd_seq(nir_builder *b, struct nvk_nir_push *p,
                         nir_def *in_addr, nir_def *seq_idx,
                         struct process_cmd_in *in,
                         struct nvk_physical_device *pdev,
                         const VkIndirectCommandsLayoutCreateInfoEXT *info,
                         uint32_t *qmd_size_per_seq_B_out)
{
   /* If we don't have any indirect execution set, the QMD address of the
    * currently bound shader is passed in here instead.
    */
   nir_def *shader_qmd_addr = in->ies_addr;

   nir_def *root_repl[sizeof(struct nvk_root_descriptor_table) / 4] = {};

#define root_dw(member) ( \
   assert(nvk_root_descriptor_offset(member) % 4 == 0), \
   nvk_root_descriptor_offset(member) / 4)

   root_repl[root_dw(cs.base_group[0])] = nir_imm_int(b, 0);
   root_repl[root_dw(cs.base_group[1])] = nir_imm_int(b, 0);
   root_repl[root_dw(cs.base_group[2])] = nir_imm_int(b, 0);

   *qmd_size_per_seq_B_out = 0;
   for (uint32_t t = 0; t < info->tokenCount; t++) {
      const VkIndirectCommandsLayoutTokenEXT *token = &info->pTokens[t];

      nir_def *token_addr = nir_iadd_imm(b, in_addr, token->offset);
      switch (token->type) {
      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT: {
         assert(token->data.pExecutionSet->shaderStages ==
                VK_SHADER_STAGE_COMPUTE_BIT);
         assert(t == 0);

         nir_def *idx = load_global_dw(b, token_addr, 0);
         shader_qmd_addr = build_exec_set_addr(b, in, idx);
         break;
      }

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_EXT: {
         const VkPushConstantRange *pc_range =
            &token->data.pPushConstant->updateRange;

         assert(pc_range->offset % 4 == 0);
         assert(pc_range->size % 4 == 0);

         const uint32_t start_dw = root_dw(push) + (pc_range->offset / 4);
         for (uint32_t i = 0; i < pc_range->size / 4; i++)
            root_repl[start_dw + i] = load_global_dw(b, token_addr, i);
         break;
      }

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_SEQUENCE_INDEX_EXT: {
         const VkPushConstantRange *pc_range =
            &token->data.pPushConstant->updateRange;

         assert(pc_range->offset % 4 == 0);
         assert(pc_range->size == 4);

         const uint32_t dw = root_dw(push) + (pc_range->offset / 4);
         root_repl[dw] = seq_idx;
         break;
      }

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_EXT: {
         nir_def *disp_size_x = load_global_dw(b, token_addr, 0);
         nir_def *disp_size_y = load_global_dw(b, token_addr, 1);
         nir_def *disp_size_z = load_global_dw(b, token_addr, 2);

         *qmd_size_per_seq_B_out += QMD_ROOT_SIZE;

         nir_push_if(b, nir_ior(b, nir_ior(b, nir_ine_imm(b, disp_size_x, 0),
                                              nir_ine_imm(b, disp_size_y, 0)),
                                nir_ine_imm(b, disp_size_z, 0)));
         {
            /* The first dword in qmd_addr is an allocator in units of 256
             * bytes.
             */
            nir_def *qmd_idx =
               nir_global_atomic(b, 32, in->qmd_pool_addr,
                                 nir_imm_int(b, QMD_ROOT_SIZE / QMD_ALIGN),
                                 .atomic_op = nir_atomic_op_iadd);
            nir_def *qmd_offset =
               nir_imul_imm(b, nir_u2u64(b, qmd_idx), QMD_ALIGN);
            nir_def *qmd_addr = nir_iadd(b, in->qmd_pool_addr, qmd_offset);
            nir_def *root_addr =
               nir_iadd_imm(b, qmd_addr, sizeof(struct nvk_ies_cs_qmd));

            /* Upload and patch the root descriptor table */
            root_repl[root_dw(cs.group_count[0])] = disp_size_x;
            root_repl[root_dw(cs.group_count[1])] = disp_size_y;
            root_repl[root_dw(cs.group_count[2])] = disp_size_z;
            copy_repl_global_dw(b, root_addr, in->root_addr,
                                root_repl, ARRAY_SIZE(root_repl));

            /* Upload and patch the QMD */
            const struct nak_qmd_dispatch_size_layout qmd_layout =
               nak_get_qmd_dispatch_size_layout(&pdev->info);
            assert(qmd_layout.x_start % 32 == 0);
            assert(qmd_layout.y_start == qmd_layout.x_start + 32);
            assert(qmd_layout.z_start == qmd_layout.x_start + 64);

            nir_def *qmd_repl[sizeof(struct nvk_ies_cs_qmd) / 4] = {};
            qmd_repl[qmd_layout.x_start / 32] = disp_size_x;
            qmd_repl[qmd_layout.y_start / 32] = disp_size_y;
            qmd_repl[qmd_layout.z_start / 32] = disp_size_z;

            /* TODO: Get these from NAK? */
            const uint32_t cb0_lo_start = 1024, cb0_hi_start = 1056;
            qmd_repl[cb0_lo_start / 32] = nir_unpack_64_2x32_split_x(b, root_addr);
            qmd_repl[cb0_hi_start / 32] =
               nir_ior(b, load_global_dw(b, shader_qmd_addr, cb0_hi_start / 32),
                       nir_unpack_64_2x32_split_y(b, root_addr));

            copy_repl_global_dw(b, qmd_addr, shader_qmd_addr,
                                qmd_repl, ARRAY_SIZE(qmd_repl));

            /* Now emit commands */
            nir_def *invoc = nir_imul_2x32_64(b, disp_size_x, disp_size_y);
            invoc = nir_imul(b, invoc, nir_u2u64(b, disp_size_z));
            nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS), 2);
            nvk_nir_push_dw(b, p, nir_unpack_64_2x32_split_y(b, invoc));
            nvk_nir_push_dw(b, p, nir_unpack_64_2x32_split_x(b, invoc));

            nvk_nir_P_1INC(b, p, NVA0C0, SEND_PCAS_A, 1);
            nvk_nir_push_dw(b, p, nir_u2u32(b, nir_ushr_imm(b, qmd_addr, 8)));

            if (pdev->info.cls_compute >= AMPERE_COMPUTE_A) {
               uint32_t signal;
               V_NVC6C0_SEND_SIGNALING_PCAS2_B(signal,
                                               PCAS_ACTION_INVALIDATE_COPY_SCHEDULE);
               nvk_nir_P_1INC(b, p, NVC6C0, SEND_SIGNALING_PCAS2_B, 1);
               nvk_nir_push_dw(b, p, nir_imm_int(b, signal));
            } else {
               uint32_t signal;
               V_NVA0C0_SEND_SIGNALING_PCAS_B(signal, {
                  .invalidate = INVALIDATE_TRUE,
                  .schedule = SCHEDULE_TRUE
               });
               nvk_nir_P_1INC(b, p, NVA0C0, SEND_SIGNALING_PCAS_B, 1);
               nvk_nir_push_dw(b, p, nir_imm_int(b, signal));
            }
         }
         nir_pop_if(b, NULL);
         break;
      }

      default:
         unreachable("Unsupported indirect token type");
      }
   }
}

/*
 * Graphics
 */

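/* Bind the shaders selected by an execution-set token by copying the
 * pre-baked push data stored in the indirect execution set, either one blob
 * per pipeline or one per shader object.
 */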
static void
build_gfx_set_exec(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
                   struct process_cmd_in *in,
                   struct nvk_physical_device *pdev,
                   const VkIndirectCommandsExecutionSetTokenEXT *token)
{
   switch (token->type) {
   case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT: {
      nir_def *idx = load_global_dw(b, token_addr, 0);
      nir_def *push_addr = build_exec_set_addr(b, in, idx);
      nir_def *dw_count = load_global_dw(b, push_addr, 0);
      const uint16_t max_dw_count =
         nvk_ies_gfx_pipeline_max_dw_count(pdev, token->shaderStages);
      nvk_nir_push_copy_dws(b, p, nir_iadd_imm(b, push_addr, 4),
                            dw_count, max_dw_count);
      break;
   }

   case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT: {
      int32_t i = 0;
      gl_shader_stage type_stage[6] = {};
      nir_def *type_shader_idx[6] = {};
      gl_shader_stage last_vtgm = MESA_SHADER_VERTEX;
      u_foreach_bit(s, token->shaderStages) {
         gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);

         if (stage != MESA_SHADER_FRAGMENT)
            last_vtgm = stage;

         uint32_t type = mesa_to_nv9097_shader_type(stage);
         type_stage[type] = stage;
         type_shader_idx[type] = load_global_dw(b, token_addr, i++);
      }

      for (uint32_t type = 0; type < 6; type++) {
         nir_def *shader_idx = type_shader_idx[type];
         if (shader_idx == NULL)
            continue;

         bool is_last_vtgm = type_stage[type] == last_vtgm;

         nir_def *push_addr = build_exec_set_addr(b, in, shader_idx);
         nir_def *hdr = load_global_dw(b, push_addr, 0);
         nir_def *dw_count =
            nir_extract_u16(b, hdr, nir_imm_int(b, is_last_vtgm));
         const uint16_t max_dw_count =
            nvk_ies_gfx_shader_max_dw_count(pdev, token->shaderStages,
                                            is_last_vtgm);
         nvk_nir_push_copy_dws(b, p, nir_iadd_imm(b, push_addr, 4),
                               dw_count, max_dw_count);
      }
      break;
   }

   default:
      unreachable("Unknown indirect execution set type");
   }
}

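/* Update a push constant range by writing the data dwords at the root
 * descriptor push offset with LOAD_CONSTANT_BUFFER_OFFSET; the 1INC header
 * sends every dword after the offset to the following LOAD_CONSTANT_BUFFER
 * method.
 */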
static void
build_push_gfx_const(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
                     const VkIndirectCommandsPushConstantTokenEXT *token)
{
   const VkPushConstantRange *pc_range = &token->updateRange;

   // TODO: Compute
   assert(!(pc_range->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT));

   assert(pc_range->offset % 4 == 0);
   assert(pc_range->size % 4 == 0);
   const uint32_t dw_count = pc_range->size / 4;

   nvk_nir_P_1INC(b, p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET, 1 + dw_count);
   nvk_nir_push_dw(b, p, nir_imm_int(b,
      nvk_root_descriptor_offset(push) + pc_range->offset));
   for (uint32_t i = 0; i < dw_count; i++)
      nvk_nir_push_dw(b, p, load_global_dw(b, token_addr, i));
}

static void
build_push_gfx_seq_idx(nir_builder *b, struct nvk_nir_push *p,
                       nir_def *token_addr, nir_def *seq_idx,
                       const VkIndirectCommandsPushConstantTokenEXT *token)
{
   const VkPushConstantRange *pc_range = &token->updateRange;

   // TODO: Compute
   assert(!(pc_range->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT));

   assert(pc_range->offset % 4 == 0);
   assert(pc_range->size == 4);
   nvk_nir_P_1INC(b, p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET, 2);
   nvk_nir_push_dw(b, p, nir_imm_int(b,
      nvk_root_descriptor_offset(push) + pc_range->offset));
   nvk_nir_push_dw(b, p, seq_idx);
}

static void
build_set_ib(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
             UNUSED const VkIndirectCommandsIndexBufferTokenEXT *token)
{
   nir_def *addr_lo = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi = load_global_dw(b, token_addr, 1);
   nir_def *size_B = load_global_dw(b, token_addr, 2);
   nir_def *idx_fmt = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_IB), 4);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, size_B);
   nvk_nir_push_dw(b, p, idx_fmt);
}

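/* NVK_MME_SET_VB_STRIDE takes the stride in the low 16 bits and the vertex
 * buffer index in the high 16 bits.
 */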
static nir_def *
nvk_nir_vb_stride(nir_builder *b, nir_def *vb_idx, nir_def *stride)
{
   return nir_pack_32_2x16_split(b, nir_u2u16(b, stride),
                                 nir_u2u16(b, vb_idx));
}

static void
build_set_vb(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
             const VkIndirectCommandsVertexBufferTokenEXT *token)
{
   nir_def *vb_idx = nir_imm_int(b, token->vertexBindingUnit);
   nir_def *addr_lo = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi = load_global_dw(b, token_addr, 1);
   nir_def *size_B = load_global_dw(b, token_addr, 2);
   nir_def *stride_B = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_VB), 4);
   nvk_nir_push_dw(b, p, vb_idx);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, size_B);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_SET_VB_STRIDE), 1);
   nvk_nir_push_dw(b, p, nvk_nir_vb_stride(b, vb_idx, stride_B));
}

static void
build_draw(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr)
{
   nir_def *vertex_count = load_global_dw(b, token_addr, 0);
   nir_def *instance_count = load_global_dw(b, token_addr, 1);
   nir_def *first_vertex = load_global_dw(b, token_addr, 2);
   nir_def *first_instance = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW), 5);
   nvk_nir_push_dw(b, p, nir_imm_int(b, 0)); /* draw_index */
   nvk_nir_push_dw(b, p, vertex_count);
   nvk_nir_push_dw(b, p, instance_count);
   nvk_nir_push_dw(b, p, first_vertex);
   nvk_nir_push_dw(b, p, first_instance);
}

static void
build_draw_indexed(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr)
{
   nir_def *index_count = load_global_dw(b, token_addr, 0);
   nir_def *instance_count = load_global_dw(b, token_addr, 1);
   nir_def *first_index = load_global_dw(b, token_addr, 2);
   nir_def *vertex_offset = load_global_dw(b, token_addr, 3);
   nir_def *first_instance = load_global_dw(b, token_addr, 4);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED), 6);
   nvk_nir_push_dw(b, p, nir_imm_int(b, 0)); /* draw_index */
   nvk_nir_push_dw(b, p, index_count);
   nvk_nir_push_dw(b, p, instance_count);
   nvk_nir_push_dw(b, p, first_index);
   nvk_nir_push_dw(b, p, vertex_offset);
   nvk_nir_push_dw(b, p, first_instance);
}

static void
build_draw_count(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr)
{
   nir_def *addr_lo = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi = load_global_dw(b, token_addr, 1);
   nir_def *stride = load_global_dw(b, token_addr, 2);
   nir_def *count = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT), 4);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, count);
   nvk_nir_push_dw(b, p, stride);
}

static void
build_draw_indexed_count(nir_builder *b, struct nvk_nir_push *p,
                         nir_def *token_addr)
{
   nir_def *addr_lo = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi = load_global_dw(b, token_addr, 1);
   nir_def *stride = load_global_dw(b, token_addr, 2);
   nir_def *count = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT), 4);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, count);
   nvk_nir_push_dw(b, p, stride);
}

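/* Emit the graphics commands for a single sequence by walking the token
 * array in order.
 */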
static void
build_process_gfx_cmd_seq(nir_builder *b, struct nvk_nir_push *p,
                          nir_def *in_addr, nir_def *seq_idx,
                          struct process_cmd_in *in,
                          struct nvk_physical_device *pdev,
                          const VkIndirectCommandsLayoutCreateInfoEXT *info)
{
   for (uint32_t t = 0; t < info->tokenCount; t++) {
      const VkIndirectCommandsLayoutTokenEXT *token = &info->pTokens[t];

      nir_def *token_addr = nir_iadd_imm(b, in_addr, token->offset);
      switch (token->type) {
      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT:
         assert(t == 0);
         build_gfx_set_exec(b, p, token_addr, in, pdev,
                            token->data.pExecutionSet);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_EXT:
         build_push_gfx_const(b, p, token_addr, token->data.pPushConstant);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_SEQUENCE_INDEX_EXT:
         build_push_gfx_seq_idx(b, p, token_addr, seq_idx,
                                token->data.pPushConstant);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_EXT:
         build_set_ib(b, p, token_addr, token->data.pIndexBuffer);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_EXT:
         build_set_vb(b, p, token_addr, token->data.pVertexBuffer);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_EXT:
         build_draw_indexed(b, p, token_addr);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_EXT:
         build_draw(b, p, token_addr);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_COUNT_EXT:
         build_draw_indexed_count(b, p, token_addr);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_COUNT_EXT:
         build_draw_count(b, p, token_addr);
         break;

      default:
         unreachable("Unsupported indirect token type");
      }
   }
}

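/* Build a trivial single-invocation shader that resets the QMD pool
 * allocator before any sequences are processed. Graphics layouts and
 * compute layouts without a dispatch token don't need one.
 */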
static VkResult
build_init_shader(struct nvk_device *dev,
                  const VkIndirectCommandsLayoutCreateInfoEXT *info,
                  uint32_t qmd_size_per_seq_B,
                  const VkAllocationCallbacks *pAllocator,
                  struct nvk_shader **shader_out)
{
   /* There's nothing to initialize for graphics */
   if (info->shaderStages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
      assert(!(info->shaderStages & ~NVK_SHADER_STAGE_GRAPHICS_BITS));
      *shader_out = NULL;
      return VK_SUCCESS;
   }

   if (qmd_size_per_seq_B == 0) {
      *shader_out = NULL;
      return VK_SUCCESS;
   }

   nir_builder build =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
                                     "nvk-init-indirect-commands");
   build.shader->info.workgroup_size[0] = 32;
   nir_builder *b = &build;

   struct process_cmd_in in = load_process_cmd_in(b);

   if (qmd_size_per_seq_B > 0) {
      /* Initialize the QMD allocator to 1 * QMD_ALIGN so that the QMDs we
       * allocate don't stomp the allocator.
       */
      assert(info->shaderStages == VK_SHADER_STAGE_COMPUTE_BIT);
      store_global_dw(b, in.qmd_pool_addr, 0, nir_imm_int(b, 1));
   }

   return nvk_compile_nir_shader(dev, build.shader, pAllocator, shader_out);
}

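/* Build the 32-wide shader that expands one sequence per invocation into a
 * fixed-stride chunk of pushbuf, padding unused space (and entire sequences
 * past the count) with NO_OPERATION.
 */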
static VkResult
build_process_shader(struct nvk_device *dev,
                     const VkIndirectCommandsLayoutCreateInfoEXT *info,
                     const VkAllocationCallbacks *pAllocator,
                     struct nvk_shader **shader_out,
                     uint32_t *cmd_seq_stride_B_out,
                     uint32_t *qmd_size_per_seq_B_out)
{
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   nir_builder build =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
                                     "nvk-process-indirect-commands");
   build.shader->info.workgroup_size[0] = 32;
   nir_builder *b = &build;

   struct process_cmd_in in = load_process_cmd_in(b);

   nir_def *seq_idx = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);

   /* We always execute a 32-wide shader and nothing guarantees that
    * max_seq_count is a multiple of 32 so we need to bail if our index is
    * above the maximum. If we're inside the maximum but less than the count,
    * we still need to emit a bunch of NOPs.
    */
   nir_push_if(b, nir_uge(b, seq_idx, in.max_seq_count));
   {
      nir_jump(b, nir_jump_halt);
   }
   nir_pop_if(b, NULL);

   nir_def *ind_count;
   nir_push_if(b, nir_ine_imm(b, in.count_addr, 0));
   {
      ind_count = load_global_dw(b, in.count_addr, 0);
      ind_count = nir_umin(b, ind_count, in.max_seq_count);
   }
   nir_pop_if(b, NULL);
   nir_def *count = nir_if_phi(b, ind_count, in.max_seq_count);

   nir_def *in_seq_addr = nir_iadd(b, in.in_addr,
      nir_imul_imm(b, nir_u2u64(b, seq_idx), info->indirectStride));
   /* We'll replace this later once we know what it is */
   nir_def *out_stride = nir_imm_int(b, 0xc0ffee0);
   nir_def *out_seq_addr = nir_iadd(b, in.out_addr,
      nir_imul_2x32_64(b, seq_idx, out_stride));

   struct nvk_nir_push push = {};
   nvk_nir_push_start(b, &push, out_seq_addr);

   nir_push_if(b, nir_ult(b, seq_idx, count));
   {
      if (info->shaderStages & VK_SHADER_STAGE_COMPUTE_BIT) {
         assert(info->shaderStages == VK_SHADER_STAGE_COMPUTE_BIT);
         build_process_cs_cmd_seq(b, &push, in_seq_addr, seq_idx,
                                  &in, pdev, info, qmd_size_per_seq_B_out);
      } else if (info->shaderStages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
         assert(!(info->shaderStages & ~NVK_SHADER_STAGE_GRAPHICS_BITS));
         *qmd_size_per_seq_B_out = 0;
         build_process_gfx_cmd_seq(b, &push, in_seq_addr, seq_idx,
                                   &in, pdev, info);
      } else {
         unreachable("Unknown shader stage");
      }
   }
   nir_pop_if(b, NULL);

   /* Always pad the command buffer. In the case where seq_idx >= count, the
    * entire sequence will be NO_OPERATION.
    */
   if (info->shaderStages & VK_SHADER_STAGE_COMPUTE_BIT) {
      nvk_nir_pad_NOP(b, &push, NVA0C0);
   } else if (info->shaderStages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
      nvk_nir_pad_NOP(b, &push, NV9097);
   } else {
      unreachable("Unknown shader stage");
   }

   /* Replace the out stride with the actual size of a command stream */
   nir_load_const_instr *out_stride_const =
      nir_instr_as_load_const(out_stride->parent_instr);
   out_stride_const->value[0].u32 = push.max_dw_count * 4;

   /* We also output this stride to go in the layout struct */
   *cmd_seq_stride_B_out = push.max_dw_count * 4;

   return nvk_compile_nir_shader(dev, build.shader, pAllocator, shader_out);
}

static void
nvk_indirect_commands_layout_destroy(struct nvk_device *dev,
                                     struct nvk_indirect_commands_layout *layout,
                                     const VkAllocationCallbacks *alloc)
{
   if (layout->init != NULL)
      vk_shader_destroy(&dev->vk, &layout->init->vk, alloc);
   if (layout->process != NULL)
      vk_shader_destroy(&dev->vk, &layout->process->vk, alloc);
   vk_object_free(&dev->vk, alloc, layout);
}

VKAPI_ATTR VkResult VKAPI_CALL
nvk_CreateIndirectCommandsLayoutEXT(
   VkDevice _device,
   const VkIndirectCommandsLayoutCreateInfoEXT *pCreateInfo,
   const VkAllocationCallbacks *pAllocator,
   VkIndirectCommandsLayoutEXT *pIndirectCommandsLayout)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);
   VkResult result;

   struct nvk_indirect_commands_layout *layout =
      vk_object_zalloc(&dev->vk, pAllocator, sizeof(*layout),
                       VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_EXT);
   if (layout == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   layout->stages = pCreateInfo->shaderStages;

   /* From the Vulkan 1.3.XXX spec:
    *
    *    VUID-VkIndirectCommandsLayoutCreateInfoEXT-pTokens-11093
    *
    *    "The number of tokens in the pTokens array with type equal to
    *    VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT must be less than
    *    or equal to 1"
    *
    * and
    *
    *    VUID-VkIndirectCommandsLayoutCreateInfoEXT-pTokens-11139
    *
    *    "If the pTokens array contains a
    *    VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT token, then this
    *    token must be the first token in the array"
    */
   if (pCreateInfo->tokenCount > 0 &&
       pCreateInfo->pTokens[0].type ==
       VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT) {
      const VkIndirectCommandsExecutionSetTokenEXT *token =
         pCreateInfo->pTokens[0].data.pExecutionSet;

      /* Pipelines should never mismatch here. */
      if (token->type == VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT)
         assert(token->shaderStages == pCreateInfo->shaderStages);

      layout->set_stages = token->shaderStages;
   }

   result = build_process_shader(dev, pCreateInfo, pAllocator,
                                 &layout->process, &layout->cmd_seq_stride_B,
                                 &layout->qmd_size_per_seq_B);
   if (result != VK_SUCCESS) {
      nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
      return result;
   }

   if (layout->cmd_seq_stride_B > (NV_PUSH_MAX_COUNT * 4)) {
      nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
      return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                       "Too many tokens in IndirectCommandsLayout");
   }

   result = build_init_shader(dev, pCreateInfo, layout->qmd_size_per_seq_B,
                              pAllocator, &layout->init);
   if (result != VK_SUCCESS) {
      nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
      return result;
   }

   *pIndirectCommandsLayout = nvk_indirect_commands_layout_to_handle(layout);

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
nvk_DestroyIndirectCommandsLayoutEXT(
   VkDevice _device,
   VkIndirectCommandsLayoutEXT indirectCommandsLayout,
   const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  indirectCommandsLayout);

   if (layout == NULL)
      return;

   nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
}

VKAPI_ATTR void VKAPI_CALL
nvk_GetGeneratedCommandsMemoryRequirementsEXT(
   VkDevice _device,
   const VkGeneratedCommandsMemoryRequirementsInfoEXT *pInfo,
   VkMemoryRequirements2 *pMemoryRequirements)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  pInfo->indirectCommandsLayout);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   uint64_t size = layout->cmd_seq_stride_B * (uint64_t)pInfo->maxSequenceCount;
   if (layout->qmd_size_per_seq_B > 0) {
      size = align64(size, QMD_ALIGN);
      size += QMD_ALLOC_SIZE;
      size += layout->qmd_size_per_seq_B * pInfo->maxSequenceCount;
   }

   pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
      .size = size,
      .alignment = QMD_ALIGN,
      .memoryTypeBits = BITFIELD_MASK(pdev->mem_type_count),
   };
}

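/* Fill out process_cmd_push and dispatch the layout's init and process
 * shaders to generate the pushbuf (and, for compute, the QMDs) in the
 * preprocess buffer.
 */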
static void
nvk_cmd_process_cmds(struct nvk_cmd_buffer *cmd,
                     const VkGeneratedCommandsInfoEXT *info,
                     const struct nvk_cmd_state *state)
{
   VK_FROM_HANDLE(nvk_indirect_execution_set, ies, info->indirectExecutionSet);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  info->indirectCommandsLayout);

   struct process_cmd_push push = {
      .in_addr = info->indirectAddress,
      .out_addr = info->preprocessAddress,
      .count_addr = info->sequenceCountAddress,
      .max_seq_count = info->maxSequenceCount,
   };

   uint64_t qmd_addr = 0;
   if (layout->stages & VK_SHADER_STAGE_COMPUTE_BIT) {
      uint32_t global_size[3] = { 0, 0, 0 };
      VkResult result = nvk_cmd_flush_cs_qmd(cmd, global_size, &qmd_addr,
                                             &push.root_addr);
      if (unlikely(result != VK_SUCCESS)) {
         vk_command_buffer_set_error(&cmd->vk, result);
         return;
      }
   }

   if (layout->set_stages == 0) {
      push.ies_addr = qmd_addr;
   } else {
      assert(layout->set_stages == layout->stages);
      push.ies_addr = ies->mem->va->addr;
      push.ies_stride = ies->stride_B;
   }

   if (layout->qmd_size_per_seq_B > 0) {
      assert(info->preprocessAddress % QMD_ALIGN == 0);
      uint64_t qmd_offset =
         layout->cmd_seq_stride_B * (uint64_t)info->maxSequenceCount;
      qmd_offset = align64(qmd_offset, QMD_ALIGN);
      push.qmd_pool_addr = info->preprocessAddress + qmd_offset;
   }

   if (layout->init != NULL) {
      nvk_cmd_dispatch_shader(cmd, layout->init, &push, sizeof(push), 1, 1, 1);

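      /* Make sure the init dispatch (which seeds the QMD allocator) has
       * finished before the process shader starts allocating from it.
       */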
      struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
      P_IMMD(p, NVA0C0, WAIT_FOR_IDLE, 0);
   }

   nvk_cmd_dispatch_shader(cmd, layout->process, &push, sizeof(push),
                           DIV_ROUND_UP(info->maxSequenceCount, 32), 1, 1);
}

static void
nvk_cmd_flush_process_state(struct nvk_cmd_buffer *cmd,
                            const VkGeneratedCommandsInfoEXT *info)
{
   struct nvk_descriptor_state *desc =
      nvk_get_descriptor_state_for_stages(cmd, info->shaderStages);
   nvk_cmd_buffer_flush_push_descriptors(cmd, desc);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdPreprocessGeneratedCommandsEXT(VkCommandBuffer commandBuffer,
                                      const VkGeneratedCommandsInfoEXT *info,
                                      VkCommandBuffer stateCommandBuffer)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_cmd_buffer, state_cmd, stateCommandBuffer);

   nvk_cmd_flush_process_state(state_cmd, info);
   nvk_cmd_process_cmds(cmd, info, &state_cmd->state);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdExecuteGeneratedCommandsEXT(VkCommandBuffer commandBuffer,
                                   VkBool32 isPreprocessed,
                                   const VkGeneratedCommandsInfoEXT *info)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  info->indirectCommandsLayout);

   if (!isPreprocessed) {
      nvk_cmd_flush_process_state(cmd, info);
      nvk_cmd_process_cmds(cmd, info, &cmd->state);

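      /* Flush and invalidate caches so the commands and QMDs just written by
       * the process shader are visible before they're executed.
       */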
      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
      P_IMMD(p, NVA0C0, INVALIDATE_SHADER_CACHES, {
         .data = DATA_TRUE,
         .constant = CONSTANT_TRUE,
         .flush_data = FLUSH_DATA_TRUE,
      });
      P_IMMD(p, NVB1C0, INVALIDATE_SKED_CACHES, 0);
      __push_immd(p, SUBC_NV9097, NV906F_SET_REFERENCE, 0);
   }

   if (layout->stages & VK_SHADER_STAGE_COMPUTE_BIT) {
      assert(info->shaderStages == VK_SHADER_STAGE_COMPUTE_BIT);
      nvk_cmd_buffer_flush_push_descriptors(cmd, &cmd->state.cs.descriptors);
   } else if (layout->stages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
      assert(!(layout->stages & ~NVK_SHADER_STAGE_GRAPHICS_BITS));

      nvk_cmd_buffer_flush_push_descriptors(cmd, &cmd->state.gfx.descriptors);
      nvk_cmd_flush_gfx_dynamic_state(cmd);

      if (layout->set_stages == 0) {
         /* In this case, we're using the CPU-bound shaders */
         nvk_cmd_flush_gfx_shaders(cmd);
         nvk_cmd_flush_gfx_cbufs(cmd);
      } else {
         /* From the Vulkan 1.3.XXX spec:
          *
          *    "If indirectCommandsLayout was created with a token sequence
          *    that contained the
          *    VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT token and
          *    indirectExecutionSet was created using
          *    VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT, every
          *    executed VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT
          *    token must bind all the shader stages set in the
          *    VkIndirectCommandsExecutionSetTokenEXT::shaderStages used to
          *    create indirectCommandsLayout"
          *
          * So we unbind anything not explicitly bound by the layout and trust
          * the layout to bind the rest.
          */
         assert(layout->set_stages == layout->stages);

         uint8_t set_types = 0;
         u_foreach_bit(s, layout->set_stages) {
            gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);
            uint32_t type = mesa_to_nv9097_shader_type(stage);
            set_types |= BITFIELD_BIT(type);
         }

         uint8_t unset_types = BITFIELD_MASK(6) & ~set_types;

         struct nv_push *p = nvk_cmd_buffer_push(cmd, 12);
         u_foreach_bit(type, unset_types) {
            P_IMMD(p, NV9097, SET_PIPELINE_SHADER(type), {
               .enable = ENABLE_FALSE,
               .type = type,
            });
         }
      }
   }

   ASSERTED const uint64_t size =
      layout->cmd_seq_stride_B * (uint64_t)info->maxSequenceCount;
   assert(size <= info->preprocessSize);

   uint64_t addr = info->preprocessAddress;
   uint64_t seq_count = info->maxSequenceCount;

   /* Break it into pieces that are a multiple of cmd_seq_stride_B so that, if
    * the kernel inserts a sync point between two of our pushes, it doesn't
    * break a single command.
    */
   const uint32_t max_seq_per_push =
      (NV_PUSH_MAX_COUNT * 4) / layout->cmd_seq_stride_B;

   while (seq_count > 0) {
      uint32_t push_seq = MIN2(seq_count, max_seq_per_push);
      uint32_t push_size_B = push_seq * layout->cmd_seq_stride_B;
      nvk_cmd_buffer_push_indirect(cmd, addr, push_size_B);
      addr += push_size_B;
      seq_count -= push_seq;
   }

   if (layout->set_stages != 0) {
      if (layout->stages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
         cmd->state.gfx.shaders_dirty |= NVK_SHADER_STAGE_GRAPHICS_BITS;
      }
   }
}