/*
 * Copyright © 2021 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <list>
#include <vector>
#include "brw_compiler.h"
#include "brw_fs.h"
#include "brw_builder.h"
#include "brw_generator.h"
#include "brw_nir.h"
#include "brw_private.h"
#include "compiler/nir/nir_builder.h"
#include "dev/intel_debug.h"

#include <memory>

using namespace brw;

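/* Lower nir_intrinsic_load_uniform for task/mesh shaders.  Loads with a
 * constant offset that fits in the inline-data push-constant window become
 * nir_load_inline_data_intel; everything else falls back to
 * brw_nir_load_global_const() using the 64-bit value read from the start of
 * the inline data as the base address.
 */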
static bool
brw_nir_lower_load_uniforms_filter(const nir_instr *instr,
                                   UNUSED const void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   return intrin->intrinsic == nir_intrinsic_load_uniform;
}

static nir_def *
brw_nir_lower_load_uniforms_impl(nir_builder *b, nir_instr *instr,
                                 void *data)
{
   assert(instr->type == nir_instr_type_intrinsic);
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   assert(intrin->intrinsic == nir_intrinsic_load_uniform);

   /* Use the first few bytes of InlineData as push constants. */
   if (nir_src_is_const(intrin->src[0])) {
      int offset =
         BRW_TASK_MESH_PUSH_CONSTANTS_START_DW * 4 +
         nir_intrinsic_base(intrin) + nir_src_as_uint(intrin->src[0]);
      int range = intrin->def.num_components * intrin->def.bit_size / 8;
      if ((offset + range) <= (int)(BRW_TASK_MESH_INLINE_DATA_SIZE_DW * 4)) {
         return nir_load_inline_data_intel(b,
                                           intrin->def.num_components,
                                           intrin->def.bit_size,
                                           .base = offset);
      }
   }

   return brw_nir_load_global_const(b, intrin,
                                    nir_load_inline_data_intel(b, 1, 64, 0), 0);
}

static bool
brw_nir_lower_load_uniforms(nir_shader *nir,
                            const struct intel_device_info *devinfo)
{
   return nir_shader_lower_instructions(nir, brw_nir_lower_load_uniforms_filter,
                                        brw_nir_lower_load_uniforms_impl,
                                        (void *)devinfo);
}

static inline int
type_size_scalar_dwords(const struct glsl_type *type, bool bindless)
{
   return glsl_count_dword_slots(type, bindless);
}

/* TODO(mesh): Make this a common function. */
static void
shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
   assert(glsl_type_is_vector_or_scalar(type));

   uint32_t comp_size = glsl_type_is_boolean(type)
      ? 4 : glsl_get_bit_size(type) / 8;
   unsigned length = glsl_get_vector_elements(type);
   *size = comp_size * length,
   *align = comp_size * (length == 3 ? 4 : length);
}

static bool
brw_nir_lower_launch_mesh_workgroups_instr(nir_builder *b,
                                           nir_intrinsic_instr *intrin,
                                           void *data)
{
   if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups)
      return false;

   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *local_invocation_index = nir_load_local_invocation_index(b);

   /* Make sure that the mesh workgroup size is taken from the first invocation
    * (nir_intrinsic_launch_mesh_workgroups requirement)
    */
   nir_def *cmp = nir_ieq_imm(b, local_invocation_index, 0);
   nir_if *if_stmt = nir_push_if(b, cmp);
   {
      /* TUE header contains 4 words:
       *
       * - Word 0 for Task Count.
       *
       * - Words 1-3 used for "Dispatch Dimensions" feature, to allow mapping a
       *   3D dispatch into the 1D dispatch supported by HW.
       */
      nir_def *x = nir_channel(b, intrin->src[0].ssa, 0);
      nir_def *y = nir_channel(b, intrin->src[0].ssa, 1);
      nir_def *z = nir_channel(b, intrin->src[0].ssa, 2);
      nir_def *task_count = nir_imul(b, x, nir_imul(b, y, z));
      nir_def *tue_header = nir_vec4(b, task_count, x, y, z);
      nir_store_task_payload(b, tue_header, nir_imm_int(b, 0));
   }
   nir_pop_if(b, if_stmt);

   nir_instr_remove(&intrin->instr);

   return true;
}

static bool
brw_nir_lower_launch_mesh_workgroups(nir_shader *nir)
{
   return nir_shader_intrinsics_pass(nir,
                                       brw_nir_lower_launch_mesh_workgroups_instr,
                                       nir_metadata_none,
                                       NULL);
}

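/* Lower task shader outputs (the task payload) to explicit dword offsets and
 * compute the size of the Task URB Entry (TUE).  Per-task data starts at
 * dword 8, after the TUE header and the reserved bytes described below.
 */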
static void
brw_nir_lower_tue_outputs(nir_shader *nir, brw_tue_map *map)
{
   memset(map, 0, sizeof(*map));

   NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
            type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);

   /* From bspec: "It is suggested that SW reserve the 16 bytes following the
    * TUE Header, and therefore start the SW-defined data structure at 32B
    * alignment.  This allows the TUE Header to always be written as 32 bytes
    * with 32B alignment, the most optimal write performance case."
    */
   map->per_task_data_start_dw = 8;

   /* Lowering to explicit types will start offsets from task_payload_size, so
    * set it to start after the header.
    */
   nir->info.task_payload_size = map->per_task_data_start_dw * 4;
   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
            nir_var_mem_task_payload, shared_type_info);
   NIR_PASS(_, nir, nir_lower_explicit_io,
            nir_var_mem_task_payload, nir_address_format_32bit_offset);

   map->size_dw = ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8);
}

static void
brw_print_tue_map(FILE *fp, const struct brw_tue_map *map)
{
   fprintf(fp, "TUE (%d dwords)\n\n", map->size_dw);
}

static bool
brw_nir_adjust_task_payload_offsets_instr(struct nir_builder *b,
                                          nir_intrinsic_instr *intrin,
                                          void *data)
{
   switch (intrin->intrinsic) {
   case nir_intrinsic_store_task_payload:
   case nir_intrinsic_load_task_payload: {
      nir_src *offset_src = nir_get_io_offset_src(intrin);

      if (nir_src_is_const(*offset_src))
         assert(nir_src_as_uint(*offset_src) % 4 == 0);

      b->cursor = nir_before_instr(&intrin->instr);

      /* Regular I/O uses dwords while explicit I/O used for task payload uses
       * bytes.  Normalize it to dwords.
       *
       * TODO(mesh): Figure out how to handle 8-bit, 16-bit.
       */

      nir_def *offset = nir_ishr_imm(b, offset_src->ssa, 2);
      nir_src_rewrite(offset_src, offset);

      unsigned base = nir_intrinsic_base(intrin);
      assert(base % 4 == 0);
      nir_intrinsic_set_base(intrin, base / 4);

      return true;
   }

   default:
      return false;
   }
}

static bool
brw_nir_adjust_task_payload_offsets(nir_shader *nir)
{
   return nir_shader_intrinsics_pass(nir,
                                       brw_nir_adjust_task_payload_offsets_instr,
                                       nir_metadata_control_flow,
                                       NULL);
}

void
brw_nir_adjust_payload(nir_shader *shader)
{
   /* Adjustment of task payload offsets must be performed *after* the last
    * pass which interprets them as bytes, because it changes their unit.
    */
   bool adjusted = false;
   NIR_PASS(adjusted, shader, brw_nir_adjust_task_payload_offsets);
   if (adjusted) /* clean up the mess created by offset adjustments */
      NIR_PASS(_, shader, nir_opt_constant_folding);
}

static bool
brw_nir_align_launch_mesh_workgroups_instr(nir_builder *b,
                                           nir_intrinsic_instr *intrin,
                                           void *data)
{
   if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups)
      return false;

   /* nir_lower_task_shader uses "range" as task payload size. */
   unsigned range = nir_intrinsic_range(intrin);
   /* This avoids a special case in nir_lower_task_shader for payloads that
    * are not vec4-aligned when the payload_in_shared workaround is enabled.
    */
   nir_intrinsic_set_range(intrin, ALIGN(range, 16));

   return true;
}

static bool
brw_nir_align_launch_mesh_workgroups(nir_shader *nir)
{
   return nir_shader_intrinsics_pass(nir,
                                       brw_nir_align_launch_mesh_workgroups_instr,
                                       nir_metadata_control_flow,
                                       NULL);
}

static bool
lower_set_vtx_and_prim_to_temp_write(nir_builder *b,
                                     nir_intrinsic_instr *intrin,
                                     void *data)
{
   if (intrin->intrinsic != nir_intrinsic_set_vertex_and_primitive_count)
      return false;

   /* Detect some cases of invalid primitive count. They might lead to URB
    * memory corruption, where workgroups overwrite each other's output memory.
    */
   if (nir_src_is_const(intrin->src[1]) &&
       nir_src_as_uint(intrin->src[1]) > b->shader->info.mesh.max_primitives_out)
      unreachable("number of primitives bigger than max specified");

   b->cursor = nir_instr_remove(&intrin->instr);

   nir_variable *temporary_primitive_count = (nir_variable *)data;
   nir_store_var(b, temporary_primitive_count, intrin->src[1].ssa, 0x1);

   return true;
}

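/* Replace nir_intrinsic_set_vertex_and_primitive_count with a store to a
 * temporary variable, then have a single invocation copy that value into
 * VARYING_SLOT_PRIMITIVE_COUNT at the end of the shader.
 */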
static bool
brw_nir_lower_mesh_primitive_count(nir_shader *nir)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_variable *temporary_primitive_count =
      nir_local_variable_create(impl,
                                glsl_uint_type(),
                                "__temp_primitive_count");

   nir_shader_intrinsics_pass(nir,
                              lower_set_vtx_and_prim_to_temp_write,
                              nir_metadata_control_flow,
                              temporary_primitive_count);

   nir_builder _b = nir_builder_at(nir_before_impl(impl)), *b = &_b;

   nir_store_var(b, temporary_primitive_count, nir_imm_int(b, 0), 0x1);

   b->cursor = nir_after_impl(impl);

   /* Have a single lane write the primitive count */
   nir_def *local_invocation_index = nir_load_local_invocation_index(b);
   nir_push_if(b, nir_ieq_imm(b, local_invocation_index, 0));
   {
      nir_variable *final_primitive_count =
         nir_create_variable_with_location(nir, nir_var_shader_out,
                                           VARYING_SLOT_PRIMITIVE_COUNT,
                                           glsl_uint_type());
      final_primitive_count->name = ralloc_strdup(final_primitive_count,
                                                  "gl_PrimitiveCountNV");
      final_primitive_count->data.interpolation = INTERP_MODE_NONE;

      nir_store_var(b, final_primitive_count,
                    nir_load_var(b, temporary_primitive_count), 0x1);
   }
   nir_pop_if(b, NULL);

   nir_metadata_preserve(impl, nir_metadata_none);

   nir->info.outputs_written |= VARYING_BIT_PRIMITIVE_COUNT;

   return true;
}

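/* Emit a memory fence targeting the URB, followed by a scheduling fence, at
 * the end of the shader, so URB writes are completed before the thread ends.
 */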
static void
brw_emit_urb_fence(fs_visitor &s)
{
   const brw_builder bld1 = brw_builder(&s).at_end().exec_all().group(1, 0);
   brw_reg dst = bld1.vgrf(BRW_TYPE_UD);
   fs_inst *fence = bld1.emit(SHADER_OPCODE_MEMORY_FENCE, dst,
                              brw_vec8_grf(0, 0),
                              brw_imm_ud(true),
                              brw_imm_ud(0));
   fence->sfid = BRW_SFID_URB;
   /* The logical thing here would likely be a THREADGROUP fence but that's
    * still failing some tests like in dEQP-VK.mesh_shader.ext.query.*
    *
    * Gfx12.5 has a comment about this on BSpec 53533 :
    *
    *    "If fence scope is Local or Threadgroup, HW ignores the flush type
    *     and operates as if it was set to None (no flush)"
    *
    * Software workaround from HSD-22014129519 indicates that a GPU fence
    * resolves the issue.
    */
   fence->desc = lsc_fence_msg_desc(s.devinfo, LSC_FENCE_GPU,
                                    LSC_FLUSH_TYPE_NONE, true);

   bld1.emit(FS_OPCODE_SCHEDULING_FENCE, bld1.null_reg_ud(), &dst, 1);
}

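/* Common backend compilation path shared by task and mesh shaders: translate
 * NIR, emit the final URB fence and thread terminate, then optimize and
 * allocate registers.
 */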
static bool
run_task_mesh(fs_visitor &s, bool allow_spilling)
{
   assert(s.stage == MESA_SHADER_TASK ||
          s.stage == MESA_SHADER_MESH);

   s.payload_ = new task_mesh_thread_payload(s);

   nir_to_brw(&s);

   if (s.failed)
      return false;

   brw_emit_urb_fence(s);

   s.emit_cs_terminate();

   brw_calculate_cfg(s);

   brw_optimize(s);

   s.assign_curb_setup();

   brw_lower_3src_null_dest(s);
   brw_workaround_memory_fence_before_eot(s);
   brw_workaround_emit_dummy_mov_instruction(s);

   brw_allocate_registers(s, allow_spilling);

   brw_workaround_source_arf_before_eot(s);

   return !s.failed;
}

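/* Compile a task shader.  The NIR is lowered, then compiled for one or more
 * SIMD widths; brw_simd_select() picks the variant that is actually used.
 */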
const unsigned *
brw_compile_task(const struct brw_compiler *compiler,
                 struct brw_compile_task_params *params)
{
   const struct intel_device_info *devinfo = compiler->devinfo;
   struct nir_shader *nir = params->base.nir;
   const struct brw_task_prog_key *key = params->key;
   struct brw_task_prog_data *prog_data = params->prog_data;
   const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TASK);

   brw_nir_lower_tue_outputs(nir, &prog_data->map);

   NIR_PASS(_, nir, brw_nir_align_launch_mesh_workgroups);

   nir_lower_task_shader_options lower_ts_opt = {
      .payload_to_shared_for_atomics = true,
      .payload_to_shared_for_small_types = true,
      /* The actual payload data starts after the TUE header and padding,
       * so skip those when copying.
       */
      .payload_offset_in_bytes = prog_data->map.per_task_data_start_dw * 4,
   };
   NIR_PASS(_, nir, nir_lower_task_shader, lower_ts_opt);

   NIR_PASS(_, nir, brw_nir_lower_launch_mesh_workgroups);

   prog_data->base.base.stage = MESA_SHADER_TASK;
   prog_data->base.base.total_shared = nir->info.shared_size;
   prog_data->base.base.total_scratch = 0;

   prog_data->base.local_size[0] = nir->info.workgroup_size[0];
   prog_data->base.local_size[1] = nir->info.workgroup_size[1];
   prog_data->base.local_size[2] = nir->info.workgroup_size[2];

   prog_data->uses_drawid =
      BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);

   NIR_PASS(_, nir, brw_nir_lower_load_uniforms, compiler->devinfo);
   prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir);

   brw_simd_selection_state simd_state{
      .devinfo = compiler->devinfo,
      .prog_data = &prog_data->base,
      .required_width = brw_required_dispatch_width(&nir->info),
   };

   std::unique_ptr<fs_visitor> v[3];

   for (unsigned i = 0; i < 3; i++) {
      const unsigned simd = devinfo->ver >= 30 ? 2 - i : i;

      if (!brw_simd_should_compile(simd_state, simd))
         continue;

      const unsigned dispatch_width = 8 << simd;

      nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
      brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);

      NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);

      brw_postprocess_nir(shader, compiler, debug_enabled,
                          key->base.robust_flags);

      v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
                                             &key->base,
                                             &prog_data->base.base,
                                             shader, dispatch_width,
                                             params->base.stats != NULL,
                                             debug_enabled);

      if (prog_data->base.prog_mask) {
         unsigned first = ffs(prog_data->base.prog_mask) - 1;
         v[simd]->import_uniforms(v[first].get());
      }

      const bool allow_spilling = simd == 0 ||
         (!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1));
      if (run_task_mesh(*v[simd], allow_spilling)) {
         brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);

         if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers)
            break;
      } else {
         simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
      }
   }

   int selected_simd = brw_simd_select(simd_state);
   if (selected_simd < 0) {
      params->base.error_str =
         ralloc_asprintf(params->base.mem_ctx,
                         "Can't compile shader: "
                         "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
                         simd_state.error[0], simd_state.error[1],
                         simd_state.error[2]);
      return NULL;
   }

   fs_visitor *selected = v[selected_simd].get();
   prog_data->base.prog_mask = 1 << selected_simd;
   prog_data->base.base.grf_used = MAX2(prog_data->base.base.grf_used,
                                        selected->grf_used);

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "Task Output ");
      brw_print_tue_map(stderr, &prog_data->map);
   }

   brw_generator g(compiler, &params->base, &prog_data->base.base,
                  MESA_SHADER_TASK);
   if (unlikely(debug_enabled)) {
      g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
                                     "%s task shader %s",
                                     nir->info.label ? nir->info.label
                                                     : "unnamed",
                                     nir->info.name));
   }

   g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
                   selected->performance_analysis.require(), params->base.stats);
   g.add_const_data(nir->constant_data, nir->constant_data_size);
   return g.get_assembly();
}

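/* Mesh-side counterpart of brw_nir_lower_tue_outputs(): lower task payload
 * inputs to explicit offsets using the TUE map produced for the task shader.
 */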
static void
brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map)
{
   if (!map)
      return;

   nir->info.task_payload_size = map->per_task_data_start_dw * 4;

   bool progress = false;

   NIR_PASS(progress, nir, nir_lower_vars_to_explicit_types,
            nir_var_mem_task_payload, shared_type_info);

   if (progress) {
      /* The types for Task Output and Mesh Input should match, so their sizes
       * should also match.
       */
      assert(map->size_dw == ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8));
   } else {
      /* Mesh doesn't read any input; to make that clearer, set the
       * task_payload_size to zero instead of keeping an incomplete size that
       * just includes the header.
       */
      nir->info.task_payload_size = 0;
   }

   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_task_payload,
            nir_address_format_32bit_offset);
}

/* Attribute types. Flat attributes have to be a separate class because
 * flat and interpolated attributes can't share the same vec4 slot
 * (see 3DSTATE_SBE.ConstantInterpolationEnable).
 */
enum {
   PRIM, /* per primitive */
   VERT, /* per vertex interpolated */
   VERT_FLAT, /* per vertex flat */
};

struct attr_desc {
   int location;
   const struct glsl_type *type;
   unsigned dwords;
   unsigned slots;
};

struct attr_type_info {
   /* order of attributes, negative values are holes */
   std::list<struct attr_desc> *order;

   /* attributes after which there's a hole of size equal to the array index */
   std::list<int> holes[5];
};

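/* Record, for every varying slot covered by an attribute, its start offset
 * and length in dwords within the per-primitive or per-vertex section.
 */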
static void
brw_mue_assign_position(const struct attr_desc *attr,
                        struct brw_mue_map *map,
                        unsigned start_dw)
{
   bool is_array = glsl_type_is_array(attr->type);
   int location = attr->location;
   unsigned remaining = attr->dwords;

   for (unsigned slot = 0; slot < attr->slots; ++slot) {
      map->start_dw[location + slot] = start_dw;

      unsigned sz;

      if (is_array) {
         assert(attr->dwords % attr->slots == 0);
         sz = attr->dwords / attr->slots;
      } else {
         sz = MIN2(remaining, 4);
      }

      map->len_dw[location + slot] = sz;
      start_dw += sz;
      remaining -= sz;
   }
}

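/* Return the largest variable at the given location.  The shader may contain
 * both complete and split variables for the same slot; the complete one is
 * needed to compute the required size.
 */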
static nir_variable *
brw_nir_find_complete_variable_with_location(nir_shader *shader,
                                             nir_variable_mode mode,
                                             int location)
{
   nir_variable *best_var = NULL;
   unsigned last_size = 0;

   nir_foreach_variable_with_modes(var, shader, mode) {
      if (var->data.location != location)
         continue;

      unsigned new_size = glsl_count_dword_slots(var->type, false);
      if (new_size > last_size) {
         best_var = var;
         last_size = new_size;
      }
   }

   return best_var;
}

static unsigned
brw_sum_size(const std::list<struct attr_desc> &orders)
{
   unsigned sz = 0;
   for (auto it = orders.cbegin(); it != orders.cend(); ++it)
      sz += (*it).dwords;
   return sz;
}

/* Find an order of outputs that requires the minimum size, without splitting
 * URB read/write messages (which operate on vec4-aligned memory).
 */
static void
brw_compute_mue_layout(const struct brw_compiler *compiler,
                       std::list<struct attr_desc> *orders,
                       uint64_t outputs_written,
                       struct nir_shader *nir,
                       bool *pack_prim_data_into_header,
                       bool *pack_vert_data_into_header)
{
   const struct shader_info *info = &nir->info;

   struct attr_type_info data[3];

   if ((compiler->mesh.mue_header_packing & 1) == 0)
      *pack_prim_data_into_header = false;
   if ((compiler->mesh.mue_header_packing & 2) == 0)
      *pack_vert_data_into_header = false;

   for (unsigned i = PRIM; i <= VERT_FLAT; ++i)
      data[i].order = &orders[i];

   /* If packing into header is enabled, add a hole of size 4 and add
    * a virtual location to keep the algorithm happy (it expects holes
    * to be preceded by some location). We'll remove those virtual
    * locations at the end.
    */
   const gl_varying_slot virtual_header_location = VARYING_SLOT_POS;
   assert((outputs_written & BITFIELD64_BIT(virtual_header_location)) == 0);

   struct attr_desc d;
   d.location = virtual_header_location;
   d.type = NULL;
   d.dwords = 0;
   d.slots = 0;

   struct attr_desc h;
   h.location = -1;
   h.type = NULL;
   h.dwords = 4;
   h.slots = 0;

   if (*pack_prim_data_into_header) {
      orders[PRIM].push_back(d);
      orders[PRIM].push_back(h);
      data[PRIM].holes[4].push_back(virtual_header_location);
   }

   if (*pack_vert_data_into_header) {
      orders[VERT].push_back(d);
      orders[VERT].push_back(h);
      data[VERT].holes[4].push_back(virtual_header_location);
   }

   u_foreach_bit64(location, outputs_written) {
      if ((BITFIELD64_BIT(location) & outputs_written) == 0)
         continue;

      /* At this point there are both complete and split variables as
       * outputs. We need the complete variable to compute the required
       * size.
       */
      nir_variable *var =
            brw_nir_find_complete_variable_with_location(nir,
                                                         nir_var_shader_out,
                                                         location);

      d.location = location;
      d.type     = brw_nir_get_var_type(nir, var);
      d.dwords   = glsl_count_dword_slots(d.type, false);
      d.slots    = glsl_count_attribute_slots(d.type, false);

      struct attr_type_info *type_data;

      if (BITFIELD64_BIT(location) & info->per_primitive_outputs)
         type_data = &data[PRIM];
      else if (var->data.interpolation == INTERP_MODE_FLAT)
         type_data = &data[VERT_FLAT];
      else
         type_data = &data[VERT];

      std::list<struct attr_desc> *order = type_data->order;
      std::list<int> *holes = type_data->holes;

      outputs_written &= ~BITFIELD64_RANGE(location, d.slots);

      /* special case to use hole of size 4 */
      if (d.dwords == 4 && !holes[4].empty()) {
         holes[4].pop_back();

         assert(order->front().location == virtual_header_location);
         order->pop_front();

         assert(order->front().location == -1);
         assert(order->front().dwords == 4);
         order->front() = d;

         continue;
      }

      int mod = d.dwords % 4;
      if (mod == 0) {
         order->push_back(d);
         continue;
      }

      h.location = -1;
      h.type = NULL;
      h.dwords = 4 - mod;
      h.slots = 0;

      if (!compiler->mesh.mue_compaction) {
         order->push_back(d);
         order->push_back(h);
         continue;
      }

      if (d.dwords > 4) {
         order->push_back(d);
         order->push_back(h);
         holes[h.dwords].push_back(location);
         continue;
      }

      assert(d.dwords < 4);

      unsigned found = 0;
      /* try to find the smallest hole big enough to hold this attribute */
      for (unsigned sz = d.dwords; sz <= 4; sz++){
         if (!holes[sz].empty()) {
            found = sz;
            break;
         }
      }

      /* append at the end if not found */
      if (found == 0) {
         order->push_back(d);
         order->push_back(h);
         holes[h.dwords].push_back(location);

         continue;
      }

      assert(found <= 4);
      assert(!holes[found].empty());
      int after_loc = holes[found].back();
      holes[found].pop_back();

      bool inserted_back = false;

      for (auto it = order->begin(); it != order->end(); ++it) {
         if ((*it).location != after_loc)
            continue;

         ++it;
         /* must be a hole */
         assert((*it).location < 0);
         /* and it must be big enough */
         assert(d.dwords <= (*it).dwords);

         if (d.dwords == (*it).dwords) {
            /* exact size, just replace */
            *it = d;
         } else {
            /* inexact size, shrink hole */
            (*it).dwords -= d.dwords;
            /* and insert new attribute before it */
            order->insert(it, d);

            /* Insert shrunk hole in a spot so that the order of attributes
             * is preserved.
             */
            std::list<int> &hole_list = holes[(*it).dwords];
            std::list<int>::iterator insert_before = hole_list.end();

            for (auto it2 = hole_list.begin(); it2 != hole_list.end(); ++it2) {
               if ((*it2) >= (int)location) {
                  insert_before = it2;
                  break;
               }
            }

            hole_list.insert(insert_before, location);
         }

         inserted_back = true;
         break;
      }

      assert(inserted_back);
   }

   if (*pack_prim_data_into_header) {
      if (orders[PRIM].front().location == virtual_header_location)
         orders[PRIM].pop_front();

      if (!data[PRIM].holes[4].empty()) {
         *pack_prim_data_into_header = false;

         assert(orders[PRIM].front().location == -1);
         assert(orders[PRIM].front().dwords == 4);
         orders[PRIM].pop_front();
      }

      if (*pack_prim_data_into_header) {
         unsigned sz = brw_sum_size(orders[PRIM]);

         if (sz % 8 == 0 || sz % 8 > 4)
            *pack_prim_data_into_header = false;
      }
   }

   if (*pack_vert_data_into_header) {
      if (orders[VERT].front().location == virtual_header_location)
         orders[VERT].pop_front();

      if (!data[VERT].holes[4].empty()) {
         *pack_vert_data_into_header = false;

         assert(orders[VERT].front().location == -1);
         assert(orders[VERT].front().dwords == 4);
         orders[VERT].pop_front();
      }

      if (*pack_vert_data_into_header) {
         unsigned sz = brw_sum_size(orders[VERT]) +
                       brw_sum_size(orders[VERT_FLAT]);

         if (sz % 8 == 0 || sz % 8 > 4)
            *pack_vert_data_into_header = false;
      }
   }


   if (INTEL_DEBUG(DEBUG_MESH)) {
      fprintf(stderr, "MUE attribute order:\n");
      for (unsigned i = PRIM; i <= VERT_FLAT; ++i) {
         if (!orders[i].empty())
            fprintf(stderr, "%d: ", i);
         for (auto it = orders[i].cbegin(); it != orders[i].cend(); ++it) {
            fprintf(stderr, "%d(%d) ", (*it).location, (*it).dwords);
         }
         if (!orders[i].empty())
            fprintf(stderr, "\n");
      }
   }
}

/* Mesh URB Entry consists of an initial section
 *
 *  - Primitive Count
 *  - Primitive Indices (from 0 to Max-1)
 *  - Padding to 32B if needed
 *
 * optionally followed by a section for per-primitive data,
 * in which each primitive (from 0 to Max-1) gets
 *
 *  - Primitive Header (e.g. ViewportIndex)
 *  - Primitive Custom Attributes
 *
 * then followed by a section for per-vertex data
 *
 *  - Vertex Header (e.g. Position)
 *  - Vertex Custom Attributes
 *
 * Each per-element section has a pitch and a starting offset.  All the
 * individual attribute offsets in start_dw are relative to the first entry
 * of the section (i.e. where the Position of the first vertex, or the
 * ViewportIndex of the first primitive, lives).  Attributes for other
 * elements are calculated using the pitch.
 */
static void
brw_compute_mue_map(const struct brw_compiler *compiler,
                    struct nir_shader *nir, struct brw_mue_map *map,
                    enum brw_mesh_index_format index_format, bool compact_mue)
{
   memset(map, 0, sizeof(*map));

   memset(&map->start_dw[0], -1, sizeof(map->start_dw));
   memset(&map->len_dw[0], 0, sizeof(map->len_dw));

   unsigned vertices_per_primitive =
      mesa_vertices_per_prim(nir->info.mesh.primitive_type);

   map->max_primitives = nir->info.mesh.max_primitives_out;
   map->max_vertices = nir->info.mesh.max_vertices_out;

   uint64_t outputs_written = nir->info.outputs_written;

   /* One dword for the primitive count, then K extra dwords for each primitive. */
   switch (index_format) {
   case BRW_INDEX_FORMAT_U32:
      map->per_primitive_indices_dw = vertices_per_primitive;
      break;
   case BRW_INDEX_FORMAT_U888X:
      map->per_primitive_indices_dw = 1;
      break;
   default:
      unreachable("invalid index format");
   }

   map->per_primitive_start_dw = ALIGN(map->per_primitive_indices_dw *
                                       map->max_primitives + 1, 8);

   /* Assign initial section. */
   if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
      map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
      map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 1;
      outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
   }
   if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
      map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
      map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] =
            map->per_primitive_indices_dw * map->max_primitives;
      outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
   }

   const uint64_t per_primitive_header_bits =
         BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
         BITFIELD64_BIT(VARYING_SLOT_LAYER) |
         BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
         BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);

   const uint64_t per_vertex_header_bits =
         BITFIELD64_BIT(VARYING_SLOT_PSIZ) |
         BITFIELD64_BIT(VARYING_SLOT_POS) |
         BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0) |
         BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);

   std::list<struct attr_desc> orders[3];
   uint64_t regular_outputs = outputs_written &
         ~(per_primitive_header_bits | per_vertex_header_bits);

   /* packing into prim header is possible only if prim header is present */
   map->user_data_in_primitive_header = compact_mue &&
         (outputs_written & per_primitive_header_bits) != 0;

   /* Packing into vert header is always possible, but we allow it only
    * if full vec4 is available (so point size is not used) and there's
    * nothing between it and normal vertex data (so no clip distances).
    */
   map->user_data_in_vertex_header = compact_mue &&
         (outputs_written & per_vertex_header_bits) ==
               BITFIELD64_BIT(VARYING_SLOT_POS);

   if (outputs_written & per_primitive_header_bits) {
      bool zero_layer_viewport = false;
      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
         map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] =
               map->per_primitive_start_dw + 0;
         map->len_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 1;
         /* Wa_16020916187: force 0 writes to layer and viewport slots */
         zero_layer_viewport =
            intel_needs_workaround(compiler->devinfo, 16020916187);
      }

      if ((outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER)) ||
          zero_layer_viewport) {
         map->start_dw[VARYING_SLOT_LAYER] =
               map->per_primitive_start_dw + 1; /* RTAIndex */
         map->len_dw[VARYING_SLOT_LAYER] = 1;
      }

      if ((outputs_written & BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)) ||
          zero_layer_viewport) {
          map->start_dw[VARYING_SLOT_VIEWPORT] =
                map->per_primitive_start_dw + 2;
          map->len_dw[VARYING_SLOT_VIEWPORT] = 1;
      }

      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE)) {
         map->start_dw[VARYING_SLOT_CULL_PRIMITIVE] =
               map->per_primitive_start_dw + 3;
         map->len_dw[VARYING_SLOT_CULL_PRIMITIVE] = 1;
      }

      map->per_primitive_header_size_dw = 8;
      outputs_written &= ~per_primitive_header_bits;
   } else {
      map->per_primitive_header_size_dw = 0;
   }

   map->per_primitive_data_size_dw = 0;

   /* For fast linked libraries, we can't pack the MUE, as the fragment shader
    * will be compiled without access to the MUE map and won't be able to find
    * out where everything is.
    * Instead, keep doing things as we did before the packing, just laying out
    * everything in varying order, which is how the FS will expect them.
    */
   if (compact_mue) {
      brw_compute_mue_layout(compiler, orders, regular_outputs, nir,
                             &map->user_data_in_primitive_header,
                             &map->user_data_in_vertex_header);

      unsigned start_dw = map->per_primitive_start_dw;
      if (map->user_data_in_primitive_header)
         start_dw += 4; /* first 4 dwords are used */
      else
         start_dw += map->per_primitive_header_size_dw;
      unsigned header_used_dw = 0;

      for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
         int location = (*it).location;
         if (location < 0) {
            start_dw += (*it).dwords;
            if (map->user_data_in_primitive_header && header_used_dw < 4)
               header_used_dw += (*it).dwords;
            else
               map->per_primitive_data_size_dw += (*it).dwords;
            assert(header_used_dw <= 4);
            continue;
         }

         assert(map->start_dw[location] == -1);

         assert(location == VARYING_SLOT_PRIMITIVE_ID ||
                location >= VARYING_SLOT_VAR0);

         brw_mue_assign_position(&*it, map, start_dw);

         start_dw += (*it).dwords;
         if (map->user_data_in_primitive_header && header_used_dw < 4)
            header_used_dw += (*it).dwords;
         else
            map->per_primitive_data_size_dw += (*it).dwords;
         assert(header_used_dw <= 4);
         outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
      }
   } else {
      unsigned start_dw = map->per_primitive_start_dw +
                          map->per_primitive_header_size_dw;

      uint64_t per_prim_outputs = outputs_written & nir->info.per_primitive_outputs;
      while (per_prim_outputs) {
         uint64_t location = ffsll(per_prim_outputs) - 1;

         assert(map->start_dw[location] == -1);
         assert(location == VARYING_SLOT_PRIMITIVE_ID ||
                location >= VARYING_SLOT_VAR0);

         nir_variable *var =
            brw_nir_find_complete_variable_with_location(nir,
                                                         nir_var_shader_out,
                                                         location);
         struct attr_desc d;
         d.location = location;
         d.type     = brw_nir_get_var_type(nir, var);
         d.dwords   = glsl_count_dword_slots(d.type, false);
         d.slots    = glsl_count_attribute_slots(d.type, false);

         brw_mue_assign_position(&d, map, start_dw);

         map->per_primitive_data_size_dw += ALIGN(d.dwords, 4);
         start_dw += ALIGN(d.dwords, 4);

         per_prim_outputs &= ~BITFIELD64_RANGE(location, d.slots);
      }
   }

   map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
                                       map->per_primitive_data_size_dw, 8);

   map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw +
                                    map->per_primitive_pitch_dw *
                                    map->max_primitives, 8);

   /* TODO(mesh): Multiview. */
   unsigned fixed_header_size = 8;
   map->per_vertex_header_size_dw = ALIGN(fixed_header_size +
                                          nir->info.clip_distance_array_size +
                                          nir->info.cull_distance_array_size, 8);

   if (outputs_written & per_vertex_header_bits) {
      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PSIZ)) {
         map->start_dw[VARYING_SLOT_PSIZ] = map->per_vertex_start_dw + 3;
         map->len_dw[VARYING_SLOT_PSIZ] = 1;
      }

      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_POS)) {
         map->start_dw[VARYING_SLOT_POS] = map->per_vertex_start_dw + 4;
         map->len_dw[VARYING_SLOT_POS] = 4;
      }

      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)) {
         map->start_dw[VARYING_SLOT_CLIP_DIST0] =
               map->per_vertex_start_dw + fixed_header_size + 0;
         map->len_dw[VARYING_SLOT_CLIP_DIST0] = 4;
      }

      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)) {
         map->start_dw[VARYING_SLOT_CLIP_DIST1] =
               map->per_vertex_start_dw + fixed_header_size + 4;
         map->len_dw[VARYING_SLOT_CLIP_DIST1] = 4;
      }

      outputs_written &= ~per_vertex_header_bits;
   }

   /* cull distances should be lowered earlier */
   assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)));
   assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)));

   map->per_vertex_data_size_dw = 0;

   /* For fast linked libraries, we can't pack the MUE, as the fragment shader
    * will be compiled without access to the MUE map and won't be able to find
    * out where everything is.
    * Instead, keep doing things as we did before the packing, just laying out
    * everything in varying order, which is how the FS will expect them.
    */
   if (compact_mue) {
      unsigned start_dw = map->per_vertex_start_dw;
      if (!map->user_data_in_vertex_header)
         start_dw += map->per_vertex_header_size_dw;

      unsigned header_used_dw = 0;
      for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
         for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
            int location = (*it).location;
            if (location < 0) {
               start_dw += (*it).dwords;
               if (map->user_data_in_vertex_header && header_used_dw < 4) {
                  header_used_dw += (*it).dwords;
                  assert(header_used_dw <= 4);
                  if (header_used_dw == 4)
                     start_dw += 4; /* jump over gl_position */
               } else {
                  map->per_vertex_data_size_dw += (*it).dwords;
               }
               continue;
            }

            assert(map->start_dw[location] == -1);

            assert(location >= VARYING_SLOT_VAR0);

            brw_mue_assign_position(&*it, map, start_dw);

            start_dw += (*it).dwords;
            if (map->user_data_in_vertex_header && header_used_dw < 4) {
               header_used_dw += (*it).dwords;
               assert(header_used_dw <= 4);
               if (header_used_dw == 4)
                  start_dw += 4; /* jump over gl_position */
            } else {
               map->per_vertex_data_size_dw += (*it).dwords;
            }
            outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
         }
      }
   } else {
      unsigned start_dw = map->per_vertex_start_dw +
                          map->per_vertex_header_size_dw;

      uint64_t per_vertex_outputs = outputs_written & ~nir->info.per_primitive_outputs;
      while (per_vertex_outputs) {
         uint64_t location = ffsll(per_vertex_outputs) - 1;

         assert(map->start_dw[location] == -1);
         assert(location >= VARYING_SLOT_VAR0);

         nir_variable *var =
            brw_nir_find_complete_variable_with_location(nir,
                                                         nir_var_shader_out,
                                                         location);
         struct attr_desc d;
         d.location = location;
         d.type     = brw_nir_get_var_type(nir, var);
         d.dwords   = glsl_count_dword_slots(d.type, false);
         d.slots    = glsl_count_attribute_slots(d.type, false);

         brw_mue_assign_position(&d, map, start_dw);

         map->per_vertex_data_size_dw += ALIGN(d.dwords, 4);
         start_dw += ALIGN(d.dwords, 4);

         per_vertex_outputs &= ~BITFIELD64_RANGE(location, d.slots);
      }
   }

   map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw +
                                    map->per_vertex_data_size_dw, 8);

   map->size_dw =
      map->per_vertex_start_dw + map->per_vertex_pitch_dw * map->max_vertices;

   assert(map->size_dw % 8 == 0);
}

static void
brw_print_mue_map(FILE *fp, const struct brw_mue_map *map, struct nir_shader *nir)
{
   fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n",
           map->size_dw, map->max_primitives, map->max_vertices);
   fprintf(fp, "  <%4d, %4d>: VARYING_SLOT_PRIMITIVE_COUNT\n",
           map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT],
           map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] +
           map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] - 1);
   fprintf(fp, "  <%4d, %4d>: VARYING_SLOT_PRIMITIVE_INDICES\n",
           map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES],
           map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] +
           map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] - 1);

   fprintf(fp, "  ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n",
           map->per_primitive_start_dw,
           map->per_primitive_header_size_dw,
           map->per_primitive_data_size_dw,
           map->per_primitive_pitch_dw);

   for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
      if (map->start_dw[i] < 0)
         continue;

      const unsigned offset = map->start_dw[i];
      const unsigned len = map->len_dw[i];

      if (offset < map->per_primitive_start_dw ||
          offset >= map->per_primitive_start_dw + map->per_primitive_pitch_dw)
         continue;

      const char *name =
            gl_varying_slot_name_for_stage((gl_varying_slot)i,
                                           MESA_SHADER_MESH);

      fprintf(fp, "  <%4d, %4d>: %s (%d)\n", offset, offset + len - 1,
              name, i);
   }

   fprintf(fp, "  ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n",
           map->per_vertex_start_dw,
           map->per_vertex_header_size_dw,
           map->per_vertex_data_size_dw,
           map->per_vertex_pitch_dw);

   for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
      if (map->start_dw[i] < 0)
         continue;

      const unsigned offset = map->start_dw[i];
      const unsigned len = map->len_dw[i];

      if (offset < map->per_vertex_start_dw ||
          offset >= map->per_vertex_start_dw + map->per_vertex_pitch_dw)
         continue;

      nir_variable *var =
            nir_find_variable_with_location(nir, nir_var_shader_out, i);
      bool flat = var->data.interpolation == INTERP_MODE_FLAT;

      const char *name =
            gl_varying_slot_name_for_stage((gl_varying_slot)i,
                                           MESA_SHADER_MESH);

      fprintf(fp, "  <%4d, %4d>: %s (%d)%s\n", offset, offset + len - 1,
              name, i, flat ? " (flat)" : "");
   }

   fprintf(fp, "\n");
}

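/* Point each output variable at its MUE slot (driver_location taken from the
 * map) and lower output I/O to dword offsets.
 */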
static void
brw_nir_lower_mue_outputs(nir_shader *nir, const struct brw_mue_map *map)
{
   nir_foreach_shader_out_variable(var, nir) {
      int location = var->data.location;
      assert(location >= 0);
      assert(map->start_dw[location] != -1);
      var->data.driver_location = map->start_dw[location];
   }

   NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
            type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);
}

static void
brw_nir_initialize_mue(nir_shader *nir,
                       const struct brw_mue_map *map,
                       unsigned dispatch_width)
{
   assert(map->per_primitive_header_size_dw > 0);

   nir_builder b;
   nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
   b = nir_builder_at(nir_before_impl(entrypoint));

   nir_def *dw_off = nir_imm_int(&b, 0);
   nir_def *zerovec = nir_imm_vec4(&b, 0, 0, 0, 0);

   /* TODO(mesh): can we write in bigger batches, generating fewer SENDs? */

   assert(!nir->info.workgroup_size_variable);
   const unsigned workgroup_size = nir->info.workgroup_size[0] *
                                   nir->info.workgroup_size[1] *
                                   nir->info.workgroup_size[2];

   /* Invocations from a single workgroup will cooperate in zeroing MUE. */

   /* How many prims each invocation needs to cover without checking its index? */
   unsigned prims_per_inv = map->max_primitives / workgroup_size;

   /* Zero first 4 dwords of MUE Primitive Header:
    * Reserved, RTAIndex, ViewportIndex, CullPrimitiveMask.
    */

   nir_def *local_invocation_index = nir_load_local_invocation_index(&b);

   /* Zero primitive headers distanced by workgroup_size, starting from
    * invocation index.
    */
   for (unsigned prim_in_inv = 0; prim_in_inv < prims_per_inv; ++prim_in_inv) {
      nir_def *prim = nir_iadd_imm(&b, local_invocation_index,
                                           prim_in_inv * workgroup_size);

      nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
                                     .base = (int)map->per_primitive_start_dw,
                                     .write_mask = WRITEMASK_XYZW,
                                     .component = 0,
                                     .src_type = nir_type_uint32);
   }

   /* How many prims are left? */
   unsigned remaining = map->max_primitives % workgroup_size;

   if (remaining) {
      /* Zero "remaining" primitive headers starting from the last one covered
       * by the loop above + workgroup_size.
       */
      nir_def *cmp = nir_ilt_imm(&b, local_invocation_index, remaining);
      nir_if *if_stmt = nir_push_if(&b, cmp);
      {
         nir_def *prim = nir_iadd_imm(&b, local_invocation_index,
                                               prims_per_inv * workgroup_size);

         nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
                                        .base = (int)map->per_primitive_start_dw,
                                        .write_mask = WRITEMASK_XYZW,
                                        .component = 0,
                                        .src_type = nir_type_uint32);
      }
      nir_pop_if(&b, if_stmt);
   }

   /* If there's more than one subgroup, then we need to wait for all of them
    * to finish initialization before we can proceed. Otherwise some subgroups
    * may start filling the MUE before others have finished initializing.
    */
1384    if (workgroup_size > dispatch_width) {
1385       nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP,
1386                          NIR_MEMORY_ACQ_REL, nir_var_shader_out);
1387    }
1388 
1389    if (remaining) {
1390       nir_metadata_preserve(entrypoint, nir_metadata_none);
1391    } else {
1392       nir_metadata_preserve(entrypoint, nir_metadata_control_flow);
1393    }
1394 }
1395 
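/* Fold an arrayed (per-vertex / per-primitive) index into the I/O offset:
 * offset += arrayed_index * pitch, where pitch is the size in dwords of one
 * array element in the MUE.
 */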
1396 static void
1397 brw_nir_adjust_offset(nir_builder *b, nir_intrinsic_instr *intrin, uint32_t pitch)
1398 {
1399    nir_src *index_src = nir_get_io_arrayed_index_src(intrin);
1400    nir_src *offset_src = nir_get_io_offset_src(intrin);
1401 
1402    b->cursor = nir_before_instr(&intrin->instr);
1403    nir_def *offset =
1404       nir_iadd(b,
1405                offset_src->ssa,
1406                nir_imul_imm(b, index_src->ssa, pitch));
1407    nir_src_rewrite(offset_src, offset);
1408 }
1409 
1410 static bool
1411 brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder *b,
1412                                                 nir_intrinsic_instr *intrin,
1413                                                 void *data)
1414 {
1415    const struct brw_mue_map *map = (const struct brw_mue_map *) data;
1416 
1417    /* Remap per_vertex and per_primitive offsets using the extra source and
1418     * the pitch.
1419     */
1420    switch (intrin->intrinsic) {
1421    case nir_intrinsic_load_per_vertex_output:
1422    case nir_intrinsic_store_per_vertex_output:
1423       brw_nir_adjust_offset(b, intrin, map->per_vertex_pitch_dw);
1424 
1425       return true;
1426 
1427    case nir_intrinsic_load_per_primitive_output:
1428    case nir_intrinsic_store_per_primitive_output: {
1429       struct nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
1430       uint32_t pitch;
1431       if (sem.location == VARYING_SLOT_PRIMITIVE_INDICES)
1432          pitch = map->per_primitive_indices_dw;
1433       else
1434          pitch = map->per_primitive_pitch_dw;
1435 
1436       brw_nir_adjust_offset(b, intrin, pitch);
1437 
1438       return true;
1439    }
1440 
1441    default:
1442       return false;
1443    }
1444 }
1445 
1446 static bool
1447 brw_nir_adjust_offset_for_arrayed_indices(nir_shader *nir, const struct brw_mue_map *map)
1448 {
1449    return nir_shader_intrinsics_pass(nir,
1450                                        brw_nir_adjust_offset_for_arrayed_indices_instr,
1451                                        nir_metadata_control_flow,
1452                                        (void *)map);
1453 }
1454 
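/* Bookkeeping for packing primitive indices into the U888X format, where the
 * up-to-three 8-bit vertex indices of a primitive share a single dword (the
 * top byte is unused).
 */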
1455 struct index_packing_state {
1456    unsigned vertices_per_primitive;
1457    nir_variable *original_prim_indices;
1458    nir_variable *packed_prim_indices;
1459 };
1460 
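/* Packing is only possible when every index fits into a byte (at most 255
 * vertices out), the primitive isn't a point, and every access to the index
 * array is a store through a plain array deref that writes a whole
 * primitive's worth of components; anything else falls back to U32 indices.
 */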
1461 static bool
1462 brw_can_pack_primitive_indices(nir_shader *nir, struct index_packing_state *state)
1463 {
1464    /* Can a single index fit into one byte of the U888X format? */
1465    if (nir->info.mesh.max_vertices_out > 255)
1466       return false;
1467 
1468    state->vertices_per_primitive =
1469          mesa_vertices_per_prim(nir->info.mesh.primitive_type);
1470    /* packing point indices doesn't help */
1471    if (state->vertices_per_primitive == 1)
1472       return false;
1473 
1474    state->original_prim_indices =
1475       nir_find_variable_with_location(nir,
1476                                       nir_var_shader_out,
1477                                       VARYING_SLOT_PRIMITIVE_INDICES);
1478    /* No indices written means no changes to the shader, but packing is
1479     * still worth it, because less URB space will be used.
1480     */
1481    if (!state->original_prim_indices)
1482       return true;
1483 
1484    ASSERTED const struct glsl_type *type = state->original_prim_indices->type;
1485    assert(glsl_type_is_array(type));
1486    assert(glsl_type_is_vector(glsl_without_array(type)));
1487    assert(glsl_without_array(type)->vector_elements == state->vertices_per_primitive);
1488 
1489    nir_foreach_function_impl(impl, nir) {
1490       nir_foreach_block(block, impl) {
1491          nir_foreach_instr(instr, block) {
1492             if (instr->type != nir_instr_type_intrinsic)
1493                continue;
1494 
1495             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1496 
1497             if (intrin->intrinsic != nir_intrinsic_store_deref) {
1498                /* any unknown deref operation on primitive indices -> don't pack */
1499                unsigned num_srcs = nir_intrinsic_infos[intrin->intrinsic].num_srcs;
1500                for (unsigned i = 0; i < num_srcs; i++) {
1501                   nir_deref_instr *deref = nir_src_as_deref(intrin->src[i]);
1502                   if (!deref)
1503                      continue;
1504                   nir_variable *var = nir_deref_instr_get_variable(deref);
1505 
1506                   if (var == state->original_prim_indices)
1507                      return false;
1508                }
1509 
1510                continue;
1511             }
1512 
1513             nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
1514             if (!deref)
1515                continue;
1516 
1517             nir_variable *var = nir_deref_instr_get_variable(deref);
1518             if (var != state->original_prim_indices)
1519                continue;
1520 
1521             if (deref->deref_type != nir_deref_type_array)
1522                return false; /* unknown chain of derefs */
1523 
1524             nir_deref_instr *var_deref = nir_src_as_deref(deref->parent);
1525             if (!var_deref || var_deref->deref_type != nir_deref_type_var)
1526                return false; /* unknown chain of derefs */
1527 
1528             assert(var_deref->var == state->original_prim_indices);
1529 
1530             unsigned write_mask = nir_intrinsic_write_mask(intrin);
1531 
1532             /* If only some components are written, then we can't easily pack.
1533              * In theory we could, by loading the current dword value, masking
1534              * in one byte and storing back the whole dword, but it would be slow
1535              * and could actually decrease performance. TODO: reevaluate this
1536              * once something actually hits this path.
1537              */
1538             if (write_mask != BITFIELD_MASK(state->vertices_per_primitive))
1539                return false;
1540          }
1541       }
1542    }
1543 
1544    return true;
1545 }
1546 
1547 static bool
1548 brw_pack_primitive_indices_instr(nir_builder *b, nir_intrinsic_instr *intrin,
1549                                  void *data)
1550 {
1551    if (intrin->intrinsic != nir_intrinsic_store_deref)
1552       return false;
1553 
1554    nir_deref_instr *array_deref = nir_src_as_deref(intrin->src[0]);
1555    if (!array_deref || array_deref->deref_type != nir_deref_type_array)
1556       return false;
1557 
1558    nir_deref_instr *var_deref = nir_src_as_deref(array_deref->parent);
1559    if (!var_deref || var_deref->deref_type != nir_deref_type_var)
1560       return false;
1561 
1562    struct index_packing_state *state =
1563          (struct index_packing_state *)data;
1564 
1565    nir_variable *var = var_deref->var;
1566 
1567    if (var != state->original_prim_indices)
1568       return false;
1569 
1570    unsigned vertices_per_primitive = state->vertices_per_primitive;
1571 
1572    b->cursor = nir_before_instr(&intrin->instr);
1573 
1574    nir_deref_instr *new_var_deref =
1575          nir_build_deref_var(b, state->packed_prim_indices);
1576    nir_deref_instr *new_array_deref =
1577          nir_build_deref_array(b, new_var_deref, array_deref->arr.index.ssa);
1578 
1579    nir_src *data_src = &intrin->src[1];
1580    nir_def *data_def =
1581          data_src->ssa;
1582 
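   /* U888X layout: byte 0 = first index, byte 1 = second, byte 2 = third.
    * E.g. a triangle (2, 5, 7) is stored as the dword 0x00070502.
    */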
1583    nir_def *new_data =
1584          nir_ior(b, nir_ishl_imm(b, nir_channel(b, data_def, 0), 0),
1585                     nir_ishl_imm(b, nir_channel(b, data_def, 1), 8));
1586 
1587    if (vertices_per_primitive >= 3) {
1588       new_data =
1589             nir_ior(b, new_data,
1590                        nir_ishl_imm(b, nir_channel(b, data_def, 2), 16));
1591    }
1592 
1593    nir_build_store_deref(b, &new_array_deref->def, new_data);
1594 
1595    nir_instr_remove(&intrin->instr);
1596 
1597    return true;
1598 }
1599 
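/* Replace the primitive-indices output (an array of vectors) with a uint
 * array holding U888X-packed dwords and rewrite every store to use it.
 */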
1600 static bool
1601 brw_pack_primitive_indices(nir_shader *nir, void *data)
1602 {
1603    struct index_packing_state *state = (struct index_packing_state *)data;
1604 
1605    const struct glsl_type *new_type =
1606          glsl_array_type(glsl_uint_type(),
1607                          nir->info.mesh.max_primitives_out,
1608                          0);
1609 
1610    state->packed_prim_indices =
1611          nir_variable_create(nir, nir_var_shader_out,
1612                              new_type, "gl_PrimitiveIndicesPacked");
1613    state->packed_prim_indices->data.location = VARYING_SLOT_PRIMITIVE_INDICES;
1614    state->packed_prim_indices->data.interpolation = INTERP_MODE_NONE;
1615    state->packed_prim_indices->data.per_primitive = 1;
1616 
1617    return nir_shader_intrinsics_pass(nir, brw_pack_primitive_indices_instr,
1618                                        nir_metadata_control_flow,
1619                                        data);
1620 }
1621 
1622 static bool
1623 brw_mesh_autostrip_enable(const struct brw_compiler *compiler, struct nir_shader *nir,
1624                           struct brw_mue_map *map)
1625 {
1626    /* Auto-striping can be enabled when the shader either doesn't write to
1627     * RTA Index and VP Index, or writes the same values for all primitives.
1628     * Since determining whether the shader writes the same value across the
1629     * whole workgroup (not just a subgroup!) is tricky, we do the simplest
1630     * possible thing - say yes only when the writes are const and all match.
1631     *
1632     * TODO: improve this
1633     */
1634 
1635    if (compiler->devinfo->ver < 20)
1636       return false;
1637 
1638    const uint64_t outputs_written = nir->info.outputs_written;
1639 
1640    /* Wa_16020916187
1641     * We've allocated slots for layer/viewport in brw_compute_mue_map() if this
1642     * workaround is needed and will let brw_nir_initialize_mue() initialize
1643     * those to 0. The workaround also requires disabling autostrip.
1644     */
1645    if (intel_needs_workaround(compiler->devinfo, 16020916187) &&
1646        (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) & outputs_written))
1647        return false;
1648 
1649    if (map->start_dw[VARYING_SLOT_VIEWPORT] < 0 &&
1650        map->start_dw[VARYING_SLOT_LAYER] < 0)
1651       return true;
1652 
1653    nir_def *vp = NULL;
1654    nir_def *layer = NULL;
1655 
1656    nir_foreach_function(function, nir) {
1657       if (!function->impl)
1658          continue;
1659 
1660       nir_foreach_block(block, function->impl) {
1661          nir_foreach_instr(instr, block) {
1662             if (instr->type != nir_instr_type_intrinsic)
1663                continue;
1664 
1665             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1666             if (intrin->intrinsic != nir_intrinsic_store_per_primitive_output)
1667                continue;
1668 
1669             struct nir_io_semantics io = nir_intrinsic_io_semantics(intrin);
1670             bool is_vp = io.location == VARYING_SLOT_VIEWPORT;
1671             bool is_layer = io.location == VARYING_SLOT_LAYER;
1672             if (!is_vp && !is_layer)
1673                continue;
1674 
1675             nir_src *src = &intrin->src[0];
1676 
1677             if (!nir_src_is_const(*src))
1678                return false;
1679 
1680             nir_def **cmp;
1681             if (is_vp)
1682                cmp = &vp;
1683             else
1684                cmp = &layer;
1685 
1686             if (*cmp == NULL)
1687                *cmp = src->ssa;
1688             else if (*cmp != src->ssa)
1689                return false;
1690          }
1691       }
1692    }
1693 
1694    return true;
1695 }
1696 
1697 const unsigned *
1698 brw_compile_mesh(const struct brw_compiler *compiler,
1699                  struct brw_compile_mesh_params *params)
1700 {
1701    const struct intel_device_info *devinfo = compiler->devinfo;
1702    struct nir_shader *nir = params->base.nir;
1703    const struct brw_mesh_prog_key *key = params->key;
1704    struct brw_mesh_prog_data *prog_data = params->prog_data;
1705    const bool debug_enabled = brw_should_print_shader(nir, DEBUG_MESH);
1706 
1707    prog_data->base.base.stage = MESA_SHADER_MESH;
1708    prog_data->base.base.total_shared = nir->info.shared_size;
1709    prog_data->base.base.total_scratch = 0;
1710 
1711    prog_data->base.local_size[0] = nir->info.workgroup_size[0];
1712    prog_data->base.local_size[1] = nir->info.workgroup_size[1];
1713    prog_data->base.local_size[2] = nir->info.workgroup_size[2];
1714 
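   /* Cull distances sit directly above the clip distances, e.g. 2 clip + 3
    * cull distances give clip_distance_mask == 0b00011 and
    * cull_distance_mask == 0b11100.
    */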
1715    prog_data->clip_distance_mask = (1 << nir->info.clip_distance_array_size) - 1;
1716    prog_data->cull_distance_mask =
1717          ((1 << nir->info.cull_distance_array_size) - 1) <<
1718           nir->info.clip_distance_array_size;
1719    prog_data->primitive_type = nir->info.mesh.primitive_type;
1720 
1721    struct index_packing_state index_packing_state = {};
1722    if (brw_can_pack_primitive_indices(nir, &index_packing_state)) {
1723       if (index_packing_state.original_prim_indices)
1724          NIR_PASS(_, nir, brw_pack_primitive_indices, &index_packing_state);
1725       prog_data->index_format = BRW_INDEX_FORMAT_U888X;
1726    } else {
1727       prog_data->index_format = BRW_INDEX_FORMAT_U32;
1728    }
1729 
1730    prog_data->uses_drawid =
1731       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
1732 
1733    NIR_PASS(_, nir, brw_nir_lower_mesh_primitive_count);
1734    NIR_PASS(_, nir, nir_opt_dce);
1735    NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_shader_out, NULL);
1736 
1737    brw_nir_lower_tue_inputs(nir, params->tue_map);
1738 
1739    brw_compute_mue_map(compiler, nir, &prog_data->map,
1740                        prog_data->index_format, key->compact_mue);
1741    brw_nir_lower_mue_outputs(nir, &prog_data->map);
1742 
1743    prog_data->autostrip_enable = brw_mesh_autostrip_enable(compiler, nir, &prog_data->map);
1744 
1745    NIR_PASS(_, nir, brw_nir_lower_load_uniforms, compiler->devinfo);
1746    prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir);
1747 
1748    brw_simd_selection_state simd_state{
1749       .devinfo = compiler->devinfo,
1750       .prog_data = &prog_data->base,
1751       .required_width = brw_required_dispatch_width(&nir->info),
1752    };
1753 
1754    std::unique_ptr<fs_visitor> v[3];
1755 
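   /* simd 0/1/2 maps to dispatch widths 8/16/32. On Xe3+ start from the
    * widest width and stop early once a variant compiles without spilling;
    * on older platforms go from narrowest to widest.
    */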
1756    for (unsigned i = 0; i < 3; i++) {
1757       const unsigned simd = devinfo->ver >= 30 ? 2 - i : i;
1758 
1759       if (!brw_simd_should_compile(simd_state, simd))
1760          continue;
1761 
1762       const unsigned dispatch_width = 8 << simd;
1763 
1764       nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
1765 
1766       /*
1767        * When the Primitive Header is enabled, we may not generate writes to all
1768        * fields, so let's initialize everything.
1769        */
1770       if (prog_data->map.per_primitive_header_size_dw > 0)
1771          NIR_PASS_V(shader, brw_nir_initialize_mue, &prog_data->map, dispatch_width);
1772 
1773       brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);
1774 
1775       NIR_PASS(_, shader, brw_nir_adjust_offset_for_arrayed_indices, &prog_data->map);
1776       /* Load uniforms can do a better job for constants, so fold before it. */
1777       NIR_PASS(_, shader, nir_opt_constant_folding);
1778 
1779       NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
1780 
1781       brw_postprocess_nir(shader, compiler, debug_enabled,
1782                           key->base.robust_flags);
1783 
1784       v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
1785                                              &key->base,
1786                                              &prog_data->base.base,
1787                                              shader, dispatch_width,
1788                                              params->base.stats != NULL,
1789                                              debug_enabled);
1790 
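      /* If another SIMD variant already compiled, reuse its uniform set-up so
       * all variants share the same layout.
       */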
1791       if (prog_data->base.prog_mask) {
1792          unsigned first = ffs(prog_data->base.prog_mask) - 1;
1793          v[simd]->import_uniforms(v[first].get());
1794       }
1795 
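      /* Only allow spilling for the narrowest width, or when the next
       * narrower width was not compiled and never will be.
       */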
1796       const bool allow_spilling = simd == 0 ||
1797          (!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1));
1798       if (run_task_mesh(*v[simd], allow_spilling)) {
1799          brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
1800 
1801          if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers)
1802             break;
1803       } else {
1804          simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
1805       }
1806    }
1807 
1808    int selected_simd = brw_simd_select(simd_state);
1809    if (selected_simd < 0) {
1810       params->base.error_str =
1811          ralloc_asprintf(params->base.mem_ctx,
1812                          "Can't compile shader: "
1813                          "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
1814                          simd_state.error[0], simd_state.error[1],
1815                          simd_state.error[2]);
1816       return NULL;
1817    }
1818 
1819    fs_visitor *selected = v[selected_simd].get();
1820    prog_data->base.prog_mask = 1 << selected_simd;
1821    prog_data->base.base.grf_used = MAX2(prog_data->base.base.grf_used,
1822                                         selected->grf_used);
1823 
1824    if (unlikely(debug_enabled)) {
1825       if (params->tue_map) {
1826          fprintf(stderr, "Mesh Input ");
1827          brw_print_tue_map(stderr, params->tue_map);
1828       }
1829       fprintf(stderr, "Mesh Output ");
1830       brw_print_mue_map(stderr, &prog_data->map, nir);
1831    }
1832 
1833    brw_generator g(compiler, &params->base, &prog_data->base.base,
1834                   MESA_SHADER_MESH);
1835    if (unlikely(debug_enabled)) {
1836       g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
1837                                      "%s mesh shader %s",
1838                                      nir->info.label ? nir->info.label
1839                                                      : "unnamed",
1840                                      nir->info.name));
1841    }
1842 
1843    g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
1844                    selected->performance_analysis.require(), params->base.stats);
1845    g.add_const_data(nir->constant_data, nir->constant_data_size);
1846    return g.get_assembly();
1847 }
1848