1 /*
2 * Copyright © 2021 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <list>
25 #include <vector>
26 #include "brw_compiler.h"
27 #include "brw_fs.h"
28 #include "brw_builder.h"
29 #include "brw_generator.h"
30 #include "brw_nir.h"
31 #include "brw_private.h"
32 #include "compiler/nir/nir_builder.h"
33 #include "dev/intel_debug.h"
34
35 #include <memory>
36
37 using namespace brw;
38
39 static bool
brw_nir_lower_load_uniforms_filter(const nir_instr * instr,UNUSED const void * data)40 brw_nir_lower_load_uniforms_filter(const nir_instr *instr,
41 UNUSED const void *data)
42 {
43 if (instr->type != nir_instr_type_intrinsic)
44 return false;
45 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
46 return intrin->intrinsic == nir_intrinsic_load_uniform;
47 }
48
49 static nir_def *
brw_nir_lower_load_uniforms_impl(nir_builder * b,nir_instr * instr,void * data)50 brw_nir_lower_load_uniforms_impl(nir_builder *b, nir_instr *instr,
51 void *data)
52 {
53 assert(instr->type == nir_instr_type_intrinsic);
54 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
55 assert(intrin->intrinsic == nir_intrinsic_load_uniform);
56
57 /* Use the first few bytes of InlineData as push constants. */
58 if (nir_src_is_const(intrin->src[0])) {
59 int offset =
60 BRW_TASK_MESH_PUSH_CONSTANTS_START_DW * 4 +
61 nir_intrinsic_base(intrin) + nir_src_as_uint(intrin->src[0]);
62 int range = intrin->def.num_components * intrin->def.bit_size / 8;
63 if ((offset + range) <= (int)(BRW_TASK_MESH_INLINE_DATA_SIZE_DW * 4)) {
64 return nir_load_inline_data_intel(b,
65 intrin->def.num_components,
66 intrin->def.bit_size,
67 .base = offset);
68 }
69 }
70
71 return brw_nir_load_global_const(b, intrin,
72 nir_load_inline_data_intel(b, 1, 64, 0), 0);
73 }
74
75 static bool
brw_nir_lower_load_uniforms(nir_shader * nir,const struct intel_device_info * devinfo)76 brw_nir_lower_load_uniforms(nir_shader *nir,
77 const struct intel_device_info *devinfo)
78 {
79 return nir_shader_lower_instructions(nir, brw_nir_lower_load_uniforms_filter,
80 brw_nir_lower_load_uniforms_impl,
81 (void *)devinfo);
82 }
83
84 static inline int
type_size_scalar_dwords(const struct glsl_type * type,bool bindless)85 type_size_scalar_dwords(const struct glsl_type *type, bool bindless)
86 {
87 return glsl_count_dword_slots(type, bindless);
88 }
89
90 /* TODO(mesh): Make this a common function. */
91 static void
shared_type_info(const struct glsl_type * type,unsigned * size,unsigned * align)92 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
93 {
94 assert(glsl_type_is_vector_or_scalar(type));
95
96 uint32_t comp_size = glsl_type_is_boolean(type)
97 ? 4 : glsl_get_bit_size(type) / 8;
98 unsigned length = glsl_get_vector_elements(type);
99 *size = comp_size * length,
100 *align = comp_size * (length == 3 ? 4 : length);
101 }
102
/* Lower nir_intrinsic_launch_mesh_workgroups: have a single invocation
 * (local_invocation_index == 0) write the TUE header into the task
 * payload, then remove the intrinsic itself.
 */
static bool
brw_nir_lower_launch_mesh_workgroups_instr(nir_builder *b,
                                           nir_intrinsic_instr *intrin,
                                           void *data)
{
   if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups)
      return false;

   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *local_invocation_index = nir_load_local_invocation_index(b);

   /* Make sure that the mesh workgroup size is taken from the first invocation
    * (nir_intrinsic_launch_mesh_workgroups requirement)
    */
   nir_def *cmp = nir_ieq_imm(b, local_invocation_index, 0);
   nir_if *if_stmt = nir_push_if(b, cmp);
   {
      /* TUE header contains 4 words:
       *
       * - Word 0 for Task Count.
       *
       * - Words 1-3 used for "Dispatch Dimensions" feature, to allow mapping a
       *   3D dispatch into the 1D dispatch supported by HW.
       */
      /* src[0] is the 3-component workgroup count; the task count is the
       * product of its x, y and z channels.
       */
      nir_def *x = nir_channel(b, intrin->src[0].ssa, 0);
      nir_def *y = nir_channel(b, intrin->src[0].ssa, 1);
      nir_def *z = nir_channel(b, intrin->src[0].ssa, 2);
      nir_def *task_count = nir_imul(b, x, nir_imul(b, y, z));
      nir_def *tue_header = nir_vec4(b, task_count, x, y, z);
      nir_store_task_payload(b, tue_header, nir_imm_int(b, 0));
   }
   nir_pop_if(b, if_stmt);

   nir_instr_remove(&intrin->instr);

   return true;
}
141
142 static bool
brw_nir_lower_launch_mesh_workgroups(nir_shader * nir)143 brw_nir_lower_launch_mesh_workgroups(nir_shader *nir)
144 {
145 return nir_shader_intrinsics_pass(nir,
146 brw_nir_lower_launch_mesh_workgroups_instr,
147 nir_metadata_none,
148 NULL);
149 }
150
/* Lower task shader outputs to Task URB Entry (TUE) accesses and fill in
 * the TUE layout in `map`: header in the first 8 dwords, user data
 * starting at per_task_data_start_dw, total size padded to 8 dwords.
 */
static void
brw_nir_lower_tue_outputs(nir_shader *nir, brw_tue_map *map)
{
   memset(map, 0, sizeof(*map));

   NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
            type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);

   /* From bspec: "It is suggested that SW reserve the 16 bytes following the
    * TUE Header, and therefore start the SW-defined data structure at 32B
    * alignment.  This allows the TUE Header to always be written as 32 bytes
    * with 32B alignment, the most optimal write performance case."
    */
   map->per_task_data_start_dw = 8;

   /* Lowering to explicit types will start offsets from task_payload_size, so
    * set it to start after the header.
    */
   nir->info.task_payload_size = map->per_task_data_start_dw * 4;
   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
            nir_var_mem_task_payload, shared_type_info);
   NIR_PASS(_, nir, nir_lower_explicit_io,
            nir_var_mem_task_payload, nir_address_format_32bit_offset);

   /* Convert byte size to dwords and pad to an 8-dword boundary. */
   map->size_dw = ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8);
}
177
178 static void
brw_print_tue_map(FILE * fp,const struct brw_tue_map * map)179 brw_print_tue_map(FILE *fp, const struct brw_tue_map *map)
180 {
181 fprintf(fp, "TUE (%d dwords)\n\n", map->size_dw);
182 }
183
184 static bool
brw_nir_adjust_task_payload_offsets_instr(struct nir_builder * b,nir_intrinsic_instr * intrin,void * data)185 brw_nir_adjust_task_payload_offsets_instr(struct nir_builder *b,
186 nir_intrinsic_instr *intrin,
187 void *data)
188 {
189 switch (intrin->intrinsic) {
190 case nir_intrinsic_store_task_payload:
191 case nir_intrinsic_load_task_payload: {
192 nir_src *offset_src = nir_get_io_offset_src(intrin);
193
194 if (nir_src_is_const(*offset_src))
195 assert(nir_src_as_uint(*offset_src) % 4 == 0);
196
197 b->cursor = nir_before_instr(&intrin->instr);
198
199 /* Regular I/O uses dwords while explicit I/O used for task payload uses
200 * bytes. Normalize it to dwords.
201 *
202 * TODO(mesh): Figure out how to handle 8-bit, 16-bit.
203 */
204
205 nir_def *offset = nir_ishr_imm(b, offset_src->ssa, 2);
206 nir_src_rewrite(offset_src, offset);
207
208 unsigned base = nir_intrinsic_base(intrin);
209 assert(base % 4 == 0);
210 nir_intrinsic_set_base(intrin, base / 4);
211
212 return true;
213 }
214
215 default:
216 return false;
217 }
218 }
219
220 static bool
brw_nir_adjust_task_payload_offsets(nir_shader * nir)221 brw_nir_adjust_task_payload_offsets(nir_shader *nir)
222 {
223 return nir_shader_intrinsics_pass(nir,
224 brw_nir_adjust_task_payload_offsets_instr,
225 nir_metadata_control_flow,
226 NULL);
227 }
228
229 void
brw_nir_adjust_payload(nir_shader * shader)230 brw_nir_adjust_payload(nir_shader *shader)
231 {
232 /* Adjustment of task payload offsets must be performed *after* last pass
233 * which interprets them as bytes, because it changes their unit.
234 */
235 bool adjusted = false;
236 NIR_PASS(adjusted, shader, brw_nir_adjust_task_payload_offsets);
237 if (adjusted) /* clean up the mess created by offset adjustments */
238 NIR_PASS(_, shader, nir_opt_constant_folding);
239 }
240
241 static bool
brw_nir_align_launch_mesh_workgroups_instr(nir_builder * b,nir_intrinsic_instr * intrin,void * data)242 brw_nir_align_launch_mesh_workgroups_instr(nir_builder *b,
243 nir_intrinsic_instr *intrin,
244 void *data)
245 {
246 if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups)
247 return false;
248
249 /* nir_lower_task_shader uses "range" as task payload size. */
250 unsigned range = nir_intrinsic_range(intrin);
251 /* This will avoid special case in nir_lower_task_shader dealing with
252 * not vec4-aligned payload when payload_in_shared workaround is enabled.
253 */
254 nir_intrinsic_set_range(intrin, ALIGN(range, 16));
255
256 return true;
257 }
258
259 static bool
brw_nir_align_launch_mesh_workgroups(nir_shader * nir)260 brw_nir_align_launch_mesh_workgroups(nir_shader *nir)
261 {
262 return nir_shader_intrinsics_pass(nir,
263 brw_nir_align_launch_mesh_workgroups_instr,
264 nir_metadata_control_flow,
265 NULL);
266 }
267
/* Replace nir_intrinsic_set_vertex_and_primitive_count with a store of
 * the primitive count (src[1]) into the temporary local variable passed
 * via `data`; see brw_nir_lower_mesh_primitive_count.
 */
static bool
lower_set_vtx_and_prim_to_temp_write(nir_builder *b,
                                     nir_intrinsic_instr *intrin,
                                     void *data)
{
   if (intrin->intrinsic != nir_intrinsic_set_vertex_and_primitive_count)
      return false;

   /* Detect some cases of invalid primitive count. They might lead to URB
    * memory corruption, where workgroups overwrite each other output memory.
    */
   if (nir_src_is_const(intrin->src[1]) &&
       nir_src_as_uint(intrin->src[1]) > b->shader->info.mesh.max_primitives_out)
      unreachable("number of primitives bigger than max specified");

   /* Removing the intrinsic yields a cursor at its former position. */
   b->cursor = nir_instr_remove(&intrin->instr);

   nir_variable *temporary_primitive_count = (nir_variable *)data;
   nir_store_var(b, temporary_primitive_count, intrin->src[1].ssa, 0x1);

   return true;
}
290
/* Gather the shader's primitive count into a temporary local variable
 * (initialized to 0 at entry), then at the end of the shader have a single
 * invocation copy it to the VARYING_SLOT_PRIMITIVE_COUNT output.
 */
static bool
brw_nir_lower_mesh_primitive_count(nir_shader *nir)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_variable *temporary_primitive_count =
      nir_local_variable_create(impl,
                                glsl_uint_type(),
                                "__temp_primitive_count");

   /* Redirect all set_vertex_and_primitive_count intrinsics into the
    * temporary variable.
    */
   nir_shader_intrinsics_pass(nir,
                                lower_set_vtx_and_prim_to_temp_write,
                                nir_metadata_control_flow,
                                temporary_primitive_count);

   nir_builder _b = nir_builder_at(nir_before_impl(impl)), *b = &_b;

   /* Default to 0 primitives in case the shader never sets the count. */
   nir_store_var(b, temporary_primitive_count, nir_imm_int(b, 0), 0x1);

   b->cursor = nir_after_impl(impl);

   /* Have a single lane write the primitive count */
   nir_def *local_invocation_index = nir_load_local_invocation_index(b);
   nir_push_if(b, nir_ieq_imm(b, local_invocation_index, 0));
   {
      nir_variable *final_primitive_count =
         nir_create_variable_with_location(nir, nir_var_shader_out,
                                           VARYING_SLOT_PRIMITIVE_COUNT,
                                           glsl_uint_type());
      final_primitive_count->name = ralloc_strdup(final_primitive_count,
                                                  "gl_PrimitiveCountNV");
      final_primitive_count->data.interpolation = INTERP_MODE_NONE;

      nir_store_var(b, final_primitive_count,
                    nir_load_var(b, temporary_primitive_count), 0x1);
   }
   nir_pop_if(b, NULL);

   nir_metadata_preserve(impl, nir_metadata_none);

   nir->info.outputs_written |= VARYING_BIT_PRIMITIVE_COUNT;

   return true;
}
335
/* Emit a URB memory fence (followed by a scheduling fence on its result)
 * at the end of the program so URB writes are flushed before termination.
 */
static void
brw_emit_urb_fence(fs_visitor &s)
{
   /* Single-channel, exec-all builder: the fence is issued once. */
   const brw_builder bld1 = brw_builder(&s).at_end().exec_all().group(1, 0);
   brw_reg dst = bld1.vgrf(BRW_TYPE_UD);
   fs_inst *fence = bld1.emit(SHADER_OPCODE_MEMORY_FENCE, dst,
                              brw_vec8_grf(0, 0),
                              brw_imm_ud(true),
                              brw_imm_ud(0));
   fence->sfid = BRW_SFID_URB;
   /* The logical thing here would likely be a THREADGROUP fence but that's
    * still failing some tests like in dEQP-VK.mesh_shader.ext.query.*
    *
    * Gfx12.5 has a comment about this on BSpec 53533 :
    *
    *    "If fence scope is Local or Threadgroup, HW ignores the flush type
    *     and operates as if it was set to None (no flush)"
    *
    * Software workaround from HSD-22014129519 indicates that a GPU fence
    * resolves the issue.
    */
   fence->desc = lsc_fence_msg_desc(s.devinfo, LSC_FENCE_GPU,
                                    LSC_FLUSH_TYPE_NONE, true);

   /* Make later instructions wait on the fence completion. */
   bld1.emit(FS_OPCODE_SCHEDULING_FENCE, bld1.null_reg_ud(), &dst, 1);
}
362
/* Run the backend compilation pipeline for a task or mesh shader visitor:
 * NIR translation, URB fence + terminate, optimization, workarounds and
 * register allocation.  Returns false if any stage failed.
 */
static bool
run_task_mesh(fs_visitor &s, bool allow_spilling)
{
   assert(s.stage == MESA_SHADER_TASK ||
          s.stage == MESA_SHADER_MESH);

   s.payload_ = new task_mesh_thread_payload(s);

   nir_to_brw(&s);

   if (s.failed)
      return false;

   /* Flush URB writes before the thread ends. */
   brw_emit_urb_fence(s);

   s.emit_cs_terminate();

   brw_calculate_cfg(s);

   brw_optimize(s);

   s.assign_curb_setup();

   /* Hardware-workaround lowering must run before register allocation. */
   brw_lower_3src_null_dest(s);
   brw_workaround_memory_fence_before_eot(s);
   brw_workaround_emit_dummy_mov_instruction(s);

   brw_allocate_registers(s, allow_spilling);

   brw_workaround_source_arf_before_eot(s);

   return !s.failed;
}
396
/* Compile a task shader, trying multiple SIMD widths and picking the best
 * compiled variant.  Returns the generated assembly, or NULL on failure
 * (in which case params->base.error_str describes the errors).
 */
const unsigned *
brw_compile_task(const struct brw_compiler *compiler,
                 struct brw_compile_task_params *params)
{
   const struct intel_device_info *devinfo = compiler->devinfo;
   struct nir_shader *nir = params->base.nir;
   const struct brw_task_prog_key *key = params->key;
   struct brw_task_prog_data *prog_data = params->prog_data;
   const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TASK);

   /* Lay out the Task URB Entry written by this shader. */
   brw_nir_lower_tue_outputs(nir, &prog_data->map);

   NIR_PASS(_, nir, brw_nir_align_launch_mesh_workgroups);

   nir_lower_task_shader_options lower_ts_opt = {
      .payload_to_shared_for_atomics = true,
      .payload_to_shared_for_small_types = true,
      /* The actual payload data starts after the TUE header and padding,
       * so skip those when copying.
       */
      .payload_offset_in_bytes = prog_data->map.per_task_data_start_dw * 4,
   };
   NIR_PASS(_, nir, nir_lower_task_shader, lower_ts_opt);

   NIR_PASS(_, nir, brw_nir_lower_launch_mesh_workgroups);

   prog_data->base.base.stage = MESA_SHADER_TASK;
   prog_data->base.base.total_shared = nir->info.shared_size;
   prog_data->base.base.total_scratch = 0;

   prog_data->base.local_size[0] = nir->info.workgroup_size[0];
   prog_data->base.local_size[1] = nir->info.workgroup_size[1];
   prog_data->base.local_size[2] = nir->info.workgroup_size[2];

   prog_data->uses_drawid =
      BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);

   NIR_PASS(_, nir, brw_nir_lower_load_uniforms, compiler->devinfo);
   prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir);

   brw_simd_selection_state simd_state{
      .devinfo = compiler->devinfo,
      .prog_data = &prog_data->base,
      .required_width = brw_required_dispatch_width(&nir->info),
   };

   /* One visitor slot per SIMD width: v[0]=SIMD8, v[1]=SIMD16, v[2]=SIMD32. */
   std::unique_ptr<fs_visitor> v[3];

   for (unsigned i = 0; i < 3; i++) {
      /* Ver 30+ walks widths widest-first (2 - i: SIMD32, 16, 8). */
      const unsigned simd = devinfo->ver >= 30 ? 2 - i : i;

      if (!brw_simd_should_compile(simd_state, simd))
         continue;

      const unsigned dispatch_width = 8 << simd;

      /* Clone NIR so each SIMD variant gets its own copy to lower. */
      nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
      brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);

      NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);

      brw_postprocess_nir(shader, compiler, debug_enabled,
                          key->base.robust_flags);

      v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
                                             &key->base,
                                             &prog_data->base.base,
                                             shader, dispatch_width,
                                             params->base.stats != NULL,
                                             debug_enabled);

      /* Reuse uniform setup from the first already-compiled variant. */
      if (prog_data->base.prog_mask) {
         unsigned first = ffs(prog_data->base.prog_mask) - 1;
         v[simd]->import_uniforms(v[first].get());
      }

      /* Only allow spilling when no narrower variant remains possible. */
      const bool allow_spilling = simd == 0 ||
         (!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1));
      if (run_task_mesh(*v[simd], allow_spilling)) {
         brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);

         /* Widest-first search: stop at the first non-spilling result. */
         if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers)
            break;
      } else {
         simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
      }
   }

   int selected_simd = brw_simd_select(simd_state);
   if (selected_simd < 0) {
      params->base.error_str =
         ralloc_asprintf(params->base.mem_ctx,
                         "Can't compile shader: "
                         "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
                         simd_state.error[0], simd_state.error[1],
                         simd_state.error[2]);
      return NULL;
   }

   fs_visitor *selected = v[selected_simd].get();
   prog_data->base.prog_mask = 1 << selected_simd;
   prog_data->base.base.grf_used = MAX2(prog_data->base.base.grf_used,
                                        selected->grf_used);

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "Task Output ");
      brw_print_tue_map(stderr, &prog_data->map);
   }

   brw_generator g(compiler, &params->base, &prog_data->base.base,
                   MESA_SHADER_TASK);
   if (unlikely(debug_enabled)) {
      g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
                                     "%s task shader %s",
                                     nir->info.label ? nir->info.label
                                                     : "unnamed",
                                     nir->info.name));
   }

   g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
                   selected->performance_analysis.require(), params->base.stats);
   g.add_const_data(nir->constant_data, nir->constant_data_size);
   return g.get_assembly();
}
521
/* Lower a mesh shader's task-payload inputs using the TUE layout produced
 * when the paired task shader was compiled.  `map` may be NULL when there
 * is no task shader.
 */
static void
brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map)
{
   if (!map)
      return;

   /* Explicit-type lowering assigns offsets starting from
    * task_payload_size, so begin right after the TUE header.
    */
   nir->info.task_payload_size = map->per_task_data_start_dw * 4;

   bool progress = false;

   NIR_PASS(progress, nir, nir_lower_vars_to_explicit_types,
            nir_var_mem_task_payload, shared_type_info);

   if (progress) {
      /* The types for Task Output and Mesh Input should match, so their sizes
       * should also match.
       */
      assert(map->size_dw == ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8));
   } else {
      /* Mesh doesn't read any input, to make it clearer set the
       * task_payload_size to zero instead of keeping an incomplete size that
       * just includes the header.
       */
      nir->info.task_payload_size = 0;
   }

   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_task_payload,
            nir_address_format_32bit_offset);
}
551
/* Attribute types. Flat attributes have to be a separate class because
 * flat and interpolated attributes can't share the same vec4 slot
 * (see 3DSTATE_SBE.ConstantInterpolationEnable).
 */
enum {
   PRIM,      /* per primitive */
   VERT,      /* per vertex interpolated */
   VERT_FLAT, /* per vertex flat */
};

/* One output attribute (or a hole when location < 0) in the MUE layout. */
struct attr_desc {
   int location;                 /* varying slot, or -1 for a hole */
   const struct glsl_type *type; /* complete variable type (NULL for holes) */
   unsigned dwords;              /* total size in dwords */
   unsigned slots;               /* number of attribute slots occupied */
};

/* Per-attribute-class bookkeeping used by brw_compute_mue_layout. */
struct attr_type_info {
   /* order of attributes, negative values are holes */
   std::list<struct attr_desc> *order;

   /* attributes after which there's hole of size equal to array index */
   std::list<int> holes[5];
};
576
/* Record where `attr` lives in the MUE: fill start_dw/len_dw for each
 * attribute slot it occupies, starting at start_dw.  Array attributes
 * split their dwords evenly across slots; non-arrays take up to 4 dwords
 * per slot with the remainder in the last one.
 */
static void
brw_mue_assign_position(const struct attr_desc *attr,
                        struct brw_mue_map *map,
                        unsigned start_dw)
{
   bool is_array = glsl_type_is_array(attr->type);
   int location = attr->location;
   unsigned remaining = attr->dwords;

   for (unsigned slot = 0; slot < attr->slots; ++slot) {
      map->start_dw[location + slot] = start_dw;

      unsigned sz;

      if (is_array) {
         /* Arrays divide evenly across their slots. */
         assert(attr->dwords % attr->slots == 0);
         sz = attr->dwords / attr->slots;
      } else {
         sz = MIN2(remaining, 4);
      }

      map->len_dw[location + slot] = sz;
      start_dw += sz;
      remaining -= sz;
   }
}
603
604 static nir_variable *
brw_nir_find_complete_variable_with_location(nir_shader * shader,nir_variable_mode mode,int location)605 brw_nir_find_complete_variable_with_location(nir_shader *shader,
606 nir_variable_mode mode,
607 int location)
608 {
609 nir_variable *best_var = NULL;
610 unsigned last_size = 0;
611
612 nir_foreach_variable_with_modes(var, shader, mode) {
613 if (var->data.location != location)
614 continue;
615
616 unsigned new_size = glsl_count_dword_slots(var->type, false);
617 if (new_size > last_size) {
618 best_var = var;
619 last_size = new_size;
620 }
621 }
622
623 return best_var;
624 }
625
626 static unsigned
brw_sum_size(const std::list<struct attr_desc> & orders)627 brw_sum_size(const std::list<struct attr_desc> &orders)
628 {
629 unsigned sz = 0;
630 for (auto it = orders.cbegin(); it != orders.cend(); ++it)
631 sz += (*it).dwords;
632 return sz;
633 }
634
/* Finds order of outputs which require minimum size, without splitting
 * of URB read/write messages (which operate on vec4-aligned memory).
 *
 * Attributes are partitioned into three classes (PRIM, VERT, VERT_FLAT)
 * and, when MUE compaction is enabled, small attributes are packed into
 * existing vec4 holes tracked per hole size in attr_type_info::holes.
 * The pack_*_data_into_header flags may be cleared on output when packing
 * into the corresponding header turns out to be impossible or unprofitable.
 */
static void
brw_compute_mue_layout(const struct brw_compiler *compiler,
                       std::list<struct attr_desc> *orders,
                       uint64_t outputs_written,
                       struct nir_shader *nir,
                       bool *pack_prim_data_into_header,
                       bool *pack_vert_data_into_header)
{
   const struct shader_info *info = &nir->info;

   struct attr_type_info data[3];

   /* Header packing can be globally disabled per class via debug knobs. */
   if ((compiler->mesh.mue_header_packing & 1) == 0)
      *pack_prim_data_into_header = false;
   if ((compiler->mesh.mue_header_packing & 2) == 0)
      *pack_vert_data_into_header = false;

   for (unsigned i = PRIM; i <= VERT_FLAT; ++i)
      data[i].order = &orders[i];

   /* If packing into header is enabled, add a hole of size 4 and add
    * a virtual location to keep the algorithm happy (it expects holes
    * to be preceded by some location). We'll remove those virtual
    * locations at the end.
    */
   const gl_varying_slot virtual_header_location = VARYING_SLOT_POS;
   assert((outputs_written & BITFIELD64_BIT(virtual_header_location)) == 0);

   struct attr_desc d;
   d.location = virtual_header_location;
   d.type = NULL;
   d.dwords = 0;
   d.slots = 0;

   /* `h` describes a hole (location == -1). */
   struct attr_desc h;
   h.location = -1;
   h.type = NULL;
   h.dwords = 4;
   h.slots = 0;

   if (*pack_prim_data_into_header) {
      orders[PRIM].push_back(d);
      orders[PRIM].push_back(h);
      data[PRIM].holes[4].push_back(virtual_header_location);
   }

   if (*pack_vert_data_into_header) {
      orders[VERT].push_back(d);
      orders[VERT].push_back(h);
      data[VERT].holes[4].push_back(virtual_header_location);
   }

   u_foreach_bit64(location, outputs_written) {
      /* outputs_written is modified inside the loop (multi-slot variables
       * clear their whole range), so skip already-consumed bits.
       */
      if ((BITFIELD64_BIT(location) & outputs_written) == 0)
         continue;

      /* At this point there are both complete and split variables as
       * outputs. We need the complete variable to compute the required
       * size.
       */
      nir_variable *var =
            brw_nir_find_complete_variable_with_location(nir,
                                                         nir_var_shader_out,
                                                         location);

      d.location = location;
      d.type = brw_nir_get_var_type(nir, var);
      d.dwords = glsl_count_dword_slots(d.type, false);
      d.slots = glsl_count_attribute_slots(d.type, false);

      struct attr_type_info *type_data;

      if (BITFIELD64_BIT(location) & info->per_primitive_outputs)
         type_data = &data[PRIM];
      else if (var->data.interpolation == INTERP_MODE_FLAT)
         type_data = &data[VERT_FLAT];
      else
         type_data = &data[VERT];

      std::list<struct attr_desc> *order = type_data->order;
      std::list<int> *holes = type_data->holes;

      outputs_written &= ~BITFIELD64_RANGE(location, d.slots);

      /* special case to use hole of size 4 */
      if (d.dwords == 4 && !holes[4].empty()) {
         holes[4].pop_back();

         assert(order->front().location == virtual_header_location);
         order->pop_front();

         assert(order->front().location == -1);
         assert(order->front().dwords == 4);
         order->front() = d;

         continue;
      }

      int mod = d.dwords % 4;
      if (mod == 0) {
         /* Already vec4-aligned: append with no hole. */
         order->push_back(d);
         continue;
      }

      h.location = -1;
      h.type = NULL;
      h.dwords = 4 - mod;
      h.slots = 0;

      if (!compiler->mesh.mue_compaction) {
         /* Compaction disabled: just pad to vec4, don't track the hole. */
         order->push_back(d);
         order->push_back(h);
         continue;
      }

      if (d.dwords > 4) {
         /* Large attributes always go at the end; remember the trailing
          * hole for later reuse.
          */
         order->push_back(d);
         order->push_back(h);
         holes[h.dwords].push_back(location);
         continue;
      }

      assert(d.dwords < 4);

      unsigned found = 0;
      /* try to find the smallest hole big enough to hold this attribute */
      for (unsigned sz = d.dwords; sz <= 4; sz++){
         if (!holes[sz].empty()) {
            found = sz;
            break;
         }
      }

      /* append at the end if not found */
      if (found == 0) {
         order->push_back(d);
         order->push_back(h);
         holes[h.dwords].push_back(location);

         continue;
      }

      assert(found <= 4);
      assert(!holes[found].empty());
      int after_loc = holes[found].back();
      holes[found].pop_back();

      bool inserted_back = false;

      /* Find the hole that follows after_loc and place `d` inside it. */
      for (auto it = order->begin(); it != order->end(); ++it) {
         if ((*it).location != after_loc)
            continue;

         ++it;
         /* must be a hole */
         assert((*it).location < 0);
         /* and it must be big enough */
         assert(d.dwords <= (*it).dwords);

         if (d.dwords == (*it).dwords) {
            /* exact size, just replace */
            *it = d;
         } else {
            /* inexact size, shrink hole */
            (*it).dwords -= d.dwords;
            /* and insert new attribute before it */
            order->insert(it, d);

            /* Insert shrunk hole in a spot so that the order of attributes
             * is preserved.
             */
            std::list<int> &hole_list = holes[(*it).dwords];
            std::list<int>::iterator insert_before = hole_list.end();

            for (auto it2 = hole_list.begin(); it2 != hole_list.end(); ++it2) {
               if ((*it2) >= (int)location) {
                  insert_before = it2;
                  break;
               }
            }

            hole_list.insert(insert_before, location);
         }

         inserted_back = true;
         break;
      }

      assert(inserted_back);
   }

   /* Remove the virtual header location (and its hole, if still unused);
    * packing into the primitive header only pays off when it keeps the
    * total size from spilling just past an 8-dword boundary.
    */
   if (*pack_prim_data_into_header) {
      if (orders[PRIM].front().location == virtual_header_location)
         orders[PRIM].pop_front();

      if (!data[PRIM].holes[4].empty()) {
         *pack_prim_data_into_header = false;

         assert(orders[PRIM].front().location == -1);
         assert(orders[PRIM].front().dwords == 4);
         orders[PRIM].pop_front();
      }

      if (*pack_prim_data_into_header) {
         unsigned sz = brw_sum_size(orders[PRIM]);

         if (sz % 8 == 0 || sz % 8 > 4)
            *pack_prim_data_into_header = false;
      }
   }

   /* Same cleanup/profitability check for the vertex header. */
   if (*pack_vert_data_into_header) {
      if (orders[VERT].front().location == virtual_header_location)
         orders[VERT].pop_front();

      if (!data[VERT].holes[4].empty()) {
         *pack_vert_data_into_header = false;

         assert(orders[VERT].front().location == -1);
         assert(orders[VERT].front().dwords == 4);
         orders[VERT].pop_front();
      }

      if (*pack_vert_data_into_header) {
         unsigned sz = brw_sum_size(orders[VERT]) +
                       brw_sum_size(orders[VERT_FLAT]);

         if (sz % 8 == 0 || sz % 8 > 4)
            *pack_vert_data_into_header = false;
      }
   }


   if (INTEL_DEBUG(DEBUG_MESH)) {
      fprintf(stderr, "MUE attribute order:\n");
      for (unsigned i = PRIM; i <= VERT_FLAT; ++i) {
         if (!orders[i].empty())
            fprintf(stderr, "%d: ", i);
         for (auto it = orders[i].cbegin(); it != orders[i].cend(); ++it) {
            fprintf(stderr, "%d(%d) ", (*it).location, (*it).dwords);
         }
         if (!orders[i].empty())
            fprintf(stderr, "\n");
      }
   }
}
884
885 /* Mesh URB Entry consists of an initial section
886 *
887 * - Primitive Count
888 * - Primitive Indices (from 0 to Max-1)
889 * - Padding to 32B if needed
890 *
891 * optionally followed by a section for per-primitive data,
892 * in which each primitive (from 0 to Max-1) gets
893 *
894 * - Primitive Header (e.g. ViewportIndex)
895 * - Primitive Custom Attributes
896 *
897 * then followed by a section for per-vertex data
898 *
899 * - Vertex Header (e.g. Position)
900 * - Vertex Custom Attributes
901 *
902 * Each per-element section has a pitch and a starting offset. All the
903 * individual attributes offsets in start_dw are considering the first entry
904 * of the section (i.e. where the Position for first vertex, or ViewportIndex
905 * for first primitive). Attributes for other elements are calculated using
906 * the pitch.
907 */
908 static void
brw_compute_mue_map(const struct brw_compiler * compiler,struct nir_shader * nir,struct brw_mue_map * map,enum brw_mesh_index_format index_format,bool compact_mue)909 brw_compute_mue_map(const struct brw_compiler *compiler,
910 struct nir_shader *nir, struct brw_mue_map *map,
911 enum brw_mesh_index_format index_format, bool compact_mue)
912 {
913 memset(map, 0, sizeof(*map));
914
915 memset(&map->start_dw[0], -1, sizeof(map->start_dw));
916 memset(&map->len_dw[0], 0, sizeof(map->len_dw));
917
918 unsigned vertices_per_primitive =
919 mesa_vertices_per_prim(nir->info.mesh.primitive_type);
920
921 map->max_primitives = nir->info.mesh.max_primitives_out;
922 map->max_vertices = nir->info.mesh.max_vertices_out;
923
924 uint64_t outputs_written = nir->info.outputs_written;
925
926 /* One dword for primitives count then K extra dwords for each primitive. */
927 switch (index_format) {
928 case BRW_INDEX_FORMAT_U32:
929 map->per_primitive_indices_dw = vertices_per_primitive;
930 break;
931 case BRW_INDEX_FORMAT_U888X:
932 map->per_primitive_indices_dw = 1;
933 break;
934 default:
935 unreachable("invalid index format");
936 }
937
938 map->per_primitive_start_dw = ALIGN(map->per_primitive_indices_dw *
939 map->max_primitives + 1, 8);
940
941 /* Assign initial section. */
942 if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
943 map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
944 map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 1;
945 outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
946 }
947 if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
948 map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
949 map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] =
950 map->per_primitive_indices_dw * map->max_primitives;
951 outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
952 }
953
954 const uint64_t per_primitive_header_bits =
955 BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
956 BITFIELD64_BIT(VARYING_SLOT_LAYER) |
957 BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
958 BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);
959
960 const uint64_t per_vertex_header_bits =
961 BITFIELD64_BIT(VARYING_SLOT_PSIZ) |
962 BITFIELD64_BIT(VARYING_SLOT_POS) |
963 BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0) |
964 BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
965
966 std::list<struct attr_desc> orders[3];
967 uint64_t regular_outputs = outputs_written &
968 ~(per_primitive_header_bits | per_vertex_header_bits);
969
970 /* packing into prim header is possible only if prim header is present */
971 map->user_data_in_primitive_header = compact_mue &&
972 (outputs_written & per_primitive_header_bits) != 0;
973
974 /* Packing into vert header is always possible, but we allow it only
975 * if full vec4 is available (so point size is not used) and there's
976 * nothing between it and normal vertex data (so no clip distances).
977 */
978 map->user_data_in_vertex_header = compact_mue &&
979 (outputs_written & per_vertex_header_bits) ==
980 BITFIELD64_BIT(VARYING_SLOT_POS);
981
982 if (outputs_written & per_primitive_header_bits) {
983 bool zero_layer_viewport = false;
984 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
985 map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] =
986 map->per_primitive_start_dw + 0;
987 map->len_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 1;
988 /* Wa_16020916187: force 0 writes to layer and viewport slots */
989 zero_layer_viewport =
990 intel_needs_workaround(compiler->devinfo, 16020916187);
991 }
992
993 if ((outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER)) ||
994 zero_layer_viewport) {
995 map->start_dw[VARYING_SLOT_LAYER] =
996 map->per_primitive_start_dw + 1; /* RTAIndex */
997 map->len_dw[VARYING_SLOT_LAYER] = 1;
998 }
999
1000 if ((outputs_written & BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)) ||
1001 zero_layer_viewport) {
1002 map->start_dw[VARYING_SLOT_VIEWPORT] =
1003 map->per_primitive_start_dw + 2;
1004 map->len_dw[VARYING_SLOT_VIEWPORT] = 1;
1005 }
1006
1007 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE)) {
1008 map->start_dw[VARYING_SLOT_CULL_PRIMITIVE] =
1009 map->per_primitive_start_dw + 3;
1010 map->len_dw[VARYING_SLOT_CULL_PRIMITIVE] = 1;
1011 }
1012
1013 map->per_primitive_header_size_dw = 8;
1014 outputs_written &= ~per_primitive_header_bits;
1015 } else {
1016 map->per_primitive_header_size_dw = 0;
1017 }
1018
1019 map->per_primitive_data_size_dw = 0;
1020
1021 /* For fast linked libraries, we can't pack the MUE, as the fragment shader
1022 * will be compiled without access to the MUE map and won't be able to find
1023 * out where everything is.
1024 * Instead, keep doing things as we did before the packing, just laying out
1025 * everything in varying order, which is how the FS will expect them.
1026 */
1027 if (compact_mue) {
1028 brw_compute_mue_layout(compiler, orders, regular_outputs, nir,
1029 &map->user_data_in_primitive_header,
1030 &map->user_data_in_vertex_header);
1031
1032 unsigned start_dw = map->per_primitive_start_dw;
1033 if (map->user_data_in_primitive_header)
1034 start_dw += 4; /* first 4 dwords are used */
1035 else
1036 start_dw += map->per_primitive_header_size_dw;
1037 unsigned header_used_dw = 0;
1038
1039 for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
1040 int location = (*it).location;
1041 if (location < 0) {
1042 start_dw += (*it).dwords;
1043 if (map->user_data_in_primitive_header && header_used_dw < 4)
1044 header_used_dw += (*it).dwords;
1045 else
1046 map->per_primitive_data_size_dw += (*it).dwords;
1047 assert(header_used_dw <= 4);
1048 continue;
1049 }
1050
1051 assert(map->start_dw[location] == -1);
1052
1053 assert(location == VARYING_SLOT_PRIMITIVE_ID ||
1054 location >= VARYING_SLOT_VAR0);
1055
1056 brw_mue_assign_position(&*it, map, start_dw);
1057
1058 start_dw += (*it).dwords;
1059 if (map->user_data_in_primitive_header && header_used_dw < 4)
1060 header_used_dw += (*it).dwords;
1061 else
1062 map->per_primitive_data_size_dw += (*it).dwords;
1063 assert(header_used_dw <= 4);
1064 outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
1065 }
1066 } else {
1067 unsigned start_dw = map->per_primitive_start_dw +
1068 map->per_primitive_header_size_dw;
1069
1070 uint64_t per_prim_outputs = outputs_written & nir->info.per_primitive_outputs;
1071 while (per_prim_outputs) {
1072 uint64_t location = ffsll(per_prim_outputs) - 1;
1073
1074 assert(map->start_dw[location] == -1);
1075 assert(location == VARYING_SLOT_PRIMITIVE_ID ||
1076 location >= VARYING_SLOT_VAR0);
1077
1078 nir_variable *var =
1079 brw_nir_find_complete_variable_with_location(nir,
1080 nir_var_shader_out,
1081 location);
1082 struct attr_desc d;
1083 d.location = location;
1084 d.type = brw_nir_get_var_type(nir, var);
1085 d.dwords = glsl_count_dword_slots(d.type, false);
1086 d.slots = glsl_count_attribute_slots(d.type, false);
1087
1088 brw_mue_assign_position(&d, map, start_dw);
1089
1090 map->per_primitive_data_size_dw += ALIGN(d.dwords, 4);
1091 start_dw += ALIGN(d.dwords, 4);
1092
1093 per_prim_outputs &= ~BITFIELD64_RANGE(location, d.slots);
1094 }
1095 }
1096
1097 map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
1098 map->per_primitive_data_size_dw, 8);
1099
1100 map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw +
1101 map->per_primitive_pitch_dw *
1102 map->max_primitives, 8);
1103
1104 /* TODO(mesh): Multiview. */
1105 unsigned fixed_header_size = 8;
1106 map->per_vertex_header_size_dw = ALIGN(fixed_header_size +
1107 nir->info.clip_distance_array_size +
1108 nir->info.cull_distance_array_size, 8);
1109
1110 if (outputs_written & per_vertex_header_bits) {
1111 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PSIZ)) {
1112 map->start_dw[VARYING_SLOT_PSIZ] = map->per_vertex_start_dw + 3;
1113 map->len_dw[VARYING_SLOT_PSIZ] = 1;
1114 }
1115
1116 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_POS)) {
1117 map->start_dw[VARYING_SLOT_POS] = map->per_vertex_start_dw + 4;
1118 map->len_dw[VARYING_SLOT_POS] = 4;
1119 }
1120
1121 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)) {
1122 map->start_dw[VARYING_SLOT_CLIP_DIST0] =
1123 map->per_vertex_start_dw + fixed_header_size + 0;
1124 map->len_dw[VARYING_SLOT_CLIP_DIST0] = 4;
1125 }
1126
1127 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)) {
1128 map->start_dw[VARYING_SLOT_CLIP_DIST1] =
1129 map->per_vertex_start_dw + fixed_header_size + 4;
1130 map->len_dw[VARYING_SLOT_CLIP_DIST1] = 4;
1131 }
1132
1133 outputs_written &= ~per_vertex_header_bits;
1134 }
1135
1136 /* cull distances should be lowered earlier */
1137 assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)));
1138 assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)));
1139
1140 map->per_vertex_data_size_dw = 0;
1141
1142 /* For fast linked libraries, we can't pack the MUE, as the fragment shader
1143 * will be compiled without access to the MUE map and won't be able to find
1144 * out where everything is.
1145 * Instead, keep doing things as we did before the packing, just laying out
1146 * everything in varying order, which is how the FS will expect them.
1147 */
1148 if (compact_mue) {
1149 unsigned start_dw = map->per_vertex_start_dw;
1150 if (!map->user_data_in_vertex_header)
1151 start_dw += map->per_vertex_header_size_dw;
1152
1153 unsigned header_used_dw = 0;
1154 for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
1155 for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
1156 int location = (*it).location;
1157 if (location < 0) {
1158 start_dw += (*it).dwords;
1159 if (map->user_data_in_vertex_header && header_used_dw < 4) {
1160 header_used_dw += (*it).dwords;
1161 assert(header_used_dw <= 4);
1162 if (header_used_dw == 4)
1163 start_dw += 4; /* jump over gl_position */
1164 } else {
1165 map->per_vertex_data_size_dw += (*it).dwords;
1166 }
1167 continue;
1168 }
1169
1170 assert(map->start_dw[location] == -1);
1171
1172 assert(location >= VARYING_SLOT_VAR0);
1173
1174 brw_mue_assign_position(&*it, map, start_dw);
1175
1176 start_dw += (*it).dwords;
1177 if (map->user_data_in_vertex_header && header_used_dw < 4) {
1178 header_used_dw += (*it).dwords;
1179 assert(header_used_dw <= 4);
1180 if (header_used_dw == 4)
1181 start_dw += 4; /* jump over gl_position */
1182 } else {
1183 map->per_vertex_data_size_dw += (*it).dwords;
1184 }
1185 outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
1186 }
1187 }
1188 } else {
1189 unsigned start_dw = map->per_vertex_start_dw +
1190 map->per_vertex_header_size_dw;
1191
1192 uint64_t per_vertex_outputs = outputs_written & ~nir->info.per_primitive_outputs;
1193 while (per_vertex_outputs) {
1194 uint64_t location = ffsll(per_vertex_outputs) - 1;
1195
1196 assert(map->start_dw[location] == -1);
1197 assert(location >= VARYING_SLOT_VAR0);
1198
1199 nir_variable *var =
1200 brw_nir_find_complete_variable_with_location(nir,
1201 nir_var_shader_out,
1202 location);
1203 struct attr_desc d;
1204 d.location = location;
1205 d.type = brw_nir_get_var_type(nir, var);
1206 d.dwords = glsl_count_dword_slots(d.type, false);
1207 d.slots = glsl_count_attribute_slots(d.type, false);
1208
1209 brw_mue_assign_position(&d, map, start_dw);
1210
1211 map->per_vertex_data_size_dw += ALIGN(d.dwords, 4);
1212 start_dw += ALIGN(d.dwords, 4);
1213
1214 per_vertex_outputs &= ~BITFIELD64_RANGE(location, d.slots);
1215 }
1216 }
1217
1218 map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw +
1219 map->per_vertex_data_size_dw, 8);
1220
1221 map->size_dw =
1222 map->per_vertex_start_dw + map->per_vertex_pitch_dw * map->max_vertices;
1223
1224 assert(map->size_dw % 8 == 0);
1225 }
1226
1227 static void
brw_print_mue_map(FILE * fp,const struct brw_mue_map * map,struct nir_shader * nir)1228 brw_print_mue_map(FILE *fp, const struct brw_mue_map *map, struct nir_shader *nir)
1229 {
1230 fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n",
1231 map->size_dw, map->max_primitives, map->max_vertices);
1232 fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_COUNT\n",
1233 map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT],
1234 map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] +
1235 map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] - 1);
1236 fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_INDICES\n",
1237 map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES],
1238 map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] +
1239 map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] - 1);
1240
1241 fprintf(fp, " ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n",
1242 map->per_primitive_start_dw,
1243 map->per_primitive_header_size_dw,
1244 map->per_primitive_data_size_dw,
1245 map->per_primitive_pitch_dw);
1246
1247 for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
1248 if (map->start_dw[i] < 0)
1249 continue;
1250
1251 const unsigned offset = map->start_dw[i];
1252 const unsigned len = map->len_dw[i];
1253
1254 if (offset < map->per_primitive_start_dw ||
1255 offset >= map->per_primitive_start_dw + map->per_primitive_pitch_dw)
1256 continue;
1257
1258 const char *name =
1259 gl_varying_slot_name_for_stage((gl_varying_slot)i,
1260 MESA_SHADER_MESH);
1261
1262 fprintf(fp, " <%4d, %4d>: %s (%d)\n", offset, offset + len - 1,
1263 name, i);
1264 }
1265
1266 fprintf(fp, " ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n",
1267 map->per_vertex_start_dw,
1268 map->per_vertex_header_size_dw,
1269 map->per_vertex_data_size_dw,
1270 map->per_vertex_pitch_dw);
1271
1272 for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
1273 if (map->start_dw[i] < 0)
1274 continue;
1275
1276 const unsigned offset = map->start_dw[i];
1277 const unsigned len = map->len_dw[i];
1278
1279 if (offset < map->per_vertex_start_dw ||
1280 offset >= map->per_vertex_start_dw + map->per_vertex_pitch_dw)
1281 continue;
1282
1283 nir_variable *var =
1284 nir_find_variable_with_location(nir, nir_var_shader_out, i);
1285 bool flat = var->data.interpolation == INTERP_MODE_FLAT;
1286
1287 const char *name =
1288 gl_varying_slot_name_for_stage((gl_varying_slot)i,
1289 MESA_SHADER_MESH);
1290
1291 fprintf(fp, " <%4d, %4d>: %s (%d)%s\n", offset, offset + len - 1,
1292 name, i, flat ? " (flat)" : "");
1293 }
1294
1295 fprintf(fp, "\n");
1296 }
1297
1298 static void
brw_nir_lower_mue_outputs(nir_shader * nir,const struct brw_mue_map * map)1299 brw_nir_lower_mue_outputs(nir_shader *nir, const struct brw_mue_map *map)
1300 {
1301 nir_foreach_shader_out_variable(var, nir) {
1302 int location = var->data.location;
1303 assert(location >= 0);
1304 assert(map->start_dw[location] != -1);
1305 var->data.driver_location = map->start_dw[location];
1306 }
1307
1308 NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
1309 type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);
1310 }
1311
/**
 * Emit code at the top of the mesh shader entrypoint that zeroes the first
 * 4 dwords of every MUE Primitive Header (Reserved, RTAIndex, ViewportIndex,
 * CullPrimitiveMask).
 *
 * Needed because when the Primitive Header is enabled the shader may not
 * write all of its fields, and the URB contents are otherwise undefined.
 * All invocations of a workgroup cooperate: each one zeroes the headers of
 * primitives whose index is congruent to its invocation index modulo the
 * workgroup size.
 */
static void
brw_nir_initialize_mue(nir_shader *nir,
                       const struct brw_mue_map *map,
                       unsigned dispatch_width)
{
   assert(map->per_primitive_header_size_dw > 0);

   nir_builder b;
   nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
   b = nir_builder_at(nir_before_impl(entrypoint));

   nir_def *dw_off = nir_imm_int(&b, 0);
   nir_def *zerovec = nir_imm_vec4(&b, 0, 0, 0, 0);

   /* TODO(mesh): can we write in bigger batches, generating fewer SENDs? */

   assert(!nir->info.workgroup_size_variable);
   const unsigned workgroup_size = nir->info.workgroup_size[0] *
                                   nir->info.workgroup_size[1] *
                                   nir->info.workgroup_size[2];

   /* Invocations from a single workgroup will cooperate in zeroing MUE. */

   /* How many prims each invocation needs to cover without checking its index? */
   unsigned prims_per_inv = map->max_primitives / workgroup_size;

   /* Zero first 4 dwords of MUE Primitive Header:
    * Reserved, RTAIndex, ViewportIndex, CullPrimitiveMask.
    */

   nir_def *local_invocation_index = nir_load_local_invocation_index(&b);

   /* Zero primitive headers distanced by workgroup_size, starting from
    * invocation index.
    */
   for (unsigned prim_in_inv = 0; prim_in_inv < prims_per_inv; ++prim_in_inv) {
      nir_def *prim = nir_iadd_imm(&b, local_invocation_index,
                                       prim_in_inv * workgroup_size);

      nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
                                     .base = (int)map->per_primitive_start_dw,
                                     .write_mask = WRITEMASK_XYZW,
                                     .component = 0,
                                     .src_type = nir_type_uint32);
   }

   /* How many prims are left? */
   unsigned remaining = map->max_primitives % workgroup_size;

   if (remaining) {
      /* Zero "remaining" primitive headers starting from the last one covered
       * by the loop above + workgroup_size.
       */
      nir_def *cmp = nir_ilt_imm(&b, local_invocation_index, remaining);
      nir_if *if_stmt = nir_push_if(&b, cmp);
      {
         nir_def *prim = nir_iadd_imm(&b, local_invocation_index,
                                          prims_per_inv * workgroup_size);

         nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
                                        .base = (int)map->per_primitive_start_dw,
                                        .write_mask = WRITEMASK_XYZW,
                                        .component = 0,
                                        .src_type = nir_type_uint32);
      }
      nir_pop_if(&b, if_stmt);
   }

   /* If there's more than one subgroup, then we need to wait for all of them
    * to finish initialization before we can proceed. Otherwise some subgroups
    * may start filling MUE before others finished initializing.
    */
   if (workgroup_size > dispatch_width) {
      nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP,
                  NIR_MEMORY_ACQ_REL, nir_var_shader_out);
   }

   /* The tail handling inserts an if-block, which invalidates all metadata;
    * the divergence-free loop-only path keeps control flow metadata intact.
    */
   if (remaining) {
      nir_metadata_preserve(entrypoint, nir_metadata_none);
   } else {
      nir_metadata_preserve(entrypoint, nir_metadata_control_flow);
   }
}
1395
1396 static void
brw_nir_adjust_offset(nir_builder * b,nir_intrinsic_instr * intrin,uint32_t pitch)1397 brw_nir_adjust_offset(nir_builder *b, nir_intrinsic_instr *intrin, uint32_t pitch)
1398 {
1399 nir_src *index_src = nir_get_io_arrayed_index_src(intrin);
1400 nir_src *offset_src = nir_get_io_offset_src(intrin);
1401
1402 b->cursor = nir_before_instr(&intrin->instr);
1403 nir_def *offset =
1404 nir_iadd(b,
1405 offset_src->ssa,
1406 nir_imul_imm(b, index_src->ssa, pitch));
1407 nir_src_rewrite(offset_src, offset);
1408 }
1409
1410 static bool
brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder * b,nir_intrinsic_instr * intrin,void * data)1411 brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder *b,
1412 nir_intrinsic_instr *intrin,
1413 void *data)
1414 {
1415 const struct brw_mue_map *map = (const struct brw_mue_map *) data;
1416
1417 /* Remap per_vertex and per_primitive offsets using the extra source and
1418 * the pitch.
1419 */
1420 switch (intrin->intrinsic) {
1421 case nir_intrinsic_load_per_vertex_output:
1422 case nir_intrinsic_store_per_vertex_output:
1423 brw_nir_adjust_offset(b, intrin, map->per_vertex_pitch_dw);
1424
1425 return true;
1426
1427 case nir_intrinsic_load_per_primitive_output:
1428 case nir_intrinsic_store_per_primitive_output: {
1429 struct nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
1430 uint32_t pitch;
1431 if (sem.location == VARYING_SLOT_PRIMITIVE_INDICES)
1432 pitch = map->per_primitive_indices_dw;
1433 else
1434 pitch = map->per_primitive_pitch_dw;
1435
1436 brw_nir_adjust_offset(b, intrin, pitch);
1437
1438 return true;
1439 }
1440
1441 default:
1442 return false;
1443 }
1444 }
1445
1446 static bool
brw_nir_adjust_offset_for_arrayed_indices(nir_shader * nir,const struct brw_mue_map * map)1447 brw_nir_adjust_offset_for_arrayed_indices(nir_shader *nir, const struct brw_mue_map *map)
1448 {
1449 return nir_shader_intrinsics_pass(nir,
1450 brw_nir_adjust_offset_for_arrayed_indices_instr,
1451 nir_metadata_control_flow,
1452 (void *)map);
1453 }
1454
/* State shared between brw_can_pack_primitive_indices() and
 * brw_pack_primitive_indices() when converting primitive indices from
 * U32 to the packed U888X format.
 */
struct index_packing_state {
   /* Vertices per output primitive (2 for lines, 3 for triangles). */
   unsigned vertices_per_primitive;
   /* The shader's original gl_PrimitiveIndicesEXT variable, or NULL if the
    * shader never declares it.
    */
   nir_variable *original_prim_indices;
   /* Replacement uint-array variable holding the byte-packed indices;
    * created by brw_pack_primitive_indices().
    */
   nir_variable *packed_prim_indices;
};
1460
/**
 * Decide whether the primitive indices of this mesh shader can be packed
 * into the U888X index format (one byte per vertex index, top byte unused).
 *
 * Fills @state with the per-primitive vertex count and the original index
 * variable.  Returns true only when every store to the index variable writes
 * a full vector through a simple var->array deref chain, so the pass can
 * mechanically rewrite it; any other access pattern bails out to U32.
 */
static bool
brw_can_pack_primitive_indices(nir_shader *nir, struct index_packing_state *state)
{
   /* can single index fit into one byte of U888X format? */
   if (nir->info.mesh.max_vertices_out > 255)
      return false;

   state->vertices_per_primitive =
         mesa_vertices_per_prim(nir->info.mesh.primitive_type);
   /* packing point indices doesn't help */
   if (state->vertices_per_primitive == 1)
      return false;

   state->original_prim_indices =
      nir_find_variable_with_location(nir,
                                      nir_var_shader_out,
                                      VARYING_SLOT_PRIMITIVE_INDICES);
   /* no indices = no changes to the shader, but it's still worth it,
    * because less URB space will be used
    */
   if (!state->original_prim_indices)
      return true;

   ASSERTED const struct glsl_type *type = state->original_prim_indices->type;
   assert(glsl_type_is_array(type));
   assert(glsl_type_is_vector(glsl_without_array(type)));
   assert(glsl_without_array(type)->vector_elements == state->vertices_per_primitive);

   nir_foreach_function_impl(impl, nir) {
      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

            if (intrin->intrinsic != nir_intrinsic_store_deref) {
               /* any unknown deref operation on primitive indices -> don't pack */
               unsigned num_srcs = nir_intrinsic_infos[intrin->intrinsic].num_srcs;
               for (unsigned i = 0; i < num_srcs; i++) {
                  nir_deref_instr *deref = nir_src_as_deref(intrin->src[i]);
                  if (!deref)
                     continue;
                  nir_variable *var = nir_deref_instr_get_variable(deref);

                  if (var == state->original_prim_indices)
                     return false;
               }

               continue;
            }

            nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
            if (!deref)
               continue;

            nir_variable *var = nir_deref_instr_get_variable(deref);
            if (var != state->original_prim_indices)
               continue;

            /* Only a direct indices[i] store is rewritable. */
            if (deref->deref_type != nir_deref_type_array)
               return false; /* unknown chain of derefs */

            nir_deref_instr *var_deref = nir_src_as_deref(deref->parent);
            if (!var_deref || var_deref->deref_type != nir_deref_type_var)
               return false; /* unknown chain of derefs */

            assert (var_deref->var == state->original_prim_indices);

            unsigned write_mask = nir_intrinsic_write_mask(intrin);

            /* If only some components are written, then we can't easily pack.
             * In theory we could, by loading current dword value, bitmasking
             * one byte and storing back the whole dword, but it would be slow
             * and could actually decrease performance. TODO: reevaluate this
             * once there will be something hitting this.
             */
            if (write_mask != BITFIELD_MASK(state->vertices_per_primitive))
               return false;
         }
      }
   }

   return true;
}
1546
1547 static bool
brw_pack_primitive_indices_instr(nir_builder * b,nir_intrinsic_instr * intrin,void * data)1548 brw_pack_primitive_indices_instr(nir_builder *b, nir_intrinsic_instr *intrin,
1549 void *data)
1550 {
1551 if (intrin->intrinsic != nir_intrinsic_store_deref)
1552 return false;
1553
1554 nir_deref_instr *array_deref = nir_src_as_deref(intrin->src[0]);
1555 if (!array_deref || array_deref->deref_type != nir_deref_type_array)
1556 return false;
1557
1558 nir_deref_instr *var_deref = nir_src_as_deref(array_deref->parent);
1559 if (!var_deref || var_deref->deref_type != nir_deref_type_var)
1560 return false;
1561
1562 struct index_packing_state *state =
1563 (struct index_packing_state *)data;
1564
1565 nir_variable *var = var_deref->var;
1566
1567 if (var != state->original_prim_indices)
1568 return false;
1569
1570 unsigned vertices_per_primitive = state->vertices_per_primitive;
1571
1572 b->cursor = nir_before_instr(&intrin->instr);
1573
1574 nir_deref_instr *new_var_deref =
1575 nir_build_deref_var(b, state->packed_prim_indices);
1576 nir_deref_instr *new_array_deref =
1577 nir_build_deref_array(b, new_var_deref, array_deref->arr.index.ssa);
1578
1579 nir_src *data_src = &intrin->src[1];
1580 nir_def *data_def =
1581 data_src->ssa;
1582
1583 nir_def *new_data =
1584 nir_ior(b, nir_ishl_imm(b, nir_channel(b, data_def, 0), 0),
1585 nir_ishl_imm(b, nir_channel(b, data_def, 1), 8));
1586
1587 if (vertices_per_primitive >= 3) {
1588 new_data =
1589 nir_ior(b, new_data,
1590 nir_ishl_imm(b, nir_channel(b, data_def, 2), 16));
1591 }
1592
1593 nir_build_store_deref(b, &new_array_deref->def, new_data);
1594
1595 nir_instr_remove(&intrin->instr);
1596
1597 return true;
1598 }
1599
1600 static bool
brw_pack_primitive_indices(nir_shader * nir,void * data)1601 brw_pack_primitive_indices(nir_shader *nir, void *data)
1602 {
1603 struct index_packing_state *state = (struct index_packing_state *)data;
1604
1605 const struct glsl_type *new_type =
1606 glsl_array_type(glsl_uint_type(),
1607 nir->info.mesh.max_primitives_out,
1608 0);
1609
1610 state->packed_prim_indices =
1611 nir_variable_create(nir, nir_var_shader_out,
1612 new_type, "gl_PrimitiveIndicesPacked");
1613 state->packed_prim_indices->data.location = VARYING_SLOT_PRIMITIVE_INDICES;
1614 state->packed_prim_indices->data.interpolation = INTERP_MODE_NONE;
1615 state->packed_prim_indices->data.per_primitive = 1;
1616
1617 return nir_shader_intrinsics_pass(nir, brw_pack_primitive_indices_instr,
1618 nir_metadata_control_flow,
1619 data);
1620 }
1621
1622 static bool
brw_mesh_autostrip_enable(const struct brw_compiler * compiler,struct nir_shader * nir,struct brw_mue_map * map)1623 brw_mesh_autostrip_enable(const struct brw_compiler *compiler, struct nir_shader *nir,
1624 struct brw_mue_map *map)
1625 {
1626 /* Auto-striping can be enabled when shader either doesn't write to
1627 * RTA Index and VP Index or writes the same values for all primitives.
1628 * Since determining whether shader writes the same value across the whole
1629 * workgroup (not just subgroup!) is tricky, we do the simplest possible
1630 * thing - say yes only when shader writes const values and they all match.
1631 *
1632 * TODO: improve this
1633 */
1634
1635 if (compiler->devinfo->ver < 20)
1636 return false;
1637
1638 const uint64_t outputs_written = nir->info.outputs_written;
1639
1640 /* Wa_16020916187
1641 * We've allocated slots for layer/viewport in brw_compute_mue_map() if this
1642 * workaround is needed and will let brw_nir_initialize_mue() initialize
1643 * those to 0. The workaround also requires disabling autostrip.
1644 */
1645 if (intel_needs_workaround(compiler->devinfo, 16020916187) &&
1646 (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) & outputs_written))
1647 return false;
1648
1649 if (map->start_dw[VARYING_SLOT_VIEWPORT] < 0 &&
1650 map->start_dw[VARYING_SLOT_LAYER] < 0)
1651 return true;
1652
1653 nir_def *vp = NULL;
1654 nir_def *layer = NULL;
1655
1656 nir_foreach_function(function, nir) {
1657 if (!function->impl)
1658 continue;
1659
1660 nir_foreach_block(block, function->impl) {
1661 nir_foreach_instr(instr, block) {
1662 if (instr->type != nir_instr_type_intrinsic)
1663 continue;
1664
1665 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1666 if (intrin->intrinsic != nir_intrinsic_store_per_primitive_output)
1667 continue;
1668
1669 struct nir_io_semantics io = nir_intrinsic_io_semantics(intrin);
1670 bool is_vp = io.location == VARYING_SLOT_VIEWPORT;
1671 bool is_layer = io.location == VARYING_SLOT_LAYER;
1672 if (!is_vp && !is_layer)
1673 continue;
1674
1675 nir_src *src = &intrin->src[0];
1676
1677 if (!nir_src_is_const(*src))
1678 return false;
1679
1680 nir_def **cmp;
1681 if (is_vp)
1682 cmp = &vp;
1683 else
1684 cmp = &layer;
1685
1686 if (*cmp == NULL)
1687 *cmp = src->ssa;
1688 else if (*cmp != src->ssa)
1689 return false;
1690 }
1691 }
1692 }
1693
1694 return true;
1695 }
1696
/**
 * Compile a mesh shader: compute the MUE layout, run the mesh-specific NIR
 * lowering passes, compile one or more SIMD variants, select the best one,
 * and generate the final assembly.
 *
 * Returns the assembly (owned by params->base.mem_ctx), or NULL with
 * params->base.error_str set on failure.
 */
const unsigned *
brw_compile_mesh(const struct brw_compiler *compiler,
                 struct brw_compile_mesh_params *params)
{
   const struct intel_device_info *devinfo = compiler->devinfo;
   struct nir_shader *nir = params->base.nir;
   const struct brw_mesh_prog_key *key = params->key;
   struct brw_mesh_prog_data *prog_data = params->prog_data;
   const bool debug_enabled = brw_should_print_shader(nir, DEBUG_MESH);

   prog_data->base.base.stage = MESA_SHADER_MESH;
   prog_data->base.base.total_shared = nir->info.shared_size;
   prog_data->base.base.total_scratch = 0;

   prog_data->base.local_size[0] = nir->info.workgroup_size[0];
   prog_data->base.local_size[1] = nir->info.workgroup_size[1];
   prog_data->base.local_size[2] = nir->info.workgroup_size[2];

   /* Cull distances occupy the bits right above the clip distances. */
   prog_data->clip_distance_mask = (1 << nir->info.clip_distance_array_size) - 1;
   prog_data->cull_distance_mask =
         ((1 << nir->info.cull_distance_array_size) - 1) <<
          nir->info.clip_distance_array_size;
   prog_data->primitive_type = nir->info.mesh.primitive_type;

   /* Prefer the compact U888X index format when all index stores are
    * simple enough to rewrite; otherwise keep one dword per index.
    */
   struct index_packing_state index_packing_state = {};
   if (brw_can_pack_primitive_indices(nir, &index_packing_state)) {
      if (index_packing_state.original_prim_indices)
         NIR_PASS(_, nir, brw_pack_primitive_indices, &index_packing_state);
      prog_data->index_format = BRW_INDEX_FORMAT_U888X;
   } else {
      prog_data->index_format = BRW_INDEX_FORMAT_U32;
   }

   prog_data->uses_drawid =
      BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);

   NIR_PASS(_, nir, brw_nir_lower_mesh_primitive_count);
   NIR_PASS(_, nir, nir_opt_dce);
   NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_shader_out, NULL);

   brw_nir_lower_tue_inputs(nir, params->tue_map);

   brw_compute_mue_map(compiler, nir, &prog_data->map,
                       prog_data->index_format, key->compact_mue);
   brw_nir_lower_mue_outputs(nir, &prog_data->map);

   prog_data->autostrip_enable = brw_mesh_autostrip_enable(compiler, nir, &prog_data->map);

   NIR_PASS(_, nir, brw_nir_lower_load_uniforms, compiler->devinfo);
   prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir);

   brw_simd_selection_state simd_state{
      .devinfo = compiler->devinfo,
      .prog_data = &prog_data->base,
      .required_width = brw_required_dispatch_width(&nir->info),
   };

   std::unique_ptr<fs_visitor> v[3];

   /* Try SIMD8/16/32 variants; Xe3+ starts from the widest and stops early
    * if it doesn't spill.
    */
   for (unsigned i = 0; i < 3; i++) {
      const unsigned simd = devinfo->ver >= 30 ? 2 - i : i;

      if (!brw_simd_should_compile(simd_state, simd))
         continue;

      const unsigned dispatch_width = 8 << simd;

      /* Clone so each SIMD variant lowers its own copy of the shader. */
      nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);

      /*
       * When the Primitive Header is enabled, the shader may not generate
       * writes to all of its fields, so initialize everything.
       */
      if (prog_data->map.per_primitive_header_size_dw > 0)
         NIR_PASS_V(shader, brw_nir_initialize_mue, &prog_data->map, dispatch_width);

      brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);

      NIR_PASS(_, shader, brw_nir_adjust_offset_for_arrayed_indices, &prog_data->map);
      /* Load uniforms can do a better job for constants, so fold before it. */
      NIR_PASS(_, shader, nir_opt_constant_folding);

      NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);

      brw_postprocess_nir(shader, compiler, debug_enabled,
                          key->base.robust_flags);

      v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
                                             &key->base,
                                             &prog_data->base.base,
                                             shader, dispatch_width,
                                             params->base.stats != NULL,
                                             debug_enabled);

      /* Reuse uniform setup from the first successfully compiled variant. */
      if (prog_data->base.prog_mask) {
         unsigned first = ffs(prog_data->base.prog_mask) - 1;
         v[simd]->import_uniforms(v[first].get());
      }

      /* Only allow spilling if no narrower variant can still succeed. */
      const bool allow_spilling = simd == 0 ||
         (!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1));
      if (run_task_mesh(*v[simd], allow_spilling)) {
         brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);

         /* Xe3+: widest non-spilling variant wins, no need to try others. */
         if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers)
            break;
      } else {
         simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
      }
   }

   int selected_simd = brw_simd_select(simd_state);
   if (selected_simd < 0) {
      params->base.error_str =
         ralloc_asprintf(params->base.mem_ctx,
                         "Can't compile shader: "
                         "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
                         simd_state.error[0], simd_state.error[1],
                         simd_state.error[2]);
      return NULL;
   }

   fs_visitor *selected = v[selected_simd].get();
   prog_data->base.prog_mask = 1 << selected_simd;
   prog_data->base.base.grf_used = MAX2(prog_data->base.base.grf_used,
                                        selected->grf_used);

   if (unlikely(debug_enabled)) {
      if (params->tue_map) {
         fprintf(stderr, "Mesh Input ");
         brw_print_tue_map(stderr, params->tue_map);
      }
      fprintf(stderr, "Mesh Output ");
      brw_print_mue_map(stderr, &prog_data->map, nir);
   }

   brw_generator g(compiler, &params->base, &prog_data->base.base,
                   MESA_SHADER_MESH);
   if (unlikely(debug_enabled)) {
      g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
                                     "%s mesh shader %s",
                                     nir->info.label ? nir->info.label
                                                     : "unnamed",
                                     nir->info.name));
   }

   g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
                   selected->performance_analysis.require(), params->base.stats);
   g.add_const_data(nir->constant_data, nir->constant_data_size);
   return g.get_assembly();
}
1848