1 /*
2 * Copyright © 2021 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <list>
25 #include <vector>
26 #include "brw_compiler.h"
27 #include "brw_fs.h"
28 #include "brw_nir.h"
29 #include "brw_private.h"
30 #include "compiler/nir/nir_builder.h"
31 #include "dev/intel_debug.h"
32
33 #include <memory>
34
35 using namespace brw;
36
37 static bool
brw_nir_lower_load_uniforms_filter(const nir_instr *instr,
                                   UNUSED const void *data)
40 {
41 if (instr->type != nir_instr_type_intrinsic)
42 return false;
43 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
44 return intrin->intrinsic == nir_intrinsic_load_uniform;
45 }
46
47 static nir_def *
brw_nir_lower_load_uniforms_impl(nir_builder *b, nir_instr *instr,
                                 UNUSED void *data)
50 {
51 assert(instr->type == nir_instr_type_intrinsic);
52 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
53 assert(intrin->intrinsic == nir_intrinsic_load_uniform);
54
55 /* Read the first few 32-bit scalars from InlineData. */
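   /* Offsets outside of that fixed push-constant window (or non-constant
    * offsets) fall back below to a load relative to the 64-bit address kept
    * in the first inline-data qword.
    */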
56 if (nir_src_is_const(intrin->src[0]) &&
57 intrin->def.bit_size == 32 &&
58 intrin->def.num_components == 1) {
59 unsigned off = nir_intrinsic_base(intrin) + nir_src_as_uint(intrin->src[0]);
60 unsigned off_dw = off / 4;
61 if (off % 4 == 0 && off_dw < BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW) {
62 off_dw += BRW_TASK_MESH_PUSH_CONSTANTS_START_DW;
63 return nir_load_mesh_inline_data_intel(b, 32, off_dw);
64 }
65 }
66
67 return brw_nir_load_global_const(b, intrin,
68 nir_load_mesh_inline_data_intel(b, 64, 0), 0);
69 }
70
71 static bool
brw_nir_lower_load_uniforms(nir_shader *nir)
73 {
74 return nir_shader_lower_instructions(nir, brw_nir_lower_load_uniforms_filter,
75 brw_nir_lower_load_uniforms_impl, NULL);
76 }
77
78 static inline int
type_size_scalar_dwords(const struct glsl_type *type, bool bindless)
80 {
81 return glsl_count_dword_slots(type, bindless);
82 }
83
84 /* TODO(mesh): Make this a common function. */
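/* E.g. a vec3 of 32-bit components reports size = 12 bytes but align = 16
 * bytes, since a 3-component vector is aligned like a vec4.
 */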
85 static void
shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
87 {
88 assert(glsl_type_is_vector_or_scalar(type));
89
90 uint32_t comp_size = glsl_type_is_boolean(type)
91 ? 4 : glsl_get_bit_size(type) / 8;
92 unsigned length = glsl_get_vector_elements(type);
93 *size = comp_size * length,
94 *align = comp_size * (length == 3 ? 4 : length);
95 }
96
97 static bool
brw_nir_lower_launch_mesh_workgroups_instr(nir_builder *b,
                                           nir_intrinsic_instr *intrin,
                                           void *data)
101 {
102 if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups)
103 return false;
104
105 b->cursor = nir_before_instr(&intrin->instr);
106
107 nir_def *local_invocation_index = nir_load_local_invocation_index(b);
108
109 /* Make sure that the mesh workgroup size is taken from the first invocation
110 * (nir_intrinsic_launch_mesh_workgroups requirement)
111 */
112 nir_def *cmp = nir_ieq_imm(b, local_invocation_index, 0);
113 nir_if *if_stmt = nir_push_if(b, cmp);
114 {
115 /* TUE header contains 4 words:
116 *
117 * - Word 0 for Task Count.
118 *
119 * - Words 1-3 used for "Dispatch Dimensions" feature, to allow mapping a
120 * 3D dispatch into the 1D dispatch supported by HW.
121 */
122 nir_def *x = nir_channel(b, intrin->src[0].ssa, 0);
123 nir_def *y = nir_channel(b, intrin->src[0].ssa, 1);
124 nir_def *z = nir_channel(b, intrin->src[0].ssa, 2);
125 nir_def *task_count = nir_imul(b, x, nir_imul(b, y, z));
126 nir_def *tue_header = nir_vec4(b, task_count, x, y, z);
127 nir_store_task_payload(b, tue_header, nir_imm_int(b, 0));
128 }
129 nir_pop_if(b, if_stmt);
130
131 nir_instr_remove(&intrin->instr);
132
133 return true;
134 }
135
136 static bool
brw_nir_lower_launch_mesh_workgroups(nir_shader *nir)
138 {
139 return nir_shader_intrinsics_pass(nir,
140 brw_nir_lower_launch_mesh_workgroups_instr,
141 nir_metadata_none,
142 NULL);
143 }
144
145 static void
brw_nir_lower_tue_outputs(nir_shader *nir, brw_tue_map *map)
147 {
148 memset(map, 0, sizeof(*map));
149
150 NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
151 type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);
152
153 /* From bspec: "It is suggested that SW reserve the 16 bytes following the
154 * TUE Header, and therefore start the SW-defined data structure at 32B
155 * alignment. This allows the TUE Header to always be written as 32 bytes
156 * with 32B alignment, the most optimal write performance case."
157 */
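   /* 16 bytes of TUE header plus the 16 reserved bytes give 8 dwords (32B)
    * before the SW-defined per-task data starts.
    */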
158 map->per_task_data_start_dw = 8;
159
160 /* Lowering to explicit types will start offsets from task_payload_size, so
161 * set it to start after the header.
162 */
163 nir->info.task_payload_size = map->per_task_data_start_dw * 4;
164 NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
165 nir_var_mem_task_payload, shared_type_info);
166 NIR_PASS(_, nir, nir_lower_explicit_io,
167 nir_var_mem_task_payload, nir_address_format_32bit_offset);
168
169 map->size_dw = ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8);
170 }
171
172 static void
brw_print_tue_map(FILE *fp, const struct brw_tue_map *map)
174 {
175 fprintf(fp, "TUE (%d dwords)\n\n", map->size_dw);
176 }
177
178 static bool
brw_nir_adjust_task_payload_offsets_instr(struct nir_builder *b,
                                          nir_intrinsic_instr *intrin,
                                          void *data)
182 {
183 switch (intrin->intrinsic) {
184 case nir_intrinsic_store_task_payload:
185 case nir_intrinsic_load_task_payload: {
186 nir_src *offset_src = nir_get_io_offset_src(intrin);
187
188 if (nir_src_is_const(*offset_src))
189 assert(nir_src_as_uint(*offset_src) % 4 == 0);
190
191 b->cursor = nir_before_instr(&intrin->instr);
192
193 /* Regular I/O uses dwords while explicit I/O used for task payload uses
194 * bytes. Normalize it to dwords.
195 *
196 * TODO(mesh): Figure out how to handle 8-bit, 16-bit.
197 */
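      /* E.g. a byte offset of 12 becomes dword offset 3, and a base of 16
       * becomes 4.
       */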
198
199 nir_def *offset = nir_ishr_imm(b, offset_src->ssa, 2);
200 nir_src_rewrite(offset_src, offset);
201
202 unsigned base = nir_intrinsic_base(intrin);
203 assert(base % 4 == 0);
204 nir_intrinsic_set_base(intrin, base / 4);
205
206 return true;
207 }
208
209 default:
210 return false;
211 }
212 }
213
214 static bool
brw_nir_adjust_task_payload_offsets(nir_shader *nir)
216 {
217 return nir_shader_intrinsics_pass(nir,
218 brw_nir_adjust_task_payload_offsets_instr,
219 nir_metadata_block_index |
220 nir_metadata_dominance,
221 NULL);
222 }
223
224 void
brw_nir_adjust_payload(nir_shader *shader)
226 {
   /* Adjustment of task payload offsets must be performed *after* the last
    * pass which interprets them as bytes, because it changes their unit.
    */
230 bool adjusted = false;
231 NIR_PASS(adjusted, shader, brw_nir_adjust_task_payload_offsets);
232 if (adjusted) /* clean up the mess created by offset adjustments */
233 NIR_PASS(_, shader, nir_opt_constant_folding);
234 }
235
236 static bool
brw_nir_align_launch_mesh_workgroups_instr(nir_builder *b,
                                           nir_intrinsic_instr *intrin,
                                           void *data)
240 {
241 if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups)
242 return false;
243
244 /* nir_lower_task_shader uses "range" as task payload size. */
245 unsigned range = nir_intrinsic_range(intrin);
   /* This avoids a special case in nir_lower_task_shader when dealing with a
    * payload that is not vec4-aligned and the payload_in_shared workaround
    * is enabled.
    */
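   /* E.g. a 20-byte payload range gets rounded up to 32 bytes here. */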
249 nir_intrinsic_set_range(intrin, ALIGN(range, 16));
250
251 return true;
252 }
253
254 static bool
brw_nir_align_launch_mesh_workgroups(nir_shader *nir)
256 {
257 return nir_shader_intrinsics_pass(nir,
258 brw_nir_align_launch_mesh_workgroups_instr,
259 nir_metadata_block_index |
260 nir_metadata_dominance,
261 NULL);
262 }
263
264 const unsigned *
brw_compile_task(const struct brw_compiler *compiler,
                 struct brw_compile_task_params *params)
267 {
268 struct nir_shader *nir = params->base.nir;
269 const struct brw_task_prog_key *key = params->key;
270 struct brw_task_prog_data *prog_data = params->prog_data;
271 const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TASK);
272
273 brw_nir_lower_tue_outputs(nir, &prog_data->map);
274
275 NIR_PASS(_, nir, brw_nir_align_launch_mesh_workgroups);
276
277 nir_lower_task_shader_options lower_ts_opt = {
278 .payload_to_shared_for_atomics = true,
279 .payload_to_shared_for_small_types = true,
280 /* The actual payload data starts after the TUE header and padding,
281 * so skip those when copying.
282 */
283 .payload_offset_in_bytes = prog_data->map.per_task_data_start_dw * 4,
284 };
285 NIR_PASS(_, nir, nir_lower_task_shader, lower_ts_opt);
286
287 NIR_PASS(_, nir, brw_nir_lower_launch_mesh_workgroups);
288
289 prog_data->base.base.stage = MESA_SHADER_TASK;
290 prog_data->base.base.total_shared = nir->info.shared_size;
291 prog_data->base.base.total_scratch = 0;
292
293 prog_data->base.local_size[0] = nir->info.workgroup_size[0];
294 prog_data->base.local_size[1] = nir->info.workgroup_size[1];
295 prog_data->base.local_size[2] = nir->info.workgroup_size[2];
296
297 prog_data->uses_drawid =
298 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
299
300 brw_simd_selection_state simd_state{
301 .devinfo = compiler->devinfo,
302 .prog_data = &prog_data->base,
303 .required_width = brw_required_dispatch_width(&nir->info),
304 };
305
306 std::unique_ptr<fs_visitor> v[3];
307
308 for (unsigned simd = 0; simd < 3; simd++) {
309 if (!brw_simd_should_compile(simd_state, simd))
310 continue;
311
312 const unsigned dispatch_width = 8 << simd;
313
314 nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
315 brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);
316
317 NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
318 NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
319
320 brw_postprocess_nir(shader, compiler, debug_enabled,
321 key->base.robust_flags);
322
323 v[simd] = std::make_unique<fs_visitor>(compiler, ¶ms->base,
324 &key->base,
325 &prog_data->base.base,
326 shader, dispatch_width,
327 params->base.stats != NULL,
328 debug_enabled);
329
330 if (prog_data->base.prog_mask) {
331 unsigned first = ffs(prog_data->base.prog_mask) - 1;
332 v[simd]->import_uniforms(v[first].get());
333 }
334
335 const bool allow_spilling = !brw_simd_any_compiled(simd_state);
336 if (v[simd]->run_task(allow_spilling))
337 brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
338 else
339 simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
340 }
341
342 int selected_simd = brw_simd_select(simd_state);
343 if (selected_simd < 0) {
344 params->base.error_str =
345 ralloc_asprintf(params->base.mem_ctx,
346 "Can't compile shader: "
347 "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
348 simd_state.error[0], simd_state.error[1],
349 simd_state.error[2]);
350 return NULL;
351 }
352
353 fs_visitor *selected = v[selected_simd].get();
354 prog_data->base.prog_mask = 1 << selected_simd;
355
356 if (unlikely(debug_enabled)) {
357 fprintf(stderr, "Task Output ");
358 brw_print_tue_map(stderr, &prog_data->map);
359 }
360
361 fs_generator g(compiler, ¶ms->base, &prog_data->base.base,
362 MESA_SHADER_TASK);
363 if (unlikely(debug_enabled)) {
364 g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
365 "%s task shader %s",
366 nir->info.label ? nir->info.label
367 : "unnamed",
368 nir->info.name));
369 }
370
371 g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
372 selected->performance_analysis.require(), params->base.stats);
373 g.add_const_data(nir->constant_data, nir->constant_data_size);
374 return g.get_assembly();
375 }
376
377 static void
brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map)
379 {
380 if (!map)
381 return;
382
383 nir->info.task_payload_size = map->per_task_data_start_dw * 4;
384
385 bool progress = false;
386
387 NIR_PASS(progress, nir, nir_lower_vars_to_explicit_types,
388 nir_var_mem_task_payload, shared_type_info);
389
390 if (progress) {
391 /* The types for Task Output and Mesh Input should match, so their sizes
392 * should also match.
393 */
394 assert(map->size_dw == ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8));
395 } else {
      /* The mesh shader doesn't read any input; to make that clearer, set
       * task_payload_size to zero instead of keeping an incomplete size that
       * only includes the header.
       */
400 nir->info.task_payload_size = 0;
401 }
402
403 NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_task_payload,
404 nir_address_format_32bit_offset);
405 }
406
407 /* Attribute types. Flat attributes have to be a separate class because
408 * flat and interpolated attributes can't share the same vec4 slot
409 * (see 3DSTATE_SBE.ConstantInterpolationEnable).
410 */
411 enum {
412 PRIM, /* per primitive */
413 VERT, /* per vertex interpolated */
414 VERT_FLAT, /* per vertex flat */
415 };
416
417 struct attr_desc {
418 int location;
419 const struct glsl_type *type;
420 unsigned dwords;
421 unsigned slots;
422 };
423
424 struct attr_type_info {
425 /* order of attributes, negative values are holes */
426 std::list<struct attr_desc> *order;
427
428 /* attributes after which there's hole of size equal to array index */
429 std::list<int> holes[5];
430 };
431
432 static void
brw_mue_assign_position(const struct attr_desc *attr,
                        struct brw_mue_map *map,
                        unsigned start_dw)
436 {
437 bool is_array = glsl_type_is_array(attr->type);
438 int location = attr->location;
439 unsigned remaining = attr->dwords;
440
441 for (unsigned slot = 0; slot < attr->slots; ++slot) {
442 map->start_dw[location + slot] = start_dw;
443
444 unsigned sz;
445
446 if (is_array) {
447 assert(attr->dwords % attr->slots == 0);
448 sz = attr->dwords / attr->slots;
449 } else {
450 sz = MIN2(remaining, 4);
451 }
452
453 map->len_dw[location + slot] = sz;
454 start_dw += sz;
455 remaining -= sz;
456 }
457 }
458
459 static nir_variable *
brw_nir_find_complete_variable_with_location(nir_shader *shader,
                                             nir_variable_mode mode,
                                             int location)
463 {
464 nir_variable *best_var = NULL;
465 unsigned last_size = 0;
466
467 nir_foreach_variable_with_modes(var, shader, mode) {
468 if (var->data.location != location)
469 continue;
470
471 unsigned new_size = glsl_count_dword_slots(var->type, false);
472 if (new_size > last_size) {
473 best_var = var;
474 last_size = new_size;
475 }
476 }
477
478 return best_var;
479 }
480
481 static unsigned
brw_sum_size(const std::list<struct attr_desc> &orders)
483 {
484 unsigned sz = 0;
485 for (auto it = orders.cbegin(); it != orders.cend(); ++it)
486 sz += (*it).dwords;
487 return sz;
488 }
489
490 /* Finds order of outputs which require minimum size, without splitting
491 * of URB read/write messages (which operate on vec4-aligned memory).
492 */
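/* A simple illustration: with compaction enabled and outputs of 3, 4 and 1
 * dwords (in location order), the 1-dword attribute is placed into the hole
 * left after the 3-dword one, giving 8 dwords total instead of 12.
 */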
493 static void
brw_compute_mue_layout(const struct brw_compiler *compiler,
                       std::list<struct attr_desc> *orders,
                       uint64_t outputs_written,
                       struct nir_shader *nir,
                       bool *pack_prim_data_into_header,
                       bool *pack_vert_data_into_header)
500 {
501 const struct shader_info *info = &nir->info;
502
503 struct attr_type_info data[3];
504
505 if ((compiler->mesh.mue_header_packing & 1) == 0)
506 *pack_prim_data_into_header = false;
507 if ((compiler->mesh.mue_header_packing & 2) == 0)
508 *pack_vert_data_into_header = false;
509
510 for (unsigned i = PRIM; i <= VERT_FLAT; ++i)
511 data[i].order = &orders[i];
512
513 /* If packing into header is enabled, add a hole of size 4 and add
514 * a virtual location to keep the algorithm happy (it expects holes
515 * to be preceded by some location). We'll remove those virtual
516 * locations at the end.
517 */
518 const gl_varying_slot virtual_header_location = VARYING_SLOT_POS;
519 assert((outputs_written & BITFIELD64_BIT(virtual_header_location)) == 0);
520
521 struct attr_desc d;
522 d.location = virtual_header_location;
523 d.type = NULL;
524 d.dwords = 0;
525 d.slots = 0;
526
527 struct attr_desc h;
528 h.location = -1;
529 h.type = NULL;
530 h.dwords = 4;
531 h.slots = 0;
532
533 if (*pack_prim_data_into_header) {
534 orders[PRIM].push_back(d);
535 orders[PRIM].push_back(h);
536 data[PRIM].holes[4].push_back(virtual_header_location);
537 }
538
539 if (*pack_vert_data_into_header) {
540 orders[VERT].push_back(d);
541 orders[VERT].push_back(h);
542 data[VERT].holes[4].push_back(virtual_header_location);
543 }
544
545 u_foreach_bit64(location, outputs_written) {
546 if ((BITFIELD64_BIT(location) & outputs_written) == 0)
547 continue;
548
549 /* At this point there are both complete and split variables as
550 * outputs. We need the complete variable to compute the required
551 * size.
552 */
553 nir_variable *var =
554 brw_nir_find_complete_variable_with_location(nir,
555 nir_var_shader_out,
556 location);
557
558 d.location = location;
559 d.type = brw_nir_get_var_type(nir, var);
560 d.dwords = glsl_count_dword_slots(d.type, false);
561 d.slots = glsl_count_attribute_slots(d.type, false);
562
563 struct attr_type_info *type_data;
564
565 if (BITFIELD64_BIT(location) & info->per_primitive_outputs)
566 type_data = &data[PRIM];
567 else if (var->data.interpolation == INTERP_MODE_FLAT)
568 type_data = &data[VERT_FLAT];
569 else
570 type_data = &data[VERT];
571
572 std::list<struct attr_desc> *order = type_data->order;
573 std::list<int> *holes = type_data->holes;
574
575 outputs_written &= ~BITFIELD64_RANGE(location, d.slots);
576
577 /* special case to use hole of size 4 */
578 if (d.dwords == 4 && !holes[4].empty()) {
579 holes[4].pop_back();
580
581 assert(order->front().location == virtual_header_location);
582 order->pop_front();
583
584 assert(order->front().location == -1);
585 assert(order->front().dwords == 4);
586 order->front() = d;
587
588 continue;
589 }
590
591 int mod = d.dwords % 4;
592 if (mod == 0) {
593 order->push_back(d);
594 continue;
595 }
596
597 h.location = -1;
598 h.type = NULL;
599 h.dwords = 4 - mod;
600 h.slots = 0;
601
602 if (!compiler->mesh.mue_compaction) {
603 order->push_back(d);
604 order->push_back(h);
605 continue;
606 }
607
608 if (d.dwords > 4) {
609 order->push_back(d);
610 order->push_back(h);
611 holes[h.dwords].push_back(location);
612 continue;
613 }
614
615 assert(d.dwords < 4);
616
617 unsigned found = 0;
618 /* try to find the smallest hole big enough to hold this attribute */
619 for (unsigned sz = d.dwords; sz <= 4; sz++){
620 if (!holes[sz].empty()) {
621 found = sz;
622 break;
623 }
624 }
625
626 /* append at the end if not found */
627 if (found == 0) {
628 order->push_back(d);
629 order->push_back(h);
630 holes[h.dwords].push_back(location);
631
632 continue;
633 }
634
635 assert(found <= 4);
636 assert(!holes[found].empty());
637 int after_loc = holes[found].back();
638 holes[found].pop_back();
639
640 bool inserted_back = false;
641
642 for (auto it = order->begin(); it != order->end(); ++it) {
643 if ((*it).location != after_loc)
644 continue;
645
646 ++it;
647 /* must be a hole */
648 assert((*it).location < 0);
649 /* and it must be big enough */
650 assert(d.dwords <= (*it).dwords);
651
652 if (d.dwords == (*it).dwords) {
653 /* exact size, just replace */
654 *it = d;
655 } else {
656 /* inexact size, shrink hole */
657 (*it).dwords -= d.dwords;
658 /* and insert new attribute before it */
659 order->insert(it, d);
660
661 /* Insert shrunk hole in a spot so that the order of attributes
662 * is preserved.
663 */
664 std::list<int> &hole_list = holes[(*it).dwords];
665 std::list<int>::iterator insert_before = hole_list.end();
666
667 for (auto it2 = hole_list.begin(); it2 != hole_list.end(); ++it2) {
668 if ((*it2) >= (int)location) {
669 insert_before = it2;
670 break;
671 }
672 }
673
674 hole_list.insert(insert_before, location);
675 }
676
677 inserted_back = true;
678 break;
679 }
680
681 assert(inserted_back);
682 }
683
684 if (*pack_prim_data_into_header) {
685 if (orders[PRIM].front().location == virtual_header_location)
686 orders[PRIM].pop_front();
687
688 if (!data[PRIM].holes[4].empty()) {
689 *pack_prim_data_into_header = false;
690
691 assert(orders[PRIM].front().location == -1);
692 assert(orders[PRIM].front().dwords == 4);
693 orders[PRIM].pop_front();
694 }
695
696 if (*pack_prim_data_into_header) {
697 unsigned sz = brw_sum_size(orders[PRIM]);
698
699 if (sz % 8 == 0 || sz % 8 > 4)
700 *pack_prim_data_into_header = false;
701 }
702 }
703
704 if (*pack_vert_data_into_header) {
705 if (orders[VERT].front().location == virtual_header_location)
706 orders[VERT].pop_front();
707
708 if (!data[VERT].holes[4].empty()) {
709 *pack_vert_data_into_header = false;
710
711 assert(orders[VERT].front().location == -1);
712 assert(orders[VERT].front().dwords == 4);
713 orders[VERT].pop_front();
714 }
715
716 if (*pack_vert_data_into_header) {
717 unsigned sz = brw_sum_size(orders[VERT]) +
718 brw_sum_size(orders[VERT_FLAT]);
719
720 if (sz % 8 == 0 || sz % 8 > 4)
721 *pack_vert_data_into_header = false;
722 }
723 }
724
725
726 if (INTEL_DEBUG(DEBUG_MESH)) {
727 fprintf(stderr, "MUE attribute order:\n");
728 for (unsigned i = PRIM; i <= VERT_FLAT; ++i) {
729 if (!orders[i].empty())
730 fprintf(stderr, "%d: ", i);
731 for (auto it = orders[i].cbegin(); it != orders[i].cend(); ++it) {
732 fprintf(stderr, "%d(%d) ", (*it).location, (*it).dwords);
733 }
734 if (!orders[i].empty())
735 fprintf(stderr, "\n");
736 }
737 }
738 }
739
740 /* Mesh URB Entry consists of an initial section
741 *
742 * - Primitive Count
743 * - Primitive Indices (from 0 to Max-1)
744 * - Padding to 32B if needed
745 *
746 * optionally followed by a section for per-primitive data,
747 * in which each primitive (from 0 to Max-1) gets
748 *
749 * - Primitive Header (e.g. ViewportIndex)
750 * - Primitive Custom Attributes
751 *
752 * then followed by a section for per-vertex data
753 *
754 * - Vertex Header (e.g. Position)
755 * - Vertex Custom Attributes
756 *
 * Each per-element section has a pitch and a starting offset. All the
 * individual attribute offsets in start_dw are relative to the first entry
 * of the section (i.e. where the Position of the first vertex, or the
 * ViewportIndex of the first primitive, lives). Attributes of the other
 * elements are found using the pitch.
762 */
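/* A hypothetical example: 16 triangles with BRW_INDEX_FORMAT_U32 need
 * 3 * 16 + 1 = 49 dwords of count + indices, so the per-primitive section
 * starts at ALIGN(49, 8) = 56 dwords; with an 8-dword per-primitive pitch the
 * per-vertex section then starts at ALIGN(56 + 8 * 16, 8) = 184 dwords.
 */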
763 static void
brw_compute_mue_map(const struct brw_compiler *compiler,
                    struct nir_shader *nir, struct brw_mue_map *map,
                    enum brw_mesh_index_format index_format, bool compact_mue)
767 {
768 memset(map, 0, sizeof(*map));
769
770 memset(&map->start_dw[0], -1, sizeof(map->start_dw));
771 memset(&map->len_dw[0], 0, sizeof(map->len_dw));
772
773 unsigned vertices_per_primitive =
774 mesa_vertices_per_prim(nir->info.mesh.primitive_type);
775
776 map->max_primitives = nir->info.mesh.max_primitives_out;
777 map->max_vertices = nir->info.mesh.max_vertices_out;
778
779 uint64_t outputs_written = nir->info.outputs_written;
780
781 /* One dword for primitives count then K extra dwords for each primitive. */
782 switch (index_format) {
783 case BRW_INDEX_FORMAT_U32:
784 map->per_primitive_indices_dw = vertices_per_primitive;
785 break;
786 case BRW_INDEX_FORMAT_U888X:
787 map->per_primitive_indices_dw = 1;
788 break;
789 default:
790 unreachable("invalid index format");
791 }
792
793 map->per_primitive_start_dw = ALIGN(map->per_primitive_indices_dw *
794 map->max_primitives + 1, 8);
795
796 /* Assign initial section. */
797 if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
798 map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
799 map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 1;
800 outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
801 }
802 if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
803 map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
804 map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] =
805 map->per_primitive_indices_dw * map->max_primitives;
806 outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
807 }
808
809 const uint64_t per_primitive_header_bits =
810 BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
811 BITFIELD64_BIT(VARYING_SLOT_LAYER) |
812 BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
813 BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);
814
815 const uint64_t per_vertex_header_bits =
816 BITFIELD64_BIT(VARYING_SLOT_PSIZ) |
817 BITFIELD64_BIT(VARYING_SLOT_POS) |
818 BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0) |
819 BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
820
821 std::list<struct attr_desc> orders[3];
822 uint64_t regular_outputs = outputs_written &
823 ~(per_primitive_header_bits | per_vertex_header_bits);
824
825 /* packing into prim header is possible only if prim header is present */
826 map->user_data_in_primitive_header = compact_mue &&
827 (outputs_written & per_primitive_header_bits) != 0;
828
829 /* Packing into vert header is always possible, but we allow it only
830 * if full vec4 is available (so point size is not used) and there's
831 * nothing between it and normal vertex data (so no clip distances).
832 */
833 map->user_data_in_vertex_header = compact_mue &&
834 (outputs_written & per_vertex_header_bits) ==
835 BITFIELD64_BIT(VARYING_SLOT_POS);
836
837 if (outputs_written & per_primitive_header_bits) {
838 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
839 map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] =
840 map->per_primitive_start_dw + 0;
841 map->len_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 1;
842 }
843
844 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER)) {
845 map->start_dw[VARYING_SLOT_LAYER] =
846 map->per_primitive_start_dw + 1; /* RTAIndex */
847 map->len_dw[VARYING_SLOT_LAYER] = 1;
848 }
849
850 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)) {
851 map->start_dw[VARYING_SLOT_VIEWPORT] =
852 map->per_primitive_start_dw + 2;
853 map->len_dw[VARYING_SLOT_VIEWPORT] = 1;
854 }
855
856 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE)) {
857 map->start_dw[VARYING_SLOT_CULL_PRIMITIVE] =
858 map->per_primitive_start_dw + 3;
859 map->len_dw[VARYING_SLOT_CULL_PRIMITIVE] = 1;
860 }
861
862 map->per_primitive_header_size_dw = 8;
863 outputs_written &= ~per_primitive_header_bits;
864 } else {
865 map->per_primitive_header_size_dw = 0;
866 }
867
868 map->per_primitive_data_size_dw = 0;
869
870 /* For fast linked libraries, we can't pack the MUE, as the fragment shader
871 * will be compiled without access to the MUE map and won't be able to find
872 * out where everything is.
873 * Instead, keep doing things as we did before the packing, just laying out
874 * everything in varying order, which is how the FS will expect them.
875 */
876 if (compact_mue) {
877 brw_compute_mue_layout(compiler, orders, regular_outputs, nir,
878 &map->user_data_in_primitive_header,
879 &map->user_data_in_vertex_header);
880
881 unsigned start_dw = map->per_primitive_start_dw;
882 if (map->user_data_in_primitive_header)
883 start_dw += 4; /* first 4 dwords are used */
884 else
885 start_dw += map->per_primitive_header_size_dw;
886 unsigned header_used_dw = 0;
887
888 for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
889 int location = (*it).location;
890 if (location < 0) {
891 start_dw += (*it).dwords;
892 if (map->user_data_in_primitive_header && header_used_dw < 4)
893 header_used_dw += (*it).dwords;
894 else
895 map->per_primitive_data_size_dw += (*it).dwords;
896 assert(header_used_dw <= 4);
897 continue;
898 }
899
900 assert(map->start_dw[location] == -1);
901
902 assert(location == VARYING_SLOT_PRIMITIVE_ID ||
903 location >= VARYING_SLOT_VAR0);
904
905 brw_mue_assign_position(&*it, map, start_dw);
906
907 start_dw += (*it).dwords;
908 if (map->user_data_in_primitive_header && header_used_dw < 4)
909 header_used_dw += (*it).dwords;
910 else
911 map->per_primitive_data_size_dw += (*it).dwords;
912 assert(header_used_dw <= 4);
913 outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
914 }
915 } else {
916 unsigned start_dw = map->per_primitive_start_dw +
917 map->per_primitive_header_size_dw;
918
919 uint64_t per_prim_outputs = outputs_written & nir->info.per_primitive_outputs;
920 while (per_prim_outputs) {
921 uint64_t location = ffsll(per_prim_outputs) - 1;
922
923 assert(map->start_dw[location] == -1);
924 assert(location == VARYING_SLOT_PRIMITIVE_ID ||
925 location >= VARYING_SLOT_VAR0);
926
927 nir_variable *var =
928 brw_nir_find_complete_variable_with_location(nir,
929 nir_var_shader_out,
930 location);
931 struct attr_desc d;
932 d.location = location;
933 d.type = brw_nir_get_var_type(nir, var);
934 d.dwords = glsl_count_dword_slots(d.type, false);
935 d.slots = glsl_count_attribute_slots(d.type, false);
936
937 brw_mue_assign_position(&d, map, start_dw);
938
939 map->per_primitive_data_size_dw += ALIGN(d.dwords, 4);
940 start_dw += ALIGN(d.dwords, 4);
941
942 per_prim_outputs &= ~BITFIELD64_RANGE(location, d.slots);
943 }
944 }
945
946 map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
947 map->per_primitive_data_size_dw, 8);
948
949 map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw +
950 map->per_primitive_pitch_dw *
951 map->max_primitives, 8);
952
953 /* TODO(mesh): Multiview. */
954 unsigned fixed_header_size = 8;
955 map->per_vertex_header_size_dw = ALIGN(fixed_header_size +
956 nir->info.clip_distance_array_size +
957 nir->info.cull_distance_array_size, 8);
958
959 if (outputs_written & per_vertex_header_bits) {
960 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PSIZ)) {
961 map->start_dw[VARYING_SLOT_PSIZ] = map->per_vertex_start_dw + 3;
962 map->len_dw[VARYING_SLOT_PSIZ] = 1;
963 }
964
965 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_POS)) {
966 map->start_dw[VARYING_SLOT_POS] = map->per_vertex_start_dw + 4;
967 map->len_dw[VARYING_SLOT_POS] = 4;
968 }
969
970 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)) {
971 map->start_dw[VARYING_SLOT_CLIP_DIST0] =
972 map->per_vertex_start_dw + fixed_header_size + 0;
973 map->len_dw[VARYING_SLOT_CLIP_DIST0] = 4;
974 }
975
976 if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)) {
977 map->start_dw[VARYING_SLOT_CLIP_DIST1] =
978 map->per_vertex_start_dw + fixed_header_size + 4;
979 map->len_dw[VARYING_SLOT_CLIP_DIST1] = 4;
980 }
981
982 outputs_written &= ~per_vertex_header_bits;
983 }
984
985 /* cull distances should be lowered earlier */
986 assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)));
987 assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)));
988
989 map->per_vertex_data_size_dw = 0;
990
991 /* For fast linked libraries, we can't pack the MUE, as the fragment shader
992 * will be compiled without access to the MUE map and won't be able to find
993 * out where everything is.
994 * Instead, keep doing things as we did before the packing, just laying out
995 * everything in varying order, which is how the FS will expect them.
996 */
997 if (compact_mue) {
998 unsigned start_dw = map->per_vertex_start_dw;
999 if (!map->user_data_in_vertex_header)
1000 start_dw += map->per_vertex_header_size_dw;
1001
1002 unsigned header_used_dw = 0;
1003 for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
1004 for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
1005 int location = (*it).location;
1006 if (location < 0) {
1007 start_dw += (*it).dwords;
1008 if (map->user_data_in_vertex_header && header_used_dw < 4) {
1009 header_used_dw += (*it).dwords;
1010 assert(header_used_dw <= 4);
1011 if (header_used_dw == 4)
1012 start_dw += 4; /* jump over gl_position */
1013 } else {
1014 map->per_vertex_data_size_dw += (*it).dwords;
1015 }
1016 continue;
1017 }
1018
1019 assert(map->start_dw[location] == -1);
1020
1021 assert(location >= VARYING_SLOT_VAR0);
1022
1023 brw_mue_assign_position(&*it, map, start_dw);
1024
1025 start_dw += (*it).dwords;
1026 if (map->user_data_in_vertex_header && header_used_dw < 4) {
1027 header_used_dw += (*it).dwords;
1028 assert(header_used_dw <= 4);
1029 if (header_used_dw == 4)
1030 start_dw += 4; /* jump over gl_position */
1031 } else {
1032 map->per_vertex_data_size_dw += (*it).dwords;
1033 }
1034 outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
1035 }
1036 }
1037 } else {
1038 unsigned start_dw = map->per_vertex_start_dw +
1039 map->per_vertex_header_size_dw;
1040
1041 uint64_t per_vertex_outputs = outputs_written & ~nir->info.per_primitive_outputs;
1042 while (per_vertex_outputs) {
1043 uint64_t location = ffsll(per_vertex_outputs) - 1;
1044
1045 assert(map->start_dw[location] == -1);
1046 assert(location >= VARYING_SLOT_VAR0);
1047
1048 nir_variable *var =
1049 brw_nir_find_complete_variable_with_location(nir,
1050 nir_var_shader_out,
1051 location);
1052 struct attr_desc d;
1053 d.location = location;
1054 d.type = brw_nir_get_var_type(nir, var);
1055 d.dwords = glsl_count_dword_slots(d.type, false);
1056 d.slots = glsl_count_attribute_slots(d.type, false);
1057
1058 brw_mue_assign_position(&d, map, start_dw);
1059
1060 map->per_vertex_data_size_dw += ALIGN(d.dwords, 4);
1061 start_dw += ALIGN(d.dwords, 4);
1062
1063 per_vertex_outputs &= ~BITFIELD64_RANGE(location, d.slots);
1064 }
1065 }
1066
1067 map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw +
1068 map->per_vertex_data_size_dw, 8);
1069
1070 map->size_dw =
1071 map->per_vertex_start_dw + map->per_vertex_pitch_dw * map->max_vertices;
1072
1073 assert(map->size_dw % 8 == 0);
1074 }
1075
1076 static void
brw_print_mue_map(FILE *fp, const struct brw_mue_map *map, struct nir_shader *nir)
1078 {
1079 fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n",
1080 map->size_dw, map->max_primitives, map->max_vertices);
1081 fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_COUNT\n",
1082 map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT],
1083 map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] +
1084 map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] - 1);
1085 fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_INDICES\n",
1086 map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES],
1087 map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] +
1088 map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] - 1);
1089
1090 fprintf(fp, " ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n",
1091 map->per_primitive_start_dw,
1092 map->per_primitive_header_size_dw,
1093 map->per_primitive_data_size_dw,
1094 map->per_primitive_pitch_dw);
1095
1096 for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
1097 if (map->start_dw[i] < 0)
1098 continue;
1099
1100 const unsigned offset = map->start_dw[i];
1101 const unsigned len = map->len_dw[i];
1102
1103 if (offset < map->per_primitive_start_dw ||
1104 offset >= map->per_primitive_start_dw + map->per_primitive_pitch_dw)
1105 continue;
1106
1107 const char *name =
1108 gl_varying_slot_name_for_stage((gl_varying_slot)i,
1109 MESA_SHADER_MESH);
1110
1111 fprintf(fp, " <%4d, %4d>: %s (%d)\n", offset, offset + len - 1,
1112 name, i);
1113 }
1114
1115 fprintf(fp, " ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n",
1116 map->per_vertex_start_dw,
1117 map->per_vertex_header_size_dw,
1118 map->per_vertex_data_size_dw,
1119 map->per_vertex_pitch_dw);
1120
1121 for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
1122 if (map->start_dw[i] < 0)
1123 continue;
1124
1125 const unsigned offset = map->start_dw[i];
1126 const unsigned len = map->len_dw[i];
1127
1128 if (offset < map->per_vertex_start_dw ||
1129 offset >= map->per_vertex_start_dw + map->per_vertex_pitch_dw)
1130 continue;
1131
1132 nir_variable *var =
1133 nir_find_variable_with_location(nir, nir_var_shader_out, i);
1134 bool flat = var->data.interpolation == INTERP_MODE_FLAT;
1135
1136 const char *name =
1137 gl_varying_slot_name_for_stage((gl_varying_slot)i,
1138 MESA_SHADER_MESH);
1139
1140 fprintf(fp, " <%4d, %4d>: %s (%d)%s\n", offset, offset + len - 1,
1141 name, i, flat ? " (flat)" : "");
1142 }
1143
1144 fprintf(fp, "\n");
1145 }
1146
1147 static void
brw_nir_lower_mue_outputs(nir_shader *nir, const struct brw_mue_map *map)
1149 {
1150 nir_foreach_shader_out_variable(var, nir) {
1151 int location = var->data.location;
1152 assert(location >= 0);
1153 assert(map->start_dw[location] != -1);
1154 var->data.driver_location = map->start_dw[location];
1155 }
1156
1157 NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
1158 type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);
1159 }
1160
1161 static void
brw_nir_initialize_mue(nir_shader *nir,
                       const struct brw_mue_map *map,
                       unsigned dispatch_width)
1165 {
1166 assert(map->per_primitive_header_size_dw > 0);
1167
1168 nir_builder b;
1169 nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
1170 b = nir_builder_at(nir_before_impl(entrypoint));
1171
1172 nir_def *dw_off = nir_imm_int(&b, 0);
1173 nir_def *zerovec = nir_imm_vec4(&b, 0, 0, 0, 0);
1174
1175 /* TODO(mesh): can we write in bigger batches, generating fewer SENDs? */
1176
1177 assert(!nir->info.workgroup_size_variable);
1178 const unsigned workgroup_size = nir->info.workgroup_size[0] *
1179 nir->info.workgroup_size[1] *
1180 nir->info.workgroup_size[2];
1181
1182 /* Invocations from a single workgroup will cooperate in zeroing MUE. */
1183
1184 /* How many prims each invocation needs to cover without checking its index? */
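   /* E.g. 32 primitives across a 24-invocation workgroup gives 1 here, with
    * the remaining 8 primitives handled further below.
    */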
1185 unsigned prims_per_inv = map->max_primitives / workgroup_size;
1186
1187 /* Zero first 4 dwords of MUE Primitive Header:
1188 * Reserved, RTAIndex, ViewportIndex, CullPrimitiveMask.
1189 */
1190
1191 nir_def *local_invocation_index = nir_load_local_invocation_index(&b);
1192
1193 /* Zero primitive headers distanced by workgroup_size, starting from
1194 * invocation index.
1195 */
1196 for (unsigned prim_in_inv = 0; prim_in_inv < prims_per_inv; ++prim_in_inv) {
1197 nir_def *prim = nir_iadd_imm(&b, local_invocation_index,
1198 prim_in_inv * workgroup_size);
1199
1200 nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
1201 .base = (int)map->per_primitive_start_dw,
1202 .write_mask = WRITEMASK_XYZW,
1203 .component = 0,
1204 .src_type = nir_type_uint32);
1205 }
1206
1207 /* How many prims are left? */
1208 unsigned remaining = map->max_primitives % workgroup_size;
1209
1210 if (remaining) {
1211 /* Zero "remaining" primitive headers starting from the last one covered
1212 * by the loop above + workgroup_size.
1213 */
1214 nir_def *cmp = nir_ilt_imm(&b, local_invocation_index, remaining);
1215 nir_if *if_stmt = nir_push_if(&b, cmp);
1216 {
1217 nir_def *prim = nir_iadd_imm(&b, local_invocation_index,
1218 prims_per_inv * workgroup_size);
1219
1220 nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
1221 .base = (int)map->per_primitive_start_dw,
1222 .write_mask = WRITEMASK_XYZW,
1223 .component = 0,
1224 .src_type = nir_type_uint32);
1225 }
1226 nir_pop_if(&b, if_stmt);
1227 }
1228
   /* If there's more than one subgroup, then we need to wait for all of them
    * to finish initialization before we can proceed. Otherwise some subgroups
    * may start filling the MUE before others have finished initializing it.
    */
1233 if (workgroup_size > dispatch_width) {
1234 nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP,
1235 NIR_MEMORY_ACQ_REL, nir_var_shader_out);
1236 }
1237
1238 if (remaining) {
1239 nir_metadata_preserve(entrypoint, nir_metadata_none);
1240 } else {
1241 nir_metadata_preserve(entrypoint, nir_metadata_block_index |
1242 nir_metadata_dominance);
1243 }
1244 }
1245
1246 static void
brw_nir_adjust_offset(nir_builder *b, nir_intrinsic_instr *intrin, uint32_t pitch)
1248 {
1249 nir_src *index_src = nir_get_io_arrayed_index_src(intrin);
1250 nir_src *offset_src = nir_get_io_offset_src(intrin);
1251
1252 b->cursor = nir_before_instr(&intrin->instr);
1253 nir_def *offset =
1254 nir_iadd(b,
1255 offset_src->ssa,
1256 nir_imul_imm(b, index_src->ssa, pitch));
1257 nir_src_rewrite(offset_src, offset);
1258 }
1259
1260 static bool
brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder *b,
                                                nir_intrinsic_instr *intrin,
                                                void *data)
1264 {
1265 const struct brw_mue_map *map = (const struct brw_mue_map *) data;
1266
1267 /* Remap per_vertex and per_primitive offsets using the extra source and
1268 * the pitch.
1269 */
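   /* In effect this computes offset += array_index * pitch, so the attribute
    * of vertex/primitive N lands N pitches into its per-element section.
    */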
1270 switch (intrin->intrinsic) {
1271 case nir_intrinsic_load_per_vertex_output:
1272 case nir_intrinsic_store_per_vertex_output:
1273 brw_nir_adjust_offset(b, intrin, map->per_vertex_pitch_dw);
1274
1275 return true;
1276
1277 case nir_intrinsic_load_per_primitive_output:
1278 case nir_intrinsic_store_per_primitive_output: {
1279 struct nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
1280 uint32_t pitch;
1281 if (sem.location == VARYING_SLOT_PRIMITIVE_INDICES)
1282 pitch = map->per_primitive_indices_dw;
1283 else
1284 pitch = map->per_primitive_pitch_dw;
1285
1286 brw_nir_adjust_offset(b, intrin, pitch);
1287
1288 return true;
1289 }
1290
1291 default:
1292 return false;
1293 }
1294 }
1295
1296 static bool
brw_nir_adjust_offset_for_arrayed_indices(nir_shader *nir, const struct brw_mue_map *map)
1298 {
1299 return nir_shader_intrinsics_pass(nir,
1300 brw_nir_adjust_offset_for_arrayed_indices_instr,
1301 nir_metadata_block_index |
1302 nir_metadata_dominance,
1303 (void *)map);
1304 }
1305
1306 struct index_packing_state {
1307 unsigned vertices_per_primitive;
1308 nir_variable *original_prim_indices;
1309 nir_variable *packed_prim_indices;
1310 };
1311
1312 static bool
brw_can_pack_primitive_indices(nir_shader *nir, struct index_packing_state *state)
1314 {
1315 /* can single index fit into one byte of U888X format? */
1316 if (nir->info.mesh.max_vertices_out > 255)
1317 return false;
1318
1319 state->vertices_per_primitive =
1320 mesa_vertices_per_prim(nir->info.mesh.primitive_type);
1321 /* packing point indices doesn't help */
1322 if (state->vertices_per_primitive == 1)
1323 return false;
1324
1325 state->original_prim_indices =
1326 nir_find_variable_with_location(nir,
1327 nir_var_shader_out,
1328 VARYING_SLOT_PRIMITIVE_INDICES);
1329 /* no indices = no changes to the shader, but it's still worth it,
1330 * because less URB space will be used
1331 */
1332 if (!state->original_prim_indices)
1333 return true;
1334
1335 ASSERTED const struct glsl_type *type = state->original_prim_indices->type;
1336 assert(glsl_type_is_array(type));
1337 assert(glsl_type_is_vector(glsl_without_array(type)));
1338 assert(glsl_without_array(type)->vector_elements == state->vertices_per_primitive);
1339
1340 nir_foreach_function_impl(impl, nir) {
1341 nir_foreach_block(block, impl) {
1342 nir_foreach_instr(instr, block) {
1343 if (instr->type != nir_instr_type_intrinsic)
1344 continue;
1345
1346 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1347
1348 if (intrin->intrinsic != nir_intrinsic_store_deref) {
1349 /* any unknown deref operation on primitive indices -> don't pack */
1350 unsigned num_srcs = nir_intrinsic_infos[intrin->intrinsic].num_srcs;
1351 for (unsigned i = 0; i < num_srcs; i++) {
1352 nir_deref_instr *deref = nir_src_as_deref(intrin->src[i]);
1353 if (!deref)
1354 continue;
1355 nir_variable *var = nir_deref_instr_get_variable(deref);
1356
1357 if (var == state->original_prim_indices)
1358 return false;
1359 }
1360
1361 continue;
1362 }
1363
1364 nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
1365 if (!deref)
1366 continue;
1367
1368 nir_variable *var = nir_deref_instr_get_variable(deref);
1369 if (var != state->original_prim_indices)
1370 continue;
1371
1372 if (deref->deref_type != nir_deref_type_array)
1373 return false; /* unknown chain of derefs */
1374
1375 nir_deref_instr *var_deref = nir_src_as_deref(deref->parent);
1376 if (!var_deref || var_deref->deref_type != nir_deref_type_var)
1377 return false; /* unknown chain of derefs */
1378
1379 assert (var_deref->var == state->original_prim_indices);
1380
1381 unsigned write_mask = nir_intrinsic_write_mask(intrin);
1382
1383 /* If only some components are written, then we can't easily pack.
1384 * In theory we could, by loading current dword value, bitmasking
1385 * one byte and storing back the whole dword, but it would be slow
         * and could actually decrease performance. TODO: reevaluate this
         * once something actually hits this path.
         */
1389 if (write_mask != BITFIELD_MASK(state->vertices_per_primitive))
1390 return false;
1391 }
1392 }
1393 }
1394
1395 return true;
1396 }
1397
1398 static bool
brw_pack_primitive_indices_instr(nir_builder *b, nir_intrinsic_instr *intrin,
                                 void *data)
1401 {
1402 if (intrin->intrinsic != nir_intrinsic_store_deref)
1403 return false;
1404
1405 nir_deref_instr *array_deref = nir_src_as_deref(intrin->src[0]);
1406 if (!array_deref || array_deref->deref_type != nir_deref_type_array)
1407 return false;
1408
1409 nir_deref_instr *var_deref = nir_src_as_deref(array_deref->parent);
1410 if (!var_deref || var_deref->deref_type != nir_deref_type_var)
1411 return false;
1412
1413 struct index_packing_state *state =
1414 (struct index_packing_state *)data;
1415
1416 nir_variable *var = var_deref->var;
1417
1418 if (var != state->original_prim_indices)
1419 return false;
1420
1421 unsigned vertices_per_primitive = state->vertices_per_primitive;
1422
1423 b->cursor = nir_before_instr(&intrin->instr);
1424
1425 nir_deref_instr *new_var_deref =
1426 nir_build_deref_var(b, state->packed_prim_indices);
1427 nir_deref_instr *new_array_deref =
1428 nir_build_deref_array(b, new_var_deref, array_deref->arr.index.ssa);
1429
1430 nir_src *data_src = &intrin->src[1];
1431 nir_def *data_def =
1432 data_src->ssa;
1433
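   /* Pack the vertex indices into the low three bytes of a single dword
    * (U888X layout): byte 0 = index 0, byte 1 = index 1 and, for triangles,
    * byte 2 = index 2; the top byte stays unused.
    */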
1434 nir_def *new_data =
1435 nir_ior(b, nir_ishl_imm(b, nir_channel(b, data_def, 0), 0),
1436 nir_ishl_imm(b, nir_channel(b, data_def, 1), 8));
1437
1438 if (vertices_per_primitive >= 3) {
1439 new_data =
1440 nir_ior(b, new_data,
1441 nir_ishl_imm(b, nir_channel(b, data_def, 2), 16));
1442 }
1443
1444 nir_build_store_deref(b, &new_array_deref->def, new_data);
1445
1446 nir_instr_remove(&intrin->instr);
1447
1448 return true;
1449 }
1450
1451 static bool
brw_pack_primitive_indices(nir_shader *nir, void *data)
1453 {
1454 struct index_packing_state *state = (struct index_packing_state *)data;
1455
1456 const struct glsl_type *new_type =
1457 glsl_array_type(glsl_uint_type(),
1458 nir->info.mesh.max_primitives_out,
1459 0);
1460
1461 state->packed_prim_indices =
1462 nir_variable_create(nir, nir_var_shader_out,
1463 new_type, "gl_PrimitiveIndicesPacked");
1464 state->packed_prim_indices->data.location = VARYING_SLOT_PRIMITIVE_INDICES;
1465 state->packed_prim_indices->data.interpolation = INTERP_MODE_NONE;
1466 state->packed_prim_indices->data.per_primitive = 1;
1467
1468 return nir_shader_intrinsics_pass(nir, brw_pack_primitive_indices_instr,
1469 nir_metadata_block_index |
1470 nir_metadata_dominance,
1471 data);
1472 }
1473
1474 const unsigned *
brw_compile_mesh(const struct brw_compiler *compiler,
                 struct brw_compile_mesh_params *params)
1477 {
1478 struct nir_shader *nir = params->base.nir;
1479 const struct brw_mesh_prog_key *key = params->key;
1480 struct brw_mesh_prog_data *prog_data = params->prog_data;
1481 const bool debug_enabled = brw_should_print_shader(nir, DEBUG_MESH);
1482
1483 prog_data->base.base.stage = MESA_SHADER_MESH;
1484 prog_data->base.base.total_shared = nir->info.shared_size;
1485 prog_data->base.base.total_scratch = 0;
1486
1487 prog_data->base.local_size[0] = nir->info.workgroup_size[0];
1488 prog_data->base.local_size[1] = nir->info.workgroup_size[1];
1489 prog_data->base.local_size[2] = nir->info.workgroup_size[2];
1490
1491 prog_data->clip_distance_mask = (1 << nir->info.clip_distance_array_size) - 1;
1492 prog_data->cull_distance_mask =
1493 ((1 << nir->info.cull_distance_array_size) - 1) <<
1494 nir->info.clip_distance_array_size;
1495 prog_data->primitive_type = nir->info.mesh.primitive_type;
1496
1497 struct index_packing_state index_packing_state = {};
1498 if (brw_can_pack_primitive_indices(nir, &index_packing_state)) {
1499 if (index_packing_state.original_prim_indices)
1500 NIR_PASS(_, nir, brw_pack_primitive_indices, &index_packing_state);
1501 prog_data->index_format = BRW_INDEX_FORMAT_U888X;
1502 } else {
1503 prog_data->index_format = BRW_INDEX_FORMAT_U32;
1504 }
1505
1506 prog_data->uses_drawid =
1507 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
1508
1509 brw_nir_lower_tue_inputs(nir, params->tue_map);
1510
1511 brw_compute_mue_map(compiler, nir, &prog_data->map,
1512 prog_data->index_format, key->compact_mue);
1513 brw_nir_lower_mue_outputs(nir, &prog_data->map);
1514
1515 brw_simd_selection_state simd_state{
1516 .devinfo = compiler->devinfo,
1517 .prog_data = &prog_data->base,
1518 .required_width = brw_required_dispatch_width(&nir->info),
1519 };
1520
1521 std::unique_ptr<fs_visitor> v[3];
1522
1523 for (int simd = 0; simd < 3; simd++) {
1524 if (!brw_simd_should_compile(simd_state, simd))
1525 continue;
1526
1527 const unsigned dispatch_width = 8 << simd;
1528
1529 nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
1530
      /* When the Primitive Header is enabled, we may not generate writes to
       * all of its fields, so initialize everything.
       */
1535 if (prog_data->map.per_primitive_header_size_dw > 0)
1536 NIR_PASS_V(shader, brw_nir_initialize_mue, &prog_data->map, dispatch_width);
1537
1538 brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);
1539
1540 NIR_PASS(_, shader, brw_nir_adjust_offset_for_arrayed_indices, &prog_data->map);
1541 /* Load uniforms can do a better job for constants, so fold before it. */
1542 NIR_PASS(_, shader, nir_opt_constant_folding);
1543 NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
1544
1545 NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
1546
1547 brw_postprocess_nir(shader, compiler, debug_enabled,
1548 key->base.robust_flags);
1549
1550 v[simd] = std::make_unique<fs_visitor>(compiler, ¶ms->base,
1551 &key->base,
1552 &prog_data->base.base,
1553 shader, dispatch_width,
1554 params->base.stats != NULL,
1555 debug_enabled);
1556
1557 if (prog_data->base.prog_mask) {
1558 unsigned first = ffs(prog_data->base.prog_mask) - 1;
1559 v[simd]->import_uniforms(v[first].get());
1560 }
1561
1562 const bool allow_spilling = !brw_simd_any_compiled(simd_state);
1563 if (v[simd]->run_mesh(allow_spilling))
1564 brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
1565 else
1566 simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
1567 }
1568
1569 int selected_simd = brw_simd_select(simd_state);
1570 if (selected_simd < 0) {
1571 params->base.error_str =
1572 ralloc_asprintf(params->base.mem_ctx,
1573 "Can't compile shader: "
1574 "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
1575 simd_state.error[0], simd_state.error[1],
1576 simd_state.error[2]);
1577 return NULL;
1578 }
1579
1580 fs_visitor *selected = v[selected_simd].get();
1581 prog_data->base.prog_mask = 1 << selected_simd;
1582
1583 if (unlikely(debug_enabled)) {
1584 if (params->tue_map) {
1585 fprintf(stderr, "Mesh Input ");
1586 brw_print_tue_map(stderr, params->tue_map);
1587 }
1588 fprintf(stderr, "Mesh Output ");
1589 brw_print_mue_map(stderr, &prog_data->map, nir);
1590 }
1591
1592 fs_generator g(compiler, ¶ms->base, &prog_data->base.base,
1593 MESA_SHADER_MESH);
1594 if (unlikely(debug_enabled)) {
1595 g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
1596 "%s mesh shader %s",
1597 nir->info.label ? nir->info.label
1598 : "unnamed",
1599 nir->info.name));
1600 }
1601
1602 g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
1603 selected->performance_analysis.require(), params->base.stats);
1604 g.add_const_data(nir->constant_data, nir->constant_data_size);
1605 return g.get_assembly();
1606 }
1607