/*
 * Copyright © 2021 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <list>
#include <memory>
#include <vector>

#include "brw_compiler.h"
#include "brw_fs.h"
#include "brw_nir.h"
#include "brw_private.h"
#include "compiler/nir/nir_builder.h"
#include "dev/intel_debug.h"

using namespace brw;

static bool
brw_nir_lower_load_uniforms_filter(const nir_instr *instr,
                                   UNUSED const void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   return intrin->intrinsic == nir_intrinsic_load_uniform;
}

static nir_def *
brw_nir_lower_load_uniforms_impl(nir_builder *b, nir_instr *instr,
                                 UNUSED void *data)
{
   assert(instr->type == nir_instr_type_intrinsic);
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   assert(intrin->intrinsic == nir_intrinsic_load_uniform);

   /* Read the first few 32-bit scalars from InlineData. */
   if (nir_src_is_const(intrin->src[0]) &&
       intrin->def.bit_size == 32 &&
       intrin->def.num_components == 1) {
      unsigned off = nir_intrinsic_base(intrin) + nir_src_as_uint(intrin->src[0]);
      unsigned off_dw = off / 4;
      if (off % 4 == 0 && off_dw < BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW) {
         off_dw += BRW_TASK_MESH_PUSH_CONSTANTS_START_DW;
         return nir_load_mesh_inline_data_intel(b, 32, off_dw);
      }
   }

   return brw_nir_load_global_const(b, intrin,
                                    nir_load_mesh_inline_data_intel(b, 64, 0), 0);
}
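
/* Worked example (illustrative, not from the original source): a 32-bit
 * scalar load_uniform with base 8 and constant offset 4 reads byte 12,
 * i.e. push-constant dword 3.  Since 12 % 4 == 0, and assuming dword 3 is
 * below BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW, the load is rewritten into a
 * read of inline-data dword BRW_TASK_MESH_PUSH_CONSTANTS_START_DW + 3
 * instead of falling back to the 64-bit global-constant path.
 */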

static bool
brw_nir_lower_load_uniforms(nir_shader *nir)
{
   return nir_shader_lower_instructions(nir, brw_nir_lower_load_uniforms_filter,
                                        brw_nir_lower_load_uniforms_impl, NULL);
}

static inline int
type_size_scalar_dwords(const struct glsl_type *type, bool bindless)
{
   return glsl_count_dword_slots(type, bindless);
}

/* TODO(mesh): Make this a common function. */
static void
shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
   assert(glsl_type_is_vector_or_scalar(type));

   uint32_t comp_size = glsl_type_is_boolean(type)
      ? 4 : glsl_get_bit_size(type) / 8;
   unsigned length = glsl_get_vector_elements(type);
   *size = comp_size * length;
   *align = comp_size * (length == 3 ? 4 : length);
}
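
/* Worked example (illustrative): for a 32-bit vec3, comp_size is 4 and
 * length is 3, so *size = 12 while *align = 16 -- vec3 is padded to vec4
 * alignment.  A 32-bit vec2 gets *size = 8 and *align = 8.
 */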

static bool
brw_nir_lower_launch_mesh_workgroups_instr(nir_builder *b,
                                           nir_intrinsic_instr *intrin,
                                           void *data)
{
   if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups)
      return false;

   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *local_invocation_index = nir_load_local_invocation_index(b);

   /* Make sure that the mesh workgroup size is taken from the first
    * invocation (nir_intrinsic_launch_mesh_workgroups requirement).
    */
   nir_def *cmp = nir_ieq_imm(b, local_invocation_index, 0);
   nir_if *if_stmt = nir_push_if(b, cmp);
   {
      /* TUE header contains 4 words:
       *
       * - Word 0 for Task Count.
       *
       * - Words 1-3 used for "Dispatch Dimensions" feature, to allow mapping a
       *   3D dispatch into the 1D dispatch supported by HW.
       */
      nir_def *x = nir_channel(b, intrin->src[0].ssa, 0);
      nir_def *y = nir_channel(b, intrin->src[0].ssa, 1);
      nir_def *z = nir_channel(b, intrin->src[0].ssa, 2);
      nir_def *task_count = nir_imul(b, x, nir_imul(b, y, z));
      nir_def *tue_header = nir_vec4(b, task_count, x, y, z);
      nir_store_task_payload(b, tue_header, nir_imm_int(b, 0));
   }
   nir_pop_if(b, if_stmt);

   nir_instr_remove(&intrin->instr);

   return true;
}
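
/* Worked example (illustrative): launch_mesh_workgroups(4, 2, 3) makes
 * invocation 0 store the TUE header { 24, 4, 2, 3 } -- a task count of
 * 4 * 2 * 3 = 24 followed by the original 3D dispatch dimensions.
 */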

static bool
brw_nir_lower_launch_mesh_workgroups(nir_shader *nir)
{
   return nir_shader_intrinsics_pass(nir,
                                     brw_nir_lower_launch_mesh_workgroups_instr,
                                     nir_metadata_none,
                                     NULL);
}

static void
brw_nir_lower_tue_outputs(nir_shader *nir, brw_tue_map *map)
{
   memset(map, 0, sizeof(*map));

   NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
            type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);

   /* From bspec: "It is suggested that SW reserve the 16 bytes following the
    * TUE Header, and therefore start the SW-defined data structure at 32B
    * alignment.  This allows the TUE Header to always be written as 32 bytes
    * with 32B alignment, the most optimal write performance case."
    */
   map->per_task_data_start_dw = 8;

   /* Lowering to explicit types will start offsets from task_payload_size, so
    * set it to start after the header.
    */
   nir->info.task_payload_size = map->per_task_data_start_dw * 4;
   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
            nir_var_mem_task_payload, shared_type_info);
   NIR_PASS(_, nir, nir_lower_explicit_io,
            nir_var_mem_task_payload, nir_address_format_32bit_offset);

   map->size_dw = ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8);
}
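
/* Worked example (illustrative): with the 8-dword (32-byte) header and,
 * say, 100 bytes of per-task data, task_payload_size ends up at
 * 32 + 100 = 132 bytes, so size_dw = ALIGN(DIV_ROUND_UP(132, 4), 8)
 * = ALIGN(33, 8) = 40 dwords.
 */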

static void
brw_print_tue_map(FILE *fp, const struct brw_tue_map *map)
{
   fprintf(fp, "TUE (%d dwords)\n\n", map->size_dw);
}

static bool
brw_nir_adjust_task_payload_offsets_instr(struct nir_builder *b,
                                          nir_intrinsic_instr *intrin,
                                          void *data)
{
   switch (intrin->intrinsic) {
   case nir_intrinsic_store_task_payload:
   case nir_intrinsic_load_task_payload: {
      nir_src *offset_src = nir_get_io_offset_src(intrin);

      if (nir_src_is_const(*offset_src))
         assert(nir_src_as_uint(*offset_src) % 4 == 0);

      b->cursor = nir_before_instr(&intrin->instr);

      /* Regular I/O uses dwords while explicit I/O used for task payload uses
       * bytes.  Normalize it to dwords.
       *
       * TODO(mesh): Figure out how to handle 8-bit, 16-bit.
       */

      nir_def *offset = nir_ishr_imm(b, offset_src->ssa, 2);
      nir_src_rewrite(offset_src, offset);

      unsigned base = nir_intrinsic_base(intrin);
      assert(base % 4 == 0);
      nir_intrinsic_set_base(intrin, base / 4);

      return true;
   }

   default:
      return false;
   }
}
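
/* Worked example (illustrative): a store_task_payload with base 32 and a
 * byte offset of 16 is rewritten to base 8 (32 / 4) and offset 4 (16 >> 2),
 * matching the dword units used by regular I/O.
 */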

static bool
brw_nir_adjust_task_payload_offsets(nir_shader *nir)
{
   return nir_shader_intrinsics_pass(nir,
                                     brw_nir_adjust_task_payload_offsets_instr,
                                     nir_metadata_block_index |
                                     nir_metadata_dominance,
                                     NULL);
}

void
brw_nir_adjust_payload(nir_shader *shader)
{
   /* Adjustment of task payload offsets must be performed *after* the last
    * pass which interprets them as bytes, because it changes their unit.
    */
   bool adjusted = false;
   NIR_PASS(adjusted, shader, brw_nir_adjust_task_payload_offsets);
   if (adjusted) /* clean up the mess created by offset adjustments */
      NIR_PASS(_, shader, nir_opt_constant_folding);
}

static bool
brw_nir_align_launch_mesh_workgroups_instr(nir_builder *b,
                                           nir_intrinsic_instr *intrin,
                                           void *data)
{
   if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups)
      return false;

   /* nir_lower_task_shader uses "range" as the task payload size. */
   unsigned range = nir_intrinsic_range(intrin);
   /* This avoids a special case in nir_lower_task_shader for payloads that
    * are not vec4-aligned when the payload_in_shared workaround is enabled.
    */
   nir_intrinsic_set_range(intrin, ALIGN(range, 16));

   return true;
}
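
/* Worked example (illustrative): a 20-byte payload range becomes
 * ALIGN(20, 16) = 32 bytes, so nir_lower_task_shader never sees a payload
 * that is not vec4-aligned.
 */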

static bool
brw_nir_align_launch_mesh_workgroups(nir_shader *nir)
{
   return nir_shader_intrinsics_pass(nir,
                                     brw_nir_align_launch_mesh_workgroups_instr,
                                     nir_metadata_block_index |
                                     nir_metadata_dominance,
                                     NULL);
}

const unsigned *
brw_compile_task(const struct brw_compiler *compiler,
                 struct brw_compile_task_params *params)
{
   struct nir_shader *nir = params->base.nir;
   const struct brw_task_prog_key *key = params->key;
   struct brw_task_prog_data *prog_data = params->prog_data;
   const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TASK);

   brw_nir_lower_tue_outputs(nir, &prog_data->map);

   NIR_PASS(_, nir, brw_nir_align_launch_mesh_workgroups);

   nir_lower_task_shader_options lower_ts_opt = {
      .payload_to_shared_for_atomics = true,
      .payload_to_shared_for_small_types = true,
      /* The actual payload data starts after the TUE header and padding,
       * so skip those when copying.
       */
      .payload_offset_in_bytes = prog_data->map.per_task_data_start_dw * 4,
   };
   NIR_PASS(_, nir, nir_lower_task_shader, lower_ts_opt);

   NIR_PASS(_, nir, brw_nir_lower_launch_mesh_workgroups);

   prog_data->base.base.stage = MESA_SHADER_TASK;
   prog_data->base.base.total_shared = nir->info.shared_size;
   prog_data->base.base.total_scratch = 0;

   prog_data->base.local_size[0] = nir->info.workgroup_size[0];
   prog_data->base.local_size[1] = nir->info.workgroup_size[1];
   prog_data->base.local_size[2] = nir->info.workgroup_size[2];

   prog_data->uses_drawid =
      BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);

   brw_simd_selection_state simd_state{
      .devinfo = compiler->devinfo,
      .prog_data = &prog_data->base,
      .required_width = brw_required_dispatch_width(&nir->info),
   };
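
   /* Try compiling at each allowed SIMD width (dispatch_width = 8 << simd,
    * i.e. SIMD8/SIMD16/SIMD32); brw_simd_select() later picks among the
    * variants that compiled successfully.
    */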

   std::unique_ptr<fs_visitor> v[3];

   for (unsigned simd = 0; simd < 3; simd++) {
      if (!brw_simd_should_compile(simd_state, simd))
         continue;

      const unsigned dispatch_width = 8 << simd;

      nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
      brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);

      NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
      NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);

      brw_postprocess_nir(shader, compiler, debug_enabled,
                          key->base.robust_flags);

      v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
                                             &key->base,
                                             &prog_data->base.base,
                                             shader, dispatch_width,
                                             params->base.stats != NULL,
                                             debug_enabled);

      if (prog_data->base.prog_mask) {
         unsigned first = ffs(prog_data->base.prog_mask) - 1;
         v[simd]->import_uniforms(v[first].get());
      }

      const bool allow_spilling = !brw_simd_any_compiled(simd_state);
      if (v[simd]->run_task(allow_spilling))
         brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
      else
         simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
   }

   int selected_simd = brw_simd_select(simd_state);
   if (selected_simd < 0) {
      params->base.error_str =
         ralloc_asprintf(params->base.mem_ctx,
                         "Can't compile shader: "
                         "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
                         simd_state.error[0], simd_state.error[1],
                         simd_state.error[2]);
      return NULL;
   }

   fs_visitor *selected = v[selected_simd].get();
   prog_data->base.prog_mask = 1 << selected_simd;

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "Task Output ");
      brw_print_tue_map(stderr, &prog_data->map);
   }

   fs_generator g(compiler, &params->base, &prog_data->base.base,
                  MESA_SHADER_TASK);
   if (unlikely(debug_enabled)) {
      g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
                                     "%s task shader %s",
                                     nir->info.label ? nir->info.label
                                                     : "unnamed",
                                     nir->info.name));
   }

   g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
                   selected->performance_analysis.require(), params->base.stats);
   g.add_const_data(nir->constant_data, nir->constant_data_size);
   return g.get_assembly();
}

static void
brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map)
{
   if (!map)
      return;

   nir->info.task_payload_size = map->per_task_data_start_dw * 4;

   bool progress = false;

   NIR_PASS(progress, nir, nir_lower_vars_to_explicit_types,
            nir_var_mem_task_payload, shared_type_info);

   if (progress) {
      /* The types for Task Output and Mesh Input should match, so their sizes
       * should also match.
       */
      assert(map->size_dw == ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8));
   } else {
      /* Mesh doesn't read any input; to make that clearer, set
       * task_payload_size to zero instead of keeping an incomplete size that
       * just includes the header.
       */
      nir->info.task_payload_size = 0;
   }

   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_task_payload,
            nir_address_format_32bit_offset);
}

/* Attribute types. Flat attributes have to be a separate class because
 * flat and interpolated attributes can't share the same vec4 slot
 * (see 3DSTATE_SBE.ConstantInterpolationEnable).
 */
enum {
   PRIM, /* per primitive */
   VERT, /* per vertex interpolated */
   VERT_FLAT, /* per vertex flat */
};

struct attr_desc {
   int location;
   const struct glsl_type *type;
   unsigned dwords;
   unsigned slots;
};

struct attr_type_info {
   /* order of attributes, negative values are holes */
   std::list<struct attr_desc> *order;

   /* attributes after which there's a hole of size equal to the array index */
   std::list<int> holes[5];
};

static void
brw_mue_assign_position(const struct attr_desc *attr,
                        struct brw_mue_map *map,
                        unsigned start_dw)
{
   bool is_array = glsl_type_is_array(attr->type);
   int location = attr->location;
   unsigned remaining = attr->dwords;

   for (unsigned slot = 0; slot < attr->slots; ++slot) {
      map->start_dw[location + slot] = start_dw;

      unsigned sz;

      if (is_array) {
         assert(attr->dwords % attr->slots == 0);
         sz = attr->dwords / attr->slots;
      } else {
         sz = MIN2(remaining, 4);
      }

      map->len_dw[location + slot] = sz;
      start_dw += sz;
      remaining -= sz;
   }
}
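
/* Worked example (illustrative): a vec4[2] attribute has dwords == 8 and
 * slots == 2, so each of its two slots is assigned 4 dwords; a plain vec3
 * has dwords == 3 and slots == 1, so its single slot gets len_dw == 3.
 */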

static nir_variable *
brw_nir_find_complete_variable_with_location(nir_shader *shader,
                                             nir_variable_mode mode,
                                             int location)
{
   nir_variable *best_var = NULL;
   unsigned last_size = 0;

   nir_foreach_variable_with_modes(var, shader, mode) {
      if (var->data.location != location)
         continue;

      unsigned new_size = glsl_count_dword_slots(var->type, false);
      if (new_size > last_size) {
         best_var = var;
         last_size = new_size;
      }
   }

   return best_var;
}

static unsigned
brw_sum_size(const std::list<struct attr_desc> &orders)
{
   unsigned sz = 0;
   for (auto it = orders.cbegin(); it != orders.cend(); ++it)
      sz += (*it).dwords;
   return sz;
}

/* Finds order of outputs which require minimum size, without splitting
 * of URB read/write messages (which operate on vec4-aligned memory).
 */
static void
brw_compute_mue_layout(const struct brw_compiler *compiler,
                       std::list<struct attr_desc> *orders,
                       uint64_t outputs_written,
                       struct nir_shader *nir,
                       bool *pack_prim_data_into_header,
                       bool *pack_vert_data_into_header)
{
   const struct shader_info *info = &nir->info;

   struct attr_type_info data[3];

   if ((compiler->mesh.mue_header_packing & 1) == 0)
      *pack_prim_data_into_header = false;
   if ((compiler->mesh.mue_header_packing & 2) == 0)
      *pack_vert_data_into_header = false;

   for (unsigned i = PRIM; i <= VERT_FLAT; ++i)
      data[i].order = &orders[i];

   /* If packing into header is enabled, add a hole of size 4 and add
    * a virtual location to keep the algorithm happy (it expects holes
    * to be preceded by some location). We'll remove those virtual
    * locations at the end.
    */
   const gl_varying_slot virtual_header_location = VARYING_SLOT_POS;
   assert((outputs_written & BITFIELD64_BIT(virtual_header_location)) == 0);

   struct attr_desc d;
   d.location = virtual_header_location;
   d.type = NULL;
   d.dwords = 0;
   d.slots = 0;

   struct attr_desc h;
   h.location = -1;
   h.type = NULL;
   h.dwords = 4;
   h.slots = 0;

   if (*pack_prim_data_into_header) {
      orders[PRIM].push_back(d);
      orders[PRIM].push_back(h);
      data[PRIM].holes[4].push_back(virtual_header_location);
   }

   if (*pack_vert_data_into_header) {
      orders[VERT].push_back(d);
      orders[VERT].push_back(h);
      data[VERT].holes[4].push_back(virtual_header_location);
   }

   u_foreach_bit64(location, outputs_written) {
      if ((BITFIELD64_BIT(location) & outputs_written) == 0)
         continue;

      /* At this point there are both complete and split variables as
       * outputs. We need the complete variable to compute the required
       * size.
       */
      nir_variable *var =
            brw_nir_find_complete_variable_with_location(nir,
                                                         nir_var_shader_out,
                                                         location);

      d.location = location;
      d.type     = brw_nir_get_var_type(nir, var);
      d.dwords   = glsl_count_dword_slots(d.type, false);
      d.slots    = glsl_count_attribute_slots(d.type, false);

      struct attr_type_info *type_data;

      if (BITFIELD64_BIT(location) & info->per_primitive_outputs)
         type_data = &data[PRIM];
      else if (var->data.interpolation == INTERP_MODE_FLAT)
         type_data = &data[VERT_FLAT];
      else
         type_data = &data[VERT];

      std::list<struct attr_desc> *order = type_data->order;
      std::list<int> *holes = type_data->holes;

      outputs_written &= ~BITFIELD64_RANGE(location, d.slots);

      /* special case to use hole of size 4 */
      if (d.dwords == 4 && !holes[4].empty()) {
         holes[4].pop_back();

         assert(order->front().location == virtual_header_location);
         order->pop_front();

         assert(order->front().location == -1);
         assert(order->front().dwords == 4);
         order->front() = d;

         continue;
      }

      int mod = d.dwords % 4;
      if (mod == 0) {
         order->push_back(d);
         continue;
      }

      h.location = -1;
      h.type = NULL;
      h.dwords = 4 - mod;
      h.slots = 0;

      if (!compiler->mesh.mue_compaction) {
         order->push_back(d);
         order->push_back(h);
         continue;
      }

      if (d.dwords > 4) {
         order->push_back(d);
         order->push_back(h);
         holes[h.dwords].push_back(location);
         continue;
      }

      assert(d.dwords < 4);

      unsigned found = 0;
      /* try to find the smallest hole big enough to hold this attribute */
      for (unsigned sz = d.dwords; sz <= 4; sz++) {
         if (!holes[sz].empty()) {
            found = sz;
            break;
         }
      }

      /* append at the end if not found */
      if (found == 0) {
         order->push_back(d);
         order->push_back(h);
         holes[h.dwords].push_back(location);

         continue;
      }

      assert(found <= 4);
      assert(!holes[found].empty());
      int after_loc = holes[found].back();
      holes[found].pop_back();

      bool inserted_back = false;

      for (auto it = order->begin(); it != order->end(); ++it) {
         if ((*it).location != after_loc)
            continue;

         ++it;
         /* must be a hole */
         assert((*it).location < 0);
         /* and it must be big enough */
         assert(d.dwords <= (*it).dwords);

         if (d.dwords == (*it).dwords) {
            /* exact size, just replace */
            *it = d;
         } else {
            /* inexact size, shrink hole */
            (*it).dwords -= d.dwords;
            /* and insert new attribute before it */
            order->insert(it, d);

            /* Insert shrunk hole in a spot so that the order of attributes
             * is preserved.
             */
            std::list<int> &hole_list = holes[(*it).dwords];
            std::list<int>::iterator insert_before = hole_list.end();

            for (auto it2 = hole_list.begin(); it2 != hole_list.end(); ++it2) {
               if ((*it2) >= (int)location) {
                  insert_before = it2;
                  break;
               }
            }

            hole_list.insert(insert_before, location);
         }

         inserted_back = true;
         break;
      }

      assert(inserted_back);
   }

   if (*pack_prim_data_into_header) {
      if (orders[PRIM].front().location == virtual_header_location)
         orders[PRIM].pop_front();

      if (!data[PRIM].holes[4].empty()) {
         *pack_prim_data_into_header = false;

         assert(orders[PRIM].front().location == -1);
         assert(orders[PRIM].front().dwords == 4);
         orders[PRIM].pop_front();
      }

      if (*pack_prim_data_into_header) {
         unsigned sz = brw_sum_size(orders[PRIM]);

         if (sz % 8 == 0 || sz % 8 > 4)
            *pack_prim_data_into_header = false;
      }
   }

   if (*pack_vert_data_into_header) {
      if (orders[VERT].front().location == virtual_header_location)
         orders[VERT].pop_front();

      if (!data[VERT].holes[4].empty()) {
         *pack_vert_data_into_header = false;

         assert(orders[VERT].front().location == -1);
         assert(orders[VERT].front().dwords == 4);
         orders[VERT].pop_front();
      }

      if (*pack_vert_data_into_header) {
         unsigned sz = brw_sum_size(orders[VERT]) +
                       brw_sum_size(orders[VERT_FLAT]);

         if (sz % 8 == 0 || sz % 8 > 4)
            *pack_vert_data_into_header = false;
      }
   }

   if (INTEL_DEBUG(DEBUG_MESH)) {
      fprintf(stderr, "MUE attribute order:\n");
      for (unsigned i = PRIM; i <= VERT_FLAT; ++i) {
         if (!orders[i].empty())
            fprintf(stderr, "%d: ", i);
         for (auto it = orders[i].cbegin(); it != orders[i].cend(); ++it) {
            fprintf(stderr, "%d(%d) ", (*it).location, (*it).dwords);
         }
         if (!orders[i].empty())
            fprintf(stderr, "\n");
      }
   }
}
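
/* Worked example (illustrative): with MUE compaction enabled, a vec3
 * output (3 dwords) is followed by a 1-dword hole to keep vec4 alignment,
 * and that hole is remembered in holes[1].  If a scalar float output
 * (1 dword) comes along later, it is placed into that hole instead of
 * starting a new vec4, so the two attributes share a single vec4 slot.
 */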

/* Mesh URB Entry consists of an initial section
 *
 *  - Primitive Count
 *  - Primitive Indices (from 0 to Max-1)
 *  - Padding to 32B if needed
 *
 * optionally followed by a section for per-primitive data,
 * in which each primitive (from 0 to Max-1) gets
 *
 *  - Primitive Header (e.g. ViewportIndex)
 *  - Primitive Custom Attributes
 *
 * then followed by a section for per-vertex data
 *
 *  - Vertex Header (e.g. Position)
 *  - Vertex Custom Attributes
 *
 * Each per-element section has a pitch and a starting offset.  All the
 * individual attribute offsets in start_dw are relative to the first entry
 * of the section (i.e. where the Position for the first vertex, or the
 * ViewportIndex for the first primitive, lives).  Attributes for other
 * elements are calculated using the pitch.
 */
static void
brw_compute_mue_map(const struct brw_compiler *compiler,
                    struct nir_shader *nir, struct brw_mue_map *map,
                    enum brw_mesh_index_format index_format, bool compact_mue)
{
   memset(map, 0, sizeof(*map));

   memset(&map->start_dw[0], -1, sizeof(map->start_dw));
   memset(&map->len_dw[0], 0, sizeof(map->len_dw));

   unsigned vertices_per_primitive =
      mesa_vertices_per_prim(nir->info.mesh.primitive_type);

   map->max_primitives = nir->info.mesh.max_primitives_out;
   map->max_vertices = nir->info.mesh.max_vertices_out;

   uint64_t outputs_written = nir->info.outputs_written;

   /* One dword for the primitive count, then K extra dwords for each
    * primitive.
    */
   switch (index_format) {
   case BRW_INDEX_FORMAT_U32:
      map->per_primitive_indices_dw = vertices_per_primitive;
      break;
   case BRW_INDEX_FORMAT_U888X:
      map->per_primitive_indices_dw = 1;
      break;
   default:
      unreachable("invalid index format");
   }

   map->per_primitive_start_dw = ALIGN(map->per_primitive_indices_dw *
                                       map->max_primitives + 1, 8);
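
   /* Worked example (illustrative): triangles with BRW_INDEX_FORMAT_U32
    * use 3 dwords of indices per primitive, so with max_primitives == 16
    * the per-primitive section starts at ALIGN(3 * 16 + 1, 8) == 56 dwords.
    */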

   /* Assign initial section. */
   if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
      map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
      map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 1;
      outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
   }
   if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
      map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
      map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] =
            map->per_primitive_indices_dw * map->max_primitives;
      outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
   }

   const uint64_t per_primitive_header_bits =
         BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
         BITFIELD64_BIT(VARYING_SLOT_LAYER) |
         BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
         BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);

   const uint64_t per_vertex_header_bits =
         BITFIELD64_BIT(VARYING_SLOT_PSIZ) |
         BITFIELD64_BIT(VARYING_SLOT_POS) |
         BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0) |
         BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);

   std::list<struct attr_desc> orders[3];
   uint64_t regular_outputs = outputs_written &
         ~(per_primitive_header_bits | per_vertex_header_bits);

   /* Packing into the primitive header is possible only if the primitive
    * header is present.
    */
   map->user_data_in_primitive_header = compact_mue &&
         (outputs_written & per_primitive_header_bits) != 0;

   /* Packing into the vertex header is always possible, but we allow it only
    * if the full vec4 is available (so point size is not used) and there's
    * nothing between it and normal vertex data (so no clip distances).
    */
   map->user_data_in_vertex_header = compact_mue &&
         (outputs_written & per_vertex_header_bits) ==
               BITFIELD64_BIT(VARYING_SLOT_POS);

   if (outputs_written & per_primitive_header_bits) {
      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
         map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] =
               map->per_primitive_start_dw + 0;
         map->len_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 1;
      }

      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER)) {
         map->start_dw[VARYING_SLOT_LAYER] =
               map->per_primitive_start_dw + 1; /* RTAIndex */
         map->len_dw[VARYING_SLOT_LAYER] = 1;
      }

      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)) {
         map->start_dw[VARYING_SLOT_VIEWPORT] =
               map->per_primitive_start_dw + 2;
         map->len_dw[VARYING_SLOT_VIEWPORT] = 1;
      }

      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE)) {
         map->start_dw[VARYING_SLOT_CULL_PRIMITIVE] =
               map->per_primitive_start_dw + 3;
         map->len_dw[VARYING_SLOT_CULL_PRIMITIVE] = 1;
      }

      map->per_primitive_header_size_dw = 8;
      outputs_written &= ~per_primitive_header_bits;
   } else {
      map->per_primitive_header_size_dw = 0;
   }

   map->per_primitive_data_size_dw = 0;

   /* For fast linked libraries, we can't pack the MUE, as the fragment shader
    * will be compiled without access to the MUE map and won't be able to find
    * out where everything is.
    * Instead, keep doing things as we did before the packing, just laying out
    * everything in varying order, which is how the FS will expect them.
    */
   if (compact_mue) {
      brw_compute_mue_layout(compiler, orders, regular_outputs, nir,
                             &map->user_data_in_primitive_header,
                             &map->user_data_in_vertex_header);

      unsigned start_dw = map->per_primitive_start_dw;
      if (map->user_data_in_primitive_header)
         start_dw += 4; /* first 4 dwords are used */
      else
         start_dw += map->per_primitive_header_size_dw;
      unsigned header_used_dw = 0;

      for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) {
         int location = (*it).location;
         if (location < 0) {
            start_dw += (*it).dwords;
            if (map->user_data_in_primitive_header && header_used_dw < 4)
               header_used_dw += (*it).dwords;
            else
               map->per_primitive_data_size_dw += (*it).dwords;
            assert(header_used_dw <= 4);
            continue;
         }

         assert(map->start_dw[location] == -1);

         assert(location == VARYING_SLOT_PRIMITIVE_ID ||
                location >= VARYING_SLOT_VAR0);

         brw_mue_assign_position(&*it, map, start_dw);

         start_dw += (*it).dwords;
         if (map->user_data_in_primitive_header && header_used_dw < 4)
            header_used_dw += (*it).dwords;
         else
            map->per_primitive_data_size_dw += (*it).dwords;
         assert(header_used_dw <= 4);
         outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
      }
   } else {
      unsigned start_dw = map->per_primitive_start_dw +
                          map->per_primitive_header_size_dw;

      uint64_t per_prim_outputs = outputs_written & nir->info.per_primitive_outputs;
      while (per_prim_outputs) {
         uint64_t location = ffsll(per_prim_outputs) - 1;

         assert(map->start_dw[location] == -1);
         assert(location == VARYING_SLOT_PRIMITIVE_ID ||
                location >= VARYING_SLOT_VAR0);

         nir_variable *var =
            brw_nir_find_complete_variable_with_location(nir,
                                                         nir_var_shader_out,
                                                         location);
         struct attr_desc d;
         d.location = location;
         d.type     = brw_nir_get_var_type(nir, var);
         d.dwords   = glsl_count_dword_slots(d.type, false);
         d.slots    = glsl_count_attribute_slots(d.type, false);

         brw_mue_assign_position(&d, map, start_dw);

         map->per_primitive_data_size_dw += ALIGN(d.dwords, 4);
         start_dw += ALIGN(d.dwords, 4);

         per_prim_outputs &= ~BITFIELD64_RANGE(location, d.slots);
      }
   }

   map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
                                       map->per_primitive_data_size_dw, 8);

   map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw +
                                    map->per_primitive_pitch_dw *
                                    map->max_primitives, 8);

   /* TODO(mesh): Multiview. */
   unsigned fixed_header_size = 8;
   map->per_vertex_header_size_dw = ALIGN(fixed_header_size +
                                          nir->info.clip_distance_array_size +
                                          nir->info.cull_distance_array_size, 8);

   if (outputs_written & per_vertex_header_bits) {
      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PSIZ)) {
         map->start_dw[VARYING_SLOT_PSIZ] = map->per_vertex_start_dw + 3;
         map->len_dw[VARYING_SLOT_PSIZ] = 1;
      }

      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_POS)) {
         map->start_dw[VARYING_SLOT_POS] = map->per_vertex_start_dw + 4;
         map->len_dw[VARYING_SLOT_POS] = 4;
      }

      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)) {
         map->start_dw[VARYING_SLOT_CLIP_DIST0] =
               map->per_vertex_start_dw + fixed_header_size + 0;
         map->len_dw[VARYING_SLOT_CLIP_DIST0] = 4;
      }

      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)) {
         map->start_dw[VARYING_SLOT_CLIP_DIST1] =
               map->per_vertex_start_dw + fixed_header_size + 4;
         map->len_dw[VARYING_SLOT_CLIP_DIST1] = 4;
      }

      outputs_written &= ~per_vertex_header_bits;
   }

   /* cull distances should be lowered earlier */
   assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)));
   assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)));

   map->per_vertex_data_size_dw = 0;

   /* For fast linked libraries, we can't pack the MUE, as the fragment shader
    * will be compiled without access to the MUE map and won't be able to find
    * out where everything is.
    * Instead, keep doing things as we did before the packing, just laying out
    * everything in varying order, which is how the FS will expect them.
    */
   if (compact_mue) {
      unsigned start_dw = map->per_vertex_start_dw;
      if (!map->user_data_in_vertex_header)
         start_dw += map->per_vertex_header_size_dw;

      unsigned header_used_dw = 0;
      for (unsigned type = VERT; type <= VERT_FLAT; ++type) {
         for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) {
            int location = (*it).location;
            if (location < 0) {
               start_dw += (*it).dwords;
               if (map->user_data_in_vertex_header && header_used_dw < 4) {
                  header_used_dw += (*it).dwords;
                  assert(header_used_dw <= 4);
                  if (header_used_dw == 4)
                     start_dw += 4; /* jump over gl_Position */
               } else {
                  map->per_vertex_data_size_dw += (*it).dwords;
               }
               continue;
            }

            assert(map->start_dw[location] == -1);

            assert(location >= VARYING_SLOT_VAR0);

            brw_mue_assign_position(&*it, map, start_dw);

            start_dw += (*it).dwords;
            if (map->user_data_in_vertex_header && header_used_dw < 4) {
               header_used_dw += (*it).dwords;
               assert(header_used_dw <= 4);
               if (header_used_dw == 4)
                  start_dw += 4; /* jump over gl_Position */
            } else {
               map->per_vertex_data_size_dw += (*it).dwords;
            }
            outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots);
         }
      }
   } else {
      unsigned start_dw = map->per_vertex_start_dw +
                          map->per_vertex_header_size_dw;

      uint64_t per_vertex_outputs = outputs_written & ~nir->info.per_primitive_outputs;
      while (per_vertex_outputs) {
         uint64_t location = ffsll(per_vertex_outputs) - 1;

         assert(map->start_dw[location] == -1);
         assert(location >= VARYING_SLOT_VAR0);

         nir_variable *var =
            brw_nir_find_complete_variable_with_location(nir,
                                                         nir_var_shader_out,
                                                         location);
         struct attr_desc d;
         d.location = location;
         d.type     = brw_nir_get_var_type(nir, var);
         d.dwords   = glsl_count_dword_slots(d.type, false);
         d.slots    = glsl_count_attribute_slots(d.type, false);

         brw_mue_assign_position(&d, map, start_dw);

         map->per_vertex_data_size_dw += ALIGN(d.dwords, 4);
         start_dw += ALIGN(d.dwords, 4);

         per_vertex_outputs &= ~BITFIELD64_RANGE(location, d.slots);
      }
   }

   map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw +
                                    map->per_vertex_data_size_dw, 8);

   map->size_dw =
      map->per_vertex_start_dw + map->per_vertex_pitch_dw * map->max_vertices;

   assert(map->size_dw % 8 == 0);
}
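
/* Worked example (illustrative): with an 8-dword per-vertex header, no
 * clip/cull distances and 6 dwords of per-vertex data, the per-vertex
 * pitch is ALIGN(8 + 6, 8) = 16 dwords, and with max_vertices == 32 the
 * whole MUE ends at per_vertex_start_dw + 16 * 32 dwords.
 */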

static void
brw_print_mue_map(FILE *fp, const struct brw_mue_map *map, struct nir_shader *nir)
{
   fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n",
           map->size_dw, map->max_primitives, map->max_vertices);
   fprintf(fp, "  <%4d, %4d>: VARYING_SLOT_PRIMITIVE_COUNT\n",
           map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT],
           map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] +
           map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] - 1);
   fprintf(fp, "  <%4d, %4d>: VARYING_SLOT_PRIMITIVE_INDICES\n",
           map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES],
           map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] +
           map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] - 1);

   fprintf(fp, "  ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n",
           map->per_primitive_start_dw,
           map->per_primitive_header_size_dw,
           map->per_primitive_data_size_dw,
           map->per_primitive_pitch_dw);

   for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
      if (map->start_dw[i] < 0)
         continue;

      const unsigned offset = map->start_dw[i];
      const unsigned len = map->len_dw[i];

      if (offset < map->per_primitive_start_dw ||
          offset >= map->per_primitive_start_dw + map->per_primitive_pitch_dw)
         continue;

      const char *name =
            gl_varying_slot_name_for_stage((gl_varying_slot)i,
                                           MESA_SHADER_MESH);

      fprintf(fp, "  <%4d, %4d>: %s (%d)\n", offset, offset + len - 1,
              name, i);
   }

   fprintf(fp, "  ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n",
           map->per_vertex_start_dw,
           map->per_vertex_header_size_dw,
           map->per_vertex_data_size_dw,
           map->per_vertex_pitch_dw);

   for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
      if (map->start_dw[i] < 0)
         continue;

      const unsigned offset = map->start_dw[i];
      const unsigned len = map->len_dw[i];

      if (offset < map->per_vertex_start_dw ||
          offset >= map->per_vertex_start_dw + map->per_vertex_pitch_dw)
         continue;

      nir_variable *var =
            nir_find_variable_with_location(nir, nir_var_shader_out, i);
      bool flat = var->data.interpolation == INTERP_MODE_FLAT;

      const char *name =
            gl_varying_slot_name_for_stage((gl_varying_slot)i,
                                           MESA_SHADER_MESH);

      fprintf(fp, "  <%4d, %4d>: %s (%d)%s\n", offset, offset + len - 1,
              name, i, flat ? " (flat)" : "");
   }

   fprintf(fp, "\n");
}

static void
brw_nir_lower_mue_outputs(nir_shader *nir, const struct brw_mue_map *map)
{
   nir_foreach_shader_out_variable(var, nir) {
      int location = var->data.location;
      assert(location >= 0);
      assert(map->start_dw[location] != -1);
      var->data.driver_location = map->start_dw[location];
   }

   NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
            type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);
}

static void
brw_nir_initialize_mue(nir_shader *nir,
                       const struct brw_mue_map *map,
                       unsigned dispatch_width)
{
   assert(map->per_primitive_header_size_dw > 0);

   nir_builder b;
   nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
   b = nir_builder_at(nir_before_impl(entrypoint));

   nir_def *dw_off = nir_imm_int(&b, 0);
   nir_def *zerovec = nir_imm_vec4(&b, 0, 0, 0, 0);

   /* TODO(mesh): can we write in bigger batches, generating fewer SENDs? */

   assert(!nir->info.workgroup_size_variable);
   const unsigned workgroup_size = nir->info.workgroup_size[0] *
                                   nir->info.workgroup_size[1] *
                                   nir->info.workgroup_size[2];

   /* Invocations from a single workgroup will cooperate in zeroing MUE. */

   /* How many prims does each invocation need to cover without checking its
    * index?
    */
   unsigned prims_per_inv = map->max_primitives / workgroup_size;

   /* Zero the first 4 dwords of the MUE Primitive Header:
    * Reserved, RTAIndex, ViewportIndex, CullPrimitiveMask.
    */

   nir_def *local_invocation_index = nir_load_local_invocation_index(&b);

   /* Zero primitive headers spaced workgroup_size apart, starting from the
    * invocation index.
    */
   for (unsigned prim_in_inv = 0; prim_in_inv < prims_per_inv; ++prim_in_inv) {
      nir_def *prim = nir_iadd_imm(&b, local_invocation_index,
                                       prim_in_inv * workgroup_size);

      nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
                                     .base = (int)map->per_primitive_start_dw,
                                     .write_mask = WRITEMASK_XYZW,
                                     .component = 0,
                                     .src_type = nir_type_uint32);
   }

   /* How many prims are left? */
   unsigned remaining = map->max_primitives % workgroup_size;

   if (remaining) {
      /* Zero "remaining" primitive headers starting from the last one covered
       * by the loop above + workgroup_size.
       */
      nir_def *cmp = nir_ilt_imm(&b, local_invocation_index, remaining);
      nir_if *if_stmt = nir_push_if(&b, cmp);
      {
         nir_def *prim = nir_iadd_imm(&b, local_invocation_index,
                                          prims_per_inv * workgroup_size);

         nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
                                        .base = (int)map->per_primitive_start_dw,
                                        .write_mask = WRITEMASK_XYZW,
                                        .component = 0,
                                        .src_type = nir_type_uint32);
      }
      nir_pop_if(&b, if_stmt);
   }

   /* If there's more than one subgroup, then we need to wait for all of them
    * to finish initialization before we can proceed. Otherwise some subgroups
    * may start filling the MUE before others have finished initializing.
    */
   if (workgroup_size > dispatch_width) {
      nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP,
                  NIR_MEMORY_ACQ_REL, nir_var_shader_out);
   }

   if (remaining) {
      nir_metadata_preserve(entrypoint, nir_metadata_none);
   } else {
      nir_metadata_preserve(entrypoint, nir_metadata_block_index |
                                        nir_metadata_dominance);
   }
}
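
/* Worked example (illustrative): with workgroup_size == 16 and
 * max_primitives == 40, each invocation zeroes 40 / 16 == 2 primitive
 * headers unconditionally, and the remaining 40 % 16 == 8 headers are
 * zeroed only by invocations 0..7 inside the guarded block above.
 */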

static void
brw_nir_adjust_offset(nir_builder *b, nir_intrinsic_instr *intrin, uint32_t pitch)
{
   nir_src *index_src = nir_get_io_arrayed_index_src(intrin);
   nir_src *offset_src = nir_get_io_offset_src(intrin);

   b->cursor = nir_before_instr(&intrin->instr);
   nir_def *offset =
      nir_iadd(b,
               offset_src->ssa,
               nir_imul_imm(b, index_src->ssa, pitch));
   nir_src_rewrite(offset_src, offset);
}
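
/* Worked example (illustrative): with per_vertex_pitch_dw == 16, an access
 * to vertex 5 at dword offset 2 becomes offset 2 + 5 * 16 == 82, flattening
 * the arrayed index into a plain dword offset into the MUE.
 */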

static bool
brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder *b,
                                                nir_intrinsic_instr *intrin,
                                                void *data)
{
   const struct brw_mue_map *map = (const struct brw_mue_map *) data;

   /* Remap per_vertex and per_primitive offsets using the extra source and
    * the pitch.
    */
   switch (intrin->intrinsic) {
   case nir_intrinsic_load_per_vertex_output:
   case nir_intrinsic_store_per_vertex_output:
      brw_nir_adjust_offset(b, intrin, map->per_vertex_pitch_dw);

      return true;

   case nir_intrinsic_load_per_primitive_output:
   case nir_intrinsic_store_per_primitive_output: {
      struct nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
      uint32_t pitch;
      if (sem.location == VARYING_SLOT_PRIMITIVE_INDICES)
         pitch = map->per_primitive_indices_dw;
      else
         pitch = map->per_primitive_pitch_dw;

      brw_nir_adjust_offset(b, intrin, pitch);

      return true;
   }

   default:
      return false;
   }
}

static bool
brw_nir_adjust_offset_for_arrayed_indices(nir_shader *nir, const struct brw_mue_map *map)
{
   return nir_shader_intrinsics_pass(nir,
                                     brw_nir_adjust_offset_for_arrayed_indices_instr,
                                     nir_metadata_block_index |
                                     nir_metadata_dominance,
                                     (void *)map);
}

struct index_packing_state {
   unsigned vertices_per_primitive;
   nir_variable *original_prim_indices;
   nir_variable *packed_prim_indices;
};
1311 
1312 static bool
brw_can_pack_primitive_indices(nir_shader * nir,struct index_packing_state * state)1313 brw_can_pack_primitive_indices(nir_shader *nir, struct index_packing_state *state)
1314 {
1315    /* can single index fit into one byte of U888X format? */
1316    if (nir->info.mesh.max_vertices_out > 255)
1317       return false;
1318 
1319    state->vertices_per_primitive =
1320          mesa_vertices_per_prim(nir->info.mesh.primitive_type);
1321    /* packing point indices doesn't help */
1322    if (state->vertices_per_primitive == 1)
1323       return false;
1324 
1325    state->original_prim_indices =
1326       nir_find_variable_with_location(nir,
1327                                       nir_var_shader_out,
1328                                       VARYING_SLOT_PRIMITIVE_INDICES);
1329    /* no indices = no changes to the shader, but it's still worth it,
1330     * because less URB space will be used
1331     */
1332    if (!state->original_prim_indices)
1333       return true;
1334 
1335    ASSERTED const struct glsl_type *type = state->original_prim_indices->type;
1336    assert(glsl_type_is_array(type));
1337    assert(glsl_type_is_vector(glsl_without_array(type)));
1338    assert(glsl_without_array(type)->vector_elements == state->vertices_per_primitive);
1339 
1340    nir_foreach_function_impl(impl, nir) {
1341       nir_foreach_block(block, impl) {
1342          nir_foreach_instr(instr, block) {
1343             if (instr->type != nir_instr_type_intrinsic)
1344                continue;
1345 
1346             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1347 
1348             if (intrin->intrinsic != nir_intrinsic_store_deref) {
1349                /* any unknown deref operation on primitive indices -> don't pack */
1350                unsigned num_srcs = nir_intrinsic_infos[intrin->intrinsic].num_srcs;
1351                for (unsigned i = 0; i < num_srcs; i++) {
1352                   nir_deref_instr *deref = nir_src_as_deref(intrin->src[i]);
1353                   if (!deref)
1354                      continue;
1355                   nir_variable *var = nir_deref_instr_get_variable(deref);
1356 
1357                   if (var == state->original_prim_indices)
1358                      return false;
1359                }
1360 
1361                continue;
1362             }
1363 
1364             nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
1365             if (!deref)
1366                continue;
1367 
1368             nir_variable *var = nir_deref_instr_get_variable(deref);
1369             if (var != state->original_prim_indices)
1370                continue;
1371 
1372             if (deref->deref_type != nir_deref_type_array)
1373                return false; /* unknown chain of derefs */
1374 
1375             nir_deref_instr *var_deref = nir_src_as_deref(deref->parent);
1376             if (!var_deref || var_deref->deref_type != nir_deref_type_var)
1377                return false; /* unknown chain of derefs */
1378 
1379             assert (var_deref->var == state->original_prim_indices);
1380 
1381             unsigned write_mask = nir_intrinsic_write_mask(intrin);
1382 
            /* If only some components are written, we can't easily pack.
             * In theory we could, by loading the current dword, masking in
             * the written byte and storing the whole dword back, but it
             * would be slow and could actually decrease performance.
             * TODO: reevaluate once something actually hits this path.
             */
            if (write_mask != BITFIELD_MASK(state->vertices_per_primitive))
               return false;
         }
      }
   }

   return true;
}

static bool
brw_pack_primitive_indices_instr(nir_builder *b, nir_intrinsic_instr *intrin,
                                 void *data)
{
   if (intrin->intrinsic != nir_intrinsic_store_deref)
      return false;

   nir_deref_instr *array_deref = nir_src_as_deref(intrin->src[0]);
   if (!array_deref || array_deref->deref_type != nir_deref_type_array)
      return false;

   nir_deref_instr *var_deref = nir_src_as_deref(array_deref->parent);
   if (!var_deref || var_deref->deref_type != nir_deref_type_var)
      return false;

   struct index_packing_state *state =
         (struct index_packing_state *)data;

   nir_variable *var = var_deref->var;

   if (var != state->original_prim_indices)
      return false;

   unsigned vertices_per_primitive = state->vertices_per_primitive;

   b->cursor = nir_before_instr(&intrin->instr);

   nir_deref_instr *new_var_deref =
         nir_build_deref_var(b, state->packed_prim_indices);
   nir_deref_instr *new_array_deref =
         nir_build_deref_array(b, new_var_deref, array_deref->arr.index.ssa);

   nir_def *data_def = intrin->src[1].ssa;

   nir_def *new_data =
         nir_ior(b, nir_ishl_imm(b, nir_channel(b, data_def, 0), 0),
                    nir_ishl_imm(b, nir_channel(b, data_def, 1), 8));

   if (vertices_per_primitive >= 3) {
      new_data =
            nir_ior(b, new_data,
                       nir_ishl_imm(b, nir_channel(b, data_def, 2), 16));
   }
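
   /* Worked example (illustrative values): for a triangle with vertex
    * indices (3, 7, 9), new_data is
    *    (3 << 0) | (7 << 8) | (9 << 16) = 0x00090703,
    * i.e. one index per byte of the U888X dword, with the top byte unused.
    */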

   nir_build_store_deref(b, &new_array_deref->def, new_data);

   nir_instr_remove(&intrin->instr);

   return true;
}

static bool
brw_pack_primitive_indices(nir_shader *nir, void *data)
{
   struct index_packing_state *state = (struct index_packing_state *)data;

   const struct glsl_type *new_type =
         glsl_array_type(glsl_uint_type(),
                         nir->info.mesh.max_primitives_out,
                         0);

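   /* The packed indices use one dword per primitive instead of one vector
    * of 32-bit indices (e.g. a uvec3 for triangles), so they take roughly
    * a third of the URB space for triangle meshes.
    */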
   state->packed_prim_indices =
         nir_variable_create(nir, nir_var_shader_out,
                             new_type, "gl_PrimitiveIndicesPacked");
   state->packed_prim_indices->data.location = VARYING_SLOT_PRIMITIVE_INDICES;
   state->packed_prim_indices->data.interpolation = INTERP_MODE_NONE;
   state->packed_prim_indices->data.per_primitive = 1;

   return nir_shader_intrinsics_pass(nir, brw_pack_primitive_indices_instr,
                                     nir_metadata_block_index |
                                     nir_metadata_dominance,
                                     data);
}

const unsigned *
brw_compile_mesh(const struct brw_compiler *compiler,
                 struct brw_compile_mesh_params *params)
{
   struct nir_shader *nir = params->base.nir;
   const struct brw_mesh_prog_key *key = params->key;
   struct brw_mesh_prog_data *prog_data = params->prog_data;
   const bool debug_enabled = brw_should_print_shader(nir, DEBUG_MESH);

   prog_data->base.base.stage = MESA_SHADER_MESH;
   prog_data->base.base.total_shared = nir->info.shared_size;
   prog_data->base.base.total_scratch = 0;

   prog_data->base.local_size[0] = nir->info.workgroup_size[0];
   prog_data->base.local_size[1] = nir->info.workgroup_size[1];
   prog_data->base.local_size[2] = nir->info.workgroup_size[2];

   prog_data->clip_distance_mask = (1 << nir->info.clip_distance_array_size) - 1;
   prog_data->cull_distance_mask =
         ((1 << nir->info.cull_distance_array_size) - 1) <<
          nir->info.clip_distance_array_size;
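   /* E.g. (illustrative counts): with 4 clip distances and 2 cull
    * distances, clip_distance_mask = 0b001111 and
    * cull_distance_mask = 0b110000; the cull bits sit immediately above
    * the clip bits, matching the combined clip/cull distance array.
    */
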
   prog_data->primitive_type = nir->info.mesh.primitive_type;

   struct index_packing_state index_packing_state = {};
   if (brw_can_pack_primitive_indices(nir, &index_packing_state)) {
      if (index_packing_state.original_prim_indices)
         NIR_PASS(_, nir, brw_pack_primitive_indices, &index_packing_state);
      prog_data->index_format = BRW_INDEX_FORMAT_U888X;
   } else {
      prog_data->index_format = BRW_INDEX_FORMAT_U32;
   }

   prog_data->uses_drawid =
      BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);

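   /* TUE = Task URB Entry (inputs written by the task shader, if any),
    * MUE = Mesh URB Entry (this shader's outputs).
    */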
   brw_nir_lower_tue_inputs(nir, params->tue_map);

   brw_compute_mue_map(compiler, nir, &prog_data->map,
                       prog_data->index_format, key->compact_mue);
   brw_nir_lower_mue_outputs(nir, &prog_data->map);

   brw_simd_selection_state simd_state{
      .devinfo = compiler->devinfo,
      .prog_data = &prog_data->base,
      .required_width = brw_required_dispatch_width(&nir->info),
   };

   std::unique_ptr<fs_visitor> v[3];

   for (int simd = 0; simd < 3; simd++) {
      if (!brw_simd_should_compile(simd_state, simd))
         continue;

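      /* simd = 0, 1, 2 corresponds to dispatch widths 8, 16 and 32. */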
      const unsigned dispatch_width = 8 << simd;

      nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);

      /*
       * When the Primitive Header is enabled, the shader may not generate
       * writes to all of its fields, so initialize everything.
       */
      if (prog_data->map.per_primitive_header_size_dw > 0)
         NIR_PASS_V(shader, brw_nir_initialize_mue, &prog_data->map, dispatch_width);

      brw_nir_apply_key(shader, compiler, &key->base, dispatch_width);

      NIR_PASS(_, shader, brw_nir_adjust_offset_for_arrayed_indices, &prog_data->map);
      /* The load_uniform lowering does a better job with constant offsets,
       * so fold constants first.
       */
      NIR_PASS(_, shader, nir_opt_constant_folding);
      NIR_PASS(_, shader, brw_nir_lower_load_uniforms);

      NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);

      brw_postprocess_nir(shader, compiler, debug_enabled,
                          key->base.robust_flags);

      v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
                                             &key->base,
                                             &prog_data->base.base,
                                             shader, dispatch_width,
                                             params->base.stats != NULL,
                                             debug_enabled);

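      /* If an earlier SIMD variant already compiled, reuse its uniform
       * layout so all variants agree on it (prog_mask has one bit per
       * compiled variant; ffs() finds the first one).
       */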
      if (prog_data->base.prog_mask) {
         unsigned first = ffs(prog_data->base.prog_mask) - 1;
         v[simd]->import_uniforms(v[first].get());
      }

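      /* Spilling is only allowed while no SIMD variant has succeeded yet;
       * once a narrower variant works, a wider variant that has to spill
       * isn't worth keeping.
       */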
      const bool allow_spilling = !brw_simd_any_compiled(simd_state);
      if (v[simd]->run_mesh(allow_spilling))
         brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
      else
         simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
   }

   int selected_simd = brw_simd_select(simd_state);
   if (selected_simd < 0) {
      params->base.error_str =
         ralloc_asprintf(params->base.mem_ctx,
                         "Can't compile shader: "
                         "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
                         simd_state.error[0], simd_state.error[1],
                         simd_state.error[2]);
      return NULL;
   }

   fs_visitor *selected = v[selected_simd].get();
   prog_data->base.prog_mask = 1 << selected_simd;

   if (unlikely(debug_enabled)) {
      if (params->tue_map) {
         fprintf(stderr, "Mesh Input ");
         brw_print_tue_map(stderr, params->tue_map);
      }
      fprintf(stderr, "Mesh Output ");
      brw_print_mue_map(stderr, &prog_data->map, nir);
   }

   fs_generator g(compiler, &params->base, &prog_data->base.base,
                  MESA_SHADER_MESH);
   if (unlikely(debug_enabled)) {
      g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
                                     "%s mesh shader %s",
                                     nir->info.label ? nir->info.label
                                                     : "unnamed",
                                     nir->info.name));
   }

   g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
                   selected->performance_analysis.require(), params->base.stats);
   g.add_const_data(nir->constant_data, nir->constant_data_size);
   return g.get_assembly();
}