1 /*
2  * Copyright © 2021 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_compiler.h"
25 #include "brw_fs.h"
26 #include "brw_nir.h"
27 #include "brw_private.h"
28 #include "compiler/nir/nir_builder.h"
29 #include "dev/intel_debug.h"
30 
31 using namespace brw;
32 
33 static bool
34 brw_nir_lower_load_uniforms_filter(const nir_instr *instr,
35                                    UNUSED const void *data)
36 {
37    if (instr->type != nir_instr_type_intrinsic)
38       return false;
39    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
40    return intrin->intrinsic == nir_intrinsic_load_uniform;
41 }
42 
43 static nir_ssa_def *
44 brw_nir_lower_load_uniforms_impl(nir_builder *b, nir_instr *instr,
45                                  UNUSED void *data)
46 {
47    assert(instr->type == nir_instr_type_intrinsic);
48    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
49    assert(intrin->intrinsic == nir_intrinsic_load_uniform);
50 
51    /* Read the first few 32-bit scalars from InlineData. */
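   /* Editor's summary of the code below: a dword-aligned, constant-offset,
    * 32-bit scalar load that lands within the first
    * BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW dwords is served straight from the
    * inline data registers; any other load falls through to
    * brw_nir_load_global_const() using the 64-bit address held at the start
    * of the inline data.
    */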
52    if (nir_src_is_const(intrin->src[0]) &&
53        nir_dest_bit_size(intrin->dest) == 32 &&
54        nir_dest_num_components(intrin->dest) == 1) {
55       unsigned off = nir_intrinsic_base(intrin) + nir_src_as_uint(intrin->src[0]);
56       unsigned off_dw = off / 4;
57       if (off % 4 == 0 && off_dw < BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW) {
58          off_dw += BRW_TASK_MESH_PUSH_CONSTANTS_START_DW;
59          return nir_load_mesh_inline_data_intel(b, 32, off_dw);
60       }
61    }
62 
63    return brw_nir_load_global_const(b, intrin,
64                                     nir_load_mesh_inline_data_intel(b, 64, 0), 0);
65 }
66 
67 static bool
68 brw_nir_lower_load_uniforms(nir_shader *nir)
69 {
70    return nir_shader_lower_instructions(nir, brw_nir_lower_load_uniforms_filter,
71                                         brw_nir_lower_load_uniforms_impl, NULL);
72 }
73 
74 static inline int
75 type_size_scalar_dwords(const struct glsl_type *type, bool bindless)
76 {
77    return glsl_count_dword_slots(type, bindless);
78 }
79 
80 /* TODO(mesh): Make this a common function. */
81 static void
82 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
83 {
84    assert(glsl_type_is_vector_or_scalar(type));
85 
86    uint32_t comp_size = glsl_type_is_boolean(type)
87       ? 4 : glsl_get_bit_size(type) / 8;
88    unsigned length = glsl_get_vector_elements(type);
89    *size = comp_size * length,
90    *align = comp_size * (length == 3 ? 4 : length);
91 }
92 
93 static void
94 brw_nir_lower_tue_outputs(nir_shader *nir, brw_tue_map *map)
95 {
96    memset(map, 0, sizeof(*map));
97 
98    /* TUE header contains 4 words:
99     *
100     * - Word 0 for Task Count.
101     *
102     * - Words 1-3 are used for the "Dispatch Dimensions" feature, to allow mapping
103     *   a 3D dispatch onto the 1D dispatch supported by HW.  Currently not used.
104     */
105    nir_foreach_shader_out_variable(var, nir) {
106       assert(var->data.location == VARYING_SLOT_TASK_COUNT);
107       var->data.driver_location = 0;
108    }
109 
110    NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
111             type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);
112 
113    /* From bspec: "It is suggested that SW reserve the 16 bytes following the
114     * TUE Header, and therefore start the SW-defined data structure at 32B
115     * alignment.  This allows the TUE Header to always be written as 32 bytes
116     * with 32B alignment, the most optimal write performance case."
117     */
118    map->per_task_data_start_dw = 8;
119 
120    /* Lowering to explicit types will start offsets from task_payload_size, so
121     * set it to start after the header.
122     */
123    nir->info.task_payload_size = map->per_task_data_start_dw * 4;
124    NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
125             nir_var_mem_task_payload, shared_type_info);
126    NIR_PASS(_, nir, nir_lower_explicit_io,
127             nir_var_mem_task_payload, nir_address_format_32bit_offset);
128 
129    map->size_dw = ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8);
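   /* Worked example (hypothetical shader): 20 bytes of per-task data give
    * task_payload_size = 32 + 20 = 52 bytes, so
    * size_dw = ALIGN(DIV_ROUND_UP(52, 4), 8) = ALIGN(13, 8) = 16 dwords.
    */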
130 }
131 
132 static void
133 brw_print_tue_map(FILE *fp, const struct brw_tue_map *map)
134 {
135    fprintf(fp, "TUE (%d dwords)\n\n", map->size_dw);
136 }
137 
138 static bool
139 brw_nir_adjust_task_payload_offsets_instr(struct nir_builder *b,
140                                           nir_instr *instr, void *data)
141 {
142    if (instr->type != nir_instr_type_intrinsic)
143       return false;
144 
145    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
146    switch (intrin->intrinsic) {
147    case nir_intrinsic_store_task_payload:
148    case nir_intrinsic_load_task_payload: {
149       nir_src *offset_src = nir_get_io_offset_src(intrin);
150 
151       if (nir_src_is_const(*offset_src))
152          assert(nir_src_as_uint(*offset_src) % 4 == 0);
153 
154       b->cursor = nir_before_instr(&intrin->instr);
155 
156       /* Regular I/O uses dwords while explicit I/O used for task payload uses
157        * bytes.  Normalize it to dwords.
158        *
159        * TODO(mesh): Figure out how to handle 8-bit, 16-bit.
160        */
161 
162       assert(offset_src->is_ssa);
163       nir_ssa_def *offset = nir_ishr_imm(b, offset_src->ssa, 2);
164       nir_instr_rewrite_src(&intrin->instr, offset_src, nir_src_for_ssa(offset));
165 
166       return true;
167    }
168 
169    default:
170       return false;
171    }
172 }
173 
174 static bool
175 brw_nir_adjust_task_payload_offsets(nir_shader *nir)
176 {
177    return nir_shader_instructions_pass(nir,
178                                        brw_nir_adjust_task_payload_offsets_instr,
179                                        nir_metadata_block_index |
180                                        nir_metadata_dominance,
181                                        NULL);
182 }
183 
184 static void
185 brw_nir_adjust_payload(nir_shader *shader, const struct brw_compiler *compiler)
186 {
187    /* Adjustment of task payload offsets must be performed *after* the last
188     * pass that interprets them as bytes, because the adjustment changes their unit.
189     */
190    bool adjusted = false;
191    NIR_PASS(adjusted, shader, brw_nir_adjust_task_payload_offsets);
192    if (adjusted) /* clean up the mess created by offset adjustments */
193       NIR_PASS(_, shader, nir_opt_constant_folding);
194 }
195 
196 const unsigned *
197 brw_compile_task(const struct brw_compiler *compiler,
198                  void *mem_ctx,
199                  struct brw_compile_task_params *params)
200 {
201    struct nir_shader *nir = params->nir;
202    const struct brw_task_prog_key *key = params->key;
203    struct brw_task_prog_data *prog_data = params->prog_data;
204    const bool debug_enabled = INTEL_DEBUG(DEBUG_TASK);
205 
206    prog_data->base.base.stage = MESA_SHADER_TASK;
207    prog_data->base.base.total_shared = nir->info.shared_size;
208    prog_data->base.base.total_scratch = 0;
209 
210    prog_data->base.local_size[0] = nir->info.workgroup_size[0];
211    prog_data->base.local_size[1] = nir->info.workgroup_size[1];
212    prog_data->base.local_size[2] = nir->info.workgroup_size[2];
213 
214    prog_data->uses_drawid =
215       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
216 
217    brw_nir_lower_tue_outputs(nir, &prog_data->map);
218 
219    const unsigned required_dispatch_width =
220       brw_required_dispatch_width(&nir->info);
221 
222    fs_visitor *v[3]     = {0};
223    const char *error[3] = {0};
224 
225    for (unsigned simd = 0; simd < 3; simd++) {
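      /* simd = 0, 1, 2 correspond to SIMD8, SIMD16 and SIMD32 dispatch (8 << simd). */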
226       if (!brw_simd_should_compile(mem_ctx, simd, compiler->devinfo, &prog_data->base,
227                                    required_dispatch_width, &error[simd]))
228          continue;
229 
230       const unsigned dispatch_width = 8 << simd;
231 
232       nir_shader *shader = nir_shader_clone(mem_ctx, nir);
233       brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true /* is_scalar */);
234 
235       NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
236       NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
237 
238       brw_postprocess_nir(shader, compiler, true /* is_scalar */, debug_enabled,
239                           key->base.robust_buffer_access);
240 
241       brw_nir_adjust_payload(shader, compiler);
242 
243       v[simd] = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
244                                &prog_data->base.base, shader, dispatch_width,
245                                debug_enabled);
246 
247       if (prog_data->base.prog_mask) {
248          unsigned first = ffs(prog_data->base.prog_mask) - 1;
249          v[simd]->import_uniforms(v[first]);
250       }
251 
252       const bool allow_spilling = !prog_data->base.prog_mask;
253 
254       if (v[simd]->run_task(allow_spilling))
255          brw_simd_mark_compiled(simd, &prog_data->base, v[simd]->spilled_any_registers);
256       else
257          error[simd] = ralloc_strdup(mem_ctx, v[simd]->fail_msg);
258    }
259 
260    int selected_simd = brw_simd_select(&prog_data->base);
261    if (selected_simd < 0) {
262       params->error_str = ralloc_asprintf(mem_ctx, "Can't compile shader: %s, %s and %s.\n",
263                                           error[0], error[1], error[2]);
264       return NULL;
265    }
266 
267    fs_visitor *selected = v[selected_simd];
268    prog_data->base.prog_mask = 1 << selected_simd;
269 
270    if (unlikely(debug_enabled)) {
271       fprintf(stderr, "Task Output ");
272       brw_print_tue_map(stderr, &prog_data->map);
273    }
274 
275    fs_generator g(compiler, params->log_data, mem_ctx,
276                   &prog_data->base.base, false, MESA_SHADER_TASK);
277    if (unlikely(debug_enabled)) {
278       g.enable_debug(ralloc_asprintf(mem_ctx,
279                                      "%s task shader %s",
280                                      nir->info.label ? nir->info.label
281                                                      : "unnamed",
282                                      nir->info.name));
283    }
284 
285    g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
286                    selected->performance_analysis.require(), params->stats);
287 
288    delete v[0];
289    delete v[1];
290    delete v[2];
291 
292    return g.get_assembly();
293 }
294 
295 static void
296 brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map)
297 {
298    if (!map)
299       return;
300 
301    nir->info.task_payload_size = map->per_task_data_start_dw * 4;
302 
303    bool progress = false;
304 
305    NIR_PASS(progress, nir, nir_lower_vars_to_explicit_types,
306             nir_var_mem_task_payload, shared_type_info);
307 
308    if (progress) {
309       /* The types for Task Output and Mesh Input should match, so their sizes
310        * should also match.
311        */
312       assert(map->size_dw == ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8));
313    } else {
314       /* Mesh doesn't read any input, so to make that clear, set
315        * task_payload_size to zero instead of keeping an incomplete size that
316        * only covers the header.
317        */
318       nir->info.task_payload_size = 0;
319    }
320 
321    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_task_payload,
322             nir_address_format_32bit_offset);
323 }
324 
325 /* Mesh URB Entry consists of an initial section
326  *
327  *  - Primitive Count
328  *  - Primitive Indices (from 0 to Max-1)
329  *  - Padding to 32B if needed
330  *
331  * optionally followed by a section for per-primitive data,
332  * in which each primitive (from 0 to Max-1) gets
333  *
334  *  - Primitive Header (e.g. ViewportIndex)
335  *  - Primitive Custom Attributes
336  *
337  * then followed by a section for per-vertex data
338  *
339  *  - Vertex Header (e.g. Position)
340  *  - Vertex Custom Attributes
341  *
342  * Each per-element section has a pitch and a starting offset.  All the
343  * individual attribute offsets in start_dw are relative to the first entry
344  * of the section (i.e. where the Position of the first vertex, or the
345  * ViewportIndex of the first primitive, lives).  Attributes of the remaining
346  * elements are reached using the pitch.
347  */
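/* Worked example (hypothetical mesh shader): triangles (3 indices per
 * primitive), 16 max primitives, 16 max vertices, no per-primitive header or
 * attributes, and one generic per-vertex attribute besides Position:
 *
 *  - primitive list: 1 + 3 * 16 = 49 dwords -> per_primitive_start_dw = 56
 *  - per_primitive_pitch_dw = 0 -> per_vertex_start_dw = 56
 *  - per_vertex_header_size_dw = 8 (no clip/cull distances),
 *    per_vertex_data_size_dw = 4 -> per_vertex_pitch_dw = 16
 *  - size_dw = 56 + 16 * 16 = 312
 */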
348 static void
349 brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map)
350 {
351    memset(map, 0, sizeof(*map));
352 
353    for (int i = 0; i < VARYING_SLOT_MAX; i++)
354       map->start_dw[i] = -1;
355 
356    unsigned vertices_per_primitive =
357       num_mesh_vertices_per_primitive(nir->info.mesh.primitive_type);
358 
359    map->max_primitives = nir->info.mesh.max_primitives_out;
360    map->max_vertices = nir->info.mesh.max_vertices_out;
361 
362    uint64_t outputs_written = nir->info.outputs_written;
363 
364    /* Assign initial section. */
365    if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
366       map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
367       outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
368    }
369    if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
370       map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
371       outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
372    }
373 
374    /* One dword for the primitive count, then vertices_per_primitive extra dwords
375     * of indices per primitive. Note this should change when we implement other index types.
376     */
377    const unsigned primitive_list_size_dw = 1 + vertices_per_primitive * map->max_primitives;
378 
379    /* TODO(mesh): Multiview. */
380    map->per_primitive_header_size_dw =
381          (nir->info.outputs_written & (BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
382                                        BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE) |
383                                        BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
384                                        BITFIELD64_BIT(VARYING_SLOT_LAYER))) ? 8 : 0;
385 
386    map->per_primitive_start_dw = ALIGN(primitive_list_size_dw, 8);
387 
388    map->per_primitive_data_size_dw = 0;
389    u_foreach_bit64(location, outputs_written & nir->info.per_primitive_outputs) {
390       assert(map->start_dw[location] == -1);
391 
392       unsigned start;
393       switch (location) {
394       case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
395          start = map->per_primitive_start_dw + 0;
396          break;
397       case VARYING_SLOT_LAYER:
398          start = map->per_primitive_start_dw + 1; /* RTAIndex */
399          break;
400       case VARYING_SLOT_VIEWPORT:
401          start = map->per_primitive_start_dw + 2;
402          break;
403       case VARYING_SLOT_CULL_PRIMITIVE:
404          start = map->per_primitive_start_dw + 3;
405          break;
406       default:
407          assert(location == VARYING_SLOT_PRIMITIVE_ID ||
408                 location >= VARYING_SLOT_VAR0);
409          start = map->per_primitive_start_dw +
410                  map->per_primitive_header_size_dw +
411                  map->per_primitive_data_size_dw;
412          map->per_primitive_data_size_dw += 4;
413          break;
414       }
415 
416       map->start_dw[location] = start;
417    }
418 
419    map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
420                                        map->per_primitive_data_size_dw, 8);
421 
422    map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw +
423                                     map->per_primitive_pitch_dw * map->max_primitives, 8);
424 
425    /* TODO(mesh): Multiview. */
426    unsigned fixed_header_size = 8;
427    map->per_vertex_header_size_dw = ALIGN(fixed_header_size +
428                                           nir->info.clip_distance_array_size +
429                                           nir->info.cull_distance_array_size, 8);
430    map->per_vertex_data_size_dw = 0;
431    u_foreach_bit64(location, outputs_written & ~nir->info.per_primitive_outputs) {
432       assert(map->start_dw[location] == -1);
433 
434       unsigned start;
435       switch (location) {
436       case VARYING_SLOT_PSIZ:
437          start = map->per_vertex_start_dw + 3;
438          break;
439       case VARYING_SLOT_POS:
440          start = map->per_vertex_start_dw + 4;
441          break;
442       case VARYING_SLOT_CLIP_DIST0:
443          start = map->per_vertex_start_dw + fixed_header_size + 0;
444          break;
445       case VARYING_SLOT_CLIP_DIST1:
446          start = map->per_vertex_start_dw + fixed_header_size + 4;
447          break;
448       case VARYING_SLOT_CULL_DIST0:
449       case VARYING_SLOT_CULL_DIST1:
450          unreachable("cull distances should be lowered earlier");
451          break;
452       default:
453          assert(location >= VARYING_SLOT_VAR0);
454          start = map->per_vertex_start_dw +
455                  map->per_vertex_header_size_dw +
456                  map->per_vertex_data_size_dw;
457          map->per_vertex_data_size_dw += 4;
458          break;
459       }
460       map->start_dw[location] = start;
461    }
462 
463    map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw +
464                                     map->per_vertex_data_size_dw, 8);
465 
466    map->size_dw =
467       map->per_vertex_start_dw + map->per_vertex_pitch_dw * map->max_vertices;
468 
469    assert(map->size_dw % 8 == 0);
470 }
471 
472 static void
473 brw_print_mue_map(FILE *fp, const struct brw_mue_map *map)
474 {
475    fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n",
476            map->size_dw, map->max_primitives, map->max_vertices);
477    fprintf(fp, "  %4d: VARYING_SLOT_PRIMITIVE_COUNT\n",
478            map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT]);
479    fprintf(fp, "  %4d: VARYING_SLOT_PRIMITIVE_INDICES\n",
480            map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES]);
481 
482    fprintf(fp, "  ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n",
483            map->per_primitive_start_dw,
484            map->per_primitive_header_size_dw,
485            map->per_primitive_data_size_dw,
486            map->per_primitive_pitch_dw);
487 
488    for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
489       if (map->start_dw[i] < 0)
490          continue;
491       const unsigned offset = map->start_dw[i];
492       if (offset >= map->per_primitive_start_dw &&
493           offset < map->per_primitive_start_dw + map->per_primitive_pitch_dw) {
494          fprintf(fp, "  %4d: %s\n", offset,
495                  gl_varying_slot_name_for_stage((gl_varying_slot)i,
496                                                 MESA_SHADER_MESH));
497       }
498    }
499 
500    fprintf(fp, "  ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n",
501            map->per_vertex_start_dw,
502            map->per_vertex_header_size_dw,
503            map->per_vertex_data_size_dw,
504            map->per_vertex_pitch_dw);
505 
506    for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
507       if (map->start_dw[i] < 0)
508          continue;
509       const unsigned offset = map->start_dw[i];
510       if (offset >= map->per_vertex_start_dw &&
511           offset < map->per_vertex_start_dw + map->per_vertex_pitch_dw) {
512          fprintf(fp, "  %4d: %s\n", offset,
513                  gl_varying_slot_name_for_stage((gl_varying_slot)i,
514                                                 MESA_SHADER_MESH));
515       }
516    }
517 
518    fprintf(fp, "\n");
519 }
520 
521 static void
522 brw_nir_lower_mue_outputs(nir_shader *nir, const struct brw_mue_map *map)
523 {
524    nir_foreach_shader_out_variable(var, nir) {
525       int location = var->data.location;
526       assert(location >= 0);
527       assert(map->start_dw[location] != -1);
528       var->data.driver_location = map->start_dw[location];
529    }
530 
531    NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
532             type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);
533 }
534 
535 static void
536 brw_nir_initialize_mue(nir_shader *nir,
537                        const struct brw_mue_map *map,
538                        unsigned dispatch_width)
539 {
540    assert(map->per_primitive_header_size_dw > 0);
541 
542    nir_builder b;
543    nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
544    nir_builder_init(&b, entrypoint);
545    b.cursor = nir_before_block(nir_start_block(entrypoint));
546 
547    nir_ssa_def *dw_off = nir_imm_int(&b, 0);
548    nir_ssa_def *zerovec = nir_imm_vec4(&b, 0, 0, 0, 0);
549 
550    /* TODO(mesh): can we write in bigger batches, generating fewer SENDs? */
551 
552    assert(!nir->info.workgroup_size_variable);
553    const unsigned workgroup_size = nir->info.workgroup_size[0] *
554                                    nir->info.workgroup_size[1] *
555                                    nir->info.workgroup_size[2];
556 
557    /* Invocations from a single workgroup will cooperate in zeroing MUE. */
558 
559    /* How many primitives each invocation needs to cover without checking its index. */
560    unsigned prims_per_inv = map->max_primitives / workgroup_size;
561 
562    /* Zero first 4 dwords of MUE Primitive Header:
563     * Reserved, RTAIndex, ViewportIndex, CullPrimitiveMask.
564     */
565 
566    nir_ssa_def *local_invocation_index = nir_load_local_invocation_index(&b);
567 
568    /* Zero primitive headers spaced workgroup_size apart, starting from the
569     * invocation index.
570     */
571    for (unsigned prim_in_inv = 0; prim_in_inv < prims_per_inv; ++prim_in_inv) {
572       nir_ssa_def *prim = nir_iadd_imm(&b, local_invocation_index,
573                                            prim_in_inv * workgroup_size);
574 
575       nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
576                                      .base = (int)map->per_primitive_start_dw,
577                                      .write_mask = WRITEMASK_XYZW,
578                                      .component = 0,
579                                      .src_type = nir_type_uint32);
580    }
581 
582    /* How many prims are left? */
583    unsigned remaining = map->max_primitives % workgroup_size;
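   /* Example (hypothetical sizes): workgroup_size = 32 and max_primitives = 100
    * give prims_per_inv = 3 (each invocation zeroes headers i, i+32 and i+64)
    * and remaining = 4, handled by invocations 0-3 below.
    */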
584 
585    if (remaining) {
586       /* Zero "remaining" primitive headers starting from the last one covered
587        * by the loop above + workgroup_size.
588        */
589       nir_ssa_def *cmp = nir_ilt(&b, local_invocation_index,
590                                      nir_imm_int(&b, remaining));
591       nir_if *if_stmt = nir_push_if(&b, cmp);
592       {
593          nir_ssa_def *prim = nir_iadd_imm(&b, local_invocation_index,
594                                                prims_per_inv * workgroup_size);
595 
596          nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
597                                         .base = (int)map->per_primitive_start_dw,
598                                         .write_mask = WRITEMASK_XYZW,
599                                         .component = 0,
600                                         .src_type = nir_type_uint32);
601       }
602       nir_pop_if(&b, if_stmt);
603    }
604 
605    /* If there's more than one subgroup, then we need to wait for all of them
606     * to finish initialization before we can proceed. Otherwise some subgroups
607     * may start filling the MUE before others have finished initializing.
608     */
609    if (workgroup_size > dispatch_width) {
610       nir_scoped_barrier(&b, NIR_SCOPE_WORKGROUP, NIR_SCOPE_WORKGROUP,
611                          NIR_MEMORY_ACQ_REL, nir_var_shader_out);
612    }
613 
614    if (remaining) {
615       nir_metadata_preserve(entrypoint, nir_metadata_none);
616    } else {
617       nir_metadata_preserve(entrypoint, nir_metadata_block_index |
618                                         nir_metadata_dominance);
619    }
620 }
621 
622 static bool
623 brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder *b, nir_instr *instr, void *data)
624 {
625    if (instr->type != nir_instr_type_intrinsic)
626       return false;
627 
628    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
629 
630    const struct brw_mue_map *map = (const struct brw_mue_map *) data;
631 
632    /* Remap per_vertex and per_primitive offsets using the extra source and
633     * the pitch.
634     */
635    switch (intrin->intrinsic) {
636    case nir_intrinsic_load_per_vertex_output:
637    case nir_intrinsic_store_per_vertex_output: {
638       const bool is_load = intrin->intrinsic == nir_intrinsic_load_per_vertex_output;
639       nir_src *index_src = &intrin->src[is_load ? 0 : 1];
640       nir_src *offset_src = &intrin->src[is_load ? 1 : 2];
641 
642       assert(index_src->is_ssa);
643       b->cursor = nir_before_instr(&intrin->instr);
644       nir_ssa_def *offset =
645          nir_iadd(b,
646                   offset_src->ssa,
647                   nir_imul_imm(b, index_src->ssa, map->per_vertex_pitch_dw));
648       nir_instr_rewrite_src(&intrin->instr, offset_src, nir_src_for_ssa(offset));
649       return true;
650    }
651 
652    case nir_intrinsic_load_per_primitive_output:
653    case nir_intrinsic_store_per_primitive_output: {
654       const bool is_load = intrin->intrinsic == nir_intrinsic_load_per_primitive_output;
655       nir_src *index_src = &intrin->src[is_load ? 0 : 1];
656       nir_src *offset_src = &intrin->src[is_load ? 1 : 2];
657 
658       assert(index_src->is_ssa);
659       b->cursor = nir_before_instr(&intrin->instr);
660 
661       assert(index_src->is_ssa);
662       nir_ssa_def *offset =
663          nir_iadd(b,
664                   offset_src->ssa,
665                   nir_imul_imm(b, index_src->ssa, map->per_primitive_pitch_dw));
666       nir_instr_rewrite_src(&intrin->instr, offset_src, nir_src_for_ssa(offset));
667       return true;
668    }
669 
670    default:
671       return false;
672    }
673 }
674 
675 static bool
676 brw_nir_adjust_offset_for_arrayed_indices(nir_shader *nir, const struct brw_mue_map *map)
677 {
678    return nir_shader_instructions_pass(nir,
679                                        brw_nir_adjust_offset_for_arrayed_indices_instr,
680                                        nir_metadata_block_index |
681                                        nir_metadata_dominance,
682                                        (void *)map);
683 }
684 
685 const unsigned *
686 brw_compile_mesh(const struct brw_compiler *compiler,
687                  void *mem_ctx,
688                  struct brw_compile_mesh_params *params)
689 {
690    struct nir_shader *nir = params->nir;
691    const struct brw_mesh_prog_key *key = params->key;
692    struct brw_mesh_prog_data *prog_data = params->prog_data;
693    const bool debug_enabled = INTEL_DEBUG(DEBUG_MESH);
694 
695    prog_data->base.base.stage = MESA_SHADER_MESH;
696    prog_data->base.base.total_shared = nir->info.shared_size;
697    prog_data->base.base.total_scratch = 0;
698 
699    prog_data->base.local_size[0] = nir->info.workgroup_size[0];
700    prog_data->base.local_size[1] = nir->info.workgroup_size[1];
701    prog_data->base.local_size[2] = nir->info.workgroup_size[2];
702 
703    prog_data->clip_distance_mask = (1 << nir->info.clip_distance_array_size) - 1;
704    prog_data->cull_distance_mask =
705          ((1 << nir->info.cull_distance_array_size) - 1) <<
706           nir->info.clip_distance_array_size;
707    prog_data->primitive_type = nir->info.mesh.primitive_type;
708 
709    /* TODO(mesh): Use other index formats (that are more compact) for optimization. */
710    prog_data->index_format = BRW_INDEX_FORMAT_U32;
711 
712    prog_data->uses_drawid =
713       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
714 
715    brw_nir_lower_tue_inputs(nir, params->tue_map);
716 
717    brw_compute_mue_map(nir, &prog_data->map);
718    brw_nir_lower_mue_outputs(nir, &prog_data->map);
719 
720    const unsigned required_dispatch_width =
721       brw_required_dispatch_width(&nir->info);
722 
723    fs_visitor *v[3]     = {0};
724    const char *error[3] = {0};
725 
726    for (int simd = 0; simd < 3; simd++) {
727       if (!brw_simd_should_compile(mem_ctx, simd, compiler->devinfo, &prog_data->base,
728                                    required_dispatch_width, &error[simd]))
729          continue;
730 
731       const unsigned dispatch_width = 8 << simd;
732 
733       nir_shader *shader = nir_shader_clone(mem_ctx, nir);
734 
735       /*
736        * When the Primitive Header is enabled, the shader may not generate
737        * writes to all of its fields, so let's initialize everything.
738        */
739       if (prog_data->map.per_primitive_header_size_dw > 0)
740          NIR_PASS_V(shader, brw_nir_initialize_mue, &prog_data->map, dispatch_width);
741 
742       brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true /* is_scalar */);
743 
744       NIR_PASS(_, shader, brw_nir_adjust_offset_for_arrayed_indices, &prog_data->map);
745       /* The load_uniform lowering does a better job on constant offsets, so fold constants before it. */
746       NIR_PASS(_, shader, nir_opt_constant_folding);
747       NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
748 
749       NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
750 
751       brw_postprocess_nir(shader, compiler, true /* is_scalar */, debug_enabled,
752                           key->base.robust_buffer_access);
753 
754       brw_nir_adjust_payload(shader, compiler);
755 
756       v[simd] = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
757                                &prog_data->base.base, shader, dispatch_width,
758                                debug_enabled);
759 
760       if (prog_data->base.prog_mask) {
761          unsigned first = ffs(prog_data->base.prog_mask) - 1;
762          v[simd]->import_uniforms(v[first]);
763       }
764 
765       const bool allow_spilling = !prog_data->base.prog_mask;
766 
767       if (v[simd]->run_mesh(allow_spilling))
768          brw_simd_mark_compiled(simd, &prog_data->base, v[simd]->spilled_any_registers);
769       else
770          error[simd] = ralloc_strdup(mem_ctx, v[simd]->fail_msg);
771    }
772 
773    int selected_simd = brw_simd_select(&prog_data->base);
774    if (selected_simd < 0) {
775       params->error_str = ralloc_asprintf(mem_ctx, "Can't compile shader: %s, %s and %s.\n",
776                                           error[0], error[1], error[2]);
777       return NULL;
778    }
779 
780    fs_visitor *selected = v[selected_simd];
781    prog_data->base.prog_mask = 1 << selected_simd;
782 
783    if (unlikely(debug_enabled)) {
784       if (params->tue_map) {
785          fprintf(stderr, "Mesh Input ");
786          brw_print_tue_map(stderr, params->tue_map);
787       }
788       fprintf(stderr, "Mesh Output ");
789       brw_print_mue_map(stderr, &prog_data->map);
790    }
791 
792    fs_generator g(compiler, params->log_data, mem_ctx,
793                   &prog_data->base.base, false, MESA_SHADER_MESH);
794    if (unlikely(debug_enabled)) {
795       g.enable_debug(ralloc_asprintf(mem_ctx,
796                                      "%s mesh shader %s",
797                                      nir->info.label ? nir->info.label
798                                                      : "unnamed",
799                                      nir->info.name));
800    }
801 
802    g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
803                    selected->performance_analysis.require(), params->stats);
804 
805    delete v[0];
806    delete v[1];
807    delete v[2];
808 
809    return g.get_assembly();
810 }
811 
812 static fs_reg
813 get_mesh_urb_handle(const fs_builder &bld, nir_intrinsic_op op)
814 {
815    unsigned subreg;
816    if (bld.shader->stage == MESA_SHADER_TASK) {
817       subreg = 6;
818    } else {
819       assert(bld.shader->stage == MESA_SHADER_MESH);
820       subreg = op == nir_intrinsic_load_task_payload ? 7 : 6;
821    }
822 
823    fs_builder ubld8 = bld.group(8, 0).exec_all();
824 
825    fs_reg h = ubld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
826    ubld8.MOV(h, retype(brw_vec1_grf(0, subreg), BRW_REGISTER_TYPE_UD));
827    ubld8.AND(h, h, brw_imm_ud(0xFFFF));
828 
829    return h;
830 }
831 
832 static unsigned
833 component_from_intrinsic(nir_intrinsic_instr *instr)
834 {
835    if (nir_intrinsic_has_component(instr))
836       return nir_intrinsic_component(instr);
837    else
838       return 0;
839 }
840 
841 static void
842 adjust_handle_and_offset(const fs_builder &bld,
843                          fs_reg &urb_handle,
844                          unsigned &urb_global_offset)
845 {
846    /* Make sure that the URB global offset is below 2048 (2^11), because
847     * that's the maximum value that can be encoded in the Message Descriptor.
848     */
849    unsigned adjustment = (urb_global_offset >> 11) << 11;
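   /* e.g. a (hypothetical) urb_global_offset of 2100 folds adjustment = 2048
    * into the handle and leaves a descriptor offset of 52.
    */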
850 
851    if (adjustment) {
852       fs_builder ubld8 = bld.group(8, 0).exec_all();
853       ubld8.ADD(urb_handle, urb_handle, brw_imm_ud(adjustment));
854       urb_global_offset -= adjustment;
855    }
856 }
857 
858 static void
859 emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
860                        const fs_reg &src)
861 {
862    assert(nir_src_bit_size(instr->src[0]) == 32);
863 
864    nir_src *offset_nir_src = nir_get_io_offset_src(instr);
865    assert(nir_src_is_const(*offset_nir_src));
866 
867    fs_reg urb_handle = get_mesh_urb_handle(bld, instr->intrinsic);
868 
869    const unsigned comps = nir_src_num_components(instr->src[0]);
870    assert(comps <= 4);
871 
872    const unsigned mask = nir_intrinsic_write_mask(instr);
873    const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
874                                      nir_src_as_uint(*offset_nir_src) +
875                                      component_from_intrinsic(instr);
876 
877    /* URB writes are vec4 aligned but the intrinsic offsets are in dwords.
878     * With a max of 4 components, an intrinsic can require up to two writes.
879     *
880     * The first URB write will be shifted by comp_shift.  If there are
881     * components left over, a second write is dispatched.  In addition, the
882     * write mask is taken into account to decide whether each write is
883     * actually needed.
884     */
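   /* Example with hypothetical values: a vec4 store at offset_in_dwords == 6
    * with mask 0xF gives comp_shift = 2, first_comps = 2, second_comps = 2,
    * first_mask = 0xC and second_mask = 0x3, i.e. one write each to vec4
    * slots 1 and 2.
    */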
885    const unsigned comp_shift   = offset_in_dwords % 4;
886    const unsigned first_comps  = MIN2(comps, 4 - comp_shift);
887    const unsigned second_comps = comps - first_comps;
888    const unsigned first_mask   = (mask << comp_shift) & 0xF;
889    const unsigned second_mask  = (mask >> (4 - comp_shift)) & 0xF;
890 
891    unsigned urb_global_offset = offset_in_dwords / 4;
892    adjust_handle_and_offset(bld, urb_handle, urb_global_offset);
893 
894    if (first_mask > 0) {
895       for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
896          fs_builder bld8 = bld.group(8, q);
897 
898          fs_reg payload_srcs[4];
899          unsigned length = 0;
900 
901          for (unsigned i = 0; i < comp_shift; i++)
902             payload_srcs[length++] = reg_undef;
903 
904          for (unsigned c = 0; c < first_comps; c++)
905             payload_srcs[length++] = quarter(offset(src, bld, c), q);
906 
907          fs_reg srcs[URB_LOGICAL_NUM_SRCS];
908          srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
909          srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(first_mask << 16);
910          srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
911                                              BRW_REGISTER_TYPE_F);
912          bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
913 
914          fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
915                                    reg_undef, srcs, ARRAY_SIZE(srcs));
916          inst->mlen = 2 + length;
917          inst->offset = urb_global_offset;
918          assert(inst->offset < 2048);
919       }
920    }
921 
922    if (second_mask > 0) {
923       urb_global_offset++;
924       adjust_handle_and_offset(bld, urb_handle, urb_global_offset);
925 
926       for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
927          fs_builder bld8 = bld.group(8, q);
928 
929          fs_reg payload_srcs[4];
930          unsigned length = 0;
931 
932          for (unsigned c = 0; c < second_comps; c++)
933             payload_srcs[length++] = quarter(offset(src, bld, c + first_comps), q);
934 
935          fs_reg srcs[URB_LOGICAL_NUM_SRCS];
936          srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
937          srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(second_mask << 16);
938          srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
939                                              BRW_REGISTER_TYPE_F);
940          bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
941 
942          fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
943                                    reg_undef, srcs, ARRAY_SIZE(srcs));
944          inst->mlen = 2 + length;
945          inst->offset = urb_global_offset;
946          assert(inst->offset < 2048);
947       }
948    }
949 }
950 
951 static void
952 emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
953                          const fs_reg &src, const fs_reg &offset_src)
954 {
955    assert(nir_src_bit_size(instr->src[0]) == 32);
956 
957    const unsigned comps = nir_src_num_components(instr->src[0]);
958    assert(comps <= 4);
959 
960    fs_reg urb_handle = get_mesh_urb_handle(bld, instr->intrinsic);
961 
962    const unsigned base_in_dwords = nir_intrinsic_base(instr) +
963                                    component_from_intrinsic(instr);
964 
965    /* Use the URB write message that allows different offsets per slot.  The
966     * offsets are in units of vec4s (128 bits), so we emit one write for each
967     * component, replicating it in the sources and applying the appropriate
968     * channel mask based on the dword offset.
969     */
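   /* For a given per-channel dword offset "off" (e.g. a hypothetical value of
    * 9): the vec4 slot is off >> 2 = 2 and the channel within it is
    * off & 3 = 1, so the message channel mask becomes (1 << 1) << 16.
    */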
970 
971    for (unsigned c = 0; c < comps; c++) {
972       if (((1 << c) & nir_intrinsic_write_mask(instr)) == 0)
973          continue;
974 
975       fs_reg src_comp = offset(src, bld, c);
976 
977       for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
978          fs_builder bld8 = bld.group(8, q);
979 
980          fs_reg off = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
981          bld8.MOV(off, quarter(offset_src, q));
982          bld8.ADD(off, off, brw_imm_ud(c + base_in_dwords));
983 
984          fs_reg mask = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
985          bld8.AND(mask, off, brw_imm_ud(0x3));
986 
987          fs_reg one = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
988          bld8.MOV(one, brw_imm_ud(1));
989          bld8.SHL(mask, one, mask);
990          bld8.SHL(mask, mask, brw_imm_ud(16));
991 
992          bld8.SHR(off, off, brw_imm_ud(2));
993 
994          fs_reg payload_srcs[4];
995          unsigned length = 0;
996 
997          for (unsigned j = 0; j < 4; j++)
998             payload_srcs[length++] = quarter(src_comp, q);
999 
1000          fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1001          srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
1002          srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
1003          srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask;
1004          srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
1005                                              BRW_REGISTER_TYPE_F);
1006          bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
1007 
1008          fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
1009                                    reg_undef, srcs, ARRAY_SIZE(srcs));
1010          inst->mlen = 3 + length;
1011          inst->offset = 0;
1012       }
1013    }
1014 }
1015 
1016 static void
1017 emit_urb_direct_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
1018                       const fs_reg &dest)
1019 {
1020    assert(nir_dest_bit_size(instr->dest) == 32);
1021 
1022    unsigned comps = nir_dest_num_components(instr->dest);
1023    if (comps == 0)
1024       return;
1025 
1026    nir_src *offset_nir_src = nir_get_io_offset_src(instr);
1027    assert(nir_src_is_const(*offset_nir_src));
1028 
1029    fs_reg urb_handle = get_mesh_urb_handle(bld, instr->intrinsic);
1030 
1031    const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
1032                                      nir_src_as_uint(*offset_nir_src) +
1033                                      component_from_intrinsic(instr);
1034 
1035    unsigned urb_global_offset = offset_in_dwords / 4;
1036    adjust_handle_and_offset(bld, urb_handle, urb_global_offset);
1037 
1038    const unsigned comp_offset = offset_in_dwords % 4;
1039    const unsigned num_regs = comp_offset + comps;
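   /* e.g. a (hypothetical) offset_in_dwords = 5 with comps = 3 reads from vec4
    * slot 1 (comp_offset = 1, num_regs = 4) and copies dwords 1..3 of the
    * returned data into the destination below.
    */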
1040 
1041    fs_builder ubld8 = bld.group(8, 0).exec_all();
1042    fs_reg data = ubld8.vgrf(BRW_REGISTER_TYPE_UD, num_regs);
1043    fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1044    srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
1045 
1046    fs_inst *inst = ubld8.emit(SHADER_OPCODE_URB_READ_LOGICAL, data,
1047                               srcs, ARRAY_SIZE(srcs));
1048    inst->mlen = 1;
1049    inst->offset = urb_global_offset;
1050    assert(inst->offset < 2048);
1051    inst->size_written = num_regs * REG_SIZE;
1052 
1053    for (unsigned c = 0; c < comps; c++) {
1054       fs_reg dest_comp = offset(dest, bld, c);
1055       fs_reg data_comp = horiz_stride(offset(data, ubld8, comp_offset + c), 0);
1056       bld.MOV(retype(dest_comp, BRW_REGISTER_TYPE_UD), data_comp);
1057    }
1058 }
1059 
1060 static void
1061 emit_urb_indirect_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
1062                         const fs_reg &dest, const fs_reg &offset_src)
1063 {
1064    assert(nir_dest_bit_size(instr->dest) == 32);
1065 
1066    unsigned comps = nir_dest_num_components(instr->dest);
1067    if (comps == 0)
1068       return;
1069 
1070    fs_reg seq_ud;
1071    {
1072       fs_builder ubld8 = bld.group(8, 0).exec_all();
1073       seq_ud = ubld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
1074       fs_reg seq_uw = ubld8.vgrf(BRW_REGISTER_TYPE_UW, 1);
1075       ubld8.MOV(seq_uw, fs_reg(brw_imm_v(0x76543210)));
1076       ubld8.MOV(seq_ud, seq_uw);
1077       ubld8.SHL(seq_ud, seq_ud, brw_imm_ud(2));
1078    }
1079 
1080    fs_reg urb_handle = get_mesh_urb_handle(bld, instr->intrinsic);
1081 
1082    const unsigned base_in_dwords = nir_intrinsic_base(instr) +
1083                                    component_from_intrinsic(instr);
1084 
1085    for (unsigned c = 0; c < comps; c++) {
1086       for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
1087          fs_builder bld8 = bld.group(8, q);
1088 
1089          fs_reg off = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
1090          bld8.MOV(off, quarter(offset_src, q));
1091          bld8.ADD(off, off, brw_imm_ud(base_in_dwords + c));
1092 
1093          STATIC_ASSERT(IS_POT(REG_SIZE) && REG_SIZE > 1);
1094 
1095          fs_reg comp = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
1096          bld8.AND(comp, off, brw_imm_ud(0x3));
1097          bld8.SHL(comp, comp, brw_imm_ud(ffs(REG_SIZE) - 1));
1098          bld8.ADD(comp, comp, seq_ud);
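         /* comp now holds, per channel, a byte offset into the 4-GRF read
          * result: (dword within the vec4) * REG_SIZE plus this channel's own
          * slot from the 0, 4, ..., 28 sequence, consumed by MOV_INDIRECT below.
          */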
1099 
1100          bld8.SHR(off, off, brw_imm_ud(2));
1101 
1102          fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1103          srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
1104          srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
1105 
1106          fs_reg data = bld8.vgrf(BRW_REGISTER_TYPE_UD, 4);
1107 
1108          fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_READ_LOGICAL,
1109                                    data, srcs, ARRAY_SIZE(srcs));
1110          inst->mlen = 2;
1111          inst->offset = 0;
1112          inst->size_written = 4 * REG_SIZE;
1113 
1114          fs_reg dest_comp = offset(dest, bld, c);
1115          bld8.emit(SHADER_OPCODE_MOV_INDIRECT,
1116                    retype(quarter(dest_comp, q), BRW_REGISTER_TYPE_UD),
1117                    data,
1118                    comp,
1119                    brw_imm_ud(4));
1120       }
1121    }
1122 }
1123 
1124 void
1125 fs_visitor::emit_task_mesh_store(const fs_builder &bld, nir_intrinsic_instr *instr)
1126 {
1127    fs_reg src = get_nir_src(instr->src[0]);
1128    nir_src *offset_nir_src = nir_get_io_offset_src(instr);
1129 
1130    /* TODO(mesh): for per_vertex and per_primitive, if we could keep around
1131     * the non-array-index offset, we could use it to decide whether we can
1132     * perform either one or (at most) two writes instead of one per component.
1133     */
1134 
1135    if (nir_src_is_const(*offset_nir_src))
1136       emit_urb_direct_writes(bld, instr, src);
1137    else
1138       emit_urb_indirect_writes(bld, instr, src, get_nir_src(*offset_nir_src));
1139 }
1140 
1141 void
1142 fs_visitor::emit_task_mesh_load(const fs_builder &bld, nir_intrinsic_instr *instr)
1143 {
1144    fs_reg dest = get_nir_dest(instr->dest);
1145    nir_src *offset_nir_src = nir_get_io_offset_src(instr);
1146 
1147    /* TODO(mesh): for per_vertex and per_primitive, if we could keep around
1148     * the non-array-index offset, we could use it to decide whether we can
1149     * perform a single large aligned read instead of one per component.
1150     */
1151 
1152    if (nir_src_is_const(*offset_nir_src))
1153       emit_urb_direct_reads(bld, instr, dest);
1154    else
1155       emit_urb_indirect_reads(bld, instr, dest, get_nir_src(*offset_nir_src));
1156 }
1157 
1158 void
1159 fs_visitor::nir_emit_task_intrinsic(const fs_builder &bld,
1160                                     nir_intrinsic_instr *instr)
1161 {
1162    assert(stage == MESA_SHADER_TASK);
1163 
1164    switch (instr->intrinsic) {
1165    case nir_intrinsic_store_output:
1166    case nir_intrinsic_store_task_payload:
1167       emit_task_mesh_store(bld, instr);
1168       break;
1169 
1170    case nir_intrinsic_load_output:
1171    case nir_intrinsic_load_task_payload:
1172       emit_task_mesh_load(bld, instr);
1173       break;
1174 
1175    default:
1176       nir_emit_task_mesh_intrinsic(bld, instr);
1177       break;
1178    }
1179 }
1180 
1181 void
1182 fs_visitor::nir_emit_mesh_intrinsic(const fs_builder &bld,
1183                                     nir_intrinsic_instr *instr)
1184 {
1185    assert(stage == MESA_SHADER_MESH);
1186 
1187    switch (instr->intrinsic) {
1188    case nir_intrinsic_store_per_primitive_output:
1189    case nir_intrinsic_store_per_vertex_output:
1190    case nir_intrinsic_store_output:
1191       emit_task_mesh_store(bld, instr);
1192       break;
1193 
1194    case nir_intrinsic_load_per_vertex_output:
1195    case nir_intrinsic_load_per_primitive_output:
1196    case nir_intrinsic_load_output:
1197    case nir_intrinsic_load_task_payload:
1198       emit_task_mesh_load(bld, instr);
1199       break;
1200 
1201    default:
1202       nir_emit_task_mesh_intrinsic(bld, instr);
1203       break;
1204    }
1205 }
1206 
1207 void
1208 fs_visitor::nir_emit_task_mesh_intrinsic(const fs_builder &bld,
1209                                          nir_intrinsic_instr *instr)
1210 {
1211    assert(stage == MESA_SHADER_MESH || stage == MESA_SHADER_TASK);
1212 
1213    fs_reg dest;
1214    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
1215       dest = get_nir_dest(instr->dest);
1216 
1217    switch (instr->intrinsic) {
1218    case nir_intrinsic_load_mesh_inline_data_intel:
1219       assert(payload.num_regs == 3 || payload.num_regs == 4);
1220       /* Inline Parameter is the last element of the payload. */
1221       bld.MOV(dest, retype(brw_vec1_grf(payload.num_regs - 1,
1222                                         nir_intrinsic_align_offset(instr)),
1223                            dest.type));
1224       break;
1225 
1226    case nir_intrinsic_load_draw_id:
1227       /* DrawID comes from Extended Parameter 0 (XP0). */
1228       bld.MOV(dest, brw_vec1_grf(0, 3));
1229       break;
1230 
1231    case nir_intrinsic_load_local_invocation_index:
1232    case nir_intrinsic_load_local_invocation_id:
1233       /* Local_ID.X is given by the HW in the shader payload. */
1234       dest = retype(dest, BRW_REGISTER_TYPE_UD);
1235       bld.MOV(dest, retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW));
1236       /* Task/Mesh only use one dimension. */
1237       if (instr->intrinsic == nir_intrinsic_load_local_invocation_id) {
1238          bld.MOV(offset(dest, bld, 1), brw_imm_uw(0));
1239          bld.MOV(offset(dest, bld, 2), brw_imm_uw(0));
1240       }
1241       break;
1242 
1243    default:
1244       nir_emit_cs_intrinsic(bld, instr);
1245       break;
1246    }
1247 }
1248