1 /*
2 * Copyright © 2021 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_compiler.h"
25 #include "brw_fs.h"
26 #include "brw_nir.h"
27 #include "brw_private.h"
28 #include "compiler/nir/nir_builder.h"
29 #include "dev/intel_debug.h"
30
31 using namespace brw;
32
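/* Task/Mesh shaders have no dedicated push constant buffer: uniform data is
 * passed via InlineData, and anything beyond the first few dwords is fetched
 * through a global constant buffer whose 64-bit address is read from the
 * start of InlineData. The filter/impl pair below lowers
 * nir_intrinsic_load_uniform accordingly.
 */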
33 static bool
34 brw_nir_lower_load_uniforms_filter(const nir_instr *instr,
35 UNUSED const void *data)
36 {
37 if (instr->type != nir_instr_type_intrinsic)
38 return false;
39 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
40 return intrin->intrinsic == nir_intrinsic_load_uniform;
41 }
42
43 static nir_ssa_def *
44 brw_nir_lower_load_uniforms_impl(nir_builder *b, nir_instr *instr,
45 UNUSED void *data)
46 {
47 assert(instr->type == nir_instr_type_intrinsic);
48 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
49 assert(intrin->intrinsic == nir_intrinsic_load_uniform);
50
51 /* Read the first few 32-bit scalars from InlineData. */
52 if (nir_src_is_const(intrin->src[0]) &&
53 nir_dest_bit_size(intrin->dest) == 32 &&
54 nir_dest_num_components(intrin->dest) == 1) {
55 unsigned off = nir_intrinsic_base(intrin) + nir_src_as_uint(intrin->src[0]);
56 unsigned off_dw = off / 4;
57 if (off % 4 == 0 && off_dw < BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW) {
58 off_dw += BRW_TASK_MESH_PUSH_CONSTANTS_START_DW;
59 return nir_load_mesh_inline_data_intel(b, 32, off_dw);
60 }
61 }
62
63 return brw_nir_load_global_const(b, intrin,
64 nir_load_mesh_inline_data_intel(b, 64, 0), 0);
65 }
66
67 static bool
68 brw_nir_lower_load_uniforms(nir_shader *nir)
69 {
70 return nir_shader_lower_instructions(nir, brw_nir_lower_load_uniforms_filter,
71 brw_nir_lower_load_uniforms_impl, NULL);
72 }
73
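/* Task/Mesh URB layouts are computed in dwords, so outputs are sized in
 * 32-bit slots rather than vec4 slots.
 */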
74 static inline int
75 type_size_scalar_dwords(const struct glsl_type *type, bool bindless)
76 {
77 return glsl_count_dword_slots(type, bindless);
78 }
79
80 /* TODO(mesh): Make this a common function. */
81 static void
82 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
83 {
84 assert(glsl_type_is_vector_or_scalar(type));
85
86 uint32_t comp_size = glsl_type_is_boolean(type)
87 ? 4 : glsl_get_bit_size(type) / 8;
88 unsigned length = glsl_get_vector_elements(type);
89 *size = comp_size * length;
90 *align = comp_size * (length == 3 ? 4 : length);
91 }
92
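/* Lay out the Task URB Entry (TUE): a fixed 8-dword region (the 4-dword TUE
 * header plus 4 reserved dwords, see the bspec note below) followed by the
 * application's task payload, and lower the task shader outputs to dword
 * offsets within it.
 */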
93 static void
94 brw_nir_lower_tue_outputs(nir_shader *nir, brw_tue_map *map)
95 {
96 memset(map, 0, sizeof(*map));
97
98 /* TUE header contains 4 words:
99 *
100 * - Word 0 for Task Count.
101 *
102 * - Words 1-3 are used for the "Dispatch Dimensions" feature, to allow mapping a
103 * 3D dispatch into the 1D dispatch supported by HW. Currently not used.
104 */
105 nir_foreach_shader_out_variable(var, nir) {
106 assert(var->data.location == VARYING_SLOT_TASK_COUNT);
107 var->data.driver_location = 0;
108 }
109
110 NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
111 type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);
112
113 /* From bspec: "It is suggested that SW reserve the 16 bytes following the
114 * TUE Header, and therefore start the SW-defined data structure at 32B
115 * alignment. This allows the TUE Header to always be written as 32 bytes
116 * with 32B alignment, the most optimal write performance case."
117 */
118 map->per_task_data_start_dw = 8;
119
120 /* Lowering to explicit types will start offsets from task_payload_size, so
121 * set it to start after the header.
122 */
123 nir->info.task_payload_size = map->per_task_data_start_dw * 4;
124 NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
125 nir_var_mem_task_payload, shared_type_info);
126 NIR_PASS(_, nir, nir_lower_explicit_io,
127 nir_var_mem_task_payload, nir_address_format_32bit_offset);
128
129 map->size_dw = ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8);
130 }
131
132 static void
133 brw_print_tue_map(FILE *fp, const struct brw_tue_map *map)
134 {
135 fprintf(fp, "TUE (%d dwords)\n\n", map->size_dw);
136 }
137
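/* Task payload offsets produced by nir_lower_explicit_io are in bytes, while
 * the rest of the I/O handling here works in dwords, so divide the offset
 * sources of the task payload intrinsics by 4.
 */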
138 static bool
139 brw_nir_adjust_task_payload_offsets_instr(struct nir_builder *b,
140 nir_instr *instr, void *data)
141 {
142 if (instr->type != nir_instr_type_intrinsic)
143 return false;
144
145 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
146 switch (intrin->intrinsic) {
147 case nir_intrinsic_store_task_payload:
148 case nir_intrinsic_load_task_payload: {
149 nir_src *offset_src = nir_get_io_offset_src(intrin);
150
151 if (nir_src_is_const(*offset_src))
152 assert(nir_src_as_uint(*offset_src) % 4 == 0);
153
154 b->cursor = nir_before_instr(&intrin->instr);
155
156 /* Regular I/O uses dwords while explicit I/O used for task payload uses
157 * bytes. Normalize it to dwords.
158 *
159 * TODO(mesh): Figure out how to handle 8-bit, 16-bit.
160 */
161
162 assert(offset_src->is_ssa);
163 nir_ssa_def *offset = nir_ishr_imm(b, offset_src->ssa, 2);
164 nir_instr_rewrite_src(&intrin->instr, offset_src, nir_src_for_ssa(offset));
165
166 return true;
167 }
168
169 default:
170 return false;
171 }
172 }
173
174 static bool
175 brw_nir_adjust_task_payload_offsets(nir_shader *nir)
176 {
177 return nir_shader_instructions_pass(nir,
178 brw_nir_adjust_task_payload_offsets_instr,
179 nir_metadata_block_index |
180 nir_metadata_dominance,
181 NULL);
182 }
183
184 static void
185 brw_nir_adjust_payload(nir_shader *shader, const struct brw_compiler *compiler)
186 {
187 /* Adjustment of task payload offsets must be performed *after* the last pass
188 * which interprets them as bytes, because it changes their unit.
189 */
190 bool adjusted = false;
191 NIR_PASS(adjusted, shader, brw_nir_adjust_task_payload_offsets);
192 if (adjusted) /* clean up the mess created by offset adjustments */
193 NIR_PASS(_, shader, nir_opt_constant_folding);
194 }
195
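/* Compile the task shader once per candidate SIMD width (8, 16 and 32),
 * skipping widths that brw_simd_should_compile rejects, and let
 * brw_simd_select pick the variant that will actually be used.
 */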
196 const unsigned *
197 brw_compile_task(const struct brw_compiler *compiler,
198 void *mem_ctx,
199 struct brw_compile_task_params *params)
200 {
201 struct nir_shader *nir = params->nir;
202 const struct brw_task_prog_key *key = params->key;
203 struct brw_task_prog_data *prog_data = params->prog_data;
204 const bool debug_enabled = INTEL_DEBUG(DEBUG_TASK);
205
206 prog_data->base.base.stage = MESA_SHADER_TASK;
207 prog_data->base.base.total_shared = nir->info.shared_size;
208 prog_data->base.base.total_scratch = 0;
209
210 prog_data->base.local_size[0] = nir->info.workgroup_size[0];
211 prog_data->base.local_size[1] = nir->info.workgroup_size[1];
212 prog_data->base.local_size[2] = nir->info.workgroup_size[2];
213
214 prog_data->uses_drawid =
215 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
216
217 brw_nir_lower_tue_outputs(nir, &prog_data->map);
218
219 const unsigned required_dispatch_width =
220 brw_required_dispatch_width(&nir->info);
221
222 fs_visitor *v[3] = {0};
223 const char *error[3] = {0};
224
225 for (unsigned simd = 0; simd < 3; simd++) {
226 if (!brw_simd_should_compile(mem_ctx, simd, compiler->devinfo, &prog_data->base,
227 required_dispatch_width, &error[simd]))
228 continue;
229
230 const unsigned dispatch_width = 8 << simd;
231
232 nir_shader *shader = nir_shader_clone(mem_ctx, nir);
233 brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true /* is_scalar */);
234
235 NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
236 NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
237
238 brw_postprocess_nir(shader, compiler, true /* is_scalar */, debug_enabled,
239 key->base.robust_buffer_access);
240
241 brw_nir_adjust_payload(shader, compiler);
242
243 v[simd] = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
244 &prog_data->base.base, shader, dispatch_width,
245 debug_enabled);
246
247 if (prog_data->base.prog_mask) {
248 unsigned first = ffs(prog_data->base.prog_mask) - 1;
249 v[simd]->import_uniforms(v[first]);
250 }
251
252 const bool allow_spilling = !prog_data->base.prog_mask;
253
254 if (v[simd]->run_task(allow_spilling))
255 brw_simd_mark_compiled(simd, &prog_data->base, v[simd]->spilled_any_registers);
256 else
257 error[simd] = ralloc_strdup(mem_ctx, v[simd]->fail_msg);
258 }
259
260 int selected_simd = brw_simd_select(&prog_data->base);
261 if (selected_simd < 0) {
262 params->error_str = ralloc_asprintf(mem_ctx, "Can't compile shader: %s, %s and %s.\n",
263 error[0], error[1], error[2]);
264 return NULL;
265 }
266
267 fs_visitor *selected = v[selected_simd];
268 prog_data->base.prog_mask = 1 << selected_simd;
269
270 if (unlikely(debug_enabled)) {
271 fprintf(stderr, "Task Output ");
272 brw_print_tue_map(stderr, &prog_data->map);
273 }
274
275 fs_generator g(compiler, params->log_data, mem_ctx,
276 &prog_data->base.base, false, MESA_SHADER_TASK);
277 if (unlikely(debug_enabled)) {
278 g.enable_debug(ralloc_asprintf(mem_ctx,
279 "%s task shader %s",
280 nir->info.label ? nir->info.label
281 : "unnamed",
282 nir->info.name));
283 }
284
285 g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
286 selected->performance_analysis.require(), params->stats);
287
288 delete v[0];
289 delete v[1];
290 delete v[2];
291
292 return g.get_assembly();
293 }
294
295 static void
296 brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map)
297 {
298 if (!map)
299 return;
300
301 nir->info.task_payload_size = map->per_task_data_start_dw * 4;
302
303 bool progress = false;
304
305 NIR_PASS(progress, nir, nir_lower_vars_to_explicit_types,
306 nir_var_mem_task_payload, shared_type_info);
307
308 if (progress) {
309 /* The types for Task Output and Mesh Input should match, so their sizes
310 * should also match.
311 */
312 assert(map->size_dw == ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8));
313 } else {
314 /* Mesh doesn't read any input, so to make it clearer set the
315 * task_payload_size to zero instead of keeping an incomplete size that
316 * just includes the header.
317 */
318 nir->info.task_payload_size = 0;
319 }
320
321 NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_task_payload,
322 nir_address_format_32bit_offset);
323 }
324
325 /* Mesh URB Entry consists of an initial section
326 *
327 * - Primitive Count
328 * - Primitive Indices (from 0 to Max-1)
329 * - Padding to 32B if needed
330 *
331 * optionally followed by a section for per-primitive data,
332 * in which each primitive (from 0 to Max-1) gets
333 *
334 * - Primitive Header (e.g. ViewportIndex)
335 * - Primitive Custom Attributes
336 *
337 * then followed by a section for per-vertex data
338 *
339 * - Vertex Header (e.g. Position)
340 * - Vertex Custom Attributes
341 *
342 * Each per-element section has a pitch and a starting offset. All the
343 * individual attribute offsets in start_dw are relative to the first entry
344 * of the section (i.e. where the Position of the first vertex, or the
345 * ViewportIndex of the first primitive, lives). Attributes for other
346 * elements are calculated using the pitch.
347 */
348 static void
349 brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map)
350 {
351 memset(map, 0, sizeof(*map));
352
353 for (int i = 0; i < VARYING_SLOT_MAX; i++)
354 map->start_dw[i] = -1;
355
356 unsigned vertices_per_primitive =
357 num_mesh_vertices_per_primitive(nir->info.mesh.primitive_type);
358
359 map->max_primitives = nir->info.mesh.max_primitives_out;
360 map->max_vertices = nir->info.mesh.max_vertices_out;
361
362 uint64_t outputs_written = nir->info.outputs_written;
363
364 /* Assign initial section. */
365 if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) {
366 map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0;
367 outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT);
368 }
369 if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) {
370 map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1;
371 outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
372 }
373
374 /* One dword for the primitive count, then vertices_per_primitive extra dwords
375 * of indices for each primitive. Note this should change when we implement other index types.
376 */
377 const unsigned primitive_list_size_dw = 1 + vertices_per_primitive * map->max_primitives;
378
379 /* TODO(mesh): Multiview. */
380 map->per_primitive_header_size_dw =
381 (nir->info.outputs_written & (BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) |
382 BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE) |
383 BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) |
384 BITFIELD64_BIT(VARYING_SLOT_LAYER))) ? 8 : 0;
385
386 map->per_primitive_start_dw = ALIGN(primitive_list_size_dw, 8);
387
388 map->per_primitive_data_size_dw = 0;
389 u_foreach_bit64(location, outputs_written & nir->info.per_primitive_outputs) {
390 assert(map->start_dw[location] == -1);
391
392 unsigned start;
393 switch (location) {
394 case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
395 start = map->per_primitive_start_dw + 0;
396 break;
397 case VARYING_SLOT_LAYER:
398 start = map->per_primitive_start_dw + 1; /* RTAIndex */
399 break;
400 case VARYING_SLOT_VIEWPORT:
401 start = map->per_primitive_start_dw + 2;
402 break;
403 case VARYING_SLOT_CULL_PRIMITIVE:
404 start = map->per_primitive_start_dw + 3;
405 break;
406 default:
407 assert(location == VARYING_SLOT_PRIMITIVE_ID ||
408 location >= VARYING_SLOT_VAR0);
409 start = map->per_primitive_start_dw +
410 map->per_primitive_header_size_dw +
411 map->per_primitive_data_size_dw;
412 map->per_primitive_data_size_dw += 4;
413 break;
414 }
415
416 map->start_dw[location] = start;
417 }
418
419 map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw +
420 map->per_primitive_data_size_dw, 8);
421
422 map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw +
423 map->per_primitive_pitch_dw * map->max_primitives, 8);
424
425 /* TODO(mesh): Multiview. */
426 unsigned fixed_header_size = 8;
427 map->per_vertex_header_size_dw = ALIGN(fixed_header_size +
428 nir->info.clip_distance_array_size +
429 nir->info.cull_distance_array_size, 8);
430 map->per_vertex_data_size_dw = 0;
431 u_foreach_bit64(location, outputs_written & ~nir->info.per_primitive_outputs) {
432 assert(map->start_dw[location] == -1);
433
434 unsigned start;
435 switch (location) {
436 case VARYING_SLOT_PSIZ:
437 start = map->per_vertex_start_dw + 3;
438 break;
439 case VARYING_SLOT_POS:
440 start = map->per_vertex_start_dw + 4;
441 break;
442 case VARYING_SLOT_CLIP_DIST0:
443 start = map->per_vertex_start_dw + fixed_header_size + 0;
444 break;
445 case VARYING_SLOT_CLIP_DIST1:
446 start = map->per_vertex_start_dw + fixed_header_size + 4;
447 break;
448 case VARYING_SLOT_CULL_DIST0:
449 case VARYING_SLOT_CULL_DIST1:
450 unreachable("cull distances should be lowered earlier");
451 break;
452 default:
453 assert(location >= VARYING_SLOT_VAR0);
454 start = map->per_vertex_start_dw +
455 map->per_vertex_header_size_dw +
456 map->per_vertex_data_size_dw;
457 map->per_vertex_data_size_dw += 4;
458 break;
459 }
460 map->start_dw[location] = start;
461 }
462
463 map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw +
464 map->per_vertex_data_size_dw, 8);
465
466 map->size_dw =
467 map->per_vertex_start_dw + map->per_vertex_pitch_dw * map->max_vertices;
468
469 assert(map->size_dw % 8 == 0);
470 }
471
472 static void
473 brw_print_mue_map(FILE *fp, const struct brw_mue_map *map)
474 {
475 fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n",
476 map->size_dw, map->max_primitives, map->max_vertices);
477 fprintf(fp, " %4d: VARYING_SLOT_PRIMITIVE_COUNT\n",
478 map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT]);
479 fprintf(fp, " %4d: VARYING_SLOT_PRIMITIVE_INDICES\n",
480 map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES]);
481
482 fprintf(fp, " ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n",
483 map->per_primitive_start_dw,
484 map->per_primitive_header_size_dw,
485 map->per_primitive_data_size_dw,
486 map->per_primitive_pitch_dw);
487
488 for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
489 if (map->start_dw[i] < 0)
490 continue;
491 const unsigned offset = map->start_dw[i];
492 if (offset >= map->per_primitive_start_dw &&
493 offset < map->per_primitive_start_dw + map->per_primitive_pitch_dw) {
494 fprintf(fp, " %4d: %s\n", offset,
495 gl_varying_slot_name_for_stage((gl_varying_slot)i,
496 MESA_SHADER_MESH));
497 }
498 }
499
500 fprintf(fp, " ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n",
501 map->per_vertex_start_dw,
502 map->per_vertex_header_size_dw,
503 map->per_vertex_data_size_dw,
504 map->per_vertex_pitch_dw);
505
506 for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
507 if (map->start_dw[i] < 0)
508 continue;
509 const unsigned offset = map->start_dw[i];
510 if (offset >= map->per_vertex_start_dw &&
511 offset < map->per_vertex_start_dw + map->per_vertex_pitch_dw) {
512 fprintf(fp, " %4d: %s\n", offset,
513 gl_varying_slot_name_for_stage((gl_varying_slot)i,
514 MESA_SHADER_MESH));
515 }
516 }
517
518 fprintf(fp, "\n");
519 }
520
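/* Point each output variable at the slot assigned to it in the MUE map and
 * lower output I/O to dword offsets.
 */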
521 static void
522 brw_nir_lower_mue_outputs(nir_shader *nir, const struct brw_mue_map *map)
523 {
524 nir_foreach_shader_out_variable(var, nir) {
525 int location = var->data.location;
526 assert(location >= 0);
527 assert(map->start_dw[location] != -1);
528 var->data.driver_location = map->start_dw[location];
529 }
530
531 NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out,
532 type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32);
533 }
534
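/* When the MUE has a Primitive Header, the shader is not guaranteed to write
 * all of its fields, so prepend code that zeroes the first 4 dwords of every
 * primitive header, splitting the work across the invocations of the
 * workgroup and adding a barrier when more than one subgroup is involved.
 */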
535 static void
536 brw_nir_initialize_mue(nir_shader *nir,
537 const struct brw_mue_map *map,
538 unsigned dispatch_width)
539 {
540 assert(map->per_primitive_header_size_dw > 0);
541
542 nir_builder b;
543 nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
544 nir_builder_init(&b, entrypoint);
545 b.cursor = nir_before_block(nir_start_block(entrypoint));
546
547 nir_ssa_def *dw_off = nir_imm_int(&b, 0);
548 nir_ssa_def *zerovec = nir_imm_vec4(&b, 0, 0, 0, 0);
549
550 /* TODO(mesh): can we write in bigger batches, generating fewer SENDs? */
551
552 assert(!nir->info.workgroup_size_variable);
553 const unsigned workgroup_size = nir->info.workgroup_size[0] *
554 nir->info.workgroup_size[1] *
555 nir->info.workgroup_size[2];
556
557 /* Invocations from a single workgroup will cooperate in zeroing MUE. */
558
559 /* How many prims does each invocation need to cover without checking its index? */
560 unsigned prims_per_inv = map->max_primitives / workgroup_size;
561
562 /* Zero first 4 dwords of MUE Primitive Header:
563 * Reserved, RTAIndex, ViewportIndex, CullPrimitiveMask.
564 */
565
566 nir_ssa_def *local_invocation_index = nir_load_local_invocation_index(&b);
567
568 /* Zero primitive headers distanced by workgroup_size, starting from
569 * invocation index.
570 */
571 for (unsigned prim_in_inv = 0; prim_in_inv < prims_per_inv; ++prim_in_inv) {
572 nir_ssa_def *prim = nir_iadd_imm(&b, local_invocation_index,
573 prim_in_inv * workgroup_size);
574
575 nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
576 .base = (int)map->per_primitive_start_dw,
577 .write_mask = WRITEMASK_XYZW,
578 .component = 0,
579 .src_type = nir_type_uint32);
580 }
581
582 /* How many prims are left? */
583 unsigned remaining = map->max_primitives % workgroup_size;
584
585 if (remaining) {
586 /* Zero "remaining" primitive headers starting from the last one covered
587 * by the loop above + workgroup_size.
588 */
589 nir_ssa_def *cmp = nir_ilt(&b, local_invocation_index,
590 nir_imm_int(&b, remaining));
591 nir_if *if_stmt = nir_push_if(&b, cmp);
592 {
593 nir_ssa_def *prim = nir_iadd_imm(&b, local_invocation_index,
594 prims_per_inv * workgroup_size);
595
596 nir_store_per_primitive_output(&b, zerovec, prim, dw_off,
597 .base = (int)map->per_primitive_start_dw,
598 .write_mask = WRITEMASK_XYZW,
599 .component = 0,
600 .src_type = nir_type_uint32);
601 }
602 nir_pop_if(&b, if_stmt);
603 }
604
605 /* If there's more than one subgroup, then we need to wait for all of them
606 * to finish initialization before we can proceed. Otherwise some subgroups
607 * may start filling the MUE before others have finished initializing.
608 */
609 if (workgroup_size > dispatch_width) {
610 nir_scoped_barrier(&b, NIR_SCOPE_WORKGROUP, NIR_SCOPE_WORKGROUP,
611 NIR_MEMORY_ACQ_REL, nir_var_shader_out);
612 }
613
614 if (remaining) {
615 nir_metadata_preserve(entrypoint, nir_metadata_none);
616 } else {
617 nir_metadata_preserve(entrypoint, nir_metadata_block_index |
618 nir_metadata_dominance);
619 }
620 }
621
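/* Per-vertex and per-primitive intrinsics carry the vertex/primitive index
 * as a separate source; fold it into the offset source using the pitches
 * from the MUE map so later code only deals with plain dword offsets.
 */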
622 static bool
623 brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder *b, nir_instr *instr, void *data)
624 {
625 if (instr->type != nir_instr_type_intrinsic)
626 return false;
627
628 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
629
630 const struct brw_mue_map *map = (const struct brw_mue_map *) data;
631
632 /* Remap per_vertex and per_primitive offsets using the extra source and
633 * the pitch.
634 */
635 switch (intrin->intrinsic) {
636 case nir_intrinsic_load_per_vertex_output:
637 case nir_intrinsic_store_per_vertex_output: {
638 const bool is_load = intrin->intrinsic == nir_intrinsic_load_per_vertex_output;
639 nir_src *index_src = &intrin->src[is_load ? 0 : 1];
640 nir_src *offset_src = &intrin->src[is_load ? 1 : 2];
641
642 assert(index_src->is_ssa);
643 b->cursor = nir_before_instr(&intrin->instr);
644 nir_ssa_def *offset =
645 nir_iadd(b,
646 offset_src->ssa,
647 nir_imul_imm(b, index_src->ssa, map->per_vertex_pitch_dw));
648 nir_instr_rewrite_src(&intrin->instr, offset_src, nir_src_for_ssa(offset));
649 return true;
650 }
651
652 case nir_intrinsic_load_per_primitive_output:
653 case nir_intrinsic_store_per_primitive_output: {
654 const bool is_load = intrin->intrinsic == nir_intrinsic_load_per_primitive_output;
655 nir_src *index_src = &intrin->src[is_load ? 0 : 1];
656 nir_src *offset_src = &intrin->src[is_load ? 1 : 2];
657
658 assert(index_src->is_ssa);
659 b->cursor = nir_before_instr(&intrin->instr);
660
662 nir_ssa_def *offset =
663 nir_iadd(b,
664 offset_src->ssa,
665 nir_imul_imm(b, index_src->ssa, map->per_primitive_pitch_dw));
666 nir_instr_rewrite_src(&intrin->instr, offset_src, nir_src_for_ssa(offset));
667 return true;
668 }
669
670 default:
671 return false;
672 }
673 }
674
675 static bool
676 brw_nir_adjust_offset_for_arrayed_indices(nir_shader *nir, const struct brw_mue_map *map)
677 {
678 return nir_shader_instructions_pass(nir,
679 brw_nir_adjust_offset_for_arrayed_indices_instr,
680 nir_metadata_block_index |
681 nir_metadata_dominance,
682 (void *)map);
683 }
684
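/* Mesh compilation mirrors brw_compile_task: lower TUE inputs and MUE
 * outputs, optionally initialize the primitive headers, then compile per
 * SIMD width and select the best variant.
 */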
685 const unsigned *
686 brw_compile_mesh(const struct brw_compiler *compiler,
687 void *mem_ctx,
688 struct brw_compile_mesh_params *params)
689 {
690 struct nir_shader *nir = params->nir;
691 const struct brw_mesh_prog_key *key = params->key;
692 struct brw_mesh_prog_data *prog_data = params->prog_data;
693 const bool debug_enabled = INTEL_DEBUG(DEBUG_MESH);
694
695 prog_data->base.base.stage = MESA_SHADER_MESH;
696 prog_data->base.base.total_shared = nir->info.shared_size;
697 prog_data->base.base.total_scratch = 0;
698
699 prog_data->base.local_size[0] = nir->info.workgroup_size[0];
700 prog_data->base.local_size[1] = nir->info.workgroup_size[1];
701 prog_data->base.local_size[2] = nir->info.workgroup_size[2];
702
703 prog_data->clip_distance_mask = (1 << nir->info.clip_distance_array_size) - 1;
704 prog_data->cull_distance_mask =
705 ((1 << nir->info.cull_distance_array_size) - 1) <<
706 nir->info.clip_distance_array_size;
707 prog_data->primitive_type = nir->info.mesh.primitive_type;
708
709 /* TODO(mesh): Use other index formats (that are more compact) for optimization. */
710 prog_data->index_format = BRW_INDEX_FORMAT_U32;
711
712 prog_data->uses_drawid =
713 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
714
715 brw_nir_lower_tue_inputs(nir, params->tue_map);
716
717 brw_compute_mue_map(nir, &prog_data->map);
718 brw_nir_lower_mue_outputs(nir, &prog_data->map);
719
720 const unsigned required_dispatch_width =
721 brw_required_dispatch_width(&nir->info);
722
723 fs_visitor *v[3] = {0};
724 const char *error[3] = {0};
725
726 for (int simd = 0; simd < 3; simd++) {
727 if (!brw_simd_should_compile(mem_ctx, simd, compiler->devinfo, &prog_data->base,
728 required_dispatch_width, &error[simd]))
729 continue;
730
731 const unsigned dispatch_width = 8 << simd;
732
733 nir_shader *shader = nir_shader_clone(mem_ctx, nir);
734
735 /*
736 * When Primitive Header is enabled, we may not generate writes to all
737 * fields, so let's initialize everything.
738 */
739 if (prog_data->map.per_primitive_header_size_dw > 0)
740 NIR_PASS_V(shader, brw_nir_initialize_mue, &prog_data->map, dispatch_width);
741
742 brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true /* is_scalar */);
743
744 NIR_PASS(_, shader, brw_nir_adjust_offset_for_arrayed_indices, &prog_data->map);
745 /* Load uniforms can do a better job for constants, so fold before it. */
746 NIR_PASS(_, shader, nir_opt_constant_folding);
747 NIR_PASS(_, shader, brw_nir_lower_load_uniforms);
748
749 NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
750
751 brw_postprocess_nir(shader, compiler, true /* is_scalar */, debug_enabled,
752 key->base.robust_buffer_access);
753
754 brw_nir_adjust_payload(shader, compiler);
755
756 v[simd] = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
757 &prog_data->base.base, shader, dispatch_width,
758 debug_enabled);
759
760 if (prog_data->base.prog_mask) {
761 unsigned first = ffs(prog_data->base.prog_mask) - 1;
762 v[simd]->import_uniforms(v[first]);
763 }
764
765 const bool allow_spilling = !prog_data->base.prog_mask;
766
767 if (v[simd]->run_mesh(allow_spilling))
768 brw_simd_mark_compiled(simd, &prog_data->base, v[simd]->spilled_any_registers);
769 else
770 error[simd] = ralloc_strdup(mem_ctx, v[simd]->fail_msg);
771 }
772
773 int selected_simd = brw_simd_select(&prog_data->base);
774 if (selected_simd < 0) {
775 params->error_str = ralloc_asprintf(mem_ctx, "Can't compile shader: %s, %s and %s.\n",
776 error[0], error[1], error[2]);
777 return NULL;
778 }
779
780 fs_visitor *selected = v[selected_simd];
781 prog_data->base.prog_mask = 1 << selected_simd;
782
783 if (unlikely(debug_enabled)) {
784 if (params->tue_map) {
785 fprintf(stderr, "Mesh Input ");
786 brw_print_tue_map(stderr, params->tue_map);
787 }
788 fprintf(stderr, "Mesh Output ");
789 brw_print_mue_map(stderr, &prog_data->map);
790 }
791
792 fs_generator g(compiler, params->log_data, mem_ctx,
793 &prog_data->base.base, false, MESA_SHADER_MESH);
794 if (unlikely(debug_enabled)) {
795 g.enable_debug(ralloc_asprintf(mem_ctx,
796 "%s mesh shader %s",
797 nir->info.label ? nir->info.label
798 : "unnamed",
799 nir->info.name));
800 }
801
802 g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats,
803 selected->performance_analysis.require(), params->stats);
804
805 delete v[0];
806 delete v[1];
807 delete v[2];
808
809 return g.get_assembly();
810 }
811
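/* Fetch the URB handle from the thread payload in r0. Task shaders and mesh
 * output accesses use r0.6, while mesh reads of the task payload use r0.7;
 * only the low 16 bits contain the handle.
 */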
812 static fs_reg
813 get_mesh_urb_handle(const fs_builder &bld, nir_intrinsic_op op)
814 {
815 unsigned subreg;
816 if (bld.shader->stage == MESA_SHADER_TASK) {
817 subreg = 6;
818 } else {
819 assert(bld.shader->stage == MESA_SHADER_MESH);
820 subreg = op == nir_intrinsic_load_task_payload ? 7 : 6;
821 }
822
823 fs_builder ubld8 = bld.group(8, 0).exec_all();
824
825 fs_reg h = ubld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
826 ubld8.MOV(h, retype(brw_vec1_grf(0, subreg), BRW_REGISTER_TYPE_UD));
827 ubld8.AND(h, h, brw_imm_ud(0xFFFF));
828
829 return h;
830 }
831
832 static unsigned
833 component_from_intrinsic(nir_intrinsic_instr *instr)
834 {
835 if (nir_intrinsic_has_component(instr))
836 return nir_intrinsic_component(instr);
837 else
838 return 0;
839 }
840
841 static void
842 adjust_handle_and_offset(const fs_builder &bld,
843 fs_reg &urb_handle,
844 unsigned &urb_global_offset)
845 {
846 /* Make sure that URB global offset is below 2048 (2^11), because
847 * that's the maximum possible value encoded in Message Descriptor.
848 */
849 unsigned adjustment = (urb_global_offset >> 11) << 11;
850
851 if (adjustment) {
852 fs_builder ubld8 = bld.group(8, 0).exec_all();
853 ubld8.ADD(urb_handle, urb_handle, brw_imm_ud(adjustment));
854 urb_global_offset -= adjustment;
855 }
856 }
857
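/* Write path for stores whose offset is known at compile time: the data is
 * packed into at most two vec4-aligned URB writes.
 */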
858 static void
859 emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
860 const fs_reg &src)
861 {
862 assert(nir_src_bit_size(instr->src[0]) == 32);
863
864 nir_src *offset_nir_src = nir_get_io_offset_src(instr);
865 assert(nir_src_is_const(*offset_nir_src));
866
867 fs_reg urb_handle = get_mesh_urb_handle(bld, instr->intrinsic);
868
869 const unsigned comps = nir_src_num_components(instr->src[0]);
870 assert(comps <= 4);
871
872 const unsigned mask = nir_intrinsic_write_mask(instr);
873 const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
874 nir_src_as_uint(*offset_nir_src) +
875 component_from_intrinsic(instr);
876
877 /* URB writes are vec4 aligned but the intrinsic offsets are in dwords.
878 * With a max of 4 components, an intrinsic can require up to two writes.
879 *
880 * First URB write will be shifted by comp_shift. If there are other
881 * components left, then dispatch a second write. In addition to that,
882 * take the mask into account to decide whether each write is actually
883 * needed.
884 */
885 const unsigned comp_shift = offset_in_dwords % 4;
886 const unsigned first_comps = MIN2(comps, 4 - comp_shift);
887 const unsigned second_comps = comps - first_comps;
888 const unsigned first_mask = (mask << comp_shift) & 0xF;
889 const unsigned second_mask = (mask >> (4 - comp_shift)) & 0xF;
890
891 unsigned urb_global_offset = offset_in_dwords / 4;
892 adjust_handle_and_offset(bld, urb_handle, urb_global_offset);
893
894 if (first_mask > 0) {
895 for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
896 fs_builder bld8 = bld.group(8, q);
897
898 fs_reg payload_srcs[4];
899 unsigned length = 0;
900
901 for (unsigned i = 0; i < comp_shift; i++)
902 payload_srcs[length++] = reg_undef;
903
904 for (unsigned c = 0; c < first_comps; c++)
905 payload_srcs[length++] = quarter(offset(src, bld, c), q);
906
907 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
908 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
909 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(first_mask << 16);
910 srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
911 BRW_REGISTER_TYPE_F);
912 bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
913
914 fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
915 reg_undef, srcs, ARRAY_SIZE(srcs));
916 inst->mlen = 2 + length;
917 inst->offset = urb_global_offset;
918 assert(inst->offset < 2048);
919 }
920 }
921
922 if (second_mask > 0) {
923 urb_global_offset++;
924 adjust_handle_and_offset(bld, urb_handle, urb_global_offset);
925
926 for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
927 fs_builder bld8 = bld.group(8, q);
928
929 fs_reg payload_srcs[4];
930 unsigned length = 0;
931
932 for (unsigned c = 0; c < second_comps; c++)
933 payload_srcs[length++] = quarter(offset(src, bld, c + first_comps), q);
934
935 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
936 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
937 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(second_mask << 16);
938 srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
939 BRW_REGISTER_TYPE_F);
940 bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
941
942 fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
943 reg_undef, srcs, ARRAY_SIZE(srcs));
944 inst->mlen = 2 + length;
945 inst->offset = urb_global_offset;
946 assert(inst->offset < 2048);
947 }
948 }
949 }
950
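/* Write path for stores whose offset is only known at run time: emit one
 * per-slot-offset URB write per enabled component, deriving the channel
 * mask from the dword offset.
 */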
951 static void
952 emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
953 const fs_reg &src, const fs_reg &offset_src)
954 {
955 assert(nir_src_bit_size(instr->src[0]) == 32);
956
957 const unsigned comps = nir_src_num_components(instr->src[0]);
958 assert(comps <= 4);
959
960 fs_reg urb_handle = get_mesh_urb_handle(bld, instr->intrinsic);
961
962 const unsigned base_in_dwords = nir_intrinsic_base(instr) +
963 component_from_intrinsic(instr);
964
965 /* Use the URB write message that allows different offsets per slot. The offset
966 * is in units of vec4s (128 bits), so we use a write for each component,
967 * replicating it in the sources and applying the appropriate mask based on
968 * the dword offset.
969 */
970
971 for (unsigned c = 0; c < comps; c++) {
972 if (((1 << c) & nir_intrinsic_write_mask(instr)) == 0)
973 continue;
974
975 fs_reg src_comp = offset(src, bld, c);
976
977 for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
978 fs_builder bld8 = bld.group(8, q);
979
980 fs_reg off = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
981 bld8.MOV(off, quarter(offset_src, q));
982 bld8.ADD(off, off, brw_imm_ud(c + base_in_dwords));
983
984 fs_reg mask = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
985 bld8.AND(mask, off, brw_imm_ud(0x3));
986
987 fs_reg one = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
988 bld8.MOV(one, brw_imm_ud(1));
989 bld8.SHL(mask, one, mask);
990 bld8.SHL(mask, mask, brw_imm_ud(16));
991
992 bld8.SHR(off, off, brw_imm_ud(2));
993
994 fs_reg payload_srcs[4];
995 unsigned length = 0;
996
997 for (unsigned j = 0; j < 4; j++)
998 payload_srcs[length++] = quarter(src_comp, q);
999
1000 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1001 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
1002 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
1003 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask;
1004 srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
1005 BRW_REGISTER_TYPE_F);
1006 bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
1007
1008 fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
1009 reg_undef, srcs, ARRAY_SIZE(srcs));
1010 inst->mlen = 3 + length;
1011 inst->offset = 0;
1012 }
1013 }
1014 }
1015
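/* Read path for constant offsets: a single SIMD8 URB read fetches the
 * vec4-aligned range and each requested component is then broadcast to the
 * destination.
 */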
1016 static void
1017 emit_urb_direct_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
1018 const fs_reg &dest)
1019 {
1020 assert(nir_dest_bit_size(instr->dest) == 32);
1021
1022 unsigned comps = nir_dest_num_components(instr->dest);
1023 if (comps == 0)
1024 return;
1025
1026 nir_src *offset_nir_src = nir_get_io_offset_src(instr);
1027 assert(nir_src_is_const(*offset_nir_src));
1028
1029 fs_reg urb_handle = get_mesh_urb_handle(bld, instr->intrinsic);
1030
1031 const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
1032 nir_src_as_uint(*offset_nir_src) +
1033 component_from_intrinsic(instr);
1034
1035 unsigned urb_global_offset = offset_in_dwords / 4;
1036 adjust_handle_and_offset(bld, urb_handle, urb_global_offset);
1037
1038 const unsigned comp_offset = offset_in_dwords % 4;
1039 const unsigned num_regs = comp_offset + comps;
1040
1041 fs_builder ubld8 = bld.group(8, 0).exec_all();
1042 fs_reg data = ubld8.vgrf(BRW_REGISTER_TYPE_UD, num_regs);
1043 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1044 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
1045
1046 fs_inst *inst = ubld8.emit(SHADER_OPCODE_URB_READ_LOGICAL, data,
1047 srcs, ARRAY_SIZE(srcs));
1048 inst->mlen = 1;
1049 inst->offset = urb_global_offset;
1050 assert(inst->offset < 2048);
1051 inst->size_written = num_regs * REG_SIZE;
1052
1053 for (unsigned c = 0; c < comps; c++) {
1054 fs_reg dest_comp = offset(dest, bld, c);
1055 fs_reg data_comp = horiz_stride(offset(data, ubld8, comp_offset + c), 0);
1056 bld.MOV(retype(dest_comp, BRW_REGISTER_TYPE_UD), data_comp);
1057 }
1058 }
1059
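/* Read path for non-constant offsets: read a full vec4 per component using
 * per-slot offsets and use MOV_INDIRECT to pick the dword each channel
 * needs.
 */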
1060 static void
1061 emit_urb_indirect_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
1062 const fs_reg &dest, const fs_reg &offset_src)
1063 {
1064 assert(nir_dest_bit_size(instr->dest) == 32);
1065
1066 unsigned comps = nir_dest_num_components(instr->dest);
1067 if (comps == 0)
1068 return;
1069
1070 fs_reg seq_ud;
1071 {
1072 fs_builder ubld8 = bld.group(8, 0).exec_all();
1073 seq_ud = ubld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
1074 fs_reg seq_uw = ubld8.vgrf(BRW_REGISTER_TYPE_UW, 1);
1075 ubld8.MOV(seq_uw, fs_reg(brw_imm_v(0x76543210)));
1076 ubld8.MOV(seq_ud, seq_uw);
1077 ubld8.SHL(seq_ud, seq_ud, brw_imm_ud(2));
1078 }
1079
1080 fs_reg urb_handle = get_mesh_urb_handle(bld, instr->intrinsic);
1081
1082 const unsigned base_in_dwords = nir_intrinsic_base(instr) +
1083 component_from_intrinsic(instr);
1084
1085 for (unsigned c = 0; c < comps; c++) {
1086 for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
1087 fs_builder bld8 = bld.group(8, q);
1088
1089 fs_reg off = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
1090 bld8.MOV(off, quarter(offset_src, q));
1091 bld8.ADD(off, off, brw_imm_ud(base_in_dwords + c));
1092
1093 STATIC_ASSERT(IS_POT(REG_SIZE) && REG_SIZE > 1);
1094
1095 fs_reg comp = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
1096 bld8.AND(comp, off, brw_imm_ud(0x3));
1097 bld8.SHL(comp, comp, brw_imm_ud(ffs(REG_SIZE) - 1));
1098 bld8.ADD(comp, comp, seq_ud);
1099
1100 bld8.SHR(off, off, brw_imm_ud(2));
1101
1102 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1103 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
1104 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
1105
1106 fs_reg data = bld8.vgrf(BRW_REGISTER_TYPE_UD, 4);
1107
1108 fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_READ_LOGICAL,
1109 data, srcs, ARRAY_SIZE(srcs));
1110 inst->mlen = 2;
1111 inst->offset = 0;
1112 inst->size_written = 4 * REG_SIZE;
1113
1114 fs_reg dest_comp = offset(dest, bld, c);
1115 bld8.emit(SHADER_OPCODE_MOV_INDIRECT,
1116 retype(quarter(dest_comp, q), BRW_REGISTER_TYPE_UD),
1117 data,
1118 comp,
1119 brw_imm_ud(4));
1120 }
1121 }
1122 }
1123
1124 void
1125 fs_visitor::emit_task_mesh_store(const fs_builder &bld, nir_intrinsic_instr *instr)
1126 {
1127 fs_reg src = get_nir_src(instr->src[0]);
1128 nir_src *offset_nir_src = nir_get_io_offset_src(instr);
1129
1130 /* TODO(mesh): for per_vertex and per_primitive, if we could keep around
1131 * the non-array-index offset, we could use it to decide if we can perform
1132 * either one or (at most) two writes instead of one per component.
1133 */
1134
1135 if (nir_src_is_const(*offset_nir_src))
1136 emit_urb_direct_writes(bld, instr, src);
1137 else
1138 emit_urb_indirect_writes(bld, instr, src, get_nir_src(*offset_nir_src));
1139 }
1140
1141 void
1142 fs_visitor::emit_task_mesh_load(const fs_builder &bld, nir_intrinsic_instr *instr)
1143 {
1144 fs_reg dest = get_nir_dest(instr->dest);
1145 nir_src *offset_nir_src = nir_get_io_offset_src(instr);
1146
1147 /* TODO(mesh): for per_vertex and per_primitive, if we could keep around
1148 * the non-array-index offset, we could use it to decide if we can perform
1149 * a single large aligned read instead of one per component.
1150 */
1151
1152 if (nir_src_is_const(*offset_nir_src))
1153 emit_urb_direct_reads(bld, instr, dest);
1154 else
1155 emit_urb_indirect_reads(bld, instr, dest, get_nir_src(*offset_nir_src));
1156 }
1157
1158 void
1159 fs_visitor::nir_emit_task_intrinsic(const fs_builder &bld,
1160 nir_intrinsic_instr *instr)
1161 {
1162 assert(stage == MESA_SHADER_TASK);
1163
1164 switch (instr->intrinsic) {
1165 case nir_intrinsic_store_output:
1166 case nir_intrinsic_store_task_payload:
1167 emit_task_mesh_store(bld, instr);
1168 break;
1169
1170 case nir_intrinsic_load_output:
1171 case nir_intrinsic_load_task_payload:
1172 emit_task_mesh_load(bld, instr);
1173 break;
1174
1175 default:
1176 nir_emit_task_mesh_intrinsic(bld, instr);
1177 break;
1178 }
1179 }
1180
1181 void
1182 fs_visitor::nir_emit_mesh_intrinsic(const fs_builder &bld,
1183 nir_intrinsic_instr *instr)
1184 {
1185 assert(stage == MESA_SHADER_MESH);
1186
1187 switch (instr->intrinsic) {
1188 case nir_intrinsic_store_per_primitive_output:
1189 case nir_intrinsic_store_per_vertex_output:
1190 case nir_intrinsic_store_output:
1191 emit_task_mesh_store(bld, instr);
1192 break;
1193
1194 case nir_intrinsic_load_per_vertex_output:
1195 case nir_intrinsic_load_per_primitive_output:
1196 case nir_intrinsic_load_output:
1197 case nir_intrinsic_load_task_payload:
1198 emit_task_mesh_load(bld, instr);
1199 break;
1200
1201 default:
1202 nir_emit_task_mesh_intrinsic(bld, instr);
1203 break;
1204 }
1205 }
1206
1207 void
1208 fs_visitor::nir_emit_task_mesh_intrinsic(const fs_builder &bld,
1209 nir_intrinsic_instr *instr)
1210 {
1211 assert(stage == MESA_SHADER_MESH || stage == MESA_SHADER_TASK);
1212
1213 fs_reg dest;
1214 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
1215 dest = get_nir_dest(instr->dest);
1216
1217 switch (instr->intrinsic) {
1218 case nir_intrinsic_load_mesh_inline_data_intel:
1219 assert(payload.num_regs == 3 || payload.num_regs == 4);
1220 /* Inline Parameter is the last element of the payload. */
1221 bld.MOV(dest, retype(brw_vec1_grf(payload.num_regs - 1,
1222 nir_intrinsic_align_offset(instr)),
1223 dest.type));
1224 break;
1225
1226 case nir_intrinsic_load_draw_id:
1227 /* DrawID comes from Extended Parameter 0 (XP0). */
1228 bld.MOV(dest, brw_vec1_grf(0, 3));
1229 break;
1230
1231 case nir_intrinsic_load_local_invocation_index:
1232 case nir_intrinsic_load_local_invocation_id:
1233 /* Local_ID.X is given by the HW in the shader payload. */
1234 dest = retype(dest, BRW_REGISTER_TYPE_UD);
1235 bld.MOV(dest, retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW));
1236 /* Task/Mesh only use one dimension. */
1237 if (instr->intrinsic == nir_intrinsic_load_local_invocation_id) {
1238 bld.MOV(offset(dest, bld, 1), brw_imm_uw(0));
1239 bld.MOV(offset(dest, bld, 2), brw_imm_uw(0));
1240 }
1241 break;
1242
1243 default:
1244 nir_emit_cs_intrinsic(bld, instr);
1245 break;
1246 }
1247 }
1248