/*
 * Copyright © 2019 Google, Inc.
 * SPDX-License-Identifier: MIT
 */

#include "compiler/nir/nir_builder.h"
#include "ir3_compiler.h"
#include "ir3_nir.h"

struct state {
   uint32_t topology;

   struct primitive_map {
      /* +POSITION, +PSIZE, ... - see shader_io_get_unique_index */
      unsigned loc[13 + 32];
      unsigned stride;
   } map;

   nir_def *header;

   nir_variable *vertex_count_var;
   nir_variable *emitted_vertex_var;
   nir_variable *vertex_flags_out;

   struct exec_list old_outputs;
   struct exec_list new_outputs;
   struct exec_list emit_outputs;

   /* tess ctrl shader on a650 gets the local primitive id at different bits: */
   unsigned local_primitive_id_start;
};

static nir_def *
bitfield_extract(nir_builder *b, nir_def *v, uint32_t start, uint32_t mask)
{
   return nir_iand_imm(b, nir_ushr_imm(b, v, start), mask);
}

static nir_def *
build_invocation_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, 11, 31);
}

static nir_def *
build_vertex_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, 6, 31);
}

static nir_def *
build_local_primitive_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, state->local_primitive_id_start,
                           63);
}
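
/* Illustrative summary of the header layout implied by the extracts above
 * (derived from the masks/shifts in this file, not a hardware spec):
 *
 *    bits [6..10]   vertex id          (build_vertex_id)
 *    bits [11..15]  invocation id      (build_invocation_id)
 *    6 bits at local_primitive_id_start
 *                   local primitive id (build_local_primitive_id)
 *
 * local_primitive_id_start defaults to 0 and is moved to bit 16 for TCS when
 * tess_use_shared is set (see ir3_nir_lower_to_explicit_input below).
 * local_thread_id() separately extracts 10 bits starting at bit 16 of the
 * GS header.
 */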

static bool
is_tess_levels(gl_varying_slot slot)
{
   return (slot == VARYING_SLOT_PRIMITIVE_ID ||
           slot == VARYING_SLOT_TESS_LEVEL_OUTER ||
           slot == VARYING_SLOT_TESS_LEVEL_INNER);
}

/* Return a deterministic index for varyings. We can't rely on driver_location
 * to be correct without linking the different stages first, so we create
 * "primitive maps" where the producer decides on the location of each varying
 * slot and then exports a per-slot array to the consumer. This compacts the
 * gl_varying_slot space down a bit so that the primitive maps aren't too
 * large.
 *
 * Note: per-patch varyings are currently handled separately, without any
 * compacting.
 *
 * TODO: We could probably use the driver_location's directly in the non-SSO
 * (Vulkan) case.
 */
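
/* Worked example (illustrative only): a shader whose outputs are POS, PSIZ
 * and VAR0 gets unique indices 0, 1 and 13 from the switch below, so its
 * primitive map only needs entries [0], [1] and [13] filled in. The fixed
 * slots occupy indices 0..12 and VAR0..VAR31 follow at 13..44, matching the
 * loc[13 + 32] array in struct primitive_map.
 */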

static unsigned
shader_io_get_unique_index(gl_varying_slot slot)
{
   switch (slot) {
   case VARYING_SLOT_POS: return 0;
   case VARYING_SLOT_PSIZ: return 1;
   case VARYING_SLOT_COL0: return 2;
   case VARYING_SLOT_COL1: return 3;
   case VARYING_SLOT_BFC0: return 4;
   case VARYING_SLOT_BFC1: return 5;
   case VARYING_SLOT_FOGC: return 6;
   case VARYING_SLOT_CLIP_DIST0: return 7;
   case VARYING_SLOT_CLIP_DIST1: return 8;
   case VARYING_SLOT_CLIP_VERTEX: return 9;
   case VARYING_SLOT_LAYER: return 10;
   case VARYING_SLOT_VIEWPORT: return 11;
   case VARYING_SLOT_PRIMITIVE_SHADING_RATE: return 12;
   case VARYING_SLOT_VAR0 ... VARYING_SLOT_VAR31: {
      struct state state = {};
      STATIC_ASSERT(ARRAY_SIZE(state.map.loc) - 1 ==
                    (13 + VARYING_SLOT_VAR31 - VARYING_SLOT_VAR0));
      struct ir3_shader_variant v = {};
      STATIC_ASSERT(ARRAY_SIZE(v.output_loc) - 1 ==
                    (13 + VARYING_SLOT_VAR31 - VARYING_SLOT_VAR0));
      return 13 + (slot - VARYING_SLOT_VAR0);
   }
   default:
      unreachable("illegal slot in get unique index\n");
   }
}

static nir_def *
build_local_offset(nir_builder *b, struct state *state, nir_def *vertex,
                   uint32_t location, uint32_t comp, nir_def *offset)
{
   nir_def *primitive_stride = nir_load_vs_primitive_stride_ir3(b);
   nir_def *primitive_offset =
      nir_imul24(b, build_local_primitive_id(b, state), primitive_stride);
   nir_def *attr_offset;
   nir_def *vertex_stride;
   unsigned index = shader_io_get_unique_index(location);

   switch (b->shader->info.stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      vertex_stride = nir_imm_int(b, state->map.stride * 4);
      attr_offset = nir_imm_int(b, state->map.loc[index] + 4 * comp);
      break;
   case MESA_SHADER_TESS_CTRL:
   case MESA_SHADER_GEOMETRY:
      vertex_stride = nir_load_vs_vertex_stride_ir3(b);
      attr_offset = nir_iadd_imm(b, nir_load_primitive_location_ir3(b, index),
                                 comp * 4);
      break;
   default:
      unreachable("bad shader stage");
   }

   nir_def *vertex_offset = nir_imul24(b, vertex, vertex_stride);

   return nir_iadd(
      b, nir_iadd(b, primitive_offset, vertex_offset),
      nir_iadd(b, attr_offset, nir_ishl_imm(b, offset, 4)));
}
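
/* Sketch of the computation above (illustrative; offsets are bytes of
 * LDLW/STLW space):
 *
 *    offset = local_primitive_id * primitive_stride
 *           + vertex * vertex_stride
 *           + attr_offset                  (per-slot location + component)
 *           + (indirect_offset << 4)       (indirect offset, vec4s -> bytes)
 */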

static nir_intrinsic_instr *
replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
                  nir_intrinsic_op op, nir_def *src0, nir_def *src1,
                  nir_def *src2)
{
   nir_intrinsic_instr *new_intr = nir_intrinsic_instr_create(b->shader, op);

   new_intr->src[0] = nir_src_for_ssa(src0);
   if (src1)
      new_intr->src[1] = nir_src_for_ssa(src1);
   if (src2)
      new_intr->src[2] = nir_src_for_ssa(src2);

   new_intr->num_components = intr->num_components;

   if (nir_intrinsic_infos[op].has_dest)
      nir_def_init(&new_intr->instr, &new_intr->def,
                   intr->num_components, intr->def.bit_size);

   nir_builder_instr_insert(b, &new_intr->instr);

   if (nir_intrinsic_infos[op].has_dest)
      nir_def_rewrite_uses(&intr->def, &new_intr->def);

   nir_instr_remove(&intr->instr);

   return new_intr;
}

static void
build_primitive_map(nir_shader *shader, struct primitive_map *map)
{
   /* All interfaces except the TCS <-> TES interface use ldlw, which takes
    * an offset in bytes, so each vec4 slot is 16 bytes. TCS <-> TES uses
    * ldg, which takes an offset in dwords, but each per-vertex slot has
    * space for every vertex, and there's space at the beginning for
    * per-patch varyings.
    */
   unsigned slot_size = 16, start = 0;
   if (shader->info.stage == MESA_SHADER_TESS_CTRL) {
      slot_size = shader->info.tess.tcs_vertices_out * 4;
      start = util_last_bit(shader->info.patch_outputs_written) * 4;
   }

   uint64_t mask = shader->info.outputs_written;
   unsigned loc = start;
   while (mask) {
      int location = u_bit_scan64(&mask);
      if (is_tess_levels(location))
         continue;

      unsigned index = shader_io_get_unique_index(location);
      map->loc[index] = loc;
      loc += slot_size;
   }

   map->stride = loc;
   /* Use units of dwords for the stride. */
   if (shader->info.stage != MESA_SHADER_TESS_CTRL)
      map->stride /= 4;
}
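
/* Example (illustrative): a VS writing POS and VAR0, with the default 16 byte
 * slot size, ends up with loc[0] = 0, loc[13] = 16 and a running total of 32
 * bytes, which the code above then converts to a stride of 8 dwords for the
 * non-TCS case.
 */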

/* For shader stages that receive a primitive map, calculate how big it should
 * be.
 */

static unsigned
calc_primitive_map_size(nir_shader *shader)
{
   uint64_t mask = shader->info.inputs_read;
   unsigned max_index = 0;
   while (mask) {
      int location = u_bit_scan64(&mask);

      if (is_tess_levels(location))
         continue;

      unsigned index = shader_io_get_unique_index(location);
      max_index = MAX2(max_index, index + 1);
   }

   return max_index;
}

static void
lower_block_to_explicit_output(nir_block *block, nir_builder *b,
                               struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* nir_lower_io_to_temporaries replaces all access to output
          * variables with temp variables and then emits a nir_copy_var at
          * the end of the shader. Thus, we should always get a full wrmask
          * here.
          */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         b->cursor = nir_instr_remove(&intr->instr);

         nir_def *vertex_id = build_vertex_id(b, state);
         nir_def *offset = build_local_offset(
            b, state, vertex_id, nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         nir_store_shared_ir3(b, intr->src[0].ssa, offset);
         break;
      }

      default:
         break;
      }
   }
}

static nir_def *
local_thread_id(nir_builder *b)
{
   return bitfield_extract(b, nir_load_gs_header_ir3(b), 16, 1023);
}

void
ir3_nir_lower_to_explicit_output(nir_shader *shader,
                                 struct ir3_shader_variant *v,
                                 unsigned topology)
{
   struct state state = {};

   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_at(nir_before_impl(impl));

   if (v->type == MESA_SHADER_VERTEX && topology != IR3_TESS_NONE)
      state.header = nir_load_tcs_header_ir3(&b);
   else
      state.header = nir_load_gs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_output(block, &b, &state);

   nir_metadata_preserve(impl, nir_metadata_control_flow);

   v->output_size = state.map.stride;
}

static void
lower_block_to_explicit_input(nir_block *block, nir_builder *b,
                              struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *offset = build_local_offset(
            b, state,
            intr->src[0].ssa, // this is typically gl_InvocationID
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL,
                           NULL);
         break;
      }

      case nir_intrinsic_load_invocation_id: {
         b->cursor = nir_before_instr(&intr->instr);

         nir_def *iid = build_invocation_id(b, state);
         nir_def_replace(&intr->def, iid);
         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_to_explicit_input(nir_shader *shader,
                                struct ir3_shader_variant *v)
{
   struct state state = {};

   /* when using stl/ldl (instead of stlw/ldlw) for linking VS and HS,
    * HS uses a different primitive id, which starts at bit 16 in the header
    */
   if (shader->info.stage == MESA_SHADER_TESS_CTRL &&
       v->compiler->tess_use_shared)
      state.local_primitive_id_start = 16;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_at(nir_before_impl(impl));

   if (shader->info.stage == MESA_SHADER_GEOMETRY)
      state.header = nir_load_gs_header_ir3(&b);
   else
      state.header = nir_load_tcs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_input(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);
}

static nir_def *
build_tcs_out_vertices(nir_builder *b)
{
   if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
      return nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
   else
      return nir_load_patch_vertices_in(b);
}

static nir_def *
build_per_vertex_offset(nir_builder *b, struct state *state,
                        nir_def *vertex, uint32_t location, uint32_t comp,
                        nir_def *offset)
{
   nir_def *patch_id = nir_load_rel_patch_id_ir3(b);
   nir_def *patch_stride = nir_load_hs_patch_stride_ir3(b);
   nir_def *patch_offset = nir_imul24(b, patch_id, patch_stride);
   nir_def *attr_offset;

   if (nir_src_is_const(nir_src_for_ssa(offset))) {
      location += nir_src_as_uint(nir_src_for_ssa(offset));
      offset = nir_imm_int(b, 0);
   } else {
      /* Offset is in vec4's, but we need it in unit of components for the
       * load/store_global_ir3 offset.
       */
      offset = nir_ishl_imm(b, offset, 2);
   }

   nir_def *vertex_offset;
   if (vertex) {
      unsigned index = shader_io_get_unique_index(location);
      switch (b->shader->info.stage) {
      case MESA_SHADER_TESS_CTRL:
         attr_offset = nir_imm_int(b, state->map.loc[index] + comp);
         break;
      case MESA_SHADER_TESS_EVAL:
         attr_offset = nir_iadd_imm(b, nir_load_primitive_location_ir3(b, index),
                                    comp);
         break;
      default:
         unreachable("bad shader state");
      }

      attr_offset = nir_iadd(b, attr_offset,
                             nir_imul24(b, offset, build_tcs_out_vertices(b)));
      vertex_offset = nir_ishl_imm(b, vertex, 2);
   } else {
      assert(location >= VARYING_SLOT_PATCH0 &&
             location <= VARYING_SLOT_TESS_MAX);
      unsigned index = location - VARYING_SLOT_PATCH0;
      attr_offset = nir_iadd_imm(b, offset, index * 4 + comp);
      vertex_offset = nir_imm_int(b, 0);
   }

   return nir_iadd(b, nir_iadd(b, patch_offset, attr_offset), vertex_offset);
}
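
/* Rough shape of the result above (units are dwords of the tess param BO;
 * illustrative, not a hardware spec):
 *
 *    per-vertex slot: patch_id * patch_stride + attr_offset + vertex * 4
 *    per-patch slot:  patch_id * patch_stride + (slot - PATCH0) * 4 + comp
 *                     + indirect offset
 */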

static nir_def *
build_patch_offset(nir_builder *b, struct state *state, uint32_t base,
                   uint32_t comp, nir_def *offset)
{
   return build_per_vertex_offset(b, state, NULL, base, comp, offset);
}

static void
tess_level_components(struct state *state, uint32_t *inner, uint32_t *outer)
{
   switch (state->topology) {
   case IR3_TESS_TRIANGLES:
      *inner = 1;
      *outer = 3;
      break;
   case IR3_TESS_QUADS:
      *inner = 2;
      *outer = 4;
      break;
   case IR3_TESS_ISOLINES:
      *inner = 0;
      *outer = 2;
      break;
   default:
      unreachable("bad");
   }
}

static nir_def *
build_tessfactor_base(nir_builder *b, gl_varying_slot slot, uint32_t comp,
                      struct state *state)
{
   uint32_t inner_levels, outer_levels;
   tess_level_components(state, &inner_levels, &outer_levels);

   const uint32_t patch_stride = 1 + inner_levels + outer_levels;

   nir_def *patch_id = nir_load_rel_patch_id_ir3(b);

   nir_def *patch_offset =
      nir_imul24(b, patch_id, nir_imm_int(b, patch_stride));

   uint32_t offset;
   switch (slot) {
   case VARYING_SLOT_PRIMITIVE_ID:
      offset = 0;
      break;
   case VARYING_SLOT_TESS_LEVEL_OUTER:
      offset = 1;
      break;
   case VARYING_SLOT_TESS_LEVEL_INNER:
      offset = 1 + outer_levels;
      break;
   default:
      unreachable("bad");
   }

   return nir_iadd_imm(b, patch_offset, offset + comp);
}
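
/* Per-patch layout of the tess factor BO assumed by the code above
 * (illustrative): [ primitive id ][ outer levels ][ inner levels ], so
 * patch_stride is 1 + outer_levels + inner_levels dwords, e.g. 1 + 4 + 2
 * for quads.
 */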

static void
lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_output: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *address = nir_load_tess_param_base_ir3(b);
         nir_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_per_vertex_output: {
         // src[] = { value, vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         nir_def *value = intr->src[0].ssa;
         nir_def *address = nir_load_tess_param_base_ir3(b);
         nir_def *offset = build_per_vertex_offset(
            b, state, intr->src[1].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[2].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value,
                           address, offset);

         break;
      }

      case nir_intrinsic_load_output: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            assert(intr->def.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* write patch output to bo */

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            uint32_t inner_levels, outer_levels, levels;
            tess_level_components(state, &inner_levels, &outer_levels);

            assert(intr->src[0].ssa->num_components == 1);

            nir_if *nif = NULL;
            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               /* While the tess levels are declared as float[4] and float[2],
                * the tess factor BO has smaller sizes for tris/isolines, so
                * we have to discard any writes beyond the number of
                * components for inner/outer levels.
                */
               if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
                  levels = outer_levels;
               else
                  levels = inner_levels;

               nir_def *offset = nir_iadd_imm(
                  b, intr->src[1].ssa, nir_intrinsic_component(intr));
               nif = nir_push_if(b, nir_ult_imm(b, offset, levels));
            }

            nir_def *offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa,
                              nir_load_tess_factor_base_ir3(b),
                              nir_iadd(b, intr->src[1].ssa, offset));

            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               nir_pop_if(b, nif);
            }
         } else {
            nir_def *address = nir_load_tess_param_base_ir3(b);
            nir_def *offset = build_patch_offset(
               b, state, location, nir_intrinsic_component(intr),
               intr->src[1].ssa);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa, address, offset);
         }
         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage, shader->info.internal)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
   v->output_size = state.map.stride;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_at(nir_before_impl(impl));

   state.header = nir_load_tcs_header_ir3(&b);

   /* If required, store gl_PrimitiveID. */
   if (v->key.tcs_store_primid) {
      b.cursor = nir_after_impl(impl);

      nir_store_output(&b, nir_load_primitive_id(&b), nir_imm_int(&b, 0),
                       .io_semantics = {
                          .location = VARYING_SLOT_PRIMITIVE_ID,
                          .num_slots = 1
                       });

      b.cursor = nir_before_impl(impl);
   }

   nir_foreach_block_safe (block, impl)
      lower_tess_ctrl_block(block, &b, &state);

   /* Now move the body of the TCS into a conditional:
    *
    *   if (gl_InvocationID < num_vertices)
    *     // body
    *
    */

   nir_cf_list body;
   nir_cf_extract(&body, nir_before_impl(impl), nir_after_impl(impl));

   b.cursor = nir_after_impl(impl);

   /* Re-emit the header, since the old one got moved into the if branch */
   state.header = nir_load_tcs_header_ir3(&b);
   nir_def *iid = build_invocation_id(&b, &state);

   const uint32_t nvertices = shader->info.tess.tcs_vertices_out;
   nir_def *cond = nir_ult_imm(&b, iid, nvertices);

   nir_if *nif = nir_push_if(&b, cond);

   nir_cf_reinsert(&body, b.cursor);

   b.cursor = nir_after_cf_list(&nif->then_list);

   nir_pop_if(&b, nif);

   nir_metadata_preserve(impl, nir_metadata_none);
}

static void
lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *address = nir_load_tess_param_base_ir3(b);
         nir_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_load_input: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            assert(intr->def.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage, shader->info.internal)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   NIR_PASS_V(shader, nir_lower_tess_coord_z, topology == IR3_TESS_TRIANGLES);

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_create(impl);

   nir_foreach_block_safe (block, impl)
      lower_tess_eval_block(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);

   nir_metadata_preserve(impl, nir_metadata_none);
}

/* The hardware does not support incomplete primitives in multiple streams at
 * once or ending the "wrong" stream, but Vulkan allows this. That is,
 * EmitStreamVertex(N) followed by EmitStreamVertex(M) or EndStreamPrimitive(M)
 * where N != M and there isn't a call to EndStreamPrimitive(N) in between isn't
 * supported by the hardware. Fix this up by duplicating the entire shader per
 * stream, removing EmitStreamVertex/EndStreamPrimitive calls for streams other
 * than the current one.
 */
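
/* For example (illustrative): a GS that does
 *
 *    EmitStreamVertex(0); EmitStreamVertex(1); EndStreamPrimitive(1);
 *
 * is turned into two copies of the body, one that only keeps the stream-0
 * EmitStreamVertex() and one that only keeps the stream-1 calls, each copy
 * followed by an extra EndStreamPrimitive() for its own stream.
 */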

static void
lower_mixed_streams(nir_shader *nir)
{
   /* We don't have to do anything for points because there is only one vertex
    * per primitive and therefore no possibility of mixing.
    */
   if (nir->info.gs.output_primitive == MESA_PRIM_POINTS)
      return;

   nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);

   uint8_t stream_mask = 0;

   nir_foreach_block (block, entrypoint) {
      nir_foreach_instr (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

         if (intrin->intrinsic == nir_intrinsic_emit_vertex ||
             intrin->intrinsic == nir_intrinsic_end_primitive)
            stream_mask |= 1 << nir_intrinsic_stream_id(intrin);
      }
   }

   if (util_is_power_of_two_or_zero(stream_mask))
      return;

   nir_cf_list body;
   nir_cf_list_extract(&body, &entrypoint->body);

   nir_builder b = nir_builder_create(entrypoint);

   u_foreach_bit (stream, stream_mask) {
      b.cursor = nir_after_impl(entrypoint);

      /* Inserting the cloned body invalidates any cursor not using an
       * instruction, so we need to emit this to keep track of where the new
       * body is to iterate over it.
       */
      nir_instr *anchor = &nir_nop(&b)->instr;

      nir_cf_list_clone_and_reinsert(&body, &entrypoint->cf_node, b.cursor, NULL);

      /* We need to iterate over all instructions after the anchor, which is a
       * bit tricky to do so we do it manually.
       */
      for (nir_block *block = anchor->block; block != NULL;
           block = nir_block_cf_tree_next(block)) {
         for (nir_instr *instr =
                 (block == anchor->block) ? anchor : nir_block_first_instr(block),
                 *next = instr ? nir_instr_next(instr) : NULL;
              instr != NULL; instr = next, next = next ? nir_instr_next(next) : NULL) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            if ((intrin->intrinsic == nir_intrinsic_emit_vertex ||
                 intrin->intrinsic == nir_intrinsic_end_primitive) &&
                nir_intrinsic_stream_id(intrin) != stream) {
               nir_instr_remove(instr);
            }
         }
      }

      nir_instr_remove(anchor);

      /* The user can omit the last EndStreamPrimitive(), so add an extra one
       * here before potentially adding other copies of the body that emit to
       * different streams. Our lowering means that redundant calls to
       * EndStreamPrimitive are safe and should be optimized out.
       */
      b.cursor = nir_after_impl(entrypoint);
      nir_end_primitive(&b, .stream_id = stream);
   }

   nir_cf_delete(&body);
}

static void
copy_vars(nir_builder *b, struct exec_list *dests, struct exec_list *srcs)
{
   foreach_two_lists (dest_node, dests, src_node, srcs) {
      nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
      nir_variable *src = exec_node_data(nir_variable, src_node, node);
      nir_copy_var(b, dest, src);
   }
}

static void
lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_end_primitive: {
         /* The HW will use the stream from the preceding emitted vertices,
          * which, thanks to lower_mixed_streams, is the same as the stream
          * for this instruction, so we can ignore it here.
          */
         b->cursor = nir_before_instr(&intr->instr);
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 4), 0x1);
         nir_instr_remove(&intr->instr);
         break;
      }

      case nir_intrinsic_emit_vertex: {
         /* Load the vertex count */
         b->cursor = nir_before_instr(&intr->instr);
         nir_def *count = nir_load_var(b, state->vertex_count_var);

         nir_push_if(b, nir_ieq(b, count, local_thread_id(b)));

         unsigned stream = nir_intrinsic_stream_id(intr);
         /* vertex_flags_out |= stream */
         nir_store_var(b, state->vertex_flags_out,
                       nir_ior_imm(b, nir_load_var(b, state->vertex_flags_out),
                                   stream),
                       0x1 /* .x */);

         copy_vars(b, &state->emit_outputs, &state->old_outputs);

         nir_instr_remove(&intr->instr);

         nir_store_var(b, state->emitted_vertex_var,
                       nir_iadd_imm(b,
                                    nir_load_var(b, state->emitted_vertex_var),
                                    1),
                       0x1);

         nir_pop_if(b, NULL);

         /* Increment the vertex count by 1 */
         nir_store_var(b, state->vertex_count_var,
                       nir_iadd_imm(b, count, 1), 0x1); /* .x */
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 0), 0x1);

         break;
      }

      default:
         break;
      }
   }
}
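
/* Roughly, the lowering above turns
 *
 *    EmitVertex();
 *
 * into (pseudocode sketch):
 *
 *    if (vertex_count == local_thread_id()) {
 *       vertex_flags_out |= stream;
 *       emit_outputs = old_outputs;
 *       emitted_vertex++;
 *    }
 *    vertex_count++;
 *    vertex_flags_out = 0;
 *
 * and EndPrimitive() into "vertex_flags_out = 4".
 */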

void
ir3_nir_lower_gs(nir_shader *shader)
{
   struct state state = {};

   /* Don't lower multiple times: */
   nir_foreach_shader_out_variable (var, shader)
      if (var->data.location == VARYING_SLOT_GS_VERTEX_FLAGS_IR3)
         return;

   if (shader_debug_enabled(shader->info.stage, shader->info.internal)) {
      mesa_logi("NIR (before gs lowering):");
      nir_log_shaderi(shader);
   }

   lower_mixed_streams(shader);

   /* Create an output var for vertex_flags. This will be shadowed below,
    * same way regular outputs get shadowed, and this variable will become a
    * temporary.
    */
   state.vertex_flags_out = nir_variable_create(
      shader, nir_var_shader_out, glsl_uint_type(), "vertex_flags");
   state.vertex_flags_out->data.driver_location = shader->num_outputs++;
   state.vertex_flags_out->data.location = VARYING_SLOT_GS_VERTEX_FLAGS_IR3;
   state.vertex_flags_out->data.interpolation = INTERP_MODE_NONE;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_at(nir_before_impl(impl));

   state.header = nir_load_gs_header_ir3(&b);

   /* Generate two sets of shadow vars for the output variables. The first
    * set replaces the real outputs and the second set (emit_outputs) we'll
    * assign in the emit_vertex conditionals. Then at the end of the shader
    * we copy the emit_outputs to the real outputs, so that we get
    * store_output in uniform control flow.
    */
   exec_list_make_empty(&state.old_outputs);
   nir_foreach_shader_out_variable_safe (var, shader) {
      exec_node_remove(&var->node);
      exec_list_push_tail(&state.old_outputs, &var->node);
   }
   exec_list_make_empty(&state.new_outputs);
   exec_list_make_empty(&state.emit_outputs);
   nir_foreach_variable_in_list (var, &state.old_outputs) {
      /* Create a new output var by cloning the original output var and
       * stealing the name.
       */
      nir_variable *output = nir_variable_clone(var, shader);
      exec_list_push_tail(&state.new_outputs, &output->node);

      /* Rewrite the original output to be a shadow variable. */
      var->name = ralloc_asprintf(var, "%s@gs-temp", output->name);
      var->data.mode = nir_var_shader_temp;

      /* Clone the shadow variable to create the emit shadow variable that
       * we'll assign in the emit conditionals.
       */
      nir_variable *emit_output = nir_variable_clone(var, shader);
      emit_output->name = ralloc_asprintf(var, "%s@emit-temp", output->name);
      exec_list_push_tail(&state.emit_outputs, &emit_output->node);
   }

   /* During the shader we'll keep track of which vertex we're currently
    * emitting for the EmitVertex test and how many vertices we emitted so we
    * know to discard if we didn't emit any. In most simple shaders, this can
    * all be statically determined and gets optimized away.
    */
   state.vertex_count_var =
      nir_local_variable_create(impl, glsl_uint_type(), "vertex_count");
   state.emitted_vertex_var =
      nir_local_variable_create(impl, glsl_uint_type(), "emitted_vertex");

   /* Initialize to 0. */
   b.cursor = nir_before_impl(impl);
   nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.emitted_vertex_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);

   nir_foreach_block_safe (block, impl)
      lower_gs_block(block, &b, &state);

   /* Note: returns are lowered, so there should be only one block before the
    * end block. If we had real returns, we would probably want to redirect
    * them to this new if statement, rather than emitting this code at every
    * return statement.
    */
   assert(impl->end_block->predecessors->entries == 1);
   nir_block *block = nir_impl_last_block(impl);
   b.cursor = nir_after_block_before_jump(block);

   /* If we haven't emitted any vertex we need to copy the shadow (old)
    * outputs to emit outputs here.
    *
    * Also some piglit GS tests[1] don't have EndPrimitive() so throw
    * in an extra vertex_flags write for good measure. If unneeded it
    * will be optimized out.
    *
    * [1] ex, tests/spec/glsl-1.50/execution/compatibility/clipping/gs-clip-vertex-const-accept.shader_test
    */
   nir_def *cond =
      nir_ieq_imm(&b, nir_load_var(&b, state.emitted_vertex_var), 0);
   nir_push_if(&b, cond);
   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);
   copy_vars(&b, &state.emit_outputs, &state.old_outputs);
   nir_pop_if(&b, NULL);

   nir_discard_if(&b, cond);

   copy_vars(&b, &state.new_outputs, &state.emit_outputs);

   exec_list_append(&shader->variables, &state.old_outputs);
   exec_list_append(&shader->variables, &state.emit_outputs);
   exec_list_append(&shader->variables, &state.new_outputs);

   nir_metadata_preserve(impl, nir_metadata_none);

   nir_lower_global_vars_to_local(shader);
   nir_split_var_copies(shader);
   nir_lower_var_copies(shader);

   nir_fixup_deref_modes(shader);

   if (shader_debug_enabled(shader->info.stage, shader->info.internal)) {
      mesa_logi("NIR (after gs lowering):");
      nir_log_shaderi(shader);
   }
}