/*
 * Copyright © 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler/nir/nir_builder.h"
#include "ir3_compiler.h"
#include "ir3_nir.h"

struct state {
   uint32_t topology;

   struct primitive_map {
      /* +POSITION, +PSIZE, ... - see shader_io_get_unique_index */
      unsigned loc[12 + 32];
      unsigned stride;
   } map;

   nir_def *header;

   nir_variable *vertex_count_var;
   nir_variable *emitted_vertex_var;
   nir_variable *vertex_flags_out;

   struct exec_list old_outputs;
   struct exec_list new_outputs;
   struct exec_list emit_outputs;

   /* tess ctrl shader on a650 gets the local primitive id at different bits: */
   unsigned local_primitive_id_start;
};

static nir_def *
bitfield_extract(nir_builder *b, nir_def *v, uint32_t start, uint32_t mask)
{
   return nir_iand_imm(b, nir_ushr_imm(b, v, start), mask);
}

static nir_def *
build_invocation_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, 11, 31);
}

static nir_def *
build_vertex_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, 6, 31);
}

static nir_def *
build_local_primitive_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, state->local_primitive_id_start,
                           63);
}
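
/* For reference, a sketch of the header bit layout implied by the extracts
 * above (derived from this pass only, not an authoritative hardware
 * description):
 *
 *   bits  6..10   vertex id          (mask 0x1f)
 *   bits 11..15   invocation id      (mask 0x1f)
 *   bits  N..N+5  local primitive id (mask 0x3f), where N is
 *                 local_primitive_id_start (0 by default, 16 for TCS when
 *                 tess_use_shared is set, see below)
 *
 * local_thread_id() further down extracts a 10-bit thread id starting at
 * bit 16 of the GS header.
 */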

static bool
is_tess_levels(gl_varying_slot slot)
{
   return (slot == VARYING_SLOT_PRIMITIVE_ID ||
           slot == VARYING_SLOT_TESS_LEVEL_OUTER ||
           slot == VARYING_SLOT_TESS_LEVEL_INNER);
}

/* Return a deterministic index for varyings. We can't rely on driver_location
 * to be correct without linking the different stages first, so we create
 * "primitive maps" where the producer decides on the location of each varying
 * slot and then exports a per-slot array to the consumer. This compacts the
 * gl_varying_slot space down a bit so that the primitive maps aren't too
 * large.
 *
 * Note: per-patch varyings are currently handled separately, without any
 * compacting.
 *
 * TODO: We could probably use the driver_locations directly in the non-SSO
 * (Vulkan) case.
 */
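
/* For example (simply restating the switch below): VARYING_SLOT_POS maps to
 * index 0, VARYING_SLOT_VIEWPORT to 11, and the generic varyings
 * VARYING_SLOT_VAR0..VAR31 map to 12..43, which is why the primitive map
 * arrays are sized 12 + 32.
 */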

static unsigned
shader_io_get_unique_index(gl_varying_slot slot)
{
   switch (slot) {
   case VARYING_SLOT_POS: return 0;
   case VARYING_SLOT_PSIZ: return 1;
   case VARYING_SLOT_COL0: return 2;
   case VARYING_SLOT_COL1: return 3;
   case VARYING_SLOT_BFC0: return 4;
   case VARYING_SLOT_BFC1: return 5;
   case VARYING_SLOT_FOGC: return 6;
   case VARYING_SLOT_CLIP_DIST0: return 7;
   case VARYING_SLOT_CLIP_DIST1: return 8;
   case VARYING_SLOT_CLIP_VERTEX: return 9;
   case VARYING_SLOT_LAYER: return 10;
   case VARYING_SLOT_VIEWPORT: return 11;
   case VARYING_SLOT_VAR0 ... VARYING_SLOT_VAR31: {
      struct state state = {};
      STATIC_ASSERT(ARRAY_SIZE(state.map.loc) - 1 ==
                    (12 + VARYING_SLOT_VAR31 - VARYING_SLOT_VAR0));
      struct ir3_shader_variant v = {};
      STATIC_ASSERT(ARRAY_SIZE(v.output_loc) - 1 ==
                    (12 + VARYING_SLOT_VAR31 - VARYING_SLOT_VAR0));
      return 12 + (slot - VARYING_SLOT_VAR0);
   }
   default:
      unreachable("illegal slot in get unique index\n");
   }
}

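/* A sketch of the local (shared memory) offset computed below, in bytes:
 *
 *   local_primitive_id * primitive_stride + vertex * vertex_stride +
 *   attr_offset + (offset << 4)
 *
 * where attr_offset comes from the primitive map (VS/TES producer side) or
 * from the primitive-location consts (TCS/GS consumer side), and offset is
 * the vec4-unit offset source of the IO intrinsic.
 */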
static nir_def *
build_local_offset(nir_builder *b, struct state *state, nir_def *vertex,
                   uint32_t location, uint32_t comp, nir_def *offset)
{
   nir_def *primitive_stride = nir_load_vs_primitive_stride_ir3(b);
   nir_def *primitive_offset =
      nir_imul24(b, build_local_primitive_id(b, state), primitive_stride);
   nir_def *attr_offset;
   nir_def *vertex_stride;
   unsigned index = shader_io_get_unique_index(location);

   switch (b->shader->info.stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      vertex_stride = nir_imm_int(b, state->map.stride * 4);
      attr_offset = nir_imm_int(b, state->map.loc[index] + 4 * comp);
      break;
   case MESA_SHADER_TESS_CTRL:
   case MESA_SHADER_GEOMETRY:
      vertex_stride = nir_load_vs_vertex_stride_ir3(b);
      attr_offset = nir_iadd_imm(b, nir_load_primitive_location_ir3(b, index),
                                 comp * 4);
      break;
   default:
      unreachable("bad shader stage");
   }

   nir_def *vertex_offset = nir_imul24(b, vertex, vertex_stride);

   return nir_iadd(
      b, nir_iadd(b, primitive_offset, vertex_offset),
      nir_iadd(b, attr_offset, nir_ishl_imm(b, offset, 4)));
}

static nir_intrinsic_instr *
replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
                  nir_intrinsic_op op, nir_def *src0, nir_def *src1,
                  nir_def *src2)
{
   nir_intrinsic_instr *new_intr = nir_intrinsic_instr_create(b->shader, op);

   new_intr->src[0] = nir_src_for_ssa(src0);
   if (src1)
      new_intr->src[1] = nir_src_for_ssa(src1);
   if (src2)
      new_intr->src[2] = nir_src_for_ssa(src2);

   new_intr->num_components = intr->num_components;

   if (nir_intrinsic_infos[op].has_dest)
      nir_def_init(&new_intr->instr, &new_intr->def,
                   intr->num_components, intr->def.bit_size);

   nir_builder_instr_insert(b, &new_intr->instr);

   if (nir_intrinsic_infos[op].has_dest)
      nir_def_rewrite_uses(&intr->def, &new_intr->def);

   nir_instr_remove(&intr->instr);

   return new_intr;
}
static void
build_primitive_map(nir_shader *shader, struct primitive_map *map)
{
   /* All interfaces except the TCS <-> TES interface use ldlw, which takes
    * an offset in bytes, so each vec4 slot is 16 bytes. TCS <-> TES uses
    * ldg, which takes an offset in dwords, but each per-vertex slot has
    * space for every vertex, and there's space at the beginning for
    * per-patch varyings.
    */
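   /* For example (illustrative only, restating the code below): a VS that
    * writes just POS and VAR0 ends up with loc[0] = 0, loc[12] = 16 and a
    * byte stride of 32, which the non-TCS path converts to stride = 8 dwords.
    */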
   unsigned slot_size = 16, start = 0;
   if (shader->info.stage == MESA_SHADER_TESS_CTRL) {
      slot_size = shader->info.tess.tcs_vertices_out * 4;
      start = util_last_bit(shader->info.patch_outputs_written) * 4;
   }

   uint64_t mask = shader->info.outputs_written;
   unsigned loc = start;
   while (mask) {
      int location = u_bit_scan64(&mask);
      if (is_tess_levels(location))
         continue;

      unsigned index = shader_io_get_unique_index(location);
      map->loc[index] = loc;
      loc += slot_size;
   }

   map->stride = loc;
   /* Use units of dwords for the stride. */
   if (shader->info.stage != MESA_SHADER_TESS_CTRL)
      map->stride /= 4;
}

/* For shader stages that receive a primitive map, calculate how big it should
 * be.
 */
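
/* For example (a sketch): a TES that reads only POS and VAR2 gets a map size
 * of shader_io_get_unique_index(VARYING_SLOT_VAR2) + 1 = 15 slots.
 */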

static unsigned
calc_primitive_map_size(nir_shader *shader)
{
   uint64_t mask = shader->info.inputs_read;
   unsigned max_index = 0;
   while (mask) {
      int location = u_bit_scan64(&mask);

      if (is_tess_levels(location))
         continue;

      unsigned index = shader_io_get_unique_index(location);
      max_index = MAX2(max_index, index + 1);
   }

   return max_index;
}

static void
lower_block_to_explicit_output(nir_block *block, nir_builder *b,
                               struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* nir_lower_io_to_temporaries replaces all access to output
          * variables with temp variables and then emits a nir_copy_var at
          * the end of the shader. Thus, we should always get a full wrmask
          * here.
          */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         b->cursor = nir_instr_remove(&intr->instr);

         nir_def *vertex_id = build_vertex_id(b, state);
         nir_def *offset = build_local_offset(
            b, state, vertex_id, nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         nir_store_shared_ir3(b, intr->src[0].ssa, offset);
         break;
      }

      default:
         break;
      }
   }
}

static nir_def *
local_thread_id(nir_builder *b)
{
   return bitfield_extract(b, nir_load_gs_header_ir3(b), 16, 1023);
}

void
ir3_nir_lower_to_explicit_output(nir_shader *shader,
                                 struct ir3_shader_variant *v,
                                 unsigned topology)
{
   struct state state = {};

   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_at(nir_before_impl(impl));

   if (v->type == MESA_SHADER_VERTEX && topology != IR3_TESS_NONE)
      state.header = nir_load_tcs_header_ir3(&b);
   else
      state.header = nir_load_gs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_output(block, &b, &state);

   nir_metadata_preserve(impl,
                         nir_metadata_block_index | nir_metadata_dominance);

   v->output_size = state.map.stride;
}

static void
lower_block_to_explicit_input(nir_block *block, nir_builder *b,
                              struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *offset = build_local_offset(
            b, state,
            intr->src[0].ssa, // this is typically gl_InvocationID
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL,
                           NULL);
         break;
      }

      case nir_intrinsic_load_invocation_id: {
         b->cursor = nir_before_instr(&intr->instr);

         nir_def *iid = build_invocation_id(b, state);
         nir_def_rewrite_uses(&intr->def, iid);
         nir_instr_remove(&intr->instr);
         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_to_explicit_input(nir_shader *shader,
                                struct ir3_shader_variant *v)
{
   struct state state = {};

   /* when using stl/ldl (instead of stlw/ldlw) for linking VS and HS,
    * HS uses a different primitive id, which starts at bit 16 in the header
    */
   if (shader->info.stage == MESA_SHADER_TESS_CTRL &&
       v->compiler->tess_use_shared)
      state.local_primitive_id_start = 16;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_at(nir_before_impl(impl));

   if (shader->info.stage == MESA_SHADER_GEOMETRY)
      state.header = nir_load_gs_header_ir3(&b);
   else
      state.header = nir_load_tcs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_input(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);
}

static nir_def *
build_tcs_out_vertices(nir_builder *b)
{
   if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
      return nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
   else
      return nir_load_patch_vertices_in(b);
}

static nir_def *
build_per_vertex_offset(nir_builder *b, struct state *state,
                        nir_def *vertex, uint32_t location, uint32_t comp,
                        nir_def *offset)
{
   nir_def *patch_id = nir_load_rel_patch_id_ir3(b);
   nir_def *patch_stride = nir_load_hs_patch_stride_ir3(b);
   nir_def *patch_offset = nir_imul24(b, patch_id, patch_stride);
   nir_def *attr_offset;

   if (nir_src_is_const(nir_src_for_ssa(offset))) {
      location += nir_src_as_uint(nir_src_for_ssa(offset));
      offset = nir_imm_int(b, 0);
   } else {
      /* Offset is in vec4's, but we need it in units of components for the
       * load/store_global_ir3 offset.
       */
      offset = nir_ishl_imm(b, offset, 2);
   }

   nir_def *vertex_offset;
   if (vertex) {
      unsigned index = shader_io_get_unique_index(location);
      switch (b->shader->info.stage) {
      case MESA_SHADER_TESS_CTRL:
         attr_offset = nir_imm_int(b, state->map.loc[index] + comp);
         break;
      case MESA_SHADER_TESS_EVAL:
         attr_offset = nir_iadd_imm(b, nir_load_primitive_location_ir3(b, index),
                                    comp);
         break;
      default:
         unreachable("bad shader stage");
      }

      attr_offset = nir_iadd(b, attr_offset,
                             nir_imul24(b, offset, build_tcs_out_vertices(b)));
      vertex_offset = nir_ishl_imm(b, vertex, 2);
   } else {
      assert(location >= VARYING_SLOT_PATCH0 &&
             location <= VARYING_SLOT_TESS_MAX);
      unsigned index = location - VARYING_SLOT_PATCH0;
      attr_offset = nir_iadd_imm(b, offset, index * 4 + comp);
      vertex_offset = nir_imm_int(b, 0);
   }

   return nir_iadd(b, nir_iadd(b, patch_offset, attr_offset), vertex_offset);
}

static nir_def *
build_patch_offset(nir_builder *b, struct state *state, uint32_t base,
                   uint32_t comp, nir_def *offset)
{
   return build_per_vertex_offset(b, state, NULL, base, comp, offset);
}

static void
tess_level_components(struct state *state, uint32_t *inner, uint32_t *outer)
{
   switch (state->topology) {
   case IR3_TESS_TRIANGLES:
      *inner = 1;
      *outer = 3;
      break;
   case IR3_TESS_QUADS:
      *inner = 2;
      *outer = 4;
      break;
   case IR3_TESS_ISOLINES:
      *inner = 0;
      *outer = 2;
      break;
   default:
      unreachable("bad");
   }
}
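
/* Per-patch layout of the tess factor BO implied by build_tessfactor_base()
 * below (a sketch, in dwords): [ primitive id | outer levels | inner levels ],
 * i.e. patch_stride = 1 + outer_levels + inner_levels. For IR3_TESS_TRIANGLES
 * that is 1 + 3 + 1 = 5 dwords per patch, with the outer levels at offsets
 * 1..3 and the single inner level at offset 4.
 */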

static nir_def *
build_tessfactor_base(nir_builder *b, gl_varying_slot slot, uint32_t comp,
                      struct state *state)
{
   uint32_t inner_levels, outer_levels;
   tess_level_components(state, &inner_levels, &outer_levels);

   const uint32_t patch_stride = 1 + inner_levels + outer_levels;

   nir_def *patch_id = nir_load_rel_patch_id_ir3(b);

   nir_def *patch_offset =
      nir_imul24(b, patch_id, nir_imm_int(b, patch_stride));

   uint32_t offset;
   switch (slot) {
   case VARYING_SLOT_PRIMITIVE_ID:
      offset = 0;
      break;
   case VARYING_SLOT_TESS_LEVEL_OUTER:
      offset = 1;
      break;
   case VARYING_SLOT_TESS_LEVEL_INNER:
      offset = 1 + outer_levels;
      break;
   default:
      unreachable("bad");
   }

   return nir_iadd_imm(b, patch_offset, offset + comp);
}

static void
lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_output: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *address = nir_load_tess_param_base_ir3(b);
         nir_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_per_vertex_output: {
         // src[] = { value, vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         nir_def *value = intr->src[0].ssa;
         nir_def *address = nir_load_tess_param_base_ir3(b);
         nir_def *offset = build_per_vertex_offset(
            b, state, intr->src[1].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[2].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value,
                           address, offset);

         break;
      }

      case nir_intrinsic_load_output: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            assert(intr->def.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* write patch output to bo */

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            uint32_t inner_levels, outer_levels, levels;
            tess_level_components(state, &inner_levels, &outer_levels);

            assert(intr->src[0].ssa->num_components == 1);

            nir_if *nif = NULL;
            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               /* Tess levels are defined as float[4] and float[2], but the
                * tess factor BO has smaller sizes for tris/isolines, so we
                * have to discard any writes beyond the number of components
                * for inner/outer levels.
                */
               if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
                  levels = outer_levels;
               else
                  levels = inner_levels;

               nir_def *offset = nir_iadd_imm(
                  b, intr->src[1].ssa, nir_intrinsic_component(intr));
               nif = nir_push_if(b, nir_ult_imm(b, offset, levels));
            }

            nir_def *offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa,
                              nir_load_tess_factor_base_ir3(b),
                              nir_iadd(b, intr->src[1].ssa, offset));

            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               nir_pop_if(b, nif);
            }
         } else {
            nir_def *address = nir_load_tess_param_base_ir3(b);
            nir_def *offset = build_patch_offset(
               b, state, location, nir_intrinsic_component(intr),
               intr->src[1].ssa);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa, address, offset);
         }
         break;
      }

      default:
         break;
      }
   }
}

static void
emit_tess_epilogue(nir_builder *b, struct state *state)
{
   /* Insert endpatch instruction:
    *
    * TODO we should re-work this to use normal flow control.
    */

   nir_end_patch_ir3(b);
}

void
ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage, shader->info.internal)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
   v->output_size = state.map.stride;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_at(nir_before_impl(impl));

   state.header = nir_load_tcs_header_ir3(&b);

   /* If required, store gl_PrimitiveID. */
   if (v->key.tcs_store_primid) {
      b.cursor = nir_after_impl(impl);

      nir_store_output(&b, nir_load_primitive_id(&b), nir_imm_int(&b, 0),
                       .io_semantics = {
                          .location = VARYING_SLOT_PRIMITIVE_ID,
                          .num_slots = 1
                       });

      b.cursor = nir_before_impl(impl);
   }

   nir_foreach_block_safe (block, impl)
      lower_tess_ctrl_block(block, &b, &state);

   /* Now move the body of the TCS into a conditional:
    *
    *   if (gl_InvocationID < num_vertices)
    *      // body
    *
    */

   nir_cf_list body;
   nir_cf_extract(&body, nir_before_impl(impl),
                  nir_after_impl(impl));

   b.cursor = nir_after_impl(impl);

   /* Re-emit the header, since the old one got moved into the if branch */
   state.header = nir_load_tcs_header_ir3(&b);
   nir_def *iid = build_invocation_id(&b, &state);

   const uint32_t nvertices = shader->info.tess.tcs_vertices_out;
   nir_def *cond = nir_ult_imm(&b, iid, nvertices);

   nir_if *nif = nir_push_if(&b, cond);

   nir_cf_reinsert(&body, b.cursor);

   b.cursor = nir_after_cf_list(&nif->then_list);

   /* Insert conditional exit for threads with invocation id != 0 */
   nir_def *iid0_cond = nir_ieq_imm(&b, iid, 0);
   nir_cond_end_ir3(&b, iid0_cond);

   emit_tess_epilogue(&b, &state);

   nir_pop_if(&b, nif);

   nir_metadata_preserve(impl, nir_metadata_none);
}

static void
lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *address = nir_load_tess_param_base_ir3(b);
         nir_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_load_input: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            assert(intr->def.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage, shader->info.internal)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   NIR_PASS_V(shader, nir_lower_tess_coord_z, topology == IR3_TESS_TRIANGLES);

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_create(impl);

   nir_foreach_block_safe (block, impl)
      lower_tess_eval_block(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);

   nir_metadata_preserve(impl, nir_metadata_none);
}

/* The hardware does not support incomplete primitives in multiple streams at
 * once or ending the "wrong" stream, but Vulkan allows this. That is,
 * EmitStreamVertex(N) followed by EmitStreamVertex(M) or EndStreamPrimitive(M)
 * where N != M and there isn't a call to EndStreamPrimitive(N) in between isn't
 * supported by the hardware. Fix this up by duplicating the entire shader per
 * stream, removing EmitStreamVertex/EndStreamPrimitive calls for streams other
 * than the current one.
 */
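
/* For example (a sketch): a GS that does EmitStreamVertex(0),
 * EmitStreamVertex(1), EndStreamPrimitive(1) has stream_mask = 0b11, so the
 * body is cloned once per stream; the stream-0 copy drops the stream-1
 * emit/end intrinsics and vice versa, and each copy gets a trailing
 * EndStreamPrimitive for its own stream.
 */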

static void
lower_mixed_streams(nir_shader *nir)
{
   /* We don't have to do anything for points because there is only one vertex
    * per primitive and therefore no possibility of mixing.
    */
   if (nir->info.gs.output_primitive == MESA_PRIM_POINTS)
      return;

   nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);

   uint8_t stream_mask = 0;

   nir_foreach_block (block, entrypoint) {
      nir_foreach_instr (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

         if (intrin->intrinsic == nir_intrinsic_emit_vertex ||
             intrin->intrinsic == nir_intrinsic_end_primitive)
            stream_mask |= 1 << nir_intrinsic_stream_id(intrin);
      }
   }

   if (util_is_power_of_two_or_zero(stream_mask))
      return;

   nir_cf_list body;
   nir_cf_list_extract(&body, &entrypoint->body);

   nir_builder b = nir_builder_create(entrypoint);

   u_foreach_bit (stream, stream_mask) {
      b.cursor = nir_after_impl(entrypoint);

      /* Inserting the cloned body invalidates any cursor not using an
       * instruction, so we need to emit this to keep track of where the new
       * body is to iterate over it.
       */
      nir_instr *anchor = &nir_nop(&b)->instr;

      nir_cf_list_clone_and_reinsert(&body, &entrypoint->cf_node, b.cursor, NULL);

      /* We need to iterate over all instructions after the anchor, which is a
       * bit tricky to do so we do it manually.
       */
      for (nir_block *block = anchor->block; block != NULL;
           block = nir_block_cf_tree_next(block)) {
         for (nir_instr *instr =
                 (block == anchor->block) ? anchor : nir_block_first_instr(block),
                 *next = instr ? nir_instr_next(instr) : NULL;
              instr != NULL;
              instr = next, next = next ? nir_instr_next(next) : NULL) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            if ((intrin->intrinsic == nir_intrinsic_emit_vertex ||
                 intrin->intrinsic == nir_intrinsic_end_primitive) &&
                nir_intrinsic_stream_id(intrin) != stream) {
               nir_instr_remove(instr);
            }
         }
      }

      nir_instr_remove(anchor);

      /* The user can omit the last EndStreamPrimitive(), so add an extra one
       * here before potentially adding other copies of the body that emit to
       * different streams. Our lowering means that redundant calls to
       * EndStreamPrimitive are safe and should be optimized out.
       */
      b.cursor = nir_after_impl(entrypoint);
      nir_end_primitive(&b, .stream_id = stream);
   }

   nir_cf_delete(&body);
}

static void
copy_vars(nir_builder *b, struct exec_list *dests, struct exec_list *srcs)
{
   foreach_two_lists (dest_node, dests, src_node, srcs) {
      nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
      nir_variable *src = exec_node_data(nir_variable, src_node, node);
      nir_copy_var(b, dest, src);
   }
}

static void
lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_end_primitive: {
         /* The HW will use the stream from the preceding emitted vertices,
          * which thanks to lower_mixed_streams is the same as the stream
          * for this instruction, so we can ignore it here.
          */
         b->cursor = nir_before_instr(&intr->instr);
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 4), 0x1);
         nir_instr_remove(&intr->instr);
         break;
      }

      case nir_intrinsic_emit_vertex: {
         /* Load the vertex count */
         b->cursor = nir_before_instr(&intr->instr);
         nir_def *count = nir_load_var(b, state->vertex_count_var);

         nir_push_if(b, nir_ieq(b, count, local_thread_id(b)));

         unsigned stream = nir_intrinsic_stream_id(intr);
         /* vertex_flags_out |= stream */
         nir_store_var(b, state->vertex_flags_out,
                       nir_ior_imm(b, nir_load_var(b, state->vertex_flags_out),
                                   stream),
                       0x1 /* .x */);

         copy_vars(b, &state->emit_outputs, &state->old_outputs);

         nir_instr_remove(&intr->instr);

         nir_store_var(b, state->emitted_vertex_var,
                       nir_iadd_imm(b,
                                    nir_load_var(b,
                                                 state->emitted_vertex_var),
                                    1),
                       0x1);

         nir_pop_if(b, NULL);

         /* Increment the vertex count by 1 */
         nir_store_var(b, state->vertex_count_var,
                       nir_iadd_imm(b, count, 1), 0x1); /* .x */
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 0), 0x1);

         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_gs(nir_shader *shader)
{
   struct state state = {};

   /* Don't lower multiple times: */
   nir_foreach_shader_out_variable (var, shader)
      if (var->data.location == VARYING_SLOT_GS_VERTEX_FLAGS_IR3)
         return;

   if (shader_debug_enabled(shader->info.stage, shader->info.internal)) {
      mesa_logi("NIR (before gs lowering):");
      nir_log_shaderi(shader);
   }

   lower_mixed_streams(shader);

   /* Create an output var for vertex_flags. This will be shadowed below,
    * the same way regular outputs get shadowed, and this variable will become
    * a temporary.
    */
   state.vertex_flags_out = nir_variable_create(
      shader, nir_var_shader_out, glsl_uint_type(), "vertex_flags");
   state.vertex_flags_out->data.driver_location = shader->num_outputs++;
   state.vertex_flags_out->data.location = VARYING_SLOT_GS_VERTEX_FLAGS_IR3;
   state.vertex_flags_out->data.interpolation = INTERP_MODE_NONE;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_at(nir_before_impl(impl));

   state.header = nir_load_gs_header_ir3(&b);

   /* Generate two sets of shadow vars for the output variables. The first
    * set replaces the real outputs and the second set (emit_outputs) we'll
    * assign in the emit_vertex conditionals. Then at the end of the shader
    * we copy the emit_outputs to the real outputs, so that we get
    * store_output in uniform control flow.
    */
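   /* For example (illustrative only): a GS output "color" becomes the
    * temporary "color@gs-temp" that the shader body writes, plus a fresh
    * "color" output and a "color@emit-temp" copy written inside the
    * EmitVertex() conditional; the epilogue then copies the emit temps to
    * the real outputs.
    */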
   exec_list_make_empty(&state.old_outputs);
   nir_foreach_shader_out_variable_safe (var, shader) {
      exec_node_remove(&var->node);
      exec_list_push_tail(&state.old_outputs, &var->node);
   }
   exec_list_make_empty(&state.new_outputs);
   exec_list_make_empty(&state.emit_outputs);
   nir_foreach_variable_in_list (var, &state.old_outputs) {
      /* Create a new output var by cloning the original output var and
       * stealing the name.
       */
      nir_variable *output = nir_variable_clone(var, shader);
      exec_list_push_tail(&state.new_outputs, &output->node);

      /* Rewrite the original output to be a shadow variable. */
      var->name = ralloc_asprintf(var, "%s@gs-temp", output->name);
      var->data.mode = nir_var_shader_temp;

      /* Clone the shadow variable to create the emit shadow variable that
       * we'll assign in the emit conditionals.
       */
      nir_variable *emit_output = nir_variable_clone(var, shader);
      emit_output->name = ralloc_asprintf(var, "%s@emit-temp", output->name);
      exec_list_push_tail(&state.emit_outputs, &emit_output->node);
   }

   /* During the shader we'll keep track of which vertex we're currently
    * emitting for the EmitVertex test and how many vertices we emitted, so
    * we know to discard if we didn't emit any. In most simple shaders, this
    * can all be statically determined and gets optimized away.
    */
   state.vertex_count_var =
      nir_local_variable_create(impl, glsl_uint_type(), "vertex_count");
   state.emitted_vertex_var =
      nir_local_variable_create(impl, glsl_uint_type(), "emitted_vertex");

   /* Initialize to 0 (and vertex_flags to 4, the same value the
    * end_primitive lowering writes).
    */
   b.cursor = nir_before_impl(impl);
   nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.emitted_vertex_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);

   nir_foreach_block_safe (block, impl)
      lower_gs_block(block, &b, &state);

   /* Note: returns are lowered, so there should be only one block before the
    * end block. If we had real returns, we would probably want to redirect
    * them to this new if statement, rather than emitting this code at every
    * return statement.
    */
   assert(impl->end_block->predecessors->entries == 1);
   nir_block *block = nir_impl_last_block(impl);
   b.cursor = nir_after_block_before_jump(block);

   /* If we haven't emitted any vertex we need to copy the shadow (old)
    * outputs to emit outputs here.
    *
    * Also some piglit GS tests[1] don't have EndPrimitive() so throw
    * in an extra vertex_flags write for good measure. If unneeded it
    * will be optimized out.
    *
    * [1] ex, tests/spec/glsl-1.50/execution/compatibility/clipping/gs-clip-vertex-const-accept.shader_test
    */
   nir_def *cond =
      nir_ieq_imm(&b, nir_load_var(&b, state.emitted_vertex_var), 0);
   nir_push_if(&b, cond);
   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);
   copy_vars(&b, &state.emit_outputs, &state.old_outputs);
   nir_pop_if(&b, NULL);

   nir_discard_if(&b, cond);

   copy_vars(&b, &state.new_outputs, &state.emit_outputs);

   exec_list_append(&shader->variables, &state.old_outputs);
   exec_list_append(&shader->variables, &state.emit_outputs);
   exec_list_append(&shader->variables, &state.new_outputs);

   nir_metadata_preserve(impl, nir_metadata_none);

   nir_lower_global_vars_to_local(shader);
   nir_split_var_copies(shader);
   nir_lower_var_copies(shader);

   nir_fixup_deref_modes(shader);

   if (shader_debug_enabled(shader->info.stage, shader->info.internal)) {
      mesa_logi("NIR (after gs lowering):");
      nir_log_shaderi(shader);
   }
}