/*
 * Copyright © 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler/nir/nir_builder.h"
#include "ir3_compiler.h"
#include "ir3_nir.h"

struct state {
   uint32_t topology;

   struct primitive_map {
      /* +POSITION, +PSIZE, ... - see shader_io_get_unique_index */
      unsigned loc[12 + 32];
      unsigned stride;
   } map;

   nir_ssa_def *header;

   nir_variable *vertex_count_var;
   nir_variable *emitted_vertex_var;
   nir_variable *vertex_flags_out;

   struct exec_list old_outputs;
   struct exec_list new_outputs;
   struct exec_list emit_outputs;

   /* tess ctrl shader on a650 gets the local primitive id at different bits: */
   unsigned local_primitive_id_start;
};

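/* The helpers below extract the fields they need from the tcs/gs header
 * sysval: bits [6:10] hold the vertex id, bits [11:15] the invocation id,
 * and a 6-bit local primitive id starts at state->local_primitive_id_start
 * (bit 0 by default, bit 16 when the TCS is linked with stl/ldl, see
 * ir3_nir_lower_to_explicit_input()).
 */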
static nir_ssa_def *
bitfield_extract(nir_builder *b, nir_ssa_def *v, uint32_t start, uint32_t mask)
{
   return nir_iand(b, nir_ushr(b, v, nir_imm_int(b, start)),
                   nir_imm_int(b, mask));
}

static nir_ssa_def *
build_invocation_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, 11, 31);
}

static nir_ssa_def *
build_vertex_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, 6, 31);
}

static nir_ssa_def *
build_local_primitive_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, state->local_primitive_id_start,
                           63);
}

static bool
is_tess_levels(gl_varying_slot slot)
{
   return (slot == VARYING_SLOT_PRIMITIVE_ID ||
           slot == VARYING_SLOT_TESS_LEVEL_OUTER ||
           slot == VARYING_SLOT_TESS_LEVEL_INNER);
}

/* Return a deterministic index for varyings. We can't rely on driver_location
 * to be correct without linking the different stages first, so we create
 * "primitive maps" where the producer decides on the location of each varying
 * slot and then exports a per-slot array to the consumer. This compacts the
 * gl_varying_slot space down a bit so that the primitive maps aren't too
 * large.
 *
 * Note: per-patch varyings are currently handled separately, without any
 * compacting.
 *
 * TODO: We could probably use the driver_locations directly in the non-SSO
 * (Vulkan) case.
 */

static unsigned
shader_io_get_unique_index(gl_varying_slot slot)
{
   switch (slot) {
   case VARYING_SLOT_POS: return 0;
   case VARYING_SLOT_PSIZ: return 1;
   case VARYING_SLOT_COL0: return 2;
   case VARYING_SLOT_COL1: return 3;
   case VARYING_SLOT_BFC0: return 4;
   case VARYING_SLOT_BFC1: return 5;
   case VARYING_SLOT_FOGC: return 6;
   case VARYING_SLOT_CLIP_DIST0: return 7;
   case VARYING_SLOT_CLIP_DIST1: return 8;
   case VARYING_SLOT_CLIP_VERTEX: return 9;
   case VARYING_SLOT_LAYER: return 10;
   case VARYING_SLOT_VIEWPORT: return 11;
   case VARYING_SLOT_VAR0 ... VARYING_SLOT_VAR31: {
      struct state state = {};
      STATIC_ASSERT(ARRAY_SIZE(state.map.loc) - 1 ==
                    (12 + VARYING_SLOT_VAR31 - VARYING_SLOT_VAR0));
      struct ir3_shader_variant v = {};
      STATIC_ASSERT(ARRAY_SIZE(v.output_loc) - 1 ==
                    (12 + VARYING_SLOT_VAR31 - VARYING_SLOT_VAR0));
      return 12 + (slot - VARYING_SLOT_VAR0);
   }
   default:
      unreachable("illegal slot in get unique index\n");
   }
}

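/* Compute the byte offset into the local (shared) scratch storage for a
 * given vertex, varying slot, and component: primitive offset + vertex
 * offset + attribute offset, with the intrinsic's vec4 offset source scaled
 * to bytes. The producer stages use the stride/locations from the primitive
 * map (see build_primitive_map()); the consumer stages read them through
 * the *_ir3 sysval intrinsics.
 */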
static nir_ssa_def *
build_local_offset(nir_builder *b, struct state *state, nir_ssa_def *vertex,
                   uint32_t location, uint32_t comp, nir_ssa_def *offset)
{
   nir_ssa_def *primitive_stride = nir_load_vs_primitive_stride_ir3(b);
   nir_ssa_def *primitive_offset =
      nir_imul24(b, build_local_primitive_id(b, state), primitive_stride);
   nir_ssa_def *attr_offset;
   nir_ssa_def *vertex_stride;
   unsigned index = shader_io_get_unique_index(location);

   switch (b->shader->info.stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      vertex_stride = nir_imm_int(b, state->map.stride * 4);
      attr_offset = nir_imm_int(b, state->map.loc[index] + 4 * comp);
      break;
   case MESA_SHADER_TESS_CTRL:
   case MESA_SHADER_GEOMETRY:
      vertex_stride = nir_load_vs_vertex_stride_ir3(b);
      attr_offset = nir_iadd(b, nir_load_primitive_location_ir3(b, index),
                             nir_imm_int(b, comp * 4));
      break;
   default:
      unreachable("bad shader stage");
   }

   nir_ssa_def *vertex_offset = nir_imul24(b, vertex, vertex_stride);

   return nir_iadd(
      b, nir_iadd(b, primitive_offset, vertex_offset),
      nir_iadd(b, attr_offset, nir_ishl(b, offset, nir_imm_int(b, 4))));
}

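/* Replace an IO intrinsic with an ir3-specific one taking up to three
 * sources (src1/src2 may be NULL), re-using the original's num_components
 * and destination bit size, rewriting all uses of the old destination, and
 * removing the original instruction.
 */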
static nir_intrinsic_instr *
replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
                  nir_intrinsic_op op, nir_ssa_def *src0, nir_ssa_def *src1,
                  nir_ssa_def *src2)
{
   nir_intrinsic_instr *new_intr = nir_intrinsic_instr_create(b->shader, op);

   new_intr->src[0] = nir_src_for_ssa(src0);
   if (src1)
      new_intr->src[1] = nir_src_for_ssa(src1);
   if (src2)
      new_intr->src[2] = nir_src_for_ssa(src2);

   new_intr->num_components = intr->num_components;

   if (nir_intrinsic_infos[op].has_dest)
      nir_ssa_dest_init(&new_intr->instr, &new_intr->dest, intr->num_components,
                        intr->dest.ssa.bit_size, NULL);

   nir_builder_instr_insert(b, &new_intr->instr);

   if (nir_intrinsic_infos[op].has_dest)
      nir_ssa_def_rewrite_uses(&intr->dest.ssa, &new_intr->dest.ssa);

   nir_instr_remove(&intr->instr);

   return new_intr;
}

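/* Assign a location to every written output slot (skipping the tess levels,
 * which live in the tess factor BO instead) and record the total stride of
 * the resulting layout.
 */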
static void
build_primitive_map(nir_shader *shader, struct primitive_map *map)
{
   /* All interfaces except the TCS <-> TES interface use ldlw, which takes
    * an offset in bytes, so each vec4 slot is 16 bytes. TCS <-> TES uses
    * ldg, which takes an offset in dwords, but each per-vertex slot has
    * space for every vertex, and there's space at the beginning for
    * per-patch varyings.
    */
   unsigned slot_size = 16, start = 0;
   if (shader->info.stage == MESA_SHADER_TESS_CTRL) {
      slot_size = shader->info.tess.tcs_vertices_out * 4;
      start = util_last_bit(shader->info.patch_outputs_written) * 4;
   }

   uint64_t mask = shader->info.outputs_written;
   unsigned loc = start;
   while (mask) {
      int location = u_bit_scan64(&mask);
      if (is_tess_levels(location))
         continue;

      unsigned index = shader_io_get_unique_index(location);
      map->loc[index] = loc;
      loc += slot_size;
   }

   map->stride = loc;
   /* Use units of dwords for the stride. */
   if (shader->info.stage != MESA_SHADER_TESS_CTRL)
      map->stride /= 4;
}

/* For shader stages that receive a primitive map, calculate how big it should
 * be.
 */

static unsigned
calc_primitive_map_size(nir_shader *shader)
{
   uint64_t mask = shader->info.inputs_read;
   unsigned max_index = 0;
   while (mask) {
      int location = u_bit_scan64(&mask);

      if (is_tess_levels(location))
         continue;

      unsigned index = shader_io_get_unique_index(location);
      max_index = MAX2(max_index, index + 1);
   }

   return max_index;
}

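/* Lower store_output in a producer stage into a store_shared_ir3 at the
 * offset computed by build_local_offset(), keyed by this invocation's
 * vertex id; the following stage reads the value back with load_shared_ir3
 * (see lower_block_to_explicit_input()).
 */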
static void
lower_block_to_explicit_output(nir_block *block, nir_builder *b,
                               struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* nir_lower_io_to_temporaries replaces all access to output
          * variables with temp variables and then emits a nir_copy_var at
          * the end of the shader. Thus, we should always get a full wrmask
          * here.
          */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         b->cursor = nir_instr_remove(&intr->instr);

         nir_ssa_def *vertex_id = build_vertex_id(b, state);
         nir_ssa_def *offset = build_local_offset(
            b, state, vertex_id, nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         nir_store_shared_ir3(b, intr->src[0].ssa, offset);
         break;
      }

      default:
         break;
      }
   }
}

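/* Thread id within the local group, taken from bits [16:25] of the GS
 * header; lower_gs_block() compares it against the running vertex count to
 * serialize EmitVertex() handling.
 */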
static nir_ssa_def *
local_thread_id(nir_builder *b)
{
   return bitfield_extract(b, nir_load_gs_header_ir3(b), 16, 1023);
}

void
ir3_nir_lower_to_explicit_output(nir_shader *shader,
                                 struct ir3_shader_variant *v,
                                 unsigned topology)
{
   struct state state = {};

   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   if (v->type == MESA_SHADER_VERTEX && topology != IR3_TESS_NONE)
      state.header = nir_load_tcs_header_ir3(&b);
   else
      state.header = nir_load_gs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_output(block, &b, &state);

   nir_metadata_preserve(impl,
                         nir_metadata_block_index | nir_metadata_dominance);

   v->output_size = state.map.stride;
}

static void
lower_block_to_explicit_input(nir_block *block, nir_builder *b,
                              struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *offset = build_local_offset(
            b, state,
            intr->src[0].ssa, // this is typically gl_InvocationID
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL,
                           NULL);
         break;
      }

      case nir_intrinsic_load_invocation_id: {
         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *iid = build_invocation_id(b, state);
         nir_ssa_def_rewrite_uses(&intr->dest.ssa, iid);
         nir_instr_remove(&intr->instr);
         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_to_explicit_input(nir_shader *shader,
                                struct ir3_shader_variant *v)
{
   struct state state = {};

   /* when using stl/ldl (instead of stlw/ldlw) for linking VS and HS,
    * HS uses a different primitive id, which starts at bit 16 in the header
    */
   if (shader->info.stage == MESA_SHADER_TESS_CTRL &&
       v->compiler->tess_use_shared)
      state.local_primitive_id_start = 16;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   if (shader->info.stage == MESA_SHADER_GEOMETRY)
      state.header = nir_load_gs_header_ir3(&b);
   else
      state.header = nir_load_tcs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_input(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);
}

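/* Number of vertices in the TCS output patch: a compile-time constant in
 * the TCS itself, and the patch_vertices_in sysval (which matches the TCS
 * output patch size) when called from the TES.
 */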
static nir_ssa_def *
build_tcs_out_vertices(nir_builder *b)
{
   if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
      return nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
   else
      return nir_load_patch_vertices_in(b);
}

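/* Compute the dword offset into the tess param BO for a per-vertex output
 * (or a per-patch output when vertex is NULL): patch offset + attribute
 * offset + vertex offset. Per-vertex slots are laid out with space for
 * every vertex of the patch (see build_primitive_map()), which is why the
 * indirect offset is scaled by build_tcs_out_vertices().
 */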
static nir_ssa_def *
build_per_vertex_offset(nir_builder *b, struct state *state,
                        nir_ssa_def *vertex, uint32_t location, uint32_t comp,
                        nir_ssa_def *offset)
{
   nir_ssa_def *patch_id = nir_load_rel_patch_id_ir3(b);
   nir_ssa_def *patch_stride = nir_load_hs_patch_stride_ir3(b);
   nir_ssa_def *patch_offset = nir_imul24(b, patch_id, patch_stride);
   nir_ssa_def *attr_offset;

   if (nir_src_is_const(nir_src_for_ssa(offset))) {
      location += nir_src_as_uint(nir_src_for_ssa(offset));
      offset = nir_imm_int(b, 0);
   } else {
      /* Offset is in vec4's, but we need it in unit of components for the
       * load/store_global_ir3 offset.
       */
      offset = nir_ishl(b, offset, nir_imm_int(b, 2));
   }

   nir_ssa_def *vertex_offset;
   if (vertex) {
      unsigned index = shader_io_get_unique_index(location);
      switch (b->shader->info.stage) {
      case MESA_SHADER_TESS_CTRL:
         attr_offset = nir_imm_int(b, state->map.loc[index] + comp);
         break;
      case MESA_SHADER_TESS_EVAL:
         attr_offset = nir_iadd(b, nir_load_primitive_location_ir3(b, index),
                                nir_imm_int(b, comp));
         break;
      default:
         unreachable("bad shader state");
      }

      attr_offset = nir_iadd(b, attr_offset,
                             nir_imul24(b, offset, build_tcs_out_vertices(b)));
      vertex_offset = nir_ishl(b, vertex, nir_imm_int(b, 2));
   } else {
      assert(location >= VARYING_SLOT_PATCH0 &&
             location <= VARYING_SLOT_TESS_MAX);
      unsigned index = location - VARYING_SLOT_PATCH0;
      attr_offset = nir_iadd(b, nir_imm_int(b, index * 4 + comp), offset);
      vertex_offset = nir_imm_int(b, 0);
   }

   return nir_iadd(b, nir_iadd(b, patch_offset, attr_offset), vertex_offset);
}

static nir_ssa_def *
build_patch_offset(nir_builder *b, struct state *state, uint32_t base,
                   uint32_t comp, nir_ssa_def *offset)
{
   return build_per_vertex_offset(b, state, NULL, base, comp, offset);
}

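/* Number of inner/outer tessellation level components that are actually
 * stored for each primitive topology (gl_TessLevelInner/Outer are declared
 * as float[2]/float[4] regardless).
 */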
static void
tess_level_components(struct state *state, uint32_t *inner, uint32_t *outer)
{
   switch (state->topology) {
   case IR3_TESS_TRIANGLES:
      *inner = 1;
      *outer = 3;
      break;
   case IR3_TESS_QUADS:
      *inner = 2;
      *outer = 4;
      break;
   case IR3_TESS_ISOLINES:
      *inner = 0;
      *outer = 2;
      break;
   default:
      unreachable("bad");
   }
}

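/* Offset (in dwords) into the tess factor BO for the given slot of the
 * current patch. Each patch occupies 1 + outer_levels + inner_levels
 * dwords: the primitive id first, then the outer levels, then the inner
 * levels.
 */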
static nir_ssa_def *
build_tessfactor_base(nir_builder *b, gl_varying_slot slot, uint32_t comp,
                      struct state *state)
{
   uint32_t inner_levels, outer_levels;
   tess_level_components(state, &inner_levels, &outer_levels);

   const uint32_t patch_stride = 1 + inner_levels + outer_levels;

   nir_ssa_def *patch_id = nir_load_rel_patch_id_ir3(b);

   nir_ssa_def *patch_offset =
      nir_imul24(b, patch_id, nir_imm_int(b, patch_stride));

   uint32_t offset;
   switch (slot) {
   case VARYING_SLOT_PRIMITIVE_ID:
      offset = 0;
      break;
   case VARYING_SLOT_TESS_LEVEL_OUTER:
      offset = 1;
      break;
   case VARYING_SLOT_TESS_LEVEL_INNER:
      offset = 1 + outer_levels;
      break;
   default:
      unreachable("bad");
   }

   return nir_iadd(b, patch_offset, nir_imm_int(b, offset + comp));
}

static void
lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_output: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
         nir_ssa_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_per_vertex_output: {
         // src[] = { value, vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         nir_ssa_def *value = intr->src[0].ssa;
         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
         nir_ssa_def *offset = build_per_vertex_offset(
            b, state, intr->src[1].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[2].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value,
                           address, offset);

         break;
      }

      case nir_intrinsic_load_output: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            assert(intr->dest.ssa.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* write patch output to bo */

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            uint32_t inner_levels, outer_levels, levels;
            tess_level_components(state, &inner_levels, &outer_levels);

            assert(intr->src[0].ssa->num_components == 1);

            nir_if *nif = NULL;
            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               /* While tess levels are defined as float[4] and float[2],
                * the tess factor BO has smaller sizes for tris/isolines,
                * so we have to discard any writes beyond the number of
                * components for inner/outer levels.
                */
               if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
                  levels = outer_levels;
               else
                  levels = inner_levels;

               nir_ssa_def *offset = nir_iadd_imm(
                  b, intr->src[1].ssa, nir_intrinsic_component(intr));
               nif = nir_push_if(b, nir_ult(b, offset, nir_imm_int(b, levels)));
            }

            nir_ssa_def *offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa,
                              nir_load_tess_factor_base_ir3(b),
                              nir_iadd(b, intr->src[1].ssa, offset));

            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               nir_pop_if(b, nif);
            }
         } else {
            nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
            nir_ssa_def *offset = build_patch_offset(
               b, state, location, nir_intrinsic_component(intr),
               intr->src[1].ssa);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa, address, offset);
         }
         break;
      }

      default:
         break;
      }
   }
}

static void
emit_tess_epilouge(nir_builder *b, struct state *state)
{
   /* Insert endpatch instruction:
    *
    * TODO we should re-work this to use normal flow control.
    */

   nir_end_patch_ir3(b);
}

void
ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
   v->output_size = state.map.stride;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   state.header = nir_load_tcs_header_ir3(&b);

   /* If required, store gl_PrimitiveID. */
   if (v->key.tcs_store_primid) {
      b.cursor = nir_after_cf_list(&impl->body);

      nir_store_output(&b, nir_load_primitive_id(&b), nir_imm_int(&b, 0),
                       .io_semantics = {
                          .location = VARYING_SLOT_PRIMITIVE_ID,
                          .num_slots = 1
                       });

      b.cursor = nir_before_cf_list(&impl->body);
   }

   nir_foreach_block_safe (block, impl)
      lower_tess_ctrl_block(block, &b, &state);

   /* Now move the body of the TCS into a conditional:
    *
    *   if (gl_InvocationID < num_vertices)
    *      // body
    *
    */

   nir_cf_list body;
   nir_cf_extract(&body, nir_before_cf_list(&impl->body),
                  nir_after_cf_list(&impl->body));

   b.cursor = nir_after_cf_list(&impl->body);

   /* Re-emit the header, since the old one got moved into the if branch */
   state.header = nir_load_tcs_header_ir3(&b);
   nir_ssa_def *iid = build_invocation_id(&b, &state);

   const uint32_t nvertices = shader->info.tess.tcs_vertices_out;
   nir_ssa_def *cond = nir_ult(&b, iid, nir_imm_int(&b, nvertices));

   nir_if *nif = nir_push_if(&b, cond);

   nir_cf_reinsert(&body, b.cursor);

   b.cursor = nir_after_cf_list(&nif->then_list);

   /* Insert conditional exit for threads with invocation id != 0 */
   nir_ssa_def *iid0_cond = nir_ieq_imm(&b, iid, 0);
   nir_cond_end_ir3(&b, iid0_cond);

   emit_tess_epilouge(&b, &state);

   nir_pop_if(&b, nif);

   nir_metadata_preserve(impl, nir_metadata_none);
}

static void
lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_tess_coord: {
         b->cursor = nir_after_instr(&intr->instr);
         nir_ssa_def *x = nir_channel(b, &intr->dest.ssa, 0);
         nir_ssa_def *y = nir_channel(b, &intr->dest.ssa, 1);
         nir_ssa_def *z;

         if (state->topology == IR3_TESS_TRIANGLES)
            z = nir_fsub(b, nir_fsub(b, nir_imm_float(b, 1.0f), y), x);
         else
            z = nir_imm_float(b, 0.0f);

         nir_ssa_def *coord = nir_vec3(b, x, y, z);

         nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, coord,
                                        b->cursor.instr);
         break;
      }

      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
         nir_ssa_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_load_input: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            assert(intr->dest.ssa.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);

   nir_foreach_block_safe (block, impl)
      lower_tess_eval_block(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);

   nir_metadata_preserve(impl, nir_metadata_none);
}

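/* Emit a nir_copy_var from each variable in srcs to the variable at the
 * same position in dests; the two lists are expected to be parallel (they
 * are built in the same order in ir3_nir_lower_gs()).
 */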
static void
copy_vars(nir_builder *b, struct exec_list *dests, struct exec_list *srcs)
{
   foreach_two_lists (dest_node, dests, src_node, srcs) {
      nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
      nir_variable *src = exec_node_data(nir_variable, src_node, node);
      nir_copy_var(b, dest, src);
   }
}

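/* Lower the GS vertex/primitive intrinsics: EndPrimitive() becomes a write
 * of the "cut" flag (4) to vertex_flags, and EmitVertex() becomes: when it
 * is this thread's turn (vertex_count == local_thread_id), OR the stream id
 * into vertex_flags, copy the shadowed outputs into the emit outputs, and
 * bump the emitted-vertex counter; then unconditionally increment
 * vertex_count and clear vertex_flags.
 */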
static void
lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_end_primitive: {
         /* Note: This ignores the stream, which seems to match the blob
          * behavior. I'm guessing the HW ignores any extraneous cut
          * signals from an EndPrimitive() that doesn't correspond to the
          * rasterized stream.
          */
         b->cursor = nir_before_instr(&intr->instr);
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 4), 0x1);
         nir_instr_remove(&intr->instr);
         break;
      }

      case nir_intrinsic_emit_vertex: {
         /* Load the vertex count */
         b->cursor = nir_before_instr(&intr->instr);
         nir_ssa_def *count = nir_load_var(b, state->vertex_count_var);

         nir_push_if(b, nir_ieq(b, count, local_thread_id(b)));

         unsigned stream = nir_intrinsic_stream_id(intr);
         /* vertex_flags_out |= stream */
         nir_store_var(b, state->vertex_flags_out,
                       nir_ior(b, nir_load_var(b, state->vertex_flags_out),
                               nir_imm_int(b, stream)),
                       0x1 /* .x */);

         copy_vars(b, &state->emit_outputs, &state->old_outputs);

         nir_instr_remove(&intr->instr);

         nir_store_var(b, state->emitted_vertex_var,
                       nir_iadd(b, nir_load_var(b, state->emitted_vertex_var),
                                nir_imm_int(b, 1)),
                       0x1);

         nir_pop_if(b, NULL);

         /* Increment the vertex count by 1 */
         nir_store_var(b, state->vertex_count_var,
                       nir_iadd(b, count, nir_imm_int(b, 1)), 0x1); /* .x */
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 0), 0x1);

         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_gs(nir_shader *shader)
{
   struct state state = {};

   /* Don't lower multiple times: */
   nir_foreach_shader_out_variable (var, shader)
      if (var->data.location == VARYING_SLOT_GS_VERTEX_FLAGS_IR3)
         return;

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (before gs lowering):");
      nir_log_shaderi(shader);
   }

   /* Create an output var for vertex_flags. This will be shadowed below,
    * same way regular outputs get shadowed, and this variable will become a
    * temporary.
    */
   state.vertex_flags_out = nir_variable_create(
      shader, nir_var_shader_out, glsl_uint_type(), "vertex_flags");
   state.vertex_flags_out->data.driver_location = shader->num_outputs++;
   state.vertex_flags_out->data.location = VARYING_SLOT_GS_VERTEX_FLAGS_IR3;
   state.vertex_flags_out->data.interpolation = INTERP_MODE_NONE;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   state.header = nir_load_gs_header_ir3(&b);

   /* Generate two sets of shadow vars for the output variables. The first
    * set replaces the real outputs and the second set (emit_outputs) we'll
    * assign in the emit_vertex conditionals. Then at the end of the shader
    * we copy the emit_outputs to the real outputs, so that we get
    * store_output in uniform control flow.
    */
   exec_list_make_empty(&state.old_outputs);
   nir_foreach_shader_out_variable_safe (var, shader) {
      exec_node_remove(&var->node);
      exec_list_push_tail(&state.old_outputs, &var->node);
   }
   exec_list_make_empty(&state.new_outputs);
   exec_list_make_empty(&state.emit_outputs);
   nir_foreach_variable_in_list (var, &state.old_outputs) {
      /* Create a new output var by cloning the original output var and
       * stealing the name.
       */
      nir_variable *output = nir_variable_clone(var, shader);
      exec_list_push_tail(&state.new_outputs, &output->node);

      /* Rewrite the original output to be a shadow variable. */
      var->name = ralloc_asprintf(var, "%s@gs-temp", output->name);
      var->data.mode = nir_var_shader_temp;

      /* Clone the shadow variable to create the emit shadow variable that
       * we'll assign in the emit conditionals.
       */
      nir_variable *emit_output = nir_variable_clone(var, shader);
      emit_output->name = ralloc_asprintf(var, "%s@emit-temp", output->name);
      exec_list_push_tail(&state.emit_outputs, &emit_output->node);
   }

   /* During the shader we'll keep track of which vertex we're currently
    * emitting for the EmitVertex test and how many vertices we emitted so we
    * know to discard if we didn't emit any. In most simple shaders, this can
    * all be statically determined and gets optimized away.
    */
   state.vertex_count_var =
      nir_local_variable_create(impl, glsl_uint_type(), "vertex_count");
   state.emitted_vertex_var =
      nir_local_variable_create(impl, glsl_uint_type(), "emitted_vertex");

   /* Initialize to 0. */
   b.cursor = nir_before_cf_list(&impl->body);
   nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.emitted_vertex_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);

   nir_foreach_block_safe (block, impl)
      lower_gs_block(block, &b, &state);

   /* Note: returns are lowered, so there should be only one block before the
    * end block. If we had real returns, we would probably want to redirect
    * them to this new if statement, rather than emitting this code at every
    * return statement.
    */
   assert(impl->end_block->predecessors->entries == 1);
   nir_block *block = nir_impl_last_block(impl);
   b.cursor = nir_after_block_before_jump(block);

   /* If we haven't emitted any vertex we need to copy the shadow (old)
    * outputs to emit outputs here.
    *
    * Also some piglit GS tests[1] don't have EndPrimitive() so throw
    * in an extra vertex_flags write for good measure. If unneeded it
    * will be optimized out.
    *
    * [1] ex, tests/spec/glsl-1.50/execution/compatibility/clipping/gs-clip-vertex-const-accept.shader_test
    */
   nir_ssa_def *cond =
      nir_ieq_imm(&b, nir_load_var(&b, state.emitted_vertex_var), 0);
   nir_push_if(&b, cond);
   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);
   copy_vars(&b, &state.emit_outputs, &state.old_outputs);
   nir_pop_if(&b, NULL);

   nir_discard_if(&b, cond);

   copy_vars(&b, &state.new_outputs, &state.emit_outputs);

   exec_list_append(&shader->variables, &state.old_outputs);
   exec_list_append(&shader->variables, &state.emit_outputs);
   exec_list_append(&shader->variables, &state.new_outputs);

   nir_metadata_preserve(impl, nir_metadata_none);

   nir_lower_global_vars_to_local(shader);
   nir_split_var_copies(shader);
   nir_lower_var_copies(shader);

   nir_fixup_deref_modes(shader);

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (after gs lowering):");
      nir_log_shaderi(shader);
   }
}