/*
 * Copyright © 2019 Google, Inc.
 * SPDX-License-Identifier: MIT
 */

#include "compiler/nir/nir_builder.h"
#include "ir3_compiler.h"
#include "ir3_nir.h"

struct state {
   uint32_t topology;

   struct primitive_map {
      /* +POSITION, +PSIZE, ... - see shader_io_get_unique_index */
      unsigned loc[13 + 32];
      unsigned stride;
   } map;

   nir_def *header;

   nir_variable *vertex_count_var;
   nir_variable *emitted_vertex_var;
   nir_variable *vertex_flags_out;

   struct exec_list old_outputs;
   struct exec_list new_outputs;
   struct exec_list emit_outputs;

   /* tess ctrl shader on a650 gets the local primitive id at different bits: */
   unsigned local_primitive_id_start;
};

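/* Extract a bitfield from the driver-supplied header register:
 * (v >> start) & mask.  Note that "mask" is the mask value itself
 * (e.g. 31 for a 5-bit field), not a bit count.
 */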
static nir_def *
bitfield_extract(nir_builder *b, nir_def *v, uint32_t start, uint32_t mask)
{
   return nir_iand_imm(b, nir_ushr_imm(b, v, start), mask);
}

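/* The TCS/GS header packs several IDs: the vertex id in bits 6..10,
 * gl_InvocationID in bits 11..15, and a 6-bit local primitive id whose
 * start bit depends on the stage/generation (see local_primitive_id_start).
 */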
static nir_def *
build_invocation_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, 11, 31);
}

static nir_def *
build_vertex_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, 6, 31);
}

static nir_def *
build_local_primitive_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, state->local_primitive_id_start,
                           63);
}

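/* Note: gl_PrimitiveID is included here because it is stored together with
 * the tess levels in the tess factor BO (see build_tessfactor_base()), so
 * it is excluded from the primitive map just like the tess levels.
 */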
static bool
is_tess_levels(gl_varying_slot slot)
{
   return (slot == VARYING_SLOT_PRIMITIVE_ID ||
           slot == VARYING_SLOT_TESS_LEVEL_OUTER ||
           slot == VARYING_SLOT_TESS_LEVEL_INNER);
}

/* Return a deterministic index for varyings. We can't rely on driver_location
 * to be correct without linking the different stages first, so we create
 * "primitive maps" where the producer decides on the location of each varying
 * slot and then exports a per-slot array to the consumer. This compacts the
 * gl_varying_slot space down a bit so that the primitive maps aren't too
 * large.
 *
 * Note: per-patch varyings are currently handled separately, without any
 * compacting.
 *
 * TODO: We could probably use the driver_location's directly in the non-SSO
 * (Vulkan) case.
 */
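/* For example (illustrative): a producer writing only POS, PSIZ and VAR0
 * gets the unique indices 0, 1 and 13, and build_primitive_map() below
 * would pack them at loc[0] = 0, loc[1] = 16 and loc[13] = 32, for a
 * per-vertex stride of 48 bytes (12 dwords).
 */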

static unsigned
shader_io_get_unique_index(gl_varying_slot slot)
{
   switch (slot) {
   case VARYING_SLOT_POS:         return 0;
   case VARYING_SLOT_PSIZ:        return 1;
   case VARYING_SLOT_COL0:        return 2;
   case VARYING_SLOT_COL1:        return 3;
   case VARYING_SLOT_BFC0:        return 4;
   case VARYING_SLOT_BFC1:        return 5;
   case VARYING_SLOT_FOGC:        return 6;
   case VARYING_SLOT_CLIP_DIST0:  return 7;
   case VARYING_SLOT_CLIP_DIST1:  return 8;
   case VARYING_SLOT_CLIP_VERTEX: return 9;
   case VARYING_SLOT_LAYER:       return 10;
   case VARYING_SLOT_VIEWPORT:    return 11;
   case VARYING_SLOT_PRIMITIVE_SHADING_RATE: return 12;
   case VARYING_SLOT_VAR0 ... VARYING_SLOT_VAR31: {
      struct state state = {};
      STATIC_ASSERT(ARRAY_SIZE(state.map.loc) - 1 ==
                    (13 + VARYING_SLOT_VAR31 - VARYING_SLOT_VAR0));
      struct ir3_shader_variant v = {};
      STATIC_ASSERT(ARRAY_SIZE(v.output_loc) - 1 ==
                    (13 + VARYING_SLOT_VAR31 - VARYING_SLOT_VAR0));
      return 13 + (slot - VARYING_SLOT_VAR0);
   }
   default:
      unreachable("illegal slot in get unique index\n");
   }
}

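/* Compute the byte offset into shared (local) memory for a given vertex
 * and attribute:
 *
 *    local_primitive_id * primitive_stride + vertex * vertex_stride +
 *    attr_offset + (offset << 4)
 *
 * where the indirect offset src is in vec4 units, hence the << 4 to bytes.
 */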
static nir_def *
build_local_offset(nir_builder *b, struct state *state, nir_def *vertex,
                   uint32_t location, uint32_t comp, nir_def *offset)
{
   nir_def *primitive_stride = nir_load_vs_primitive_stride_ir3(b);
   nir_def *primitive_offset =
      nir_imul24(b, build_local_primitive_id(b, state), primitive_stride);
   nir_def *attr_offset;
   nir_def *vertex_stride;
   unsigned index = shader_io_get_unique_index(location);

   switch (b->shader->info.stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      vertex_stride = nir_imm_int(b, state->map.stride * 4);
      attr_offset = nir_imm_int(b, state->map.loc[index] + 4 * comp);
      break;
   case MESA_SHADER_TESS_CTRL:
   case MESA_SHADER_GEOMETRY:
      vertex_stride = nir_load_vs_vertex_stride_ir3(b);
      attr_offset = nir_iadd_imm(b, nir_load_primitive_location_ir3(b, index),
                                 comp * 4);
      break;
   default:
      unreachable("bad shader stage");
   }

   nir_def *vertex_offset = nir_imul24(b, vertex, vertex_stride);

   return nir_iadd(
      b, nir_iadd(b, primitive_offset, vertex_offset),
      nir_iadd(b, attr_offset, nir_ishl_imm(b, offset, 4)));
}

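/* Replace "intr" with a new intrinsic of the given op, copying over its
 * num_components/bit_size and rewriting all uses of the old destination.
 * src1/src2 may be NULL for ops that take fewer sources.
 */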
static nir_intrinsic_instr *
replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
                  nir_intrinsic_op op, nir_def *src0, nir_def *src1,
                  nir_def *src2)
{
   nir_intrinsic_instr *new_intr = nir_intrinsic_instr_create(b->shader, op);

   new_intr->src[0] = nir_src_for_ssa(src0);
   if (src1)
      new_intr->src[1] = nir_src_for_ssa(src1);
   if (src2)
      new_intr->src[2] = nir_src_for_ssa(src2);

   new_intr->num_components = intr->num_components;

   if (nir_intrinsic_infos[op].has_dest)
      nir_def_init(&new_intr->instr, &new_intr->def,
                   intr->num_components, intr->def.bit_size);

   nir_builder_instr_insert(b, &new_intr->instr);

   if (nir_intrinsic_infos[op].has_dest)
      nir_def_rewrite_uses(&intr->def, &new_intr->def);

   nir_instr_remove(&intr->instr);

   return new_intr;
}

static void
build_primitive_map(nir_shader *shader, struct primitive_map *map)
{
   /* All interfaces except the TCS <-> TES interface use ldlw, which takes
    * an offset in bytes, so each vec4 slot is 16 bytes. TCS <-> TES uses
    * ldg, which takes an offset in dwords, but each per-vertex slot has
    * space for every vertex, and there's space at the beginning for
    * per-patch varyings.
    */
   unsigned slot_size = 16, start = 0;
   if (shader->info.stage == MESA_SHADER_TESS_CTRL) {
      slot_size = shader->info.tess.tcs_vertices_out * 4;
      start = util_last_bit(shader->info.patch_outputs_written) * 4;
   }

   uint64_t mask = shader->info.outputs_written;
   unsigned loc = start;
   while (mask) {
      int location = u_bit_scan64(&mask);
      if (is_tess_levels(location))
         continue;

      unsigned index = shader_io_get_unique_index(location);
      map->loc[index] = loc;
      loc += slot_size;
   }

   map->stride = loc;
   /* Use units of dwords for the stride. */
   if (shader->info.stage != MESA_SHADER_TESS_CTRL)
      map->stride /= 4;
}

/* For shader stages that receive a primitive map, calculate how big it should
 * be.
 */

static unsigned
calc_primitive_map_size(nir_shader *shader)
{
   uint64_t mask = shader->info.inputs_read;
   unsigned max_index = 0;
   while (mask) {
      int location = u_bit_scan64(&mask);

      if (is_tess_levels(location))
         continue;

      unsigned index = shader_io_get_unique_index(location);
      max_index = MAX2(max_index, index + 1);
   }

   return max_index;
}

static void
lower_block_to_explicit_output(nir_block *block, nir_builder *b,
                               struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* nir_lower_io_to_temporaries replaces all access to output
          * variables with temp variables and then emits a nir_copy_var at
          * the end of the shader.  Thus, we should always get a full wrmask
          * here.
          */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         b->cursor = nir_instr_remove(&intr->instr);

         nir_def *vertex_id = build_vertex_id(b, state);
         nir_def *offset = build_local_offset(
            b, state, vertex_id, nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         nir_store_shared_ir3(b, intr->src[0].ssa, offset);
         break;
      }

      default:
         break;
      }
   }
}

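/* The local thread id lives in a 10-bit field at bits 16..25 of the GS
 * header.
 */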
static nir_def *
local_thread_id(nir_builder *b)
{
   return bitfield_extract(b, nir_load_gs_header_ir3(b), 16, 1023);
}

void
ir3_nir_lower_to_explicit_output(nir_shader *shader,
                                 struct ir3_shader_variant *v,
                                 unsigned topology)
{
   struct state state = {};

   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_at(nir_before_impl(impl));

   if (v->type == MESA_SHADER_VERTEX && topology != IR3_TESS_NONE)
      state.header = nir_load_tcs_header_ir3(&b);
   else
      state.header = nir_load_gs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_output(block, &b, &state);

   nir_metadata_preserve(impl, nir_metadata_control_flow);

   v->output_size = state.map.stride;
}

static void
lower_block_to_explicit_input(nir_block *block, nir_builder *b,
                              struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *offset = build_local_offset(
            b, state,
            intr->src[0].ssa, // this is typically gl_InvocationID
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL,
                           NULL);
         break;
      }

      case nir_intrinsic_load_invocation_id: {
         b->cursor = nir_before_instr(&intr->instr);

         nir_def *iid = build_invocation_id(b, state);
         nir_def_replace(&intr->def, iid);
         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_to_explicit_input(nir_shader *shader,
                                struct ir3_shader_variant *v)
{
   struct state state = {};

   /* when using stl/ldl (instead of stlw/ldlw) for linking VS and HS,
    * HS uses a different primitive id, which starts at bit 16 in the header
    */
   if (shader->info.stage == MESA_SHADER_TESS_CTRL &&
       v->compiler->tess_use_shared)
      state.local_primitive_id_start = 16;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_at(nir_before_impl(impl));

   if (shader->info.stage == MESA_SHADER_GEOMETRY)
      state.header = nir_load_gs_header_ir3(&b);
   else
      state.header = nir_load_tcs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_input(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);
}

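/* Number of vertices in the TCS output patch: a compile-time constant in
 * the TCS itself, but only known at draw time in the TES.
 */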
static nir_def *
build_tcs_out_vertices(nir_builder *b)
{
   if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
      return nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
   else
      return nir_load_patch_vertices_in(b);
}

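/* Compute the offset of a per-vertex (or, with vertex == NULL, per-patch)
 * output within the tess param BO:
 *
 *    patch_id * patch_stride + attr_offset + vertex_offset
 *
 * Unlike the shared-memory path above, load/store_global_ir3 takes its
 * offset in dwords rather than bytes.
 */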
static nir_def *
build_per_vertex_offset(nir_builder *b, struct state *state,
                        nir_def *vertex, uint32_t location, uint32_t comp,
                        nir_def *offset)
{
   nir_def *patch_id = nir_load_rel_patch_id_ir3(b);
   nir_def *patch_stride = nir_load_hs_patch_stride_ir3(b);
   nir_def *patch_offset = nir_imul24(b, patch_id, patch_stride);
   nir_def *attr_offset;

   if (nir_src_is_const(nir_src_for_ssa(offset))) {
      location += nir_src_as_uint(nir_src_for_ssa(offset));
      offset = nir_imm_int(b, 0);
   } else {
      /* Offset is in vec4's, but we need it in unit of components for the
       * load/store_global_ir3 offset.
       */
      offset = nir_ishl_imm(b, offset, 2);
   }

   nir_def *vertex_offset;
   if (vertex) {
      unsigned index = shader_io_get_unique_index(location);
      switch (b->shader->info.stage) {
      case MESA_SHADER_TESS_CTRL:
         attr_offset = nir_imm_int(b, state->map.loc[index] + comp);
         break;
      case MESA_SHADER_TESS_EVAL:
         attr_offset = nir_iadd_imm(b, nir_load_primitive_location_ir3(b, index),
                                    comp);
         break;
      default:
         unreachable("bad shader stage");
      }

      attr_offset = nir_iadd(b, attr_offset,
                             nir_imul24(b, offset, build_tcs_out_vertices(b)));
      vertex_offset = nir_ishl_imm(b, vertex, 2);
   } else {
      assert(location >= VARYING_SLOT_PATCH0 &&
             location <= VARYING_SLOT_TESS_MAX);
      unsigned index = location - VARYING_SLOT_PATCH0;
      attr_offset = nir_iadd_imm(b, offset, index * 4 + comp);
      vertex_offset = nir_imm_int(b, 0);
   }

   return nir_iadd(b, nir_iadd(b, patch_offset, attr_offset), vertex_offset);
}

static nir_def *
build_patch_offset(nir_builder *b, struct state *state, uint32_t base,
                   uint32_t comp, nir_def *offset)
{
   return build_per_vertex_offset(b, state, NULL, base, comp, offset);
}

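/* Number of inner/outer tess level components actually stored in the tess
 * factor BO for each topology (tris: 1+3, quads: 2+4, isolines: 0+2).
 */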
static void
tess_level_components(struct state *state, uint32_t *inner, uint32_t *outer)
{
   switch (state->topology) {
   case IR3_TESS_TRIANGLES:
      *inner = 1;
      *outer = 3;
      break;
   case IR3_TESS_QUADS:
      *inner = 2;
      *outer = 4;
      break;
   case IR3_TESS_ISOLINES:
      *inner = 0;
      *outer = 2;
      break;
   default:
      unreachable("bad");
   }
}

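/* Per-patch layout of the tess factor BO, in dwords:
 *
 *    [ gl_PrimitiveID, outer levels..., inner levels... ]
 *
 * so the patch stride is 1 + outer_levels + inner_levels.
 */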
static nir_def *
build_tessfactor_base(nir_builder *b, gl_varying_slot slot, uint32_t comp,
                      struct state *state)
{
   uint32_t inner_levels, outer_levels;
   tess_level_components(state, &inner_levels, &outer_levels);

   const uint32_t patch_stride = 1 + inner_levels + outer_levels;

   nir_def *patch_id = nir_load_rel_patch_id_ir3(b);

   nir_def *patch_offset =
      nir_imul24(b, patch_id, nir_imm_int(b, patch_stride));

   uint32_t offset;
   switch (slot) {
   case VARYING_SLOT_PRIMITIVE_ID:
      offset = 0;
      break;
   case VARYING_SLOT_TESS_LEVEL_OUTER:
      offset = 1;
      break;
   case VARYING_SLOT_TESS_LEVEL_INNER:
      offset = 1 + outer_levels;
      break;
   default:
      unreachable("bad");
   }

   return nir_iadd_imm(b, patch_offset, offset + comp);
}

static void
lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_output: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *address = nir_load_tess_param_base_ir3(b);
         nir_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_per_vertex_output: {
         // src[] = { value, vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         nir_def *value = intr->src[0].ssa;
         nir_def *address = nir_load_tess_param_base_ir3(b);
         nir_def *offset = build_per_vertex_offset(
            b, state, intr->src[1].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[2].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value,
                           address, offset);

         break;
      }

      case nir_intrinsic_load_output: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            assert(intr->def.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* write patch output to bo */

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            uint32_t inner_levels, outer_levels, levels;
            tess_level_components(state, &inner_levels, &outer_levels);

            assert(intr->src[0].ssa->num_components == 1);

            nir_if *nif = NULL;
            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               /* While tess levels are defined as float[4] and float[2],
                * the tess factor BO has smaller sizes for tris/isolines,
                * so we have to discard any writes beyond the number of
                * components for inner/outer levels
                */
               if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
                  levels = outer_levels;
               else
                  levels = inner_levels;

               nir_def *offset = nir_iadd_imm(
                  b, intr->src[1].ssa, nir_intrinsic_component(intr));
               nif = nir_push_if(b, nir_ult_imm(b, offset, levels));
            }

            nir_def *offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa,
                              nir_load_tess_factor_base_ir3(b),
                              nir_iadd(b, intr->src[1].ssa, offset));

            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               nir_pop_if(b, nif);
            }
         } else {
            nir_def *address = nir_load_tess_param_base_ir3(b);
            nir_def *offset = build_patch_offset(
               b, state, location, nir_intrinsic_component(intr),
               intr->src[1].ssa);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa, address, offset);
         }
         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage, shader->info.internal)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
   v->output_size = state.map.stride;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_at(nir_before_impl(impl));

   state.header = nir_load_tcs_header_ir3(&b);

   /* If required, store gl_PrimitiveID. */
   if (v->key.tcs_store_primid) {
      b.cursor = nir_after_impl(impl);

      nir_store_output(&b, nir_load_primitive_id(&b), nir_imm_int(&b, 0),
                       .io_semantics = {
                           .location = VARYING_SLOT_PRIMITIVE_ID,
                           .num_slots = 1
                        });

      b.cursor = nir_before_impl(impl);
   }

   nir_foreach_block_safe (block, impl)
      lower_tess_ctrl_block(block, &b, &state);

   /* Now move the body of the TCS into a conditional:
    *
    *   if (gl_InvocationID < num_vertices)
    *     // body
    *
    */

   nir_cf_list body;
   nir_cf_extract(&body, nir_before_impl(impl), nir_after_impl(impl));

   b.cursor = nir_after_impl(impl);

   /* Re-emit the header, since the old one got moved into the if branch */
   state.header = nir_load_tcs_header_ir3(&b);
   nir_def *iid = build_invocation_id(&b, &state);

   const uint32_t nvertices = shader->info.tess.tcs_vertices_out;
   nir_def *cond = nir_ult_imm(&b, iid, nvertices);

   nir_if *nif = nir_push_if(&b, cond);

   nir_cf_reinsert(&body, b.cursor);

   b.cursor = nir_after_cf_list(&nif->then_list);

   nir_pop_if(&b, nif);

   nir_metadata_preserve(impl, nir_metadata_none);
}

static void
lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *address = nir_load_tess_param_base_ir3(b);
         nir_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_load_input: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            assert(intr->def.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage, shader->info.internal)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   NIR_PASS_V(shader, nir_lower_tess_coord_z, topology == IR3_TESS_TRIANGLES);

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_create(impl);

   nir_foreach_block_safe (block, impl)
      lower_tess_eval_block(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);

   nir_metadata_preserve(impl, nir_metadata_none);
}

/* The hardware does not support incomplete primitives in multiple streams at
 * once or ending the "wrong" stream, but Vulkan allows this. That is,
 * EmitStreamVertex(N) followed by EmitStreamVertex(M) or EndStreamPrimitive(M)
 * where N != M and there isn't a call to EndStreamPrimitive(N) in between isn't
 * supported by the hardware. Fix this up by duplicating the entire shader per
 * stream, removing EmitStreamVertex/EndStreamPrimitive calls for streams other
 * than the current one.
 */

static void
lower_mixed_streams(nir_shader *nir)
{
   /* We don't have to do anything for points because there is only one vertex
    * per primitive and therefore no possibility of mixing.
    */
   if (nir->info.gs.output_primitive == MESA_PRIM_POINTS)
      return;

   nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);

   uint8_t stream_mask = 0;

   nir_foreach_block (block, entrypoint) {
      nir_foreach_instr (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

         if (intrin->intrinsic == nir_intrinsic_emit_vertex ||
             intrin->intrinsic == nir_intrinsic_end_primitive)
            stream_mask |= 1 << nir_intrinsic_stream_id(intrin);
      }
   }

   if (util_is_power_of_two_or_zero(stream_mask))
      return;

   nir_cf_list body;
   nir_cf_list_extract(&body, &entrypoint->body);

   nir_builder b = nir_builder_create(entrypoint);

   u_foreach_bit (stream, stream_mask) {
      b.cursor = nir_after_impl(entrypoint);

      /* Inserting the cloned body invalidates any cursor not using an
       * instruction, so we need to emit this to keep track of where the new
       * body is to iterate over it.
       */
      nir_instr *anchor = &nir_nop(&b)->instr;

      nir_cf_list_clone_and_reinsert(&body, &entrypoint->cf_node, b.cursor, NULL);

      /* We need to iterate over all instructions after the anchor, which is
       * a bit tricky to do, so we do it manually.
       */
      for (nir_block *block = anchor->block; block != NULL;
           block = nir_block_cf_tree_next(block)) {
         for (nir_instr *instr =
               (block == anchor->block) ? anchor : nir_block_first_instr(block),
               *next = instr ? nir_instr_next(instr) : NULL;
              instr != NULL; instr = next, next = next ? nir_instr_next(next) : NULL) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            if ((intrin->intrinsic == nir_intrinsic_emit_vertex ||
                 intrin->intrinsic == nir_intrinsic_end_primitive) &&
                nir_intrinsic_stream_id(intrin) != stream) {
               nir_instr_remove(instr);
            }
         }
      }

      nir_instr_remove(anchor);

      /* The user can omit the last EndStreamPrimitive(), so add an extra one
       * here before potentially adding other copies of the body that emit to
       * different streams. Our lowering means that redundant calls to
       * EndStreamPrimitive are safe and should be optimized out.
       */
      b.cursor = nir_after_impl(entrypoint);
      nir_end_primitive(&b, .stream_id = stream);
   }

   nir_cf_delete(&body);
}

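/* Emit a nir_copy_var for each pairwise (dest, src) in the two variable
 * lists, which are kept in matching order.
 */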
static void
copy_vars(nir_builder *b, struct exec_list *dests, struct exec_list *srcs)
{
   foreach_two_lists (dest_node, dests, src_node, srcs) {
      nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
      nir_variable *src = exec_node_data(nir_variable, src_node, node);
      nir_copy_var(b, dest, src);
   }
}

static void
lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_end_primitive: {
         /* The HW will use the stream from the preceding emitted vertices,
          * which thanks to lower_mixed_streams is the same as the stream
          * for this instruction, so we can ignore it here.
          */
         b->cursor = nir_before_instr(&intr->instr);
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 4), 0x1);
         nir_instr_remove(&intr->instr);
         break;
      }

      case nir_intrinsic_emit_vertex: {
         /* Load the vertex count */
         b->cursor = nir_before_instr(&intr->instr);
         nir_def *count = nir_load_var(b, state->vertex_count_var);

         nir_push_if(b, nir_ieq(b, count, local_thread_id(b)));

         unsigned stream = nir_intrinsic_stream_id(intr);
         /* vertex_flags_out |= stream */
         nir_store_var(b, state->vertex_flags_out,
                       nir_ior_imm(b, nir_load_var(b, state->vertex_flags_out),
                                   stream),
                       0x1 /* .x */);

         copy_vars(b, &state->emit_outputs, &state->old_outputs);

         nir_instr_remove(&intr->instr);

         nir_store_var(b, state->emitted_vertex_var,
                       nir_iadd_imm(b,
                                    nir_load_var(b, state->emitted_vertex_var),
                                    1),
                       0x1);

         nir_pop_if(b, NULL);

         /* Increment the vertex count by 1 */
         nir_store_var(b, state->vertex_count_var,
                       nir_iadd_imm(b, count, 1), 0x1); /* .x */
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 0), 0x1);

         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_gs(nir_shader *shader)
{
   struct state state = {};

   /* Don't lower multiple times: */
   nir_foreach_shader_out_variable (var, shader)
      if (var->data.location == VARYING_SLOT_GS_VERTEX_FLAGS_IR3)
         return;

   if (shader_debug_enabled(shader->info.stage, shader->info.internal)) {
      mesa_logi("NIR (before gs lowering):");
      nir_log_shaderi(shader);
   }

   lower_mixed_streams(shader);

   /* Create an output var for vertex_flags. This will be shadowed below,
    * same way regular outputs get shadowed, and this variable will become a
    * temporary.
    */
   state.vertex_flags_out = nir_variable_create(
      shader, nir_var_shader_out, glsl_uint_type(), "vertex_flags");
   state.vertex_flags_out->data.driver_location = shader->num_outputs++;
   state.vertex_flags_out->data.location = VARYING_SLOT_GS_VERTEX_FLAGS_IR3;
   state.vertex_flags_out->data.interpolation = INTERP_MODE_NONE;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_at(nir_before_impl(impl));

   state.header = nir_load_gs_header_ir3(&b);

   /* Generate two sets of shadow vars for the output variables.  The first
    * set replaces the real outputs and the second set (emit_outputs) we'll
    * assign in the emit_vertex conditionals.  Then at the end of the shader
    * we copy the emit_outputs to the real outputs, so that we get
    * store_output in uniform control flow.
    */
   exec_list_make_empty(&state.old_outputs);
   nir_foreach_shader_out_variable_safe (var, shader) {
      exec_node_remove(&var->node);
      exec_list_push_tail(&state.old_outputs, &var->node);
   }
   exec_list_make_empty(&state.new_outputs);
   exec_list_make_empty(&state.emit_outputs);
   nir_foreach_variable_in_list (var, &state.old_outputs) {
      /* Create a new output var by cloning the original output var and
       * stealing the name.
       */
      nir_variable *output = nir_variable_clone(var, shader);
      exec_list_push_tail(&state.new_outputs, &output->node);

      /* Rewrite the original output to be a shadow variable. */
      var->name = ralloc_asprintf(var, "%s@gs-temp", output->name);
      var->data.mode = nir_var_shader_temp;

      /* Clone the shadow variable to create the emit shadow variable that
       * we'll assign in the emit conditionals.
       */
      nir_variable *emit_output = nir_variable_clone(var, shader);
      emit_output->name = ralloc_asprintf(var, "%s@emit-temp", output->name);
      exec_list_push_tail(&state.emit_outputs, &emit_output->node);
   }

   /* During the shader we'll keep track of which vertex we're currently
    * emitting for the EmitVertex test and how many vertices we emitted so we
    * know to discard if we didn't emit any.  In most simple shaders, this
    * can all be statically determined and gets optimized away.
    */
   state.vertex_count_var =
      nir_local_variable_create(impl, glsl_uint_type(), "vertex_count");
   state.emitted_vertex_var =
      nir_local_variable_create(impl, glsl_uint_type(), "emitted_vertex");

   /* Initialize to 0. */
   b.cursor = nir_before_impl(impl);
   nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.emitted_vertex_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);

   nir_foreach_block_safe (block, impl)
      lower_gs_block(block, &b, &state);

   /* Note: returns are lowered, so there should be only one block before the
    * end block.  If we had real returns, we would probably want to redirect
    * them to this new if statement, rather than emitting this code at every
    * return statement.
    */
   assert(impl->end_block->predecessors->entries == 1);
   nir_block *block = nir_impl_last_block(impl);
   b.cursor = nir_after_block_before_jump(block);

   /* If we haven't emitted any vertex we need to copy the shadow (old)
    * outputs to emit outputs here.
    *
    * Also some piglit GS tests[1] don't have EndPrimitive() so throw
    * in an extra vertex_flags write for good measure.  If unneeded it
    * will be optimized out.
    *
    * [1] ex, tests/spec/glsl-1.50/execution/compatibility/clipping/gs-clip-vertex-const-accept.shader_test
    */
   nir_def *cond =
      nir_ieq_imm(&b, nir_load_var(&b, state.emitted_vertex_var), 0);
   nir_push_if(&b, cond);
   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);
   copy_vars(&b, &state.emit_outputs, &state.old_outputs);
   nir_pop_if(&b, NULL);

   nir_discard_if(&b, cond);

   copy_vars(&b, &state.new_outputs, &state.emit_outputs);

   exec_list_append(&shader->variables, &state.old_outputs);
   exec_list_append(&shader->variables, &state.emit_outputs);
   exec_list_append(&shader->variables, &state.new_outputs);

   nir_metadata_preserve(impl, nir_metadata_none);

   nir_lower_global_vars_to_local(shader);
   nir_split_var_copies(shader);
   nir_lower_var_copies(shader);

   nir_fixup_deref_modes(shader);

   if (shader_debug_enabled(shader->info.stage, shader->info.internal)) {
      mesa_logi("NIR (after gs lowering):");
      nir_log_shaderi(shader);
   }
}
1084