/*
 * Copyright © 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler/nir/nir_builder.h"
#include "ir3_compiler.h"
#include "ir3_nir.h"

struct state {
   uint32_t topology;

   struct primitive_map {
      /* +POSITION, +PSIZE, ... - see shader_io_get_unique_index */
      unsigned loc[12 + 32];
      unsigned stride;
   } map;

   nir_def *header;

   nir_variable *vertex_count_var;
   nir_variable *emitted_vertex_var;
   nir_variable *vertex_flags_out;

   struct exec_list old_outputs;
   struct exec_list new_outputs;
   struct exec_list emit_outputs;

   /* tess ctrl shader on a650 gets the local primitive id at different bits: */
   unsigned local_primitive_id_start;
};

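/* Helper: extract an unsigned bitfield from a 32-bit value, i.e.
 * (v >> start) & mask.  Note that "mask" is the post-shift mask, not a
 * field width.
 */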
static nir_def *
bitfield_extract(nir_builder *b, nir_def *v, uint32_t start, uint32_t mask)
{
   return nir_iand_imm(b, nir_ushr_imm(b, v, start), mask);
}

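/* gl_InvocationID is packed into a 5-bit field of the TCS header
 * (bits 11..15).
 */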
static nir_def *
build_invocation_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, 11, 31);
}

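/* The (local) vertex id lives in a 5-bit field of the header (bits 6..10). */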
static nir_def *
build_vertex_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, 6, 31);
}

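/* The local primitive id is a 6-bit field whose start position depends on
 * the stage/generation (see local_primitive_id_start above).
 */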
static nir_def *
build_local_primitive_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, state->local_primitive_id_start,
                           63);
}

static bool
is_tess_levels(gl_varying_slot slot)
{
   return (slot == VARYING_SLOT_PRIMITIVE_ID ||
           slot == VARYING_SLOT_TESS_LEVEL_OUTER ||
           slot == VARYING_SLOT_TESS_LEVEL_INNER);
}

/* Return a deterministic index for varyings. We can't rely on driver_location
 * to be correct without linking the different stages first, so we create
 * "primitive maps" where the producer decides on the location of each varying
 * slot and then exports a per-slot array to the consumer. This compacts the
 * gl_varying_slot space down a bit so that the primitive maps aren't too
 * large.
 *
 * Note: per-patch varyings are currently handled separately, without any
 * compacting.
 *
 * TODO: We could probably use the driver_locations directly in the non-SSO
 * (Vulkan) case.
 */

static unsigned
shader_io_get_unique_index(gl_varying_slot slot)
{
   switch (slot) {
   case VARYING_SLOT_POS:         return 0;
   case VARYING_SLOT_PSIZ:        return 1;
   case VARYING_SLOT_COL0:        return 2;
   case VARYING_SLOT_COL1:        return 3;
   case VARYING_SLOT_BFC0:        return 4;
   case VARYING_SLOT_BFC1:        return 5;
   case VARYING_SLOT_FOGC:        return 6;
   case VARYING_SLOT_CLIP_DIST0:  return 7;
   case VARYING_SLOT_CLIP_DIST1:  return 8;
   case VARYING_SLOT_CLIP_VERTEX: return 9;
   case VARYING_SLOT_LAYER:       return 10;
   case VARYING_SLOT_VIEWPORT:    return 11;
   case VARYING_SLOT_VAR0 ... VARYING_SLOT_VAR31: {
      struct state state = {};
      STATIC_ASSERT(ARRAY_SIZE(state.map.loc) - 1 ==
                    (12 + VARYING_SLOT_VAR31 - VARYING_SLOT_VAR0));
      struct ir3_shader_variant v = {};
      STATIC_ASSERT(ARRAY_SIZE(v.output_loc) - 1 ==
                    (12 + VARYING_SLOT_VAR31 - VARYING_SLOT_VAR0));
      return 12 + (slot - VARYING_SLOT_VAR0);
   }
   default:
      unreachable("illegal slot in get unique index\n");
   }
}

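/* Build the byte offset into shared (local) memory for a given vertex,
 * varying slot, and component:
 *
 *    primitive_offset + vertex * vertex_stride + attr_offset + (offset << 4)
 *
 * where "offset" is the indirect vec4 offset from the original intrinsic.
 */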
static nir_def *
build_local_offset(nir_builder *b, struct state *state, nir_def *vertex,
                   uint32_t location, uint32_t comp, nir_def *offset)
{
   nir_def *primitive_stride = nir_load_vs_primitive_stride_ir3(b);
   nir_def *primitive_offset =
      nir_imul24(b, build_local_primitive_id(b, state), primitive_stride);
   nir_def *attr_offset;
   nir_def *vertex_stride;
   unsigned index = shader_io_get_unique_index(location);

   switch (b->shader->info.stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      vertex_stride = nir_imm_int(b, state->map.stride * 4);
      attr_offset = nir_imm_int(b, state->map.loc[index] + 4 * comp);
      break;
   case MESA_SHADER_TESS_CTRL:
   case MESA_SHADER_GEOMETRY:
      vertex_stride = nir_load_vs_vertex_stride_ir3(b);
      attr_offset = nir_iadd_imm(b, nir_load_primitive_location_ir3(b, index),
                                 comp * 4);
      break;
   default:
      unreachable("bad shader stage");
   }

   nir_def *vertex_offset = nir_imul24(b, vertex, vertex_stride);

   return nir_iadd(
      b, nir_iadd(b, primitive_offset, vertex_offset),
      nir_iadd(b, attr_offset, nir_ishl_imm(b, offset, 4)));
}

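/* Replace "intr" with a new intrinsic of the given op, reusing its
 * num_components/bit_size, rewriting all uses of the old destination and
 * removing the original instruction.  src1/src2 may be NULL for ops with
 * fewer sources.
 */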
static nir_intrinsic_instr *
replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
                  nir_intrinsic_op op, nir_def *src0, nir_def *src1,
                  nir_def *src2)
{
   nir_intrinsic_instr *new_intr = nir_intrinsic_instr_create(b->shader, op);

   new_intr->src[0] = nir_src_for_ssa(src0);
   if (src1)
      new_intr->src[1] = nir_src_for_ssa(src1);
   if (src2)
      new_intr->src[2] = nir_src_for_ssa(src2);

   new_intr->num_components = intr->num_components;

   if (nir_intrinsic_infos[op].has_dest)
      nir_def_init(&new_intr->instr, &new_intr->def,
                   intr->num_components, intr->def.bit_size);

   nir_builder_instr_insert(b, &new_intr->instr);

   if (nir_intrinsic_infos[op].has_dest)
      nir_def_rewrite_uses(&intr->def, &new_intr->def);

   nir_instr_remove(&intr->instr);

   return new_intr;
}

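/* Assign a location to every output slot the producer writes (skipping the
 * tess levels, which live in the tess factor BO) and record the resulting
 * per-primitive stride.
 */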
static void
build_primitive_map(nir_shader *shader, struct primitive_map *map)
{
   /* All interfaces except the TCS <-> TES interface use ldlw, which takes
    * an offset in bytes, so each vec4 slot is 16 bytes. TCS <-> TES uses
    * ldg, which takes an offset in dwords, but each per-vertex slot has
    * space for every vertex, and there's space at the beginning for
    * per-patch varyings.
    */
   unsigned slot_size = 16, start = 0;
   if (shader->info.stage == MESA_SHADER_TESS_CTRL) {
      slot_size = shader->info.tess.tcs_vertices_out * 4;
      start = util_last_bit(shader->info.patch_outputs_written) * 4;
   }

   uint64_t mask = shader->info.outputs_written;
   unsigned loc = start;
   while (mask) {
      int location = u_bit_scan64(&mask);
      if (is_tess_levels(location))
         continue;

      unsigned index = shader_io_get_unique_index(location);
      map->loc[index] = loc;
      loc += slot_size;
   }

   map->stride = loc;
   /* Use units of dwords for the stride. */
   if (shader->info.stage != MESA_SHADER_TESS_CTRL)
      map->stride /= 4;
}

/* For shader stages that receive a primitive map, calculate how big it should
 * be.
 */

static unsigned
calc_primitive_map_size(nir_shader *shader)
{
   uint64_t mask = shader->info.inputs_read;
   unsigned max_index = 0;
   while (mask) {
      int location = u_bit_scan64(&mask);

      if (is_tess_levels(location))
         continue;

      unsigned index = shader_io_get_unique_index(location);
      max_index = MAX2(max_index, index + 1);
   }

   return max_index;
}

static void
lower_block_to_explicit_output(nir_block *block, nir_builder *b,
                               struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* nir_lower_io_to_temporaries replaces all access to output
          * variables with temp variables and then emits a nir_copy_var at
          * the end of the shader.  Thus, we should always get a full wrmask
          * here.
          */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         b->cursor = nir_instr_remove(&intr->instr);

         nir_def *vertex_id = build_vertex_id(b, state);
         nir_def *offset = build_local_offset(
            b, state, vertex_id, nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         nir_store_shared_ir3(b, intr->src[0].ssa, offset);
         break;
      }

      default:
         break;
      }
   }
}

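/* Local thread id, extracted from a 10-bit field of the GS header
 * (bits 16..25).
 */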
static nir_def *
local_thread_id(nir_builder *b)
{
   return bitfield_extract(b, nir_load_gs_header_ir3(b), 16, 1023);
}

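/* Lower store_output in a stage that feeds the TCS or GS into explicit
 * stores to shared memory (store_shared_ir3), using the primitive map built
 * above.  The map is also copied into the variant so the consumer stage can
 * locate each slot.
 */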
void
ir3_nir_lower_to_explicit_output(nir_shader *shader,
                                 struct ir3_shader_variant *v,
                                 unsigned topology)
{
   struct state state = {};

   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_at(nir_before_impl(impl));

   if (v->type == MESA_SHADER_VERTEX && topology != IR3_TESS_NONE)
      state.header = nir_load_tcs_header_ir3(&b);
   else
      state.header = nir_load_gs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_output(block, &b, &state);

   nir_metadata_preserve(impl,
                         nir_metadata_block_index | nir_metadata_dominance);

   v->output_size = state.map.stride;
}

static void
lower_block_to_explicit_input(nir_block *block, nir_builder *b,
                              struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *offset = build_local_offset(
            b, state,
            intr->src[0].ssa, // this is typically gl_InvocationID
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL,
                           NULL);
         break;
      }

      case nir_intrinsic_load_invocation_id: {
         b->cursor = nir_before_instr(&intr->instr);

         nir_def *iid = build_invocation_id(b, state);
         nir_def_rewrite_uses(&intr->def, iid);
         nir_instr_remove(&intr->instr);
         break;
      }

      default:
         break;
      }
   }
}

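/* Counterpart to ir3_nir_lower_to_explicit_output for the consumer stage
 * (TCS or GS): per-vertex inputs become load_shared_ir3 loads, and
 * load_invocation_id is rebuilt from the stage header.
 */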
void
ir3_nir_lower_to_explicit_input(nir_shader *shader,
                                struct ir3_shader_variant *v)
{
   struct state state = {};

   /* when using stl/ldl (instead of stlw/ldlw) for linking VS and HS,
    * HS uses a different primitive id, which starts at bit 16 in the header
    */
   if (shader->info.stage == MESA_SHADER_TESS_CTRL &&
       v->compiler->tess_use_shared)
      state.local_primitive_id_start = 16;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_at(nir_before_impl(impl));

   if (shader->info.stage == MESA_SHADER_GEOMETRY)
      state.header = nir_load_gs_header_ir3(&b);
   else
      state.header = nir_load_tcs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_input(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);
}

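/* Number of vertices per output patch: a compile-time constant in the TCS,
 * and the patch_vertices_in sysval in the TES.
 */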
static nir_def *
build_tcs_out_vertices(nir_builder *b)
{
   if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
      return nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
   else
      return nir_load_patch_vertices_in(b);
}

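/* Build a dword offset into the tess param BO for a per-vertex (or, with
 * vertex == NULL, per-patch) slot:
 *
 *    patch_id * patch_stride + attr_offset + vertex_offset
 */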
static nir_def *
build_per_vertex_offset(nir_builder *b, struct state *state,
                        nir_def *vertex, uint32_t location, uint32_t comp,
                        nir_def *offset)
{
   nir_def *patch_id = nir_load_rel_patch_id_ir3(b);
   nir_def *patch_stride = nir_load_hs_patch_stride_ir3(b);
   nir_def *patch_offset = nir_imul24(b, patch_id, patch_stride);
   nir_def *attr_offset;

   if (nir_src_is_const(nir_src_for_ssa(offset))) {
      location += nir_src_as_uint(nir_src_for_ssa(offset));
      offset = nir_imm_int(b, 0);
   } else {
      /* Offset is in vec4's, but we need it in units of components for the
       * load/store_global_ir3 offset.
       */
      offset = nir_ishl_imm(b, offset, 2);
   }

   nir_def *vertex_offset;
   if (vertex) {
      unsigned index = shader_io_get_unique_index(location);
      switch (b->shader->info.stage) {
      case MESA_SHADER_TESS_CTRL:
         attr_offset = nir_imm_int(b, state->map.loc[index] + comp);
         break;
      case MESA_SHADER_TESS_EVAL:
         attr_offset = nir_iadd_imm(b, nir_load_primitive_location_ir3(b, index),
                                    comp);
         break;
      default:
         unreachable("bad shader stage");
      }

      attr_offset = nir_iadd(b, attr_offset,
                             nir_imul24(b, offset, build_tcs_out_vertices(b)));
      vertex_offset = nir_ishl_imm(b, vertex, 2);
   } else {
      assert(location >= VARYING_SLOT_PATCH0 &&
             location <= VARYING_SLOT_TESS_MAX);
      unsigned index = location - VARYING_SLOT_PATCH0;
      attr_offset = nir_iadd_imm(b, offset, index * 4 + comp);
      vertex_offset = nir_imm_int(b, 0);
   }

   return nir_iadd(b, nir_iadd(b, patch_offset, attr_offset), vertex_offset);
}

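/* Offset for a per-patch slot: same computation as build_per_vertex_offset
 * with vertex == NULL (no per-vertex term).
 */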
static nir_def *
build_patch_offset(nir_builder *b, struct state *state, uint32_t base,
                   uint32_t comp, nir_def *offset)
{
   return build_per_vertex_offset(b, state, NULL, base, comp, offset);
}

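/* Number of inner/outer tessellation-level components actually stored for
 * the given primitive topology.
 */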
static void
tess_level_components(struct state *state, uint32_t *inner, uint32_t *outer)
{
   switch (state->topology) {
   case IR3_TESS_TRIANGLES:
      *inner = 1;
      *outer = 3;
      break;
   case IR3_TESS_QUADS:
      *inner = 2;
      *outer = 4;
      break;
   case IR3_TESS_ISOLINES:
      *inner = 0;
      *outer = 2;
      break;
   default:
      unreachable("bad");
   }
}

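/* Offset into the tess factor BO.  Each patch stores the primitive id
 * followed by the outer and then the inner levels, so the per-patch stride
 * is 1 + outer_levels + inner_levels.
 */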
static nir_def *
build_tessfactor_base(nir_builder *b, gl_varying_slot slot, uint32_t comp,
                      struct state *state)
{
   uint32_t inner_levels, outer_levels;
   tess_level_components(state, &inner_levels, &outer_levels);

   const uint32_t patch_stride = 1 + inner_levels + outer_levels;

   nir_def *patch_id = nir_load_rel_patch_id_ir3(b);

   nir_def *patch_offset =
      nir_imul24(b, patch_id, nir_imm_int(b, patch_stride));

   uint32_t offset;
   switch (slot) {
   case VARYING_SLOT_PRIMITIVE_ID:
      offset = 0;
      break;
   case VARYING_SLOT_TESS_LEVEL_OUTER:
      offset = 1;
      break;
   case VARYING_SLOT_TESS_LEVEL_INNER:
      offset = 1 + outer_levels;
      break;
   default:
      unreachable("bad");
   }

   return nir_iadd_imm(b, patch_offset, offset + comp);
}

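/* Rewrite TCS output/input intrinsics into explicit global loads/stores
 * against the tess param BO (regular varyings) or the tess factor BO
 * (tess levels and gl_PrimitiveID).
 */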
static void
lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_output: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *address = nir_load_tess_param_base_ir3(b);
         nir_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_per_vertex_output: {
         // src[] = { value, vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         nir_def *value = intr->src[0].ssa;
         nir_def *address = nir_load_tess_param_base_ir3(b);
         nir_def *offset = build_per_vertex_offset(
            b, state, intr->src[1].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[2].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value,
                           address, offset);

         break;
      }

      case nir_intrinsic_load_output: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            assert(intr->def.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* write patch output to bo */

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            uint32_t inner_levels, outer_levels, levels;
            tess_level_components(state, &inner_levels, &outer_levels);

            assert(intr->src[0].ssa->num_components == 1);

            nir_if *nif = NULL;
            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               /* The tess levels are defined as float[4] and float[2], but
                * the tess factor BO has smaller sizes for tris/isolines, so
                * we have to discard any writes beyond the number of
                * components for inner/outer levels.
                */
               if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
                  levels = outer_levels;
               else
                  levels = inner_levels;

               nir_def *offset = nir_iadd_imm(
                  b, intr->src[1].ssa, nir_intrinsic_component(intr));
               nif = nir_push_if(b, nir_ult_imm(b, offset, levels));
            }

            nir_def *offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa,
                              nir_load_tess_factor_base_ir3(b),
                              nir_iadd(b, intr->src[1].ssa, offset));

            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               nir_pop_if(b, nif);
            }
         } else {
            nir_def *address = nir_load_tess_param_base_ir3(b);
            nir_def *offset = build_patch_offset(
               b, state, location, nir_intrinsic_component(intr),
               intr->src[1].ssa);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa, address, offset);
         }
         break;
      }

      default:
         break;
      }
   }
}

static void
emit_tess_epilogue(nir_builder *b, struct state *state)
{
   /* Insert endpatch instruction:
    *
    * TODO we should re-work this to use normal flow control.
    */

   nir_end_patch_ir3(b);
}

void
ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage, shader->info.internal)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
   v->output_size = state.map.stride;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_at(nir_before_impl(impl));

   state.header = nir_load_tcs_header_ir3(&b);

   /* If required, store gl_PrimitiveID. */
   if (v->key.tcs_store_primid) {
      b.cursor = nir_after_impl(impl);

      nir_store_output(&b, nir_load_primitive_id(&b), nir_imm_int(&b, 0),
                       .io_semantics = {
                           .location = VARYING_SLOT_PRIMITIVE_ID,
                           .num_slots = 1
                        });

      b.cursor = nir_before_impl(impl);
   }

   nir_foreach_block_safe (block, impl)
      lower_tess_ctrl_block(block, &b, &state);

   /* Now move the body of the TCS into a conditional:
    *
    *   if (gl_InvocationID < num_vertices)
    *     // body
    *
    */

   nir_cf_list body;
   nir_cf_extract(&body, nir_before_impl(impl),
                  nir_after_impl(impl));

   b.cursor = nir_after_impl(impl);

   /* Re-emit the header, since the old one got moved into the if branch */
   state.header = nir_load_tcs_header_ir3(&b);
   nir_def *iid = build_invocation_id(&b, &state);

   const uint32_t nvertices = shader->info.tess.tcs_vertices_out;
   nir_def *cond = nir_ult_imm(&b, iid, nvertices);

   nir_if *nif = nir_push_if(&b, cond);

   nir_cf_reinsert(&body, b.cursor);

   b.cursor = nir_after_cf_list(&nif->then_list);

   /* Insert conditional exit for threads with invocation id != 0 */
   nir_def *iid0_cond = nir_ieq_imm(&b, iid, 0);
   nir_cond_end_ir3(&b, iid0_cond);

   emit_tess_epilogue(&b, &state);

   nir_pop_if(&b, nif);

   nir_metadata_preserve(impl, nir_metadata_none);
}

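/* Same as the TCS lowering, but for TES inputs: per-vertex and per-patch
 * loads become global loads from the tess param/factor BOs.
 */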
static void
lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *address = nir_load_tess_param_base_ir3(b);
         nir_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_load_input: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            assert(intr->def.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage, shader->info.internal)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   NIR_PASS_V(shader, nir_lower_tess_coord_z, topology == IR3_TESS_TRIANGLES);

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_create(impl);

   nir_foreach_block_safe (block, impl)
      lower_tess_eval_block(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);

   nir_metadata_preserve(impl, nir_metadata_none);
}

/* The hardware does not support incomplete primitives in multiple streams at
 * once or ending the "wrong" stream, but Vulkan allows this. That is,
 * EmitStreamVertex(N) followed by EmitStreamVertex(M) or EndStreamPrimitive(M)
 * where N != M and there isn't a call to EndStreamPrimitive(N) in between isn't
 * supported by the hardware. Fix this up by duplicating the entire shader per
 * stream, removing EmitStreamVertex/EndStreamPrimitive calls for streams other
 * than the current one.
 */

static void
lower_mixed_streams(nir_shader *nir)
{
   /* We don't have to do anything for points because there is only one vertex
    * per primitive and therefore no possibility of mixing.
    */
   if (nir->info.gs.output_primitive == MESA_PRIM_POINTS)
      return;

   nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);

   uint8_t stream_mask = 0;

   nir_foreach_block (block, entrypoint) {
      nir_foreach_instr (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

         if (intrin->intrinsic == nir_intrinsic_emit_vertex ||
             intrin->intrinsic == nir_intrinsic_end_primitive)
            stream_mask |= 1 << nir_intrinsic_stream_id(intrin);
      }
   }

   if (util_is_power_of_two_or_zero(stream_mask))
      return;

   nir_cf_list body;
   nir_cf_list_extract(&body, &entrypoint->body);

   nir_builder b = nir_builder_create(entrypoint);

   u_foreach_bit (stream, stream_mask) {
      b.cursor = nir_after_impl(entrypoint);

      /* Inserting the cloned body invalidates any cursor not using an
       * instruction, so we need to emit this to keep track of where the new
       * body is to iterate over it.
       */
      nir_instr *anchor = &nir_nop(&b)->instr;

      nir_cf_list_clone_and_reinsert(&body, &entrypoint->cf_node, b.cursor, NULL);

      /* We need to iterate over all instructions after the anchor, which is a
       * bit tricky to do so we do it manually.
       */
      for (nir_block *block = anchor->block; block != NULL;
           block = nir_block_cf_tree_next(block)) {
         for (nir_instr *instr =
               (block == anchor->block) ? anchor : nir_block_first_instr(block),
               *next = instr ? nir_instr_next(instr) : NULL;
              instr != NULL; instr = next, next = next ? nir_instr_next(next) : NULL) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            if ((intrin->intrinsic == nir_intrinsic_emit_vertex ||
                 intrin->intrinsic == nir_intrinsic_end_primitive) &&
                nir_intrinsic_stream_id(intrin) != stream) {
               nir_instr_remove(instr);
            }
         }
      }

      nir_instr_remove(anchor);

      /* The user can omit the last EndStreamPrimitive(), so add an extra one
       * here before potentially adding other copies of the body that emit to
       * different streams. Our lowering means that redundant calls to
       * EndStreamPrimitive are safe and should be optimized out.
       */
      b.cursor = nir_after_impl(entrypoint);
      nir_end_primitive(&b, .stream_id = stream);
   }

   nir_cf_delete(&body);
}

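/* Emit a nir_copy_var for each pair of variables in the two lists, which
 * are assumed to be the same length and in matching order.
 */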
static void
copy_vars(nir_builder *b, struct exec_list *dests, struct exec_list *srcs)
{
   foreach_two_lists (dest_node, dests, src_node, srcs) {
      nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
      nir_variable *src = exec_node_data(nir_variable, src_node, node);
      nir_copy_var(b, dest, src);
   }
}

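/* Lower EmitVertex()/EndPrimitive(): EndPrimitive becomes a vertex_flags
 * write, and EmitVertex becomes a guarded copy of the output shadow vars
 * plus vertex/emitted-vertex bookkeeping (see ir3_nir_lower_gs below).
 */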
static void
lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_end_primitive: {
         /* The HW will use the stream from the preceding emitted vertices,
          * which, thanks to lower_mixed_streams, is the same as the stream
          * for this instruction, so we can ignore it here.
          */
         b->cursor = nir_before_instr(&intr->instr);
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 4), 0x1);
         nir_instr_remove(&intr->instr);
         break;
      }

      case nir_intrinsic_emit_vertex: {
         /* Load the vertex count */
         b->cursor = nir_before_instr(&intr->instr);
         nir_def *count = nir_load_var(b, state->vertex_count_var);

         nir_push_if(b, nir_ieq(b, count, local_thread_id(b)));

         unsigned stream = nir_intrinsic_stream_id(intr);
         /* vertex_flags_out |= stream */
         nir_store_var(b, state->vertex_flags_out,
                       nir_ior_imm(b, nir_load_var(b, state->vertex_flags_out),
                                   stream),
                       0x1 /* .x */);

         copy_vars(b, &state->emit_outputs, &state->old_outputs);

         nir_instr_remove(&intr->instr);

         nir_store_var(b, state->emitted_vertex_var,
                       nir_iadd_imm(b,
                                    nir_load_var(b,
                                                 state->emitted_vertex_var),
                                    1),
                       0x1);

         nir_pop_if(b, NULL);

         /* Increment the vertex count by 1 */
         nir_store_var(b, state->vertex_count_var,
                       nir_iadd_imm(b, count, 1), 0x1); /* .x */
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 0), 0x1);

         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_gs(nir_shader *shader)
{
   struct state state = {};

   /* Don't lower multiple times: */
   nir_foreach_shader_out_variable (var, shader)
      if (var->data.location == VARYING_SLOT_GS_VERTEX_FLAGS_IR3)
         return;

   if (shader_debug_enabled(shader->info.stage, shader->info.internal)) {
      mesa_logi("NIR (before gs lowering):");
      nir_log_shaderi(shader);
   }

   lower_mixed_streams(shader);

   /* Create an output var for vertex_flags. This will be shadowed below,
    * same way regular outputs get shadowed, and this variable will become a
    * temporary.
    */
   state.vertex_flags_out = nir_variable_create(
      shader, nir_var_shader_out, glsl_uint_type(), "vertex_flags");
   state.vertex_flags_out->data.driver_location = shader->num_outputs++;
   state.vertex_flags_out->data.location = VARYING_SLOT_GS_VERTEX_FLAGS_IR3;
   state.vertex_flags_out->data.interpolation = INTERP_MODE_NONE;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b = nir_builder_at(nir_before_impl(impl));

   state.header = nir_load_gs_header_ir3(&b);

   /* Generate two sets of shadow vars for the output variables.  The first
    * set replaces the real outputs and the second set (emit_outputs) we'll
    * assign in the emit_vertex conditionals.  Then at the end of the shader
    * we copy the emit_outputs to the real outputs, so that we get
    * store_output in uniform control flow.
    */
   exec_list_make_empty(&state.old_outputs);
   nir_foreach_shader_out_variable_safe (var, shader) {
      exec_node_remove(&var->node);
      exec_list_push_tail(&state.old_outputs, &var->node);
   }
   exec_list_make_empty(&state.new_outputs);
   exec_list_make_empty(&state.emit_outputs);
   nir_foreach_variable_in_list (var, &state.old_outputs) {
      /* Create a new output var by cloning the original output var and
       * stealing the name.
       */
      nir_variable *output = nir_variable_clone(var, shader);
      exec_list_push_tail(&state.new_outputs, &output->node);

      /* Rewrite the original output to be a shadow variable. */
      var->name = ralloc_asprintf(var, "%s@gs-temp", output->name);
      var->data.mode = nir_var_shader_temp;

      /* Clone the shadow variable to create the emit shadow variable that
       * we'll assign in the emit conditionals.
       */
      nir_variable *emit_output = nir_variable_clone(var, shader);
      emit_output->name = ralloc_asprintf(var, "%s@emit-temp", output->name);
      exec_list_push_tail(&state.emit_outputs, &emit_output->node);
   }

   /* During the shader we'll keep track of which vertex we're currently
    * emitting for the EmitVertex test and how many vertices we emitted so we
    * know to discard if we didn't emit any.  In most simple shaders, this can
    * all be statically determined and gets optimized away.
    */
   state.vertex_count_var =
      nir_local_variable_create(impl, glsl_uint_type(), "vertex_count");
   state.emitted_vertex_var =
      nir_local_variable_create(impl, glsl_uint_type(), "emitted_vertex");

   /* Initialize to 0. */
   b.cursor = nir_before_impl(impl);
   nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.emitted_vertex_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);

   nir_foreach_block_safe (block, impl)
      lower_gs_block(block, &b, &state);

   /* Note: returns are lowered, so there should be only one block before the
    * end block.  If we had real returns, we would probably want to redirect
    * them to this new if statement, rather than emitting this code at every
    * return statement.
    */
   assert(impl->end_block->predecessors->entries == 1);
   nir_block *block = nir_impl_last_block(impl);
   b.cursor = nir_after_block_before_jump(block);

   /* If we haven't emitted any vertex we need to copy the shadow (old)
    * outputs to emit outputs here.
    *
    * Also some piglit GS tests[1] don't have EndPrimitive() so throw
    * in an extra vertex_flags write for good measure.  If unneeded it
    * will be optimized out.
    *
    * [1] ex, tests/spec/glsl-1.50/execution/compatibility/clipping/gs-clip-vertex-const-accept.shader_test
    */
   nir_def *cond =
      nir_ieq_imm(&b, nir_load_var(&b, state.emitted_vertex_var), 0);
   nir_push_if(&b, cond);
   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);
   copy_vars(&b, &state.emit_outputs, &state.old_outputs);
   nir_pop_if(&b, NULL);

   nir_discard_if(&b, cond);

   copy_vars(&b, &state.new_outputs, &state.emit_outputs);

   exec_list_append(&shader->variables, &state.old_outputs);
   exec_list_append(&shader->variables, &state.emit_outputs);
   exec_list_append(&shader->variables, &state.new_outputs);

   nir_metadata_preserve(impl, nir_metadata_none);

   nir_lower_global_vars_to_local(shader);
   nir_split_var_copies(shader);
   nir_lower_var_copies(shader);

   nir_fixup_deref_modes(shader);

   if (shader_debug_enabled(shader->info.stage, shader->info.internal)) {
      mesa_logi("NIR (after gs lowering):");
      nir_log_shaderi(shader);
   }
}