/*
 * Copyright © 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler/nir/nir_builder.h"
#include "ir3_compiler.h"
#include "ir3_nir.h"

struct state {
   uint32_t topology;

   struct primitive_map {
      /* +POSITION, +PSIZE, ... - see shader_io_get_unique_index */
      unsigned loc[12 + 32];
      unsigned stride;
   } map;

   nir_ssa_def *header;

   nir_variable *vertex_count_var;
   nir_variable *emitted_vertex_var;
   nir_variable *vertex_flags_out;

   struct exec_list old_outputs;
   struct exec_list new_outputs;
   struct exec_list emit_outputs;

   /* tess ctrl shader on a650 gets the local primitive id at different bits: */
   unsigned local_primitive_id_start;
};

static nir_ssa_def *
bitfield_extract(nir_builder *b, nir_ssa_def *v, uint32_t start, uint32_t mask)
{
   return nir_iand(b, nir_ushr(b, v, nir_imm_int(b, start)),
                   nir_imm_int(b, mask));
}

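/* The tcs/gs header is a packed scalar decoded by the helpers below: with the
 * default local_primitive_id_start of 0, the invocation id sits at bits
 * [15:11], the vertex id at bits [10:6] and the local primitive id at bits
 * [5:0] (the GS thread id at bits [25:16] is decoded separately in
 * local_thread_id).
 */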
static nir_ssa_def *
build_invocation_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, 11, 31);
}

static nir_ssa_def *
build_vertex_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, 6, 31);
}

static nir_ssa_def *
build_local_primitive_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, state->local_primitive_id_start,
                           63);
}

static bool
is_tess_levels(gl_varying_slot slot)
{
   return (slot == VARYING_SLOT_PRIMITIVE_ID ||
           slot == VARYING_SLOT_TESS_LEVEL_OUTER ||
           slot == VARYING_SLOT_TESS_LEVEL_INNER);
}

/* Return a deterministic index for varyings. We can't rely on driver_location
 * to be correct without linking the different stages first, so we create
 * "primitive maps" where the producer decides on the location of each varying
 * slot and then exports a per-slot array to the consumer. This compacts the
 * gl_varying_slot space down a bit so that the primitive maps aren't too
 * large.
 *
 * Note: per-patch varyings are currently handled separately, without any
 * compacting.
 *
 * TODO: We could probably use the driver_locations directly in the non-SSO
 * (Vulkan) case.
 */

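/* For example, VARYING_SLOT_POS maps to index 0 and VARYING_SLOT_VAR3 maps to
 * 12 + 3 = 15: the fixed builtin slots occupy indices 0..11 and VAR0..VAR31
 * occupy 12..43.
 */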
static unsigned
shader_io_get_unique_index(gl_varying_slot slot)
{
   switch (slot) {
   case VARYING_SLOT_POS:         return 0;
   case VARYING_SLOT_PSIZ:        return 1;
   case VARYING_SLOT_COL0:        return 2;
   case VARYING_SLOT_COL1:        return 3;
   case VARYING_SLOT_BFC0:        return 4;
   case VARYING_SLOT_BFC1:        return 5;
   case VARYING_SLOT_FOGC:        return 6;
   case VARYING_SLOT_CLIP_DIST0:  return 7;
   case VARYING_SLOT_CLIP_DIST1:  return 8;
   case VARYING_SLOT_CLIP_VERTEX: return 9;
   case VARYING_SLOT_LAYER:       return 10;
   case VARYING_SLOT_VIEWPORT:    return 11;
   case VARYING_SLOT_VAR0 ... VARYING_SLOT_VAR31: {
      struct state state = {};
      STATIC_ASSERT(ARRAY_SIZE(state.map.loc) - 1 ==
                    (12 + VARYING_SLOT_VAR31 - VARYING_SLOT_VAR0));
      struct ir3_shader_variant v = {};
      STATIC_ASSERT(ARRAY_SIZE(v.output_loc) - 1 ==
                    (12 + VARYING_SLOT_VAR31 - VARYING_SLOT_VAR0));
      return 12 + (slot - VARYING_SLOT_VAR0);
   }
   default:
      unreachable("illegal slot in get unique index\n");
   }
}

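/* Compute a byte offset into LDS (shared memory) for one component of a
 * per-vertex slot: primitive_id * primitive_stride selects the primitive's
 * chunk, vertex * vertex_stride selects the vertex within it, and
 * attr_offset + (offset << 4) selects the vec4 slot and component.  For a VS
 * with a map stride of 8 dwords, for example, vertex 2's VAR0.y ends up at
 * primitive_offset + 2 * 32 + map.loc[12] + 4 bytes (with no indirect offset).
 */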
static nir_ssa_def *
build_local_offset(nir_builder *b, struct state *state, nir_ssa_def *vertex,
                   uint32_t location, uint32_t comp, nir_ssa_def *offset)
{
   nir_ssa_def *primitive_stride = nir_load_vs_primitive_stride_ir3(b);
   nir_ssa_def *primitive_offset =
      nir_imul24(b, build_local_primitive_id(b, state), primitive_stride);
   nir_ssa_def *attr_offset;
   nir_ssa_def *vertex_stride;
   unsigned index = shader_io_get_unique_index(location);

   switch (b->shader->info.stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      vertex_stride = nir_imm_int(b, state->map.stride * 4);
      attr_offset = nir_imm_int(b, state->map.loc[index] + 4 * comp);
      break;
   case MESA_SHADER_TESS_CTRL:
   case MESA_SHADER_GEOMETRY:
      vertex_stride = nir_load_vs_vertex_stride_ir3(b);
      attr_offset = nir_iadd(b, nir_load_primitive_location_ir3(b, index),
                             nir_imm_int(b, comp * 4));
      break;
   default:
      unreachable("bad shader stage");
   }

   nir_ssa_def *vertex_offset = nir_imul24(b, vertex, vertex_stride);

   return nir_iadd(
      b, nir_iadd(b, primitive_offset, vertex_offset),
      nir_iadd(b, attr_offset, nir_ishl(b, offset, nir_imm_int(b, 4))));
}

static nir_intrinsic_instr *
replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
                  nir_intrinsic_op op, nir_ssa_def *src0, nir_ssa_def *src1,
                  nir_ssa_def *src2)
{
   nir_intrinsic_instr *new_intr = nir_intrinsic_instr_create(b->shader, op);

   new_intr->src[0] = nir_src_for_ssa(src0);
   if (src1)
      new_intr->src[1] = nir_src_for_ssa(src1);
   if (src2)
      new_intr->src[2] = nir_src_for_ssa(src2);

   new_intr->num_components = intr->num_components;

   if (nir_intrinsic_infos[op].has_dest)
      nir_ssa_dest_init(&new_intr->instr, &new_intr->dest, intr->num_components,
                        intr->dest.ssa.bit_size, NULL);

   nir_builder_instr_insert(b, &new_intr->instr);

   if (nir_intrinsic_infos[op].has_dest)
      nir_ssa_def_rewrite_uses(&intr->dest.ssa, &new_intr->dest.ssa);

   nir_instr_remove(&intr->instr);

   return new_intr;
}

static void
build_primitive_map(nir_shader *shader, struct primitive_map *map)
{
   /* All interfaces except the TCS <-> TES interface use ldlw, which takes
    * an offset in bytes, so each vec4 slot is 16 bytes. TCS <-> TES uses
    * ldg, which takes an offset in dwords, but each per-vertex slot has
    * space for every vertex, and there's space at the beginning for
    * per-patch varyings.
    */
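   /* For example, a VS writing only POS and VAR0 ends up with loc[0] = 0,
    * loc[12] = 16 and a stride of 8 dwords (32 bytes / 4).
    */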
   unsigned slot_size = 16, start = 0;
   if (shader->info.stage == MESA_SHADER_TESS_CTRL) {
      slot_size = shader->info.tess.tcs_vertices_out * 4;
      start = util_last_bit(shader->info.patch_outputs_written) * 4;
   }

   uint64_t mask = shader->info.outputs_written;
   unsigned loc = start;
   while (mask) {
      int location = u_bit_scan64(&mask);
      if (is_tess_levels(location))
         continue;

      unsigned index = shader_io_get_unique_index(location);
      map->loc[index] = loc;
      loc += slot_size;
   }

   map->stride = loc;
   /* Use units of dwords for the stride. */
   if (shader->info.stage != MESA_SHADER_TESS_CTRL)
      map->stride /= 4;
}

/* For shader stages that receive a primitive map, calculate how big it should
 * be.
 */

static unsigned
calc_primitive_map_size(nir_shader *shader)
{
   uint64_t mask = shader->info.inputs_read;
   unsigned max_index = 0;
   while (mask) {
      int location = u_bit_scan64(&mask);

      if (is_tess_levels(location))
         continue;

      unsigned index = shader_io_get_unique_index(location);
      max_index = MAX2(max_index, index + 1);
   }

   return max_index;
}

static void
lower_block_to_explicit_output(nir_block *block, nir_builder *b,
                               struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* nir_lower_io_to_temporaries replaces all access to output
          * variables with temp variables and then emits a nir_copy_var at
          * the end of the shader.  Thus, we should always get a full wrmask
          * here.
          */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         b->cursor = nir_instr_remove(&intr->instr);

         nir_ssa_def *vertex_id = build_vertex_id(b, state);
         nir_ssa_def *offset = build_local_offset(
            b, state, vertex_id, nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         nir_store_shared_ir3(b, intr->src[0].ssa, offset);
         break;
      }

      default:
         break;
      }
   }
}

static nir_ssa_def *
local_thread_id(nir_builder *b)
{
   return bitfield_extract(b, nir_load_gs_header_ir3(b), 16, 1023);
}

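/* Rewrite output stores into explicit store_shared_ir3 writes to LDS, laid
 * out according to build_primitive_map.  The map is also copied into the
 * variant's output_loc, and its stride into output_size, so the consuming
 * stage can locate each slot.  Which header we load depends on whether the
 * outputs feed the TCS or the GS.
 */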
void
ir3_nir_lower_to_explicit_output(nir_shader *shader,
                                 struct ir3_shader_variant *v,
                                 unsigned topology)
{
   struct state state = {};

   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   if (v->type == MESA_SHADER_VERTEX && topology != IR3_TESS_NONE)
      state.header = nir_load_tcs_header_ir3(&b);
   else
      state.header = nir_load_gs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_output(block, &b, &state);

   nir_metadata_preserve(impl,
                         nir_metadata_block_index | nir_metadata_dominance);

   v->output_size = state.map.stride;
}

static void
lower_block_to_explicit_input(nir_block *block, nir_builder *b,
                              struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *offset = build_local_offset(
            b, state,
            intr->src[0].ssa, // this is typically gl_InvocationID
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL,
                           NULL);
         break;
      }

      case nir_intrinsic_load_invocation_id: {
         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *iid = build_invocation_id(b, state);
         nir_ssa_def_rewrite_uses(&intr->dest.ssa, iid);
         nir_instr_remove(&intr->instr);
         break;
      }

      default:
         break;
      }
   }
}

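/* The consumer-side counterpart of ir3_nir_lower_to_explicit_output: rewrite
 * per-vertex input loads into load_shared_ir3 reads from the LDS layout the
 * producer wrote, and lower load_invocation_id to a bitfield extract of the
 * tcs/gs header.
 */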
void
ir3_nir_lower_to_explicit_input(nir_shader *shader,
                                struct ir3_shader_variant *v)
{
   struct state state = {};

   /* when using stl/ldl (instead of stlw/ldlw) for linking VS and HS,
    * HS uses a different primitive id, which starts at bit 16 in the header
    */
   if (shader->info.stage == MESA_SHADER_TESS_CTRL &&
       v->compiler->tess_use_shared)
      state.local_primitive_id_start = 16;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   if (shader->info.stage == MESA_SHADER_GEOMETRY)
      state.header = nir_load_gs_header_ir3(&b);
   else
      state.header = nir_load_tcs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_input(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);
}

static nir_ssa_def *
build_tcs_out_vertices(nir_builder *b)
{
   if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
      return nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
   else
      return nir_load_patch_vertices_in(b);
}

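/* Compute a dword offset into the tess param BO.  As implied by
 * build_primitive_map, each patch occupies hs_patch_stride dwords: the TCS
 * per-patch slots come first (one vec4 each), followed by the per-vertex
 * slots, where each slot stores one vec4 per TCS output vertex back to back.
 * Passing vertex == NULL addresses a per-patch slot instead (see
 * build_patch_offset).
 */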
static nir_ssa_def *
build_per_vertex_offset(nir_builder *b, struct state *state,
                        nir_ssa_def *vertex, uint32_t location, uint32_t comp,
                        nir_ssa_def *offset)
{
   nir_ssa_def *patch_id = nir_load_rel_patch_id_ir3(b);
   nir_ssa_def *patch_stride = nir_load_hs_patch_stride_ir3(b);
   nir_ssa_def *patch_offset = nir_imul24(b, patch_id, patch_stride);
   nir_ssa_def *attr_offset;

   if (nir_src_is_const(nir_src_for_ssa(offset))) {
      location += nir_src_as_uint(nir_src_for_ssa(offset));
      offset = nir_imm_int(b, 0);
   } else {
      /* Offset is in vec4's, but we need it in units of components for the
       * load/store_global_ir3 offset.
       */
      offset = nir_ishl(b, offset, nir_imm_int(b, 2));
   }

   nir_ssa_def *vertex_offset;
   if (vertex) {
      unsigned index = shader_io_get_unique_index(location);
      switch (b->shader->info.stage) {
      case MESA_SHADER_TESS_CTRL:
         attr_offset = nir_imm_int(b, state->map.loc[index] + comp);
         break;
      case MESA_SHADER_TESS_EVAL:
         attr_offset = nir_iadd(b, nir_load_primitive_location_ir3(b, index),
                                nir_imm_int(b, comp));
         break;
      default:
         unreachable("bad shader state");
      }

      attr_offset = nir_iadd(b, attr_offset,
                             nir_imul24(b, offset, build_tcs_out_vertices(b)));
      vertex_offset = nir_ishl(b, vertex, nir_imm_int(b, 2));
   } else {
      assert(location >= VARYING_SLOT_PATCH0 &&
             location <= VARYING_SLOT_TESS_MAX);
      unsigned index = location - VARYING_SLOT_PATCH0;
      attr_offset = nir_iadd(b, nir_imm_int(b, index * 4 + comp), offset);
      vertex_offset = nir_imm_int(b, 0);
   }

   return nir_iadd(b, nir_iadd(b, patch_offset, attr_offset), vertex_offset);
}

static nir_ssa_def *
build_patch_offset(nir_builder *b, struct state *state, uint32_t base,
                   uint32_t comp, nir_ssa_def *offset)
{
   return build_per_vertex_offset(b, state, NULL, base, comp, offset);
}

static void
tess_level_components(struct state *state, uint32_t *inner, uint32_t *outer)
{
   switch (state->topology) {
   case IR3_TESS_TRIANGLES:
      *inner = 1;
      *outer = 3;
      break;
   case IR3_TESS_QUADS:
      *inner = 2;
      *outer = 4;
      break;
   case IR3_TESS_ISOLINES:
      *inner = 0;
      *outer = 2;
      break;
   default:
      unreachable("bad");
   }
}

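/* The tess factor BO stores one record per patch, laid out as
 * [gl_PrimitiveID, outer levels, inner levels] in consecutive dwords, so the
 * per-patch stride is 1 + outer_levels + inner_levels.
 */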
static nir_ssa_def *
build_tessfactor_base(nir_builder *b, gl_varying_slot slot, uint32_t comp,
                      struct state *state)
{
   uint32_t inner_levels, outer_levels;
   tess_level_components(state, &inner_levels, &outer_levels);

   const uint32_t patch_stride = 1 + inner_levels + outer_levels;

   nir_ssa_def *patch_id = nir_load_rel_patch_id_ir3(b);

   nir_ssa_def *patch_offset =
      nir_imul24(b, patch_id, nir_imm_int(b, patch_stride));

   uint32_t offset;
   switch (slot) {
   case VARYING_SLOT_PRIMITIVE_ID:
      offset = 0;
      break;
   case VARYING_SLOT_TESS_LEVEL_OUTER:
      offset = 1;
      break;
   case VARYING_SLOT_TESS_LEVEL_INNER:
      offset = 1 + outer_levels;
      break;
   default:
      unreachable("bad");
   }

   return nir_iadd(b, patch_offset, nir_imm_int(b, offset + comp));
}

static void
lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_output: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
         nir_ssa_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_per_vertex_output: {
         // src[] = { value, vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         nir_ssa_def *value = intr->src[0].ssa;
         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
         nir_ssa_def *offset = build_per_vertex_offset(
            b, state, intr->src[1].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[2].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value,
                           address, offset);

         break;
      }

      case nir_intrinsic_load_output: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            assert(intr->dest.ssa.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* write patch output to bo */

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            uint32_t inner_levels, outer_levels, levels;
            tess_level_components(state, &inner_levels, &outer_levels);

            assert(intr->src[0].ssa->num_components == 1);

            nir_if *nif = NULL;
            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               /* Tess levels are defined as float[4] and float[2], but the
                * tess factor BO has smaller sizes for tris/isolines, so we
                * have to discard any writes beyond the number of components
                * for the inner/outer levels.
                */
               if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
                  levels = outer_levels;
               else
                  levels = inner_levels;

               nir_ssa_def *offset = nir_iadd_imm(
                  b, intr->src[1].ssa, nir_intrinsic_component(intr));
               nif = nir_push_if(b, nir_ult(b, offset, nir_imm_int(b, levels)));
            }

            nir_ssa_def *offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa,
                              nir_load_tess_factor_base_ir3(b),
                              nir_iadd(b, intr->src[1].ssa, offset));

            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               nir_pop_if(b, nif);
            }
         } else {
            nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
            nir_ssa_def *offset = build_patch_offset(
               b, state, location, nir_intrinsic_component(intr),
               intr->src[1].ssa);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa, address, offset);
         }
         break;
      }

      default:
         break;
      }
   }
}

static void
emit_tess_epilogue(nir_builder *b, struct state *state)
{
   /* Insert endpatch instruction:
    *
    * TODO we should re-work this to use normal flow control.
    */

   nir_end_patch_ir3(b);
}

void
ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
   v->output_size = state.map.stride;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   state.header = nir_load_tcs_header_ir3(&b);

   /* If required, store gl_PrimitiveID. */
   if (v->key.tcs_store_primid) {
      b.cursor = nir_after_cf_list(&impl->body);

      nir_store_output(&b, nir_load_primitive_id(&b), nir_imm_int(&b, 0),
                       .io_semantics = {
                           .location = VARYING_SLOT_PRIMITIVE_ID,
                           .num_slots = 1
                        });

      b.cursor = nir_before_cf_list(&impl->body);
   }

   nir_foreach_block_safe (block, impl)
      lower_tess_ctrl_block(block, &b, &state);

   /* Now move the body of the TCS into a conditional:
    *
    *   if (gl_InvocationID < num_vertices)
    *     // body
    *
    */

   nir_cf_list body;
   nir_cf_extract(&body, nir_before_cf_list(&impl->body),
                  nir_after_cf_list(&impl->body));

   b.cursor = nir_after_cf_list(&impl->body);

   /* Re-emit the header, since the old one got moved into the if branch */
   state.header = nir_load_tcs_header_ir3(&b);
   nir_ssa_def *iid = build_invocation_id(&b, &state);

   const uint32_t nvertices = shader->info.tess.tcs_vertices_out;
   nir_ssa_def *cond = nir_ult(&b, iid, nir_imm_int(&b, nvertices));

   nir_if *nif = nir_push_if(&b, cond);

   nir_cf_reinsert(&body, b.cursor);

   b.cursor = nir_after_cf_list(&nif->then_list);

   /* Insert conditional exit for threads with invocation id != 0 */
   nir_ssa_def *iid0_cond = nir_ieq_imm(&b, iid, 0);
   nir_cond_end_ir3(&b, iid0_cond);

   emit_tess_epilogue(&b, &state);

   nir_pop_if(&b, nif);

   nir_metadata_preserve(impl, nir_metadata_none);
}

static void
lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_tess_coord: {
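         /* The lowering below assumes only x/y of the loaded tess coord are
          * meaningful: z is rebuilt as 1 - x - y for triangles and forced to
          * 0 otherwise.
          */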
         b->cursor = nir_after_instr(&intr->instr);
         nir_ssa_def *x = nir_channel(b, &intr->dest.ssa, 0);
         nir_ssa_def *y = nir_channel(b, &intr->dest.ssa, 1);
         nir_ssa_def *z;

         if (state->topology == IR3_TESS_TRIANGLES)
            z = nir_fsub(b, nir_fsub(b, nir_imm_float(b, 1.0f), y), x);
         else
            z = nir_imm_float(b, 0.0f);

         nir_ssa_def *coord = nir_vec3(b, x, y, z);

         nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, coord,
                                        b->cursor.instr);
         break;
      }

      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
         nir_ssa_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_load_input: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            assert(intr->dest.ssa.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);

   nir_foreach_block_safe (block, impl)
      lower_tess_eval_block(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);

   nir_metadata_preserve(impl, nir_metadata_none);
}

static void
copy_vars(nir_builder *b, struct exec_list *dests, struct exec_list *srcs)
{
   foreach_two_lists (dest_node, dests, src_node, srcs) {
      nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
      nir_variable *src = exec_node_data(nir_variable, src_node, node);
      nir_copy_var(b, dest, src);
   }
}

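/* vertex_flags is an extra scalar output consumed when writing out GS
 * vertices.  From the values used below: writing 4 appears to mark a
 * primitive cut (EndPrimitive / the start of a new strip), and the low bits
 * carry the stream id of the emitted vertex.
 */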
static void
lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_end_primitive: {
         /* Note: This ignores the stream, which seems to match the blob
          * behavior. I'm guessing the HW ignores any extraneous cut
          * signals from an EndPrimitive() that doesn't correspond to the
          * rasterized stream.
          */
         b->cursor = nir_before_instr(&intr->instr);
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 4), 0x1);
         nir_instr_remove(&intr->instr);
         break;
      }

      case nir_intrinsic_emit_vertex: {
         /* Load the vertex count */
         b->cursor = nir_before_instr(&intr->instr);
         nir_ssa_def *count = nir_load_var(b, state->vertex_count_var);

         nir_push_if(b, nir_ieq(b, count, local_thread_id(b)));

         unsigned stream = nir_intrinsic_stream_id(intr);
         /* vertex_flags_out |= stream */
         nir_store_var(b, state->vertex_flags_out,
                       nir_ior(b, nir_load_var(b, state->vertex_flags_out),
                               nir_imm_int(b, stream)),
                       0x1 /* .x */);

         copy_vars(b, &state->emit_outputs, &state->old_outputs);

         nir_instr_remove(&intr->instr);

         nir_store_var(b, state->emitted_vertex_var,
                       nir_iadd(b, nir_load_var(b, state->emitted_vertex_var),
                                nir_imm_int(b, 1)),
                       0x1);

         nir_pop_if(b, NULL);

         /* Increment the vertex count by 1 */
         nir_store_var(b, state->vertex_count_var,
                       nir_iadd(b, count, nir_imm_int(b, 1)), 0x1); /* .x */
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 0), 0x1);

         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_gs(nir_shader *shader)
{
   struct state state = {};

   /* Don't lower multiple times: */
   nir_foreach_shader_out_variable (var, shader)
      if (var->data.location == VARYING_SLOT_GS_VERTEX_FLAGS_IR3)
         return;

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (before gs lowering):");
      nir_log_shaderi(shader);
   }

   /* Create an output var for vertex_flags. This will be shadowed below,
    * same way regular outputs get shadowed, and this variable will become a
    * temporary.
    */
   state.vertex_flags_out = nir_variable_create(
      shader, nir_var_shader_out, glsl_uint_type(), "vertex_flags");
   state.vertex_flags_out->data.driver_location = shader->num_outputs++;
   state.vertex_flags_out->data.location = VARYING_SLOT_GS_VERTEX_FLAGS_IR3;
   state.vertex_flags_out->data.interpolation = INTERP_MODE_NONE;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   state.header = nir_load_gs_header_ir3(&b);

   /* Generate two sets of shadow vars for the output variables.  The first
    * set replaces the real outputs and the second set (emit_outputs) we'll
    * assign in the emit_vertex conditionals.  Then at the end of the shader
    * we copy the emit_outputs to the real outputs, so that we get
    * store_output in uniform control flow.
    */
   exec_list_make_empty(&state.old_outputs);
   nir_foreach_shader_out_variable_safe (var, shader) {
      exec_node_remove(&var->node);
      exec_list_push_tail(&state.old_outputs, &var->node);
   }
   exec_list_make_empty(&state.new_outputs);
   exec_list_make_empty(&state.emit_outputs);
   nir_foreach_variable_in_list (var, &state.old_outputs) {
      /* Create a new output var by cloning the original output var and
       * stealing the name.
       */
      nir_variable *output = nir_variable_clone(var, shader);
      exec_list_push_tail(&state.new_outputs, &output->node);

      /* Rewrite the original output to be a shadow variable. */
      var->name = ralloc_asprintf(var, "%s@gs-temp", output->name);
      var->data.mode = nir_var_shader_temp;

      /* Clone the shadow variable to create the emit shadow variable that
       * we'll assign in the emit conditionals.
       */
      nir_variable *emit_output = nir_variable_clone(var, shader);
      emit_output->name = ralloc_asprintf(var, "%s@emit-temp", output->name);
      exec_list_push_tail(&state.emit_outputs, &emit_output->node);
   }

   /* During the shader we'll keep track of which vertex we're currently
    * emitting for the EmitVertex test and how many vertices we emitted, so we
    * know to discard if we didn't emit any.  In most simple shaders, this can
    * all be statically determined and gets optimized away.
    */
   state.vertex_count_var =
      nir_local_variable_create(impl, glsl_uint_type(), "vertex_count");
   state.emitted_vertex_var =
      nir_local_variable_create(impl, glsl_uint_type(), "emitted_vertex");

   /* Initialize the counters to 0 and vertex_flags to 4. */
   b.cursor = nir_before_cf_list(&impl->body);
   nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.emitted_vertex_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);

   nir_foreach_block_safe (block, impl)
      lower_gs_block(block, &b, &state);

   /* Note: returns are lowered, so there should be only one block before the
    * end block.  If we had real returns, we would probably want to redirect
    * them to this new if statement, rather than emitting this code at every
    * return statement.
    */
   assert(impl->end_block->predecessors->entries == 1);
   nir_block *block = nir_impl_last_block(impl);
   b.cursor = nir_after_block_before_jump(block);

   /* If we haven't emitted any vertex we need to copy the shadow (old)
    * outputs to emit outputs here.
    *
    * Also some piglit GS tests[1] don't have EndPrimitive() so throw
    * in an extra vertex_flags write for good measure.  If unneeded it
    * will be optimized out.
    *
    * [1] ex, tests/spec/glsl-1.50/execution/compatibility/clipping/gs-clip-vertex-const-accept.shader_test
    */
   nir_ssa_def *cond =
      nir_ieq_imm(&b, nir_load_var(&b, state.emitted_vertex_var), 0);
   nir_push_if(&b, cond);
   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);
   copy_vars(&b, &state.emit_outputs, &state.old_outputs);
   nir_pop_if(&b, NULL);

   nir_discard_if(&b, cond);

   copy_vars(&b, &state.new_outputs, &state.emit_outputs);

   exec_list_append(&shader->variables, &state.old_outputs);
   exec_list_append(&shader->variables, &state.emit_outputs);
   exec_list_append(&shader->variables, &state.new_outputs);

   nir_metadata_preserve(impl, nir_metadata_none);

   nir_lower_global_vars_to_local(shader);
   nir_split_var_copies(shader);
   nir_lower_var_copies(shader);

   nir_fixup_deref_modes(shader);

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (after gs lowering):");
      nir_log_shaderi(shader);
   }
}