/*
 * Copyright © 2015 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/v3d_compiler.h"
#include "compiler/nir/nir_builder.h"

#include "util/u_helpers.h"
/**
 * Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io
 * intrinsics into something amenable to the V3D architecture.
 *
 * Most of the work is turning the VS's store_output intrinsics from working
 * on a base representing the gallium-level vec4 driver_location to an offset
 * within the VPM, and emitting the header that's read by the fixed function
 * hardware between the VS and FS.
 *
 * We also adjust the offsets on uniform loads to be in bytes, since that's
 * what we need for indirect addressing with general TMU access.
 */
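
/* As a rough illustration (slot numbers are hypothetical): a VS that writes
 * gl_Position and one vec4 varying starts out with intrinsics like
 *
 *    store_output(pos, 0, base=0, component=0..3)  // vec4 driver_location 0
 *    store_output(var, 0, base=1, component=0..3)  // vec4 driver_location 1
 *
 * and is lowered here to one scalar store_output per 32-bit VPM word, where
 * base now means a scalar VPM offset that accounts for the fixed-function
 * header (viewport coords, Zs, 1/Wc, ...) laid out by
 * v3d_nir_setup_vpm_layout_vs() below.
 */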

struct v3d_nir_lower_io_state {
        int pos_vpm_offset;
        int vp_vpm_offset;
        int zs_vpm_offset;
        int rcp_wc_vpm_offset;
        int psiz_vpm_offset;
        int varyings_vpm_offset;

        /* Geometry shader state */
        struct {
                /* VPM offset for the current vertex data output */
                nir_variable *output_offset_var;
                /* VPM offset for the current vertex header */
                nir_variable *header_offset_var;
                /* VPM header for the current vertex */
                nir_variable *header_var;

                /* Size of the complete VPM output header */
                uint32_t output_header_size;
                /* Size of the output data for a single vertex */
                uint32_t output_vertex_data_size;
        } gs;

        BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_ANY_STAGE_INPUTS)];

        nir_ssa_def *pos[4];
};

static void
v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
                            struct v3d_nir_lower_io_state *state);

static void
v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset,
                     nir_ssa_def *chan)
{
        if (offset) {
                /* When generating the VIR instruction, the base and the offset
                 * are just going to get added together with an ADD instruction
                 * so we might as well do the add here at the NIR level instead
                 * and let the constant folding do its magic.
                 */
                offset = nir_iadd_imm(b, offset, base);
                base = 0;
        } else {
                offset = nir_imm_int(b, 0);
        }

        nir_store_output(b, chan, offset,
                         .base = base, .write_mask = 0x1, .component = 0);
}

/* Convert the uniform offset to bytes. If it happens to be a constant,
 * constant-folding will clean up the shift for us.
 */
static void
v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b,
                      nir_intrinsic_instr *intr)
{
        /* On SPIR-V/Vulkan we are already getting our offsets in
         * bytes.
         */
        if (c->key->environment == V3D_ENVIRONMENT_VULKAN)
                return;

        b->cursor = nir_before_instr(&intr->instr);

        nir_intrinsic_set_base(intr, nir_intrinsic_base(intr) * 16);

        nir_instr_rewrite_src(&intr->instr,
                              &intr->src[0],
                              nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa,
                                                       nir_imm_int(b, 4))));
}
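
/* A sketch of the transformation above: both the base and the indirect
 * offset start out in vec4 units (16 bytes per element), so for example
 *
 *    load_uniform(i, base=2)
 *
 * becomes
 *
 *    load_uniform(i << 4, base=32)
 *
 * which is what the general TMU path expects for indirect addressing.
 */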

static int
v3d_varying_slot_vpm_offset(struct v3d_compile *c, unsigned location,
                            unsigned component)
{
        uint32_t num_used_outputs = 0;
        struct v3d_varying_slot *used_outputs = NULL;
        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                num_used_outputs = c->vs_key->num_used_outputs;
                used_outputs = c->vs_key->used_outputs;
                break;
        case MESA_SHADER_GEOMETRY:
                num_used_outputs = c->gs_key->num_used_outputs;
                used_outputs = c->gs_key->used_outputs;
                break;
        default:
                unreachable("Unsupported shader stage");
        }

        for (int i = 0; i < num_used_outputs; i++) {
                struct v3d_varying_slot slot = used_outputs[i];

                if (v3d_slot_get_slot(slot) == location &&
                    v3d_slot_get_component(slot) == component) {
                        return i;
                }
        }

        return -1;
}

/* Lowers a store_output(gallium driver location) to a series of store_outputs
 * with a driver_location equal to the offset in the VPM.
 *
 * For geometry shaders we need to emit multiple vertices, so the VPM offsets
 * need to be computed in the shader code based on the current vertex index.
 */
static void
v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
                         nir_intrinsic_instr *intr,
                         struct v3d_nir_lower_io_state *state)
{
        b->cursor = nir_before_instr(&intr->instr);

        /* If this is a geometry shader we need to emit our outputs
         * to the current vertex offset in the VPM.
         */
        nir_ssa_def *offset_reg =
                c->s->info.stage == MESA_SHADER_GEOMETRY ?
                        nir_load_var(b, state->gs.output_offset_var) : NULL;

        int start_comp = nir_intrinsic_component(intr);
        unsigned location = nir_intrinsic_io_semantics(intr).location;
        nir_ssa_def *src = nir_ssa_for_src(b, intr->src[0],
                                           intr->num_components);

        /* Save off the components of the position for the setup of VPM inputs
         * read by fixed function HW.
         */
        if (location == VARYING_SLOT_POS) {
                for (int i = 0; i < intr->num_components; i++) {
                        state->pos[start_comp + i] = nir_channel(b, src, i);
                }
        }

        /* Write psiz straight to its position in the FF header, if we have
         * one.
         */
        if (location == VARYING_SLOT_PSIZ &&
            state->psiz_vpm_offset != -1) {
                v3d_nir_store_output(b, state->psiz_vpm_offset, offset_reg,
                                     src);
        }

        if (location == VARYING_SLOT_LAYER) {
                assert(c->s->info.stage == MESA_SHADER_GEOMETRY);
                nir_ssa_def *header = nir_load_var(b, state->gs.header_var);
                header = nir_iand(b, header, nir_imm_int(b, 0xff00ffff));

                /* From the GLES 3.2 spec:
                 *
                 *    "When fragments are written to a layered framebuffer, the
                 *     fragment’s layer number selects an image from the array
                 *     of images at each attachment (...). If the fragment’s
                 *     layer number is negative, or greater than or equal to
                 *     the minimum number of layers of any attachment, the
                 *     effects of the fragment on the framebuffer contents are
                 *     undefined."
                 *
                 * This suggests we can just ignore that situation; however,
                 * for V3D an out-of-bounds layer index means that the binner
                 * might do out-of-bounds writes to the tile state. The
                 * simulator has an assert to catch this, so we play safe here
                 * and make sure that doesn't happen by setting gl_Layer
                 * to 0 in that case (we always allocate tile state for at
                 * least one layer).
                 */
                nir_ssa_def *fb_layers = nir_load_fb_layers_v3d(b, 32);
                nir_ssa_def *cond = nir_ige(b, src, fb_layers);
                nir_ssa_def *layer_id =
                        nir_bcsel(b, cond,
                                  nir_imm_int(b, 0),
                                  nir_ishl(b, src, nir_imm_int(b, 16)));
                header = nir_ior(b, header, layer_id);
                nir_store_var(b, state->gs.header_var, header, 0x1);
        }

        /* Scalarize outputs if it hasn't happened already, since we want to
         * schedule each VPM write individually. We can skip any output
         * components not read by the FS.
         */
        for (int i = 0; i < intr->num_components; i++) {
                int vpm_offset =
                        v3d_varying_slot_vpm_offset(c, location,
                                                    start_comp + i);

                if (vpm_offset == -1)
                        continue;

                if (nir_src_is_const(intr->src[1]))
                        vpm_offset += nir_src_as_uint(intr->src[1]) * 4;

                BITSET_SET(state->varyings_stored, vpm_offset);

                v3d_nir_store_output(b, state->varyings_vpm_offset + vpm_offset,
                                     offset_reg, nir_channel(b, src, i));
        }

        nir_instr_remove(&intr->instr);
}

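/* Layout of the per-vertex VPM header word as assembled in this file (see
 * also the gl_Layer write in v3d_nir_lower_vpm_output() and the vertex-count
 * word emitted in emit_gs_vpm_output_header_prolog()):
 *
 *    bit  0      : new primitive flag
 *    bits 8..15  : vertex data length, in 32-bit VPM words
 *    bits 16..23 : gl_Layer, for layered rendering
 */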
static inline void
reset_gs_header(nir_builder *b, struct v3d_nir_lower_io_state *state)
{
        const uint8_t NEW_PRIMITIVE_OFFSET = 0;
        const uint8_t VERTEX_DATA_LENGTH_OFFSET = 8;

        uint32_t vertex_data_size = state->gs.output_vertex_data_size;
        assert((vertex_data_size & 0xffffff00) == 0);

        uint32_t header;
        header = 1 << NEW_PRIMITIVE_OFFSET;
        header |= vertex_data_size << VERTEX_DATA_LENGTH_OFFSET;
        nir_store_var(b, state->gs.header_var, nir_imm_int(b, header), 0x1);
}

static void
v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b,
                          nir_intrinsic_instr *instr,
                          struct v3d_nir_lower_io_state *state)
{
        b->cursor = nir_before_instr(&instr->instr);

        nir_ssa_def *header = nir_load_var(b, state->gs.header_var);
        nir_ssa_def *header_offset =
                nir_load_var(b, state->gs.header_offset_var);
        nir_ssa_def *output_offset =
                nir_load_var(b, state->gs.output_offset_var);

        /* Emit fixed function outputs */
        v3d_nir_emit_ff_vpm_outputs(c, b, state);

        /* Emit vertex header */
        v3d_nir_store_output(b, 0, header_offset, header);

        /* Update VPM offset for next vertex output data and header */
        output_offset =
                nir_iadd(b, output_offset,
                         nir_imm_int(b, state->gs.output_vertex_data_size));

        header_offset = nir_iadd(b, header_offset, nir_imm_int(b, 1));

        /* Reset the New Primitive bit */
        header = nir_iand(b, header, nir_imm_int(b, 0xfffffffe));

        nir_store_var(b, state->gs.output_offset_var, output_offset, 0x1);
        nir_store_var(b, state->gs.header_offset_var, header_offset, 0x1);
        nir_store_var(b, state->gs.header_var, header, 0x1);

        nir_instr_remove(&instr->instr);
}

static void
v3d_nir_lower_end_primitive(struct v3d_compile *c, nir_builder *b,
                            nir_intrinsic_instr *instr,
                            struct v3d_nir_lower_io_state *state)
{
        assert(state->gs.header_var);
        b->cursor = nir_before_instr(&instr->instr);
        reset_gs_header(b, state);

        nir_instr_remove(&instr->instr);
}

/* Some vertex attribute formats may require a swizzle, but the hardware
 * doesn't provide means to do that, so we need to apply the swizzle in the
 * vertex shader.
 *
 * This is required at least in Vulkan to support the mandatory vertex
 * attribute format VK_FORMAT_B8G8R8A8_UNORM.
 */
static void
v3d_nir_lower_vertex_input(struct v3d_compile *c, nir_builder *b,
                           nir_intrinsic_instr *instr)
{
        assert(c->s->info.stage == MESA_SHADER_VERTEX);

        if (!c->vs_key->va_swap_rb_mask)
                return;

        const uint32_t location = nir_intrinsic_io_semantics(instr).location;

        if (!(c->vs_key->va_swap_rb_mask & (1 << location)))
                return;

        assert(instr->num_components == 1);
        const uint32_t comp = nir_intrinsic_component(instr);
        if (comp == 0 || comp == 2)
                nir_intrinsic_set_component(instr, (comp + 2) % 4);
}
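
/* For example (hypothetical values): with bit 1 set in va_swap_rb_mask, a
 * scalarized load_input of component 0 from location 1 is rewritten to read
 * component 2 and vice versa, swapping R and B; components 1 (G) and 3 (A)
 * are left alone.
 */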

/* Sometimes the origin of gl_PointCoord is in the upper left rather than the
 * lower left, so we need to flip it.
 *
 * This is needed for Vulkan; Gallium uses lower_wpos_pntc instead.
 */
static void
v3d_nir_lower_fragment_input(struct v3d_compile *c, nir_builder *b,
                             nir_intrinsic_instr *intr)
{
        assert(c->s->info.stage == MESA_SHADER_FRAGMENT);

        /* Gallium uses lower_wpos_pntc */
        if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
                return;

        b->cursor = nir_after_instr(&intr->instr);

        int comp = nir_intrinsic_component(intr);

        nir_variable *input_var =
                nir_find_variable_with_driver_location(c->s,
                                                       nir_var_shader_in,
                                                       nir_intrinsic_base(intr));

        if (input_var && util_varying_is_point_coord(input_var->data.location,
                                                     c->fs_key->point_sprite_mask)) {
                assert(intr->num_components == 1);

                nir_ssa_def *result = &intr->dest.ssa;

                switch (comp) {
                case 0:
                case 1:
                        if (!c->fs_key->is_points)
                                result = nir_imm_float(b, 0.0);
                        break;
                case 2:
                        result = nir_imm_float(b, 0.0);
                        break;
                case 3:
                        result = nir_imm_float(b, 1.0);
                        break;
                }

                if (c->fs_key->point_coord_upper_left && comp == 1)
                        result = nir_fsub(b, nir_imm_float(b, 1.0), result);

                if (result != &intr->dest.ssa) {
                        nir_ssa_def_rewrite_uses_after(&intr->dest.ssa,
                                                       result,
                                                       result->parent_instr);
                }
        }
}

static void
v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
                       struct nir_instr *instr,
                       struct v3d_nir_lower_io_state *state)
{
        if (instr->type != nir_instr_type_intrinsic)
                return;
        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

        switch (intr->intrinsic) {
        case nir_intrinsic_load_input:
                if (c->s->info.stage == MESA_SHADER_VERTEX)
                        v3d_nir_lower_vertex_input(c, b, intr);
                else if (c->s->info.stage == MESA_SHADER_FRAGMENT)
                        v3d_nir_lower_fragment_input(c, b, intr);
                break;

        case nir_intrinsic_load_uniform:
                v3d_nir_lower_uniform(c, b, intr);
                break;

        case nir_intrinsic_store_output:
                if (c->s->info.stage == MESA_SHADER_VERTEX ||
                    c->s->info.stage == MESA_SHADER_GEOMETRY) {
                        v3d_nir_lower_vpm_output(c, b, intr, state);
                }
                break;

        case nir_intrinsic_emit_vertex:
                v3d_nir_lower_emit_vertex(c, b, intr, state);
                break;

        case nir_intrinsic_end_primitive:
                v3d_nir_lower_end_primitive(c, b, intr, state);
                break;

        default:
                break;
        }
}

/* Remap the output var's .driver_location. This is purely for
 * nir_print_shader() so that store_output can map back to a variable name.
 */
static void
v3d_nir_lower_io_update_output_var_base(struct v3d_compile *c,
                                        struct v3d_nir_lower_io_state *state)
{
        nir_foreach_shader_out_variable_safe(var, c->s) {
                if (var->data.location == VARYING_SLOT_POS &&
                    state->pos_vpm_offset != -1) {
                        var->data.driver_location = state->pos_vpm_offset;
                        continue;
                }

                if (var->data.location == VARYING_SLOT_PSIZ &&
                    state->psiz_vpm_offset != -1) {
                        var->data.driver_location = state->psiz_vpm_offset;
                        continue;
                }

                int vpm_offset =
                        v3d_varying_slot_vpm_offset(c,
                                                    var->data.location,
                                                    var->data.location_frac);
                if (vpm_offset != -1) {
                        var->data.driver_location =
                                state->varyings_vpm_offset + vpm_offset;
                } else {
                        /* If we couldn't find a mapping for the var, delete
                         * it so that its old .driver_location doesn't confuse
                         * nir_print_shader().
                         */
                        exec_node_remove(&var->node);
                }
        }
}

static void
v3d_nir_setup_vpm_layout_vs(struct v3d_compile *c,
                            struct v3d_nir_lower_io_state *state)
{
        uint32_t vpm_offset = 0;

        state->pos_vpm_offset = -1;
        state->vp_vpm_offset = -1;
        state->zs_vpm_offset = -1;
        state->rcp_wc_vpm_offset = -1;
        state->psiz_vpm_offset = -1;

        bool needs_ff_outputs = c->vs_key->base.is_last_geometry_stage;
        if (needs_ff_outputs) {
                if (c->vs_key->is_coord) {
                        state->pos_vpm_offset = vpm_offset;
                        vpm_offset += 4;
                }

                state->vp_vpm_offset = vpm_offset;
                vpm_offset += 2;

                if (!c->vs_key->is_coord) {
                        state->zs_vpm_offset = vpm_offset++;
                        state->rcp_wc_vpm_offset = vpm_offset++;
                }

                if (c->vs_key->per_vertex_point_size)
                        state->psiz_vpm_offset = vpm_offset++;
        }

        state->varyings_vpm_offset = vpm_offset;

        c->vpm_output_size = MAX2(1, vpm_offset + c->vs_key->num_used_outputs);
}
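
/* For reference, the resulting VS VPM layout in 32-bit words, assuming the
 * render (non-coord) variant of a last-geometry-stage VS with per-vertex
 * point size:
 *
 *    [ Xs Ys ]        viewport-transformed XY coordinates
 *    [ Zs ]           viewport-transformed Z
 *    [ 1/Wc ]         reciprocal of the clip-space W
 *    [ psiz ]         point size
 *    [ varyings... ]  one word per FS-consumed scalar varying
 *
 * The coord (binning) variant instead starts with the 4 raw clip-space
 * position words and omits Zs and 1/Wc.
 */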

static void
v3d_nir_setup_vpm_layout_gs(struct v3d_compile *c,
                            struct v3d_nir_lower_io_state *state)
{
        /* 1 header slot for number of output vertices */
        uint32_t vpm_offset = 1;

        /* 1 header slot per output vertex */
        const uint32_t num_vertices = c->s->info.gs.vertices_out;
        vpm_offset += num_vertices;

        state->gs.output_header_size = vpm_offset;

        /* Vertex data: here we only compute offsets within a generic vertex
         * data element. When it is time to actually write a particular
         * vertex to the VPM, we will add that vertex's offset into the VPM
         * output to these offsets.
         *
         * If geometry shaders are present, they are always the last shader
         * stage before rasterization, so we always emit fixed function
         * outputs.
         */
        vpm_offset = 0;
        if (c->gs_key->is_coord) {
                state->pos_vpm_offset = vpm_offset;
                vpm_offset += 4;
        } else {
                state->pos_vpm_offset = -1;
        }

        state->vp_vpm_offset = vpm_offset;
        vpm_offset += 2;

        if (!c->gs_key->is_coord) {
                state->zs_vpm_offset = vpm_offset++;
                state->rcp_wc_vpm_offset = vpm_offset++;
        } else {
                state->zs_vpm_offset = -1;
                state->rcp_wc_vpm_offset = -1;
        }

        /* Mesa enables OES_geometry_shader_point_size automatically with
         * OES_geometry_shader so we always need to handle point size
         * writes if present.
         */
        if (c->gs_key->per_vertex_point_size)
                state->psiz_vpm_offset = vpm_offset++;

        state->varyings_vpm_offset = vpm_offset;

        state->gs.output_vertex_data_size =
                state->varyings_vpm_offset + c->gs_key->num_used_outputs;

        c->vpm_output_size =
                state->gs.output_header_size +
                state->gs.output_vertex_data_size * num_vertices;
}
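
/* For reference, the resulting GS VPM layout in 32-bit words, for a
 * hypothetical vertices_out = N:
 *
 *    [ global header ]      output_header_size | vertex_count << 16
 *    [ vertex 0 header ]    new-primitive flag, data length, gl_Layer
 *    ...
 *    [ vertex N-1 header ]
 *    [ vertex 0 data ]      FF outputs followed by varyings, as in the VS
 *    ...
 *    [ vertex N-1 data ]
 *
 * giving vpm_output_size = (1 + N) + N * output_vertex_data_size.
 */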

static void
v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
                            struct v3d_nir_lower_io_state *state)
{
        /* If this is a geometry shader we need to emit our fixed function
         * outputs to the current vertex offset in the VPM.
         */
        nir_ssa_def *offset_reg =
                c->s->info.stage == MESA_SHADER_GEOMETRY ?
                        nir_load_var(b, state->gs.output_offset_var) : NULL;

        for (int i = 0; i < 4; i++) {
                if (!state->pos[i])
                        state->pos[i] = nir_ssa_undef(b, 1, 32);
        }

        nir_ssa_def *rcp_wc = nir_frcp(b, state->pos[3]);

        if (state->pos_vpm_offset != -1) {
                for (int i = 0; i < 4; i++) {
                        v3d_nir_store_output(b, state->pos_vpm_offset + i,
                                             offset_reg, state->pos[i]);
                }
        }

        if (state->vp_vpm_offset != -1) {
                for (int i = 0; i < 2; i++) {
                        nir_ssa_def *pos;
                        nir_ssa_def *scale;
                        pos = state->pos[i];
                        if (i == 0)
                                scale = nir_load_viewport_x_scale(b);
                        else
                                scale = nir_load_viewport_y_scale(b);
                        pos = nir_fmul(b, pos, scale);
                        pos = nir_fmul(b, pos, rcp_wc);
                        /* Pre-V3D 4.3 hardware has a quirk where it expects
                         * XY coordinates in .8 fixed-point format, but then
                         * it will internally round them to .6 fixed-point,
                         * introducing a double rounding. The double rounding
                         * can cause very slight differences in triangle
                         * rasterization coverage that can actually be noticed
                         * by some CTS tests.
                         *
                         * The correct fix for this as recommended by Broadcom
                         * is to convert to .8 fixed-point with ffloor().
                         */
                        pos = nir_f2i32(b, nir_ffloor(b, pos));
                        v3d_nir_store_output(b, state->vp_vpm_offset + i,
                                             offset_reg, pos);
                }
        }

        if (state->zs_vpm_offset != -1) {
                nir_ssa_def *z = state->pos[2];
                z = nir_fmul(b, z, nir_load_viewport_z_scale(b));
                z = nir_fmul(b, z, rcp_wc);
                z = nir_fadd(b, z, nir_load_viewport_z_offset(b));
                v3d_nir_store_output(b, state->zs_vpm_offset, offset_reg, z);
        }

        if (state->rcp_wc_vpm_offset != -1) {
                v3d_nir_store_output(b, state->rcp_wc_vpm_offset,
                                     offset_reg, rcp_wc);
        }

        /* Store 0 to varyings requested by the FS but not stored by the
         * previous stage. This should be undefined behavior, but
         * glsl-routing seems to rely on it.
         */
        uint32_t num_used_outputs;
        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                num_used_outputs = c->vs_key->num_used_outputs;
                break;
        case MESA_SHADER_GEOMETRY:
                num_used_outputs = c->gs_key->num_used_outputs;
                break;
        default:
                unreachable("Unsupported shader stage");
        }

        for (int i = 0; i < num_used_outputs; i++) {
                if (!BITSET_TEST(state->varyings_stored, i)) {
                        v3d_nir_store_output(b, state->varyings_vpm_offset + i,
                                             offset_reg, nir_imm_int(b, 0));
                }
        }
}

static void
emit_gs_prolog(struct v3d_compile *c, nir_builder *b,
               nir_function_impl *impl,
               struct v3d_nir_lower_io_state *state)
{
        nir_block *first = nir_start_block(impl);
        b->cursor = nir_before_block(first);

        const struct glsl_type *uint_type = glsl_uint_type();

        assert(!state->gs.output_offset_var);
        state->gs.output_offset_var =
                nir_local_variable_create(impl, uint_type, "output_offset");
        nir_store_var(b, state->gs.output_offset_var,
                      nir_imm_int(b, state->gs.output_header_size), 0x1);

        assert(!state->gs.header_offset_var);
        state->gs.header_offset_var =
                nir_local_variable_create(impl, uint_type, "header_offset");
        nir_store_var(b, state->gs.header_offset_var, nir_imm_int(b, 1), 0x1);

        assert(!state->gs.header_var);
        state->gs.header_var =
                nir_local_variable_create(impl, uint_type, "header");
        reset_gs_header(b, state);
}

static void
emit_gs_vpm_output_header_prolog(struct v3d_compile *c, nir_builder *b,
                                 struct v3d_nir_lower_io_state *state)
{
        const uint8_t VERTEX_COUNT_OFFSET = 16;

        /* Our GS header has 1 generic header slot (at VPM offset 0) and then
         * one slot per output vertex after it. This means we don't need a
         * variable just to keep track of the number of vertices we emitted;
         * instead we can compute it here from the header offset variable by
         * removing the one generic header slot that always goes at the
         * beginning of our header.
         */
        nir_ssa_def *header_offset =
                nir_load_var(b, state->gs.header_offset_var);
        nir_ssa_def *vertex_count =
                nir_isub(b, header_offset, nir_imm_int(b, 1));
        nir_ssa_def *header =
                nir_ior(b, nir_imm_int(b, state->gs.output_header_size),
                        nir_ishl(b, vertex_count,
                                 nir_imm_int(b, VERTEX_COUNT_OFFSET)));

        v3d_nir_store_output(b, 0, NULL, header);
}
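
/* A worked example (hypothetical numbers): for a GS with vertices_out = 3,
 * output_header_size is 1 + 3 = 4. If the shader actually emitted two
 * vertices, header_offset_var ends at 3, so vertex_count is 2 and the word
 * written at VPM offset 0 is 4 | (2 << 16).
 */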

bool
v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
{
        struct v3d_nir_lower_io_state state = { 0 };

        /* Set up the layout of the VPM outputs. */
        switch (s->info.stage) {
        case MESA_SHADER_VERTEX:
                v3d_nir_setup_vpm_layout_vs(c, &state);
                break;
        case MESA_SHADER_GEOMETRY:
                v3d_nir_setup_vpm_layout_gs(c, &state);
                break;
        case MESA_SHADER_FRAGMENT:
        case MESA_SHADER_COMPUTE:
                break;
        default:
                unreachable("Unsupported shader stage");
        }

        nir_foreach_function(function, s) {
                if (function->impl) {
                        nir_builder b;
                        nir_builder_init(&b, function->impl);

                        if (c->s->info.stage == MESA_SHADER_GEOMETRY)
                                emit_gs_prolog(c, &b, function->impl, &state);

                        nir_foreach_block(block, function->impl) {
                                nir_foreach_instr_safe(instr, block)
                                        v3d_nir_lower_io_instr(c, &b, instr,
                                                               &state);
                        }

                        nir_block *last = nir_impl_last_block(function->impl);
                        b.cursor = nir_after_block(last);
                        if (s->info.stage == MESA_SHADER_VERTEX) {
                                v3d_nir_emit_ff_vpm_outputs(c, &b, &state);
                        } else if (s->info.stage == MESA_SHADER_GEOMETRY) {
                                emit_gs_vpm_output_header_prolog(c, &b, &state);
                        }

                        nir_metadata_preserve(function->impl,
                                              nir_metadata_block_index |
                                              nir_metadata_dominance);
                }
        }

        if (s->info.stage == MESA_SHADER_VERTEX ||
            s->info.stage == MESA_SHADER_GEOMETRY) {
                v3d_nir_lower_io_update_output_var_base(c, &state);
        }

        /* It is really unlikely that we don't make progress here, and fully
         * filtering out the cases where we don't would make the code more
         * complex, but we still want this lowering to go through NIR_PASS.
         */
        return true;
}