/*
 * Copyright © 2006-2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "elk_fs.h"
#include "elk_fs_builder.h"

using namespace elk;

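/* Payload layout for vertex shader threads: R0 holds the thread header and
 * the following register holds the URB return handles.
 */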
elk_vs_thread_payload::elk_vs_thread_payload(const elk_fs_visitor &v)
{
   unsigned r = 0;

   /* R0: Thread header. */
   r += reg_unit(v.devinfo);

   /* R1: URB handles. */
   urb_handles = elk_ud8_grf(r, 0);
   r += reg_unit(v.devinfo);

   num_regs = r;
}

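/* Payload layout for tessellation control shader threads.  SINGLE_PATCH
 * dispatch uses a fixed five-register layout with the ICP handles in r1-r4,
 * while MULTI_PATCH dispatch sizes the payload from the optional primitive
 * ID and the number of input vertices in the key.
 */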
elk_tcs_thread_payload::elk_tcs_thread_payload(const elk_fs_visitor &v)
{
   struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(v.prog_data);
   struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(v.prog_data);
   struct elk_tcs_prog_key *tcs_key = (struct elk_tcs_prog_key *) v.key;

   if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH) {
      patch_urb_output = elk_ud1_grf(0, 0);
      primitive_id = elk_vec1_grf(0, 1);

      /* r1-r4 contain the ICP handles. */
      icp_handle_start = elk_ud8_grf(1, 0);

      num_regs = 5;
   } else {
      assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
      assert(tcs_key->input_vertices <= ELK_MAX_TCS_INPUT_VERTICES);

      unsigned r = 0;

      r += reg_unit(v.devinfo);

      patch_urb_output = elk_ud8_grf(r, 0);
      r += reg_unit(v.devinfo);

      if (tcs_prog_data->include_primitive_id) {
         primitive_id = elk_vec8_grf(r, 0);
         r += reg_unit(v.devinfo);
      }

      /* ICP handles occupy the next 1-32 registers. */
      icp_handle_start = elk_ud8_grf(r, 0);
      r += elk_tcs_prog_key_input_vertices(tcs_key) * reg_unit(v.devinfo);

      num_regs = r;
   }
}

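/* Payload layout for tessellation evaluation shader threads: the thread
 * header (patch URB input handle and primitive ID), the three gl_TessCoord
 * components, and finally the URB output handles.
 */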
elk_tes_thread_payload::elk_tes_thread_payload(const elk_fs_visitor &v)
{
   unsigned r = 0;

   /* R0: Thread Header. */
   patch_urb_input = retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UD);
   primitive_id = elk_vec1_grf(0, 1);
   r += reg_unit(v.devinfo);

   /* R1-3: gl_TessCoord.xyz. */
   for (unsigned i = 0; i < 3; i++) {
      coords[i] = elk_vec8_grf(r, 0);
      r += reg_unit(v.devinfo);
   }

   /* R4: URB output handles. */
   urb_output = elk_ud8_grf(r, 0);
   r += reg_unit(v.devinfo);

   num_regs = r;
}

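/* Payload layout for geometry shader threads.  The URB handles and the
 * instance ID are unpacked from the header register into temporaries, ICP
 * handles are always included so the pull model stays available, and the
 * URB read length is clamped so push-model inputs fit in
 * max_push_components registers.
 */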
elk_gs_thread_payload::elk_gs_thread_payload(elk_fs_visitor &v)
{
   struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(v.prog_data);
   struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(v.prog_data);
   const fs_builder bld = fs_builder(&v).at_end();

   /* R0: thread header. */
   unsigned r = reg_unit(v.devinfo);

   /* R1: output URB handles. */
   urb_handles = bld.vgrf(ELK_REGISTER_TYPE_UD);
   bld.AND(urb_handles, elk_ud8_grf(r, 0),
           v.devinfo->ver >= 20 ? elk_imm_ud(0xFFFFFF) : elk_imm_ud(0xFFFF));

   /* R1: Instance ID stored in bits 31:27 */
   instance_id = bld.vgrf(ELK_REGISTER_TYPE_UD);
   bld.SHR(instance_id, elk_ud8_grf(r, 0), elk_imm_ud(27u));

   r += reg_unit(v.devinfo);

   if (gs_prog_data->include_primitive_id) {
      primitive_id = elk_ud8_grf(r, 0);
      r += reg_unit(v.devinfo);
   }

   /* Always enable VUE handles so we can safely use pull model if needed.
    *
    * The push model for a GS uses a ton of register space even for trivial
    * scenarios with just a few inputs, so just make things easier and a bit
    * safer by always having pull model available.
    */
   gs_prog_data->base.include_vue_handles = true;

   /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
   icp_handle_start = elk_ud8_grf(r, 0);
   r += v.nir->info.gs.vertices_in * reg_unit(v.devinfo);

   num_regs = r;

   /* Use a maximum of 24 registers for push-model inputs. */
   const unsigned max_push_components = 24;

   /* If pushing our inputs would take too many registers, reduce the URB read
    * length (which is in HWords, or 8 registers), and resort to pulling.
    *
    * Note that the GS reads <URB Read Length> HWords for every vertex - so we
    * have to multiply by VerticesIn to obtain the total storage requirement.
    */
   if (8 * vue_prog_data->urb_read_length * v.nir->info.gs.vertices_in >
       max_push_components) {
      vue_prog_data->urb_read_length =
         ROUND_DOWN_TO(max_push_components / v.nir->info.gs.vertices_in, 8) / 8;
   }
}

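/* Fragment shader payload layout for Gfx20+.  Each SIMD16 half of the
 * dispatch gets its own two-register header plus per-half barycentrics,
 * source depth/W, coverage mask and (for the first half only) sample
 * position offsets.
 */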
static inline void
setup_fs_payload_gfx20(elk_fs_thread_payload &payload,
                       const elk_fs_visitor &v,
                       bool &source_depth_to_render_target)
{
   struct elk_wm_prog_data *prog_data = elk_wm_prog_data(v.prog_data);
   const unsigned payload_width = 16;
   assert(v.dispatch_width % payload_width == 0);
   assert(v.devinfo->ver >= 20);

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R0-1: PS thread payload header, masks and pixel X/Y coordinates. */
      payload.num_regs++;
      payload.subspan_coord_reg[j] = payload.num_regs++;
   }

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R2-13: Barycentric interpolation coordinates. These appear
       * in the same order that they appear in the elk_barycentric_mode
       * enum. Each set of coordinates occupies 2 64B registers per
       * SIMD16 half. Coordinates only appear if they were enabled
       * using the "Barycentric Interpolation Mode" bits in WM_STATE.
       */
      for (int i = 0; i < ELK_BARYCENTRIC_MODE_COUNT; ++i) {
         if (prog_data->barycentric_interp_modes & (1 << i)) {
            payload.barycentric_coord_reg[i][j] = payload.num_regs;
            payload.num_regs += payload_width / 4;
         }
      }

      /* R14: Interpolated depth if "Pixel Shader Uses Source Depth" is set. */
      if (prog_data->uses_src_depth) {
         payload.source_depth_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R15: Interpolated W if "Pixel Shader Uses Source W" is set. */
      if (prog_data->uses_src_w) {
         payload.source_w_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R16: MSAA input coverage mask if "Pixel Shader Uses Input
       * Coverage Mask" is set.
       */
      if (prog_data->uses_sample_mask) {
         payload.sample_mask_in_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R19: MSAA position XY offsets if "Position XY Offset Select"
       * is either POSOFFSET_CENTROID or POSOFFSET_SAMPLE. Note that
       * this is delivered as a single SIMD32 vector, inconsistently
       * with most other PS payload fields.
       */
      if (prog_data->uses_pos_offset && j == 0) {
         for (unsigned k = 0; k < 2; k++) {
            payload.sample_pos_reg[k] = payload.num_regs;
            payload.num_regs++;
         }
      }
   }

   if (prog_data->uses_depth_w_coefficients) {
      assert(v.max_polygons == 1);
      payload.depth_w_coef_reg = payload.num_regs;
      payload.num_regs += 2;
   }

   if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}

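/* Fragment shader payload layout for Gfx6 through pre-Gfx20 platforms: one
 * shared payload header register, then per-SIMD16-half pixel masks and
 * coordinates, barycentrics, source depth/W, sample position offsets and
 * coverage mask.
 */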
static inline void
setup_fs_payload_gfx6(elk_fs_thread_payload &payload,
                      const elk_fs_visitor &v,
                      bool &source_depth_to_render_target)
{
   struct elk_wm_prog_data *prog_data = elk_wm_prog_data(v.prog_data);

   const unsigned payload_width = MIN2(16, v.dispatch_width);
   assert(v.dispatch_width % payload_width == 0);
   assert(v.devinfo->ver >= 6 && v.devinfo->ver < 20);

   payload.num_regs = 0;

   /* R0: PS thread payload header. */
   payload.num_regs++;

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R1: masks, pixel X/Y coordinates. */
      payload.subspan_coord_reg[j] = payload.num_regs++;
   }

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R3-26: barycentric interpolation coordinates. These appear in the
       * same order that they appear in the elk_barycentric_mode enum. Each
       * set of coordinates occupies 2 registers if dispatch width == 8 and 4
       * registers if dispatch width == 16. Coordinates only appear if they
       * were enabled using the "Barycentric Interpolation Mode" bits in
       * WM_STATE.
       */
      for (int i = 0; i < ELK_BARYCENTRIC_MODE_COUNT; ++i) {
         if (prog_data->barycentric_interp_modes & (1 << i)) {
            payload.barycentric_coord_reg[i][j] = payload.num_regs;
            payload.num_regs += payload_width / 4;
         }
      }

      /* R27-28: interpolated depth if uses source depth */
      if (prog_data->uses_src_depth) {
         payload.source_depth_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R29-30: interpolated W set if GFX6_WM_USES_SOURCE_W. */
      if (prog_data->uses_src_w) {
         payload.source_w_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R31: MSAA position offsets. */
      if (prog_data->uses_pos_offset) {
         payload.sample_pos_reg[j] = payload.num_regs;
         payload.num_regs++;
      }

      /* R32-33: MSAA input coverage mask */
      if (prog_data->uses_sample_mask) {
         assert(v.devinfo->ver >= 7);
         payload.sample_mask_in_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }
   }

   /* R66: Source Depth and/or W Attribute Vertex Deltas */
   if (prog_data->uses_depth_w_coefficients) {
      assert(v.max_polygons == 1);
      payload.depth_w_coef_reg = payload.num_regs;
      payload.num_regs++;
   }

   if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}

#undef P /* promoted depth */
#undef C /* computed */
#undef N /* non-promoted? */

#define P 0
#define C 1
#define N 2

static const struct {
   GLuint mode:2;
   GLuint sd_present:1;
   GLuint sd_to_rt:1;
   GLuint dd_present:1;
   GLuint ds_present:1;
} wm_iz_table[ELK_WM_IZ_BIT_MAX] =
{
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { N, 1, 1, 0, 0 },
   { N, 0, 1, 0, 0 },
   { N, 0, 1, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { C, 0, 1, 1, 0 },
   { C, 0, 1, 1, 0 },
   { P, 0, 0, 0, 0 },
   { N, 1, 1, 0, 0 },
   { C, 0, 1, 1, 0 },
   { C, 0, 1, 1, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { N, 1, 1, 0, 0 },
   { N, 0, 1, 0, 0 },
   { N, 0, 1, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { C, 0, 1, 1, 0 },
   { C, 0, 1, 1, 0 },
   { P, 0, 0, 0, 0 },
   { N, 1, 1, 0, 0 },
   { C, 0, 1, 1, 0 },
   { C, 0, 1, 1, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { N, 1, 1, 0, 1 },
   { N, 0, 1, 0, 1 },
   { N, 0, 1, 0, 1 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { C, 0, 1, 1, 1 },
   { C, 0, 1, 1, 1 },
   { P, 0, 0, 0, 0 },
   { N, 1, 1, 0, 1 },
   { C, 0, 1, 1, 1 },
   { C, 0, 1, 1, 1 },
   { P, 0, 0, 0, 0 },
   { C, 0, 0, 0, 1 },
   { P, 0, 0, 0, 0 },
   { C, 0, 1, 0, 1 },
   { P, 0, 0, 0, 0 },
   { C, 1, 1, 0, 1 },
   { C, 0, 1, 0, 1 },
   { C, 0, 1, 0, 1 },
   { P, 0, 0, 0, 0 },
   { C, 1, 1, 1, 1 },
   { C, 0, 1, 1, 1 },
   { C, 0, 1, 1, 1 },
   { P, 0, 0, 0, 0 },
   { C, 1, 1, 1, 1 },
   { C, 0, 1, 1, 1 },
   { C, 0, 1, 1, 1 }
};

/**
 * Set up the fragment shader payload for Gfx4-5.
 *
 * The register layout is driven by key->line_aa (ELK_NEVER, ELK_ALWAYS or
 * ELK_SOMETIMES) and key->iz_lookup, a bitmask of ELK_WM_IZ_* flags used to
 * index wm_iz_table above.
 */
static inline void
setup_fs_payload_gfx4(elk_fs_thread_payload &payload,
                      const elk_fs_visitor &v,
                      bool &source_depth_to_render_target,
                      bool &runtime_check_aads_emit)
{
   assert(v.dispatch_width <= 16);

   struct elk_wm_prog_data *prog_data = elk_wm_prog_data(v.prog_data);
   elk_wm_prog_key *key = (elk_wm_prog_key *) v.key;

   GLuint reg = 1;
   bool kill_stats_promoted_workaround = false;
   int lookup = key->iz_lookup;

   assert(lookup < ELK_WM_IZ_BIT_MAX);

   /* Crazy workaround in the windowizer, which we need to track in
    * our register allocation and render target writes. See the "If
    * statistics are enabled..." paragraph of 11.5.3.2: Early Depth
    * Test Cases [Pre-DevGT] of the 3D Pipeline - Windower B-Spec.
    */
   if (key->stats_wm &&
       (lookup & ELK_WM_IZ_PS_KILL_ALPHATEST_BIT) &&
       wm_iz_table[lookup].mode == P) {
      kill_stats_promoted_workaround = true;
   }

   payload.subspan_coord_reg[0] = reg++;

   if (wm_iz_table[lookup].sd_present || prog_data->uses_src_depth ||
       kill_stats_promoted_workaround) {
      payload.source_depth_reg[0] = reg;
      reg += 2;
   }

   if (wm_iz_table[lookup].sd_to_rt || kill_stats_promoted_workaround)
      source_depth_to_render_target = true;

   if (wm_iz_table[lookup].ds_present || key->line_aa != ELK_NEVER) {
      payload.aa_dest_stencil_reg[0] = reg;
      runtime_check_aads_emit =
         !wm_iz_table[lookup].ds_present && key->line_aa == ELK_SOMETIMES;
      reg++;
   }

   if (wm_iz_table[lookup].dd_present) {
      payload.dest_depth_reg[0] = reg;
      reg += 2;
   }

   payload.num_regs = reg;
}

#undef P /* promoted depth */
#undef C /* computed */
#undef N /* non-promoted? */

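/* Zero-initialize all payload register fields, then defer the layout to the
 * generation-specific helper for the target device.
 */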
elk_fs_thread_payload::elk_fs_thread_payload(const elk_fs_visitor &v,
                                             bool &source_depth_to_render_target,
                                             bool &runtime_check_aads_emit)
   : subspan_coord_reg(),
     source_depth_reg(),
     source_w_reg(),
     aa_dest_stencil_reg(),
     dest_depth_reg(),
     sample_pos_reg(),
     sample_mask_in_reg(),
     depth_w_coef_reg(),
     barycentric_coord_reg()
{
   if (v.devinfo->ver >= 20)
      setup_fs_payload_gfx20(*this, v, source_depth_to_render_target);
   else if (v.devinfo->ver >= 6)
      setup_fs_payload_gfx6(*this, v, source_depth_to_render_target);
   else
      setup_fs_payload_gfx4(*this, v, source_depth_to_render_target,
                            runtime_check_aads_emit);
}

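/* Payload layout for compute shader threads: R0 is the thread header. On
 * Gfx12.5+ the subgroup ID lives in the header and any generated local
 * invocation IDs (and BTD stack IDs) follow; on earlier platforms the
 * subgroup ID is provided as a uniform instead.
 */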
elk_cs_thread_payload::elk_cs_thread_payload(const elk_fs_visitor &v)
{
   struct elk_cs_prog_data *prog_data = elk_cs_prog_data(v.prog_data);

   unsigned r = reg_unit(v.devinfo);

   /* See nir_setup_uniforms for subgroup_id in earlier versions. */
   if (v.devinfo->verx10 >= 125) {
      subgroup_id_ = elk_ud1_grf(0, 2);

      for (int i = 0; i < 3; i++) {
         if (prog_data->generate_local_id & (1 << i)) {
            local_invocation_id[i] = elk_uw8_grf(r, 0);
            r += reg_unit(v.devinfo);
            if (v.devinfo->ver < 20 && v.dispatch_width == 32)
               r += reg_unit(v.devinfo);
         } else {
            local_invocation_id[i] = elk_imm_uw(0);
         }
      }

      /* TODO: Fill out uses_btd_stack_ids automatically */
      if (prog_data->uses_btd_stack_ids)
         r += reg_unit(v.devinfo);
   }

   num_regs = r;
}

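/* Load the subgroup ID into \p dest, either by masking it out of the thread
 * header (Gfx12.5+) or by reading the uniform slot set up by
 * nir_setup_uniforms on earlier platforms.
 */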
void
elk_cs_thread_payload::load_subgroup_id(const fs_builder &bld,
                                        elk_fs_reg &dest) const
{
   auto devinfo = bld.shader->devinfo;
   dest = retype(dest, ELK_REGISTER_TYPE_UD);

   if (subgroup_id_.file != BAD_FILE) {
      assert(devinfo->verx10 >= 125);
      bld.AND(dest, subgroup_id_, elk_imm_ud(INTEL_MASK(7, 0)));
   } else {
      assert(devinfo->verx10 < 125);
      assert(gl_shader_stage_is_compute(bld.shader->stage));
      int index = elk_get_subgroup_id_param_index(devinfo,
                                                  bld.shader->stage_prog_data);
      bld.MOV(dest, elk_fs_reg(UNIFORM, index, ELK_REGISTER_TYPE_UD));
   }
}