/*
 * Copyright © 2006-2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "elk_fs.h"
#include "elk_fs_builder.h"

using namespace elk;

elk_vs_thread_payload::elk_vs_thread_payload(const elk_fs_visitor &v)
{
   unsigned r = 0;

   /* R0: Thread header. */
   r += reg_unit(v.devinfo);

   /* R1: URB handles. */
   urb_handles = elk_ud8_grf(r, 0);
   r += reg_unit(v.devinfo);

   num_regs = r;
}

elk_tcs_thread_payload::elk_tcs_thread_payload(const elk_fs_visitor &v)
{
   struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(v.prog_data);
   struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(v.prog_data);
   struct elk_tcs_prog_key *tcs_key = (struct elk_tcs_prog_key *) v.key;

   if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH) {
      patch_urb_output = elk_ud1_grf(0, 0);
      primitive_id = elk_vec1_grf(0, 1);

      /* R1-R4 contain the ICP handles. */
      icp_handle_start = elk_ud8_grf(1, 0);

      num_regs = 5;
   } else {
      assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
      assert(tcs_key->input_vertices <= ELK_MAX_TCS_INPUT_VERTICES);

      unsigned r = 0;

      r += reg_unit(v.devinfo);

      patch_urb_output = elk_ud8_grf(r, 0);
      r += reg_unit(v.devinfo);

      if (tcs_prog_data->include_primitive_id) {
         primitive_id = elk_vec8_grf(r, 0);
         r += reg_unit(v.devinfo);
      }

      /* ICP handles occupy the next 1-32 registers, one per input vertex. */
      icp_handle_start = elk_ud8_grf(r, 0);
      r += elk_tcs_prog_key_input_vertices(tcs_key) * reg_unit(v.devinfo);

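      /* For example, a 4-vertex patch with primitive ID enabled on a
       * 32B-GRF platform (reg_unit() == 1) comes to 1 (header) +
       * 1 (patch URB output) + 1 (primitive ID) + 4 (ICP handles) == 7
       * payload registers.
       */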
      num_regs = r;
   }
}

elk_tes_thread_payload::elk_tes_thread_payload(const elk_fs_visitor &v)
{
   unsigned r = 0;

   /* R0: Thread header. */
   patch_urb_input = retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UD);
   primitive_id = elk_vec1_grf(0, 1);
   r += reg_unit(v.devinfo);

   /* R1-3: gl_TessCoord.xyz. */
   for (unsigned i = 0; i < 3; i++) {
      coords[i] = elk_vec8_grf(r, 0);
      r += reg_unit(v.devinfo);
   }

   /* R4: URB output handles. */
   urb_output = elk_ud8_grf(r, 0);
   r += reg_unit(v.devinfo);

   num_regs = r;
}

elk_gs_thread_payload::elk_gs_thread_payload(elk_fs_visitor &v)
{
   struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(v.prog_data);
   struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(v.prog_data);
   const fs_builder bld = fs_builder(&v).at_end();

   /* R0: thread header. */
   unsigned r = reg_unit(v.devinfo);

   /* R1: output URB handles (the low 16 bits, or the low 24 on Gfx20+). */
   urb_handles = bld.vgrf(ELK_REGISTER_TYPE_UD);
   bld.AND(urb_handles, elk_ud8_grf(r, 0),
           v.devinfo->ver >= 20 ? elk_imm_ud(0xFFFFFF) : elk_imm_ud(0xFFFF));

   /* R1: Instance ID, stored in bits 31:27 (so at most 32 GS instances). */
   instance_id = bld.vgrf(ELK_REGISTER_TYPE_UD);
   bld.SHR(instance_id, elk_ud8_grf(r, 0), elk_imm_ud(27u));

   r += reg_unit(v.devinfo);

   if (gs_prog_data->include_primitive_id) {
      primitive_id = elk_ud8_grf(r, 0);
      r += reg_unit(v.devinfo);
   }

   /* Always enable VUE handles so we can safely use pull model if needed.
    *
    * The push model for a GS uses a ton of register space even for trivial
    * scenarios with just a few inputs, so just make things easier and a bit
    * safer by always having pull model available.
    */
   gs_prog_data->base.include_vue_handles = true;

   /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
   icp_handle_start = elk_ud8_grf(r, 0);
   r += v.nir->info.gs.vertices_in * reg_unit(v.devinfo);

   num_regs = r;

   /* Use a maximum of 24 registers for push-model inputs. */
   const unsigned max_push_components = 24;

   /* If pushing our inputs would take too many registers, reduce the URB
    * read length (which is in HWords; each HWord of per-vertex input
    * expands to 8 SIMD8 registers in the payload) and resort to pulling.
    *
    * Note that the GS reads <URB Read Length> HWords for every vertex, so
    * we have to multiply by VerticesIn to obtain the total storage
    * requirement.
    */
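   /* For example, a triangle-input GS (vertices_in == 3) with a read length
    * of 2 HWords would push 8 * 2 * 3 == 48 registers, so the read length
    * drops to ROUND_DOWN_TO(24 / 3, 8) / 8 == 1; with triangle adjacency
    * (vertices_in == 6) it rounds down to 0 and all inputs are pulled.
    */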
   if (8 * vue_prog_data->urb_read_length * v.nir->info.gs.vertices_in >
       max_push_components) {
      vue_prog_data->urb_read_length =
         ROUND_DOWN_TO(max_push_components / v.nir->info.gs.vertices_in, 8) / 8;
   }
}

static inline void
setup_fs_payload_gfx20(elk_fs_thread_payload &payload,
                       const elk_fs_visitor &v,
                       bool &source_depth_to_render_target)
{
   struct elk_wm_prog_data *prog_data = elk_wm_prog_data(v.prog_data);
   const unsigned payload_width = 16;
   assert(v.dispatch_width % payload_width == 0);
   assert(v.devinfo->ver >= 20);

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R0-1: PS thread payload header, masks and pixel X/Y coordinates. */
      payload.num_regs++;
      payload.subspan_coord_reg[j] = payload.num_regs++;
   }

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R2-13: Barycentric interpolation coordinates.  These appear
       * in the same order that they appear in the elk_barycentric_mode
       * enum.  Each set of coordinates occupies 2 64B registers per
       * SIMD16 half.  Coordinates only appear if they were enabled
       * using the "Barycentric Interpolation Mode" bits in WM_STATE.
       */
      for (int i = 0; i < ELK_BARYCENTRIC_MODE_COUNT; ++i) {
         if (prog_data->barycentric_interp_modes & (1 << i)) {
            payload.barycentric_coord_reg[i][j] = payload.num_regs;
            payload.num_regs += payload_width / 4;
         }
      }

      /* R14: Interpolated depth if "Pixel Shader Uses Source Depth" is set. */
      if (prog_data->uses_src_depth) {
         payload.source_depth_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R15: Interpolated W if "Pixel Shader Uses Source W" is set. */
      if (prog_data->uses_src_w) {
         payload.source_w_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R16: MSAA input coverage mask if "Pixel Shader Uses Input
       * Coverage Mask" is set.
       */
      if (prog_data->uses_sample_mask) {
         payload.sample_mask_in_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R19: MSAA position XY offsets if "Position XY Offset Select"
       * is either POSOFFSET_CENTROID or POSOFFSET_SAMPLE.  Note that
       * this is delivered as a single SIMD32 vector, inconsistently
       * with most other PS payload fields.
       */
      if (prog_data->uses_pos_offset && j == 0) {
         for (unsigned k = 0; k < 2; k++) {
            payload.sample_pos_reg[k] = payload.num_regs;
            payload.num_regs++;
         }
      }
   }

   if (prog_data->uses_depth_w_coefficients) {
      assert(v.max_polygons == 1);
      payload.depth_w_coef_reg = payload.num_regs;
      payload.num_regs += 2;
   }

   if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}

static inline void
setup_fs_payload_gfx6(elk_fs_thread_payload &payload,
                      const elk_fs_visitor &v,
                      bool &source_depth_to_render_target)
{
   struct elk_wm_prog_data *prog_data = elk_wm_prog_data(v.prog_data);

   const unsigned payload_width = MIN2(16, v.dispatch_width);
   assert(v.dispatch_width % payload_width == 0);
   assert(v.devinfo->ver >= 6 && v.devinfo->ver < 20);

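   /* As a worked example, a SIMD16 shader using only perspective-pixel
    * barycentrics and source depth ends up with 1 (header) + 1 (subspan
    * coordinates) + 4 (barycentrics, payload_width / 4) + 2 (source depth,
    * payload_width / 8) == 8 payload registers.
    */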
   payload.num_regs = 0;

   /* R0: PS thread payload header. */
   payload.num_regs++;

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R1: masks, pixel X/Y coordinates. */
      payload.subspan_coord_reg[j] = payload.num_regs++;
   }

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R3-26: barycentric interpolation coordinates.  These appear in the
       * same order that they appear in the elk_barycentric_mode enum.  Each
       * set of coordinates occupies 2 registers if dispatch width == 8 and 4
       * registers if dispatch width == 16.  Coordinates only appear if they
       * were enabled using the "Barycentric Interpolation Mode" bits in
       * WM_STATE.
       */
      for (int i = 0; i < ELK_BARYCENTRIC_MODE_COUNT; ++i) {
         if (prog_data->barycentric_interp_modes & (1 << i)) {
            payload.barycentric_coord_reg[i][j] = payload.num_regs;
            payload.num_regs += payload_width / 4;
         }
      }

      /* R27-28: interpolated depth if uses source depth */
      if (prog_data->uses_src_depth) {
         payload.source_depth_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R29-30: interpolated W set if GFX6_WM_USES_SOURCE_W. */
      if (prog_data->uses_src_w) {
         payload.source_w_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R31: MSAA position offsets. */
      if (prog_data->uses_pos_offset) {
         payload.sample_pos_reg[j] = payload.num_regs;
         payload.num_regs++;
      }

      /* R32-33: MSAA input coverage mask */
      if (prog_data->uses_sample_mask) {
         assert(v.devinfo->ver >= 7);
         payload.sample_mask_in_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }
   }

   /* R66: Source Depth and/or W Attribute Vertex Deltas */
   if (prog_data->uses_depth_w_coefficients) {
      assert(v.max_polygons == 1);
      payload.depth_w_coef_reg = payload.num_regs;
      payload.num_regs++;
   }

   if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}

#undef P                        /* promoted depth */
#undef C                        /* computed */
#undef N                        /* non-promoted */

#define P 0
#define C 1
#define N 2

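/* One entry per combination of the ELK_WM_IZ_* lookup bits: mode selects
 * promoted (P), computed (C) or non-promoted (N) depth, and the remaining
 * fields say whether source depth, source depth written to the render
 * target, destination depth, and destination stencil appear in the thread
 * payload.
 */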
static const struct {
   GLuint mode:2;
   GLuint sd_present:1;
   GLuint sd_to_rt:1;
   GLuint dd_present:1;
   GLuint ds_present:1;
} wm_iz_table[ELK_WM_IZ_BIT_MAX] =
{
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { N, 1, 1, 0, 0 },
 { N, 0, 1, 0, 0 },
 { N, 0, 1, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { C, 0, 1, 1, 0 },
 { C, 0, 1, 1, 0 },
 { P, 0, 0, 0, 0 },
 { N, 1, 1, 0, 0 },
 { C, 0, 1, 1, 0 },
 { C, 0, 1, 1, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { N, 1, 1, 0, 0 },
 { N, 0, 1, 0, 0 },
 { N, 0, 1, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { C, 0, 1, 1, 0 },
 { C, 0, 1, 1, 0 },
 { P, 0, 0, 0, 0 },
 { N, 1, 1, 0, 0 },
 { C, 0, 1, 1, 0 },
 { C, 0, 1, 1, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { N, 1, 1, 0, 1 },
 { N, 0, 1, 0, 1 },
 { N, 0, 1, 0, 1 },
 { P, 0, 0, 0, 0 },
 { P, 0, 0, 0, 0 },
 { C, 0, 1, 1, 1 },
 { C, 0, 1, 1, 1 },
 { P, 0, 0, 0, 0 },
 { N, 1, 1, 0, 1 },
 { C, 0, 1, 1, 1 },
 { C, 0, 1, 1, 1 },
 { P, 0, 0, 0, 0 },
 { C, 0, 0, 0, 1 },
 { P, 0, 0, 0, 0 },
 { C, 0, 1, 0, 1 },
 { P, 0, 0, 0, 0 },
 { C, 1, 1, 0, 1 },
 { C, 0, 1, 0, 1 },
 { C, 0, 1, 0, 1 },
 { P, 0, 0, 0, 0 },
 { C, 1, 1, 1, 1 },
 { C, 0, 1, 1, 1 },
 { C, 0, 1, 1, 1 },
 { P, 0, 0, 0, 0 },
 { C, 1, 1, 1, 1 },
 { C, 0, 1, 1, 1 },
 { C, 0, 1, 1, 1 }
};

/**
 * Set up the Gfx4 fragment shader payload, where key->line_aa is ELK_NEVER,
 * ELK_ALWAYS or ELK_SOMETIMES, and key->iz_lookup is a bitmask of
 * ELK_WM_IZ_* flags.
 */
static inline void
setup_fs_payload_gfx4(elk_fs_thread_payload &payload,
                      const elk_fs_visitor &v,
                      bool &source_depth_to_render_target,
                      bool &runtime_check_aads_emit)
{
   assert(v.dispatch_width <= 16);

   struct elk_wm_prog_data *prog_data = elk_wm_prog_data(v.prog_data);
   elk_wm_prog_key *key = (elk_wm_prog_key *) v.key;

   GLuint reg = 1;
   bool kill_stats_promoted_workaround = false;
   int lookup = key->iz_lookup;

   assert(lookup < ELK_WM_IZ_BIT_MAX);

   /* Crazy workaround in the windowizer, which we need to track in
    * our register allocation and render target writes.  See the "If
    * statistics are enabled..." paragraph of 11.5.3.2: Early Depth
    * Test Cases [Pre-DevGT] of the 3D Pipeline - Windower B-Spec.
    */
   if (key->stats_wm &&
       (lookup & ELK_WM_IZ_PS_KILL_ALPHATEST_BIT) &&
       wm_iz_table[lookup].mode == P) {
      kill_stats_promoted_workaround = true;
   }

   payload.subspan_coord_reg[0] = reg++;

   if (wm_iz_table[lookup].sd_present || prog_data->uses_src_depth ||
       kill_stats_promoted_workaround) {
      payload.source_depth_reg[0] = reg;
      reg += 2;
   }

   if (wm_iz_table[lookup].sd_to_rt || kill_stats_promoted_workaround)
      source_depth_to_render_target = true;

   if (wm_iz_table[lookup].ds_present || key->line_aa != ELK_NEVER) {
      payload.aa_dest_stencil_reg[0] = reg;
      runtime_check_aads_emit =
         !wm_iz_table[lookup].ds_present && key->line_aa == ELK_SOMETIMES;
      reg++;
   }

   if (wm_iz_table[lookup].dd_present) {
      payload.dest_depth_reg[0] = reg;
      reg += 2;
   }

   payload.num_regs = reg;
}

#undef P                        /* promoted depth */
#undef C                        /* computed */
#undef N                        /* non-promoted */

elk_fs_thread_payload::elk_fs_thread_payload(const elk_fs_visitor &v,
                                             bool &source_depth_to_render_target,
                                             bool &runtime_check_aads_emit)
  : subspan_coord_reg(),
    source_depth_reg(),
    source_w_reg(),
    aa_dest_stencil_reg(),
    dest_depth_reg(),
    sample_pos_reg(),
    sample_mask_in_reg(),
    depth_w_coef_reg(),
    barycentric_coord_reg()
{
   if (v.devinfo->ver >= 20)
      setup_fs_payload_gfx20(*this, v, source_depth_to_render_target);
   else if (v.devinfo->ver >= 6)
      setup_fs_payload_gfx6(*this, v, source_depth_to_render_target);
   else
      setup_fs_payload_gfx4(*this, v, source_depth_to_render_target,
                            runtime_check_aads_emit);
}

elk_cs_thread_payload::elk_cs_thread_payload(const elk_fs_visitor &v)
{
   struct elk_cs_prog_data *prog_data = elk_cs_prog_data(v.prog_data);

   unsigned r = reg_unit(v.devinfo);

   /* See nir_setup_uniforms for subgroup_id in earlier versions. */
   if (v.devinfo->verx10 >= 125) {
      subgroup_id_ = elk_ud1_grf(0, 2);

      for (int i = 0; i < 3; i++) {
         if (prog_data->generate_local_id & (1 << i)) {
            local_invocation_id[i] = elk_uw8_grf(r, 0);
            r += reg_unit(v.devinfo);
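            /* A SIMD32 channel of UW IDs spans 64 bytes, i.e. a second
             * 32B GRF on platforms before Gfx20.
             */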
            if (v.devinfo->ver < 20 && v.dispatch_width == 32)
               r += reg_unit(v.devinfo);
         } else {
            local_invocation_id[i] = elk_imm_uw(0);
         }
      }

      /* TODO: Fill out uses_btd_stack_ids automatically */
      if (prog_data->uses_btd_stack_ids)
         r += reg_unit(v.devinfo);
   }

   num_regs = r;
}

void
elk_cs_thread_payload::load_subgroup_id(const fs_builder &bld,
                                        elk_fs_reg &dest) const
{
   auto devinfo = bld.shader->devinfo;
   dest = retype(dest, ELK_REGISTER_TYPE_UD);

   if (subgroup_id_.file != BAD_FILE) {
      assert(devinfo->verx10 >= 125);
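      /* The subgroup ID lives in bits 7:0 of R0.2; mask off the rest. */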
      bld.AND(dest, subgroup_id_, elk_imm_ud(INTEL_MASK(7, 0)));
   } else {
      assert(devinfo->verx10 < 125);
      assert(gl_shader_stage_is_compute(bld.shader->stage));
      int index = elk_get_subgroup_id_param_index(devinfo,
                                                  bld.shader->stage_prog_data);
      bld.MOV(dest, elk_fs_reg(UNIFORM, index, ELK_REGISTER_TYPE_UD));
   }
}