/*
 * Copyright 2024 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "agx_linker.h"
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "util/ralloc.h"
#include "agx_abi.h"
#include "agx_compile.h"
#include "agx_device.h"
#include "agx_pack.h"
#include "agx_scratch.h"

/*
 * When sample shading is used with a non-monolithic fragment shader, we
 * fast-link a program with the following structure:
 *
 *    Fragment prolog;
 *
 *    for (u16 sample_bit = 1; sample_bit < (1 << # of samples);
 *         sample_bit <<= 1) {
 *       API fragment shader;
 *       Fragment epilog;
 *    }
 *
 * This means the prolog runs per-pixel but the fragment shader and epilog run
 * per-sample. To do this, we need to generate the loop on the fly. The
 * following binary sequences form the relevant loop.
 */

32 static_assert(AGX_ABI_FIN_SAMPLE_MASK == 2, "r1l known");
33 
/* clang-format off */

/* Emitted once, after the prolog and before the first copy of the loop body. */
static const uint8_t sample_loop_header[] = {
   /* mov_imm r0l, 0x0, 0b0 */
   0x62, 0x00, 0x00, 0x00,

   /* mov_imm r1l, 0x1, 0b0 (sample_bit starts at 1; the immediate byte is 0x01) */
   0x62, 0x04, 0x01, 0x00,
};

/* Byte sequence terminating a program: a stop followed by padding traps. Kept
 * as a macro so it can be spliced into other initializer lists.
 */
#define STOP                                                                   \
   /* stop */                                                                  \
   0x88, 0x00,                                                                 \
                                                                               \
   /* trap */                                                                  \
   0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00,                             \
   0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00,

/* Standalone terminator, used when only a single sample is shaded. */
static const uint8_t stop[] = {STOP};

/* Emitted after the loop body when more than one sample is shaded. Two fields
 * are patched at link time: the sample-count immediate of the while_icmp and
 * the relative target of the jmp_exec_any (see the offsets below).
 */
static const uint8_t sample_loop_footer[] = {
   /* iadd r1l, 0, r1l, lsl 1 */
   0x0e, 0x04, 0x00, 0x20, 0x84, 0x00, 0x00, 0x00,

   /* while_icmp r0l, ult, r1h, 0, 1
    *
    * NOTE(review): the loop counter set up by the header and advanced by the
    * iadd above is r1l, yet this disassembly names r1h -- confirm whether the
    * comment or the encoding is the intended one.
    */
   0x52, 0x2c, 0x42, 0x00, 0x00, 0x00,

   /* jmp_exec_any */
   0x00, 0xc0, 0x00, 0x00, 0x00, 0x00,

   /* pop_exec r0l, 1 */
   0x52, 0x0e, 0x00, 0x00, 0x00, 0x00,

   STOP
};

/* Offset in sample_loop_footer to the jmp_exec_any's target */
#define SAMPLE_LOOP_FOOTER_JMP_PATCH_OFFS (16)

/* Offset of the jmp_exec_any, for calculating the PC offsets */
#define SAMPLE_LOOP_FOOTER_JMP_OFFS (14)

/* Offset in sample_loop_footer to the while_icmp's sample count immediate. Bit
 * position in the byte given by the shift.
 */
#define SAMPLE_LOOP_FOOTER_COUNT_PATCH_OFFS (11)
#define SAMPLE_LOOP_FOOTER_COUNT_SHIFT (4)

/* The offsets above are computed by hand from the instruction sizes; at least
 * make sure the patched fields cannot fall outside the footer.
 */
static_assert(SAMPLE_LOOP_FOOTER_JMP_PATCH_OFFS + sizeof(int32_t) <=
                 sizeof(sample_loop_footer),
              "jump target patch must lie inside the footer");
static_assert(SAMPLE_LOOP_FOOTER_COUNT_PATCH_OFFS < sizeof(sample_loop_footer),
              "sample count patch must lie inside the footer");
/* clang-format on */

82 void
agx_fast_link(struct agx_linked_shader * linked,struct agx_device * dev,bool fragment,struct agx_shader_part * main,struct agx_shader_part * prolog,struct agx_shader_part * epilog,unsigned nr_samples_shaded)83 agx_fast_link(struct agx_linked_shader *linked, struct agx_device *dev,
84               bool fragment, struct agx_shader_part *main,
85               struct agx_shader_part *prolog, struct agx_shader_part *epilog,
86               unsigned nr_samples_shaded)
87 {
88    size_t size = 0;
89    unsigned nr_gprs = 0, scratch_size = 0;
90    bool reads_tib = false, writes_sample_mask = false,
91         disable_tri_merging = false, tag_write_disable = true;
92 
93    if (nr_samples_shaded) {
94       size += sizeof(sample_loop_header);
95 
96       if (nr_samples_shaded > 1)
97          size += sizeof(sample_loop_footer);
98       else
99          size += sizeof(stop);
100    }
101 
102    struct agx_shader_part *parts[] = {prolog, main, epilog};
103 
104    for (unsigned i = 0; i < ARRAY_SIZE(parts); ++i) {
105       struct agx_shader_part *part = parts[i];
106       if (!part)
107          continue;
108 
109       size += part->info.main_size;
110 
111       nr_gprs = MAX2(nr_gprs, part->info.nr_gprs);
112       scratch_size = MAX2(scratch_size, part->info.scratch_size);
113       reads_tib |= part->info.reads_tib;
114       writes_sample_mask |= part->info.writes_sample_mask;
115       disable_tri_merging |= part->info.disable_tri_merging;
116       linked->uses_base_param |= part->info.uses_base_param;
117       linked->uses_txf |= part->info.uses_txf;
118       tag_write_disable &= part->info.tag_write_disable;
119    }
120 
121    assert(size > 0 && "must stop");
122 
123    linked->bo = agx_bo_create(dev, size, 0, AGX_BO_EXEC | AGX_BO_LOW_VA,
124                               "Linked executable");
125    uint8_t *linked_map = agx_bo_map(linked->bo);
126 
127    size_t offset = 0;
128 
129    /* FS prolog happens per-pixel, outside the sample loop */
130    if (prolog) {
131       size_t sz = prolog->info.main_size;
132       memcpy(linked_map + offset, prolog->binary, sz);
133       offset += sz;
134    }
135 
136    if (nr_samples_shaded) {
137       memcpy(linked_map + offset, sample_loop_header,
138              sizeof(sample_loop_header));
139       offset += sizeof(sample_loop_header);
140    }
141 
142    size_t sample_loop_begin = offset;
143 
144    /* Main shader and epilog happen in the sample loop, so start from i=1 */
145    for (unsigned i = 1; i < ARRAY_SIZE(parts); ++i) {
146       struct agx_shader_part *part = parts[i];
147       if (!part)
148          continue;
149 
150       size_t sz = part->info.main_size;
151       memcpy(linked_map + offset, part->binary + part->info.main_offset, sz);
152       offset += sz;
153    }
154 
155    if (nr_samples_shaded > 1) {
156       assert(sample_loop_footer[SAMPLE_LOOP_FOOTER_COUNT_PATCH_OFFS] == 0);
157 
158       /* Make a stack copy of the footer so we can efficiently patch it */
159       uint8_t footer[sizeof(sample_loop_footer)];
160       memcpy(footer, sample_loop_footer, sizeof(footer));
161 
162       /* Patch in sample end */
163       uint8_t end = (1u << nr_samples_shaded) - 1;
164       footer[SAMPLE_LOOP_FOOTER_COUNT_PATCH_OFFS] =
165          end << SAMPLE_LOOP_FOOTER_COUNT_SHIFT;
166 
167       /* Patch in the branch target */
168       int32_t loop_size = offset - sample_loop_begin;
169       int32_t branch_offs = -(SAMPLE_LOOP_FOOTER_JMP_OFFS + loop_size);
170       int32_t *target = (int32_t *)(footer + SAMPLE_LOOP_FOOTER_JMP_PATCH_OFFS);
171       *target = branch_offs;
172 
173       /* Copy in the patched footer */
174       memcpy(linked_map + offset, footer, sizeof(footer));
175       offset += sizeof(footer);
176    } else if (nr_samples_shaded) {
177       /* Just end after the first sample, no need to loop for a single sample */
178       memcpy(linked_map + offset, stop, sizeof(stop));
179       offset += sizeof(stop);
180    }
181 
182    assert(offset == size);
183 
184    agx_pack(&linked->shader, USC_SHADER, cfg) {
185       cfg.code = agx_usc_addr(dev, linked->bo->va->addr);
186       cfg.unk_2 = fragment ? 2 : 3;
187 
188       if (fragment)
189          cfg.loads_varyings = linked->cf.nr_bindings > 0;
190    }
191 
192    agx_pack(&linked->regs, USC_REGISTERS, cfg) {
193       cfg.register_count = nr_gprs;
194       cfg.unk_1 = fragment;
195       cfg.spill_size = scratch_size ? agx_scratch_get_bucket(scratch_size) : 0;
196       cfg.unk_4 = 1;
197    }
198 
199    if (fragment) {
200       agx_pack(&linked->fragment_props, USC_FRAGMENT_PROPERTIES, cfg) {
201          cfg.early_z_testing = !writes_sample_mask;
202          cfg.unk_2 = true;
203          cfg.unk_3 = 0xf;
204          cfg.unk_4 = 0x2;
205          cfg.unk_5 = 0x0;
206       }
207 
208       agx_pack(&linked->fragment_control, FRAGMENT_CONTROL, cfg) {
209          cfg.tag_write_disable = tag_write_disable;
210          cfg.disable_tri_merging = disable_tri_merging;
211 
212          if (reads_tib && writes_sample_mask)
213             cfg.pass_type = AGX_PASS_TYPE_TRANSLUCENT_PUNCH_THROUGH;
214          else if (reads_tib)
215             cfg.pass_type = AGX_PASS_TYPE_TRANSLUCENT;
216          else if (writes_sample_mask)
217             cfg.pass_type = AGX_PASS_TYPE_PUNCH_THROUGH;
218          else
219             cfg.pass_type = AGX_PASS_TYPE_OPAQUE;
220       }
221 
222       /* Merge the CF binding lists from the prolog to handle cull distance */
223       memcpy(&linked->cf, &main->info.varyings.fs,
224              sizeof(struct agx_varyings_fs));
225 
226       struct agx_varyings_fs *prolog_vary =
227          prolog ? &prolog->info.varyings.fs : NULL;
228 
229       if (prolog_vary && prolog_vary->nr_bindings) {
230          assert(!prolog_vary->reads_z);
231          linked->cf.nr_cf = MAX2(linked->cf.nr_cf, prolog_vary->nr_cf);
232 
233          assert(linked->cf.nr_bindings + prolog_vary->nr_bindings <=
234                    ARRAY_SIZE(linked->cf.bindings) &&
235                 "bounded by # of coeff registers");
236 
237          memcpy(linked->cf.bindings + linked->cf.nr_bindings,
238                 prolog_vary->bindings,
239                 sizeof(struct agx_cf_binding) * prolog_vary->nr_bindings);
240 
241          linked->cf.nr_bindings += prolog_vary->nr_bindings;
242       }
243 
244       agx_pack(&linked->osel, OUTPUT_SELECT, cfg) {
245          cfg.varyings = linked->cf.nr_bindings > 0;
246          cfg.frag_coord_z = linked->cf.reads_z;
247       }
248    }
249 }
250