/*
 * Copyright 2024 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "agx_linker.h"
#include <stddef.h>
#include <stdint.h>
#include "util/ralloc.h"
#include "agx_abi.h"
#include "agx_compile.h"
#include "agx_device.h"
#include "agx_pack.h"
#include "agx_scratch.h"

/*
 * When sample shading is used with a non-monolithic fragment shader, we
 * fast-link a program with the following structure:
 *
 *    Fragment prolog;
 *
 *    for (u16 sample_bit = 1; sample_bit < (1 << # of samples); sample_bit <<= 1) {
 *       API fragment shader;
 *       Fragment epilog;
 *    }
 *
 * This means the prolog runs per-pixel but the fragment shader and epilog run
 * per-sample. To do this, we need to generate the loop on the fly. The
 * following binary sequences form the relevant loop.
 */
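
/*
 * Concretely, at 4x sample shading the loop body runs with sample_bit =
 * 1, 2, 4, 8: the prolog executes once per pixel, while the API shader and
 * epilog execute once per sample, with sample_bit selecting the sample
 * currently being shaded.
 */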

static_assert(AGX_ABI_FIN_SAMPLE_MASK == 2, "r1l known");

/* clang-format off */
static const uint8_t sample_loop_header[] = {
   /* mov_imm r0l, 0x0, 0b0 */
   0x62, 0x00, 0x00, 0x00,

   /* mov_imm r1l, 0x1, 0b0 */
   0x62, 0x04, 0x01, 0x00,
};
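
/*
 * Per existing reverse engineering of the ISA, r0l holds the execution-state
 * counter that while_icmp/pop_exec in the footer operate on, so the header
 * clears it. r1l gets the bit for the first sample; since r1l is
 * AGX_ABI_FIN_SAMPLE_MASK (asserted above), the epilog presumably reads the
 * current sample bit as its sample mask each iteration.
 */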

#define STOP \
   /* stop */ \
   0x88, 0x00, \
   \
   /* trap */ \
   0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, \
   0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00,

static const uint8_t stop[] = {STOP};
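
/* The stop alone ends the shader; the eight traps after it are presumably
 * defensive padding, so that execution running past the stop faults loudly
 * instead of interpreting whatever follows in memory.
 */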

static const uint8_t sample_loop_footer[] = {
   /* iadd r1l, 0, r1l, lsl 1 */
   0x0e, 0x04, 0x00, 0x20, 0x84, 0x00, 0x00, 0x00,

   /* while_icmp r0l, ult, r1l, 0, 1 */
   0x52, 0x2c, 0x42, 0x00, 0x00, 0x00,

   /* jmp_exec_any */
   0x00, 0xc0, 0x00, 0x00, 0x00, 0x00,

   /* pop_exec r0l, 1 */
   0x52, 0x0e, 0x00, 0x00, 0x00, 0x00,

   STOP
};

/* Offset in sample_loop_footer to the jmp_exec_any's target */
#define SAMPLE_LOOP_FOOTER_JMP_PATCH_OFFS (16)

/* Offset of the jmp_exec_any, for calculating the PC offsets */
#define SAMPLE_LOOP_FOOTER_JMP_OFFS (14)

/* Offset in sample_loop_footer to the while_icmp's sample count immediate. Bit
 * position in the byte given by the shift.
 */
#define SAMPLE_LOOP_FOOTER_COUNT_PATCH_OFFS (11)
#define SAMPLE_LOOP_FOOTER_COUNT_SHIFT (4)
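
/*
 * These offsets follow from the footer's layout: the iadd occupies bytes 0-7,
 * the while_icmp bytes 8-13, the jmp_exec_any bytes 14-19, and the pop_exec
 * bytes 20-25. The jump's 32-bit target therefore sits 2 bytes into the
 * jmp_exec_any (offset 16), and the sample count immediate sits in the high
 * nibble of byte 3 of the while_icmp (offset 11, shift 4).
 */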
/* clang-format on */

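/*
 * Example (hypothetical call site; the shader part names are invented for
 * illustration): fast-linking a non-monolithic fragment shader at 4x sample
 * shading:
 *
 *    struct agx_linked_shader linked = {0};
 *    agx_fast_link(&linked, dev, true, fs_main, fs_prolog, fs_epilog, 4);
 *
 * Passing nr_samples_shaded = 0 omits the sample loop, and a NULL prolog or
 * epilog is simply skipped when laying out the binary.
 */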
void
agx_fast_link(struct agx_linked_shader *linked, struct agx_device *dev,
              bool fragment, struct agx_shader_part *main,
              struct agx_shader_part *prolog, struct agx_shader_part *epilog,
              unsigned nr_samples_shaded)
{
   size_t size = 0;
   unsigned nr_gprs = 0, scratch_size = 0;
   bool reads_tib = false, writes_sample_mask = false,
        disable_tri_merging = false, tag_write_disable = true;

   if (nr_samples_shaded) {
      size += sizeof(sample_loop_header);

      if (nr_samples_shaded > 1)
         size += sizeof(sample_loop_footer);
      else
         size += sizeof(stop);
   }

   struct agx_shader_part *parts[] = {prolog, main, epilog};

   for (unsigned i = 0; i < ARRAY_SIZE(parts); ++i) {
      struct agx_shader_part *part = parts[i];
      if (!part)
         continue;

      size += part->info.main_size;

      nr_gprs = MAX2(nr_gprs, part->info.nr_gprs);
      scratch_size = MAX2(scratch_size, part->info.scratch_size);
      reads_tib |= part->info.reads_tib;
      writes_sample_mask |= part->info.writes_sample_mask;
      disable_tri_merging |= part->info.disable_tri_merging;
      linked->uses_base_param |= part->info.uses_base_param;
      linked->uses_txf |= part->info.uses_txf;
      tag_write_disable &= part->info.tag_write_disable;
   }

   assert(size > 0 && "must stop");

   linked->bo = agx_bo_create(dev, size, 0, AGX_BO_EXEC | AGX_BO_LOW_VA,
                              "Linked executable");
   uint8_t *linked_map = agx_bo_map(linked->bo);

   size_t offset = 0;

   /* FS prolog happens per-pixel, outside the sample loop */
   if (prolog) {
      size_t sz = prolog->info.main_size;
      memcpy(linked_map + offset, prolog->binary, sz);
      offset += sz;
   }

   if (nr_samples_shaded) {
      memcpy(linked_map + offset, sample_loop_header,
             sizeof(sample_loop_header));
      offset += sizeof(sample_loop_header);
   }

   size_t sample_loop_begin = offset;

   /* Main shader and epilog happen in the sample loop, so start from i=1 */
   for (unsigned i = 1; i < ARRAY_SIZE(parts); ++i) {
      struct agx_shader_part *part = parts[i];
      if (!part)
         continue;

      size_t sz = part->info.main_size;
      memcpy(linked_map + offset, part->binary + part->info.main_offset, sz);
      offset += sz;
   }

   if (nr_samples_shaded > 1) {
      assert(sample_loop_footer[SAMPLE_LOOP_FOOTER_COUNT_PATCH_OFFS] == 0);

      /* Make a stack copy of the footer so we can efficiently patch it */
      uint8_t footer[sizeof(sample_loop_footer)];
      memcpy(footer, sample_loop_footer, sizeof(footer));

      /* Patch in sample end */
      uint8_t end = (1u << nr_samples_shaded) - 1;
      footer[SAMPLE_LOOP_FOOTER_COUNT_PATCH_OFFS] =
         end << SAMPLE_LOOP_FOOTER_COUNT_SHIFT;
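
      /* With the lsl in the footer, r1l walks 1, 2, 4, ... After the last
       * sample it becomes 1 << nr_samples_shaded, which fails the ult
       * comparison against this all-samples mask (e.g. 0b1111 for 4 samples),
       * ending the loop.
       */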

      /* Patch in the branch target */
      int32_t loop_size = offset - sample_loop_begin;
      int32_t branch_offs = -(SAMPLE_LOOP_FOOTER_JMP_OFFS + loop_size);
      int32_t *target = (int32_t *)(footer + SAMPLE_LOOP_FOOTER_JMP_PATCH_OFFS);
      *target = branch_offs;
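
      /* The target is encoded relative to the jmp_exec_any itself, which sits
       * SAMPLE_LOOP_FOOTER_JMP_OFFS bytes past the loop body, so jumping back
       * by that amount plus loop_size lands exactly on sample_loop_begin.
       */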

      /* Copy in the patched footer */
      memcpy(linked_map + offset, footer, sizeof(footer));
      offset += sizeof(footer);
   } else if (nr_samples_shaded) {
      /* Just end after the first sample, no need to loop for a single sample */
      memcpy(linked_map + offset, stop, sizeof(stop));
      offset += sizeof(stop);
   }

   assert(offset == size);

   agx_pack(&linked->shader, USC_SHADER, cfg) {
      cfg.code = agx_usc_addr(dev, linked->bo->va->addr);
      cfg.unk_2 = fragment ? 2 : 3;

      if (fragment)
         cfg.loads_varyings = linked->cf.nr_bindings > 0;
   }

   agx_pack(&linked->regs, USC_REGISTERS, cfg) {
      cfg.register_count = nr_gprs;
      cfg.unk_1 = fragment;
      cfg.spill_size = scratch_size ? agx_scratch_get_bucket(scratch_size) : 0;
      cfg.unk_4 = 1;
   }

   if (fragment) {
      agx_pack(&linked->fragment_props, USC_FRAGMENT_PROPERTIES, cfg) {
         cfg.early_z_testing = !writes_sample_mask;
         cfg.unk_2 = true;
         cfg.unk_3 = 0xf;
         cfg.unk_4 = 0x2;
         cfg.unk_5 = 0x0;
      }

      agx_pack(&linked->fragment_control, FRAGMENT_CONTROL, cfg) {
         cfg.tag_write_disable = tag_write_disable;
         cfg.disable_tri_merging = disable_tri_merging;

         if (reads_tib && writes_sample_mask)
            cfg.pass_type = AGX_PASS_TYPE_TRANSLUCENT_PUNCH_THROUGH;
         else if (reads_tib)
            cfg.pass_type = AGX_PASS_TYPE_TRANSLUCENT;
         else if (writes_sample_mask)
            cfg.pass_type = AGX_PASS_TYPE_PUNCH_THROUGH;
         else
            cfg.pass_type = AGX_PASS_TYPE_OPAQUE;
      }

      /* Merge the CF binding lists from the prolog to handle cull distance */
      memcpy(&linked->cf, &main->info.varyings.fs,
             sizeof(struct agx_varyings_fs));
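
      /* The prolog only has varyings of its own when it reads them itself,
       * as with the cull distance lowering mentioned above; in that case its
       * bindings must ride along with main's.
       */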

      struct agx_varyings_fs *prolog_vary =
         prolog ? &prolog->info.varyings.fs : NULL;

      if (prolog_vary && prolog_vary->nr_bindings) {
         assert(!prolog_vary->reads_z);
         linked->cf.nr_cf = MAX2(linked->cf.nr_cf, prolog_vary->nr_cf);

         assert(linked->cf.nr_bindings + prolog_vary->nr_bindings <=
                   ARRAY_SIZE(linked->cf.bindings) &&
                "bounded by # of coeff registers");

         memcpy(linked->cf.bindings + linked->cf.nr_bindings,
                prolog_vary->bindings,
                sizeof(struct agx_cf_binding) * prolog_vary->nr_bindings);

         linked->cf.nr_bindings += prolog_vary->nr_bindings;
      }

      agx_pack(&linked->osel, OUTPUT_SELECT, cfg) {
         cfg.varyings = linked->cf.nr_bindings > 0;
         cfg.frag_coord_z = linked->cf.reads_z;
      }
   }
}