/*
 * Copyright 2024 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#pragma once

#include "compiler/libcl/libcl.h"
#include "compiler/shader_enums.h"
#include "agx_pack.h"

#define agx_push(ptr, T, cfg)                                                  \
   for (unsigned _loop = 0; _loop < 1;                                         \
        ++_loop, ptr = (GLOBAL void *)(((uintptr_t)ptr) + AGX_##T##_LENGTH))   \
      agx_pack(ptr, T, cfg)

#define agx_push_packed(ptr, src, T)                                           \
   static_assert(sizeof(src) == AGX_##T##_LENGTH);                             \
   memcpy(ptr, &src, sizeof(src));                                             \
   ptr = (GLOBAL void *)(((uintptr_t)ptr) + sizeof(src));

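/*
 * Usage sketch for the macros above: agx_push packs one T record at ptr using
 * the usual agx_pack body syntax, then advances ptr by AGX_<T>_LENGTH, so
 * consecutive pushes build a contiguous control stream. For example, the
 * launch encoding later in this header does:
 *
 *    agx_push(out, CDM_LAUNCH_WORD_1, cfg) {
 *       cfg.pipeline = usc;
 *    }
 *
 * agx_push_packed does the same for a record that was packed ahead of time.
 */
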
static inline enum agx_index_size
agx_translate_index_size(uint8_t size_B)
{
   /* Index sizes are encoded logarithmically */
   static_assert(__builtin_ctz(1) == AGX_INDEX_SIZE_U8);
   static_assert(__builtin_ctz(2) == AGX_INDEX_SIZE_U16);
   static_assert(__builtin_ctz(4) == AGX_INDEX_SIZE_U32);

   assert((size_B == 1) || (size_B == 2) || (size_B == 4));
   return __builtin_ctz(size_B);
}

static inline unsigned
agx_indices_to_B(unsigned x, enum agx_index_size size)
{
   return x << size;
}

static inline uint8_t
agx_index_size_to_B(enum agx_index_size size)
{
   return agx_indices_to_B(1, size);
}

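/*
 * Worked example for the helpers above: 16-bit indices give
 * agx_translate_index_size(2) == AGX_INDEX_SIZE_U16 == 1, so
 * agx_indices_to_B(100, AGX_INDEX_SIZE_U16) == 200 bytes and
 * agx_index_size_to_B(AGX_INDEX_SIZE_U16) == 2.
 */
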
struct agx_workgroup {
   uint32_t x, y, z;
};

static inline struct agx_workgroup
agx_workgroup(uint32_t x, uint32_t y, uint32_t z)
{
   return (struct agx_workgroup){.x = x, .y = y, .z = z};
}

static inline unsigned
agx_workgroup_threads(struct agx_workgroup wg)
{
   return wg.x * wg.y * wg.z;
}

struct agx_grid {
   enum agx_cdm_mode mode;
   union {
      uint32_t count[3];
      uint64_t ptr;
   };
};

static struct agx_grid
agx_3d(uint32_t x, uint32_t y, uint32_t z)
{
   return (struct agx_grid){.mode = AGX_CDM_MODE_DIRECT, .count = {x, y, z}};
}

static struct agx_grid
agx_1d(uint32_t x)
{
   return agx_3d(x, 1, 1);
}

static struct agx_grid
agx_grid_indirect(uint64_t ptr)
{
   return (struct agx_grid){.mode = AGX_CDM_MODE_INDIRECT_GLOBAL, .ptr = ptr};
}

static struct agx_grid
agx_grid_indirect_local(uint64_t ptr)
{
   return (struct agx_grid){.mode = AGX_CDM_MODE_INDIRECT_LOCAL, .ptr = ptr};
}

static inline bool
agx_is_indirect(struct agx_grid grid)
{
   return grid.mode != AGX_CDM_MODE_DIRECT;
}

enum agx_barrier {
   /* No barrier/cache operations needed */
   AGX_BARRIER_NONE = 0,

   /* Catch-all for all defined barriers. Because we have not yet
    * reverse-engineered the finer details here, this is the only barrier we
    * have.
    */
   AGX_BARRIER_ALL = (1 << 0),
};

struct agx_draw {
   struct agx_grid b;
   uint64_t index_buffer;
   uint32_t index_buffer_range_B;
   uint32_t start;
   uint32_t index_bias;
   uint32_t start_instance;

   /* Primitive restart enabled. If true, implies indexed. */
   bool restart;
   enum agx_index_size index_size;

   /* TODO: Optimize this boolean. We can't just check if index_buffer != 0,
    * because that breaks with null index buffers.
    */
   bool indexed;
};

static inline struct agx_draw
agx_draw_indirect(uint64_t ptr)
{
   return (struct agx_draw){.b = agx_grid_indirect(ptr)};
}

static inline struct agx_draw
agx_draw_indexed(uint32_t index_count, uint32_t instance_count,
                 uint32_t first_index, uint32_t index_bias,
                 uint32_t first_instance, uint64_t buf, uint32_t range_B,
                 enum agx_index_size index_size, bool restart)
{
   return (struct agx_draw){
      .b = agx_3d(index_count, instance_count, 1),
      .index_buffer = buf,
      .index_buffer_range_B = range_B,
      .start = first_index,
      .index_bias = index_bias,
      .start_instance = first_instance,
      .index_size = index_size,
      .restart = restart,
      .indexed = true,
   };
}

static inline struct agx_draw
agx_draw_indexed_indirect(uint64_t ptr, uint64_t buf, uint32_t range_B,
                          enum agx_index_size index_size, bool restart)
{
   return (struct agx_draw){
      .b = agx_grid_indirect(ptr),
      .index_buffer = buf,
      .index_buffer_range_B = range_B,
      .index_size = index_size,
      .restart = restart,
      .indexed = true,
   };
}

static inline unsigned
agx_draw_index_range_B(struct agx_draw d)
{
   uint range_B = d.index_buffer_range_B;
   if (!agx_is_indirect(d.b))
      range_B -= agx_indices_to_B(d.start, d.index_size);

   return range_B;
}

static inline unsigned
agx_draw_index_range_el(struct agx_draw d)
{
   assert(d.indexed);
   return agx_draw_index_range_B(d) >> d.index_size;
}

static inline uint64_t
agx_draw_index_buffer(struct agx_draw d)
{
   assert(d.indexed);

   uint64_t ib = d.index_buffer;
   if (!agx_is_indirect(d.b))
      ib += agx_indices_to_B(d.start, d.index_size);

   return ib;
}

static bool
agx_direct_draw_overreads_indices(struct agx_draw d)
{
   uint32_t range_B = agx_indices_to_B(d.start + d.b.count[0], d.index_size);
   return range_B > d.index_buffer_range_B;
}

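/*
 * Worked example: a direct draw of 100 u16 indices with start == 20 reads
 * agx_indices_to_B(20 + 100, AGX_INDEX_SIZE_U16) == 240 bytes, so
 * agx_direct_draw_overreads_indices reports an overread for any bound index
 * buffer range smaller than 240 bytes.
 */
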
enum agx_chip {
   AGX_CHIP_G13G,
   AGX_CHIP_G13X,
   AGX_CHIP_G14G,
   AGX_CHIP_G14X,
};

static inline GLOBAL uint32_t *
agx_cdm_launch(GLOBAL uint32_t *out, enum agx_chip chip, struct agx_grid grid,
               struct agx_workgroup wg,
               struct agx_cdm_launch_word_0_packed launch, uint32_t usc)
{
#ifndef __OPENCL_VERSION__
   struct agx_cdm_launch_word_0_packed mode;
   agx_pack(&mode, CDM_LAUNCH_WORD_0, cfg) {
      cfg.mode = grid.mode;
   }

   agx_merge(launch, mode, CDM_LAUNCH_WORD_0);
#endif

   agx_push_packed(out, launch, CDM_LAUNCH_WORD_0);

   agx_push(out, CDM_LAUNCH_WORD_1, cfg) {
      cfg.pipeline = usc;
   }

   if (chip == AGX_CHIP_G14X) {
      agx_push(out, CDM_UNK_G14X, cfg)
         ;
   }

   if (agx_is_indirect(grid)) {
      agx_push(out, CDM_INDIRECT, cfg) {
         cfg.address_hi = grid.ptr >> 32;
         cfg.address_lo = grid.ptr;
      }
   } else {
      agx_push(out, CDM_GLOBAL_SIZE, cfg) {
         cfg.x = grid.count[0];
         cfg.y = grid.count[1];
         cfg.z = grid.count[2];
      }
   }

   if (grid.mode != AGX_CDM_MODE_INDIRECT_LOCAL) {
      agx_push(out, CDM_LOCAL_SIZE, cfg) {
         cfg.x = wg.x;
         cfg.y = wg.y;
         cfg.z = wg.z;
      }
   }

   return out;
}

static inline GLOBAL uint32_t *
agx_vdm_draw(GLOBAL uint32_t *out, enum agx_chip chip, struct agx_draw draw,
             enum agx_primitive topology)
{
   uint64_t ib = draw.indexed ? agx_draw_index_buffer(draw) : 0;

   agx_push(out, INDEX_LIST, cfg) {
      cfg.primitive = topology;

      if (agx_is_indirect(draw.b)) {
         cfg.indirect_buffer_present = true;
      } else {
         cfg.instance_count_present = true;
         cfg.index_count_present = true;
         cfg.start_present = true;
      }

      if (draw.indexed) {
         cfg.restart_enable = draw.restart;
         cfg.index_buffer_hi = ib >> 32;
         cfg.index_size = draw.index_size;

         cfg.index_buffer_present = true;
         cfg.index_buffer_size_present = true;
      }
   }

   if (draw.indexed) {
      agx_push(out, INDEX_LIST_BUFFER_LO, cfg) {
         cfg.buffer_lo = ib;
      }
   }

   if (agx_is_indirect(draw.b)) {
      agx_push(out, INDEX_LIST_INDIRECT_BUFFER, cfg) {
         cfg.address_hi = draw.b.ptr >> 32;
         cfg.address_lo = draw.b.ptr & BITFIELD_MASK(32);
      }
   } else {
      agx_push(out, INDEX_LIST_COUNT, cfg) {
         cfg.count = draw.b.count[0];
      }

      agx_push(out, INDEX_LIST_INSTANCES, cfg) {
         cfg.count = draw.b.count[1];
      }

      agx_push(out, INDEX_LIST_START, cfg) {
         cfg.start = draw.indexed ? draw.index_bias : draw.start;
      }
   }

   if (draw.indexed) {
      agx_push(out, INDEX_LIST_BUFFER_SIZE, cfg) {
         cfg.size = align(agx_draw_index_range_B(draw), 4);
      }
   }

   return out;
}

static inline uint32_t
agx_vdm_draw_size(enum agx_chip chip, struct agx_draw draw)
{
   uint32_t size = AGX_INDEX_LIST_LENGTH;

   if (agx_is_indirect(draw.b)) {
      size += AGX_INDEX_LIST_INDIRECT_BUFFER_LENGTH;
   } else {
      size += AGX_INDEX_LIST_COUNT_LENGTH;
      size += AGX_INDEX_LIST_INSTANCES_LENGTH;
      size += AGX_INDEX_LIST_START_LENGTH;
   }

   if (draw.indexed) {
      size += AGX_INDEX_LIST_BUFFER_LO_LENGTH;
      size += AGX_INDEX_LIST_BUFFER_SIZE_LENGTH;
   }

   return size;
}

static inline GLOBAL uint32_t *
agx_cdm_barrier(GLOBAL uint32_t *out, enum agx_chip chip)
{
   agx_push(out, CDM_BARRIER, cfg) {
      cfg.unk_5 = true;
      cfg.unk_6 = true;
      cfg.unk_8 = true;
      // cfg.unk_11 = true;
      // cfg.unk_20 = true;
      // cfg.unk_24 = true; if clustered?
      if (chip == AGX_CHIP_G13X) {
         cfg.unk_4 = true;
         // cfg.unk_26 = true;
      }

      /* With multiple launches in the same CDM stream, we can hit cache
       * coherency (or synchronization?) issues. We see this with blits, which
       * need the PBE cache flushed and the texture cache invalidated between
       * dispatches. Until we know exactly what each bit means, set all of
       * these after every launch to be safe; we can revisit once the bits are
       * understood.
       */
      cfg.unk_0 = true;
      cfg.unk_1 = true;
      cfg.unk_2 = true;
      cfg.usc_cache_inval = true;
      cfg.unk_4 = true;
      cfg.unk_5 = true;
      cfg.unk_6 = true;
      cfg.unk_7 = true;
      cfg.unk_8 = true;
      cfg.unk_9 = true;
      cfg.unk_10 = true;
      cfg.unk_11 = true;
      cfg.unk_12 = true;
      cfg.unk_13 = true;
      cfg.unk_14 = true;
      cfg.unk_15 = true;
      cfg.unk_16 = true;
      cfg.unk_17 = true;
      cfg.unk_18 = true;
      cfg.unk_19 = true;
   }

   return out;
}

static inline GLOBAL uint32_t *
agx_vdm_return(GLOBAL uint32_t *out)
{
   agx_push(out, VDM_BARRIER, cfg) {
      cfg.returns = true;
   }

   return out;
}

static inline GLOBAL uint32_t *
agx_cdm_return(GLOBAL uint32_t *out)
{
   agx_push(out, CDM_STREAM_RETURN, cfg)
      ;

   return out;
}

static inline GLOBAL uint32_t *
agx_cdm_terminate(GLOBAL uint32_t *out)
{
   agx_push(out, CDM_STREAM_TERMINATE, _)
      ;

   return out;
}

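/*
 * Illustrative sketch, not part of the driver API: a hypothetical helper
 * showing how the CDM helpers above compose into a minimal control stream
 * with one direct dispatch, a conservative barrier, and a terminator. The
 * grid and workgroup sizes are arbitrary; the launch word and USC address
 * are assumed to have been packed by the caller.
 */
static inline GLOBAL uint32_t *
agx_example_cdm_stream(GLOBAL uint32_t *out, enum agx_chip chip,
                       struct agx_cdm_launch_word_0_packed launch,
                       uint32_t usc)
{
   struct agx_grid grid = agx_3d(256, 1, 1);
   struct agx_workgroup wg = agx_workgroup(64, 1, 1);

   /* Launch, then flush/invalidate conservatively, then end the stream */
   out = agx_cdm_launch(out, chip, grid, wg, launch, usc);
   out = agx_cdm_barrier(out, chip);
   out = agx_cdm_terminate(out);
   return out;
}
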
static inline GLOBAL uint32_t *
agx_vdm_terminate(GLOBAL uint32_t *out)
{
   agx_push(out, VDM_STREAM_TERMINATE, _)
      ;

   return out;
}

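/*
 * Illustrative sketch, not part of the driver API: a hypothetical helper
 * showing how an indexed direct draw is encoded with the VDM helpers above
 * and how the stream is closed. The index count, U16 index size, and zero
 * offsets are arbitrary choices for the example.
 */
static inline GLOBAL uint32_t *
agx_example_vdm_stream(GLOBAL uint32_t *out, enum agx_chip chip,
                       uint64_t index_buffer, uint32_t index_buffer_range_B,
                       enum agx_primitive topology)
{
   /* 36 u16 indices, 1 instance, no bias/offsets, primitive restart off */
   struct agx_draw draw = agx_draw_indexed(36, 1, 0, 0, 0, index_buffer,
                                           index_buffer_range_B,
                                           AGX_INDEX_SIZE_U16, false);

   /* The caller is expected to bind a large enough index buffer range */
   assert(!agx_direct_draw_overreads_indices(draw));

   out = agx_vdm_draw(out, chip, draw, topology);
   out = agx_vdm_terminate(out);
   return out;
}
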
static inline GLOBAL uint32_t *
agx_cdm_jump(GLOBAL uint32_t *out, uint64_t target)
{
   agx_push(out, CDM_STREAM_LINK, cfg) {
      cfg.target_lo = target & BITFIELD_MASK(32);
      cfg.target_hi = target >> 32;
   }

   return out;
}

static inline GLOBAL uint32_t *
agx_vdm_jump(GLOBAL uint32_t *out, uint64_t target)
{
   agx_push(out, VDM_STREAM_LINK, cfg) {
      cfg.target_lo = target & BITFIELD_MASK(32);
      cfg.target_hi = target >> 32;
   }

   return out;
}

static inline GLOBAL uint32_t *
agx_cs_jump(GLOBAL uint32_t *out, uint64_t target, bool vdm)
{
   return vdm ? agx_vdm_jump(out, target) : agx_cdm_jump(out, target);
}

static inline GLOBAL uint32_t *
agx_cdm_call(GLOBAL uint32_t *out, uint64_t target)
{
   agx_push(out, CDM_STREAM_LINK, cfg) {
      cfg.target_lo = target & BITFIELD_MASK(32);
      cfg.target_hi = target >> 32;
      cfg.with_return = true;
   }

   return out;
}

static inline GLOBAL uint32_t *
agx_vdm_call(GLOBAL uint32_t *out, uint64_t target)
{
   agx_push(out, VDM_STREAM_LINK, cfg) {
      cfg.target_lo = target & BITFIELD_MASK(32);
      cfg.target_hi = target >> 32;
      cfg.with_return = true;
   }

   return out;
}

#define AGX_MAX_LINKED_USC_SIZE                                                \
   (AGX_USC_PRESHADER_LENGTH + AGX_USC_FRAGMENT_PROPERTIES_LENGTH +            \
    AGX_USC_REGISTERS_LENGTH + AGX_USC_SHADER_LENGTH + AGX_USC_SHARED_LENGTH + \
    AGX_USC_SAMPLER_LENGTH + (AGX_USC_UNIFORM_LENGTH * 9))

/*
 * This data structure contains everything needed to dispatch a compute shader
 * (and hopefully eventually graphics?).
 *
 * It is purely flat, no CPU pointers. That makes it suitable for sharing
 * between CPU and GPU. The intention is that it is packed on the CPU side and
 * then consumed on either host or device for dispatching work.
 */
struct agx_shader {
   struct agx_cdm_launch_word_0_packed launch;
   struct agx_workgroup workgroup;

   struct {
      uint32_t size;
      uint8_t data[AGX_MAX_LINKED_USC_SIZE];
   } usc;
};

/* Opaque structure representing a USC program being constructed */
struct agx_usc_builder {
   GLOBAL uint8_t *head;

#ifndef NDEBUG
   uint8_t *begin;
   size_t size;
#endif
} PACKED;

static struct agx_usc_builder
agx_usc_builder(GLOBAL void *out, ASSERTED size_t size)
{
   return (struct agx_usc_builder){
      .head = out,

#ifndef NDEBUG
      .begin = out,
      .size = size,
#endif
   };
}

static bool
agx_usc_builder_validate(struct agx_usc_builder *b, size_t size)
{
#ifndef NDEBUG
   assert(((b->head - b->begin) + size) <= b->size);
#endif

   return true;
}

#define agx_usc_pack(b, struct_name, template)                                 \
   for (bool it =                                                              \
           agx_usc_builder_validate((b), AGX_USC_##struct_name##_LENGTH);      \
        it; it = false, (b)->head += AGX_USC_##struct_name##_LENGTH)           \
      agx_pack((b)->head, USC_##struct_name, template)

#define agx_usc_push_blob(b, blob, length)                                     \
   for (bool it = agx_usc_builder_validate((b), length); it;                   \
        it = false, (b)->head += length)                                       \
      memcpy((b)->head, blob, length);

#define agx_usc_push_packed(b, struct_name, packed)                            \
   agx_usc_push_blob(b, packed.opaque, AGX_USC_##struct_name##_LENGTH);

static void
agx_usc_uniform(struct agx_usc_builder *b, unsigned start_halfs,
                unsigned size_halfs, uint64_t buffer)
{
   assert((start_halfs + size_halfs) <= (1 << 9) && "uniform file overflow");
   assert(size_halfs <= 64 && "caller's responsibility to split");
   assert(size_halfs > 0 && "no empty uniforms");

   if (start_halfs & BITFIELD_BIT(8)) {
      agx_usc_pack(b, UNIFORM_HIGH, cfg) {
         cfg.start_halfs = start_halfs & BITFIELD_MASK(8);
         cfg.size_halfs = size_halfs;
         cfg.buffer = buffer;
      }
   } else {
      agx_usc_pack(b, UNIFORM, cfg) {
         cfg.start_halfs = start_halfs;
         cfg.size_halfs = size_halfs;
         cfg.buffer = buffer;
      }
   }
}

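/*
 * Illustrative sketch, hypothetical helper: agx_usc_uniform requires ranges
 * of at most 64 halves, so a caller mapping a larger buffer is expected to
 * split it into chunks itself, advancing the address by two bytes per half.
 */
static inline void
agx_example_usc_uniform_split(struct agx_usc_builder *b, unsigned start_halfs,
                              unsigned size_halfs, uint64_t buffer)
{
   for (unsigned i = 0; i < size_halfs; i += 64) {
      unsigned chunk = (size_halfs - i) < 64 ? (size_halfs - i) : 64;

      agx_usc_uniform(b, start_halfs + i, chunk, buffer + (i * 2));
   }
}
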
static inline void
agx_usc_words_precomp(GLOBAL uint32_t *out, CONST struct agx_shader *s,
                      uint64_t data, unsigned data_size)
{
   /* Map the data directly as uniforms starting at u0 */
   struct agx_usc_builder b = agx_usc_builder(out, sizeof(s->usc.data));
   agx_usc_uniform(&b, 0, DIV_ROUND_UP(data_size, 2), data);
   agx_usc_push_blob(&b, s->usc.data, s->usc.size);
}

/* This prototype is sufficient for sizing the output */
static inline unsigned
libagx_draw_robust_index_vdm_size()
{
   struct agx_draw draw = agx_draw_indexed(0, 0, 0, 0, 0, 0, 0, 0, 0);
   return agx_vdm_draw_size(0, draw);
}

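/*
 * Worked examples for the remap below: MESA_PRIM_TRIANGLE_STRIP_ADJACENCY
 * with count == 10 gives c4 == 6 and 3 * (6 / 2) == 9 vertices (3 triangles);
 * MESA_PRIM_LINE_STRIP_ADJACENCY with count == 7 gives 2 * (7 - 3) == 8
 * vertices (4 lines); adjacency lists keep every other vertex.
 */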
static inline unsigned
libagx_remap_adj_count(unsigned count, enum mesa_prim prim)
{
   if (prim == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) {
      /* Spec gives formula for # of primitives in a tri strip adj */
      unsigned c4 = count >= 4 ? count - 4 : 0;
      return 3 * (c4 / 2);
   } else if (prim == MESA_PRIM_LINE_STRIP_ADJACENCY) {
      return 2 * (count >= 3 ? count - 3 : 0);
   } else {
      /* Adjacency lists just drop half the vertices. */
      return count / 2;
   }
}