/*
 * Copyright 2024 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#pragma once

#include "compiler/libcl/libcl.h"
#include "compiler/shader_enums.h"
#include "agx_pack.h"

#define agx_push(ptr, T, cfg)                                                  \
   for (unsigned _loop = 0; _loop < 1;                                         \
        ++_loop, ptr = (GLOBAL void *)(((uintptr_t)ptr) + AGX_##T##_LENGTH))   \
      agx_pack(ptr, T, cfg)

#define agx_push_packed(ptr, src, T)                                           \
   static_assert(sizeof(src) == AGX_##T##_LENGTH);                             \
   memcpy(ptr, &src, sizeof(src));                                             \
   ptr = (GLOBAL void *)(((uintptr_t)ptr) + sizeof(src));
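
/* Illustrative (non-normative) sketch of how the push helpers are used: each
 * agx_push() packs one control word at `ptr` and then advances `ptr` past it,
 * so consecutive pushes build a contiguous control stream. `out` and the
 * field values below are placeholders; CDM_LOCAL_SIZE is packed the same way
 * later in this header.
 *
 *    GLOBAL uint32_t *out = ...;
 *    agx_push(out, CDM_LOCAL_SIZE, cfg) {
 *       cfg.x = 64;
 *       cfg.y = 1;
 *       cfg.z = 1;
 *    }
 *    // out now points just past the packed CDM_LOCAL_SIZE word
 */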

static inline enum agx_index_size
agx_translate_index_size(uint8_t size_B)
{
   /* Index sizes are encoded logarithmically */
   static_assert(__builtin_ctz(1) == AGX_INDEX_SIZE_U8);
   static_assert(__builtin_ctz(2) == AGX_INDEX_SIZE_U16);
   static_assert(__builtin_ctz(4) == AGX_INDEX_SIZE_U32);

   assert((size_B == 1) || (size_B == 2) || (size_B == 4));
   return __builtin_ctz(size_B);
}

static inline unsigned
agx_indices_to_B(unsigned x, enum agx_index_size size)
{
   return x << size;
}

static inline uint8_t
agx_index_size_to_B(enum agx_index_size size)
{
   return agx_indices_to_B(1, size);
}
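
/* Worked example (following the static_asserts above): a 16-bit index has
 * size_B = 2, so agx_translate_index_size(2) == __builtin_ctz(2) ==
 * AGX_INDEX_SIZE_U16, and 10 such indices span
 * agx_indices_to_B(10, AGX_INDEX_SIZE_U16) == 10 << 1 == 20 bytes.
 */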

struct agx_workgroup {
   uint32_t x, y, z;
};

static inline struct agx_workgroup
agx_workgroup(uint32_t x, uint32_t y, uint32_t z)
{
   return (struct agx_workgroup){.x = x, .y = y, .z = z};
}

static inline unsigned
agx_workgroup_threads(struct agx_workgroup wg)
{
   return wg.x * wg.y * wg.z;
}

struct agx_grid {
   enum agx_cdm_mode mode;
   union {
      uint32_t count[3];
      uint64_t ptr;
   };
};

static struct agx_grid
agx_3d(uint32_t x, uint32_t y, uint32_t z)
{
   return (struct agx_grid){.mode = AGX_CDM_MODE_DIRECT, .count = {x, y, z}};
}

static struct agx_grid
agx_1d(uint32_t x)
{
   return agx_3d(x, 1, 1);
}

static struct agx_grid
agx_grid_indirect(uint64_t ptr)
{
   return (struct agx_grid){.mode = AGX_CDM_MODE_INDIRECT_GLOBAL, .ptr = ptr};
}

static struct agx_grid
agx_grid_indirect_local(uint64_t ptr)
{
   return (struct agx_grid){.mode = AGX_CDM_MODE_INDIRECT_LOCAL, .ptr = ptr};
}

static inline bool
agx_is_indirect(struct agx_grid grid)
{
   return grid.mode != AGX_CDM_MODE_DIRECT;
}
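
/* Illustrative usage of the grid constructors: a direct grid carries its
 * counts inline, while an indirect grid only carries the GPU address the
 * counts are read from at dispatch time. `indirect_addr` is a placeholder.
 *
 *    struct agx_grid direct = agx_1d(1024);
 *    struct agx_grid indirect = agx_grid_indirect(indirect_addr);
 *    assert(!agx_is_indirect(direct) && agx_is_indirect(indirect));
 */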

enum agx_barrier {
   /* No barrier/cache operations needed */
   AGX_BARRIER_NONE = 0,

   /* Catch-all for all defined barriers. Because we have not yet
    * reverse-engineered the finer details here, this is the only barrier we
    * have.
    */
   AGX_BARRIER_ALL = (1 << 0),
};

struct agx_draw {
   struct agx_grid b;
   uint64_t index_buffer;
   uint32_t index_buffer_range_B;
   uint32_t start;
   uint32_t index_bias;
   uint32_t start_instance;

   /* Primitive restart enabled. If true, implies indexed */
   bool restart;
   enum agx_index_size index_size;

   /* TODO: Optimize this boolean. We can't just check if index_buffer != 0
    * because that breaks with null index buffers.
    */
   bool indexed;
};

static inline struct agx_draw
agx_draw_indirect(uint64_t ptr)
{
   return (struct agx_draw){.b = agx_grid_indirect(ptr)};
}

static inline struct agx_draw
agx_draw_indexed(uint32_t index_count, uint32_t instance_count,
                 uint32_t first_index, uint32_t index_bias,
                 uint32_t first_instance, uint64_t buf, uint32_t range_B,
                 enum agx_index_size index_size, bool restart)
{
   return (struct agx_draw){
      .b = agx_3d(index_count, instance_count, 1),
      .index_buffer = buf,
      .index_buffer_range_B = range_B,
      .start = first_index,
      .index_bias = index_bias,
      .start_instance = first_instance,
      .index_size = index_size,
      .restart = restart,
      .indexed = true,
   };
}

static inline struct agx_draw
agx_draw_indexed_indirect(uint64_t ptr, uint64_t buf, uint32_t range_B,
                          enum agx_index_size index_size, bool restart)
{
   return (struct agx_draw){
      .b = agx_grid_indirect(ptr),
      .index_buffer = buf,
      .index_buffer_range_B = range_B,
      .index_size = index_size,
      .restart = restart,
      .indexed = true,
   };
}
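
/* Illustrative construction of an indexed direct draw (all values are
 * hypothetical): 36 16-bit indices, 1 instance, no base vertex or base
 * instance, reading from an index buffer of `ib_size_B` bytes at GPU address
 * `ib`.
 *
 *    struct agx_draw d = agx_draw_indexed(36, 1, 0, 0, 0, ib, ib_size_B,
 *                                         AGX_INDEX_SIZE_U16, false);
 */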

static inline unsigned
agx_draw_index_range_B(struct agx_draw d)
{
   uint range_B = d.index_buffer_range_B;
   if (!agx_is_indirect(d.b))
      range_B -= agx_indices_to_B(d.start, d.index_size);

   return range_B;
}

static inline unsigned
agx_draw_index_range_el(struct agx_draw d)
{
   assert(d.indexed);
   return agx_draw_index_range_B(d) >> d.index_size;
}

static inline uint64_t
agx_draw_index_buffer(struct agx_draw d)
{
   assert(d.indexed);

   uint64_t ib = d.index_buffer;
   if (!agx_is_indirect(d.b))
      ib += agx_indices_to_B(d.start, d.index_size);

   return ib;
}

static bool
agx_direct_draw_overreads_indices(struct agx_draw d)
{
   uint32_t range_B = agx_indices_to_B(d.start + d.b.count[0], d.index_size);
   return range_B > d.index_buffer_range_B;
}
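
/* Worked example of the index buffer helpers for a direct draw with
 * start = 6, index_size = AGX_INDEX_SIZE_U16 and index_buffer_range_B = 64:
 * agx_draw_index_buffer() returns the base address advanced by 6 * 2 = 12
 * bytes, agx_draw_index_range_B() returns 64 - 12 = 52 bytes (26 elements),
 * and the draw overreads its indices iff (start + count[0]) * 2 > 64.
 */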

enum agx_chip {
   AGX_CHIP_G13G,
   AGX_CHIP_G13X,
   AGX_CHIP_G14G,
   AGX_CHIP_G14X,
};

static inline GLOBAL uint32_t *
agx_cdm_launch(GLOBAL uint32_t *out, enum agx_chip chip, struct agx_grid grid,
               struct agx_workgroup wg,
               struct agx_cdm_launch_word_0_packed launch, uint32_t usc)
{
#ifndef __OPENCL_VERSION__
   struct agx_cdm_launch_word_0_packed mode;
   agx_pack(&mode, CDM_LAUNCH_WORD_0, cfg) {
      cfg.mode = grid.mode;
   }

   agx_merge(launch, mode, CDM_LAUNCH_WORD_0);
#endif

   agx_push_packed(out, launch, CDM_LAUNCH_WORD_0);

   agx_push(out, CDM_LAUNCH_WORD_1, cfg) {
      cfg.pipeline = usc;
   }

   if (chip == AGX_CHIP_G14X) {
      agx_push(out, CDM_UNK_G14X, cfg)
         ;
   }

   if (agx_is_indirect(grid)) {
      agx_push(out, CDM_INDIRECT, cfg) {
         cfg.address_hi = grid.ptr >> 32;
         cfg.address_lo = grid.ptr;
      }
   } else {
      agx_push(out, CDM_GLOBAL_SIZE, cfg) {
         cfg.x = grid.count[0];
         cfg.y = grid.count[1];
         cfg.z = grid.count[2];
      }
   }

   if (grid.mode != AGX_CDM_MODE_INDIRECT_LOCAL) {
      agx_push(out, CDM_LOCAL_SIZE, cfg) {
         cfg.x = wg.x;
         cfg.y = wg.y;
         cfg.z = wg.z;
      }
   }

   return out;
}
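
/* Illustrative sketch of emitting a compute dispatch with the helper above;
 * the surrounding stream structure (barriers, links, termination) is up to
 * the caller, and `out`, `chip`, `groups`, `launch` and `usc` are
 * placeholders.
 *
 *    out = agx_cdm_launch(out, chip, agx_1d(groups), agx_workgroup(64, 1, 1),
 *                         launch, usc);
 *    out = agx_cdm_barrier(out, chip);
 */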

static inline GLOBAL uint32_t *
agx_vdm_draw(GLOBAL uint32_t *out, enum agx_chip chip, struct agx_draw draw,
             enum agx_primitive topology)
{
   uint64_t ib = draw.indexed ? agx_draw_index_buffer(draw) : 0;

   agx_push(out, INDEX_LIST, cfg) {
      cfg.primitive = topology;

      if (agx_is_indirect(draw.b)) {
         cfg.indirect_buffer_present = true;
      } else {
         cfg.instance_count_present = true;
         cfg.index_count_present = true;
         cfg.start_present = true;
      }

      if (draw.indexed) {
         cfg.restart_enable = draw.restart;
         cfg.index_buffer_hi = ib >> 32;
         cfg.index_size = draw.index_size;

         cfg.index_buffer_present = true;
         cfg.index_buffer_size_present = true;
      }
   }

   if (draw.indexed) {
      agx_push(out, INDEX_LIST_BUFFER_LO, cfg) {
         cfg.buffer_lo = ib;
      }
   }

   if (agx_is_indirect(draw.b)) {
      agx_push(out, INDEX_LIST_INDIRECT_BUFFER, cfg) {
         cfg.address_hi = draw.b.ptr >> 32;
         cfg.address_lo = draw.b.ptr & BITFIELD_MASK(32);
      }
   } else {
      agx_push(out, INDEX_LIST_COUNT, cfg) {
         cfg.count = draw.b.count[0];
      }

      agx_push(out, INDEX_LIST_INSTANCES, cfg) {
         cfg.count = draw.b.count[1];
      }

      agx_push(out, INDEX_LIST_START, cfg) {
         cfg.start = draw.indexed ? draw.index_bias : draw.start;
      }
   }

   if (draw.indexed) {
      agx_push(out, INDEX_LIST_BUFFER_SIZE, cfg) {
         cfg.size = align(agx_draw_index_range_B(draw), 4);
      }
   }

   return out;
}

static inline uint32_t
agx_vdm_draw_size(enum agx_chip chip, struct agx_draw draw)
{
   uint32_t size = AGX_INDEX_LIST_LENGTH;

   if (agx_is_indirect(draw.b)) {
      size += AGX_INDEX_LIST_INDIRECT_BUFFER_LENGTH;
   } else {
      size += AGX_INDEX_LIST_COUNT_LENGTH;
      size += AGX_INDEX_LIST_INSTANCES_LENGTH;
      size += AGX_INDEX_LIST_START_LENGTH;
   }

   if (draw.indexed) {
      size += AGX_INDEX_LIST_BUFFER_LO_LENGTH;
      size += AGX_INDEX_LIST_BUFFER_SIZE_LENGTH;
   }

   return size;
}
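
/* Sanity-check sketch: agx_vdm_draw_size() must account for exactly the words
 * agx_vdm_draw() emits for the same draw, so after encoding
 *
 *    GLOBAL uint32_t *end = agx_vdm_draw(out, chip, draw, topology);
 *
 * it holds that (uintptr_t)end - (uintptr_t)out == agx_vdm_draw_size(chip,
 * draw). (`out`, `chip`, `draw` and `topology` are placeholders.)
 */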

static inline GLOBAL uint32_t *
agx_cdm_barrier(GLOBAL uint32_t *out, enum agx_chip chip)
{
   agx_push(out, CDM_BARRIER, cfg) {
      cfg.unk_5 = true;
      cfg.unk_6 = true;
      cfg.unk_8 = true;
      // cfg.unk_11 = true;
      // cfg.unk_20 = true;
      // cfg.unk_24 = true; if clustered?
      if (chip == AGX_CHIP_G13X) {
         cfg.unk_4 = true;
         // cfg.unk_26 = true;
      }

      /* With multiple launches in the same CDM stream, we can hit cache
       * coherency (or synchronization?) issues. We see this with blits, which
       * need the PBE cache flushed and the texture cache invalidated between
       * dispatches. Until we know exactly what each bit means, set all of
       * these after every launch to be safe; we can revisit once the bits are
       * better understood.
       */
      cfg.unk_0 = true;
      cfg.unk_1 = true;
      cfg.unk_2 = true;
      cfg.usc_cache_inval = true;
      cfg.unk_4 = true;
      cfg.unk_5 = true;
      cfg.unk_6 = true;
      cfg.unk_7 = true;
      cfg.unk_8 = true;
      cfg.unk_9 = true;
      cfg.unk_10 = true;
      cfg.unk_11 = true;
      cfg.unk_12 = true;
      cfg.unk_13 = true;
      cfg.unk_14 = true;
      cfg.unk_15 = true;
      cfg.unk_16 = true;
      cfg.unk_17 = true;
      cfg.unk_18 = true;
      cfg.unk_19 = true;
   }

   return out;
}

static inline GLOBAL uint32_t *
agx_vdm_return(GLOBAL uint32_t *out)
{
   agx_push(out, VDM_BARRIER, cfg) {
      cfg.returns = true;
   }

   return out;
}

static inline GLOBAL uint32_t *
agx_cdm_return(GLOBAL uint32_t *out)
{
   agx_push(out, CDM_STREAM_RETURN, cfg)
      ;

   return out;
}

static inline GLOBAL uint32_t *
agx_cdm_terminate(GLOBAL uint32_t *out)
{
   agx_push(out, CDM_STREAM_TERMINATE, _)
      ;

   return out;
}

static inline GLOBAL uint32_t *
agx_vdm_terminate(GLOBAL uint32_t *out)
{
   agx_push(out, VDM_STREAM_TERMINATE, _)
      ;

   return out;
}

static inline GLOBAL uint32_t *
agx_cdm_jump(GLOBAL uint32_t *out, uint64_t target)
{
   agx_push(out, CDM_STREAM_LINK, cfg) {
      cfg.target_lo = target & BITFIELD_MASK(32);
      cfg.target_hi = target >> 32;
   }

   return out;
}

static inline GLOBAL uint32_t *
agx_vdm_jump(GLOBAL uint32_t *out, uint64_t target)
{
   agx_push(out, VDM_STREAM_LINK, cfg) {
      cfg.target_lo = target & BITFIELD_MASK(32);
      cfg.target_hi = target >> 32;
   }

   return out;
}

static inline GLOBAL uint32_t *
agx_cs_jump(GLOBAL uint32_t *out, uint64_t target, bool vdm)
{
   return vdm ? agx_vdm_jump(out, target) : agx_cdm_jump(out, target);
}

static inline GLOBAL uint32_t *
agx_cdm_call(GLOBAL uint32_t *out, uint64_t target)
{
   agx_push(out, CDM_STREAM_LINK, cfg) {
      cfg.target_lo = target & BITFIELD_MASK(32);
      cfg.target_hi = target >> 32;
      cfg.with_return = true;
   }

   return out;
}

static inline GLOBAL uint32_t *
agx_vdm_call(GLOBAL uint32_t *out, uint64_t target)
{
   agx_push(out, VDM_STREAM_LINK, cfg) {
      cfg.target_lo = target & BITFIELD_MASK(32);
      cfg.target_hi = target >> 32;
      cfg.with_return = true;
   }

   return out;
}

#define AGX_MAX_LINKED_USC_SIZE                                                \
   (AGX_USC_PRESHADER_LENGTH + AGX_USC_FRAGMENT_PROPERTIES_LENGTH +            \
    AGX_USC_REGISTERS_LENGTH + AGX_USC_SHADER_LENGTH + AGX_USC_SHARED_LENGTH + \
    AGX_USC_SAMPLER_LENGTH + (AGX_USC_UNIFORM_LENGTH * 9))

/*
 * This data structure contains everything needed to dispatch a compute shader
 * (and hopefully eventually graphics?).
 *
 * It is purely flat, no CPU pointers. That makes it suitable for sharing
 * between CPU and GPU. The intention is that it is packed on the CPU side and
 * then consumed on either host or device for dispatching work.
 */
struct agx_shader {
   struct agx_cdm_launch_word_0_packed launch;
   struct agx_workgroup workgroup;

   struct {
      uint32_t size;
      uint8_t data[AGX_MAX_LINKED_USC_SIZE];
   } usc;
};

/* Opaque structure representing a USC program being constructed */
struct agx_usc_builder {
   GLOBAL uint8_t *head;

#ifndef NDEBUG
   uint8_t *begin;
   size_t size;
#endif
} PACKED;

static struct agx_usc_builder
agx_usc_builder(GLOBAL void *out, ASSERTED size_t size)
{
   return (struct agx_usc_builder){
      .head = out,

#ifndef NDEBUG
      .begin = out,
      .size = size,
#endif
   };
}

static bool
agx_usc_builder_validate(struct agx_usc_builder *b, size_t size)
{
#ifndef NDEBUG
   assert(((b->head - b->begin) + size) <= b->size);
#endif

   return true;
}

#define agx_usc_pack(b, struct_name, template)                                 \
   for (bool it =                                                              \
           agx_usc_builder_validate((b), AGX_USC_##struct_name##_LENGTH);      \
        it; it = false, (b)->head += AGX_USC_##struct_name##_LENGTH)           \
      agx_pack((b)->head, USC_##struct_name, template)

#define agx_usc_push_blob(b, blob, length)                                     \
   for (bool it = agx_usc_builder_validate((b), length); it;                   \
        it = false, (b)->head += length)                                       \
      memcpy((b)->head, blob, length);

#define agx_usc_push_packed(b, struct_name, packed)                            \
   agx_usc_push_blob(b, packed.opaque, AGX_USC_##struct_name##_LENGTH);

static void
agx_usc_uniform(struct agx_usc_builder *b, unsigned start_halfs,
                unsigned size_halfs, uint64_t buffer)
{
   assert((start_halfs + size_halfs) <= (1 << 9) && "uniform file overflow");
   assert(size_halfs <= 64 && "caller's responsibility to split");
   assert(size_halfs > 0 && "no empty uniforms");

   if (start_halfs & BITFIELD_BIT(8)) {
      agx_usc_pack(b, UNIFORM_HIGH, cfg) {
         cfg.start_halfs = start_halfs & BITFIELD_MASK(8);
         cfg.size_halfs = size_halfs;
         cfg.buffer = buffer;
      }
   } else {
      agx_usc_pack(b, UNIFORM, cfg) {
         cfg.start_halfs = start_halfs;
         cfg.size_halfs = size_halfs;
         cfg.buffer = buffer;
      }
   }
}
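
/* Illustrative sketch of the "caller's responsibility to split" rule above: a
 * range wider than 64 halves must be pushed as several uniform words. For a
 * hypothetical 100-half range starting at u0 and backed by GPU address
 * `buffer` (halves are 2 bytes each):
 *
 *    agx_usc_uniform(&b, 0, 64, buffer);
 *    agx_usc_uniform(&b, 64, 36, buffer + (64 * 2));
 */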

static inline void
agx_usc_words_precomp(GLOBAL uint32_t *out, CONST struct agx_shader *s,
                      uint64_t data, unsigned data_size)
{
   /* Map the data directly as uniforms starting at u0 */
   struct agx_usc_builder b = agx_usc_builder(out, sizeof(s->usc.data));
   agx_usc_uniform(&b, 0, DIV_ROUND_UP(data_size, 2), data);
   agx_usc_push_blob(&b, s->usc.data, s->usc.size);
}
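
/* Illustrative sketch of consuming a precompiled struct agx_shader `s` with a
 * flat argument buffer `data` of `data_size` bytes (all names here are
 * placeholders): the data is bound as uniforms at u0, the prepacked USC words
 * follow, and the resulting USC program can then be dispatched with
 * agx_cdm_launch() using s->launch and s->workgroup.
 *
 *    agx_usc_words_precomp(usc_words, s, data, data_size);
 *    out = agx_cdm_launch(out, chip, grid, s->workgroup, s->launch, usc);
 */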

/* This prototype is sufficient for sizing the output */
static inline unsigned
libagx_draw_robust_index_vdm_size()
{
   struct agx_draw draw = agx_draw_indexed(0, 0, 0, 0, 0, 0, 0, 0, 0);
   return agx_vdm_draw_size(0, draw);
}

static inline unsigned
libagx_remap_adj_count(unsigned count, enum mesa_prim prim)
{
   if (prim == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) {
      /* Spec gives formula for # of primitives in a tri strip adj */
      unsigned c4 = count >= 4 ? count - 4 : 0;
      return 3 * (c4 / 2);
   } else if (prim == MESA_PRIM_LINE_STRIP_ADJACENCY) {
      return 2 * (count >= 3 ? count - 3 : 0);
   } else {
      /* Adjacency lists just drop half the vertices. */
      return count / 2;
   }
}
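
/* Worked examples of the remapping above (values are vertex counts): a
 * 10-vertex triangle strip with adjacency holds (10 - 4) / 2 = 3 triangles,
 * which remap to 3 * 3 = 9 list vertices; a 7-vertex line strip with
 * adjacency holds 7 - 3 = 4 lines, remapped to 2 * 4 = 8 vertices; and plain
 * adjacency lists (lines/triangles) simply halve the count, e.g. 12 -> 6.
 */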