/*
 * Copyright 2021 Alyssa Rosenzweig
 * Copyright 2019-2021 Collabora, Ltd.
 * SPDX-License-Identifier: MIT
 */

#pragma once

#include <xf86drm.h>
#include "asahi/compiler/agx_compile.h"
#include "asahi/genxml/agx_pack.h"
#include "asahi/layout/layout.h"
#include "asahi/lib/agx_bo.h"
#include "asahi/lib/agx_device.h"
#include "asahi/lib/agx_linker.h"
#include "asahi/lib/agx_nir_lower_vbo.h"
#include "asahi/lib/agx_scratch.h"
#include "asahi/lib/agx_tilebuffer.h"
#include "asahi/lib/agx_uvs.h"
#include "asahi/lib/pool.h"
#include "asahi/lib/unstable_asahi_drm.h"
#include "asahi/libagx/geometry.h"
#include "compiler/shader_enums.h"
#include "gallium/auxiliary/util/u_blitter.h"
#include "gallium/include/pipe/p_context.h"
#include "gallium/include/pipe/p_screen.h"
#include "gallium/include/pipe/p_state.h"
#include "pipe/p_defines.h"
#include "util/bitset.h"
#include "util/disk_cache.h"
#include "util/hash_table.h"
#include "util/rwlock.h"
#include "util/u_range.h"
#include "agx_bg_eot.h"
#include "agx_helpers.h"
#include "agx_nir_texture.h"

#ifdef __GLIBC__
#include <errno.h>
#define agx_msg(fmt, ...)                                                      \
   fprintf(stderr, "[%s] " fmt, program_invocation_short_name, ##__VA_ARGS__)
#else
#define agx_msg(...) fprintf(stderr, __VA_ARGS__)
#endif

#define AGX_NUM_TEXTURE_STATE_REGS 16

struct agx_streamout_target {
   struct pipe_stream_output_target base;
   struct pipe_resource *offset;

   /* Current stride (bytes per vertex) */
   uint32_t stride;
};

static inline struct agx_streamout_target *
agx_so_target(struct pipe_stream_output_target *target)
{
   return (struct agx_streamout_target *)target;
}

struct agx_streamout {
   struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS];
   unsigned num_targets;
};

/* Shaders can access fixed-function state through system values.
 * It is convenient to stash all of this information into a single "root"
 * descriptor, then push individual parts as needed.
 *
 * In the future, we could optimize this to reduce CPU overhead, e.g. splitting
 * into multiple descriptors for finer dirty tracking. This is not ABI with the
 * compiler. The layout is up to us and handled by our code lowering system
 * values to uniforms.
 */
enum agx_sysval_table {
   AGX_SYSVAL_TABLE_ROOT,
   AGX_SYSVAL_TABLE_PARAMS,
   AGX_SYSVAL_TABLE_GRID,
   AGX_SYSVAL_TABLE_VS,
   AGX_SYSVAL_TABLE_TCS,
   AGX_SYSVAL_TABLE_TES,
   AGX_SYSVAL_TABLE_GS,
   AGX_SYSVAL_TABLE_FS,
   AGX_SYSVAL_TABLE_CS,
   AGX_NUM_SYSVAL_TABLES
};

#define AGX_SYSVAL_STAGE(stage) (AGX_SYSVAL_TABLE_VS + (stage))

static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_VERTEX) == AGX_SYSVAL_TABLE_VS,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_TESS_CTRL) == AGX_SYSVAL_TABLE_TCS,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_TESS_EVAL) == AGX_SYSVAL_TABLE_TES,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_GEOMETRY) == AGX_SYSVAL_TABLE_GS,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_FRAGMENT) == AGX_SYSVAL_TABLE_FS,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_COMPUTE) == AGX_SYSVAL_TABLE_CS,
              "fixed enum orderings");

/* Root system value table */
struct PACKED agx_draw_uniforms {
   /* Pointers to the system value tables themselves (for indirection) */
   uint64_t tables[AGX_NUM_SYSVAL_TABLES];

   /* Vertex buffer object bases, if present. If vertex robustness is disabled,
    * attrib_base maps VBOs directly and attrib_clamp is undefined. If
    * vertex robustness is enabled, attrib_base maps attributes and
    * attrib_clamp is an inclusive clamp on vertex/divided instance indices.
    */
   uint64_t attrib_base[PIPE_MAX_ATTRIBS];
   uint32_t attrib_clamp[PIPE_MAX_ATTRIBS];

   /* Addresses for the results of pipeline statistics queries */
   uint64_t pipeline_statistics[PIPE_STAT_QUERY_MS_INVOCATIONS];

   /* Pointer to base address of the VS->TCS, VS->GS, or TES->GS buffer.
    * Indirected so it can be written to in an indirect setup kernel. G13
    * appears to prefetch uniforms across dispatches, but does not pre-run
    * preambles, so this indirection saves us from splitting the batch.
    */
   uint64_t vertex_output_buffer_ptr;

   /* Mask of outputs flowing VS->TCS, VS->GS, or TES->GS. */
   uint64_t vertex_outputs;

   /* Address of input assembly buffer if geom/tess is used, else 0 */
   uint64_t input_assembly;

   /* Address of tessellation param buffer if tessellation is used, else 0 */
   uint64_t tess_params;

   /* Address of geometry param buffer if geometry shaders are used, else 0 */
   uint64_t geometry_params;

   /* Address of polygon stipple mask if used */
   uint64_t polygon_stipple;

   /* Blend constant if any */
   float blend_constant[4];

   /* glPointSize value */
   float fixed_point_size;

   /* Value of the multisample control register, containing sample positions in
    * each byte (x in low nibble, y in high nibble).
    */
   uint32_t ppp_multisamplectl;

   /* gl_DrawID for a direct multidraw */
   uint32_t draw_id;

   /* Sprite coord replacement mask */
   uint16_t sprite_mask;

   /* glSampleMask */
   uint16_t sample_mask;

   /* Nonzero for indexed draws, zero otherwise */
   uint16_t is_indexed_draw;

   /* Zero for [0, 1] clipping, 0.5 for [-1, 1] clipping. */
   uint16_t clip_z_coeff;

   /* ~0/0 boolean whether the epilog lacks any discard instruction */
   uint16_t no_epilog_discard;

   /* Provoking vertex: 0, 1, 2 */
   uint16_t provoking_vertex;

   /* Mapping from varying slots written by the last vertex stage to UVS
    * indices. This mapping must be compatible with the fragment shader.
    */
   uint16_t uvs_index[VARYING_SLOT_MAX];
};
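
/* Illustrative sketch only (the real lowering lives in the sysval pass, not
 * here): a system value is addressed as a (table, byte offset) pair, and for
 * the root table the offset is simply the offsetof() into agx_draw_uniforms.
 * The helper name below is hypothetical and exists purely as an example.
 */
static inline uint16_t
agx_blend_constant_offset_example(void)
{
   /* (AGX_SYSVAL_TABLE_ROOT, offset) would locate the blend constant */
   return (uint16_t)offsetof(struct agx_draw_uniforms, blend_constant);
}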

struct PACKED agx_stage_uniforms {
   /* Pointer to binding table for texture descriptor, or 0 if none. This must
    * be first so that u0_u1 is always available for lowering binding
    * tables to bindless access.
    */
   uint64_t texture_base;

   /* Uniform buffer objects */
   uint64_t ubo_base[PIPE_MAX_CONSTANT_BUFFERS];
   uint32_t ubo_size[PIPE_MAX_CONSTANT_BUFFERS];

   /* Shader storage buffer objects */
   uint64_t ssbo_base[PIPE_MAX_SHADER_BUFFERS];
   uint32_t ssbo_size[PIPE_MAX_SHADER_BUFFERS];

   /* If lowered to bindless, sampler index in the heap */
   uint16_t sampler_handle[PIPE_MAX_SAMPLERS];

   /* LOD bias as float16 */
   uint16_t lod_bias[PIPE_MAX_SAMPLERS];
};

/* In the architecture, there are 512 uniform registers, each 16 bits. In a
 * theoretical worst case, we could push to all of them. We use a worst-case
 * maximum because the expression for a tight upper bound is too messy and easy
 * to go out of sync with the code.
 */
#define AGX_MAX_PUSH_RANGES (512)

struct agx_push_range {
   /* Base 16-bit uniform to push to */
   uint16_t uniform;

   /* Offset into the table to push in bytes */
   uint16_t offset;

   /* Which table to push from */
   uint8_t table;

   /* Number of consecutive 16-bit uniforms to push */
   uint8_t length;
};
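
/* Illustrative sketch only (the driver's real uploader lives elsewhere): a
 * push range asks for `length` consecutive 16-bit words, starting `offset`
 * bytes into table `table`, to land at uniform register `uniform`. The
 * `uniform_regs` and `cpu_tables` parameters are stand-ins for the uniform
 * upload area and the CPU copies of the sysval tables; the function name is
 * hypothetical.
 */
static inline void
agx_push_range_example(uint16_t *uniform_regs, uint8_t *const cpu_tables[],
                       const struct agx_push_range *range)
{
   /* Offsets are byte offsets into the table; data is copied in 16-bit units */
   const uint16_t *src =
      (const uint16_t *)(cpu_tables[range->table] + range->offset);

   for (unsigned i = 0; i < range->length; ++i)
      uniform_regs[range->uniform + i] = src[i];
}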

struct agx_compiled_shader {
   /* Base struct */
   struct agx_shader_part b;

   /* Uncompiled shader that we belong to */
   const struct agx_uncompiled_shader *so;

   /* Mapped executable memory */
   struct agx_bo *bo;

   /* Uniforms the driver must push */
   unsigned push_range_count;
   struct agx_push_range push[AGX_MAX_PUSH_RANGES];

   /* UVS layout for the last vertex stage */
   struct agx_unlinked_uvs_layout uvs;

   /* For a vertex shader, the mask of vertex attributes read. Used to key the
    * prolog so the prolog doesn't write components not actually read.
    */
   BITSET_DECLARE(attrib_components_read, AGX_MAX_ATTRIBS * 4);

   struct agx_fs_epilog_link_info epilog_key;

   /* Auxiliary programs, or NULL if not used */
   struct agx_compiled_shader *gs_count, *pre_gs;
   struct agx_compiled_shader *gs_copy;

   /* Output primitive mode for geometry shaders */
   enum mesa_prim gs_output_mode;

   /* Number of words per primitive in the count buffer */
   unsigned gs_count_words;

   /* Logical shader stage used for descriptor access. This may differ from the
    * physical shader stage of the compiled shader, for example when executing a
    * tessellation eval shader as a vertex shader.
    */
   enum pipe_shader_type stage;
};

struct agx_fast_link_key {
   union {
      struct agx_vs_prolog_key vs;
      struct agx_fs_prolog_key fs;
   } prolog;

   struct agx_compiled_shader *main;

   union {
      struct agx_fs_epilog_key fs;
   } epilog;

   unsigned nr_samples_shaded;
};

struct agx_uncompiled_shader {
   struct pipe_shader_state base;
   enum pipe_shader_type type;
   struct blob early_serialized_nir;
   struct blob serialized_nir;
   uint8_t nir_sha1[20];

   struct {
      uint64_t inputs_flat_shaded;
      uint64_t inputs_linear_shaded;
      uint8_t cull_distance_size;
      bool has_edgeflags;
      bool uses_fbfetch;

      /* Number of bindful textures, images used */
      unsigned nr_bindful_textures, nr_bindful_images;
   } info;

   struct hash_table *variants;
   struct agx_uncompiled_shader *passthrough_progs[MESA_PRIM_COUNT][3][2];
   struct agx_uncompiled_shader *passthrough_tcs[32];

   /* agx_fast_link_key -> agx_linked_shader */
   struct hash_table *linked_shaders;

   uint32_t xfb_strides[4];
   bool has_xfb_info;
   bool is_xfb_passthrough;

   enum mesa_prim gs_mode;

   /* Whether the shader accesses indexed samplers via the bindless heap */
   bool uses_bindless_samplers;

   /* Set on VS, passed to FS for linkage */
   unsigned base_varying;

   /* Tessellation info */
   struct {
      uint64_t per_vertex_outputs;
      uint32_t output_stride;
      enum gl_tess_spacing spacing;
      enum tess_primitive_mode primitive;
      uint8_t output_patch_size;
      uint8_t nr_patch_outputs;
      bool ccw;
      bool point_mode;
   } tess;
};

enum agx_stage_dirty {
   AGX_STAGE_DIRTY_CONST = BITFIELD_BIT(0),
   AGX_STAGE_DIRTY_SSBO = BITFIELD_BIT(1),
   AGX_STAGE_DIRTY_IMAGE = BITFIELD_BIT(2),
   AGX_STAGE_DIRTY_SAMPLER = BITFIELD_BIT(3),
};

struct agx_stage {
   struct agx_uncompiled_shader *shader;
   uint32_t dirty;

   struct pipe_constant_buffer cb[PIPE_MAX_CONSTANT_BUFFERS];
   uint32_t cb_mask;

   struct pipe_shader_buffer ssbo[PIPE_MAX_SHADER_BUFFERS];
   uint32_t ssbo_writable_mask;
   uint32_t ssbo_mask;

   struct pipe_image_view images[PIPE_MAX_SHADER_IMAGES];
   uint32_t image_mask;

   /* Need full CSOs for u_blitter */
   struct agx_sampler_state *samplers[PIPE_MAX_SAMPLERS];
   struct agx_sampler_view *textures[PIPE_MAX_SHADER_SAMPLER_VIEWS];

   /* Does any bound sampler require custom border colours? */
   bool custom_borders;

   unsigned sampler_count, texture_count;
   uint32_t valid_samplers;
};

union agx_batch_result {
   struct drm_asahi_result_render render;
   struct drm_asahi_result_compute compute;
};

/* This is a firmware limit. It should be possible to raise it to 2048 in the
 * future... still not good enough for VK though :-(
 */
#define AGX_SAMPLER_HEAP_SIZE (1024)

struct agx_sampler_heap {
   struct agx_bo *bo;
   uint16_t count;
};

uint16_t agx_sampler_heap_add(struct agx_device *dev,
                              struct agx_sampler_heap *heap,
                              struct agx_sampler_packed *sampler);
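
/* Usage sketch (illustrative, not the actual call site): when a sampler is
 * lowered to bindless access, its packed descriptor is appended to the batch's
 * heap and the returned index is pushed through
 * agx_stage_uniforms::sampler_handle, roughly:
 *
 *    uint16_t idx = agx_sampler_heap_add(dev, &batch->sampler_heap, &desc);
 *    batch->stage_uniforms[stage].sampler_handle[s] = idx;
 */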

struct agx_encoder {
   struct agx_bo *bo;
   uint8_t *current;
   uint8_t *end;
};

struct agx_batch {
   struct agx_context *ctx;
   struct pipe_framebuffer_state key;
   uint64_t seqnum;
   uint32_t syncobj;
   uint32_t draws;

   struct agx_tilebuffer_layout tilebuffer_layout;

   /* PIPE_CLEAR_* bitmask */
   uint32_t clear, draw, load, resolve, feedback;
   bool initialized;

   uint64_t uploaded_clear_color[PIPE_MAX_COLOR_BUFS];
   double clear_depth;
   unsigned clear_stencil;

   /* Whether we're drawing points, lines, or triangles */
   enum mesa_prim reduced_prim;

   /* Whether the bound FS needs a primitive ID that is not supplied by the
    * bound hardware VS (software GS)
    */
   bool generate_primitive_id;

   /* Current varyings linkage structures */
   uint32_t varyings;
   struct agx_varyings_vs linked_varyings;

   struct agx_draw_uniforms uniforms;
   struct agx_stage_uniforms stage_uniforms[PIPE_SHADER_TYPES];

   /* Indirect buffer allocated for geometry shader */
   uint64_t geom_indirect;
   struct agx_bo *geom_indirect_bo;

   /* Geometry state buffer if geometry/etc shaders are used */
   uint64_t geometry_state;

   /* Uploaded descriptors */
   uint32_t texture_count[PIPE_SHADER_TYPES];

   uint64_t samplers[PIPE_SHADER_TYPES];
   uint32_t sampler_count[PIPE_SHADER_TYPES];

   struct agx_sampler_heap sampler_heap;

   /* Resource list requirements, represented as a bit set indexed by BO
    * handles (GEM handles on Linux, or IOGPU's equivalent on macOS)
    */
   struct {
      BITSET_WORD *set;
      unsigned bit_count;
   } bo_list;

   /* If true, this batch contains a shader with a potentially incoherent write
    * (e.g. image_write), needing a barrier later to access.
    */
   bool incoherent_writes;

   struct agx_pool pool, pipeline_pool;

   /* We may enqueue both CDM and VDM work, possibly to the same batch for
    * geometry/tessellation.
    */
   struct agx_encoder vdm;
   struct agx_encoder cdm;

   /* Scissor and depth-bias descriptors, uploaded at GPU time */
   struct util_dynarray scissor, depth_bias;

   /* Arrays of GPU pointers that should be written with the batch timestamps */
   struct util_dynarray timestamps;

   /* Result buffer where the kernel places command execution information */
   union agx_batch_result *result;
   size_t result_off;

   /* Actual pointer in a uniform */
   struct agx_bo *geom_params_bo;

   /* Whether each stage uses scratch */
   bool vs_scratch;
   bool fs_scratch;
   bool cs_scratch;

   /* Whether each stage has preambles using scratch, and if so which bucket.
    * This just needs to be zero/nonzero for correctness; the magnitude in
    * buckets is for statistics.
    */
   unsigned vs_preamble_scratch;
   unsigned fs_preamble_scratch;
   unsigned cs_preamble_scratch;
};

struct agx_zsa {
   struct pipe_depth_stencil_alpha_state base;
   struct agx_fragment_face_packed depth;
   struct agx_fragment_stencil_packed front_stencil, back_stencil;

   /* PIPE_CLEAR_* bitmask corresponding to this depth/stencil state */
   uint32_t load, store;
};

struct agx_blend {
   struct agx_blend_key key;

   /* PIPE_CLEAR_* bitmask corresponding to this blend state */
   uint32_t store;
};

struct asahi_vs_shader_key {
   /* If true, this is running as a hardware vertex shader. If false, this is a
    * compute job used to feed a TCS or GS.
    */
   bool hw;
};

struct agx_vertex_elements {
   unsigned num_attribs;
   struct agx_velem_key key[PIPE_MAX_ATTRIBS];

   /* These parts do not affect the generated code so are not in the key */
   uint16_t src_offsets[PIPE_MAX_ATTRIBS];
   uint16_t buffers[PIPE_MAX_ATTRIBS];
};

struct asahi_fs_shader_key {
   enum pipe_format rt_formats[PIPE_MAX_COLOR_BUFS];
   uint8_t nr_samples;
   bool padding[7];
};
static_assert(sizeof(struct asahi_fs_shader_key) == 40, "no holes");

struct asahi_gs_shader_key {
   /* If true, this GS is run only for its side effects (including XFB) */
   bool rasterizer_discard;
   bool padding[7];
};
static_assert(sizeof(struct asahi_gs_shader_key) == 8, "no holes");

union asahi_shader_key {
   struct asahi_vs_shader_key vs;
   struct asahi_gs_shader_key gs;
   struct asahi_fs_shader_key fs;
};

enum agx_dirty {
   AGX_DIRTY_VERTEX = BITFIELD_BIT(0),
   AGX_DIRTY_VIEWPORT = BITFIELD_BIT(1),
   AGX_DIRTY_SCISSOR_ZBIAS = BITFIELD_BIT(2),
   AGX_DIRTY_ZS = BITFIELD_BIT(3),
   AGX_DIRTY_STENCIL_REF = BITFIELD_BIT(4),
   AGX_DIRTY_RS = BITFIELD_BIT(5),
   AGX_DIRTY_SPRITE_COORD_MODE = BITFIELD_BIT(6),
   AGX_DIRTY_PRIM = BITFIELD_BIT(7),

   /* Vertex/fragment pipelines, including uniforms and textures */
   AGX_DIRTY_VS = BITFIELD_BIT(8),
   AGX_DIRTY_FS = BITFIELD_BIT(9),

   /* Just the progs themselves */
   AGX_DIRTY_VS_PROG = BITFIELD_BIT(10),
   AGX_DIRTY_FS_PROG = BITFIELD_BIT(11),

   AGX_DIRTY_BLEND = BITFIELD_BIT(12),
   AGX_DIRTY_QUERY = BITFIELD_BIT(13),
   AGX_DIRTY_XFB = BITFIELD_BIT(14),
   AGX_DIRTY_SAMPLE_MASK = BITFIELD_BIT(15),
   AGX_DIRTY_BLEND_COLOR = BITFIELD_BIT(16),
   AGX_DIRTY_POLY_STIPPLE = BITFIELD_BIT(17),
};

/* Maximum number of in-progress + under-construction GPU batches.
 * Must be large enough for silly workloads that do things like
 * glGenerateMipmap on every frame, otherwise we end up losing performance.
 */
#define AGX_MAX_BATCHES (128)

static_assert(PIPE_TEX_FILTER_NEAREST < 2, "known order");
static_assert(PIPE_TEX_FILTER_LINEAR < 2, "known order");

enum asahi_blit_clamp {
   ASAHI_BLIT_CLAMP_NONE,
   ASAHI_BLIT_CLAMP_UINT_TO_SINT,
   ASAHI_BLIT_CLAMP_SINT_TO_UINT,

   /* keep last */
   ASAHI_BLIT_CLAMP_COUNT,
};

struct asahi_blit_key {
   enum pipe_format src_format, dst_format;
   bool array;
   bool aligned;
   bool pad[2];
};
static_assert(sizeof(struct asahi_blit_key) == 12, "packed");

DERIVE_HASH_TABLE(asahi_blit_key);

struct asahi_blitter {
   bool active;
   struct hash_table *blit_cs;

   /* [filter] */
   void *sampler[2];

   struct pipe_constant_buffer saved_cb;

   bool has_saved_image;
   struct pipe_image_view saved_image;

   unsigned saved_num_sampler_states;
   void *saved_sampler_states[PIPE_MAX_SAMPLERS];

   struct pipe_sampler_view *saved_sampler_view;

   void *saved_cs;
};

struct agx_oq_heap;

struct agx_context {
   struct pipe_context base;
   struct agx_compiled_shader *vs, *fs, *gs, *tcs;
   struct {
      struct agx_linked_shader *vs, *fs;
   } linked;
   uint32_t dirty;

   /* Heap for dynamic memory allocation for geometry/tessellation shaders */
   struct pipe_resource *heap;

   /* Occlusion query heap */
   struct agx_oq_heap *oq;

   /* Acts as a context-level shader key */
   bool support_lod_bias;
   bool robust;

   /* Set of batches. When full, the LRU entry (the batch with the smallest
    * seqnum) is flushed to free a slot.
    */
   struct {
      uint64_t seqnum;
      struct agx_batch slots[AGX_MAX_BATCHES];

      /** Set of active batches for faster traversal */
      BITSET_DECLARE(active, AGX_MAX_BATCHES);

      /** Set of submitted batches for faster traversal */
      BITSET_DECLARE(submitted, AGX_MAX_BATCHES);

      /* Monotonic counter for each batch incremented when resetting a batch to
       * invalidate all associated queries. Compared to
       * agx_query::writer_generation.
       */
      uint64_t generation[AGX_MAX_BATCHES];
   } batches;

   /* Queue handle */
   uint32_t queue_id;

   struct agx_batch *batch;
   struct agx_bo *result_buf;

   struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS];
   uint32_t vb_mask;

   unsigned patch_vertices;
   float default_outer_level[4];
   float default_inner_level[2];

   struct agx_stage stage[PIPE_SHADER_TYPES];
   struct agx_vertex_elements *attributes;
   struct agx_rasterizer *rast;
   struct agx_zsa *zs;
   struct agx_blend *blend;
   struct pipe_blend_color blend_color;
   struct pipe_viewport_state viewport[AGX_MAX_VIEWPORTS];
   struct pipe_scissor_state scissor[AGX_MAX_VIEWPORTS];
   struct pipe_stencil_ref stencil_ref;
   struct agx_streamout streamout;
   uint16_t sample_mask;
   struct pipe_framebuffer_state framebuffer;

   uint32_t poly_stipple[32];

   struct pipe_query *cond_query;
   bool cond_cond;
   enum pipe_render_cond_flag cond_mode;

   struct agx_query *occlusion_query;
   struct agx_query *prims_generated[4];
   struct agx_query *tf_prims_generated[4];
   struct agx_query *tf_overflow[4];
   struct agx_query *tf_any_overflow;
   struct agx_query *pipeline_statistics[PIPE_STAT_QUERY_TS_INVOCATIONS];
   struct agx_query *time_elapsed;
   bool active_queries;
   bool active_draw_without_restart;

   struct util_debug_callback debug;
   bool is_noop;

   bool in_tess;

   struct blitter_context *blitter;
   struct asahi_blitter compute_blitter;

   /* Map of GEM handle to (batch index + 1) that (conservatively) writes that
    * BO, or 0 if no writer.
    */
   struct util_dynarray writer;

   /* Bound CL global buffers */
   struct util_dynarray global_buffers;

   struct hash_table *generic_meta;
   struct agx_bg_eot_cache bg_eot;

   bool any_faults;

   uint32_t syncobj;
   uint32_t dummy_syncobj;
   int in_sync_fd;
   uint32_t in_sync_obj;
   uint64_t flush_last_seqid;
   uint64_t flush_my_seqid;
   uint64_t flush_other_seqid;

   struct agx_scratch scratch_vs;
   struct agx_scratch scratch_fs;
   struct agx_scratch scratch_cs;
};

static inline unsigned
agx_batch_idx(struct agx_batch *batch)
{
   return batch - batch->ctx->batches.slots;
}
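
/* Illustrative sketch of the LRU policy described on the batches struct above
 * (assumption: the real slot selection lives in the batch management code, and
 * this hypothetical helper is not part of the driver). When every slot is
 * active, the active batch with the smallest seqnum is the flush candidate.
 */
static inline struct agx_batch *
agx_lru_batch_example(struct agx_context *ctx)
{
   struct agx_batch *lru = NULL;
   unsigned i;

   /* Walk only the active slots via the bit set */
   BITSET_FOREACH_SET(i, ctx->batches.active, AGX_MAX_BATCHES) {
      struct agx_batch *candidate = &ctx->batches.slots[i];

      if (!lru || candidate->seqnum < lru->seqnum)
         lru = candidate;
   }

   return lru;
}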

static void
agx_writer_add(struct agx_context *ctx, uint8_t batch_index, unsigned handle)
{
   assert(batch_index < AGX_MAX_BATCHES && "invariant");
   static_assert(AGX_MAX_BATCHES < 0xFF, "no overflow on addition");

   /* If we need to grow, double the capacity so insertion is amortized O(1). */
   if (unlikely(handle >= ctx->writer.size)) {
      unsigned new_size =
         MAX2(ctx->writer.capacity * 2, util_next_power_of_two(handle + 1));
      unsigned grow = new_size - ctx->writer.size;

      memset(util_dynarray_grow(&ctx->writer, uint8_t, grow), 0,
             grow * sizeof(uint8_t));
   }

   /* There is now room */
   uint8_t *value = util_dynarray_element(&ctx->writer, uint8_t, handle);
   assert((*value) == 0 && "there should be no existing writer");
   *value = batch_index + 1;
}

static struct agx_batch *
agx_writer_get(struct agx_context *ctx, unsigned handle)
{
   if (handle >= ctx->writer.size)
      return NULL;

   uint8_t value = *util_dynarray_element(&ctx->writer, uint8_t, handle);

   if (value > 0)
      return &ctx->batches.slots[value - 1];
   else
      return NULL;
}

static void
agx_writer_remove(struct agx_context *ctx, unsigned handle)
{
   if (handle >= ctx->writer.size)
      return;

   uint8_t *value = util_dynarray_element(&ctx->writer, uint8_t, handle);
   *value = 0;
}

static inline struct agx_context *
agx_context(struct pipe_context *pctx)
{
   return (struct agx_context *)pctx;
}

struct agx_linked_shader;

typedef void (*meta_shader_builder_t)(struct nir_builder *b, const void *key);

void agx_init_meta_shaders(struct agx_context *ctx);

void agx_destroy_meta_shaders(struct agx_context *ctx);

struct agx_compiled_shader *agx_build_meta_shader(struct agx_context *ctx,
                                                  meta_shader_builder_t builder,
                                                  void *data, size_t data_size);

void agx_launch(struct agx_batch *batch, struct agx_grid grid,
                struct agx_workgroup wg, struct agx_compiled_shader *cs,
                struct agx_linked_shader *linked, enum pipe_shader_type stage,
                unsigned variable_shared_mem);

void agx_launch_precomp(struct agx_batch *batch, struct agx_grid grid,
                        enum agx_barrier barrier, enum libagx_program program,
                        void *args, size_t arg_size);

#define MESA_DISPATCH_PRECOMP agx_launch_precomp

void agx_init_query_functions(struct pipe_context *ctx);

void
agx_primitives_update_direct(struct agx_context *ctx,
                             const struct pipe_draw_info *info,
                             const struct pipe_draw_start_count_bias *draw);

void agx_draw_vbo_from_xfb(struct pipe_context *pctx,
                           const struct pipe_draw_info *info,
                           unsigned drawid_offset,
                           const struct pipe_draw_indirect_info *indirect);

uint64_t agx_batch_get_so_address(struct agx_batch *batch, unsigned buffer,
                                  uint32_t *size);

void agx_init_streamout_functions(struct pipe_context *ctx);

static inline void
agx_dirty_all(struct agx_context *ctx)
{
   ctx->dirty = ~0;

   for (unsigned i = 0; i < ARRAY_SIZE(ctx->stage); ++i)
      ctx->stage[i].dirty = ~0;
}

static inline void
agx_dirty_reset_graphics(struct agx_context *ctx)
{
   ctx->dirty = 0;

   for (unsigned i = 0; i < ARRAY_SIZE(ctx->stage); ++i) {
      if (i != PIPE_SHADER_COMPUTE)
         ctx->stage[i].dirty = 0;
   }
}

struct agx_rasterizer {
   struct pipe_rasterizer_state base;
   uint8_t cull[AGX_CULL_LENGTH];
   uint8_t line_width;
   uint8_t polygon_mode;
   bool depth_bias;
};

struct agx_query {
   unsigned type;
   unsigned index;

   uint64_t writer_generation[AGX_MAX_BATCHES];
   struct agx_bo *bo;
   struct agx_ptr ptr;
};

struct agx_sampler_state {
   struct pipe_sampler_state base;

   /* Prepared descriptor */
   struct agx_sampler_packed desc, desc_without_custom_border;

   /* Whether a custom border colour is required */
   bool uses_custom_border;

   /* Packed custom border colour, or zero if none is required */
   struct agx_border_packed border;

   /* LOD bias packed as fp16, the form we'll pass to the shader */
   uint16_t lod_bias_as_fp16;
};

struct agx_sampler_view {
   struct pipe_sampler_view base;

   /* Resource/format, may differ from base in case of separate stencil */
   struct agx_resource *rsrc;
   enum pipe_format format;

   /* Prepared descriptor */
   struct agx_texture_packed desc;
};

struct agx_screen {
   struct pipe_screen pscreen;
   struct agx_device dev;
   struct disk_cache *disk_cache;

   struct agx_bo *rodata;

   /* Shared timeline syncobj and value to serialize flushes across contexts */
   uint32_t flush_syncobj;
   uint64_t flush_cur_seqid;
   uint64_t flush_wait_seqid;
   /* Lock to protect flush_wait_seqid updates (reads are just atomic) */
   simple_mtx_t flush_seqid_lock;

   /* Lock to protect syncobj usage vs. destruction in context destroy */
   struct u_rwlock destroy_lock;
};

static inline struct agx_screen *
agx_screen(struct pipe_screen *p)
{
   return (struct agx_screen *)p;
}

static inline struct agx_device *
agx_device(struct pipe_screen *p)
{
   return &(agx_screen(p)->dev);
}

#define perf_debug(dev, ...)                                                   \
   do {                                                                        \
      if (unlikely((dev)->debug & AGX_DBG_PERF))                               \
         mesa_logw(__VA_ARGS__);                                               \
   } while (0)

#define perf_debug_ctx(ctx, ...)                                               \
   perf_debug(agx_device((ctx)->base.screen), __VA_ARGS__)
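
/* Usage sketch (illustrative): performance warnings are gated on the
 * AGX_DBG_PERF debug flag, so they are nearly free when disabled. The helper
 * below is hypothetical and only demonstrates a typical call site.
 */
static inline void
agx_perf_debug_example(struct agx_context *ctx, const char *reason)
{
   perf_debug_ctx(ctx, "Flushing batch due to: %s", reason);
}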

struct agx_resource {
   struct pipe_resource base;
   uint64_t modifier;

   /* Should probably be part of the modifier. Affects the tiling algorithm, or
    * something like that.
    */
   bool mipmapped;

   /* Hardware backing */
   struct agx_bo *bo;

   struct renderonly_scanout *scanout;

   BITSET_DECLARE(data_valid, PIPE_MAX_TEXTURE_LEVELS);

   struct ail_layout layout;

   /* Metal does not support packed depth/stencil formats; presumably AGX does
    * not either. Instead, we create separate depth and stencil resources,
    * managed by u_transfer_helper. We provide the illusion of packed
    * resources.
    */
   struct agx_resource *separate_stencil;

   /* Valid buffer range tracking, to optimize buffer appends */
   struct util_range valid_buffer_range;

   /* Cumulative shadowed byte count for this resource, that is, the number of
    * times the resource has been shadowed multiplied by the resource size.
    */
   size_t shadowed_bytes;
};

static inline struct agx_resource *
agx_resource(struct pipe_resource *pctx)
{
   return (struct agx_resource *)pctx;
}

static inline bool
agx_resource_valid(struct agx_resource *rsrc, int level)
{
   /* Shared BOs can always be potentially valid */
   if (rsrc->bo && rsrc->bo->flags & AGX_BO_SHARED) {
      assert(level == 0);
      return true;
   }

   return BITSET_TEST(rsrc->data_valid, level);
}

static inline void *
agx_map_texture_cpu(struct agx_resource *rsrc, unsigned level, unsigned z)
{
   return ((uint8_t *)agx_bo_map(rsrc->bo)) +
          ail_get_layer_level_B(&rsrc->layout, z, level);
}

static inline uint64_t
agx_map_texture_gpu(struct agx_resource *rsrc, unsigned z)
{
   return rsrc->bo->va->addr +
          (uint64_t)ail_get_layer_offset_B(&rsrc->layout, z);
}

void agx_decompress(struct agx_context *ctx, struct agx_resource *rsrc,
                    const char *reason);

void agx_legalize_compression(struct agx_context *ctx,
                              struct agx_resource *rsrc,
                              enum pipe_format format);

struct agx_transfer {
   struct pipe_transfer base;
   void *map;
   struct {
      struct pipe_resource *rsrc;
      struct pipe_box box;
   } staging;
};

static inline struct agx_transfer *
agx_transfer(struct pipe_transfer *p)
{
   return (struct agx_transfer *)p;
}

void agx_upload_vbos(struct agx_batch *batch);
void agx_upload_uniforms(struct agx_batch *batch);

void agx_set_sampler_uniforms(struct agx_batch *batch,
                              enum pipe_shader_type stage);

void agx_set_cbuf_uniforms(struct agx_batch *batch,
                           enum pipe_shader_type stage);

void agx_set_ssbo_uniforms(struct agx_batch *batch,
                           enum pipe_shader_type stage);

bool agx_nir_lower_point_size(nir_shader *nir, bool insert_write);

bool agx_nir_lower_sysvals(nir_shader *shader, enum pipe_shader_type desc_stage,
                           bool lower_draw_params);

bool agx_nir_layout_uniforms(nir_shader *shader,
                             struct agx_compiled_shader *compiled,
                             unsigned *push_size);

bool agx_nir_lower_bindings(nir_shader *shader, bool *uses_bindless_samplers);

bool agx_batch_is_active(struct agx_batch *batch);
bool agx_batch_is_submitted(struct agx_batch *batch);

/* Add a BO to a batch. This needs to be amortized O(1) since it's called in
 * hot paths. To achieve this we model BO lists by bit sets */

static bool
agx_batch_uses_bo(struct agx_batch *batch, struct agx_bo *bo)
{
   if (bo->handle < batch->bo_list.bit_count)
      return BITSET_TEST(batch->bo_list.set, bo->handle);
   else
      return false;
}

static inline void
agx_batch_add_bo_internal(struct agx_batch *batch, struct agx_bo *bo)
{
   /* Double the size of the BO list if we run out, this is amortized O(1) */
   if (unlikely(bo->handle >= batch->bo_list.bit_count)) {
      const unsigned bits_per_word = sizeof(BITSET_WORD) * 8;

      unsigned bit_count =
         MAX2(batch->bo_list.bit_count * 2,
              util_next_power_of_two(ALIGN_POT(bo->handle + 1, bits_per_word)));

      batch->bo_list.set = rerzalloc(
         batch->ctx, batch->bo_list.set, BITSET_WORD,
         batch->bo_list.bit_count / bits_per_word, bit_count / bits_per_word);
      batch->bo_list.bit_count = bit_count;
   }

   if (BITSET_TEST(batch->bo_list.set, bo->handle))
      return;

   /* The batch holds a single reference to each BO in the batch, released when
    * the batch finishes execution.
    */
   agx_bo_reference(bo);
   BITSET_SET(batch->bo_list.set, bo->handle);
}

static inline void
agx_batch_add_bo(struct agx_batch *batch, struct agx_bo *bo)
{
   agx_batch_add_bo_internal(batch, bo);
   assert(agx_batch_uses_bo(batch, bo));
}

#define AGX_BATCH_FOREACH_BO_HANDLE(batch, handle)                             \
   BITSET_FOREACH_SET(handle, (batch)->bo_list.set, batch->bo_list.bit_count)
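
/* Illustrative only (a hypothetical helper, not used by the driver): the
 * foreach macro above walks every GEM handle tracked in the batch's bit set,
 * e.g. to size or build the kernel resource list.
 */
static inline unsigned
agx_batch_num_bos_example(struct agx_batch *batch)
{
   unsigned handle, count = 0;

   AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) {
      count++;
   }

   return count;
}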

struct drm_asahi_cmd_compute;
struct drm_asahi_cmd_render;

void agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch,
                      struct drm_asahi_cmd_compute *compute,
                      struct drm_asahi_cmd_render *render);

void agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch);
void agx_flush_batch_for_reason(struct agx_context *ctx,
                                struct agx_batch *batch, const char *reason);
void agx_flush_all(struct agx_context *ctx, const char *reason);
void agx_flush_readers(struct agx_context *ctx, struct agx_resource *rsrc,
                       const char *reason);
void agx_flush_writer(struct agx_context *ctx, struct agx_resource *rsrc,
                      const char *reason);

void agx_sync_writer(struct agx_context *ctx, struct agx_resource *rsrc,
                     const char *reason);
void agx_sync_readers(struct agx_context *ctx, struct agx_resource *rsrc,
                      const char *reason);
void agx_sync_batch(struct agx_context *ctx, struct agx_batch *batch);
void agx_sync_all(struct agx_context *ctx, const char *reason);
void agx_sync_batch_for_reason(struct agx_context *ctx, struct agx_batch *batch,
                               const char *reason);
void agx_memory_barrier(struct pipe_context *pctx, unsigned flags);

/* Use these instead of batch_add_bo for proper resource tracking */
void agx_batch_reads(struct agx_batch *batch, struct agx_resource *rsrc);
void agx_batch_writes(struct agx_batch *batch, struct agx_resource *rsrc,
                      unsigned level);
void agx_batch_writes_range(struct agx_batch *batch, struct agx_resource *rsrc,
                            unsigned offset, unsigned size);
void agx_batch_track_image(struct agx_batch *batch,
                           struct pipe_image_view *image);

bool agx_any_batch_uses_resource(struct agx_context *ctx,
                                 struct agx_resource *rsrc);

/* 16384 is the maximum framebuffer dimension, so we use a larger width (the
 * maximum uint16_t) as a sentinel to identify the compute batch. This ensures
 * compute batches don't mix with graphics. This is a bit of a hack but it
 * works.
 */
#define AGX_COMPUTE_BATCH_WIDTH 0xFFFF

static inline bool
agx_batch_is_compute(struct agx_batch *batch)
{
   return batch->key.width == AGX_COMPUTE_BATCH_WIDTH;
}

struct agx_batch *agx_get_batch(struct agx_context *ctx);
struct agx_batch *agx_get_compute_batch(struct agx_context *ctx);
void agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch);
int agx_cleanup_batches(struct agx_context *ctx);

void agx_batch_add_timestamp_query(struct agx_batch *batch,
                                   struct agx_query *q);
void agx_add_timestamp_end_query(struct agx_context *ctx, struct agx_query *q);

void agx_query_increment_cpu(struct agx_context *ctx, struct agx_query *query,
                             uint64_t increment);

enum asahi_blitter_op /* bitmask */
{
   ASAHI_SAVE_TEXTURES = 1,
   ASAHI_SAVE_FRAMEBUFFER = 2,
   ASAHI_SAVE_FRAGMENT_STATE = 4,
   ASAHI_SAVE_FRAGMENT_CONSTANT = 8,
   ASAHI_DISABLE_RENDER_COND = 16,
};

enum {
   ASAHI_CLEAR = ASAHI_SAVE_FRAGMENT_STATE | ASAHI_SAVE_FRAGMENT_CONSTANT,

   ASAHI_BLIT =
      ASAHI_SAVE_FRAMEBUFFER | ASAHI_SAVE_TEXTURES | ASAHI_SAVE_FRAGMENT_STATE,
};

void agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter,
                      enum asahi_blitter_op op);

void agx_blit(struct pipe_context *pipe, const struct pipe_blit_info *info);

void agx_resource_copy_region(struct pipe_context *pctx,
                              struct pipe_resource *dst, unsigned dst_level,
                              unsigned dstx, unsigned dsty, unsigned dstz,
                              struct pipe_resource *src, unsigned src_level,
                              const struct pipe_box *src_box);

/* Batch logic */

struct agx_encoder agx_encoder_allocate(struct agx_batch *batch,
                                        struct agx_device *dev);

void agx_batch_init_state(struct agx_batch *batch);

struct asahi_bg_eot {
   uint64_t usc;
   struct agx_counts_packed counts;
};

struct asahi_bg_eot agx_build_bg_eot(struct agx_batch *batch, bool store,
                                     bool partial_render);

/* Query management */
uint16_t agx_get_oq_index(struct agx_batch *batch, struct agx_query *query);
uint64_t agx_get_query_address(struct agx_batch *batch,
                               struct agx_query *query);
uint64_t agx_get_occlusion_heap(struct agx_batch *batch);

void agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts,
                              uint64_t end_ts);

bool agx_render_condition_check_inner(struct agx_context *ctx);

static inline bool
agx_render_condition_check(struct agx_context *ctx)
{
   if (likely(!ctx->cond_query))
      return true;
   else
      return agx_render_condition_check_inner(ctx);
}

static inline uint32_t
agx_texture_buffer_size_el(enum pipe_format format, uint32_t size)
{
   unsigned blocksize = util_format_get_blocksize(format);

   return MIN2(AGX_TEXTURE_BUFFER_MAX_SIZE, size / blocksize);
}

void agx_decompress_inplace(struct agx_batch *batch, struct pipe_surface *surf,
                            const char *reason);