/*
 * Copyright 2021 Alyssa Rosenzweig
 * Copyright 2019-2021 Collabora, Ltd.
 * SPDX-License-Identifier: MIT
 */

#pragma once

#include <xf86drm.h>
#include "asahi/compiler/agx_compile.h"
#include "asahi/genxml/agx_pack.h"
#include "asahi/layout/layout.h"
#include "asahi/lib/agx_bo.h"
#include "asahi/lib/agx_device.h"
#include "asahi/lib/agx_linker.h"
#include "asahi/lib/agx_nir_lower_vbo.h"
#include "asahi/lib/agx_scratch.h"
#include "asahi/lib/agx_tilebuffer.h"
#include "asahi/lib/agx_uvs.h"
#include "asahi/lib/pool.h"
#include "asahi/lib/unstable_asahi_drm.h"
#include "asahi/libagx/geometry.h"
#include "compiler/shader_enums.h"
#include "gallium/auxiliary/util/u_blitter.h"
#include "gallium/include/pipe/p_context.h"
#include "gallium/include/pipe/p_screen.h"
#include "gallium/include/pipe/p_state.h"
#include "pipe/p_defines.h"
#include "util/bitset.h"
#include "util/disk_cache.h"
#include "util/hash_table.h"
#include "util/rwlock.h"
#include "util/u_range.h"
#include "agx_bg_eot.h"
#include "agx_helpers.h"
#include "agx_nir_texture.h"

#ifdef __GLIBC__
#include <errno.h>
#define agx_msg(fmt, ...)                                                      \
   fprintf(stderr, "[%s] " fmt, program_invocation_short_name, ##__VA_ARGS__)
#else
#define agx_msg(...) fprintf(stderr, __VA_ARGS__)
#endif

#define AGX_NUM_TEXTURE_STATE_REGS 16

struct agx_streamout_target {
   struct pipe_stream_output_target base;
   struct pipe_resource *offset;

   /* Current stride (bytes per vertex) */
   uint32_t stride;
};

static inline struct agx_streamout_target *
agx_so_target(struct pipe_stream_output_target *target)
{
   return (struct agx_streamout_target *)target;
}

struct agx_streamout {
   struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS];
   unsigned num_targets;
};

/* Shaders can access fixed-function state through system values.
 * It is convenient to stash all of this information into a single "root"
 * descriptor, then push individual parts as needed.
 *
 * In the future, we could optimize this to reduce CPU overhead, e.g. splitting
 * into multiple descriptors for finer dirty tracking. This is not ABI with the
 * compiler. The layout is up to us and handled by our code lowering system
 * values to uniforms.
 */
enum agx_sysval_table {
   AGX_SYSVAL_TABLE_ROOT,
   AGX_SYSVAL_TABLE_PARAMS,
   AGX_SYSVAL_TABLE_GRID,
   AGX_SYSVAL_TABLE_VS,
   AGX_SYSVAL_TABLE_TCS,
   AGX_SYSVAL_TABLE_TES,
   AGX_SYSVAL_TABLE_GS,
   AGX_SYSVAL_TABLE_FS,
   AGX_SYSVAL_TABLE_CS,
   AGX_NUM_SYSVAL_TABLES
};

#define AGX_SYSVAL_STAGE(stage) (AGX_SYSVAL_TABLE_VS + (stage))

static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_VERTEX) == AGX_SYSVAL_TABLE_VS,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_TESS_CTRL) == AGX_SYSVAL_TABLE_TCS,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_TESS_EVAL) == AGX_SYSVAL_TABLE_TES,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_GEOMETRY) == AGX_SYSVAL_TABLE_GS,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_FRAGMENT) == AGX_SYSVAL_TABLE_FS,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_COMPUTE) == AGX_SYSVAL_TABLE_CS,
              "fixed enum orderings");

/* Root system value table */
struct PACKED agx_draw_uniforms {
   /* Pointers to the system value tables themselves (for indirection) */
   uint64_t tables[AGX_NUM_SYSVAL_TABLES];

   /* Vertex buffer object bases, if present. If vertex robustness is disabled,
    * attrib_base maps VBOs directly and attrib_clamp is undefined. If vertex
    * robustness is enabled, attrib_base maps attributes and attrib_clamp is an
    * inclusive clamp on vertex/divided instance indices.
    */
   uint64_t attrib_base[PIPE_MAX_ATTRIBS];
   uint32_t attrib_clamp[PIPE_MAX_ATTRIBS];

   /* Addresses for the results of pipeline statistics queries */
   uint64_t pipeline_statistics[PIPE_STAT_QUERY_MS_INVOCATIONS];

   /* Pointer to base address of the VS->TCS, VS->GS, or TES->GS buffer.
    * Indirected so it can be written to in an indirect setup kernel. G13
    * appears to prefetch uniforms across dispatches, but does not pre-run
    * preambles, so this indirection saves us from splitting the batch.
    */
   uint64_t vertex_output_buffer_ptr;

   /* Mask of outputs flowing VS->TCS, VS->GS, or TES->GS. */
   uint64_t vertex_outputs;

   /* Address of input assembly buffer if geom/tess is used, else 0 */
   uint64_t input_assembly;

   /* Address of tessellation param buffer if tessellation is used, else 0 */
   uint64_t tess_params;

   /* Address of geometry param buffer if geometry shaders are used, else 0 */
   uint64_t geometry_params;

   /* Address of polygon stipple mask if used */
   uint64_t polygon_stipple;

   /* Blend constant if any */
   float blend_constant[4];

   /* glPointSize value */
   float fixed_point_size;

   /* Value of the multisample control register, containing sample positions in
    * each byte (x in low nibble, y in high nibble).
    */
   uint32_t ppp_multisamplectl;

   /* gl_DrawID for a direct multidraw */
   uint32_t draw_id;

   /* Sprite coord replacement mask */
   uint16_t sprite_mask;

   /* glSampleMask */
   uint16_t sample_mask;

   /* Nonzero for indexed draws, zero otherwise */
   uint16_t is_indexed_draw;

   /* Zero for [0, 1] clipping, 0.5 for [-1, 1] clipping. */
   uint16_t clip_z_coeff;

   /* ~0/0 boolean: whether the epilog lacks any discard instruction */
   uint16_t no_epilog_discard;

   /* Provoking vertex: 0, 1, 2 */
   uint16_t provoking_vertex;

   /* Mapping from varying slots written by the last vertex stage to UVS
    * indices. This mapping must be compatible with the fragment shader.
    */
   uint16_t uvs_index[VARYING_SLOT_MAX];
};
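/* Illustrative sketch only (not part of the driver): with vertex robustness
 * enabled, a lowered vertex fetch could clamp the per-attribute index with
 * attrib_clamp before forming the address. Variable names are hypothetical.
 *
 *    uint32_t idx = MIN2(index, u->attrib_clamp[a]);
 *    uint64_t addr = u->attrib_base[a] + (uint64_t)idx * stride;
 */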

struct PACKED agx_stage_uniforms {
   /* Pointer to the binding table of texture descriptors, or 0 if none. This
    * must be first so that u0_u1 is always available for lowering binding
    * tables to bindless access.
    */
   uint64_t texture_base;

   /* Uniform buffer objects */
   uint64_t ubo_base[PIPE_MAX_CONSTANT_BUFFERS];
   uint32_t ubo_size[PIPE_MAX_CONSTANT_BUFFERS];

   /* Shader storage buffer objects */
   uint64_t ssbo_base[PIPE_MAX_SHADER_BUFFERS];
   uint32_t ssbo_size[PIPE_MAX_SHADER_BUFFERS];

   /* If lowered to bindless, sampler index in the heap */
   uint16_t sampler_handle[PIPE_MAX_SAMPLERS];

   /* LOD bias as float16 */
   uint16_t lod_bias[PIPE_MAX_SAMPLERS];
};

/* The architecture has 512 uniform registers, each 16 bits wide. In a
 * theoretical worst case, we could push to all of them. We use this worst-case
 * maximum because the expression for a tight upper bound is too messy and too
 * easy to let go out of sync with the code.
 */
#define AGX_MAX_PUSH_RANGES (512)

struct agx_push_range {
   /* Base 16-bit uniform to push to */
   uint16_t uniform;

   /* Offset into the table to push in bytes */
   uint16_t offset;

   /* Which table to push from */
   uint8_t table;

   /* Number of consecutive 16-bit uniforms to push */
   uint8_t length;
};
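/* Illustrative sketch only (hypothetical names, not the driver's uploader):
 * pushing a shader's ranges amounts to copying `length` 16-bit words from the
 * CPU copy of each system value table into the uniform file image, starting
 * at the given uniform register.
 *
 *    for (unsigned i = 0; i < cs->push_range_count; ++i) {
 *       const struct agx_push_range *r = &cs->push[i];
 *
 *       memcpy(&uniform_words[r->uniform],
 *              (uint8_t *)table_cpu[r->table] + r->offset,
 *              r->length * sizeof(uint16_t));
 *    }
 */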

struct agx_compiled_shader {
   /* Base struct */
   struct agx_shader_part b;

   /* Uncompiled shader that we belong to */
   const struct agx_uncompiled_shader *so;

   /* Mapped executable memory */
   struct agx_bo *bo;

   /* Uniforms the driver must push */
   unsigned push_range_count;
   struct agx_push_range push[AGX_MAX_PUSH_RANGES];

   /* UVS layout for the last vertex stage */
   struct agx_unlinked_uvs_layout uvs;

   /* For a vertex shader, the mask of vertex attributes read. Used to key the
    * prolog so the prolog doesn't write components not actually read.
    */
   BITSET_DECLARE(attrib_components_read, AGX_MAX_ATTRIBS * 4);

   struct agx_fs_epilog_link_info epilog_key;

   /* Auxiliary programs, or NULL if not used */
   struct agx_compiled_shader *gs_count, *pre_gs;
   struct agx_compiled_shader *gs_copy;

   /* Output primitive mode for geometry shaders */
   enum mesa_prim gs_output_mode;

   /* Number of words per primitive in the count buffer */
   unsigned gs_count_words;

   /* Logical shader stage used for descriptor access. This may differ from the
    * physical shader stage of the compiled shader, for example when executing a
    * tessellation eval shader as a vertex shader.
    */
   enum pipe_shader_type stage;
};

struct agx_fast_link_key {
   union {
      struct agx_vs_prolog_key vs;
      struct agx_fs_prolog_key fs;
   } prolog;

   struct agx_compiled_shader *main;

   union {
      struct agx_fs_epilog_key fs;
   } epilog;

   unsigned nr_samples_shaded;
};

struct agx_uncompiled_shader {
   struct pipe_shader_state base;
   enum pipe_shader_type type;
   struct blob early_serialized_nir;
   struct blob serialized_nir;
   uint8_t nir_sha1[20];

   struct {
      uint64_t inputs_flat_shaded;
      uint64_t inputs_linear_shaded;
      uint8_t cull_distance_size;
      bool has_edgeflags;
      bool uses_fbfetch;

      /* Number of bindful textures, images used */
      unsigned nr_bindful_textures, nr_bindful_images;
   } info;

   struct hash_table *variants;
   struct agx_uncompiled_shader *passthrough_progs[MESA_PRIM_COUNT][3][2];
   struct agx_uncompiled_shader *passthrough_tcs[32];

   /* agx_fast_link_key -> agx_linked_shader */
   struct hash_table *linked_shaders;

   uint32_t xfb_strides[4];
   bool has_xfb_info;
   bool is_xfb_passthrough;

   enum mesa_prim gs_mode;

   /* Whether the shader accesses indexed samplers via the bindless heap */
   bool uses_bindless_samplers;

   /* Set on VS, passed to FS for linkage */
   unsigned base_varying;

   /* Tessellation info */
   struct {
      uint64_t per_vertex_outputs;
      uint32_t output_stride;
      enum gl_tess_spacing spacing;
      enum tess_primitive_mode primitive;
      uint8_t output_patch_size;
      uint8_t nr_patch_outputs;
      bool ccw;
      bool point_mode;
   } tess;
};

enum agx_stage_dirty {
   AGX_STAGE_DIRTY_CONST = BITFIELD_BIT(0),
   AGX_STAGE_DIRTY_SSBO = BITFIELD_BIT(1),
   AGX_STAGE_DIRTY_IMAGE = BITFIELD_BIT(2),
   AGX_STAGE_DIRTY_SAMPLER = BITFIELD_BIT(3),
};

struct agx_stage {
   struct agx_uncompiled_shader *shader;
   uint32_t dirty;

   struct pipe_constant_buffer cb[PIPE_MAX_CONSTANT_BUFFERS];
   uint32_t cb_mask;

   struct pipe_shader_buffer ssbo[PIPE_MAX_SHADER_BUFFERS];
   uint32_t ssbo_writable_mask;
   uint32_t ssbo_mask;

   struct pipe_image_view images[PIPE_MAX_SHADER_IMAGES];
   uint32_t image_mask;

   /* Need full CSOs for u_blitter */
   struct agx_sampler_state *samplers[PIPE_MAX_SAMPLERS];
   struct agx_sampler_view *textures[PIPE_MAX_SHADER_SAMPLER_VIEWS];

   /* Does any bound sampler require custom border colours? */
   bool custom_borders;

   unsigned sampler_count, texture_count;
   uint32_t valid_samplers;
};

union agx_batch_result {
   struct drm_asahi_result_render render;
   struct drm_asahi_result_compute compute;
};

/* This is a firmware limit. It should be possible to raise to 2048 in the
 * future... still not good enough for VK though :-(
 */
#define AGX_SAMPLER_HEAP_SIZE (1024)

struct agx_sampler_heap {
   struct agx_bo *bo;
   uint16_t count;
};

uint16_t agx_sampler_heap_add(struct agx_device *dev,
                              struct agx_sampler_heap *heap,
                              struct agx_sampler_packed *sampler);

struct agx_encoder {
   struct agx_bo *bo;
   uint8_t *current;
   uint8_t *end;
};

struct agx_batch {
   struct agx_context *ctx;
   struct pipe_framebuffer_state key;
   uint64_t seqnum;
   uint32_t syncobj;
   uint32_t draws;

   struct agx_tilebuffer_layout tilebuffer_layout;

   /* PIPE_CLEAR_* bitmask */
   uint32_t clear, draw, load, resolve, feedback;
   bool initialized;

   uint64_t uploaded_clear_color[PIPE_MAX_COLOR_BUFS];
   double clear_depth;
   unsigned clear_stencil;

   /* Whether we're drawing points, lines, or triangles */
   enum mesa_prim reduced_prim;

   /* Whether the bound FS needs a primitive ID that is not supplied by the
    * bound hardware VS (software GS)
    */
   bool generate_primitive_id;

   /* Current varyings linkage structures */
   uint32_t varyings;
   struct agx_varyings_vs linked_varyings;

   struct agx_draw_uniforms uniforms;
   struct agx_stage_uniforms stage_uniforms[PIPE_SHADER_TYPES];

   /* Indirect buffer allocated for geometry shader */
   uint64_t geom_indirect;
   struct agx_bo *geom_indirect_bo;

   /* Geometry state buffer if geometry/etc shaders are used */
   uint64_t geometry_state;

   /* Uploaded descriptors */
   uint32_t texture_count[PIPE_SHADER_TYPES];

   uint64_t samplers[PIPE_SHADER_TYPES];
   uint32_t sampler_count[PIPE_SHADER_TYPES];

   struct agx_sampler_heap sampler_heap;

   /* Resource list requirements, represented as a bit set indexed by BO
    * handles (GEM handles on Linux, or IOGPU's equivalent on macOS)
    */
   struct {
      BITSET_WORD *set;
      unsigned bit_count;
   } bo_list;

   /* If true, this batch contains a shader with a potentially incoherent write
    * (e.g. image_write), needing a barrier later to access.
    */
   bool incoherent_writes;

   struct agx_pool pool, pipeline_pool;

   /* We may enqueue both CDM and VDM work, possibly to the same batch for
    * geometry/tessellation.
    */
   struct agx_encoder vdm;
   struct agx_encoder cdm;

   /* Scissor and depth-bias descriptors, uploaded at GPU time */
   struct util_dynarray scissor, depth_bias;

   /* Arrays of GPU pointers that should be written with the batch timestamps */
   struct util_dynarray timestamps;

   /* Result buffer where the kernel places command execution information */
   union agx_batch_result *result;
   size_t result_off;

   /* Actual pointer in a uniform */
   struct agx_bo *geom_params_bo;

   /* Whether each stage uses scratch */
   bool vs_scratch;
   bool fs_scratch;
   bool cs_scratch;

   /* Whether each stage has preambles using scratch, and if so which bucket.
    * This just needs to be zero/nonzero for correctness, the magnitude in
    * buckets is for statistics.
    */
   unsigned vs_preamble_scratch;
   unsigned fs_preamble_scratch;
   unsigned cs_preamble_scratch;
};

struct agx_zsa {
   struct pipe_depth_stencil_alpha_state base;
   struct agx_fragment_face_packed depth;
   struct agx_fragment_stencil_packed front_stencil, back_stencil;

   /* PIPE_CLEAR_* bitmask corresponding to this depth/stencil state */
   uint32_t load, store;
};

struct agx_blend {
   struct agx_blend_key key;

   /* PIPE_CLEAR_* bitmask corresponding to this blend state */
   uint32_t store;
};

struct asahi_vs_shader_key {
   /* If true, this is running as a hardware vertex shader. If false, this is a
    * compute job used to feed a TCS or GS.
    */
   bool hw;
};

struct agx_vertex_elements {
   unsigned num_attribs;
   struct agx_velem_key key[PIPE_MAX_ATTRIBS];

   /* These parts do not affect the generated code so are not in the key */
   uint16_t src_offsets[PIPE_MAX_ATTRIBS];
   uint16_t buffers[PIPE_MAX_ATTRIBS];
};

struct asahi_fs_shader_key {
   enum pipe_format rt_formats[PIPE_MAX_COLOR_BUFS];
   uint8_t nr_samples;
   bool padding[7];
};
static_assert(sizeof(struct asahi_fs_shader_key) == 40, "no holes");

struct asahi_gs_shader_key {
   /* If true, this GS is run only for its side effects (including XFB) */
   bool rasterizer_discard;
   bool padding[7];
};
static_assert(sizeof(struct asahi_gs_shader_key) == 8, "no holes");

union asahi_shader_key {
   struct asahi_vs_shader_key vs;
   struct asahi_gs_shader_key gs;
   struct asahi_fs_shader_key fs;
};

enum agx_dirty {
   AGX_DIRTY_VERTEX = BITFIELD_BIT(0),
   AGX_DIRTY_VIEWPORT = BITFIELD_BIT(1),
   AGX_DIRTY_SCISSOR_ZBIAS = BITFIELD_BIT(2),
   AGX_DIRTY_ZS = BITFIELD_BIT(3),
   AGX_DIRTY_STENCIL_REF = BITFIELD_BIT(4),
   AGX_DIRTY_RS = BITFIELD_BIT(5),
   AGX_DIRTY_SPRITE_COORD_MODE = BITFIELD_BIT(6),
   AGX_DIRTY_PRIM = BITFIELD_BIT(7),

   /* Vertex/fragment pipelines, including uniforms and textures */
   AGX_DIRTY_VS = BITFIELD_BIT(8),
   AGX_DIRTY_FS = BITFIELD_BIT(9),

   /* Just the progs themselves */
   AGX_DIRTY_VS_PROG = BITFIELD_BIT(10),
   AGX_DIRTY_FS_PROG = BITFIELD_BIT(11),

   AGX_DIRTY_BLEND = BITFIELD_BIT(12),
   AGX_DIRTY_QUERY = BITFIELD_BIT(13),
   AGX_DIRTY_XFB = BITFIELD_BIT(14),
   AGX_DIRTY_SAMPLE_MASK = BITFIELD_BIT(15),
   AGX_DIRTY_BLEND_COLOR = BITFIELD_BIT(16),
   AGX_DIRTY_POLY_STIPPLE = BITFIELD_BIT(17),
};

/* Maximum number of in-progress + under-construction GPU batches.
 * Must be large enough for silly workloads that do things like
 * glGenerateMipmap on every frame, otherwise we end up losing performance.
 */
#define AGX_MAX_BATCHES (128)

static_assert(PIPE_TEX_FILTER_NEAREST < 2, "known order");
static_assert(PIPE_TEX_FILTER_LINEAR < 2, "known order");

enum asahi_blit_clamp {
   ASAHI_BLIT_CLAMP_NONE,
   ASAHI_BLIT_CLAMP_UINT_TO_SINT,
   ASAHI_BLIT_CLAMP_SINT_TO_UINT,

   /* keep last */
   ASAHI_BLIT_CLAMP_COUNT,
};

struct asahi_blit_key {
   enum pipe_format src_format, dst_format;
   bool array;
   bool aligned;
   bool pad[2];
};
static_assert(sizeof(struct asahi_blit_key) == 12, "packed");

DERIVE_HASH_TABLE(asahi_blit_key);

struct asahi_blitter {
   bool active;
   struct hash_table *blit_cs;

   /* [filter] */
   void *sampler[2];

   struct pipe_constant_buffer saved_cb;

   bool has_saved_image;
   struct pipe_image_view saved_image;

   unsigned saved_num_sampler_states;
   void *saved_sampler_states[PIPE_MAX_SAMPLERS];

   struct pipe_sampler_view *saved_sampler_view;

   void *saved_cs;
};

struct agx_oq_heap;

struct agx_context {
   struct pipe_context base;
   struct agx_compiled_shader *vs, *fs, *gs, *tcs;
   struct {
      struct agx_linked_shader *vs, *fs;
   } linked;
   uint32_t dirty;

   /* Heap for dynamic memory allocation for geometry/tessellation shaders */
   struct pipe_resource *heap;

   /* Occlusion query heap */
   struct agx_oq_heap *oq;

   /* Acts as a context-level shader key */
   bool support_lod_bias;
   bool robust;

   /* Set of batches. When full, the LRU entry (the batch with the smallest
    * seqnum) is flushed to free a slot.
    */
   struct {
      uint64_t seqnum;
      struct agx_batch slots[AGX_MAX_BATCHES];

      /** Set of active batches for faster traversal */
      BITSET_DECLARE(active, AGX_MAX_BATCHES);

      /** Set of submitted batches for faster traversal */
      BITSET_DECLARE(submitted, AGX_MAX_BATCHES);

      /* Monotonic counter for each batch incremented when resetting a batch to
       * invalidate all associated queries. Compared to
       * agx_query::writer_generation.
       */
      uint64_t generation[AGX_MAX_BATCHES];
   } batches;

   /* Queue handle */
   uint32_t queue_id;

   struct agx_batch *batch;
   struct agx_bo *result_buf;

   struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS];
   uint32_t vb_mask;

   unsigned patch_vertices;
   float default_outer_level[4];
   float default_inner_level[2];

   struct agx_stage stage[PIPE_SHADER_TYPES];
   struct agx_vertex_elements *attributes;
   struct agx_rasterizer *rast;
   struct agx_zsa *zs;
   struct agx_blend *blend;
   struct pipe_blend_color blend_color;
   struct pipe_viewport_state viewport[AGX_MAX_VIEWPORTS];
   struct pipe_scissor_state scissor[AGX_MAX_VIEWPORTS];
   struct pipe_stencil_ref stencil_ref;
   struct agx_streamout streamout;
   uint16_t sample_mask;
   struct pipe_framebuffer_state framebuffer;

   uint32_t poly_stipple[32];

   struct pipe_query *cond_query;
   bool cond_cond;
   enum pipe_render_cond_flag cond_mode;

   struct agx_query *occlusion_query;
   struct agx_query *prims_generated[4];
   struct agx_query *tf_prims_generated[4];
   struct agx_query *tf_overflow[4];
   struct agx_query *tf_any_overflow;
   struct agx_query *pipeline_statistics[PIPE_STAT_QUERY_TS_INVOCATIONS];
   struct agx_query *time_elapsed;
   bool active_queries;
   bool active_draw_without_restart;

   struct util_debug_callback debug;
   bool is_noop;

   bool in_tess;

   struct blitter_context *blitter;
   struct asahi_blitter compute_blitter;

   /* Map of GEM handle to (batch index + 1) that (conservatively) writes that
    * BO, or 0 if no writer.
    */
   struct util_dynarray writer;

   /* Bound CL global buffers */
   struct util_dynarray global_buffers;

   struct hash_table *generic_meta;
   struct agx_bg_eot_cache bg_eot;

   bool any_faults;

   uint32_t syncobj;
   uint32_t dummy_syncobj;
   int in_sync_fd;
   uint32_t in_sync_obj;
   uint64_t flush_last_seqid;
   uint64_t flush_my_seqid;
   uint64_t flush_other_seqid;

   struct agx_scratch scratch_vs;
   struct agx_scratch scratch_fs;
   struct agx_scratch scratch_cs;
};

static inline unsigned
agx_batch_idx(struct agx_batch *batch)
{
   return batch - batch->ctx->batches.slots;
}

static void
agx_writer_add(struct agx_context *ctx, uint8_t batch_index, unsigned handle)
{
   assert(batch_index < AGX_MAX_BATCHES && "invariant");
   static_assert(AGX_MAX_BATCHES < 0xFF, "no overflow on addition");

   /* If we need to grow, double the capacity so insertion is amortized O(1). */
   if (unlikely(handle >= ctx->writer.size)) {
      unsigned new_size =
         MAX2(ctx->writer.capacity * 2, util_next_power_of_two(handle + 1));
      unsigned grow = new_size - ctx->writer.size;

      memset(util_dynarray_grow(&ctx->writer, uint8_t, grow), 0,
             grow * sizeof(uint8_t));
   }

   /* There is now room */
   uint8_t *value = util_dynarray_element(&ctx->writer, uint8_t, handle);
   assert((*value) == 0 && "there should be no existing writer");
   *value = batch_index + 1;
}

static struct agx_batch *
agx_writer_get(struct agx_context *ctx, unsigned handle)
{
   if (handle >= ctx->writer.size)
      return NULL;

   uint8_t value = *util_dynarray_element(&ctx->writer, uint8_t, handle);

   if (value > 0)
      return &ctx->batches.slots[value - 1];
   else
      return NULL;
}

static void
agx_writer_remove(struct agx_context *ctx, unsigned handle)
{
   if (handle >= ctx->writer.size)
      return;

   uint8_t *value = util_dynarray_element(&ctx->writer, uint8_t, handle);
   *value = 0;
}

static inline struct agx_context *
agx_context(struct pipe_context *pctx)
{
   return (struct agx_context *)pctx;
}

struct agx_linked_shader;

typedef void (*meta_shader_builder_t)(struct nir_builder *b, const void *key);

void agx_init_meta_shaders(struct agx_context *ctx);

void agx_destroy_meta_shaders(struct agx_context *ctx);

struct agx_compiled_shader *agx_build_meta_shader(struct agx_context *ctx,
                                                  meta_shader_builder_t builder,
                                                  void *data, size_t data_size);

void agx_launch(struct agx_batch *batch, struct agx_grid grid,
                struct agx_workgroup wg, struct agx_compiled_shader *cs,
                struct agx_linked_shader *linked, enum pipe_shader_type stage,
                unsigned variable_shared_mem);

void agx_launch_precomp(struct agx_batch *batch, struct agx_grid grid,
                        enum agx_barrier barrier, enum libagx_program program,
                        void *args, size_t arg_size);

#define MESA_DISPATCH_PRECOMP agx_launch_precomp

void agx_init_query_functions(struct pipe_context *ctx);

void
agx_primitives_update_direct(struct agx_context *ctx,
                             const struct pipe_draw_info *info,
                             const struct pipe_draw_start_count_bias *draw);

void agx_draw_vbo_from_xfb(struct pipe_context *pctx,
                           const struct pipe_draw_info *info,
                           unsigned drawid_offset,
                           const struct pipe_draw_indirect_info *indirect);

uint64_t agx_batch_get_so_address(struct agx_batch *batch, unsigned buffer,
                                  uint32_t *size);

void agx_init_streamout_functions(struct pipe_context *ctx);

static inline void
agx_dirty_all(struct agx_context *ctx)
{
   ctx->dirty = ~0;

   for (unsigned i = 0; i < ARRAY_SIZE(ctx->stage); ++i)
      ctx->stage[i].dirty = ~0;
}

static inline void
agx_dirty_reset_graphics(struct agx_context *ctx)
{
   ctx->dirty = 0;

   for (unsigned i = 0; i < ARRAY_SIZE(ctx->stage); ++i) {
      if (i != PIPE_SHADER_COMPUTE)
         ctx->stage[i].dirty = 0;
   }
}

struct agx_rasterizer {
   struct pipe_rasterizer_state base;
   uint8_t cull[AGX_CULL_LENGTH];
   uint8_t line_width;
   uint8_t polygon_mode;
   bool depth_bias;
};

struct agx_query {
   unsigned type;
   unsigned index;

   uint64_t writer_generation[AGX_MAX_BATCHES];
   struct agx_bo *bo;
   struct agx_ptr ptr;
};

struct agx_sampler_state {
   struct pipe_sampler_state base;

   /* Prepared descriptor */
   struct agx_sampler_packed desc, desc_without_custom_border;

   /* Whether a custom border colour is required */
   bool uses_custom_border;

   /* Packed custom border colour, or zero if none is required */
   struct agx_border_packed border;

   /* LOD bias packed as fp16, the form we'll pass to the shader */
   uint16_t lod_bias_as_fp16;
};

struct agx_sampler_view {
   struct pipe_sampler_view base;

   /* Resource/format, may differ from base in case of separate stencil */
   struct agx_resource *rsrc;
   enum pipe_format format;

   /* Prepared descriptor */
   struct agx_texture_packed desc;
};

struct agx_screen {
   struct pipe_screen pscreen;
   struct agx_device dev;
   struct disk_cache *disk_cache;

   struct agx_bo *rodata;

   /* Shared timeline syncobj and value to serialize flushes across contexts */
   uint32_t flush_syncobj;
   uint64_t flush_cur_seqid;
   uint64_t flush_wait_seqid;
   /* Lock to protect flush_wait_seqid updates (reads are just atomic) */
   simple_mtx_t flush_seqid_lock;

   /* Lock to protect syncobj usage vs. destruction in context destroy */
   struct u_rwlock destroy_lock;
};

static inline struct agx_screen *
agx_screen(struct pipe_screen *p)
{
   return (struct agx_screen *)p;
}

static inline struct agx_device *
agx_device(struct pipe_screen *p)
{
   return &(agx_screen(p)->dev);
}

#define perf_debug(dev, ...)                                                   \
   do {                                                                        \
      if (unlikely((dev)->debug & AGX_DBG_PERF))                               \
         mesa_logw(__VA_ARGS__);                                               \
   } while (0)

#define perf_debug_ctx(ctx, ...)                                               \
   perf_debug(agx_device((ctx)->base.screen), __VA_ARGS__)

struct agx_resource {
   struct pipe_resource base;
   uint64_t modifier;

   /* Should probably be part of the modifier. Affects the tiling algorithm, or
    * something like that.
    */
   bool mipmapped;

   /* Hardware backing */
   struct agx_bo *bo;

   struct renderonly_scanout *scanout;

   BITSET_DECLARE(data_valid, PIPE_MAX_TEXTURE_LEVELS);

   struct ail_layout layout;

   /* Metal does not support packed depth/stencil formats; presumably AGX does
    * not either. Instead, we create separate depth and stencil resources,
    * managed by u_transfer_helper. We provide the illusion of packed
    * resources.
    */
   struct agx_resource *separate_stencil;

   /* Valid buffer range tracking, to optimize buffer appends */
   struct util_range valid_buffer_range;

   /* Cumulative shadowed byte count for this resource, i.e. the number of
    * times the resource has been shadowed multiplied by the resource size.
    */
   size_t shadowed_bytes;
};

static inline struct agx_resource *
agx_resource(struct pipe_resource *pctx)
{
   return (struct agx_resource *)pctx;
}

static inline bool
agx_resource_valid(struct agx_resource *rsrc, int level)
{
   /* Shared BOs can always be potentially valid */
   if (rsrc->bo && rsrc->bo->flags & AGX_BO_SHARED) {
      assert(level == 0);
      return true;
   }

   return BITSET_TEST(rsrc->data_valid, level);
}

static inline void *
agx_map_texture_cpu(struct agx_resource *rsrc, unsigned level, unsigned z)
{
   return ((uint8_t *)agx_bo_map(rsrc->bo)) +
          ail_get_layer_level_B(&rsrc->layout, z, level);
}

static inline uint64_t
agx_map_texture_gpu(struct agx_resource *rsrc, unsigned z)
{
   return rsrc->bo->va->addr +
          (uint64_t)ail_get_layer_offset_B(&rsrc->layout, z);
}

void agx_decompress(struct agx_context *ctx, struct agx_resource *rsrc,
                    const char *reason);

void agx_legalize_compression(struct agx_context *ctx,
                              struct agx_resource *rsrc,
                              enum pipe_format format);

struct agx_transfer {
   struct pipe_transfer base;
   void *map;
   struct {
      struct pipe_resource *rsrc;
      struct pipe_box box;
   } staging;
};

static inline struct agx_transfer *
agx_transfer(struct pipe_transfer *p)
{
   return (struct agx_transfer *)p;
}

void agx_upload_vbos(struct agx_batch *batch);
void agx_upload_uniforms(struct agx_batch *batch);

void agx_set_sampler_uniforms(struct agx_batch *batch,
                              enum pipe_shader_type stage);

void agx_set_cbuf_uniforms(struct agx_batch *batch,
                           enum pipe_shader_type stage);

void agx_set_ssbo_uniforms(struct agx_batch *batch,
                           enum pipe_shader_type stage);

bool agx_nir_lower_point_size(nir_shader *nir, bool insert_write);

bool agx_nir_lower_sysvals(nir_shader *shader, enum pipe_shader_type desc_stage,
                           bool lower_draw_params);

bool agx_nir_layout_uniforms(nir_shader *shader,
                             struct agx_compiled_shader *compiled,
                             unsigned *push_size);

bool agx_nir_lower_bindings(nir_shader *shader, bool *uses_bindless_samplers);

bool agx_batch_is_active(struct agx_batch *batch);
bool agx_batch_is_submitted(struct agx_batch *batch);

/* Add a BO to a batch. This needs to be amortized O(1) since it's called in
 * hot paths. To achieve this we model BO lists by bit sets */

static bool
agx_batch_uses_bo(struct agx_batch *batch, struct agx_bo *bo)
{
   if (bo->handle < batch->bo_list.bit_count)
      return BITSET_TEST(batch->bo_list.set, bo->handle);
   else
      return false;
}

static inline void
agx_batch_add_bo_internal(struct agx_batch *batch, struct agx_bo *bo)
{
   /* Double the size of the BO list if we run out, this is amortized O(1) */
   if (unlikely(bo->handle >= batch->bo_list.bit_count)) {
      const unsigned bits_per_word = sizeof(BITSET_WORD) * 8;

      unsigned bit_count =
         MAX2(batch->bo_list.bit_count * 2,
              util_next_power_of_two(ALIGN_POT(bo->handle + 1, bits_per_word)));

      batch->bo_list.set = rerzalloc(
         batch->ctx, batch->bo_list.set, BITSET_WORD,
         batch->bo_list.bit_count / bits_per_word, bit_count / bits_per_word);
      batch->bo_list.bit_count = bit_count;
   }

   if (BITSET_TEST(batch->bo_list.set, bo->handle))
      return;

   /* The batch holds a single reference to each BO in the batch, released when
    * the batch finishes execution.
    */
   agx_bo_reference(bo);
   BITSET_SET(batch->bo_list.set, bo->handle);
}

static inline void
agx_batch_add_bo(struct agx_batch *batch, struct agx_bo *bo)
{
   agx_batch_add_bo_internal(batch, bo);
   assert(agx_batch_uses_bo(batch, bo));
}

#define AGX_BATCH_FOREACH_BO_HANDLE(batch, handle)                             \
   BITSET_FOREACH_SET(handle, (batch)->bo_list.set, batch->bo_list.bit_count)

struct drm_asahi_cmd_compute;
struct drm_asahi_cmd_render;

void agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch,
                      struct drm_asahi_cmd_compute *compute,
                      struct drm_asahi_cmd_render *render);

void agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch);
void agx_flush_batch_for_reason(struct agx_context *ctx,
                                struct agx_batch *batch, const char *reason);
void agx_flush_all(struct agx_context *ctx, const char *reason);
void agx_flush_readers(struct agx_context *ctx, struct agx_resource *rsrc,
                       const char *reason);
void agx_flush_writer(struct agx_context *ctx, struct agx_resource *rsrc,
                      const char *reason);

void agx_sync_writer(struct agx_context *ctx, struct agx_resource *rsrc,
                     const char *reason);
void agx_sync_readers(struct agx_context *ctx, struct agx_resource *rsrc,
                      const char *reason);
void agx_sync_batch(struct agx_context *ctx, struct agx_batch *batch);
void agx_sync_all(struct agx_context *ctx, const char *reason);
void agx_sync_batch_for_reason(struct agx_context *ctx, struct agx_batch *batch,
                               const char *reason);
void agx_memory_barrier(struct pipe_context *pctx, unsigned flags);

/* Use these instead of batch_add_bo for proper resource tracking */
void agx_batch_reads(struct agx_batch *batch, struct agx_resource *rsrc);
void agx_batch_writes(struct agx_batch *batch, struct agx_resource *rsrc,
                      unsigned level);
void agx_batch_writes_range(struct agx_batch *batch, struct agx_resource *rsrc,
                            unsigned offset, unsigned size);
void agx_batch_track_image(struct agx_batch *batch,
                           struct pipe_image_view *image);

bool agx_any_batch_uses_resource(struct agx_context *ctx,
                                 struct agx_resource *rsrc);

/* 16384 is the maximum framebuffer dimension, so we use a larger width (the
 * maximum uint16_t) as a sentinel to identify the compute batch. This ensures
 * compute batches don't mix with graphics. This is a bit of a hack but it
 * works.
 */
#define AGX_COMPUTE_BATCH_WIDTH 0xFFFF

static inline bool
agx_batch_is_compute(struct agx_batch *batch)
{
   return batch->key.width == AGX_COMPUTE_BATCH_WIDTH;
}
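
/* Illustrative sketch only: since batches are keyed on pipe_framebuffer_state,
 * a compute batch can be identified by a key whose width is the sentinel, e.g.
 *
 *    struct pipe_framebuffer_state key = {
 *       .width = AGX_COMPUTE_BATCH_WIDTH,
 *    };
 *
 * How the rest of the key is filled in is up to agx_get_compute_batch.
 */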

struct agx_batch *agx_get_batch(struct agx_context *ctx);
struct agx_batch *agx_get_compute_batch(struct agx_context *ctx);
void agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch);
int agx_cleanup_batches(struct agx_context *ctx);

void agx_batch_add_timestamp_query(struct agx_batch *batch,
                                   struct agx_query *q);
void agx_add_timestamp_end_query(struct agx_context *ctx, struct agx_query *q);

void agx_query_increment_cpu(struct agx_context *ctx, struct agx_query *query,
                             uint64_t increment);

enum asahi_blitter_op /* bitmask */
{
   ASAHI_SAVE_TEXTURES = 1,
   ASAHI_SAVE_FRAMEBUFFER = 2,
   ASAHI_SAVE_FRAGMENT_STATE = 4,
   ASAHI_SAVE_FRAGMENT_CONSTANT = 8,
   ASAHI_DISABLE_RENDER_COND = 16,
};

enum {
   ASAHI_CLEAR = ASAHI_SAVE_FRAGMENT_STATE | ASAHI_SAVE_FRAGMENT_CONSTANT,

   ASAHI_BLIT =
      ASAHI_SAVE_FRAMEBUFFER | ASAHI_SAVE_TEXTURES | ASAHI_SAVE_FRAGMENT_STATE,
};

void agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter,
                      enum asahi_blitter_op op);
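
/* Illustrative usage (sketch): a u_blitter path first saves whatever state the
 * requested ops cover, then runs the blitter, e.g.
 *
 *    agx_blitter_save(ctx, ctx->blitter, ASAHI_BLIT);
 *    // ... util_blitter_* call performing the blit ...
 */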

void agx_blit(struct pipe_context *pipe, const struct pipe_blit_info *info);

void agx_resource_copy_region(struct pipe_context *pctx,
                              struct pipe_resource *dst, unsigned dst_level,
                              unsigned dstx, unsigned dsty, unsigned dstz,
                              struct pipe_resource *src, unsigned src_level,
                              const struct pipe_box *src_box);

/* Batch logic */

struct agx_encoder agx_encoder_allocate(struct agx_batch *batch,
                                        struct agx_device *dev);

void agx_batch_init_state(struct agx_batch *batch);

struct asahi_bg_eot {
   uint64_t usc;
   struct agx_counts_packed counts;
};

struct asahi_bg_eot agx_build_bg_eot(struct agx_batch *batch, bool store,
                                     bool partial_render);

/* Query management */
uint16_t agx_get_oq_index(struct agx_batch *batch, struct agx_query *query);
uint64_t agx_get_query_address(struct agx_batch *batch,
                               struct agx_query *query);
uint64_t agx_get_occlusion_heap(struct agx_batch *batch);

void agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts,
                              uint64_t end_ts);

bool agx_render_condition_check_inner(struct agx_context *ctx);

static inline bool
agx_render_condition_check(struct agx_context *ctx)
{
   if (likely(!ctx->cond_query))
      return true;
   else
      return agx_render_condition_check_inner(ctx);
}

static inline uint32_t
agx_texture_buffer_size_el(enum pipe_format format, uint32_t size)
{
   unsigned blocksize = util_format_get_blocksize(format);

   return MIN2(AGX_TEXTURE_BUFFER_MAX_SIZE, size / blocksize);
}

void agx_decompress_inplace(struct agx_batch *batch, struct pipe_surface *surf,
                            const char *reason);