1 /*
2 * Copyright 2021 Alyssa Rosenzweig
3 * Copyright 2019-2021 Collabora, Ltd.
4 * SPDX-License-Identifier: MIT
5 */
6
7 #pragma once
8
9 #include "asahi/compiler/agx_compile.h"
10 #include "asahi/genxml/agx_pack.h"
11 #include "asahi/layout/layout.h"
12 #include "asahi/lib/agx_bo.h"
13 #include "asahi/lib/agx_device.h"
14 #include "asahi/lib/agx_nir_lower_vbo.h"
15 #include "asahi/lib/agx_scratch.h"
16 #include "asahi/lib/agx_tilebuffer.h"
17 #include "asahi/lib/pool.h"
18 #include "asahi/lib/shaders/geometry.h"
19 #include "compiler/nir/nir_lower_blend.h"
20 #include "compiler/shader_enums.h"
21 #include "gallium/auxiliary/util/u_blitter.h"
22 #include "gallium/include/pipe/p_context.h"
23 #include "gallium/include/pipe/p_screen.h"
24 #include "gallium/include/pipe/p_state.h"
25 #include "pipe/p_defines.h"
26 #include "util/bitset.h"
27 #include "util/disk_cache.h"
28 #include "util/hash_table.h"
29 #include "util/u_range.h"
30 #include "agx_helpers.h"
31 #include "agx_meta.h"
32
33 #ifdef __GLIBC__
34 #include <errno.h>
35 #define agx_msg(fmt, ...) \
36 fprintf(stderr, "[%s] " fmt, program_invocation_short_name, ##__VA_ARGS__)
37 #else
38 #define agx_msg(...) fprintf(stderr, __VA_ARGS__)
39 #endif
40
41 #define AGX_NUM_TEXTURE_STATE_REGS 16
42
43 struct agx_streamout_target {
44 struct pipe_stream_output_target base;
45 struct pipe_resource *offset;
46
47 /* Current stride (bytes per vertex) */
48 uint32_t stride;
49 };
50
51 static inline struct agx_streamout_target *
52 agx_so_target(struct pipe_stream_output_target *target)
53 {
54 return (struct agx_streamout_target *)target;
55 }
56
57 struct agx_streamout {
58 struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS];
59 unsigned num_targets;
60 };
61
62 /* Shaders can access fixed-function state through system values.
63 * It is convenient to stash all of this information into a single "root"
64 * descriptor, then push individual parts as needed.
65 *
66 * In the future, we could optimize this to reduce CPU overhead, e.g. splitting
67 * into multiple descriptors for finer dirty tracking. This is not ABI with the
68 * compiler. The layout is up to us and handled by our code lowering system
69 * values to uniforms.
70 */
71 enum agx_sysval_table {
72 AGX_SYSVAL_TABLE_ROOT,
73 AGX_SYSVAL_TABLE_PARAMS,
74 AGX_SYSVAL_TABLE_GRID,
75 AGX_SYSVAL_TABLE_VS,
76 AGX_SYSVAL_TABLE_TCS,
77 AGX_SYSVAL_TABLE_TES,
78 AGX_SYSVAL_TABLE_GS,
79 AGX_SYSVAL_TABLE_FS,
80 AGX_SYSVAL_TABLE_CS,
81 AGX_NUM_SYSVAL_TABLES
82 };
83
84 #define AGX_SYSVAL_STAGE(stage) (AGX_SYSVAL_TABLE_VS + (stage))
85
86 static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_VERTEX) == AGX_SYSVAL_TABLE_VS,
87 "fixed enum orderings");
88 static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_TESS_CTRL) == AGX_SYSVAL_TABLE_TCS,
89 "fixed enum orderings");
90 static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_TESS_EVAL) == AGX_SYSVAL_TABLE_TES,
91 "fixed enum orderings");
92 static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_GEOMETRY) == AGX_SYSVAL_TABLE_GS,
93 "fixed enum orderings");
94 static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_FRAGMENT) == AGX_SYSVAL_TABLE_FS,
95 "fixed enum orderings");
96 static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_COMPUTE) == AGX_SYSVAL_TABLE_CS,
97 "fixed enum orderings");
98
99 /* Root system value table */
100 struct PACKED agx_draw_uniforms {
101 /* Pointers to the system value tables themselves (for indirection) */
102 uint64_t tables[AGX_NUM_SYSVAL_TABLES];
103
104 /* Vertex buffer object bases, if present. If vertex robustness is disabled,
105 * attrib_base maps VBOs directly and attrib_clamp is undefined. If
106 * vertex robustness is enabled, attrib_base maps attributes and
107 * attrib_clamp is an inclusive clamp on vertex/divided instance indices.
108 */
109 uint64_t attrib_base[PIPE_MAX_ATTRIBS];
110 uint32_t attrib_clamp[PIPE_MAX_ATTRIBS];
111
112 /* Addresses for the results of pipeline statistics queries */
113 uint64_t pipeline_statistics[PIPE_STAT_QUERY_MS_INVOCATIONS];
114
115 /* Address of input assembly buffer if geom/tess is used, else 0 */
116 uint64_t input_assembly;
117
118 /* Address of tessellation param buffer if tessellation is used, else 0 */
119 uint64_t tess_params;
120
121 /* Address of geometry param buffer if geometry shaders are used, else 0 */
122 uint64_t geometry_params;
123
124 /* Address of polygon stipple mask if used */
125 uint64_t polygon_stipple;
126
127 /* Blend constant if any */
128 float blend_constant[4];
129
130 /* glPointSize value */
131 float fixed_point_size;
132
133 /* Value of the multisample control register, containing sample positions in
134 * each byte (x in low nibble, y in high nibble).
135 */
136 uint32_t ppp_multisamplectl;
137
138 /* gl_DrawID for a direct multidraw */
139 uint32_t draw_id;
140
141 /* Sprite coord replacement mask */
142 uint16_t sprite_mask;
143
144 /* glSampleMask */
145 uint16_t sample_mask;
146
147 /* Nonzero if the last vertex stage writes the layer ID, zero otherwise */
148 uint16_t layer_id_written;
149
150 /* Nonzero for indexed draws, zero otherwise */
151 uint16_t is_indexed_draw;
152
153 /* Zero for [0, 1] clipping, 0.5 for [-1, 1] clipping. */
154 uint16_t clip_z_coeff;
155 };
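
/* Illustrative sketches only, not used by the driver; helper names here are
 * hypothetical. First, vertex robustness: attrib_clamp is an inclusive clamp,
 * so applying it to a vertex/divided-instance index looks like:
 */
static inline uint32_t
agx_example_robust_attrib_index(const struct agx_draw_uniforms *u,
                                unsigned attrib, uint32_t index)
{
   return MIN2(index, u->attrib_clamp[attrib]);
}

/* Second, ppp_multisamplectl packs one sample position per byte, x in the low
 * nibble and y in the high nibble. The 1/16-pixel units are an assumption for
 * the example.
 */
static inline uint32_t
agx_example_pack_sample_position(uint32_t ppp_multisamplectl, unsigned sample,
                                 uint8_t x_16ths, uint8_t y_16ths)
{
   uint32_t pos = (x_16ths & 0xF) | ((y_16ths & 0xF) << 4);

   /* Replace this sample's byte, leaving the other samples alone */
   ppp_multisamplectl &= ~(0xFFu << (sample * 8));
   ppp_multisamplectl |= pos << (sample * 8);
   return ppp_multisamplectl;
}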
156
157 struct PACKED agx_stage_uniforms {
158 /* Pointer to binding table for texture descriptors, or 0 if none. This must
159 * be first so that u0_u1 is always available for lowering binding
160 * tables to bindless access.
161 */
162 uint64_t texture_base;
163
164 /* Uniform buffer objects */
165 uint64_t ubo_base[PIPE_MAX_CONSTANT_BUFFERS];
166 uint32_t ubo_size[PIPE_MAX_CONSTANT_BUFFERS];
167
168 /* Shader storage buffer objects */
169 uint64_t ssbo_base[PIPE_MAX_SHADER_BUFFERS];
170 uint32_t ssbo_size[PIPE_MAX_SHADER_BUFFERS];
171
172 /* If lowered to bindless, sampler index in the heap */
173 uint16_t sampler_handle[PIPE_MAX_SAMPLERS];
174
175 /* LOD bias as float16 */
176 uint16_t lod_bias[PIPE_MAX_SAMPLERS];
177 };
178
179 /* In the architecture, there are 512 uniform registers, each 16-bits. In a
180 * theoretical worst case, we could push to all of them. We use a worst-case
181 * maximum because the expression for a tight upper bound is too messy and easy
182 * to go out of sync with the code.
183 */
184 #define AGX_MAX_PUSH_RANGES (512)
185
186 struct agx_push_range {
187 /* Base 16-bit uniform to push to */
188 uint16_t uniform;
189
190 /* Offset into the table to push in bytes */
191 uint16_t offset;
192
193 /* Which table to push from */
194 uint8_t table;
195
196 /* Number of consecutive 16-bit uniforms to push */
197 uint8_t length;
198 };
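
/* Illustrative sketch only (hypothetical helper, not the driver's upload
 * path): a push range copies `length` consecutive 16-bit uniforms, starting at
 * uniform register `uniform`, from `offset` bytes into the sysval table
 * selected by `table`. With the table addresses from the root uniforms, the
 * GPU address the range sources from is simply:
 */
static inline uint64_t
agx_example_push_range_source(const struct agx_draw_uniforms *root,
                              const struct agx_push_range *range)
{
   return root->tables[range->table] + range->offset;
}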
199
200 struct agx_compiled_shader {
201 /* Uncompiled shader that we belong to */
202 const struct agx_uncompiled_shader *so;
203
204 /* Mapped executable memory */
205 struct agx_bo *bo;
206
207 /* Metadata returned from the compiler */
208 struct agx_shader_info info;
209
210 /* Uniforms the driver must push */
211 unsigned push_range_count;
212 struct agx_push_range push[AGX_MAX_PUSH_RANGES];
213
214 /* Auxiliary programs, or NULL if not used */
215 struct agx_compiled_shader *gs_count, *pre_gs;
216 struct agx_compiled_shader *gs_copy;
217
218 /* Output primitive mode for geometry shaders */
219 enum mesa_prim gs_output_mode;
220
221 /* Number of words per primitive in the count buffer */
222 unsigned gs_count_words;
223
224 /* Logical shader stage used for descriptor access. This may differ from the
225 * physical shader stage of the compiled shader, for example when executing a
226 * tessellation eval shader as a vertex shader.
227 */
228 enum pipe_shader_type stage;
229 };
230
231 struct agx_uncompiled_shader {
232 struct pipe_shader_state base;
233 enum pipe_shader_type type;
234 struct blob early_serialized_nir;
235 struct blob serialized_nir;
236 uint8_t nir_sha1[20];
237 struct agx_uncompiled_shader_info info;
238 struct hash_table *variants;
239 struct agx_uncompiled_shader *passthrough_progs[MESA_PRIM_COUNT][3][2];
240 struct agx_uncompiled_shader *passthrough_tcs[32];
241
242 uint32_t xfb_strides[4];
243 bool has_xfb_info;
244 bool is_xfb_passthrough;
245
246 enum mesa_prim gs_mode;
247
248 /* Whether the shader accesses indexed samplers via the bindless heap */
249 bool uses_bindless_samplers;
250
251 /* Set on VS, passed to FS for linkage */
252 unsigned base_varying;
253
254 /* Tessellation info */
255 struct {
256 uint64_t per_vertex_outputs;
257 uint32_t output_stride;
258 enum gl_tess_spacing spacing;
259 enum tess_primitive_mode primitive;
260 uint8_t output_patch_size;
261 uint8_t nr_patch_outputs;
262 bool ccw;
263 bool point_mode;
264 } tess;
265 };
266
267 enum agx_stage_dirty {
268 AGX_STAGE_DIRTY_CONST = BITFIELD_BIT(0),
269 AGX_STAGE_DIRTY_SSBO = BITFIELD_BIT(1),
270 AGX_STAGE_DIRTY_IMAGE = BITFIELD_BIT(2),
271 AGX_STAGE_DIRTY_SAMPLER = BITFIELD_BIT(3),
272 };
273
274 struct agx_stage {
275 struct agx_uncompiled_shader *shader;
276 uint32_t dirty;
277
278 struct pipe_constant_buffer cb[PIPE_MAX_CONSTANT_BUFFERS];
279 uint32_t cb_mask;
280
281 struct pipe_shader_buffer ssbo[PIPE_MAX_SHADER_BUFFERS];
282 uint32_t ssbo_writable_mask;
283 uint32_t ssbo_mask;
284
285 struct pipe_image_view images[PIPE_MAX_SHADER_IMAGES];
286 uint32_t image_mask;
287
288 /* Need full CSOs for u_blitter */
289 struct agx_sampler_state *samplers[PIPE_MAX_SAMPLERS];
290 struct agx_sampler_view *textures[PIPE_MAX_SHADER_SAMPLER_VIEWS];
291
292 /* Does any bound sampler require custom border colours? */
293 bool custom_borders;
294
295 unsigned sampler_count, texture_count;
296 uint32_t valid_samplers;
297 };
298
299 union agx_batch_result {
300 };
301
302 /* This is a firmware limit. It should be possible to raise to 2048 in the
303 * future... still not good enough for VK though :-(
304 */
305 #define AGX_SAMPLER_HEAP_SIZE (1024)
306
307 struct agx_sampler_heap {
308 struct agx_bo *bo;
309 uint16_t count;
310 };
311
312 uint16_t agx_sampler_heap_add(struct agx_device *dev,
313 struct agx_sampler_heap *heap,
314 struct agx_sampler_packed *sampler);
315
316 struct agx_encoder {
317 struct agx_bo *bo;
318 uint8_t *current;
319 uint8_t *end;
320 };
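
/* Illustrative sketch only (hypothetical helper): how much space is left in an
 * encoder's backing allocation, using the fields above. A real emission path
 * would also have to handle running out of space, which this does not.
 */
static inline size_t
agx_example_encoder_remaining_B(const struct agx_encoder *enc)
{
   return (size_t)(enc->end - enc->current);
}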
321
322 struct agx_batch {
323 struct agx_context *ctx;
324 struct pipe_framebuffer_state key;
325 uint64_t seqnum;
326 uint32_t syncobj;
327 uint32_t draws;
328
329 struct agx_tilebuffer_layout tilebuffer_layout;
330
331 /* PIPE_CLEAR_* bitmask */
332 uint32_t clear, draw, load, resolve;
333 bool initialized;
334
335 uint64_t uploaded_clear_color[PIPE_MAX_COLOR_BUFS];
336 double clear_depth;
337 unsigned clear_stencil;
338
339 /* Whether we're drawing points, lines, or triangles */
340 enum mesa_prim reduced_prim;
341
342 /* Whether the bound FS needs a primitive ID that is not supplied by the
343 * bound hardware VS (software GS)
344 */
345 bool generate_primitive_id;
346
347 /* Current varyings linkage structures */
348 uint32_t varyings;
349
350 struct agx_draw_uniforms uniforms;
351 struct agx_stage_uniforms stage_uniforms[PIPE_SHADER_TYPES];
352
353 /* Indirect buffer allocated for geometry shader */
354 uint64_t geom_indirect;
355 struct agx_bo *geom_indirect_bo;
356
357 /* Geometry state buffer if geometry/etc shaders are used */
358 uint64_t geometry_state;
359
360 /* Uploaded descriptors */
361 uint32_t texture_count[PIPE_SHADER_TYPES];
362
363 uint64_t samplers[PIPE_SHADER_TYPES];
364 uint32_t sampler_count[PIPE_SHADER_TYPES];
365
366 struct agx_sampler_heap sampler_heap;
367
368 /* Resource list requirements, represented as a bit set indexed by BO
369 * handles (GEM handles on Linux, or IOGPU's equivalent on macOS)
370 */
371 struct {
372 BITSET_WORD *set;
373 unsigned bit_count;
374 } bo_list;
375
376 /* If true, this batch contains a shader with a potentially incoherent write
377 * (e.g. image_write), needing a barrier later to access.
378 */
379 bool incoherent_writes;
380
381 struct agx_pool pool, pipeline_pool;
382
383 /* We may enqueue both CDM and VDM work, possibly to the same batch for
384 * geometry/tessellation.
385 */
386 struct agx_encoder vdm;
387 struct agx_encoder cdm;
388
389 /* Scissor and depth-bias descriptors, uploaded at GPU time */
390 struct util_dynarray scissor, depth_bias;
391
392 /* Arrays of GPU pointers that should be written with the batch timestamps */
393 struct util_dynarray timestamps;
394
395 /* Result buffer where the kernel places command execution information */
396 union agx_batch_result *result;
397 size_t result_off;
398
399 /* BO backing the geometry parameters; the actual pointer lives in a uniform */
400 struct agx_bo *geom_params_bo;
401
402 /* Whether each stage uses scratch */
403 bool vs_scratch;
404 bool fs_scratch;
405 bool cs_scratch;
406
407 /* Whether each stage has preambles using scratch, and if so which bucket.
408 * This just needs to be zero/nonzero for correctness; the magnitude in
409 * buckets is for statistics.
410 */
411 unsigned vs_preamble_scratch;
412 unsigned fs_preamble_scratch;
413 unsigned cs_preamble_scratch;
414 };
415
416 struct agx_zsa {
417 struct pipe_depth_stencil_alpha_state base;
418 struct agx_fragment_face_packed depth;
419 struct agx_fragment_stencil_packed front_stencil, back_stencil;
420
421 /* PIPE_CLEAR_* bitmask corresponding to this depth/stencil state */
422 uint32_t load, store;
423 };
424
425 struct agx_blend_key {
426 nir_lower_blend_rt rt[8];
427 unsigned logicop_func;
428 bool alpha_to_coverage, alpha_to_one;
429 };
430
431 struct agx_blend {
432 struct agx_blend_key key;
433
434 /* PIPE_CLEAR_* bitmask corresponding to this blend state */
435 uint32_t store;
436 };
437
438 /* These parts of the vertex element affect the generated code */
439 struct agx_velem_key {
440 uint32_t divisor;
441 uint16_t stride;
442 uint8_t format;
443 uint8_t pad;
444 };
445
446 enum asahi_vs_next_stage {
447 ASAHI_VS_FS,
448 ASAHI_VS_GS,
449 ASAHI_VS_TCS,
450 };
451
452 struct asahi_vs_shader_key {
453 struct agx_velem_key attribs[AGX_MAX_VBUFS];
454 enum asahi_vs_next_stage next_stage;
455
456 union {
457 struct {
458 uint8_t index_size_B;
459 } gs;
460
461 struct {
462 bool fixed_point_size;
463 uint64_t outputs_flat_shaded;
464 uint64_t outputs_linear_shaded;
465 } fs;
466 } next;
467 };
468
469 struct agx_vertex_elements {
470 unsigned num_attribs;
471 struct agx_velem_key key[PIPE_MAX_ATTRIBS];
472
473 /* These parts do not affect the generated code so are not in the key */
474 uint16_t src_offsets[PIPE_MAX_ATTRIBS];
475 uint16_t buffers[PIPE_MAX_ATTRIBS];
476 };
477
478 struct asahi_fs_shader_key {
479 struct agx_blend_key blend;
480
481 /* Need to count FRAGMENT_SHADER_INVOCATIONS */
482 bool statistics;
483
484 /* Set if glSampleMask() is used with a mask other than all-1s. If not, we
485 * don't want to emit lowering code for it, since it would disable early-Z.
486 */
487 bool api_sample_mask;
488 bool polygon_stipple;
489
490 uint8_t cull_distance_size;
491 uint8_t nr_samples;
492 enum pipe_format rt_formats[PIPE_MAX_COLOR_BUFS];
493 };
494
495 struct asahi_tcs_shader_key {
496 /* Input assembly key. Simplified because we know we're operating on patches.
497 */
498 uint8_t index_size_B;
499
500 /* Vertex shader key */
501 struct agx_velem_key attribs[AGX_MAX_VBUFS];
502
503 /* Tessellation control shaders must be linked with a vertex shader. */
504 uint8_t input_nir_sha1[20];
505 };
506
507 struct asahi_gs_shader_key {
508 /* Rasterizer shader key */
509 uint64_t outputs_flat_shaded;
510 uint64_t outputs_linear_shaded;
511 bool fixed_point_size;
512
513 /* If true, this GS is run only for its side effects (including XFB) */
514 bool rasterizer_discard;
515 bool padding[6];
516 };
517 static_assert(sizeof(struct asahi_gs_shader_key) == 24, "no holes");
518
519 union asahi_shader_key {
520 struct asahi_vs_shader_key vs;
521 struct asahi_tcs_shader_key tcs;
522 struct asahi_gs_shader_key gs;
523 struct asahi_fs_shader_key fs;
524 };
525
526 enum agx_dirty {
527 AGX_DIRTY_VERTEX = BITFIELD_BIT(0),
528 AGX_DIRTY_VIEWPORT = BITFIELD_BIT(1),
529 AGX_DIRTY_SCISSOR_ZBIAS = BITFIELD_BIT(2),
530 AGX_DIRTY_ZS = BITFIELD_BIT(3),
531 AGX_DIRTY_STENCIL_REF = BITFIELD_BIT(4),
532 AGX_DIRTY_RS = BITFIELD_BIT(5),
533 AGX_DIRTY_SPRITE_COORD_MODE = BITFIELD_BIT(6),
534 AGX_DIRTY_PRIM = BITFIELD_BIT(7),
535
536 /* Vertex/fragment pipelines, including uniforms and textures */
537 AGX_DIRTY_VS = BITFIELD_BIT(8),
538 AGX_DIRTY_FS = BITFIELD_BIT(9),
539
540 /* Just the progs themselves */
541 AGX_DIRTY_VS_PROG = BITFIELD_BIT(10),
542 AGX_DIRTY_FS_PROG = BITFIELD_BIT(11),
543
544 AGX_DIRTY_BLEND = BITFIELD_BIT(12),
545 AGX_DIRTY_QUERY = BITFIELD_BIT(13),
546 AGX_DIRTY_XFB = BITFIELD_BIT(14),
547 AGX_DIRTY_SAMPLE_MASK = BITFIELD_BIT(15),
548 AGX_DIRTY_BLEND_COLOR = BITFIELD_BIT(16),
549 AGX_DIRTY_POLY_STIPPLE = BITFIELD_BIT(17),
550 };
551
552 /* Maximum number of in-progress + under-construction GPU batches.
553 * Must be large enough for silly workloads that do things like
554 * glGenerateMipmap on every frame, otherwise we end up losing performance.
555 */
556 #define AGX_MAX_BATCHES (128)
557
558 static_assert(PIPE_TEX_FILTER_NEAREST < 2, "known order");
559 static_assert(PIPE_TEX_FILTER_LINEAR < 2, "known order");
560
561 enum asahi_blit_clamp {
562 ASAHI_BLIT_CLAMP_NONE,
563 ASAHI_BLIT_CLAMP_UINT_TO_SINT,
564 ASAHI_BLIT_CLAMP_SINT_TO_UINT,
565
566 /* keep last */
567 ASAHI_BLIT_CLAMP_COUNT,
568 };
569
570 struct asahi_blitter {
571 bool active;
572
573 /* [clamp_type][is_array] */
574 void *blit_cs[ASAHI_BLIT_CLAMP_COUNT][2];
575
576 /* [filter] */
577 void *sampler[2];
578
579 struct pipe_constant_buffer saved_cb;
580
581 bool has_saved_image;
582 struct pipe_image_view saved_image;
583
584 unsigned saved_num_sampler_states;
585 void *saved_sampler_states[PIPE_MAX_SAMPLERS];
586
587 struct pipe_sampler_view *saved_sampler_view;
588
589 void *saved_cs;
590 };
591
592 struct agx_oq_heap;
593
594 struct agx_context {
595 struct pipe_context base;
596 struct agx_compiled_shader *vs, *fs, *gs, *tcs, *tes;
597 uint32_t dirty;
598
599 /* Heap for dynamic memory allocation for geometry/tessellation shaders */
600 struct pipe_resource *heap;
601
602 /* Occlusion query heap */
603 struct agx_oq_heap *oq;
604
605 /* Acts as a context-level shader key */
606 bool support_lod_bias;
607 bool robust;
608
609 /* Set of batches. When full, the LRU entry (the batch with the smallest
610 * seqnum) is flushed to free a slot.
611 */
612 struct {
613 uint64_t seqnum;
614 struct agx_batch slots[AGX_MAX_BATCHES];
615
616 /** Set of active batches for faster traversal */
617 BITSET_DECLARE(active, AGX_MAX_BATCHES);
618
619 /** Set of submitted batches for faster traversal */
620 BITSET_DECLARE(submitted, AGX_MAX_BATCHES);
621
622 /* Monotonic counter for each batch incremented when resetting a batch to
623 * invalidate all associated queries. Compared to
624 * agx_query::writer_generation.
625 */
626 uint64_t generation[AGX_MAX_BATCHES];
627 } batches;
628
629 struct agx_batch *batch;
630 struct agx_bo *result_buf;
631
632 struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS];
633 uint32_t vb_mask;
634
635 unsigned patch_vertices;
636 float default_outer_level[4];
637 float default_inner_level[2];
638
639 struct agx_stage stage[PIPE_SHADER_TYPES];
640 struct agx_vertex_elements *attributes;
641 struct agx_rasterizer *rast;
642 struct agx_zsa *zs;
643 struct agx_blend *blend;
644 struct pipe_blend_color blend_color;
645 struct pipe_viewport_state viewport[AGX_MAX_VIEWPORTS];
646 struct pipe_scissor_state scissor[AGX_MAX_VIEWPORTS];
647 struct pipe_stencil_ref stencil_ref;
648 struct agx_streamout streamout;
649 uint16_t sample_mask;
650 struct pipe_framebuffer_state framebuffer;
651
652 uint32_t poly_stipple[32];
653
654 struct pipe_query *cond_query;
655 bool cond_cond;
656 enum pipe_render_cond_flag cond_mode;
657
658 struct agx_query *occlusion_query;
659 struct agx_query *prims_generated[4];
660 struct agx_query *tf_prims_generated[4];
661 struct agx_query *tf_overflow[4];
662 struct agx_query *tf_any_overflow;
663 struct agx_query *pipeline_statistics[PIPE_STAT_QUERY_TS_INVOCATIONS];
664 struct agx_query *time_elapsed;
665 bool active_queries;
666 bool active_draw_without_restart;
667
668 struct util_debug_callback debug;
669 bool is_noop;
670
671 bool in_tess;
672
673 struct blitter_context *blitter;
674 struct asahi_blitter compute_blitter;
675
676 /* Map of GEM handle to (batch index + 1) that (conservatively) writes that
677 * BO, or 0 if no writer.
678 */
679 struct util_dynarray writer;
680
681 /* Bound CL global buffers */
682 struct util_dynarray global_buffers;
683
684 struct hash_table *generic_meta;
685 struct agx_meta_cache meta;
686
687 bool any_faults;
688
689 uint32_t syncobj;
690 uint32_t dummy_syncobj;
691 int in_sync_fd;
692 uint32_t in_sync_obj;
693
694 struct agx_scratch scratch_vs;
695 struct agx_scratch scratch_fs;
696 struct agx_scratch scratch_cs;
697 };
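
/* Illustrative sketch only (hypothetical helper, not the driver's eviction
 * code): picking the LRU batch, i.e. the active batch with the smallest
 * seqnum, which is what gets flushed when the batch set above fills up.
 */
static inline struct agx_batch *
agx_example_lru_batch(struct agx_context *ctx)
{
   struct agx_batch *lru = NULL;
   unsigned i;

   BITSET_FOREACH_SET(i, ctx->batches.active, AGX_MAX_BATCHES) {
      struct agx_batch *batch = &ctx->batches.slots[i];

      if (!lru || batch->seqnum < lru->seqnum)
         lru = batch;
   }

   return lru;
}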
698
699 static inline unsigned
700 agx_batch_idx(struct agx_batch *batch)
701 {
702 return batch - batch->ctx->batches.slots;
703 }
704
705 static void
706 agx_writer_add(struct agx_context *ctx, uint8_t batch_index, unsigned handle)
707 {
708 assert(batch_index < AGX_MAX_BATCHES && "invariant");
709 static_assert(AGX_MAX_BATCHES < 0xFF, "no overflow on addition");
710
711 /* If we need to grow, double the capacity so insertion is amortized O(1). */
712 if (unlikely(handle >= ctx->writer.size)) {
713 unsigned new_size =
714 MAX2(ctx->writer.capacity * 2, util_next_power_of_two(handle + 1));
715 unsigned grow = new_size - ctx->writer.size;
716
717 memset(util_dynarray_grow(&ctx->writer, uint8_t, grow), 0,
718 grow * sizeof(uint8_t));
719 }
720
721 /* There is now room */
722 uint8_t *value = util_dynarray_element(&ctx->writer, uint8_t, handle);
723 assert((*value) == 0 && "there should be no existing writer");
724 *value = batch_index + 1;
725 }
726
727 static struct agx_batch *
728 agx_writer_get(struct agx_context *ctx, unsigned handle)
729 {
730 if (handle >= ctx->writer.size)
731 return NULL;
732
733 uint8_t value = *util_dynarray_element(&ctx->writer, uint8_t, handle);
734
735 if (value > 0)
736 return &ctx->batches.slots[value - 1];
737 else
738 return NULL;
739 }
740
741 static void
742 agx_writer_remove(struct agx_context *ctx, unsigned handle)
743 {
744 if (handle >= ctx->writer.size)
745 return;
746
747 uint8_t *value = util_dynarray_element(&ctx->writer, uint8_t, handle);
748 *value = 0;
749 }
750
751 static inline struct agx_context *
752 agx_context(struct pipe_context *pctx)
753 {
754 return (struct agx_context *)pctx;
755 }
756
757 void agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info,
758 struct agx_compiled_shader *cs, enum pipe_shader_type stage);
759
760 void agx_init_query_functions(struct pipe_context *ctx);
761
762 void
763 agx_primitives_update_direct(struct agx_context *ctx,
764 const struct pipe_draw_info *info,
765 const struct pipe_draw_start_count_bias *draw);
766
767 void agx_draw_vbo_from_xfb(struct pipe_context *pctx,
768 const struct pipe_draw_info *info,
769 unsigned drawid_offset,
770 const struct pipe_draw_indirect_info *indirect);
771
772 uint64_t agx_batch_get_so_address(struct agx_batch *batch, unsigned buffer,
773 uint32_t *size);
774
775 void agx_init_streamout_functions(struct pipe_context *ctx);
776
777 static inline void
778 agx_dirty_all(struct agx_context *ctx)
779 {
780 ctx->dirty = ~0;
781
782 for (unsigned i = 0; i < ARRAY_SIZE(ctx->stage); ++i)
783 ctx->stage[i].dirty = ~0;
784 }
785
786 static inline void
787 agx_dirty_reset_graphics(struct agx_context *ctx)
788 {
789 ctx->dirty = 0;
790
791 for (unsigned i = 0; i < ARRAY_SIZE(ctx->stage); ++i) {
792 if (i != PIPE_SHADER_COMPUTE)
793 ctx->stage[i].dirty = 0;
794 }
795 }
796
797 struct agx_rasterizer {
798 struct pipe_rasterizer_state base;
799 uint8_t cull[AGX_CULL_LENGTH];
800 uint8_t line_width;
801 uint8_t polygon_mode;
802 };
803
804 struct agx_query {
805 unsigned type;
806 unsigned index;
807
808 uint64_t writer_generation[AGX_MAX_BATCHES];
809 struct agx_bo *bo;
810 struct agx_ptr ptr;
811 };
812
813 struct agx_sampler_state {
814 struct pipe_sampler_state base;
815
816 /* Prepared descriptor */
817 struct agx_sampler_packed desc, desc_without_custom_border;
818
819 /* Whether a custom border colour is required */
820 bool uses_custom_border;
821
822 /* Packed custom border colour, or zero if none is required */
823 struct agx_border_packed border;
824
825 /* LOD bias packed as fp16, the form we'll pass to the shader */
826 uint16_t lod_bias_as_fp16;
827 };
828
829 struct agx_sampler_view {
830 struct pipe_sampler_view base;
831
832 /* Resource/format, may differ from base in case of separate stencil */
833 struct agx_resource *rsrc;
834 enum pipe_format format;
835
836 /* Prepared descriptor */
837 struct agx_texture_packed desc;
838 };
839
840 struct agx_screen {
841 struct pipe_screen pscreen;
842 struct agx_device dev;
843 struct disk_cache *disk_cache;
844 /* Queue handle */
845 uint32_t queue_id;
846 };
847
848 static inline struct agx_screen *
849 agx_screen(struct pipe_screen *p)
850 {
851 return (struct agx_screen *)p;
852 }
853
854 static inline struct agx_device *
855 agx_device(struct pipe_screen *p)
856 {
857 return &(agx_screen(p)->dev);
858 }
859
860 #define perf_debug(dev, ...) \
861 do { \
862 if (unlikely((dev)->debug & AGX_DBG_PERF)) \
863 mesa_logw(__VA_ARGS__); \
864 } while (0)
865
866 #define perf_debug_ctx(ctx, ...) \
867 perf_debug(agx_device((ctx)->base.screen), __VA_ARGS__)
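
/* Example usage (illustrative): both macros only log when the AGX_DBG_PERF
 * debug flag is set on the device, e.g.
 *
 *    perf_debug_ctx(ctx, "Flushing writer to map resource");
 */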
868
869 struct agx_resource {
870 struct pipe_resource base;
871 uint64_t modifier;
872
873 /* Should probably be part of the modifier. Affects the tiling algorithm, or
874 * something like that.
875 */
876 bool mipmapped;
877
878 /* Hardware backing */
879 struct agx_bo *bo;
880
881 struct renderonly_scanout *scanout;
882
883 BITSET_DECLARE(data_valid, PIPE_MAX_TEXTURE_LEVELS);
884
885 struct ail_layout layout;
886
887 /* Metal does not support packed depth/stencil formats; presumably AGX does
888 * not either. Instead, we create separate depth and stencil resources,
889 * managed by u_transfer_helper. We provide the illusion of packed
890 * resources.
891 */
892 struct agx_resource *separate_stencil;
893
894 /* Valid buffer range tracking, to optimize buffer appends */
895 struct util_range valid_buffer_range;
896
897 /* Cumulative shadowed byte count for this resource, that is, the number of
898 * times the resource has been shadowed multiplied by the resource size.
899 */
900 size_t shadowed_bytes;
901 };
902
903 static inline struct agx_resource *
904 agx_resource(struct pipe_resource *pctx)
905 {
906 return (struct agx_resource *)pctx;
907 }
908
909 static inline bool
910 agx_resource_valid(struct agx_resource *rsrc, int level)
911 {
912 /* Shared BOs can always be potentially valid */
913 if (rsrc->bo && rsrc->bo->flags & AGX_BO_SHARED) {
914 assert(level == 0);
915 return true;
916 }
917
918 return BITSET_TEST(rsrc->data_valid, level);
919 }
920
921 static inline void *
922 agx_map_texture_cpu(struct agx_resource *rsrc, unsigned level, unsigned z)
923 {
924 return ((uint8_t *)rsrc->bo->ptr.cpu) +
925 ail_get_layer_level_B(&rsrc->layout, z, level);
926 }
927
928 static inline uint64_t
929 agx_map_texture_gpu(struct agx_resource *rsrc, unsigned z)
930 {
931 return rsrc->bo->ptr.gpu +
932 (uint64_t)ail_get_layer_offset_B(&rsrc->layout, z);
933 }
934
935 void agx_decompress(struct agx_context *ctx, struct agx_resource *rsrc,
936 const char *reason);
937
938 void agx_legalize_compression(struct agx_context *ctx,
939 struct agx_resource *rsrc,
940 enum pipe_format format);
941
942 struct agx_transfer {
943 struct pipe_transfer base;
944 void *map;
945 struct {
946 struct pipe_resource *rsrc;
947 struct pipe_box box;
948 } staging;
949 };
950
951 static inline struct agx_transfer *
952 agx_transfer(struct pipe_transfer *p)
953 {
954 return (struct agx_transfer *)p;
955 }
956
957 void agx_upload_vbos(struct agx_batch *batch);
958 void agx_upload_uniforms(struct agx_batch *batch);
959
960 void agx_set_sampler_uniforms(struct agx_batch *batch,
961 enum pipe_shader_type stage);
962
963 void agx_set_cbuf_uniforms(struct agx_batch *batch,
964 enum pipe_shader_type stage);
965
966 void agx_set_ssbo_uniforms(struct agx_batch *batch,
967 enum pipe_shader_type stage);
968
969 bool agx_nir_lower_point_size(nir_shader *nir, bool fixed_point_size);
970
971 bool agx_nir_lower_sysvals(nir_shader *shader, enum pipe_shader_type desc_stage,
972 bool lower_draw_params);
973
974 bool agx_nir_layout_uniforms(nir_shader *shader,
975 struct agx_compiled_shader *compiled,
976 unsigned *push_size);
977
978 bool agx_nir_lower_bindings(nir_shader *shader, bool *uses_bindless_samplers);
979
980 bool agx_batch_is_active(struct agx_batch *batch);
981 bool agx_batch_is_submitted(struct agx_batch *batch);
982
983 /* Add a BO to a batch. This needs to be amortized O(1) since it's called in
984 * hot paths. To achieve this, we model BO lists as bit sets. */
985
986 static bool
987 agx_batch_uses_bo(struct agx_batch *batch, struct agx_bo *bo)
988 {
989 if (bo->handle < batch->bo_list.bit_count)
990 return BITSET_TEST(batch->bo_list.set, bo->handle);
991 else
992 return false;
993 }
994
995 static inline void
996 agx_batch_add_bo(struct agx_batch *batch, struct agx_bo *bo)
997 {
998 /* Double the size of the BO list if we run out, this is amortized O(1) */
999 if (unlikely(bo->handle > batch->bo_list.bit_count)) {
1000 const unsigned bits_per_word = sizeof(BITSET_WORD) * 8;
1001
1002 unsigned bit_count =
1003 MAX2(batch->bo_list.bit_count * 2,
1004 util_next_power_of_two(ALIGN_POT(bo->handle + 1, bits_per_word)));
1005
1006 batch->bo_list.set = rerzalloc(
1007 batch->ctx, batch->bo_list.set, BITSET_WORD,
1008 batch->bo_list.bit_count / bits_per_word, bit_count / bits_per_word);
1009 batch->bo_list.bit_count = bit_count;
1010 }
1011
1012 if (BITSET_TEST(batch->bo_list.set, bo->handle))
1013 return;
1014
1015 /* The batch holds a single reference to each BO in the batch, released when
1016 * the batch finishes execution.
1017 */
1018 agx_bo_reference(bo);
1019 BITSET_SET(batch->bo_list.set, bo->handle);
1020 }
1021
1022 #define AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) \
1023 BITSET_FOREACH_SET(handle, (batch)->bo_list.set, batch->bo_list.bit_count)
1024
1025 void agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch,
1026 uint32_t barriers, enum drm_asahi_cmd_type cmd_type,
1027 void *cmdbuf);
1028
1029 void agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch);
1030 void agx_flush_batch_for_reason(struct agx_context *ctx,
1031 struct agx_batch *batch, const char *reason);
1032 void agx_flush_all(struct agx_context *ctx, const char *reason);
1033 void agx_flush_readers(struct agx_context *ctx, struct agx_resource *rsrc,
1034 const char *reason);
1035 void agx_flush_writer(struct agx_context *ctx, struct agx_resource *rsrc,
1036 const char *reason);
1037
1038 void agx_sync_writer(struct agx_context *ctx, struct agx_resource *rsrc,
1039 const char *reason);
1040 void agx_sync_readers(struct agx_context *ctx, struct agx_resource *rsrc,
1041 const char *reason);
1042 void agx_sync_batch(struct agx_context *ctx, struct agx_batch *batch);
1043 void agx_sync_all(struct agx_context *ctx, const char *reason);
1044 void agx_sync_batch_for_reason(struct agx_context *ctx, struct agx_batch *batch,
1045 const char *reason);
1046 void agx_memory_barrier(struct pipe_context *pctx, unsigned flags);
1047
1048 /* Use these instead of batch_add_bo for proper resource tracking */
1049 void agx_batch_reads(struct agx_batch *batch, struct agx_resource *rsrc);
1050 void agx_batch_writes(struct agx_batch *batch, struct agx_resource *rsrc,
1051 unsigned level);
1052 void agx_batch_writes_range(struct agx_batch *batch, struct agx_resource *rsrc,
1053 unsigned offset, unsigned size);
1054 void agx_batch_track_image(struct agx_batch *batch,
1055 struct pipe_image_view *image);
1056
1057 bool agx_any_batch_uses_resource(struct agx_context *ctx,
1058 struct agx_resource *rsrc);
1059
1060 /* 16384 is the maximum framebuffer dimension, so we use a larger width (the
1061 * maximum uint16_t) as a sentinel to identify the compute batch. This ensures
1062 * compute batches don't mix with graphics. This is a bit of a hack but it
1063 * works.
1064 */
1065 #define AGX_COMPUTE_BATCH_WIDTH 0xFFFF
1066
1067 static inline bool
1068 agx_batch_is_compute(struct agx_batch *batch)
1069 {
1070 return batch->key.width == AGX_COMPUTE_BATCH_WIDTH;
1071 }
1072
1073 struct agx_batch *agx_get_batch(struct agx_context *ctx);
1074 struct agx_batch *agx_get_compute_batch(struct agx_context *ctx);
1075 void agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch);
1076 int agx_cleanup_batches(struct agx_context *ctx);
1077
1078 void agx_batch_add_timestamp_query(struct agx_batch *batch,
1079 struct agx_query *q);
1080 void agx_add_timestamp_end_query(struct agx_context *ctx, struct agx_query *q);
1081
1082 void agx_query_increment_cpu(struct agx_context *ctx, struct agx_query *query,
1083 uint64_t increment);
1084
1085 /* Blit shaders */
1086 void agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter,
1087 bool render_cond);
1088
1089 void agx_blit(struct pipe_context *pipe, const struct pipe_blit_info *info);
1090
1091 void agx_resource_copy_region(struct pipe_context *pctx,
1092 struct pipe_resource *dst, unsigned dst_level,
1093 unsigned dstx, unsigned dsty, unsigned dstz,
1094 struct pipe_resource *src, unsigned src_level,
1095 const struct pipe_box *src_box);
1096
1097 /* Batch logic */
1098
1099 struct agx_encoder agx_encoder_allocate(struct agx_batch *batch,
1100 struct agx_device *dev);
1101
1102 void agx_batch_init_state(struct agx_batch *batch);
1103
1104 uint64_t agx_build_meta(struct agx_batch *batch, bool store,
1105 bool partial_render);
1106
1107 /* Query management */
1108 uint16_t agx_get_oq_index(struct agx_batch *batch, struct agx_query *query);
1109 uint64_t agx_get_query_address(struct agx_batch *batch,
1110 struct agx_query *query);
1111 uint64_t agx_get_occlusion_heap(struct agx_batch *batch);
1112
1113 void agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts,
1114 uint64_t end_ts);
1115
1116 bool agx_render_condition_check_inner(struct agx_context *ctx);
1117
1118 static inline bool
1119 agx_render_condition_check(struct agx_context *ctx)
1120 {
1121 if (likely(!ctx->cond_query))
1122 return true;
1123 else
1124 return agx_render_condition_check_inner(ctx);
1125 }
1126
1127 /* Texel buffers lowered to (at most) 1024x16384 2D textures */
1128 #define AGX_TEXTURE_BUFFER_WIDTH 1024
1129 #define AGX_TEXTURE_BUFFER_MAX_HEIGHT 16384
1130 #define AGX_TEXTURE_BUFFER_MAX_SIZE \
1131 (AGX_TEXTURE_BUFFER_WIDTH * AGX_TEXTURE_BUFFER_MAX_HEIGHT)
1132
1133 static inline uint32_t
1134 agx_texture_buffer_size_el(enum pipe_format format, uint32_t size)
1135 {
1136 unsigned blocksize = util_format_get_blocksize(format);
1137
1138 return MIN2(AGX_TEXTURE_BUFFER_MAX_SIZE, size / blocksize);
1139 }
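
/* Illustrative sketch only (hypothetical helper): the 2D extent that a texel
 * buffer of size_el elements occupies after the lowering described above: a
 * fixed width of AGX_TEXTURE_BUFFER_WIDTH and just enough rows to cover the
 * elements.
 */
static inline void
agx_example_texture_buffer_extent_el(uint32_t size_el, uint32_t *width_el,
                                     uint32_t *height_el)
{
   *width_el = AGX_TEXTURE_BUFFER_WIDTH;
   *height_el = DIV_ROUND_UP(MAX2(size_el, 1), AGX_TEXTURE_BUFFER_WIDTH);
}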
1140
1141 typedef void (*meta_shader_builder_t)(struct nir_builder *b, const void *key);
1142
1143 void agx_init_meta_shaders(struct agx_context *ctx);
1144
1145 void agx_destroy_meta_shaders(struct agx_context *ctx);
1146