/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_CMD_BUFFER_H
#define TU_CMD_BUFFER_H

#include "tu_common.h"

#include "tu_cs.h"
#include "tu_descriptor_set.h"
#include "tu_device.h"
#include "tu_lrz.h"
#include "tu_pass.h"
#include "tu_pipeline.h"

enum tu_draw_state_group_id
{
   TU_DRAW_STATE_PROGRAM_CONFIG,
   TU_DRAW_STATE_VS,
   TU_DRAW_STATE_VS_BINNING,
   TU_DRAW_STATE_HS,
   TU_DRAW_STATE_DS,
   TU_DRAW_STATE_GS,
   TU_DRAW_STATE_GS_BINNING,
   TU_DRAW_STATE_VPC,
   TU_DRAW_STATE_FS,
   TU_DRAW_STATE_VB,
   TU_DRAW_STATE_CONST,
   TU_DRAW_STATE_DESC_SETS,
   TU_DRAW_STATE_DESC_SETS_LOAD,
   TU_DRAW_STATE_VS_PARAMS,
   TU_DRAW_STATE_FS_PARAMS,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
   TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
   TU_DRAW_STATE_PRIM_MODE_GMEM,

   /* dynamic state related draw states */
   TU_DRAW_STATE_DYNAMIC,
   TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};

struct tu_descriptor_state
{
   struct tu_descriptor_set *sets[MAX_SETS];
   struct tu_descriptor_set push_set;
   uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
   uint64_t set_iova[MAX_SETS];
   uint32_t max_sets_bound;
   uint32_t max_dynamic_offset_size;
};

enum tu_cmd_dirty_bits
{
   TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
   TU_CMD_DIRTY_DESC_SETS = BIT(1),
   TU_CMD_DIRTY_COMPUTE_DESC_SETS = BIT(2),
   TU_CMD_DIRTY_SHADER_CONSTS = BIT(3),
   TU_CMD_DIRTY_LRZ = BIT(4),
   TU_CMD_DIRTY_VS_PARAMS = BIT(5),
   TU_CMD_DIRTY_TESS_PARAMS = BIT(6),
   TU_CMD_DIRTY_SUBPASS = BIT(7),
   TU_CMD_DIRTY_FDM = BIT(8),
   TU_CMD_DIRTY_PER_VIEW_VIEWPORT = BIT(9),
   TU_CMD_DIRTY_TES = BIT(10),
   TU_CMD_DIRTY_PROGRAM = BIT(11),
   TU_CMD_DIRTY_RAST_ORDER = BIT(12),
   TU_CMD_DIRTY_FEEDBACK_LOOPS = BIT(13),
   TU_CMD_DIRTY_FS = BIT(14),
   TU_CMD_DIRTY_SHADING_RATE = BIT(15),
   /* all draw states were disabled and need to be re-enabled: */
   TU_CMD_DIRTY_DRAW_STATE = BIT(16)
};

/* There are only three cache domains we have to care about: the CCU, or
 * color cache unit, which is used for color and depth/stencil attachments
 * and copy/blit destinations, and is split conceptually into color and depth,
 * and the universal cache or UCHE which is used for pretty much everything
 * else, except for the CP (uncached) and host. We need to flush whenever data
 * crosses these boundaries.
 */
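
/* As a concrete example of such a boundary crossing (illustrative, not
 * exhaustive): a copy or blit writes its destination through the color CCU,
 * while a later draw that samples the same image reads it through UCHE, so
 * the CCU has to be cleaned and the UCHE invalidated in between. The access
 * masks below describe which side of such a boundary an operation is on, and
 * tu_flush_for_access() (declared near the end of this header) turns a
 * (src, dst) pair of masks into the flush bits needed for the crossing.
 */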

enum tu_cmd_access_mask {
   TU_ACCESS_NONE = 0,
   TU_ACCESS_UCHE_READ = 1 << 0,
   TU_ACCESS_UCHE_WRITE = 1 << 1,
   TU_ACCESS_CCU_COLOR_READ = 1 << 2,
   TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
   TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
   TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,

   /* Experiments have shown that while it's safe to avoid flushing the CCU
    * after each blit/renderpass, it's not safe to assume that subsequent
    * lookups with a different attachment state will hit unflushed cache
    * entries. That is, the CCU needs to be flushed and possibly invalidated
    * when accessing memory with a different attachment state. Writing to an
    * attachment under the following conditions after clearing using the
    * normal 2d engine path is known to have issues:
    *
    * - It isn't the 0'th layer.
    * - There is more than one attachment, and this isn't the 0'th attachment
    *   (this seems to also depend on the cpp of the attachments).
    *
    * Our best guess is that the layer/MRT state is used when computing
    * the location of a cache entry in CCU, to avoid conflicts. We assume that
    * any access in a renderpass after or before an access by a transfer needs
    * a flush/invalidate, and use the _INCOHERENT variants to represent access
    * by a renderpass.
    */
   TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
   TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,

   /* Accesses which bypass any cache, e.g. writes via the host,
    * CP_EVENT_WRITE::BLIT, and the CP, are SYSMEM_WRITE.
    */
   TU_ACCESS_SYSMEM_READ = 1 << 10,
   TU_ACCESS_SYSMEM_WRITE = 1 << 11,

   /* Memory writes from the CP start in-order with draws and event writes,
    * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
    */
   TU_ACCESS_CP_WRITE = 1 << 12,

   /* Descriptors are read through UCHE but are also prefetched via
    * CP_LOAD_STATE6 and the prefetched descriptors need to be invalidated
    * when they change.
    */
   TU_ACCESS_BINDLESS_DESCRIPTOR_READ = 1 << 13,

   /* A write to a GMEM attachment made by CP_EVENT_WRITE::BLIT. */
   TU_ACCESS_BLIT_WRITE_GMEM = 1 << 14,

   /* Similar to UCHE_READ, but specifically for GMEM attachment reads. */
   TU_ACCESS_UCHE_READ_GMEM = 1 << 15,

   /* The CCHE is a write-through cache which sits behind UCHE, with multiple
    * incoherent copies. Because it's write-through we only have to worry
    * about invalidating it for reads. It's invalidated by "ccinv" in the
    * shader and CP_CCHE_INVALIDATE in the command stream.
    */
   TU_ACCESS_CCHE_READ = 1 << 16,

   TU_ACCESS_READ =
      TU_ACCESS_UCHE_READ |
      TU_ACCESS_CCU_COLOR_READ |
      TU_ACCESS_CCU_DEPTH_READ |
      TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
      TU_ACCESS_SYSMEM_READ |
      TU_ACCESS_BINDLESS_DESCRIPTOR_READ |
      TU_ACCESS_CCHE_READ,

   TU_ACCESS_WRITE =
      TU_ACCESS_UCHE_WRITE |
      TU_ACCESS_CCU_COLOR_WRITE |
      TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
      TU_ACCESS_CCU_DEPTH_WRITE |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
      TU_ACCESS_SYSMEM_WRITE |
      TU_ACCESS_CP_WRITE,

   TU_ACCESS_ALL =
      TU_ACCESS_READ |
      TU_ACCESS_WRITE,
};

/* From the driver's point of view, we only need to distinguish between things
 * which won't start until a WFI is complete and things which additionally
 * need a WAIT_FOR_ME.
 *
 * TODO: This will get more complicated with concurrent binning.
 */
enum tu_stage {
   /* As a destination stage, this is for operations on the CP which don't
    * wait for pending WFIs to complete and therefore need a CP_WAIT_FOR_ME.
    * As a source stage, it is for things needing no waits.
    */
   TU_STAGE_CP,

   /* This is for most operations, which a WFI will wait on to finish and
    * which will not start until any pending WFIs have completed.
    */
   TU_STAGE_GPU,

   /* This is only used as a destination stage and is for things needing no
    * waits on the GPU (e.g. host operations).
    */
   TU_STAGE_BOTTOM,
};
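
/* Illustrative example (an informal reading of the stages above): the CP
 * itself reads the arguments of an indirect draw, so a barrier whose
 * destination is an indirect draw has TU_STAGE_CP as its destination stage
 * and may need a CP_WAIT_FOR_ME, whereas a barrier whose destination is an
 * ordinary draw only relies on the implicit WFI ordering of TU_STAGE_GPU.
 */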

enum tu_cmd_flush_bits {
   TU_CMD_FLAG_CCU_CLEAN_DEPTH = 1 << 0,
   TU_CMD_FLAG_CCU_CLEAN_COLOR = 1 << 1,
   TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
   TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
   TU_CMD_FLAG_CACHE_CLEAN = 1 << 4,
   TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
   TU_CMD_FLAG_CCHE_INVALIDATE = 1 << 6,
   TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 7,
   TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 8,
   TU_CMD_FLAG_WAIT_FOR_ME = 1 << 9,
   TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE = 1 << 10,
   /* This is an unusual flush that isn't automatically executed when pending,
    * because it isn't always necessary; it is therefore not included in
    * TU_CMD_FLAG_ALL_CLEAN.
    */
   TU_CMD_FLAG_BLIT_CACHE_CLEAN = 1 << 11,

   TU_CMD_FLAG_ALL_CLEAN =
      TU_CMD_FLAG_CCU_CLEAN_DEPTH |
      TU_CMD_FLAG_CCU_CLEAN_COLOR |
      TU_CMD_FLAG_CACHE_CLEAN |
      /* Treat the CP as a sort of "cache" which may need to be "flushed" via
       * waiting for writes to land with WAIT_FOR_MEM_WRITES.
       */
      TU_CMD_FLAG_WAIT_MEM_WRITES,

   TU_CMD_FLAG_ALL_INVALIDATE =
      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
      TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
      TU_CMD_FLAG_CACHE_INVALIDATE |
      TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE |
      TU_CMD_FLAG_CCHE_INVALIDATE |
      /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when
       * a command that needs CP_WAIT_FOR_ME is executed. This means we may
       * insert an extra WAIT_FOR_ME before an indirect command requiring it
       * in case there was another command before the current command buffer
       * that it needs to wait for.
       */
      TU_CMD_FLAG_WAIT_FOR_ME,
};

/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
 * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
 * which part of the gmem is used by the CCU. Here we keep track of the
 * current state of the CCU.
 */
enum tu_cmd_ccu_state {
   TU_CMD_CCU_SYSMEM,
   TU_CMD_CCU_GMEM,
   TU_CMD_CCU_UNKNOWN,
};

struct tu_cache_state {
   /* Caches which must be made available (flushed) eventually if there are
    * any users outside that cache domain, and caches which must be
    * invalidated eventually if there are any reads.
    */
   BITMASK_ENUM(tu_cmd_flush_bits) pending_flush_bits;
   /* Flushes which must be emitted before the next command that touches the
    * relevant data is executed.
    */
   BITMASK_ENUM(tu_cmd_flush_bits) flush_bits;
};
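
/* Illustrative flow (a sketch, not an exhaustive description): a transfer
 * that writes through the color CCU adds TU_CMD_FLAG_CCU_CLEAN_COLOR to
 * pending_flush_bits, since nothing has to happen yet. When a later access
 * from a different domain (e.g. a sampled read through UCHE) is recorded,
 * the relevant pending bits move into flush_bits, together with whatever
 * invalidates the read needs, and are emitted before that access executes.
 */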

struct tu_vs_params {
   uint32_t vertex_offset;
   uint32_t first_instance;
   uint32_t draw_id;
};

struct tu_tess_params {
   bool valid;
   enum a6xx_tess_output output_upper_left, output_lower_left;
   enum a6xx_tess_spacing spacing;
};

/* This should be for state that is set inside a renderpass and used at
 * renderpass end time, e.g. to decide whether to use sysmem. This needs
 * special handling for secondary cmdbufs and suspending/resuming render
 * passes where the state may need to be combined afterwards.
 */
struct tu_render_pass_state
{
   bool xfb_used;
   bool has_tess;
   bool has_prim_generated_query_in_rp;
   bool has_zpass_done_sample_count_write_in_rp;
   bool disable_gmem;
   bool sysmem_single_prim_mode;
   bool shared_viewport;

   /* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
   bool draw_cs_writes_to_cond_pred;

   uint32_t drawcall_count;

   /* A calculated "draw cost" value for the renderpass, which tries to
    * estimate the bandwidth-per-sample of all the draws according
    * to:
    *
    *    foreach_draw (...) {
    *       sum += pipeline->color_bandwidth_per_sample;
    *       if (depth_test_enabled)
    *          sum += pipeline->depth_cpp_per_sample;
    *       if (depth_write_enabled)
    *          sum += pipeline->depth_cpp_per_sample;
    *       if (stencil_write_enabled)
    *          sum += pipeline->stencil_cpp_per_sample * 2;
    *    }
    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
    *
    * It allows us to estimate the total bandwidth of drawcalls later, by
    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
    *
    * This does ignore depth buffer traffic for samples which do not
    * pass due to depth-test fail, and some other details. But it is
    * just intended to be a rough estimate that is easy to calculate.
    */
   uint32_t drawcall_bandwidth_per_sample_sum;

   const char *lrz_disable_reason;
};

/* These are the states of the suspend/resume state machine. In addition to
 * tracking whether we're in the middle of a chain of suspending and
 * resuming passes that will be merged, we need to track whether the
 * command buffer begins in the middle of such a chain, for when it gets
 * merged with other command buffers. We call such a chain that begins
 * before the command buffer starts a "pre-chain".
 *
 * Note that when this command buffer is finished, this state is untouched
 * but it gains a different meaning. For example, if we finish in state
 * SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
 * there's a suspend/resume chain that extends past the end of the command
 * buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
 * means that there's a suspend/resume chain that extends before the
 * beginning.
 */
enum tu_suspend_resume_state
{
   /* Either there are no suspend/resume chains, or they are entirely
    * contained in the current command buffer.
    *
    *   BeginCommandBuffer() <- start of current command buffer
    *       ...
    *       // we are here
    */
   SR_NONE = 0,

   /* We are in the middle of a suspend/resume chain that starts before the
    * current command buffer. This happens when the command buffer begins
    * with a resuming render pass and all of the passes up to the current
    * one are suspending. In this state, our part of the chain is not saved
    * and is in the current draw_cs/state.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       // we are here
    */
   SR_IN_PRE_CHAIN,

   /* We are currently outside of any suspend/resume chains, but there is a
    * chain starting before the current command buffer. It is saved in
    * pre_chain.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *       // This part is stashed in pre_chain
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       BeginRendering(resuming) ... EndRendering() // end of chain
    *       ...
    *       // we are here
    */
   SR_AFTER_PRE_CHAIN,

   /* We are in the middle of a suspend/resume chain and there is no chain
    * starting before the current command buffer.
    *
    *   BeginCommandBuffer() <- start of current command buffer
    *       ...
    *       BeginRendering() ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       // we are here
    */
   SR_IN_CHAIN,

   /* We are in the middle of a suspend/resume chain and there is another,
    * separate, chain starting before the current command buffer.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *       // This part is stashed in pre_chain
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       BeginRendering(resuming) ... EndRendering() // end of chain
    *       ...
    *       BeginRendering() ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       // we are here
    */
   SR_IN_CHAIN_AFTER_PRE_CHAIN,
};
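
/* Rough transition sketch (an informal summary of the diagrams above, not an
 * authoritative description of the implementation): a resuming
 * BeginRendering() while still in SR_NONE enters SR_IN_PRE_CHAIN, and ending
 * that chain with a non-suspending EndRendering() moves to SR_AFTER_PRE_CHAIN.
 * A suspending EndRendering() that starts a fresh chain takes SR_NONE to
 * SR_IN_CHAIN and SR_AFTER_PRE_CHAIN to SR_IN_CHAIN_AFTER_PRE_CHAIN;
 * completing that chain returns to the corresponding "not in a chain" state.
 */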

struct tu_cmd_state
{
   uint32_t dirty;

   struct tu_shader *shaders[MESA_SHADER_STAGES];

   struct tu_program_state program;

   struct tu_render_pass_state rp;

   struct vk_render_pass_state vk_rp;
   struct vk_vertex_input_state vi;
   struct vk_sample_locations_state sl;

   struct tu_bandwidth bandwidth;

   /* Vertex buffers:
    * the states for these can be updated partially, so we need to save them
    * here to be able to emit a complete draw state.
    */
   struct {
      uint64_t base;
      uint32_t size;
   } vb[MAX_VBS];

   uint32_t max_vbs_bound;

   bool per_view_viewport;

   /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
   struct tu_draw_state vertex_buffers;
   struct tu_draw_state shader_const;
   struct tu_draw_state desc_sets;
   struct tu_draw_state load_state;
   struct tu_draw_state compute_load_state;
   struct tu_draw_state prim_order_gmem;

   struct tu_draw_state vs_params;
   struct tu_draw_state fs_params;

   /* Index buffer */
   uint64_t index_va;
   uint32_t max_index_count;
   uint8_t index_size;

   /* because streamout base has to be 32-byte aligned
    * there is an extra offset to deal with when it is
    * unaligned
    */
   uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];

   /* Renderpasses are tricky, because we may need to flush differently if
    * using sysmem vs. gmem and therefore we have to delay any flushing that
    * happens before a renderpass. So we have to have two copies of the flush
    * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
    * and one for outside a renderpass.
    */
   struct tu_cache_state cache;
   struct tu_cache_state renderpass_cache;

   enum tu_cmd_ccu_state ccu_state;

   /* Decides which GMEM layout to use from the tu_pass, based on whether the CCU
    * might get used by tu_store_gmem_attachment().
    */
   enum tu_gmem_layout gmem_layout;

   const struct tu_render_pass *pass;
   const struct tu_subpass *subpass;
   const struct tu_framebuffer *framebuffer;
   const struct tu_tiling_config *tiling;
   VkRect2D render_area;

   const struct tu_image_view **attachments;
   VkClearValue *clear_values;

   /* State that in the dynamic case comes from VkRenderingInfo and needs to
    * be saved/restored when suspending. This holds the state for the last
    * suspended renderpass, which may point to this command buffer's dynamic_*
    * or another command buffer if executed on a secondary.
    */
   struct {
      const struct tu_render_pass *pass;
      const struct tu_subpass *subpass;
      const struct tu_framebuffer *framebuffer;
      VkRect2D render_area;
      enum tu_gmem_layout gmem_layout;

      const struct tu_image_view **attachments;
      VkClearValue *clear_values;

      struct tu_lrz_state lrz;
   } suspended_pass;

   bool tessfactor_addr_set;
   bool predication_active;
   bool msaa_disable;
   bool blend_reads_dest;
   bool stencil_front_write;
   bool stencil_back_write;
   bool pipeline_sysmem_single_prim_mode;
   bool pipeline_has_tess;
   bool pipeline_disable_gmem;
   bool raster_order_attachment_access;
   bool raster_order_attachment_access_valid;
   bool blit_cache_cleaned;
   VkImageAspectFlags pipeline_feedback_loops;
   bool pipeline_writes_shading_rate;
   bool pipeline_reads_shading_rate;
   bool pipeline_accesses_smask;

   bool pipeline_blend_lrz, pipeline_bandwidth;
   uint32_t pipeline_draw_states;

   /* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
    * VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
    * but they use the same {START,STOP}_PRIMITIVE_CTRS control.
    */
   uint32_t prim_counters_running;

   bool prim_generated_query_running_before_rp;

   enum tu_suspend_resume_state suspend_resume;

   bool suspending, resuming;

   struct tu_lrz_state lrz;

   struct tu_draw_state lrz_and_depth_plane_state;

   struct tu_vs_params last_vs_params;
   bool last_draw_indexed;

   struct tu_tess_params tess_params;

   uint64_t descriptor_buffer_iova[MAX_SETS];
};

struct tu_cmd_buffer
{
   struct vk_command_buffer vk;

   struct tu_device *device;

   struct u_trace trace;
   struct u_trace_iterator trace_renderpass_start;
   struct u_trace_iterator trace_renderpass_end;

   struct list_head renderpass_autotune_results;
   struct tu_autotune_results_buffer* autotune_buffer;

   void *patchpoints_ctx;
   struct util_dynarray fdm_bin_patchpoints;

   VkCommandBufferUsageFlags usage_flags;

   VkQueryPipelineStatisticFlags inherited_pipeline_statistics;

   struct tu_cmd_state state;
   uint32_t queue_family_index;

   uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
   VkShaderStageFlags push_constant_stages;
   struct tu_descriptor_set meta_push_descriptors;

   struct tu_descriptor_state descriptors[MAX_BIND_POINTS];

   struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1) + 2];
   struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
   struct tu_subpass_attachment dynamic_input_attachments[MAX_RTS + 1];
   struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
   const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1) + 2];
   VkClearValue dynamic_clear_values[2 * (MAX_RTS + 1)];

   struct tu_render_pass dynamic_pass;
   struct tu_subpass dynamic_subpass;
   struct tu_framebuffer dynamic_framebuffer;

   struct tu_cs cs;
   struct tu_cs draw_cs;
   struct tu_cs tile_store_cs;
   struct tu_cs draw_epilogue_cs;
   struct tu_cs sub_cs;

   /* If the first render pass in the command buffer is resuming, then it is
    * part of a suspend/resume chain that starts before the current command
    * buffer and needs to be merged later. In this case, its incomplete state
    * is stored in pre_chain. In the symmetric case where the last render pass
    * is suspending, we just skip ending the render pass and its state is
    * stored in draw_cs/the current state. The first and last render pass
    * might be part of different chains, which is why all the state may need
    * to be saved separately here.
    */
   struct {
      struct tu_cs draw_cs;
      struct tu_cs draw_epilogue_cs;

      struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;

      struct tu_render_pass_state state;

      struct util_dynarray fdm_bin_patchpoints;
      void *patchpoints_ctx;
   } pre_chain;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   uint64_t vsc_draw_strm_va, vsc_draw_strm_size_va, vsc_prim_strm_va;
   bool vsc_initialized;

   bool prev_fsr_is_null;
};
VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
                       VK_OBJECT_TYPE_COMMAND_BUFFER)

extern const struct vk_command_buffer_ops tu_cmd_buffer_ops;

static inline uint32_t
tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd,
                          const struct tu_render_pass_attachment *att,
                          uint32_t layer)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset[cmd->state.gmem_layout] +
      layer * cmd->state.tiling->tile0.width * cmd->state.tiling->tile0.height *
      att->cpp;
}
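
/* The per-layer stride above is one tile worth of data: layers of a GMEM
 * attachment are laid out back to back at tile granularity. For example,
 * with a hypothetical 96x96 tile size and a 4 cpp color attachment, layer 1
 * would start 96 * 96 * 4 = 36864 bytes after layer 0 within the
 * attachment's GMEM allocation (numbers purely illustrative).
 */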

static inline uint32_t
tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd,
                                  const struct tu_render_pass_attachment *att,
                                  uint32_t layer)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset_stencil[cmd->state.gmem_layout] +
      layer * cmd->state.tiling->tile0.width * cmd->state.tiling->tile0.height;
}

void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
                                const struct tu_render_pass_state *src);

VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
                             const VkCommandBufferBeginInfo *pBeginInfo);

template <chip CHIP>
void
tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer);

template <chip CHIP>
void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer);

template <chip CHIP>
void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                             struct tu_cs *cs,
                             enum tu_cmd_ccu_state ccu_state);

void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
                    struct tu_cmd_buffer *secondary);

void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
                         struct tu_cmd_buffer *secondary);

void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
                     struct tu_cmd_buffer *secondary);

void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
                          struct tu_cmd_buffer *suspended);

template <chip CHIP>
void tu_cmd_render(struct tu_cmd_buffer *cmd);

enum fd_gpu_event : uint32_t;

template <chip CHIP>
void
tu_emit_raw_event_write(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        enum vgt_event_type event,
                        bool needs_seqno);

template <chip CHIP>
void
tu_emit_event_write(struct tu_cmd_buffer *cmd,
                    struct tu_cs *cs,
                    enum fd_gpu_event event);

void
tu_flush_for_access(struct tu_cache_state *cache,
                    enum tu_cmd_access_mask src_mask,
                    enum tu_cmd_access_mask dst_mask);
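
/* Usage sketch (illustrative only; the access masks chosen here are an
 * assumption about how one particular barrier would be classified): a
 * barrier from a transfer write to a sampled read outside a render pass
 * could be recorded roughly as
 *
 *    tu_flush_for_access(&cmd->state.cache,
 *                        TU_ACCESS_CCU_COLOR_WRITE, // src: blit/copy dest
 *                        TU_ACCESS_UCHE_READ);      // dst: sampled image read
 *
 * which accumulates the required clean/invalidate bits in cmd->state.cache;
 * they are actually emitted later, e.g. by tu_emit_cache_flush().
 */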

static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
                         VkPipelineBindPoint bind_point)
{
   return &cmd_buffer->descriptors[bind_point];
}

void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
                   bool msaa_disable);

void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);

void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);

void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

void tu6_apply_depth_bounds_workaround(struct tu_device *device,
                                       uint32_t *rb_depth_cntl);

typedef void (*tu_fdm_bin_apply_t)(struct tu_cmd_buffer *cmd,
                                   struct tu_cs *cs,
                                   void *data,
                                   VkRect2D bin,
                                   unsigned views,
                                   VkExtent2D *frag_areas);

struct tu_fdm_bin_patchpoint {
   uint64_t iova;
   uint32_t size;
   void *data;
   tu_fdm_bin_apply_t apply;
};


void
tu_barrier(struct tu_cmd_buffer *cmd,
           uint32_t dep_count,
           const VkDependencyInfo *dep_info);

template <chip CHIP>
void
tu_write_event(struct tu_cmd_buffer *cmd, struct tu_event *event,
               VkPipelineStageFlags2 stageMask, unsigned value);

static inline void
_tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs,
                              unsigned size,
                              tu_fdm_bin_apply_t apply,
                              void *state,
                              unsigned state_size)
{
   void *data = ralloc_size(cmd->patchpoints_ctx, state_size);
   memcpy(data, state, state_size);
   assert(cs->writeable);
   tu_cs_reserve_space(cs, size);
   struct tu_fdm_bin_patchpoint patch = {
      .iova = tu_cs_get_cur_iova(cs),
      .size = size,
      .data = data,
      .apply = apply,
   };

   /* Apply the "default" setup where there is no scaling. This is used if
    * sysmem is required, and uses up the dwords that have been reserved.
    */
   unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
   VkExtent2D unscaled_frag_areas[num_views];
   for (unsigned i = 0; i < num_views; i++) {
      unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
   }
   apply(cmd, cs, state, (VkRect2D) {
      { 0, 0 },
      { MAX_VIEWPORT_SIZE, MAX_VIEWPORT_SIZE },
   }, num_views, unscaled_frag_areas);
   assert(tu_cs_get_cur_iova(cs) == patch.iova + patch.size * sizeof(uint32_t));

   util_dynarray_append(&cmd->fdm_bin_patchpoints,
                        struct tu_fdm_bin_patchpoint,
                        patch);
}

#define tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, state) \
   _tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, &state, sizeof(state))
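
/* Usage sketch (hypothetical caller, for illustration only): a caller defines
 * a small state struct and an apply callback matching tu_fdm_bin_apply_t,
 * then reserves "size" dwords at the current CS position for per-bin
 * patching:
 *
 *    struct example_state { uint32_t some_value; };
 *
 *    static void
 *    example_apply(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
 *                  VkRect2D bin, unsigned views, VkExtent2D *frag_areas)
 *    {
 *       struct example_state *s = (struct example_state *) data;
 *       // emit exactly "size" dwords here, derived from s, bin, and
 *       // frag_areas
 *    }
 *
 *    struct example_state state = { .some_value = ... };
 *    tu_create_fdm_bin_patchpoint(cmd, cs, 2, example_apply, state);
 *
 * The names example_state/example_apply are made up; the only requirements
 * are the tu_fdm_bin_apply_t signature and that the callback emits the same
 * number of dwords that were reserved.
 */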

VkResult tu_init_bin_preamble(struct tu_device *device);

#endif /* TU_CMD_BUFFER_H */