/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_CMD_BUFFER_H
#define TU_CMD_BUFFER_H

#include "tu_common.h"

#include "tu_cs.h"
#include "tu_descriptor_set.h"
#include "tu_device.h"
#include "tu_lrz.h"
#include "tu_pass.h"
#include "tu_pipeline.h"

enum tu_draw_state_group_id
{
   TU_DRAW_STATE_PROGRAM_CONFIG,
   TU_DRAW_STATE_PROGRAM,
   TU_DRAW_STATE_PROGRAM_BINNING,
   TU_DRAW_STATE_VB,
   TU_DRAW_STATE_VI,
   TU_DRAW_STATE_VI_BINNING,
   TU_DRAW_STATE_RAST,
   TU_DRAW_STATE_CONST,
   TU_DRAW_STATE_DESC_SETS,
   TU_DRAW_STATE_DESC_SETS_LOAD,
   TU_DRAW_STATE_VS_PARAMS,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
   TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
   TU_DRAW_STATE_PRIM_MODE_GMEM,
   TU_DRAW_STATE_PRIM_MODE_SYSMEM,

   /* dynamic state related draw states */
   TU_DRAW_STATE_DYNAMIC,
   TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};

struct tu_descriptor_state
{
   struct tu_descriptor_set *sets[MAX_SETS];
   struct tu_descriptor_set push_set;
   uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
};

enum tu_cmd_dirty_bits
{
   TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
   TU_CMD_DIRTY_VB_STRIDE = BIT(1),
   TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2),
   TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3),
   TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4),
   TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5),
   TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
   TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
   TU_CMD_DIRTY_LRZ = BIT(8),
   TU_CMD_DIRTY_VS_PARAMS = BIT(9),
   TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10),
   TU_CMD_DIRTY_VIEWPORTS = BIT(11),
   TU_CMD_DIRTY_BLEND = BIT(12),
   /* all draw states were disabled and need to be re-enabled: */
   TU_CMD_DIRTY_DRAW_STATE = BIT(13)
};

/* There are only three cache domains we have to care about: the CCU, or
 * color cache unit, which is used for color and depth/stencil attachments
 * and copy/blit destinations and is conceptually split into a color and a
 * depth half, and the universal cache, or UCHE, which is used for pretty
 * much everything else except the CP (uncached) and the host. We need to
 * flush whenever data crosses these boundaries.
 */

enum tu_cmd_access_mask {
   TU_ACCESS_UCHE_READ = 1 << 0,
   TU_ACCESS_UCHE_WRITE = 1 << 1,
   TU_ACCESS_CCU_COLOR_READ = 1 << 2,
   TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
   TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
   TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,

   /* Experiments have shown that while it's safe to avoid flushing the CCU
    * after each blit/renderpass, it's not safe to assume that subsequent
    * lookups with a different attachment state will hit unflushed cache
    * entries. That is, the CCU needs to be flushed and possibly invalidated
    * when accessing memory with a different attachment state. Writing to an
    * attachment under the following conditions after clearing using the
    * normal 2d engine path is known to have issues:
    *
    * - It isn't the 0'th layer.
    * - There is more than one attachment, and this isn't the 0'th attachment
    *   (this seems to also depend on the cpp of the attachments).
    *
    * Our best guess is that the layer/MRT state is used when computing the
    * location of a cache entry in the CCU, to avoid conflicts. We assume that
    * any access in a renderpass after or before an access by a transfer needs
    * a flush/invalidate, and use the _INCOHERENT variants to represent access
    * by a renderpass.
    */
   TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
   TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,

   /* Accesses which bypass any cache: e.g. writes via the host,
    * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
    */
   TU_ACCESS_SYSMEM_READ = 1 << 10,
   TU_ACCESS_SYSMEM_WRITE = 1 << 11,

   /* Memory writes from the CP start in order with draws and event writes,
    * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES before
    * they are read.
    */
   TU_ACCESS_CP_WRITE = 1 << 12,

   TU_ACCESS_READ =
      TU_ACCESS_UCHE_READ |
      TU_ACCESS_CCU_COLOR_READ |
      TU_ACCESS_CCU_DEPTH_READ |
      TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
      TU_ACCESS_SYSMEM_READ,

   TU_ACCESS_WRITE =
      TU_ACCESS_UCHE_WRITE |
      TU_ACCESS_CCU_COLOR_WRITE |
      TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
      TU_ACCESS_CCU_DEPTH_WRITE |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
      TU_ACCESS_SYSMEM_WRITE |
      TU_ACCESS_CP_WRITE,

   TU_ACCESS_ALL =
      TU_ACCESS_READ |
      TU_ACCESS_WRITE,
};

/* Starting with a6xx, the pipeline is split into several "clusters" (really
 * pipeline stages). Each stage has its own pair of register banks and can
 * switch them independently, so that earlier stages can run ahead of later
 * ones. E.g. the FS of draw N and the VS of draw N + 1 can be executing at
 * the same time.
 *
 * As a result of this, we need to insert a WFI when an earlier stage depends
 * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
 * pending WFI's to complete before starting, and usually even before reading
 * indirect params, so a WFI also acts as a full "pipeline stall".
 *
 * Note, the names of the stages come from CLUSTER_* in devcoredump. We
 * include all the stages for completeness, even ones which do not read/write
 * anything.
 */

enum tu_stage {
   /* This doesn't correspond to a cluster, but we need it for tracking
    * indirect draw parameter reads etc.
    */
   TU_STAGE_CP,

   /* - Fetch index buffer
    * - Fetch vertex attributes, dispatch VS
    */
   TU_STAGE_FE,

   /* Execute all geometry stages (VS thru GS) */
   TU_STAGE_SP_VS,

   /* Write to VPC, do primitive assembly. */
   TU_STAGE_PC_VS,

   /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
    * to the devcoredump, so presumably this stage stalls for TU_STAGE_PS when
    * early depth testing is enabled, before dispatching fragments? However
    * GRAS reads and writes LRZ directly.
    */
   TU_STAGE_GRAS,

   /* Execute FS */
   TU_STAGE_SP_PS,

   /* - Fragment tests
    * - Write color/depth
    * - Streamout writes (???)
    * - Varying interpolation (???)
    */
   TU_STAGE_PS,
};

enum tu_cmd_flush_bits {
   TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
   TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
   TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
   TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
   TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
   TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
   TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
   TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
   TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,

   TU_CMD_FLAG_ALL_FLUSH =
      TU_CMD_FLAG_CCU_FLUSH_DEPTH |
      TU_CMD_FLAG_CCU_FLUSH_COLOR |
      TU_CMD_FLAG_CACHE_FLUSH |
      /* Treat the CP as a sort of "cache" which may need to be "flushed" via
       * waiting for writes to land with WAIT_MEM_WRITES.
       */
      TU_CMD_FLAG_WAIT_MEM_WRITES,

   TU_CMD_FLAG_ALL_INVALIDATE =
      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
      TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
      TU_CMD_FLAG_CACHE_INVALIDATE |
      /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when
       * a command that needs CP_WAIT_FOR_ME is executed. This means we may
       * insert an extra WAIT_FOR_ME before an indirect command requiring it
       * in case there was another command before the current command buffer
       * that it needs to wait for.
       */
      TU_CMD_FLAG_WAIT_FOR_ME,
};
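
/* Illustrative sketch only, not part of the driver: roughly how a write in
 * one of the domains above might map to the flush bits needed to make it
 * visible outside that domain, following the comments above. The real
 * barrier logic lives in tu_cmd_buffer.c and also handles reads,
 * invalidates, and the pending-vs-immediate split; this hypothetical helper
 * only demonstrates the basic domain-crossing idea.
 */
static inline enum tu_cmd_flush_bits
tu_example_flush_bits_for_write(enum tu_cmd_access_mask write_access)
{
   uint32_t flush = 0;

   /* CCU writes stay in the color/depth halves of the CCU until flushed. */
   if (write_access & (TU_ACCESS_CCU_COLOR_WRITE |
                       TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE))
      flush |= TU_CMD_FLAG_CCU_FLUSH_COLOR;
   if (write_access & (TU_ACCESS_CCU_DEPTH_WRITE |
                       TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE))
      flush |= TU_CMD_FLAG_CCU_FLUSH_DEPTH;

   /* UCHE writes need a general cache flush. */
   if (write_access & TU_ACCESS_UCHE_WRITE)
      flush |= TU_CMD_FLAG_CACHE_FLUSH;

   /* CP writes land asynchronously; "flushing" them means waiting for the
    * memory writes to complete.
    */
   if (write_access & TU_ACCESS_CP_WRITE)
      flush |= TU_CMD_FLAG_WAIT_MEM_WRITES;

   return (enum tu_cmd_flush_bits) flush;
}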

/* Changing the CCU from sysmem mode to gmem mode or vice versa is pretty
 * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
 * which part of the gmem is used by the CCU. Here we keep track of the state
 * of the CCU.
 */
enum tu_cmd_ccu_state {
   TU_CMD_CCU_SYSMEM,
   TU_CMD_CCU_GMEM,
   TU_CMD_CCU_UNKNOWN,
};

struct tu_cache_state {
   /* Caches which must be made available (flushed) eventually if there are
    * any users outside that cache domain, and caches which must be
    * invalidated eventually if there are any reads.
    */
   enum tu_cmd_flush_bits pending_flush_bits;
   /* Flushes which must be emitted before the next command. */
   enum tu_cmd_flush_bits flush_bits;
};
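
/* Hypothetical illustration, not the driver's actual barrier code: the split
 * above means a write only records *pending* flush bits, and those bits are
 * promoted into flush_bits (and therefore actually emitted) once a later
 * access needs the data in another domain. The real logic in tu_cmd_buffer.c
 * also computes which bits a given access requires and handles invalidates.
 */
static inline void
tu_example_promote_pending_flushes(struct tu_cache_state *cache,
                                   enum tu_cmd_flush_bits needed)
{
   uint32_t promoted = cache->pending_flush_bits & needed;

   cache->flush_bits =
      (enum tu_cmd_flush_bits) (cache->flush_bits | promoted);
   cache->pending_flush_bits =
      (enum tu_cmd_flush_bits) (cache->pending_flush_bits & ~promoted);
}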

struct tu_vs_params {
   uint32_t vertex_offset;
   uint32_t first_instance;
};

/* This should be for state that is set inside a renderpass and used at
 * renderpass end time, e.g. to decide whether to use sysmem. This needs
 * special handling for secondary cmdbufs and suspending/resuming render
 * passes, where the state may need to be combined afterwards.
 */
struct tu_render_pass_state
{
   bool xfb_used;
   bool has_tess;
   bool has_prim_generated_query_in_rp;
   bool disable_gmem;

   /* Track whether the conditional predicate for COND_REG_EXEC is changed
    * in draw_cs.
    */
   bool draw_cs_writes_to_cond_pred;

   uint32_t drawcall_count;

   /* A calculated "draw cost" value for the renderpass, which tries to
    * estimate the bandwidth-per-sample of all the draws according to:
    *
    *    foreach_draw (...) {
    *       sum += pipeline->color_bandwidth_per_sample;
    *       if (depth_test_enabled)
    *          sum += pipeline->depth_cpp_per_sample;
    *       if (depth_write_enabled)
    *          sum += pipeline->depth_cpp_per_sample;
    *       if (stencil_write_enabled)
    *          sum += pipeline->stencil_cpp_per_sample * 2;
    *    }
    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
    *
    * It allows us to estimate the total bandwidth of drawcalls later, by
    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
    *
    * This does ignore depth buffer traffic for samples which do not
    * pass due to depth-test fail, and some other details. But it is
    * just intended to be a rough estimate that is easy to calculate.
    */
   uint32_t drawcall_bandwidth_per_sample_sum;
};
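
/* Worked example for the estimate above (illustrative numbers only): a
 * renderpass with two draws, where draw A writes a 4 byte/sample color
 * attachment with depth test and depth write enabled on a 2 cpp depth
 * buffer, and draw B only writes the same color attachment, accumulates
 *
 *    sum = (4 + 2 + 2) + 4 = 12
 *
 * so with drawcall_count = 2, drawcall_bandwidth_per_sample = 12 / 2 = 6
 * bytes per covered sample.
 */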

struct tu_cmd_state
{
   uint32_t dirty;

   struct tu_pipeline *pipeline;
   struct tu_pipeline *compute_pipeline;

   struct tu_render_pass_state rp;

   /* Vertex buffers, viewports, and scissors: the state for these can be
    * updated partially, so we need to save it here to be able to emit a
    * complete draw state.
    */
   struct {
      uint64_t base;
      uint32_t size;
      uint32_t stride;
   } vb[MAX_VBS];
   VkViewport viewport[MAX_VIEWPORTS];
   VkRect2D scissor[MAX_SCISSORS];
   uint32_t max_viewport, max_scissor;

   /* for dynamic states that can't be emitted directly */
   uint32_t dynamic_stencil_mask;
   uint32_t dynamic_stencil_wrmask;
   uint32_t dynamic_stencil_ref;

   uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl;
   uint32_t pc_raster_cntl, vpc_unknown_9107;
   uint32_t rb_mrt_control[MAX_RTS], rb_mrt_blend_control[MAX_RTS];
   uint32_t rb_mrt_control_rop;
   uint32_t rb_blend_cntl, sp_blend_cntl;
   uint32_t pipeline_color_write_enable, pipeline_blend_enable;
   uint32_t color_write_enable;
   bool logic_op_enabled;
   bool rop_reads_dst;
   enum pc_di_primtype primtype;
   bool primitive_restart_enable;

   /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
   struct tu_draw_state vertex_buffers;
   struct tu_draw_state shader_const;
   struct tu_draw_state desc_sets;

   struct tu_draw_state vs_params;

   /* Index buffer */
   uint64_t index_va;
   uint32_t max_index_count;
   uint8_t index_size;

   /* Because the streamout base has to be 32-byte aligned, there is an
    * extra offset to deal with when it is unaligned.
    */
   uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];

   /* Renderpasses are tricky, because we may need to flush differently if
    * using sysmem vs. gmem and therefore we have to delay any flushing that
    * happens before a renderpass. So we have to have two copies of the flush
    * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
    * and one for outside a renderpass.
    */
   struct tu_cache_state cache;
   struct tu_cache_state renderpass_cache;

   enum tu_cmd_ccu_state ccu_state;

   /* Decides which of the tu_render_pass's GMEM layouts to use, based on
    * whether the CCU might get used by tu_store_gmem_attachment().
    */
   enum tu_gmem_layout gmem_layout;

   const struct tu_render_pass *pass;
   const struct tu_subpass *subpass;
   const struct tu_framebuffer *framebuffer;
   const struct tu_tiling_config *tiling;
   VkRect2D render_area;

   const struct tu_image_view **attachments;

   /* State that in the dynamic case comes from VkRenderingInfo and needs to
    * be saved/restored when suspending. This holds the state for the last
    * suspended renderpass, which may point to this command buffer's dynamic_*
    * or another command buffer if executed on a secondary.
    */
   struct {
      const struct tu_render_pass *pass;
      const struct tu_subpass *subpass;
      const struct tu_framebuffer *framebuffer;
      VkRect2D render_area;
      enum tu_gmem_layout gmem_layout;

      const struct tu_image_view **attachments;

      struct tu_lrz_state lrz;
   } suspended_pass;

   bool tessfactor_addr_set;
   bool predication_active;
   enum a5xx_line_mode line_mode;
   bool z_negative_one_to_one;

   /* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
    * VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run
    * simultaneously, but they use the same {START,STOP}_PRIMITIVE_CTRS
    * control.
    */
   uint32_t prim_counters_running;

   bool prim_generated_query_running_before_rp;

   /* These are the states of the suspend/resume state machine. In addition to
    * tracking whether we're in the middle of a chain of suspending and
    * resuming passes that will be merged, we need to track whether the
    * command buffer begins in the middle of such a chain, for when it gets
    * merged with other command buffers. We call such a chain that begins
    * before the command buffer starts a "pre-chain".
    *
    * Note that when this command buffer is finished, this state is untouched
    * but it gains a different meaning. For example, if we finish in state
    * SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
    * there's a suspend/resume chain that extends past the end of the command
    * buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
    * means that there's a suspend/resume chain that extends before the
    * beginning.
    */
   enum {
      /* Either there are no suspend/resume chains, or they are entirely
       * contained in the current command buffer.
       *
       *   BeginCommandBuffer() <- start of current command buffer
       *   ...
       *   // we are here
       */
      SR_NONE = 0,

      /* We are in the middle of a suspend/resume chain that starts before the
       * current command buffer. This happens when the command buffer begins
       * with a resuming render pass and all of the passes up to the current
       * one are suspending. In this state, our part of the chain is not saved
       * and is in the current draw_cs/state.
       *
       *   BeginRendering() ... EndRendering(suspending)
       *   BeginCommandBuffer() <- start of current command buffer
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   ...
       *   // we are here
       */
      SR_IN_PRE_CHAIN,

      /* We are currently outside of any suspend/resume chains, but there is a
       * chain starting before the current command buffer. It is saved in
       * pre_chain.
       *
       *   BeginRendering() ... EndRendering(suspending)
       *   BeginCommandBuffer() <- start of current command buffer
       *   // This part is stashed in pre_chain
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   ...
       *   BeginRendering(resuming) ... EndRendering() // end of chain
       *   ...
       *   // we are here
       */
      SR_AFTER_PRE_CHAIN,

      /* We are in the middle of a suspend/resume chain and there is no chain
       * starting before the current command buffer.
       *
       *   BeginCommandBuffer() <- start of current command buffer
       *   ...
       *   BeginRendering() ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   ...
       *   // we are here
       */
      SR_IN_CHAIN,

      /* We are in the middle of a suspend/resume chain and there is another,
       * separate, chain starting before the current command buffer.
       *
       *   BeginRendering() ... EndRendering(suspending)
       *   BeginCommandBuffer() <- start of current command buffer
       *   // This part is stashed in pre_chain
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   ...
       *   BeginRendering(resuming) ... EndRendering() // end of chain
       *   ...
       *   BeginRendering() ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   BeginRendering(resuming) ... EndRendering(suspending)
       *   ...
       *   // we are here
       */
      SR_IN_CHAIN_AFTER_PRE_CHAIN,
   } suspend_resume;
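
   /* Rough summary of the transitions described above (illustrative only;
    * the authoritative logic is in tu_cmd_buffer.c):
    *
    *   SR_NONE              -> SR_IN_PRE_CHAIN    first pass of the cmdbuf resumes
    *   SR_NONE              -> SR_IN_CHAIN        a pass ends with a suspend
    *   SR_IN_PRE_CHAIN      -> SR_AFTER_PRE_CHAIN chain ends; stashed in pre_chain
    *   SR_AFTER_PRE_CHAIN   -> SR_IN_CHAIN_AFTER_PRE_CHAIN
    *                                              a later pass ends with a suspend
    *   SR_IN_CHAIN          -> SR_NONE            chain ends within the cmdbuf
    *   SR_IN_CHAIN_AFTER_PRE_CHAIN
    *                        -> SR_AFTER_PRE_CHAIN chain ends within the cmdbuf
    */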

   bool suspending, resuming;

   struct tu_lrz_state lrz;

   struct tu_draw_state lrz_and_depth_plane_state;

   struct tu_vs_params last_vs_params;
};

struct tu_cmd_pool
{
   struct vk_command_pool vk;

   struct list_head cmd_buffers;
   struct list_head free_cmd_buffers;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, vk.base, VkCommandPool,
                               VK_OBJECT_TYPE_COMMAND_POOL)

enum tu_cmd_buffer_status
{
   TU_CMD_BUFFER_STATUS_INVALID,
   TU_CMD_BUFFER_STATUS_INITIAL,
   TU_CMD_BUFFER_STATUS_RECORDING,
   TU_CMD_BUFFER_STATUS_EXECUTABLE,
   TU_CMD_BUFFER_STATUS_PENDING,
};

struct tu_cmd_buffer
{
   struct vk_command_buffer vk;

   struct tu_device *device;

   struct tu_cmd_pool *pool;
   struct list_head pool_link;

   struct u_trace trace;
   struct u_trace_iterator trace_renderpass_start;
   struct u_trace_iterator trace_renderpass_end;

   struct list_head renderpass_autotune_results;
   struct tu_autotune_results_buffer* autotune_buffer;

   VkCommandBufferUsageFlags usage_flags;
   enum tu_cmd_buffer_status status;

   VkQueryPipelineStatisticFlags inherited_pipeline_statistics;

   struct tu_cmd_state state;
   uint32_t queue_family_index;

   uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
   VkShaderStageFlags push_constant_stages;
   struct tu_descriptor_set meta_push_descriptors;

   struct tu_descriptor_state descriptors[MAX_BIND_POINTS];

   struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1)];
   struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
   struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
   const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1)];

   struct tu_render_pass dynamic_pass;
   struct tu_subpass dynamic_subpass;
   struct tu_framebuffer dynamic_framebuffer;

   VkResult record_result;

   struct tu_cs cs;
   struct tu_cs draw_cs;
   struct tu_cs tile_store_cs;
   struct tu_cs draw_epilogue_cs;
   struct tu_cs sub_cs;

   /* If the first render pass in the command buffer is resuming, then it is
    * part of a suspend/resume chain that starts before the current command
    * buffer and needs to be merged later. In this case, its incomplete state
    * is stored in pre_chain. In the symmetric case where the last render pass
    * is suspending, we just skip ending the render pass and its state is
    * stored in draw_cs/the current state. The first and last render pass
    * might be part of different chains, which is why all the state may need
    * to be saved separately here.
    */
   struct {
      struct tu_cs draw_cs;
      struct tu_cs draw_epilogue_cs;

      struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;

      struct tu_render_pass_state state;
   } pre_chain;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
};
VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
                       VK_OBJECT_TYPE_COMMAND_BUFFER)

static inline uint32_t
tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd,
                          const struct tu_render_pass_attachment *att)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset[cmd->state.gmem_layout];
}

static inline uint32_t
tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd,
                                  const struct tu_render_pass_attachment *att)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset_stencil[cmd->state.gmem_layout];
}

void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
                                const struct tu_render_pass_state *src);

VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
                             VkCommandBufferUsageFlags usage_flags);

void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
                                    struct tu_cs *cs);

void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                             struct tu_cs *cs,
                             enum tu_cmd_ccu_state ccu_state);

void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
                    struct tu_cmd_buffer *secondary);

void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
                         struct tu_cmd_buffer *secondary);

void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
                     struct tu_cmd_buffer *secondary);

void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
                          struct tu_cmd_buffer *suspended);

void tu_cmd_render(struct tu_cmd_buffer *cmd);

void
tu6_emit_event_write(struct tu_cmd_buffer *cmd,
                     struct tu_cs *cs,
                     enum vgt_event_type event);

static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
                         VkPipelineBindPoint bind_point)
{
   return &cmd_buffer->descriptors[bind_point];
}
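
/* Illustrative only: a sketch of how the per-bind-point descriptor state is
 * meant to be used when binding a set. This is a hypothetical helper, not
 * the driver's actual vkCmdBindDescriptorSets implementation (which also
 * handles dynamic offsets and rebuilds the descriptor draw states).
 */
static inline void
tu_example_bind_descriptor_set(struct tu_cmd_buffer *cmd,
                               VkPipelineBindPoint bind_point,
                               uint32_t index,
                               struct tu_descriptor_set *set)
{
   struct tu_descriptor_state *descriptors =
      tu_get_descriptors_state(cmd, bind_point);

   descriptors->sets[index] = set;

   /* The new set contents have to be re-emitted/loaded for the right
    * pipeline type.
    */
   cmd->state.dirty |= (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE)
                          ? TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD
                          : TU_CMD_DIRTY_DESC_SETS_LOAD;
}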

void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
                   enum a5xx_line_mode line_mode);

void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);

void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);

void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

void tu6_apply_depth_bounds_workaround(struct tu_device *device,
                                       uint32_t *rb_depth_cntl);

void
update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);

#endif /* TU_CMD_BUFFER_H */