/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_CMD_BUFFER_H
#define TU_CMD_BUFFER_H

#include "tu_common.h"

#include "tu_cs.h"
#include "tu_descriptor_set.h"
#include "tu_device.h"
#include "tu_lrz.h"
#include "tu_pass.h"
#include "tu_pipeline.h"

enum tu_draw_state_group_id
{
   TU_DRAW_STATE_PROGRAM_CONFIG,
   TU_DRAW_STATE_PROGRAM,
   TU_DRAW_STATE_PROGRAM_BINNING,
   TU_DRAW_STATE_VB,
   TU_DRAW_STATE_VI,
   TU_DRAW_STATE_VI_BINNING,
   TU_DRAW_STATE_RAST,
   TU_DRAW_STATE_CONST,
   TU_DRAW_STATE_DESC_SETS,
   TU_DRAW_STATE_DESC_SETS_LOAD,
   TU_DRAW_STATE_VS_PARAMS,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
   TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
   TU_DRAW_STATE_PRIM_MODE_GMEM,
   TU_DRAW_STATE_PRIM_MODE_SYSMEM,

   /* dynamic state related draw states */
   TU_DRAW_STATE_DYNAMIC,
   TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};

struct tu_descriptor_state
{
   struct tu_descriptor_set *sets[MAX_SETS];
   struct tu_descriptor_set push_set;
   uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
};

enum tu_cmd_dirty_bits
{
   TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
   TU_CMD_DIRTY_VB_STRIDE = BIT(1),
   TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2),
   TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3),
   TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4),
   TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5),
   TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
   TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
   TU_CMD_DIRTY_LRZ = BIT(8),
   TU_CMD_DIRTY_VS_PARAMS = BIT(9),
   TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10),
   TU_CMD_DIRTY_VIEWPORTS = BIT(11),
   TU_CMD_DIRTY_BLEND = BIT(12),
   /* all draw states were disabled and need to be re-enabled: */
   TU_CMD_DIRTY_DRAW_STATE = BIT(13)
};
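
/* Typical pattern (an illustrative sketch, not a fixed API): state-setting
 * commands OR one of the bits above into cmd->state.dirty, and draw-time
 * code tests and then clears a bit once the state has been re-emitted:
 *
 *    cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
 *    ...
 *    if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
 *       // re-emit the TU_DRAW_STATE_VB draw state
 */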

/* There are only three cache domains we have to care about: the CCU, or
 * color cache unit, which is used for color and depth/stencil attachments
 * and copy/blit destinations, and is split conceptually into color and depth,
 * and the universal cache or UCHE which is used for pretty much everything
 * else, except for the CP (uncached) and host. We need to flush whenever data
 * crosses these boundaries.
 */

enum tu_cmd_access_mask {
   TU_ACCESS_UCHE_READ = 1 << 0,
   TU_ACCESS_UCHE_WRITE = 1 << 1,
   TU_ACCESS_CCU_COLOR_READ = 1 << 2,
   TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
   TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
   TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,

   /* Experiments have shown that while it's safe to avoid flushing the CCU
    * after each blit/renderpass, it's not safe to assume that subsequent
    * lookups with a different attachment state will hit unflushed cache
    * entries. That is, the CCU needs to be flushed and possibly invalidated
    * when accessing memory with a different attachment state. Writing to an
    * attachment under the following conditions after clearing using the
    * normal 2d engine path is known to have issues:
    *
    * - It isn't the 0'th layer.
    * - There is more than one attachment, and this isn't the 0'th attachment
    *   (this seems to also depend on the cpp of the attachments).
    *
    * Our best guess is that the layer/MRT state is used when computing
    * the location of a cache entry in CCU, to avoid conflicts. We assume that
    * any access in a renderpass after or before an access by a transfer needs
    * a flush/invalidate, and use the _INCOHERENT variants to represent access
    * by a renderpass.
    */
   TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
   TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,

   /* Accesses which bypass any cache, e.g. writes via the host,
    * CP_EVENT_WRITE::BLIT, and the CP, are SYSMEM_WRITE.
    */
   TU_ACCESS_SYSMEM_READ = 1 << 10,
   TU_ACCESS_SYSMEM_WRITE = 1 << 11,

   /* Memory writes from the CP start in-order with draws and event writes,
    * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
    */
   TU_ACCESS_CP_WRITE = 1 << 12,

   TU_ACCESS_READ =
      TU_ACCESS_UCHE_READ |
      TU_ACCESS_CCU_COLOR_READ |
      TU_ACCESS_CCU_DEPTH_READ |
      TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
      TU_ACCESS_SYSMEM_READ,

   TU_ACCESS_WRITE =
      TU_ACCESS_UCHE_WRITE |
      TU_ACCESS_CCU_COLOR_WRITE |
      TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
      TU_ACCESS_CCU_DEPTH_WRITE |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
      TU_ACCESS_SYSMEM_WRITE |
      TU_ACCESS_CP_WRITE,

   TU_ACCESS_ALL =
      TU_ACCESS_READ |
      TU_ACCESS_WRITE,
};
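
/* A minimal sketch (hypothetical helper, using the tu_cmd_flush_bits enum
 * defined below) of how a barrier maps accesses that cross cache domains to
 * flush bits: a CCU color write whose data is next read through UCHE needs
 * the CCU color cache flushed and UCHE invalidated.
 */
#if 0
static enum tu_cmd_flush_bits
example_flush_bits_for_hazard(enum tu_cmd_access_mask src,
                              enum tu_cmd_access_mask dst)
{
   enum tu_cmd_flush_bits flush = 0;
   if ((src & TU_ACCESS_CCU_COLOR_WRITE) && (dst & TU_ACCESS_UCHE_READ))
      flush |= TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE;
   /* per the comment above, any read after a CP write must wait for it */
   if ((src & TU_ACCESS_CP_WRITE) && (dst & TU_ACCESS_READ))
      flush |= TU_CMD_FLAG_WAIT_MEM_WRITES;
   return flush;
}
#endif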

/* Starting with a6xx, the pipeline is split into several "clusters" (really
 * pipeline stages). Each stage has its own pair of register banks and can
 * switch them independently, so that earlier stages can run ahead of later
 * ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at
 * the same time.
 *
 * As a result of this, we need to insert a WFI when an earlier stage depends
 * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
 * pending WFI's to complete before starting, and usually even before reading
 * indirect params, so a WFI also acts as a full "pipeline stall".
 *
 * Note, the names of the stages come from CLUSTER_* in devcoredump. We
 * include all the stages for completeness, even ones which do not read/write
 * anything.
 */

enum tu_stage {
   /* This doesn't correspond to a cluster, but we need it for tracking
    * indirect draw parameter reads etc.
    */
   TU_STAGE_CP,

   /* - Fetch index buffer
    * - Fetch vertex attributes, dispatch VS
    */
   TU_STAGE_FE,

   /* Execute all geometry stages (VS thru GS) */
   TU_STAGE_SP_VS,

   /* Write to VPC, do primitive assembly. */
   TU_STAGE_PC_VS,

   /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
    * to devcoredump, so presumably this stage stalls for TU_STAGE_PS when
    * early depth testing is enabled before dispatching fragments? However
    * GRAS reads and writes LRZ directly.
    */
   TU_STAGE_GRAS,

   /* Execute FS */
   TU_STAGE_SP_PS,

   /* - Fragment tests
    * - Write color/depth
    * - Streamout writes (???)
    * - Varying interpolation (???)
    */
   TU_STAGE_PS,
};
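
/* Sketch of the rule above (hypothetical helper): a WFI is needed when the
 * consuming stage is earlier in the pipeline than the producing stage,
 * since earlier stages otherwise run ahead of later ones.
 */
#if 0
static bool
example_needs_wfi(enum tu_stage src, enum tu_stage dst)
{
   /* e.g. the CP reading indirect draw params written by the FS */
   return dst < src;
}
#endif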

enum tu_cmd_flush_bits {
   TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
   TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
   TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
   TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
   TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
   TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
   TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
   TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
   TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,

   TU_CMD_FLAG_ALL_FLUSH =
      TU_CMD_FLAG_CCU_FLUSH_DEPTH |
      TU_CMD_FLAG_CCU_FLUSH_COLOR |
      TU_CMD_FLAG_CACHE_FLUSH |
      /* Treat the CP as a sort of "cache" which may need to be "flushed" via
       * waiting for writes to land with WAIT_MEM_WRITES.
       */
      TU_CMD_FLAG_WAIT_MEM_WRITES,

   TU_CMD_FLAG_ALL_INVALIDATE =
      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
      TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
      TU_CMD_FLAG_CACHE_INVALIDATE |
      /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when
       * a command that needs CP_WAIT_FOR_ME is executed. This means we may
       * insert an extra WAIT_FOR_ME before an indirect command requiring it
       * in case there was another command before the current command buffer
       * that it needs to wait for.
       */
      TU_CMD_FLAG_WAIT_FOR_ME,
};
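
/* For example (illustrative only): a heavyweight barrier that publishes all
 * pending writes and discards all cached state before the next access could
 * accumulate
 *
 *    flush_bits |= TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_ALL_INVALIDATE;
 *
 * into the tu_cache_state defined below, to be emitted before the next
 * command.
 */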

/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
 * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
 * which part of the gmem is used by the CCU. Here we keep track of the
 * state of the CCU.
 */
enum tu_cmd_ccu_state {
   TU_CMD_CCU_SYSMEM,
   TU_CMD_CCU_GMEM,
   TU_CMD_CCU_UNKNOWN,
};
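
/* Typical usage (sketch): before emitting commands that rely on a given CCU
 * layout, transition to it with tu_emit_cache_flush_ccu(), declared near the
 * end of this header; it can skip the flush/invalidate when the state
 * already matches:
 *
 *    tu_emit_cache_flush_ccu(cmd, &cmd->cs, TU_CMD_CCU_GMEM);
 */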

struct tu_cache_state {
   /* Caches which must be made available (flushed) eventually if there are
    * any users outside that cache domain, and caches which must be
    * invalidated eventually if there are any reads.
    */
   enum tu_cmd_flush_bits pending_flush_bits;
   /* Flushes which must be emitted before the next command (e.g. draw,
    * dispatch, or transfer) executes.
    */
   enum tu_cmd_flush_bits flush_bits;
};
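
/* Sketch of the intended flow (hypothetical helper): when a new access
 * conflicts with earlier writes, the relevant pending bits are promoted
 * into flush_bits so they get emitted before the next command:
 */
#if 0
static void
example_promote_pending(struct tu_cache_state *cache,
                        enum tu_cmd_flush_bits needed)
{
   cache->flush_bits |= cache->pending_flush_bits & needed;
   cache->pending_flush_bits &= ~needed;
}
#endif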

struct tu_vs_params {
   uint32_t vertex_offset;
   uint32_t first_instance;
};

/* This should be for state that is set inside a renderpass and used at
 * renderpass end time, e.g. to decide whether to use sysmem. This needs
 * special handling for secondary cmdbufs and suspending/resuming render
 * passes where the state may need to be combined afterwards.
 */
struct tu_render_pass_state
{
   bool xfb_used;
   bool has_tess;
   bool has_prim_generated_query_in_rp;
   bool disable_gmem;

   /* Track whether the conditional predicate for COND_REG_EXEC is changed
    * in draw_cs.
    */
   bool draw_cs_writes_to_cond_pred;

   uint32_t drawcall_count;

   /* A calculated "draw cost" value for the renderpass, which tries to
    * estimate the bandwidth-per-sample of all the draws according
    * to:
    *
    *    foreach_draw (...) {
    *      sum += pipeline->color_bandwidth_per_sample;
    *      if (depth_test_enabled)
    *        sum += pipeline->depth_cpp_per_sample;
    *      if (depth_write_enabled)
    *        sum += pipeline->depth_cpp_per_sample;
    *      if (stencil_write_enabled)
    *        sum += pipeline->stencil_cpp_per_sample * 2;
    *    }
    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
    *
    * It allows us to estimate the total bandwidth of drawcalls later, by
    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
    *
    * This does ignore depth buffer traffic for samples which do not
    * pass due to depth-test fail, and some other details. But it is
    * just intended to be a rough estimate that is easy to calculate.
    */
   uint32_t drawcall_bandwidth_per_sample_sum;
};
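
/* A worked example of the formula above, with made-up numbers: three draws
 * to a 4 cpp color attachment, two of them with a 4 cpp depth buffer both
 * tested and written, give
 *
 *    sum = 4 + (4 + 4 + 4) + (4 + 4 + 4) = 28
 *    drawcall_bandwidth_per_sample = 28 / 3, roughly 9 bytes per sample
 */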

struct tu_cmd_state
{
   uint32_t dirty;

   struct tu_pipeline *pipeline;
   struct tu_pipeline *compute_pipeline;

   struct tu_render_pass_state rp;

   /* Vertex buffers, viewports, and scissors: the state for these can be
    * updated partially, so we need to save it in order to emit a complete
    * draw state.
    */
   struct {
      uint64_t base;
      uint32_t size;
      uint32_t stride;
   } vb[MAX_VBS];
   VkViewport viewport[MAX_VIEWPORTS];
   VkRect2D scissor[MAX_SCISSORS];
   uint32_t max_viewport, max_scissor;

   /* for dynamic states that can't be emitted directly */
   uint32_t dynamic_stencil_mask;
   uint32_t dynamic_stencil_wrmask;
   uint32_t dynamic_stencil_ref;

   uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl;
   uint32_t pc_raster_cntl, vpc_unknown_9107;
   uint32_t rb_mrt_control[MAX_RTS], rb_mrt_blend_control[MAX_RTS];
   uint32_t rb_mrt_control_rop;
   uint32_t rb_blend_cntl, sp_blend_cntl;
   uint32_t pipeline_color_write_enable, pipeline_blend_enable;
   uint32_t color_write_enable;
   bool logic_op_enabled;
   bool rop_reads_dst;
   enum pc_di_primtype primtype;
   bool primitive_restart_enable;

   /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
   struct tu_draw_state vertex_buffers;
   struct tu_draw_state shader_const;
   struct tu_draw_state desc_sets;

   struct tu_draw_state vs_params;

   /* Index buffer */
   uint64_t index_va;
   uint32_t max_index_count;
   uint8_t index_size;

   /* Because the streamout base has to be 32-byte aligned, there is an
    * extra offset to deal with when it is unaligned.
    */
   uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];
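
   /* For example (sketch): for a streamout buffer bound at VA 0x1004, the
    * base programmed in hardware would be aligned down to 0x1000 and
    * streamout_offset[i] = 4 applied on top.
    */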

   /* Renderpasses are tricky, because we may need to flush differently if
    * using sysmem vs. gmem and therefore we have to delay any flushing that
    * happens before a renderpass. So we have to have two copies of the flush
    * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
    * and one for outside a renderpass.
    */
   struct tu_cache_state cache;
   struct tu_cache_state renderpass_cache;

   enum tu_cmd_ccu_state ccu_state;

   /* Decides which GMEM layout to use from the tu_pass, based on whether the
    * CCU might get used by tu_store_gmem_attachment().
    */
   enum tu_gmem_layout gmem_layout;

   const struct tu_render_pass *pass;
   const struct tu_subpass *subpass;
   const struct tu_framebuffer *framebuffer;
   const struct tu_tiling_config *tiling;
   VkRect2D render_area;

   const struct tu_image_view **attachments;

   /* State that in the dynamic case comes from VkRenderingInfo and needs to
    * be saved/restored when suspending. This holds the state for the last
    * suspended renderpass, which may point to this command buffer's dynamic_*
    * or another command buffer if executed on a secondary.
    */
   struct {
      const struct tu_render_pass *pass;
      const struct tu_subpass *subpass;
      const struct tu_framebuffer *framebuffer;
      VkRect2D render_area;
      enum tu_gmem_layout gmem_layout;

      const struct tu_image_view **attachments;

      struct tu_lrz_state lrz;
   } suspended_pass;

   bool tessfactor_addr_set;
   bool predication_active;
   enum a5xx_line_mode line_mode;
   bool z_negative_one_to_one;

   /* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
    * VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run
    * simultaneously, but they use the same {START,STOP}_PRIMITIVE_CTRS
    * control.
    */
   uint32_t prim_counters_running;

   bool prim_generated_query_running_before_rp;

   /* These are the states of the suspend/resume state machine. In addition to
    * tracking whether we're in the middle of a chain of suspending and
    * resuming passes that will be merged, we need to track whether the
    * command buffer begins in the middle of such a chain, for when it gets
    * merged with other command buffers. We call such a chain that begins
    * before the command buffer starts a "pre-chain".
    *
    * Note that when this command buffer is finished, this state is untouched
    * but it gains a different meaning. For example, if we finish in state
    * SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
    * there's a suspend/resume chain that extends past the end of the command
    * buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
    * means that there's a suspend/resume chain that extends before the
    * beginning.
    */
   enum {
      /* Either there are no suspend/resume chains, or they are entirely
       * contained in the current command buffer.
       *
       *   BeginCommandBuffer() <- start of current command buffer
       *       ...
       *       // we are here
       */
      SR_NONE = 0,

      /* We are in the middle of a suspend/resume chain that starts before the
       * current command buffer. This happens when the command buffer begins
       * with a resuming render pass and all of the passes up to the current
       * one are suspending. In this state, our part of the chain is not saved
       * and is in the current draw_cs/state.
       *
       *   BeginRendering() ... EndRendering(suspending)
       *   BeginCommandBuffer() <- start of current command buffer
       *       BeginRendering(resuming) ... EndRendering(suspending)
       *       BeginRendering(resuming) ... EndRendering(suspending)
       *       ...
       *       // we are here
       */
      SR_IN_PRE_CHAIN,

      /* We are currently outside of any suspend/resume chains, but there is a
       * chain starting before the current command buffer. It is saved in
       * pre_chain.
       *
       *   BeginRendering() ... EndRendering(suspending)
       *   BeginCommandBuffer() <- start of current command buffer
       *       // This part is stashed in pre_chain
       *       BeginRendering(resuming) ... EndRendering(suspending)
       *       BeginRendering(resuming) ... EndRendering(suspending)
       *       ...
       *       BeginRendering(resuming) ... EndRendering() // end of chain
       *       ...
       *       // we are here
       */
      SR_AFTER_PRE_CHAIN,

      /* We are in the middle of a suspend/resume chain and there is no chain
       * starting before the current command buffer.
       *
       *   BeginCommandBuffer() <- start of current command buffer
       *       ...
       *       BeginRendering() ... EndRendering(suspending)
       *       BeginRendering(resuming) ... EndRendering(suspending)
       *       BeginRendering(resuming) ... EndRendering(suspending)
       *       ...
       *       // we are here
       */
      SR_IN_CHAIN,

      /* We are in the middle of a suspend/resume chain and there is another,
       * separate, chain starting before the current command buffer.
       *
       *   BeginRendering() ... EndRendering(suspending)
       *   BeginCommandBuffer() <- start of current command buffer
       *       // This part is stashed in pre_chain
       *       BeginRendering(resuming) ... EndRendering(suspending)
       *       BeginRendering(resuming) ... EndRendering(suspending)
       *       ...
       *       BeginRendering(resuming) ... EndRendering() // end of chain
       *       ...
       *       BeginRendering() ... EndRendering(suspending)
       *       BeginRendering(resuming) ... EndRendering(suspending)
       *       BeginRendering(resuming) ... EndRendering(suspending)
       *       ...
       *       // we are here
       */
      SR_IN_CHAIN_AFTER_PRE_CHAIN,
   } suspend_resume;
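
   /* A rough transition sketch implied by the diagrams above (assuming an
    * EndRendering() without a suspend closes the chain):
    *
    *    SR_NONE            --first pass resumes-------> SR_IN_PRE_CHAIN
    *    SR_IN_PRE_CHAIN    --EndRendering()-----------> SR_AFTER_PRE_CHAIN
    *    SR_NONE            --EndRendering(suspending)-> SR_IN_CHAIN
    *    SR_IN_CHAIN        --EndRendering()-----------> SR_NONE
    *    SR_AFTER_PRE_CHAIN --EndRendering(suspending)-> SR_IN_CHAIN_AFTER_PRE_CHAIN
    */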

   bool suspending, resuming;

   struct tu_lrz_state lrz;

   struct tu_draw_state lrz_and_depth_plane_state;

   struct tu_vs_params last_vs_params;
};

struct tu_cmd_pool
{
   struct vk_command_pool vk;

   struct list_head cmd_buffers;
   struct list_head free_cmd_buffers;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, vk.base, VkCommandPool,
                               VK_OBJECT_TYPE_COMMAND_POOL)

enum tu_cmd_buffer_status
{
   TU_CMD_BUFFER_STATUS_INVALID,
   TU_CMD_BUFFER_STATUS_INITIAL,
   TU_CMD_BUFFER_STATUS_RECORDING,
   TU_CMD_BUFFER_STATUS_EXECUTABLE,
   TU_CMD_BUFFER_STATUS_PENDING,
};

struct tu_cmd_buffer
{
   struct vk_command_buffer vk;

   struct tu_device *device;

   struct tu_cmd_pool *pool;
   struct list_head pool_link;

   struct u_trace trace;
   struct u_trace_iterator trace_renderpass_start;
   struct u_trace_iterator trace_renderpass_end;

   struct list_head renderpass_autotune_results;
   struct tu_autotune_results_buffer* autotune_buffer;

   VkCommandBufferUsageFlags usage_flags;
   enum tu_cmd_buffer_status status;

   VkQueryPipelineStatisticFlags inherited_pipeline_statistics;

   struct tu_cmd_state state;
   uint32_t queue_family_index;

   uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
   VkShaderStageFlags push_constant_stages;
   struct tu_descriptor_set meta_push_descriptors;

   struct tu_descriptor_state descriptors[MAX_BIND_POINTS];

   struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1)];
   struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
   struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
   const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1)];

   struct tu_render_pass dynamic_pass;
   struct tu_subpass dynamic_subpass;
   struct tu_framebuffer dynamic_framebuffer;

   VkResult record_result;

   struct tu_cs cs;
   struct tu_cs draw_cs;
   struct tu_cs tile_store_cs;
   struct tu_cs draw_epilogue_cs;
   struct tu_cs sub_cs;

   /* If the first render pass in the command buffer is resuming, then it is
    * part of a suspend/resume chain that starts before the current command
    * buffer and needs to be merged later. In this case, its incomplete state
    * is stored in pre_chain. In the symmetric case where the last render pass
    * is suspending, we just skip ending the render pass and its state is
    * stored in draw_cs/the current state. The first and last render pass
    * might be part of different chains, which is why all the state may need
    * to be saved separately here.
    */
   struct {
      struct tu_cs draw_cs;
      struct tu_cs draw_epilogue_cs;

      struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;

      struct tu_render_pass_state state;
   } pre_chain;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
};
VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
                       VK_OBJECT_TYPE_COMMAND_BUFFER)

static inline uint32_t
tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd,
                          const struct tu_render_pass_attachment *att)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset[cmd->state.gmem_layout];
}

static inline uint32_t
tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd,
                                  const struct tu_render_pass_attachment *att)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset_stencil[cmd->state.gmem_layout];
}
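
/* Usage sketch (illustrative, assuming a valid attachment index a): when
 * emitting gmem loads/stores for the current pass:
 *
 *    const struct tu_render_pass_attachment *att =
 *       &cmd->state.pass->attachments[a];
 *    uint32_t offset = tu_attachment_gmem_offset(cmd, att);
 */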

void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
                                const struct tu_render_pass_state *src);

VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
                             VkCommandBufferUsageFlags usage_flags);

void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
                                    struct tu_cs *cs);

void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                             struct tu_cs *cs,
                             enum tu_cmd_ccu_state ccu_state);

void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
                    struct tu_cmd_buffer *secondary);

void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
                         struct tu_cmd_buffer *secondary);

void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
                     struct tu_cmd_buffer *secondary);

void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
                          struct tu_cmd_buffer *suspended);

void tu_cmd_render(struct tu_cmd_buffer *cmd);

void
tu6_emit_event_write(struct tu_cmd_buffer *cmd,
                     struct tu_cs *cs,
                     enum vgt_event_type event);

static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
                         VkPipelineBindPoint bind_point)
{
   return &cmd_buffer->descriptors[bind_point];
}

void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
                   enum a5xx_line_mode line_mode);

void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);

void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);

void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

void tu6_apply_depth_bounds_workaround(struct tu_device *device,
                                       uint32_t *rb_depth_cntl);

void
update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);

#endif /* TU_CMD_BUFFER_H */