/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_CMD_BUFFER_H
#define TU_CMD_BUFFER_H

#include "tu_common.h"

#include "tu_cs.h"
#include "tu_descriptor_set.h"
#include "tu_device.h"
#include "tu_lrz.h"
#include "tu_pass.h"
#include "tu_pipeline.h"

enum tu_draw_state_group_id
{
   TU_DRAW_STATE_PROGRAM_CONFIG,
   TU_DRAW_STATE_VS,
   TU_DRAW_STATE_VS_BINNING,
   TU_DRAW_STATE_HS,
   TU_DRAW_STATE_DS,
   TU_DRAW_STATE_GS,
   TU_DRAW_STATE_GS_BINNING,
   TU_DRAW_STATE_VPC,
   TU_DRAW_STATE_FS,
   TU_DRAW_STATE_VB,
   TU_DRAW_STATE_CONST,
   TU_DRAW_STATE_DESC_SETS,
   TU_DRAW_STATE_DESC_SETS_LOAD,
   TU_DRAW_STATE_VS_PARAMS,
   TU_DRAW_STATE_FS_PARAMS,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
   TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
   TU_DRAW_STATE_PRIM_MODE_GMEM,
   TU_DRAW_STATE_PRIM_MODE_SYSMEM,

   /* dynamic state related draw states */
   TU_DRAW_STATE_DYNAMIC,
   TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};
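
/* Illustrative sketch (not part of the original header): dynamic-state draw
 * state groups are addressed relative to TU_DRAW_STATE_DYNAMIC, so the i'th
 * dynamic state maps to a group id like this:
 *
 *    uint32_t id = TU_DRAW_STATE_DYNAMIC + i;   // i < TU_DYNAMIC_STATE_COUNT
 *    assert(id < TU_DRAW_STATE_COUNT);
 *    tu_cs_emit_draw_state(cs, id, cmd->state.dynamic_state[i]);
 *
 * tu_cs_emit_draw_state() is declared in tu_cs.h; the surrounding variables
 * are hypothetical.
 */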

struct tu_descriptor_state
{
   struct tu_descriptor_set *sets[MAX_SETS];
   struct tu_descriptor_set push_set;
   uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
   uint64_t set_iova[MAX_SETS];
   uint32_t max_sets_bound;
   uint32_t max_dynamic_offset_size;
};

enum tu_cmd_dirty_bits
{
   TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
   TU_CMD_DIRTY_DESC_SETS = BIT(1),
   TU_CMD_DIRTY_COMPUTE_DESC_SETS = BIT(2),
   TU_CMD_DIRTY_SHADER_CONSTS = BIT(3),
   TU_CMD_DIRTY_LRZ = BIT(4),
   TU_CMD_DIRTY_VS_PARAMS = BIT(5),
   TU_CMD_DIRTY_TESS_PARAMS = BIT(6),
   TU_CMD_DIRTY_SUBPASS = BIT(7),
   TU_CMD_DIRTY_FDM = BIT(8),
   TU_CMD_DIRTY_PER_VIEW_VIEWPORT = BIT(9),
   TU_CMD_DIRTY_TES = BIT(10),
   TU_CMD_DIRTY_PROGRAM = BIT(11),
   /* all draw states were disabled and need to be re-enabled: */
   TU_CMD_DIRTY_DRAW_STATE = BIT(12)
};
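
/* Illustrative sketch (hypothetical call sites, not part of the original
 * header): state setters OR in the relevant bit, and draw time re-emits
 * whatever is dirty and then clears the mask, roughly:
 *
 *    cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
 *    ...
 *    if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
 *       emit_vertex_buffers(cmd);   // hypothetical emit helper
 *    cmd->state.dirty = 0;
 */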

/* There are only three cache domains we have to care about: the CCU, or
 * color cache unit, which is used for color and depth/stencil attachments
 * and copy/blit destinations, and is split conceptually into color and depth,
 * and the universal cache or UCHE which is used for pretty much everything
 * else, except for the CP (uncached) and host. We need to flush whenever data
 * crosses these boundaries.
 */

enum tu_cmd_access_mask {
   TU_ACCESS_NONE = 0,
   TU_ACCESS_UCHE_READ = 1 << 0,
   TU_ACCESS_UCHE_WRITE = 1 << 1,
   TU_ACCESS_CCU_COLOR_READ = 1 << 2,
   TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
   TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
   TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,

   /* Experiments have shown that while it's safe to avoid flushing the CCU
    * after each blit/renderpass, it's not safe to assume that subsequent
    * lookups with a different attachment state will hit unflushed cache
    * entries. That is, the CCU needs to be flushed and possibly invalidated
    * when accessing memory with a different attachment state. Writing to an
    * attachment under the following conditions after clearing using the
    * normal 2d engine path is known to have issues:
    *
    * - It isn't the 0'th layer.
    * - There is more than one attachment, and this isn't the 0'th attachment
    *   (this seems to also depend on the cpp of the attachments).
    *
    * Our best guess is that the layer/MRT state is used when computing
    * the location of a cache entry in CCU, to avoid conflicts. We assume that
    * any access in a renderpass after or before an access by a transfer needs
    * a flush/invalidate, and use the _INCOHERENT variants to represent access
    * by a renderpass.
    */
   TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
   TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,

   /* Accesses which bypass any cache, e.g. writes via the host,
    * CP_EVENT_WRITE::BLIT, and the CP, are SYSMEM_WRITE.
    */
   TU_ACCESS_SYSMEM_READ = 1 << 10,
   TU_ACCESS_SYSMEM_WRITE = 1 << 11,

   /* Memory writes from the CP start in-order with draws and event writes,
    * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
    */
   TU_ACCESS_CP_WRITE = 1 << 12,

   /* Descriptors are read through UCHE but are also prefetched via
    * CP_LOAD_STATE6 and the prefetched descriptors need to be invalidated
    * when they change.
    */
   TU_ACCESS_BINDLESS_DESCRIPTOR_READ = 1 << 13,

   TU_ACCESS_READ =
      TU_ACCESS_UCHE_READ |
      TU_ACCESS_CCU_COLOR_READ |
      TU_ACCESS_CCU_DEPTH_READ |
      TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
      TU_ACCESS_SYSMEM_READ |
      TU_ACCESS_BINDLESS_DESCRIPTOR_READ,

   TU_ACCESS_WRITE =
      TU_ACCESS_UCHE_WRITE |
      TU_ACCESS_CCU_COLOR_WRITE |
      TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
      TU_ACCESS_CCU_DEPTH_WRITE |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
      TU_ACCESS_SYSMEM_WRITE |
      TU_ACCESS_CP_WRITE,

   TU_ACCESS_ALL =
      TU_ACCESS_READ |
      TU_ACCESS_WRITE,
};
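
/* Illustrative sketch (hypothetical and heavily simplified): barrier handling
 * maps Vulkan access flags onto this mask, roughly along these lines:
 *
 *    enum tu_cmd_access_mask mask = TU_ACCESS_NONE;
 *    if (vk_access & VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT)
 *       mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
 *    if (vk_access & VK_ACCESS_2_HOST_READ_BIT)
 *       mask |= TU_ACCESS_SYSMEM_READ;
 *
 * The real mapping lives in the command buffer code and also considers image
 * layouts and the GPU generation.
 */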

/* From the driver's point of view, we only need to distinguish between things
 * which won't start until a WFI is complete and things which additionally
 * need a WAIT_FOR_ME.
 *
 * TODO: This will get more complicated with concurrent binning.
 */
enum tu_stage {
   /* As a destination stage, this is for operations on the CP which don't
    * wait for pending WFIs to complete and therefore need a CP_WAIT_FOR_ME.
    * As a source stage, it is for things needing no waits.
    */
   TU_STAGE_CP,

   /* This is for most operations, which a WFI will wait on to finish and
    * which will not start until any pending WFIs are finished.
    */
   TU_STAGE_GPU,

   /* This is only used as a destination stage and is for things needing no
    * waits on the GPU (e.g. host operations).
    */
   TU_STAGE_BOTTOM,
};

enum tu_cmd_flush_bits {
   TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
   TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
   TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
   TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
   TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
   TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
   TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
   TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
   TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,
   TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE = 1 << 9,

   TU_CMD_FLAG_ALL_FLUSH =
      TU_CMD_FLAG_CCU_FLUSH_DEPTH |
      TU_CMD_FLAG_CCU_FLUSH_COLOR |
      TU_CMD_FLAG_CACHE_FLUSH |
      /* Treat the CP as a sort of "cache" which may need to be "flushed" via
       * waiting for writes to land with WAIT_FOR_MEM_WRITES.
       */
      TU_CMD_FLAG_WAIT_MEM_WRITES,

   TU_CMD_FLAG_ALL_INVALIDATE =
      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
      TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
      TU_CMD_FLAG_CACHE_INVALIDATE |
      TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE |
      /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when
       * a command that needs CP_WAIT_FOR_ME is executed. This means we may
       * insert an extra WAIT_FOR_ME before an indirect command requiring it
       * in case there was another command before the current command buffer
       * that it needs to wait for.
       */
      TU_CMD_FLAG_WAIT_FOR_ME,
};
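
/* Illustrative sketch (simplified; the fd_gpu_event names are assumptions):
 * when the accumulated flush bits are finally emitted, they turn into cache
 * events and waits, roughly:
 *
 *    if (flush_bits & TU_CMD_FLAG_CCU_FLUSH_COLOR)
 *       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_FLUSH_COLOR);
 *    if (flush_bits & TU_CMD_FLAG_CACHE_INVALIDATE)
 *       tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
 *    if (flush_bits & TU_CMD_FLAG_WAIT_FOR_IDLE)
 *       tu_cs_emit_wfi(cs);
 *
 * tu_emit_event_write() is declared later in this header.
 */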

/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
 * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
 * which part of the gmem is used by the CCU. Here we keep track of the
 * current state of the CCU.
 */
enum tu_cmd_ccu_state {
   TU_CMD_CCU_SYSMEM,
   TU_CMD_CCU_GMEM,
   TU_CMD_CCU_UNKNOWN,
};

struct tu_cache_state {
   /* Caches which must be made available (flushed) eventually if there are
    * any users outside that cache domain, and caches which must be
    * invalidated eventually if there are any reads.
    */
   BITMASK_ENUM(tu_cmd_flush_bits) pending_flush_bits;
   /* Flushes which must be emitted at the next flush point. */
   BITMASK_ENUM(tu_cmd_flush_bits) flush_bits;
};
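
/* Illustrative sketch (simplified): a write defers its flush into
 * pending_flush_bits, and a later access from a different cache domain
 * promotes the relevant bits into flush_bits so they are emitted at the next
 * flush point:
 *
 *    // on a CCU color write:
 *    cache->pending_flush_bits |= TU_CMD_FLAG_CCU_FLUSH_COLOR |
 *                                 TU_CMD_FLAG_ALL_INVALIDATE;
 *    // on a later UCHE read of the same memory:
 *    cache->flush_bits |= TU_CMD_FLAG_CCU_FLUSH_COLOR |
 *                         TU_CMD_FLAG_CACHE_INVALIDATE;
 *    cache->pending_flush_bits &= ~TU_CMD_FLAG_CCU_FLUSH_COLOR;
 */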

struct tu_vs_params {
   uint32_t vertex_offset;
   uint32_t first_instance;
   uint32_t draw_id;
};

struct tu_tess_params {
   bool valid;
   enum a6xx_tess_output output_upper_left, output_lower_left;
   enum a6xx_tess_spacing spacing;
};

/* This should be for state that is set inside a renderpass and used at
 * renderpass end time, e.g. to decide whether to use sysmem. This needs
 * special handling for secondary cmdbufs and suspending/resuming render
 * passes where the state may need to be combined afterwards.
 */
struct tu_render_pass_state
{
   bool xfb_used;
   bool has_tess;
   bool has_prim_generated_query_in_rp;
   bool disable_gmem;
   bool sysmem_single_prim_mode;
   bool shared_viewport;

   /* Track whether the conditional predicate for COND_REG_EXEC is changed in draw_cs */
   bool draw_cs_writes_to_cond_pred;

   uint32_t drawcall_count;

   /* A calculated "draw cost" value for the renderpass, which tries to
    * estimate the bandwidth-per-sample of all the draws according
    * to:
    *
    *    foreach_draw (...) {
    *      sum += pipeline->color_bandwidth_per_sample;
    *      if (depth_test_enabled)
    *        sum += pipeline->depth_cpp_per_sample;
    *      if (depth_write_enabled)
    *        sum += pipeline->depth_cpp_per_sample;
    *      if (stencil_write_enabled)
    *        sum += pipeline->stencil_cpp_per_sample * 2;
    *    }
    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
    *
    * It allows us to estimate the total bandwidth of drawcalls later, by
    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
    *
    * This does ignore depth buffer traffic for samples which do not
    * pass due to depth-test fail, and some other details.  But it is
    * just intended to be a rough estimate that is easy to calculate.
    */
   uint32_t drawcall_bandwidth_per_sample_sum;
};
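
/* Worked example of the estimate above (illustrative cpp values): two draws,
 * one writing a 4-byte color attachment with depth test and depth write
 * enabled (4 + 4 + 4 = 12) and one writing only color (4), sum to 16, so
 * drawcall_bandwidth_per_sample = 16 / 2 = 8 bytes per covered sample.
 */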

/* These are the states of the suspend/resume state machine. In addition to
 * tracking whether we're in the middle of a chain of suspending and
 * resuming passes that will be merged, we need to track whether the
 * command buffer begins in the middle of such a chain, for when it gets
 * merged with other command buffers. We call such a chain that begins
 * before the command buffer starts a "pre-chain".
 *
 * Note that when this command buffer is finished, this state is untouched
 * but it gains a different meaning. For example, if we finish in state
 * SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
 * there's a suspend/resume chain that extends past the end of the command
 * buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
 * means that there's a suspend/resume chain that extends before the
 * beginning.
 */
enum tu_suspend_resume_state
{
   /* Either there are no suspend/resume chains, or they are entirely
    * contained in the current command buffer.
    *
    *   BeginCommandBuffer() <- start of current command buffer
    *       ...
    *       // we are here
    */
   SR_NONE = 0,

   /* We are in the middle of a suspend/resume chain that starts before the
    * current command buffer. This happens when the command buffer begins
    * with a resuming render pass and all of the passes up to the current
    * one are suspending. In this state, our part of the chain is not saved
    * and is in the current draw_cs/state.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       // we are here
    */
   SR_IN_PRE_CHAIN,

   /* We are currently outside of any suspend/resume chains, but there is a
    * chain starting before the current command buffer. It is saved in
    * pre_chain.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *       // This part is stashed in pre_chain
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       BeginRendering(resuming) ... EndRendering() // end of chain
    *       ...
    *       // we are here
    */
   SR_AFTER_PRE_CHAIN,

   /* We are in the middle of a suspend/resume chain and there is no chain
    * starting before the current command buffer.
    *
    *   BeginCommandBuffer() <- start of current command buffer
    *       ...
    *       BeginRendering() ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       // we are here
    */
   SR_IN_CHAIN,

   /* We are in the middle of a suspend/resume chain and there is another,
    * separate, chain starting before the current command buffer.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *       // This part is stashed in pre_chain
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       BeginRendering(resuming) ... EndRendering() // end of chain
    *       ...
    *       BeginRendering() ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       // we are here
    */
   SR_IN_CHAIN_AFTER_PRE_CHAIN,
};
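
/* Illustrative sketch (simplified): the transitions on a resuming
 * BeginRendering() and on a non-suspending EndRendering() look roughly like:
 *
 *    // BeginRendering(resuming) with no chain open in this command buffer:
 *    if (cmd->state.suspend_resume == SR_NONE)
 *       cmd->state.suspend_resume = SR_IN_PRE_CHAIN;
 *
 *    // EndRendering() without suspending closes the current chain:
 *    if (cmd->state.suspend_resume == SR_IN_PRE_CHAIN ||
 *        cmd->state.suspend_resume == SR_IN_CHAIN_AFTER_PRE_CHAIN)
 *       cmd->state.suspend_resume = SR_AFTER_PRE_CHAIN;
 *    else if (cmd->state.suspend_resume == SR_IN_CHAIN)
 *       cmd->state.suspend_resume = SR_NONE;
 */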

struct tu_cmd_state
{
   uint32_t dirty;

   struct tu_shader *shaders[MESA_SHADER_STAGES];

   struct tu_program_state program;

   struct tu_render_pass_state rp;

   struct vk_render_pass_state vk_rp;
   struct vk_vertex_input_state vi;
   struct vk_sample_locations_state sl;

   struct tu_bandwidth bandwidth;

   /* Vertex buffers
    * the states for these can be updated partially, so we need to save these
    * to be able to emit a complete draw state
    */
   struct {
      uint64_t base;
      uint32_t size;
   } vb[MAX_VBS];

   uint32_t max_vbs_bound;

   bool per_view_viewport;

   /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
   struct tu_draw_state vertex_buffers;
   struct tu_draw_state shader_const;
   struct tu_draw_state desc_sets;
   struct tu_draw_state load_state;
   struct tu_draw_state compute_load_state;
   struct tu_draw_state prim_order_sysmem, prim_order_gmem;

   struct tu_draw_state vs_params;
   struct tu_draw_state fs_params;

   /* Index buffer */
   uint64_t index_va;
   uint32_t max_index_count;
   uint8_t index_size;

   /* because streamout base has to be 32-byte aligned
    * there is an extra offset to deal with when it is
    * unaligned
    */
   uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];

   /* Renderpasses are tricky, because we may need to flush differently if
    * using sysmem vs. gmem and therefore we have to delay any flushing that
    * happens before a renderpass. So we have to have two copies of the flush
    * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
    * and one for outside a renderpass.
    */
   struct tu_cache_state cache;
   struct tu_cache_state renderpass_cache;

   enum tu_cmd_ccu_state ccu_state;

   /* Decides which GMEM layout to use from the tu_pass, based on whether the CCU
    * might get used by tu_store_gmem_attachment().
    */
   enum tu_gmem_layout gmem_layout;

   const struct tu_render_pass *pass;
   const struct tu_subpass *subpass;
   const struct tu_framebuffer *framebuffer;
   const struct tu_tiling_config *tiling;
   VkRect2D render_area;

   const struct tu_image_view **attachments;
   VkClearValue *clear_values;

   /* State that in the dynamic case comes from VkRenderingInfo and needs to
    * be saved/restored when suspending. This holds the state for the last
    * suspended renderpass, which may point to this command buffer's dynamic_*
    * or another command buffer if executed on a secondary.
    */
   struct {
      const struct tu_render_pass *pass;
      const struct tu_subpass *subpass;
      const struct tu_framebuffer *framebuffer;
      VkRect2D render_area;
      enum tu_gmem_layout gmem_layout;

      const struct tu_image_view **attachments;

      struct tu_lrz_state lrz;
   } suspended_pass;

   bool tessfactor_addr_set;
   bool predication_active;
   bool msaa_disable;
   bool blend_reads_dest;
   bool stencil_front_write;
   bool stencil_back_write;
   bool pipeline_feedback_loop_ds;

   bool pipeline_blend_lrz, pipeline_bandwidth;
   uint32_t pipeline_draw_states;

   /* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
    * VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
    * but they use the same {START,STOP}_PRIMITIVE_CTRS control.
    */
   uint32_t prim_counters_running;

   bool prim_generated_query_running_before_rp;

   enum tu_suspend_resume_state suspend_resume;

   bool suspending, resuming;

   struct tu_lrz_state lrz;

   struct tu_draw_state lrz_and_depth_plane_state;

   struct tu_vs_params last_vs_params;

   struct tu_tess_params tess_params;

   uint64_t descriptor_buffer_iova[MAX_SETS];
};

struct tu_cmd_buffer
{
   struct vk_command_buffer vk;

   struct tu_device *device;

   struct u_trace trace;
   struct u_trace_iterator trace_renderpass_start;
   struct u_trace_iterator trace_renderpass_end;

   struct list_head renderpass_autotune_results;
   struct tu_autotune_results_buffer* autotune_buffer;

   void *patchpoints_ctx;
   struct util_dynarray fdm_bin_patchpoints;

   VkCommandBufferUsageFlags usage_flags;

   VkQueryPipelineStatisticFlags inherited_pipeline_statistics;

   struct tu_cmd_state state;
   uint32_t queue_family_index;

   uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
   VkShaderStageFlags push_constant_stages;
   struct tu_descriptor_set meta_push_descriptors;

   struct tu_descriptor_state descriptors[MAX_BIND_POINTS];

   struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1) + 1];
   struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
   struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
   const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1) + 1];
   VkClearValue dynamic_clear_values[2 * (MAX_RTS + 1)];

   struct tu_render_pass dynamic_pass;
   struct tu_subpass dynamic_subpass;
   struct tu_framebuffer dynamic_framebuffer;

   struct tu_cs cs;
   struct tu_cs draw_cs;
   struct tu_cs tile_store_cs;
   struct tu_cs draw_epilogue_cs;
   struct tu_cs sub_cs;

   /* If the first render pass in the command buffer is resuming, then it is
    * part of a suspend/resume chain that starts before the current command
    * buffer and needs to be merged later. In this case, its incomplete state
    * is stored in pre_chain. In the symmetric case where the last render pass
    * is suspending, we just skip ending the render pass and its state is
    * stored in draw_cs/the current state. The first and last render pass
    * might be part of different chains, which is why all the state may need
    * to be saved separately here.
    */
   struct {
      struct tu_cs draw_cs;
      struct tu_cs draw_epilogue_cs;

      struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;

      struct tu_render_pass_state state;

      struct util_dynarray fdm_bin_patchpoints;
      void *patchpoints_ctx;
   } pre_chain;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   bool vsc_initialized;
};
VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
                       VK_OBJECT_TYPE_COMMAND_BUFFER)

extern const struct vk_command_buffer_ops tu_cmd_buffer_ops;

static inline uint32_t
tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd,
                          const struct tu_render_pass_attachment *att,
                          uint32_t layer)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset[cmd->state.gmem_layout] +
      layer * cmd->state.tiling->tile0.width * cmd->state.tiling->tile0.height *
      att->cpp;
}

static inline uint32_t
tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd,
                                  const struct tu_render_pass_attachment *att,
                                  uint32_t layer)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset_stencil[cmd->state.gmem_layout] +
      layer * cmd->state.tiling->tile0.width * cmd->state.tiling->tile0.height;
}
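
/* Worked example (illustrative numbers): with a 96x64 tile0, a 4-byte color
 * attachment and gmem_offset[layout] == 0, layer 2 starts at
 * 2 * 96 * 64 * 4 = 49152 bytes into gmem. The separate stencil plane uses
 * the same formula with an implicit cpp of 1.
 */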

void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
                                const struct tu_render_pass_state *src);

VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
                             const VkCommandBufferBeginInfo *pBeginInfo);

template <chip CHIP>
void
tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer);

template <chip CHIP>
void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer);

template <chip CHIP>
void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                             struct tu_cs *cs,
                             enum tu_cmd_ccu_state ccu_state);

void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
                    struct tu_cmd_buffer *secondary);

void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
                         struct tu_cmd_buffer *secondary);

void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
                     struct tu_cmd_buffer *secondary);

void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
                          struct tu_cmd_buffer *suspended);

template <chip CHIP>
void tu_cmd_render(struct tu_cmd_buffer *cmd);

enum fd_gpu_event : uint32_t;

template <chip CHIP>
void
tu_emit_event_write(struct tu_cmd_buffer *cmd,
                    struct tu_cs *cs,
                    enum fd_gpu_event event);

static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
                         VkPipelineBindPoint bind_point)
{
   return &cmd_buffer->descriptors[bind_point];
}
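
/* Illustrative usage (hypothetical call site): descriptor state is tracked
 * per bind point, e.g. when binding a compute descriptor set:
 *
 *    struct tu_descriptor_state *descriptors =
 *       tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_COMPUTE);
 *    descriptors->sets[set_idx] = set;
 *    descriptors->max_sets_bound = MAX2(descriptors->max_sets_bound, set_idx + 1);
 */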

void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
                   bool msaa_disable);

void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);

void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);

void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

void tu6_apply_depth_bounds_workaround(struct tu_device *device,
                                       uint32_t *rb_depth_cntl);

void
update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);

typedef void (*tu_fdm_bin_apply_t)(struct tu_cmd_buffer *cmd,
                                   struct tu_cs *cs,
                                   void *data,
                                   VkRect2D bin,
                                   unsigned views,
                                   VkExtent2D *frag_areas);

struct tu_fdm_bin_patchpoint {
   uint64_t iova;
   uint32_t size;
   void *data;
   tu_fdm_bin_apply_t apply;
};

static inline void
_tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs,
                              unsigned size,
                              tu_fdm_bin_apply_t apply,
                              void *state,
                              unsigned state_size)
{
   void *data = ralloc_size(cmd->patchpoints_ctx, state_size);
   memcpy(data, state, state_size);
   assert(cs->writeable);
   tu_cs_reserve_space(cs, size);
   struct tu_fdm_bin_patchpoint patch = {
      .iova = tu_cs_get_cur_iova(cs),
      .size = size,
      .data = data,
      .apply = apply,
   };

   /* Apply the "default" setup where there is no scaling. This is used if
    * sysmem is required, and uses up the dwords that have been reserved.
    */
   unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
   VkExtent2D unscaled_frag_areas[num_views];
   for (unsigned i = 0; i < num_views; i++) {
      unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
   }
   apply(cmd, cs, state, (VkRect2D) {
         { 0, 0 },
         { MAX_VIEWPORT_SIZE, MAX_VIEWPORT_SIZE },
        }, num_views, unscaled_frag_areas);
   assert(tu_cs_get_cur_iova(cs) == patch.iova + patch.size * sizeof(uint32_t));

   util_dynarray_append(&cmd->fdm_bin_patchpoints,
                        struct tu_fdm_bin_patchpoint,
                        patch);
}

#define tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, state) \
   _tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, &state, sizeof(state))
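
/* Illustrative usage (hypothetical callback and state struct): the caller
 * reserves `size` dwords and `apply` must emit exactly that many for every
 * bin it is patched into:
 *
 *    struct apply_viewport_state { VkViewport viewport; };
 *
 *    static void
 *    fdm_apply_viewport(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
 *                       void *data, VkRect2D bin, unsigned views,
 *                       VkExtent2D *frag_areas)
 *    {
 *       const struct apply_viewport_state *state =
 *          (const struct apply_viewport_state *) data;
 *       // ... scale state->viewport by frag_areas[view] and emit exactly
 *       //     `size` dwords of viewport state for this bin ...
 *    }
 *
 *    struct apply_viewport_state state = { .viewport = viewport };
 *    tu_create_fdm_bin_patchpoint(cmd, cs, 10, fdm_apply_viewport, state);
 */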

#endif /* TU_CMD_BUFFER_H */