/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_CMD_BUFFER_H
#define TU_CMD_BUFFER_H

#include "tu_common.h"

#include "tu_cs.h"
#include "tu_descriptor_set.h"
#include "tu_device.h"
#include "tu_lrz.h"
#include "tu_pass.h"
#include "tu_pipeline.h"

enum tu_draw_state_group_id
{
   TU_DRAW_STATE_PROGRAM_CONFIG,
   TU_DRAW_STATE_VS,
   TU_DRAW_STATE_VS_BINNING,
   TU_DRAW_STATE_HS,
   TU_DRAW_STATE_DS,
   TU_DRAW_STATE_GS,
   TU_DRAW_STATE_GS_BINNING,
   TU_DRAW_STATE_VPC,
   TU_DRAW_STATE_FS,
   TU_DRAW_STATE_VB,
   TU_DRAW_STATE_CONST,
   TU_DRAW_STATE_DESC_SETS,
   TU_DRAW_STATE_DESC_SETS_LOAD,
   TU_DRAW_STATE_VS_PARAMS,
   TU_DRAW_STATE_FS_PARAMS,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
   TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
   TU_DRAW_STATE_PRIM_MODE_GMEM,

   /* dynamic state related draw states */
   TU_DRAW_STATE_DYNAMIC,
   TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};

struct tu_descriptor_state
{
   struct tu_descriptor_set *sets[MAX_SETS];
   struct tu_descriptor_set push_set;
   uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
   uint64_t set_iova[MAX_SETS];
   uint32_t max_sets_bound;
   uint32_t max_dynamic_offset_size;
};

enum tu_cmd_dirty_bits
{
   TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
   TU_CMD_DIRTY_DESC_SETS = BIT(1),
   TU_CMD_DIRTY_COMPUTE_DESC_SETS = BIT(2),
   TU_CMD_DIRTY_SHADER_CONSTS = BIT(3),
   TU_CMD_DIRTY_LRZ = BIT(4),
   TU_CMD_DIRTY_VS_PARAMS = BIT(5),
   TU_CMD_DIRTY_TESS_PARAMS = BIT(6),
   TU_CMD_DIRTY_SUBPASS = BIT(7),
   TU_CMD_DIRTY_FDM = BIT(8),
   TU_CMD_DIRTY_PER_VIEW_VIEWPORT = BIT(9),
   TU_CMD_DIRTY_TES = BIT(10),
   TU_CMD_DIRTY_PROGRAM = BIT(11),
   TU_CMD_DIRTY_RAST_ORDER = BIT(12),
   TU_CMD_DIRTY_FEEDBACK_LOOPS = BIT(13),
   TU_CMD_DIRTY_FS = BIT(14),
   TU_CMD_DIRTY_SHADING_RATE = BIT(15),
   /* all draw states were disabled and need to be re-enabled: */
   TU_CMD_DIRTY_DRAW_STATE = BIT(16)
};

/* There are only three cache domains we have to care about: the color cache
 * unit (CCU), which is used for color and depth/stencil attachments and
 * copy/blit destinations and is conceptually split into a color half and a
 * depth half, and the universal cache (UCHE), which is used for pretty much
 * everything else except the CP (which is uncached) and the host. We need to
 * flush whenever data crosses these boundaries.
 */

enum tu_cmd_access_mask {
   TU_ACCESS_NONE = 0,
   TU_ACCESS_UCHE_READ = 1 << 0,
   TU_ACCESS_UCHE_WRITE = 1 << 1,
   TU_ACCESS_CCU_COLOR_READ = 1 << 2,
   TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
   TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
   TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,

   /* Experiments have shown that while it's safe to avoid flushing the CCU
    * after each blit/renderpass, it's not safe to assume that subsequent
    * lookups with a different attachment state will hit unflushed cache
    * entries. That is, the CCU needs to be flushed and possibly invalidated
    * when accessing memory with a different attachment state. Writing to an
    * attachment under the following conditions after clearing using the
    * normal 2D engine path is known to have issues:
    *
    * - It isn't the 0'th layer.
    * - There is more than one attachment, and this isn't the 0'th attachment
    *   (this also seems to depend on the cpp of the attachments).
    *
    * Our best guess is that the layer/MRT state is used when computing
    * the location of a cache entry in the CCU, to avoid conflicts. We assume
    * that any access in a renderpass after or before an access by a transfer
    * needs a flush/invalidate, and use the _INCOHERENT variants to represent
    * access by a renderpass.
    */
   TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
   TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,

   /* Accesses which bypass any cache: e.g. writes via the host,
    * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
    */
   TU_ACCESS_SYSMEM_READ = 1 << 10,
   TU_ACCESS_SYSMEM_WRITE = 1 << 11,

   /* Memory writes from the CP start in-order with draws and event writes,
    * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
    */
   TU_ACCESS_CP_WRITE = 1 << 12,

   /* Descriptors are read through UCHE but are also prefetched via
    * CP_LOAD_STATE6 and the prefetched descriptors need to be invalidated
    * when they change.
    */
   TU_ACCESS_BINDLESS_DESCRIPTOR_READ = 1 << 13,

   /* A write to a GMEM attachment made by CP_EVENT_WRITE::BLIT. */
   TU_ACCESS_BLIT_WRITE_GMEM = 1 << 14,

   /* Similar to UCHE_READ, but specifically for GMEM attachment reads. */
   TU_ACCESS_UCHE_READ_GMEM = 1 << 15,

   /* The CCHE is a write-through cache which sits behind UCHE, with multiple
    * incoherent copies. Because it's write-through we only have to worry
    * about invalidating it for reads. It's invalidated by "ccinv" in the
    * shader and CP_CCHE_INVALIDATE in the command stream.
    */
   TU_ACCESS_CCHE_READ = 1 << 16,

   TU_ACCESS_READ =
      TU_ACCESS_UCHE_READ |
      TU_ACCESS_CCU_COLOR_READ |
      TU_ACCESS_CCU_DEPTH_READ |
      TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
      TU_ACCESS_SYSMEM_READ |
      TU_ACCESS_BINDLESS_DESCRIPTOR_READ |
      TU_ACCESS_CCHE_READ,

   TU_ACCESS_WRITE =
      TU_ACCESS_UCHE_WRITE |
      TU_ACCESS_CCU_COLOR_WRITE |
      TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
      TU_ACCESS_CCU_DEPTH_WRITE |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
      TU_ACCESS_SYSMEM_WRITE |
      TU_ACCESS_CP_WRITE,

   TU_ACCESS_ALL =
      TU_ACCESS_READ |
      TU_ACCESS_WRITE,
};

/* From the driver's point of view, we only need to distinguish between things
 * which won't start until a WFI is complete and things which additionally
 * need a WAIT_FOR_ME.
 *
 * TODO: This will get more complicated with concurrent binning.
 */
enum tu_stage {
   /* As a destination stage, this is for operations on the CP which don't
    * wait for pending WFIs to complete and therefore need a CP_WAIT_FOR_ME.
    * As a source stage, it is for things needing no waits.
    */
   TU_STAGE_CP,

   /* This is for most operations, which a WFI will wait on to finish and
    * which will not start until any pending WFIs are finished.
    */
   TU_STAGE_GPU,

   /* This is only used as a destination stage and is for things needing no
    * waits on the GPU (e.g. host operations).
    */
   TU_STAGE_BOTTOM,
};
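
/* For example (an illustrative reading of the stages above, not an
 * exhaustive mapping): an indirect draw's parameters are consumed by the CP,
 * which does not wait for outstanding WFIs, so its destination stage is
 * TU_STAGE_CP and a CP_WAIT_FOR_ME is needed; an ordinary vertex-buffer
 * fetch is TU_STAGE_GPU and a WFI is sufficient.
 */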

enum tu_cmd_flush_bits {
   TU_CMD_FLAG_CCU_CLEAN_DEPTH = 1 << 0,
   TU_CMD_FLAG_CCU_CLEAN_COLOR = 1 << 1,
   TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
   TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
   TU_CMD_FLAG_CACHE_CLEAN = 1 << 4,
   TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
   TU_CMD_FLAG_CCHE_INVALIDATE = 1 << 6,
   TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 7,
   TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 8,
   TU_CMD_FLAG_WAIT_FOR_ME = 1 << 9,
   TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE = 1 << 10,
   /* This is an unusual flush that isn't automatically executed if pending,
    * as it isn't necessary. Therefore, it's not included in
    * TU_CMD_FLAG_ALL_CLEAN.
    */
   TU_CMD_FLAG_BLIT_CACHE_CLEAN = 1 << 11,

   TU_CMD_FLAG_ALL_CLEAN =
      TU_CMD_FLAG_CCU_CLEAN_DEPTH |
      TU_CMD_FLAG_CCU_CLEAN_COLOR |
      TU_CMD_FLAG_CACHE_CLEAN |
      /* Treat the CP as a sort of "cache" which may need to be "flushed" via
       * waiting for writes to land with CP_WAIT_MEM_WRITES.
       */
      TU_CMD_FLAG_WAIT_MEM_WRITES,

   TU_CMD_FLAG_ALL_INVALIDATE =
      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
      TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
      TU_CMD_FLAG_CACHE_INVALIDATE |
      TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE |
      TU_CMD_FLAG_CCHE_INVALIDATE |
      /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when
       * a command that needs CP_WAIT_FOR_ME is executed. This means we may
       * insert an extra WAIT_FOR_ME before an indirect command requiring it
       * in case there was another command before the current command buffer
       * that it needs to wait for.
       */
      TU_CMD_FLAG_WAIT_FOR_ME,
};

/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
 * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
 * which part of the gmem is used by the CCU. Here we keep track of the
 * current state of the CCU.
 */
enum tu_cmd_ccu_state {
   TU_CMD_CCU_SYSMEM,
   TU_CMD_CCU_GMEM,
   TU_CMD_CCU_UNKNOWN,
};

struct tu_cache_state {
   /* Caches which must be made available (flushed) eventually if there are
    * any users outside that cache domain, and caches which must be
    * invalidated eventually if there are any reads.
    */
   BITMASK_ENUM(tu_cmd_flush_bits) pending_flush_bits;
   /* Flushes that must actually be emitted (e.g. via tu_emit_cache_flush())
    * before subsequent work executes.
    */
   BITMASK_ENUM(tu_cmd_flush_bits) flush_bits;
};
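
/* An illustrative reading of the two fields above (not lifted from the
 * driver): a CCU color write only records TU_CMD_FLAG_CCU_CLEAN_COLOR in
 * pending_flush_bits; it is promoted into flush_bits, and hence actually
 * emitted, once a later access from a different domain, such as a UCHE read
 * of the same data, is recorded via tu_flush_for_access().
 */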

struct tu_vs_params {
   uint32_t vertex_offset;
   uint32_t first_instance;
   uint32_t draw_id;
};

struct tu_tess_params {
   bool valid;
   enum a6xx_tess_output output_upper_left, output_lower_left;
   enum a6xx_tess_spacing spacing;
};

/* This should be for state that is set inside a renderpass and used at
 * renderpass end time, e.g. to decide whether to use sysmem. This needs
 * special handling for secondary cmdbufs and suspending/resuming render
 * passes where the state may need to be combined afterwards.
 */
struct tu_render_pass_state
{
   bool xfb_used;
   bool has_tess;
   bool has_prim_generated_query_in_rp;
   bool has_zpass_done_sample_count_write_in_rp;
   bool disable_gmem;
   bool sysmem_single_prim_mode;
   bool shared_viewport;

   /* Track whether the conditional predicate for COND_REG_EXEC is changed in
    * draw_cs.
    */
   bool draw_cs_writes_to_cond_pred;

   uint32_t drawcall_count;

   /* A calculated "draw cost" value for the renderpass, which tries to
    * estimate the bandwidth-per-sample of all the draws according
    * to:
    *
    *    foreach_draw (...) {
    *      sum += pipeline->color_bandwidth_per_sample;
    *      if (depth_test_enabled)
    *        sum += pipeline->depth_cpp_per_sample;
    *      if (depth_write_enabled)
    *        sum += pipeline->depth_cpp_per_sample;
    *      if (stencil_write_enabled)
    *        sum += pipeline->stencil_cpp_per_sample * 2;
    *    }
    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
    *
    * This allows us to estimate the total bandwidth of drawcalls later, by
    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
    *
    * It ignores depth buffer traffic for samples which do not pass due to
    * depth-test fail, and some other details, but it is just intended to be
    * a rough estimate that is easy to calculate.
    */
   uint32_t drawcall_bandwidth_per_sample_sum;

   const char *lrz_disable_reason;
};
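
/* Worked example of the estimate above (illustrative numbers only): two
 * draws, each writing a 4-cpp color attachment with depth test and depth
 * write enabled on a 2-cpp depth buffer, give
 *
 *    sum = 2 * (4 + 2 + 2) = 16
 *    drawcall_bandwidth_per_sample = 16 / 2 = 8
 *
 * so the renderpass's drawcall traffic is later estimated as roughly
 * 8 * zpass_sample_count bytes.
 */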

/* These are the states of the suspend/resume state machine. In addition to
 * tracking whether we're in the middle of a chain of suspending and
 * resuming passes that will be merged, we need to track whether the
 * command buffer begins in the middle of such a chain, for when it gets
 * merged with other command buffers. We call such a chain that begins
 * before the command buffer starts a "pre-chain".
 *
 * Note that when this command buffer is finished, this state is untouched
 * but it gains a different meaning. For example, if we finish in state
 * SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
 * there's a suspend/resume chain that extends past the end of the command
 * buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
 * means that there's a suspend/resume chain that extends before the
 * beginning.
 */
enum tu_suspend_resume_state
{
   /* Either there are no suspend/resume chains, or they are entirely
    * contained in the current command buffer.
    *
    *   BeginCommandBuffer() <- start of current command buffer
    *       ...
    *       // we are here
    */
   SR_NONE = 0,

   /* We are in the middle of a suspend/resume chain that starts before the
    * current command buffer. This happens when the command buffer begins
    * with a resuming render pass and all of the passes up to the current
    * one are suspending. In this state, our part of the chain is not saved
    * and is in the current draw_cs/state.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       // we are here
    */
   SR_IN_PRE_CHAIN,

   /* We are currently outside of any suspend/resume chains, but there is a
    * chain starting before the current command buffer. It is saved in
    * pre_chain.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *       // This part is stashed in pre_chain
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       BeginRendering(resuming) ... EndRendering() // end of chain
    *       ...
    *       // we are here
    */
   SR_AFTER_PRE_CHAIN,

   /* We are in the middle of a suspend/resume chain and there is no chain
    * starting before the current command buffer.
    *
    *   BeginCommandBuffer() <- start of current command buffer
    *       ...
    *       BeginRendering() ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       // we are here
    */
   SR_IN_CHAIN,

   /* We are in the middle of a suspend/resume chain and there is another,
    * separate, chain starting before the current command buffer.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *       // This part is stashed in pre_chain
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       BeginRendering(resuming) ... EndRendering() // end of chain
    *       ...
    *       BeginRendering() ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       // we are here
    */
   SR_IN_CHAIN_AFTER_PRE_CHAIN,
};

struct tu_cmd_state
{
   uint32_t dirty;

   struct tu_shader *shaders[MESA_SHADER_STAGES];

   struct tu_program_state program;

   struct tu_render_pass_state rp;

   struct vk_render_pass_state vk_rp;
   struct vk_vertex_input_state vi;
   struct vk_sample_locations_state sl;

   struct tu_bandwidth bandwidth;

   /* Vertex buffers: the state for these can be updated partially, so we
    * need to save it here to be able to emit a complete draw state.
    */
   struct {
      uint64_t base;
      uint32_t size;
   } vb[MAX_VBS];

   uint32_t max_vbs_bound;

   bool per_view_viewport;

   /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
   struct tu_draw_state vertex_buffers;
   struct tu_draw_state shader_const;
   struct tu_draw_state desc_sets;
   struct tu_draw_state load_state;
   struct tu_draw_state compute_load_state;
   struct tu_draw_state prim_order_gmem;

   struct tu_draw_state vs_params;
   struct tu_draw_state fs_params;

   /* Index buffer */
   uint64_t index_va;
   uint32_t max_index_count;
   uint8_t index_size;

   /* Because streamout base has to be 32-byte aligned, there is an extra
    * offset to deal with when it is unaligned.
    */
   uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];

   /* Renderpasses are tricky, because we may need to flush differently if
    * using sysmem vs. gmem and therefore we have to delay any flushing that
    * happens before a renderpass. So we have to have two copies of the flush
    * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
    * and one for outside a renderpass.
    */
   struct tu_cache_state cache;
   struct tu_cache_state renderpass_cache;

   enum tu_cmd_ccu_state ccu_state;

   /* Decides which GMEM layout to use from the tu_pass, based on whether the CCU
    * might get used by tu_store_gmem_attachment().
    */
   enum tu_gmem_layout gmem_layout;

   const struct tu_render_pass *pass;
   const struct tu_subpass *subpass;
   const struct tu_framebuffer *framebuffer;
   const struct tu_tiling_config *tiling;
   VkRect2D render_area;

   const struct tu_image_view **attachments;
   VkClearValue *clear_values;

   /* State that in the dynamic case comes from VkRenderingInfo and needs to
    * be saved/restored when suspending. This holds the state for the last
    * suspended renderpass, which may point to this command buffer's dynamic_*
    * or another command buffer if executed on a secondary.
    */
   struct {
      const struct tu_render_pass *pass;
      const struct tu_subpass *subpass;
      const struct tu_framebuffer *framebuffer;
      VkRect2D render_area;
      enum tu_gmem_layout gmem_layout;

      const struct tu_image_view **attachments;
      VkClearValue *clear_values;

      struct tu_lrz_state lrz;
   } suspended_pass;

   bool tessfactor_addr_set;
   bool predication_active;
   bool msaa_disable;
   bool blend_reads_dest;
   bool stencil_front_write;
   bool stencil_back_write;
   bool pipeline_sysmem_single_prim_mode;
   bool pipeline_has_tess;
   bool pipeline_disable_gmem;
   bool raster_order_attachment_access;
   bool raster_order_attachment_access_valid;
   bool blit_cache_cleaned;
   VkImageAspectFlags pipeline_feedback_loops;
   bool pipeline_writes_shading_rate;
   bool pipeline_reads_shading_rate;
   bool pipeline_accesses_smask;

   bool pipeline_blend_lrz, pipeline_bandwidth;
   uint32_t pipeline_draw_states;

   /* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
    * VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
    * but they use the same {START,STOP}_PRIMITIVE_CTRS control.
    */
   uint32_t prim_counters_running;

   bool prim_generated_query_running_before_rp;

   enum tu_suspend_resume_state suspend_resume;

   bool suspending, resuming;

   struct tu_lrz_state lrz;

   struct tu_draw_state lrz_and_depth_plane_state;

   struct tu_vs_params last_vs_params;
   bool last_draw_indexed;

   struct tu_tess_params tess_params;

   uint64_t descriptor_buffer_iova[MAX_SETS];
};

struct tu_cmd_buffer
{
   struct vk_command_buffer vk;

   struct tu_device *device;

   struct u_trace trace;
   struct u_trace_iterator trace_renderpass_start;
   struct u_trace_iterator trace_renderpass_end;

   struct list_head renderpass_autotune_results;
   struct tu_autotune_results_buffer* autotune_buffer;

   void *patchpoints_ctx;
   struct util_dynarray fdm_bin_patchpoints;

   VkCommandBufferUsageFlags usage_flags;

   VkQueryPipelineStatisticFlags inherited_pipeline_statistics;

   struct tu_cmd_state state;
   uint32_t queue_family_index;

   uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
   VkShaderStageFlags push_constant_stages;
   struct tu_descriptor_set meta_push_descriptors;

   struct tu_descriptor_state descriptors[MAX_BIND_POINTS];

   struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1) + 2];
   struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
   struct tu_subpass_attachment dynamic_input_attachments[MAX_RTS + 1];
   struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
   const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1) + 2];
   VkClearValue dynamic_clear_values[2 * (MAX_RTS + 1)];

   struct tu_render_pass dynamic_pass;
   struct tu_subpass dynamic_subpass;
   struct tu_framebuffer dynamic_framebuffer;

   struct tu_cs cs;
   struct tu_cs draw_cs;
   struct tu_cs tile_store_cs;
   struct tu_cs draw_epilogue_cs;
   struct tu_cs sub_cs;

   /* If the first render pass in the command buffer is resuming, then it is
    * part of a suspend/resume chain that starts before the current command
    * buffer and needs to be merged later. In this case, its incomplete state
    * is stored in pre_chain. In the symmetric case where the last render pass
    * is suspending, we just skip ending the render pass and its state is
    * stored in draw_cs/the current state. The first and last render pass
    * might be part of different chains, which is why all the state may need
    * to be saved separately here.
    */
   struct {
      struct tu_cs draw_cs;
      struct tu_cs draw_epilogue_cs;

      struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;

      struct tu_render_pass_state state;

      struct util_dynarray fdm_bin_patchpoints;
      void *patchpoints_ctx;
   } pre_chain;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   uint64_t vsc_draw_strm_va, vsc_draw_strm_size_va, vsc_prim_strm_va;
   bool vsc_initialized;

   bool prev_fsr_is_null;
};
VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
                       VK_OBJECT_TYPE_COMMAND_BUFFER)

extern const struct vk_command_buffer_ops tu_cmd_buffer_ops;

static inline uint32_t
tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd,
                          const struct tu_render_pass_attachment *att,
                          uint32_t layer)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset[cmd->state.gmem_layout] +
      layer * cmd->state.tiling->tile0.width * cmd->state.tiling->tile0.height *
      att->cpp;
}

static inline uint32_t
tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd,
                                  const struct tu_render_pass_attachment *att,
                                  uint32_t layer)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset_stencil[cmd->state.gmem_layout] +
      layer * cmd->state.tiling->tile0.width * cmd->state.tiling->tile0.height;
}
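
/* Illustrative arithmetic for the helpers above (example numbers only): with
 * a 96x96 tile size and a 4-cpp color attachment, each layer occupies
 * 96 * 96 * 4 = 36864 bytes of GMEM, so layer 1 starts at
 * gmem_offset[layout] + 36864.
 */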

void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
                                const struct tu_render_pass_state *src);

VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
                             const VkCommandBufferBeginInfo *pBeginInfo);

template <chip CHIP>
void
tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer);

template <chip CHIP>
void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer);

template <chip CHIP>
void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                             struct tu_cs *cs,
                             enum tu_cmd_ccu_state ccu_state);

void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
                    struct tu_cmd_buffer *secondary);

void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
                         struct tu_cmd_buffer *secondary);

void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
                     struct tu_cmd_buffer *secondary);

void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
                          struct tu_cmd_buffer *suspended);

template <chip CHIP>
void tu_cmd_render(struct tu_cmd_buffer *cmd);

enum fd_gpu_event : uint32_t;

template <chip CHIP>
void
tu_emit_raw_event_write(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        enum vgt_event_type event,
                        bool needs_seqno);

template <chip CHIP>
void
tu_emit_event_write(struct tu_cmd_buffer *cmd,
                    struct tu_cs *cs,
                    enum fd_gpu_event event);

void
tu_flush_for_access(struct tu_cache_state *cache,
                    enum tu_cmd_access_mask src_mask,
                    enum tu_cmd_access_mask dst_mask);

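/* Usage sketch for tu_flush_for_access() (illustrative only, not lifted from
 * the driver): a barrier between a transfer that wrote a color attachment
 * with the 2D engine and a subsequent sampled read in a shader might be
 * translated roughly as
 *
 *    tu_flush_for_access(&cmd->state.cache,
 *                        TU_ACCESS_CCU_COLOR_WRITE,
 *                        TU_ACCESS_UCHE_READ);
 *
 * which accumulates the required CCU clean and cache invalidate bits in
 * cache->flush_bits/pending_flush_bits, to be emitted later, e.g. by
 * tu_emit_cache_flush().
 */
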
static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
                         VkPipelineBindPoint bind_point)
{
   return &cmd_buffer->descriptors[bind_point];
}

void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
                   bool msaa_disable);

void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);

void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);

void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

void tu6_apply_depth_bounds_workaround(struct tu_device *device,
                                       uint32_t *rb_depth_cntl);

typedef void (*tu_fdm_bin_apply_t)(struct tu_cmd_buffer *cmd,
                                   struct tu_cs *cs,
                                   void *data,
                                   VkRect2D bin,
                                   unsigned views,
                                   VkExtent2D *frag_areas);

struct tu_fdm_bin_patchpoint {
   uint64_t iova;
   uint32_t size;
   void *data;
   tu_fdm_bin_apply_t apply;
};


void
tu_barrier(struct tu_cmd_buffer *cmd,
           uint32_t dep_count,
           const VkDependencyInfo *dep_info);

template <chip CHIP>
void
tu_write_event(struct tu_cmd_buffer *cmd, struct tu_event *event,
               VkPipelineStageFlags2 stageMask, unsigned value);

static inline void
_tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs,
                              unsigned size,
                              tu_fdm_bin_apply_t apply,
                              void *state,
                              unsigned state_size)
{
   void *data = ralloc_size(cmd->patchpoints_ctx, state_size);
   memcpy(data, state, state_size);
   assert(cs->writeable);
   tu_cs_reserve_space(cs, size);
   struct tu_fdm_bin_patchpoint patch = {
      .iova = tu_cs_get_cur_iova(cs),
      .size = size,
      .data = data,
      .apply = apply,
   };

   /* Apply the "default" setup where there is no scaling. This is used if
    * sysmem is required, and uses up the dwords that have been reserved.
    */
   unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
   VkExtent2D unscaled_frag_areas[num_views];
   for (unsigned i = 0; i < num_views; i++) {
      unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
   }
   apply(cmd, cs, state, (VkRect2D) {
         { 0, 0 },
         { MAX_VIEWPORT_SIZE, MAX_VIEWPORT_SIZE },
        }, num_views, unscaled_frag_areas);
   assert(tu_cs_get_cur_iova(cs) == patch.iova + patch.size * sizeof(uint32_t));

   util_dynarray_append(&cmd->fdm_bin_patchpoints,
                        struct tu_fdm_bin_patchpoint,
                        patch);
}

#define tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, state) \
   _tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, &state, sizeof(state))
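
/* Usage sketch for the patchpoint API (the callback name and state struct
 * below are hypothetical, for illustration only):
 *
 *    struct apply_viewport_state {
 *       VkViewport viewport;
 *    };
 *
 *    static void
 *    fdm_apply_viewport(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
 *                       void *data, VkRect2D bin, unsigned views,
 *                       VkExtent2D *frag_areas)
 *    {
 *       const struct apply_viewport_state *state =
 *          (const struct apply_viewport_state *)data;
 *       // Re-emit the viewport here, scaled by frag_areas[0] for this bin.
 *    }
 *
 *    struct apply_viewport_state state = { .viewport = viewport };
 *    tu_create_fdm_bin_patchpoint(cmd, cs, 8, fdm_apply_viewport, state);
 *
 * _tu_create_fdm_bin_patchpoint() reserves `size` dwords, immediately emits
 * the unscaled default via the callback, and records a patchpoint so the
 * same dwords can be rewritten per bin with the real fragment areas.
 */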

VkResult tu_init_bin_preamble(struct tu_device *device);

#endif /* TU_CMD_BUFFER_H */