/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_CMD_BUFFER_H
#define TU_CMD_BUFFER_H

#include "tu_common.h"

#include "tu_cs.h"
#include "tu_descriptor_set.h"
#include "tu_device.h"
#include "tu_lrz.h"
#include "tu_pass.h"
#include "tu_pipeline.h"

enum tu_draw_state_group_id
{
   TU_DRAW_STATE_PROGRAM_CONFIG,
   TU_DRAW_STATE_VS,
   TU_DRAW_STATE_VS_BINNING,
   TU_DRAW_STATE_HS,
   TU_DRAW_STATE_DS,
   TU_DRAW_STATE_GS,
   TU_DRAW_STATE_GS_BINNING,
   TU_DRAW_STATE_VPC,
   TU_DRAW_STATE_FS,
   TU_DRAW_STATE_VB,
   TU_DRAW_STATE_CONST,
   TU_DRAW_STATE_DESC_SETS,
   TU_DRAW_STATE_DESC_SETS_LOAD,
   TU_DRAW_STATE_VS_PARAMS,
   TU_DRAW_STATE_FS_PARAMS,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
   TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
   TU_DRAW_STATE_PRIM_MODE_GMEM,

   /* dynamic state related draw states */
   TU_DRAW_STATE_DYNAMIC,
   TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};

struct tu_descriptor_state
{
   struct tu_descriptor_set *sets[MAX_SETS];
   struct tu_descriptor_set push_set;
   uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
   uint64_t set_iova[MAX_SETS];
   uint32_t max_sets_bound;
   uint32_t max_dynamic_offset_size;
};

enum tu_cmd_dirty_bits
{
   TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
   TU_CMD_DIRTY_DESC_SETS = BIT(1),
   TU_CMD_DIRTY_COMPUTE_DESC_SETS = BIT(2),
   TU_CMD_DIRTY_SHADER_CONSTS = BIT(3),
   TU_CMD_DIRTY_LRZ = BIT(4),
   TU_CMD_DIRTY_VS_PARAMS = BIT(5),
   TU_CMD_DIRTY_TESS_PARAMS = BIT(6),
   TU_CMD_DIRTY_SUBPASS = BIT(7),
   TU_CMD_DIRTY_FDM = BIT(8),
   TU_CMD_DIRTY_PER_VIEW_VIEWPORT = BIT(9),
   TU_CMD_DIRTY_TES = BIT(10),
   TU_CMD_DIRTY_PROGRAM = BIT(11),
   TU_CMD_DIRTY_RAST_ORDER = BIT(12),
   TU_CMD_DIRTY_FEEDBACK_LOOPS = BIT(13),
   TU_CMD_DIRTY_FS = BIT(14),
   TU_CMD_DIRTY_SHADING_RATE = BIT(15),
   /* all draw states were disabled and need to be re-enabled: */
   TU_CMD_DIRTY_DRAW_STATE = BIT(16)
};

/* There are only three cache domains we have to care about: the CCU, or
 * color cache unit, which is used for color and depth/stencil attachments
 * and copy/blit destinations, and is split conceptually into color and depth,
 * and the universal cache or UCHE which is used for pretty much everything
 * else, except for the CP (uncached) and host. We need to flush whenever data
 * crosses these boundaries.
 */

enum tu_cmd_access_mask {
   TU_ACCESS_NONE = 0,
   TU_ACCESS_UCHE_READ = 1 << 0,
   TU_ACCESS_UCHE_WRITE = 1 << 1,
   TU_ACCESS_CCU_COLOR_READ = 1 << 2,
   TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
   TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
   TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,

   /* Experiments have shown that while it's safe to avoid flushing the CCU
    * after each blit/renderpass, it's not safe to assume that subsequent
    * lookups with a different attachment state will hit unflushed cache
    * entries. That is, the CCU needs to be flushed and possibly invalidated
    * when accessing memory with a different attachment state. Writing to an
    * attachment under the following conditions after clearing using the
    * normal 2d engine path is known to have issues:
    *
    * - It isn't the 0'th layer.
    * - There is more than one attachment, and this isn't the 0'th attachment
    *   (this seems to also depend on the cpp of the attachments).
    *
    * Our best guess is that the layer/MRT state is used when computing
    * the location of a cache entry in CCU, to avoid conflicts. We assume that
    * any access in a renderpass after or before an access by a transfer needs
    * a flush/invalidate, and use the _INCOHERENT variants to represent access
    * by a renderpass.
    */
   TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
   TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,

   /* Accesses which bypass any cache, e.g. writes via the host,
    * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
    */
   TU_ACCESS_SYSMEM_READ = 1 << 10,
   TU_ACCESS_SYSMEM_WRITE = 1 << 11,

   /* Memory writes from the CP start in-order with draws and event writes,
    * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
    */
   TU_ACCESS_CP_WRITE = 1 << 12,

   /* Descriptors are read through UCHE but are also prefetched via
    * CP_LOAD_STATE6 and the prefetched descriptors need to be invalidated
    * when they change.
    */
   TU_ACCESS_BINDLESS_DESCRIPTOR_READ = 1 << 13,

   /* A write to a GMEM attachment made by CP_EVENT_WRITE::BLIT. */
   TU_ACCESS_BLIT_WRITE_GMEM = 1 << 14,

   /* Similar to UCHE_READ, but specifically for GMEM attachment reads. */
   TU_ACCESS_UCHE_READ_GMEM = 1 << 15,

   /* The CCHE is a write-through cache which sits behind UCHE, with multiple
    * incoherent copies. Because it's write-through we only have to worry
    * about invalidating it for reads. It's invalidated by "ccinv" in the
    * shader and CP_CCHE_INVALIDATE in the command stream.
    */
   TU_ACCESS_CCHE_READ = 1 << 16,

   TU_ACCESS_RTU_READ = 1 << 17,

   TU_ACCESS_READ =
      TU_ACCESS_UCHE_READ |
      TU_ACCESS_CCU_COLOR_READ |
      TU_ACCESS_CCU_DEPTH_READ |
      TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
      TU_ACCESS_SYSMEM_READ |
      TU_ACCESS_BINDLESS_DESCRIPTOR_READ |
      TU_ACCESS_CCHE_READ,

   TU_ACCESS_WRITE =
      TU_ACCESS_UCHE_WRITE |
      TU_ACCESS_CCU_COLOR_WRITE |
      TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
      TU_ACCESS_CCU_DEPTH_WRITE |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
      TU_ACCESS_SYSMEM_WRITE |
      TU_ACCESS_CP_WRITE,

   TU_ACCESS_ALL =
      TU_ACCESS_READ |
      TU_ACCESS_WRITE,
};
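
/* Illustrative sketch (not driver code): a transfer writing through the color
 * CCU followed by a shader reading the destination through UCHE would be
 * expressed roughly as below, using tu_flush_for_access() declared later in
 * this header. The exact flush bits it accumulates are an assumption here,
 * for illustration only.
 *
 *    struct tu_cache_state cache = {};
 *
 *    // the transfer wrote via the color CCU
 *    tu_flush_for_access(&cache, TU_ACCESS_CCU_COLOR_WRITE, TU_ACCESS_NONE);
 *
 *    // a later read through UCHE: the write must be cleaned out of the CCU
 *    // and any stale UCHE lines invalidated before the read can proceed
 *    tu_flush_for_access(&cache, TU_ACCESS_NONE, TU_ACCESS_UCHE_READ);
 */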

/* From the driver's point of view, we only need to distinguish between things
 * which won't start until a WFI is complete and things which additionally
 * need a WAIT_FOR_ME.
 *
 * TODO: This will get more complicated with concurrent binning.
 */
enum tu_stage {
   /* As a destination stage, this is for operations on the CP which don't
    * wait for pending WFIs to complete and therefore need a CP_WAIT_FOR_ME.
    * As a source stage, it is for things needing no waits.
    */
   TU_STAGE_CP,

   /* This is for most operations, which a WFI will wait on to finish and
    * which will not start until any pending WFIs are finished.
    */
   TU_STAGE_GPU,

   /* This is only used as a destination stage and is for things needing no
    * waits on the GPU (e.g. host operations).
    */
   TU_STAGE_BOTTOM,
};
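
/* Rough intuition only (an assumption for illustration, not code used by the
 * driver): a helper classifying a destination access might look like
 *
 *    static inline enum tu_stage
 *    example_dst_stage(bool consumed_by_cp, bool host_only)
 *    {
 *       if (consumed_by_cp)
 *          return TU_STAGE_CP;     // e.g. indirect draw parameters
 *       if (host_only)
 *          return TU_STAGE_BOTTOM; // nothing on the GPU needs to wait
 *       return TU_STAGE_GPU;       // ordinary draws, dispatches, blits
 *    }
 */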

enum tu_cmd_flush_bits {
   TU_CMD_FLAG_CCU_CLEAN_DEPTH = 1 << 0,
   TU_CMD_FLAG_CCU_CLEAN_COLOR = 1 << 1,
   TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
   TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
   TU_CMD_FLAG_CACHE_CLEAN = 1 << 4,
   TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
   TU_CMD_FLAG_CCHE_INVALIDATE = 1 << 6,
   TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 7,
   TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 8,
   TU_CMD_FLAG_WAIT_FOR_ME = 1 << 9,
   TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE = 1 << 10,
   /* This is an unusual flush that isn't automatically executed if pending,
    * as it isn't necessary. Therefore, it's not included in
    * TU_CMD_FLAG_ALL_CLEAN.
    */
   TU_CMD_FLAG_BLIT_CACHE_CLEAN = 1 << 11,
   TU_CMD_FLAG_RTU_INVALIDATE = 1 << 12,

   TU_CMD_FLAG_ALL_CLEAN =
      TU_CMD_FLAG_CCU_CLEAN_DEPTH |
      TU_CMD_FLAG_CCU_CLEAN_COLOR |
      TU_CMD_FLAG_CACHE_CLEAN |
      /* Treat the CP as a sort of "cache" which may need to be "flushed" via
       * waiting for writes to land with CP_WAIT_MEM_WRITES.
       */
      TU_CMD_FLAG_WAIT_MEM_WRITES,

   TU_CMD_FLAG_ALL_INVALIDATE =
      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
      TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
      TU_CMD_FLAG_CACHE_INVALIDATE |
      TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE |
      TU_CMD_FLAG_CCHE_INVALIDATE |
      /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when
       * a command that needs CP_WAIT_FOR_ME is executed. This means we may
       * insert an extra WAIT_FOR_ME before an indirect command requiring it
       * in case there was another command before the current command buffer
       * that it needs to wait for.
       */
      TU_CMD_FLAG_WAIT_FOR_ME |
      TU_CMD_FLAG_RTU_INVALIDATE,
};
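
/* Illustrative sketch (an assumption, not driver code): the heaviest barrier
 * expressible with these bits is roughly "clean everything, wait for the GPU
 * to go idle, then invalidate everything", e.g.
 *
 *    BITMASK_ENUM(tu_cmd_flush_bits) full_barrier =
 *       TU_CMD_FLAG_ALL_CLEAN |
 *       TU_CMD_FLAG_WAIT_FOR_IDLE |
 *       TU_CMD_FLAG_ALL_INVALIDATE;
 *
 * Note that TU_CMD_FLAG_BLIT_CACHE_CLEAN is deliberately excluded from
 * TU_CMD_FLAG_ALL_CLEAN and would have to be ORed in separately if needed.
 */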

/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
 * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
 * which part of the gmem is used by the CCU. Here we keep track of the
 * state of the CCU.
 */
enum tu_cmd_ccu_state {
   TU_CMD_CCU_SYSMEM,
   TU_CMD_CCU_GMEM,
   TU_CMD_CCU_UNKNOWN,
};
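
/* Usage sketch (an illustration, simplified): before recording commands that
 * render to sysmem, chip-templated code would switch the CCU over via the
 * helper declared further below, which is cheap when the state already
 * matches:
 *
 *    tu_emit_cache_flush_ccu<CHIP>(cmd, &cmd->cs, TU_CMD_CCU_SYSMEM);
 *
 * A freshly begun command buffer starts in TU_CMD_CCU_UNKNOWN, so the first
 * switch always pays the full flush/invalidate + WFI cost.
 */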

struct tu_cache_state {
   /* Caches which must be made available (flushed) eventually if there are
    * any users outside that cache domain, and caches which must be
    * invalidated eventually if there are any reads.
    */
   BITMASK_ENUM(tu_cmd_flush_bits) pending_flush_bits;
   /* Pending flushes */
   BITMASK_ENUM(tu_cmd_flush_bits) flush_bits;
};

struct tu_vs_params {
   uint32_t vertex_offset;
   uint32_t first_instance;
   uint32_t draw_id;
};

struct tu_tess_params {
   bool valid;
   enum a6xx_tess_output output_upper_left, output_lower_left;
   enum a6xx_tess_spacing spacing;
};

/* This should be for state that is set inside a renderpass and used at
 * renderpass end time, e.g. to decide whether to use sysmem. This needs
 * special handling for secondary cmdbufs and suspending/resuming render
 * passes where the state may need to be combined afterwards.
 */
struct tu_render_pass_state
{
   bool xfb_used;
   bool has_tess;
   bool has_prim_generated_query_in_rp;
   bool has_zpass_done_sample_count_write_in_rp;
   bool disable_gmem;
   bool sysmem_single_prim_mode;
   bool shared_viewport;

   /* Track whether the conditional predicate for COND_REG_EXEC is changed
    * in draw_cs.
    */
   bool draw_cs_writes_to_cond_pred;

   uint32_t drawcall_count;

   /* A calculated "draw cost" value for the renderpass, which tries to
    * estimate the bandwidth-per-sample of all the draws according
    * to:
    *
    *    foreach_draw (...) {
    *      sum += pipeline->color_bandwidth_per_sample;
    *      if (depth_test_enabled)
    *        sum += pipeline->depth_cpp_per_sample;
    *      if (depth_write_enabled)
    *        sum += pipeline->depth_cpp_per_sample;
    *      if (stencil_write_enabled)
    *        sum += pipeline->stencil_cpp_per_sample * 2;
    *    }
    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
    *
    * It allows us to estimate the total bandwidth of drawcalls later, by
    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
    *
    * This does ignore depth buffer traffic for samples which do not
    * pass due to depth-test fail, and some other details.  But it is
    * just intended to be a rough estimate that is easy to calculate.
    */
   uint32_t drawcall_bandwidth_per_sample_sum;

   const char *lrz_disable_reason;
   uint32_t lrz_disabled_at_draw;
};
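
/* Worked example for the estimate above (hypothetical numbers): two draws,
 * one writing a single RGBA8 color attachment with depth test and depth
 * write enabled (4 + 4 + 4 = 12) and one writing only the color attachment
 * (4), give drawcall_bandwidth_per_sample_sum = 16 and an average of
 * 16 / 2 = 8 bytes per sample; with a zpass_sample_count of 1,000,000 the
 * estimated drawcall bandwidth is roughly 8 MB.
 */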

/* These are the states of the suspend/resume state machine. In addition to
 * tracking whether we're in the middle of a chain of suspending and
 * resuming passes that will be merged, we need to track whether the
 * command buffer begins in the middle of such a chain, for when it gets
 * merged with other command buffers. We call such a chain that begins
 * before the command buffer starts a "pre-chain".
 *
 * Note that when this command buffer is finished, this state is untouched
 * but it gains a different meaning. For example, if we finish in state
 * SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
 * there's a suspend/resume chain that extends past the end of the command
 * buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
 * means that there's a suspend/resume chain that extends before the
 * beginning.
 */
enum tu_suspend_resume_state
{
   /* Either there are no suspend/resume chains, or they are entirely
    * contained in the current command buffer.
    *
    *   BeginCommandBuffer() <- start of current command buffer
    *       ...
    *       // we are here
    */
   SR_NONE = 0,

   /* We are in the middle of a suspend/resume chain that starts before the
    * current command buffer. This happens when the command buffer begins
    * with a resuming render pass and all of the passes up to the current
    * one are suspending. In this state, our part of the chain is not saved
    * and is in the current draw_cs/state.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       // we are here
    */
   SR_IN_PRE_CHAIN,

   /* We are currently outside of any suspend/resume chains, but there is a
    * chain starting before the current command buffer. It is saved in
    * pre_chain.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *       // This part is stashed in pre_chain
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       BeginRendering(resuming) ... EndRendering() // end of chain
    *       ...
    *       // we are here
    */
   SR_AFTER_PRE_CHAIN,

   /* We are in the middle of a suspend/resume chain and there is no chain
    * starting before the current command buffer.
    *
    *   BeginCommandBuffer() <- start of current command buffer
    *       ...
    *       BeginRendering() ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       // we are here
    */
   SR_IN_CHAIN,

   /* We are in the middle of a suspend/resume chain and there is another,
    * separate, chain starting before the current command buffer.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *       // This part is stashed in pre_chain
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       BeginRendering(resuming) ... EndRendering() // end of chain
    *       ...
    *       BeginRendering() ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       // we are here
    */
   SR_IN_CHAIN_AFTER_PRE_CHAIN,
};
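
/* Merge intuition (a hedged sketch, not the driver's actual control flow):
 * when a primary executes a secondary that begins mid-chain, the part of the
 * chain recorded before the secondary's first non-suspending EndRendering()
 * has to be glued onto whatever render pass the primary left suspended,
 * roughly:
 *
 *    if (secondary->state.suspend_resume == SR_AFTER_PRE_CHAIN ||
 *        secondary->state.suspend_resume == SR_IN_CHAIN_AFTER_PRE_CHAIN)
 *       tu_append_pre_chain(primary, secondary);  // declared below
 */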

struct tu_cmd_state
{
   uint32_t dirty;

   struct tu_shader *shaders[MESA_SHADER_STAGES];

   struct tu_program_state program;

   struct tu_render_pass_state rp;

   struct vk_render_pass_state vk_rp;
   struct vk_vertex_input_state vi;
   struct vk_sample_locations_state sl;

   struct tu_bandwidth bandwidth;

   /* Vertex buffers
    * The states for these can be updated partially, so we need to save them
    * to be able to emit a complete draw state.
    */
   struct {
      uint64_t base;
      uint32_t size;
   } vb[MAX_VBS];

   uint32_t max_vbs_bound;

   bool per_view_viewport;

   /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
   struct tu_draw_state vertex_buffers;
   struct tu_draw_state shader_const;
   struct tu_draw_state desc_sets;
   struct tu_draw_state load_state;
   struct tu_draw_state compute_load_state;
   struct tu_draw_state prim_order_gmem;

   struct tu_draw_state vs_params;
   struct tu_draw_state fs_params;

   /* Index buffer */
   uint64_t index_va;
   uint32_t max_index_count;
   uint8_t index_size;

   /* Because the streamout base has to be 32-byte aligned, there is an
    * extra offset to deal with when it is unaligned.
    */
   uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];

   /* Renderpasses are tricky, because we may need to flush differently if
    * using sysmem vs. gmem and therefore we have to delay any flushing that
    * happens before a renderpass. So we have to have two copies of the flush
    * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
    * and one for outside a renderpass.
    */
   struct tu_cache_state cache;
   struct tu_cache_state renderpass_cache;

   enum tu_cmd_ccu_state ccu_state;

   /* Decides which GMEM layout to use from the tu_pass, based on whether the
    * CCU might get used by tu_store_gmem_attachment().
    */
   enum tu_gmem_layout gmem_layout;

   const struct tu_render_pass *pass;
   const struct tu_subpass *subpass;
   const struct tu_framebuffer *framebuffer;
   const struct tu_tiling_config *tiling;
   VkRect2D render_area;

   const struct tu_image_view **attachments;
   VkClearValue *clear_values;

   /* State that in the dynamic case comes from VkRenderingInfo and needs to
    * be saved/restored when suspending. This holds the state for the last
    * suspended renderpass, which may point to this command buffer's dynamic_*
    * or another command buffer if executed on a secondary.
    */
   struct {
      const struct tu_render_pass *pass;
      const struct tu_subpass *subpass;
      const struct tu_framebuffer *framebuffer;
      VkRect2D render_area;
      enum tu_gmem_layout gmem_layout;

      const struct tu_image_view **attachments;
      VkClearValue *clear_values;

      struct tu_lrz_state lrz;
   } suspended_pass;

   bool tessfactor_addr_set;
   bool predication_active;
   bool msaa_disable;
   bool blend_reads_dest;
   bool stencil_front_write;
   bool stencil_back_write;
   bool pipeline_sysmem_single_prim_mode;
   bool pipeline_has_tess;
   bool pipeline_disable_gmem;
   bool raster_order_attachment_access;
   bool raster_order_attachment_access_valid;
   bool blit_cache_cleaned;
   VkImageAspectFlags pipeline_feedback_loops;
   bool pipeline_writes_shading_rate;
   bool pipeline_reads_shading_rate;
   bool pipeline_accesses_smask;

   bool pipeline_blend_lrz, pipeline_bandwidth;
   uint32_t pipeline_draw_states;

   /* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
    * VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
    * but they use the same {START,STOP}_PRIMITIVE_CTRS control.
    */
   uint32_t prim_counters_running;

   bool prim_generated_query_running_before_rp;

   enum tu_suspend_resume_state suspend_resume;

   bool suspending, resuming;

   struct tu_lrz_state lrz;

   struct tu_draw_state lrz_and_depth_plane_state;

   struct tu_vs_params last_vs_params;
   bool last_draw_indexed;

   struct tu_tess_params tess_params;

   uint64_t descriptor_buffer_iova[MAX_SETS];
};

struct tu_cmd_buffer
{
   struct vk_command_buffer vk;

   struct tu_device *device;

   struct u_trace trace;
   struct u_trace_iterator trace_renderpass_start;
   struct u_trace_iterator trace_renderpass_end;

   struct list_head renderpass_autotune_results;
   struct tu_autotune_results_buffer* autotune_buffer;

   void *patchpoints_ctx;
   struct util_dynarray fdm_bin_patchpoints;

   VkCommandBufferUsageFlags usage_flags;

   VkQueryPipelineStatisticFlags inherited_pipeline_statistics;

   struct tu_cmd_state state;
   uint32_t queue_family_index;

   uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
   VkShaderStageFlags push_constant_stages;
   struct tu_descriptor_set meta_push_descriptors;

   struct tu_descriptor_state descriptors[MAX_BIND_POINTS];

   struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1) + 2];
   struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
   struct tu_subpass_attachment dynamic_input_attachments[MAX_RTS + 1];
   struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
   const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1) + 2];
   VkClearValue dynamic_clear_values[2 * (MAX_RTS + 1)];

   struct tu_render_pass dynamic_pass;
   struct tu_subpass dynamic_subpass;
   struct tu_framebuffer dynamic_framebuffer;

   struct tu_cs cs;
   struct tu_cs draw_cs;
   struct tu_cs tile_store_cs;
   struct tu_cs draw_epilogue_cs;
   struct tu_cs sub_cs;

   /* If the first render pass in the command buffer is resuming, then it is
    * part of a suspend/resume chain that starts before the current command
    * buffer and needs to be merged later. In this case, its incomplete state
    * is stored in pre_chain. In the symmetric case where the last render pass
    * is suspending, we just skip ending the render pass and its state is
    * stored in draw_cs/the current state. The first and last render pass
    * might be part of different chains, which is why all the state may need
    * to be saved separately here.
    */
   struct {
      struct tu_cs draw_cs;
      struct tu_cs draw_epilogue_cs;

      struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;

      struct tu_render_pass_state state;

      struct util_dynarray fdm_bin_patchpoints;
      void *patchpoints_ctx;
   } pre_chain;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   uint64_t vsc_draw_strm_va, vsc_draw_strm_size_va, vsc_prim_strm_va;
   bool vsc_initialized;

   bool prev_fsr_is_null;
};
VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
                       VK_OBJECT_TYPE_COMMAND_BUFFER)

extern const struct vk_command_buffer_ops tu_cmd_buffer_ops;

static inline uint32_t
tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd,
                          const struct tu_render_pass_attachment *att,
                          uint32_t layer)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset[cmd->state.gmem_layout] +
      layer * cmd->state.tiling->tile0.width * cmd->state.tiling->tile0.height *
      att->cpp;
}
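
/* Worked example (hypothetical numbers): with a 96x96 tile0, a 4-cpp color
 * attachment whose gmem_offset is 0 for the current layout, layer 1 lands at
 * 0 + 1 * 96 * 96 * 4 = 36864 bytes into GMEM, i.e. each layer occupies one
 * tile-sized slice of the attachment's GMEM region.
 */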

static inline uint32_t
tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd,
                                  const struct tu_render_pass_attachment *att,
                                  uint32_t layer)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset_stencil[cmd->state.gmem_layout] +
      layer * cmd->state.tiling->tile0.width * cmd->state.tiling->tile0.height;
}

void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
                                const struct tu_render_pass_state *src);

VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
                             const VkCommandBufferBeginInfo *pBeginInfo);

template <chip CHIP>
void
tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer);

template <chip CHIP>
void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer);

template <chip CHIP>
void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                             struct tu_cs *cs,
                             enum tu_cmd_ccu_state ccu_state);

void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
                    struct tu_cmd_buffer *secondary);

void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
                         struct tu_cmd_buffer *secondary);

void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
                     struct tu_cmd_buffer *secondary);

void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
                          struct tu_cmd_buffer *suspended);

template <chip CHIP>
void tu_cmd_render(struct tu_cmd_buffer *cmd);

void tu_dispatch_unaligned(VkCommandBuffer commandBuffer,
                           uint32_t x, uint32_t y, uint32_t z);

void tu_dispatch_unaligned_indirect(VkCommandBuffer commandBuffer,
                                    VkDeviceAddress size_addr);

void tu_write_buffer_cp(VkCommandBuffer commandBuffer,
                        VkDeviceAddress addr,
                        void *data, uint32_t size);

void tu_flush_buffer_write_cp(VkCommandBuffer commandBuffer);

enum fd_gpu_event : uint32_t;

template <chip CHIP>
void
tu_emit_raw_event_write(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        enum vgt_event_type event,
                        bool needs_seqno);

template <chip CHIP>
void
tu_emit_event_write(struct tu_cmd_buffer *cmd,
                    struct tu_cs *cs,
                    enum fd_gpu_event event);

void
tu_flush_for_access(struct tu_cache_state *cache,
                    enum tu_cmd_access_mask src_mask,
                    enum tu_cmd_access_mask dst_mask);

static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
                         VkPipelineBindPoint bind_point)
{
   return &cmd_buffer->descriptors[bind_point];
}

void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
                   bool msaa_disable);

void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);

void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);

void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

void tu6_apply_depth_bounds_workaround(struct tu_device *device,
                                       uint32_t *rb_depth_cntl);

typedef void (*tu_fdm_bin_apply_t)(struct tu_cmd_buffer *cmd,
                                   struct tu_cs *cs,
                                   void *data,
                                   VkRect2D bin,
                                   unsigned views,
                                   VkExtent2D *frag_areas);

struct tu_fdm_bin_patchpoint {
   uint64_t iova;
   uint32_t size;
   void *data;
   tu_fdm_bin_apply_t apply;
};


void
tu_barrier(struct tu_cmd_buffer *cmd,
           uint32_t dep_count,
           const VkDependencyInfo *dep_info);

template <chip CHIP>
void
tu_write_event(struct tu_cmd_buffer *cmd, struct tu_event *event,
               VkPipelineStageFlags2 stageMask, unsigned value);

static inline void
_tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs,
                              unsigned size,
                              tu_fdm_bin_apply_t apply,
                              void *state,
                              unsigned state_size)
{
   void *data = ralloc_size(cmd->patchpoints_ctx, state_size);
   memcpy(data, state, state_size);
   assert(cs->writeable);
   tu_cs_reserve_space(cs, size);
   struct tu_fdm_bin_patchpoint patch = {
      .iova = tu_cs_get_cur_iova(cs),
      .size = size,
      .data = data,
      .apply = apply,
   };

   /* Apply the "default" setup where there is no scaling. This is used if
    * sysmem is required, and uses up the dwords that have been reserved.
    */
   unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
   VkExtent2D unscaled_frag_areas[num_views];
   for (unsigned i = 0; i < num_views; i++) {
      unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
   }
   apply(cmd, cs, state, (VkRect2D) {
         { 0, 0 },
         { MAX_VIEWPORT_SIZE, MAX_VIEWPORT_SIZE },
        }, num_views, unscaled_frag_areas);
   assert(tu_cs_get_cur_iova(cs) == patch.iova + patch.size * sizeof(uint32_t));

   util_dynarray_append(&cmd->fdm_bin_patchpoints,
                        struct tu_fdm_bin_patchpoint,
                        patch);
}

#define tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, state) \
   _tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, &state, sizeof(state))
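
/* Usage sketch (hypothetical names, for illustration only): an emit path
 * that writes per-bin state registers a callback plus a POD state blob. The
 * callback must emit exactly `size` dwords for every bin.
 *
 *    struct example_fdm_state { uint32_t some_reg_value; };
 *
 *    static void
 *    example_apply(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
 *                  VkRect2D bin, unsigned views, VkExtent2D *frag_areas)
 *    {
 *       struct example_fdm_state *s = (struct example_fdm_state *) data;
 *       // emit `size` dwords here, scaled by frag_areas[0] for this bin
 *    }
 *
 *    struct example_fdm_state s = { .some_reg_value = 0 };
 *    tu_create_fdm_bin_patchpoint(cmd, cs, 4, example_apply, s);
 *
 * At bin time the stored patchpoints are re-applied with the real bin
 * rectangle and per-view fragment areas, overwriting the placeholder dwords
 * emitted at record time.
 */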

VkResult tu_init_bin_preamble(struct tu_device *device);

#endif /* TU_CMD_BUFFER_H */