/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_CMD_BUFFER_H
#define TU_CMD_BUFFER_H

#include "tu_common.h"

#include "tu_cs.h"
#include "tu_descriptor_set.h"
#include "tu_device.h"
#include "tu_lrz.h"
#include "tu_pass.h"
#include "tu_pipeline.h"

enum tu_draw_state_group_id
{
   TU_DRAW_STATE_PROGRAM_CONFIG,
   TU_DRAW_STATE_VS,
   TU_DRAW_STATE_VS_BINNING,
   TU_DRAW_STATE_HS,
   TU_DRAW_STATE_DS,
   TU_DRAW_STATE_GS,
   TU_DRAW_STATE_GS_BINNING,
   TU_DRAW_STATE_VPC,
   TU_DRAW_STATE_FS,
   TU_DRAW_STATE_VB,
   TU_DRAW_STATE_CONST,
   TU_DRAW_STATE_DESC_SETS,
   TU_DRAW_STATE_DESC_SETS_LOAD,
   TU_DRAW_STATE_VS_PARAMS,
   TU_DRAW_STATE_FS_PARAMS,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
   TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
   TU_DRAW_STATE_PRIM_MODE_GMEM,

   /* dynamic state related draw states */
   TU_DRAW_STATE_DYNAMIC,
   TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};

struct tu_descriptor_state
{
   struct tu_descriptor_set *sets[MAX_SETS];
   struct tu_descriptor_set push_set;
   uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
   uint64_t set_iova[MAX_SETS];
   uint32_t max_sets_bound;
   uint32_t max_dynamic_offset_size;
};

enum tu_cmd_dirty_bits
{
   TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
   TU_CMD_DIRTY_DESC_SETS = BIT(1),
   TU_CMD_DIRTY_COMPUTE_DESC_SETS = BIT(2),
   TU_CMD_DIRTY_SHADER_CONSTS = BIT(3),
   TU_CMD_DIRTY_LRZ = BIT(4),
   TU_CMD_DIRTY_VS_PARAMS = BIT(5),
   TU_CMD_DIRTY_TESS_PARAMS = BIT(6),
   TU_CMD_DIRTY_SUBPASS = BIT(7),
   TU_CMD_DIRTY_FDM = BIT(8),
   TU_CMD_DIRTY_PER_VIEW_VIEWPORT = BIT(9),
   TU_CMD_DIRTY_TES = BIT(10),
   TU_CMD_DIRTY_PROGRAM = BIT(11),
   TU_CMD_DIRTY_RAST_ORDER = BIT(12),
   TU_CMD_DIRTY_FEEDBACK_LOOPS = BIT(13),
   TU_CMD_DIRTY_FS = BIT(14),
   TU_CMD_DIRTY_SHADING_RATE = BIT(15),
   /* all draw states were disabled and need to be re-enabled: */
   TU_CMD_DIRTY_DRAW_STATE = BIT(16)
};

/* There are only three cache domains we have to care about: the CCU, or
 * color cache unit, which is used for color and depth/stencil attachments
 * and copy/blit destinations and is conceptually split into color and depth
 * domains; and the universal cache, or UCHE, which is used for pretty much
 * everything else, except for the CP (uncached) and the host. We need to
 * flush whenever data crosses these boundaries.
 */
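
/* As a rough illustration of data crossing these boundaries (a sketch, not an
 * exhaustive mapping), consider blitting to an image and then sampling it in
 * a shader:
 *
 *    vkCmdBlitImage(...)   // the destination is written through CCU color
 *    ...barrier...         // CCU color must be cleaned, UCHE invalidated
 *    vkCmdDraw(...)        // the texture fetch reads through UCHE
 *
 * The blit's data may still sit in the CCU until it is cleaned, and stale
 * lines may sit in UCHE until it is invalidated, so both are needed before
 * the draw can observe the blit's results.
 */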

enum tu_cmd_access_mask {
   TU_ACCESS_NONE = 0,
   TU_ACCESS_UCHE_READ = 1 << 0,
   TU_ACCESS_UCHE_WRITE = 1 << 1,
   TU_ACCESS_CCU_COLOR_READ = 1 << 2,
   TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
   TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
   TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,

   /* Experiments have shown that while it's safe to avoid flushing the CCU
    * after each blit/renderpass, it's not safe to assume that subsequent
    * lookups with a different attachment state will hit unflushed cache
    * entries. That is, the CCU needs to be flushed and possibly invalidated
    * when accessing memory with a different attachment state. Writing to an
    * attachment under the following conditions after clearing using the
    * normal 2d engine path is known to have issues:
    *
    * - It isn't the 0'th layer.
    * - There is more than one attachment, and this isn't the 0'th attachment
    *   (this seems to also depend on the cpp of the attachments).
    *
    * Our best guess is that the layer/MRT state is used when computing
    * the location of a cache entry in CCU, to avoid conflicts. We assume that
    * any access in a renderpass after or before an access by a transfer needs
    * a flush/invalidate, and use the _INCOHERENT variants to represent access
    * by a renderpass.
    */
   TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
   TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,

   /* Accesses which bypass any cache: e.g. writes via the host,
    * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
    */
   TU_ACCESS_SYSMEM_READ = 1 << 10,
   TU_ACCESS_SYSMEM_WRITE = 1 << 11,

   /* Memory writes from the CP start in-order with draws and event writes,
    * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
    */
   TU_ACCESS_CP_WRITE = 1 << 12,

   /* Descriptors are read through UCHE but are also prefetched via
    * CP_LOAD_STATE6 and the prefetched descriptors need to be invalidated
    * when they change.
    */
   TU_ACCESS_BINDLESS_DESCRIPTOR_READ = 1 << 13,

   /* A write to a GMEM attachment made by CP_EVENT_WRITE::BLIT. */
   TU_ACCESS_BLIT_WRITE_GMEM = 1 << 14,

   /* Similar to UCHE_READ, but specifically for GMEM attachment reads. */
   TU_ACCESS_UCHE_READ_GMEM = 1 << 15,

   /* The CCHE is a write-through cache which sits behind UCHE, with multiple
    * incoherent copies. Because it's write-through we only have to worry
    * about invalidating it for reads. It's invalidated by "ccinv" in the
    * shader and CP_CCHE_INVALIDATE in the command stream.
    */
   TU_ACCESS_CCHE_READ = 1 << 16,

   TU_ACCESS_RTU_READ = 1 << 17,

   TU_ACCESS_READ =
      TU_ACCESS_UCHE_READ |
      TU_ACCESS_CCU_COLOR_READ |
      TU_ACCESS_CCU_DEPTH_READ |
      TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
      TU_ACCESS_SYSMEM_READ |
      TU_ACCESS_BINDLESS_DESCRIPTOR_READ |
      TU_ACCESS_CCHE_READ,

   TU_ACCESS_WRITE =
      TU_ACCESS_UCHE_WRITE |
      TU_ACCESS_CCU_COLOR_WRITE |
      TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
      TU_ACCESS_CCU_DEPTH_WRITE |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
      TU_ACCESS_SYSMEM_WRITE |
      TU_ACCESS_CP_WRITE,

   TU_ACCESS_ALL =
      TU_ACCESS_READ |
      TU_ACCESS_WRITE,
};

/* From the driver's point of view, we only need to distinguish between things
 * which won't start until a WFI is complete and things which additionally
 * need a WAIT_FOR_ME.
 *
 * TODO: This will get more complicated with concurrent binning.
 */
enum tu_stage {
   /* As a destination stage, this is for operations on the CP which don't
    * wait for pending WFIs to complete and therefore need a CP_WAIT_FOR_ME.
    * As a source stage, it is for things needing no waits.
    */
   TU_STAGE_CP,

   /* This is for most operations, which a WFI will wait on to finish, and
    * which will not start until any pending WFIs are finished.
    */
   TU_STAGE_GPU,

   /* This is only used as a destination stage and is for things needing no
    * waits on the GPU (e.g. host operations).
    */
   TU_STAGE_BOTTOM,
};
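
/* A minimal sketch of how destination stages might be derived from Vulkan
 * stages (the real mapping lives in the implementation; the helper name here
 * is hypothetical and only for illustration):
 *
 *    static enum tu_stage
 *    example_dst_stage(VkPipelineStageFlags2 stage)
 *    {
 *       if (stage & VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT)
 *          return TU_STAGE_CP;     // the CP reads the args itself -> WAIT_FOR_ME
 *       if (stage & VK_PIPELINE_STAGE_2_HOST_BIT)
 *          return TU_STAGE_BOTTOM; // host access -> no GPU-side wait
 *       return TU_STAGE_GPU;       // everything else is covered by a WFI
 *    }
 */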

enum tu_cmd_flush_bits {
   TU_CMD_FLAG_CCU_CLEAN_DEPTH = 1 << 0,
   TU_CMD_FLAG_CCU_CLEAN_COLOR = 1 << 1,
   TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
   TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
   TU_CMD_FLAG_CACHE_CLEAN = 1 << 4,
   TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
   TU_CMD_FLAG_CCHE_INVALIDATE = 1 << 6,
   TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 7,
   TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 8,
   TU_CMD_FLAG_WAIT_FOR_ME = 1 << 9,
   TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE = 1 << 10,
   /* This is an unusual flush that isn't automatically executed if pending,
    * as it isn't necessary. Therefore, it's not included in ALL_CLEAN.
    */
   TU_CMD_FLAG_BLIT_CACHE_CLEAN = 1 << 11,
   TU_CMD_FLAG_RTU_INVALIDATE = 1 << 12,

   TU_CMD_FLAG_ALL_CLEAN =
      TU_CMD_FLAG_CCU_CLEAN_DEPTH |
      TU_CMD_FLAG_CCU_CLEAN_COLOR |
      TU_CMD_FLAG_CACHE_CLEAN |
      /* Treat the CP as a sort of "cache" which may need to be "flushed" via
       * waiting for writes to land with WAIT_MEM_WRITES.
       */
      TU_CMD_FLAG_WAIT_MEM_WRITES,

   TU_CMD_FLAG_ALL_INVALIDATE =
      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
      TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
      TU_CMD_FLAG_CACHE_INVALIDATE |
      TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE |
      TU_CMD_FLAG_CCHE_INVALIDATE |
      /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when
       * a command that needs CP_WAIT_FOR_ME is executed. This means we may
       * insert an extra WAIT_FOR_ME before an indirect command requiring it
       * in case there was another command before the current command buffer
       * that it needs to wait for.
       */
      TU_CMD_FLAG_WAIT_FOR_ME |
      TU_CMD_FLAG_RTU_INVALIDATE,
};
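
/* A sketch of how a source/destination access pair might translate into
 * flush bits, following the cache-domain rules described above (the actual
 * logic lives in tu_flush_for_access()):
 *
 *    src = TU_ACCESS_CCU_COLOR_WRITE, dst = TU_ACCESS_UCHE_READ
 *       -> TU_CMD_FLAG_CCU_CLEAN_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE
 *
 *    src = TU_ACCESS_UCHE_WRITE, dst = TU_ACCESS_SYSMEM_READ
 *       -> TU_CMD_FLAG_CACHE_CLEAN
 *
 *    src = TU_ACCESS_CP_WRITE, dst = any read on the GPU
 *       -> TU_CMD_FLAG_WAIT_MEM_WRITES
 */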

/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
 * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
 * which part of the gmem is used by the CCU. Here we keep track of the
 * current state of the CCU.
 */
enum tu_cmd_ccu_state {
   TU_CMD_CCU_SYSMEM,
   TU_CMD_CCU_GMEM,
   TU_CMD_CCU_UNKNOWN,
};
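
/* A minimal usage sketch: before recording work that needs a particular CCU
 * layout, the state is transitioned with tu_emit_cache_flush_ccu() (declared
 * below); the heavy clean/invalidate/WFI is only required when the state
 * actually changes:
 *
 *    tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_GMEM);   // gmem rendering
 *    tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_SYSMEM); // sysmem rendering, blits
 */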

struct tu_cache_state {
   /* Caches which must be made available (flushed) eventually if there are
    * any users outside that cache domain, and caches which must be
    * invalidated eventually if there are any reads.
    */
   BITMASK_ENUM(tu_cmd_flush_bits) pending_flush_bits;
   /* Flushes which must be emitted before the next command that may access
    * the affected data.
    */
   BITMASK_ENUM(tu_cmd_flush_bits) flush_bits;
};
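
/* A rough sketch of the intended lifecycle (the details live in
 * tu_flush_for_access()):
 *
 *    write through UCHE        -> pending_flush_bits |= CACHE_CLEAN
 *    read from another domain  -> the clean moves into flush_bits, along
 *                                 with any required invalidates
 *    next state emission       -> flush_bits are written to the command
 *                                 stream and cleared
 *
 * This way back-to-back accesses within the same cache domain don't pay for
 * redundant cache maintenance.
 */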

struct tu_vs_params {
   uint32_t vertex_offset;
   uint32_t first_instance;
   uint32_t draw_id;
};

struct tu_tess_params {
   bool valid;
   enum a6xx_tess_output output_upper_left, output_lower_left;
   enum a6xx_tess_spacing spacing;
};

/* This should be for state that is set inside a renderpass and used at
 * renderpass end time, e.g. to decide whether to use sysmem. This needs
 * special handling for secondary cmdbufs and suspending/resuming render
 * passes where the state may need to be combined afterwards.
 */
struct tu_render_pass_state
{
   bool xfb_used;
   bool has_tess;
   bool has_prim_generated_query_in_rp;
   bool has_zpass_done_sample_count_write_in_rp;
   bool disable_gmem;
   bool sysmem_single_prim_mode;
   bool shared_viewport;

   /* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
   bool draw_cs_writes_to_cond_pred;

   uint32_t drawcall_count;

   /* A calculated "draw cost" value for renderpass, which tries to
    * estimate the bandwidth-per-sample of all the draws according
    * to:
    *
    *    foreach_draw (...) {
    *       sum += pipeline->color_bandwidth_per_sample;
    *       if (depth_test_enabled)
    *          sum += pipeline->depth_cpp_per_sample;
    *       if (depth_write_enabled)
    *          sum += pipeline->depth_cpp_per_sample;
    *       if (stencil_write_enabled)
    *          sum += pipeline->stencil_cpp_per_sample * 2;
    *    }
    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
    *
    * It allows us to estimate the total bandwidth of drawcalls later, by
    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
    *
    * This does ignore depth buffer traffic for samples which do not
    * pass due to depth-test fail, and some other details. But it is
    * just intended to be a rough estimate that is easy to calculate.
    */
   uint32_t drawcall_bandwidth_per_sample_sum;

   const char *lrz_disable_reason;
   uint32_t lrz_disabled_at_draw;
};

/* These are the states of the suspend/resume state machine. In addition to
 * tracking whether we're in the middle of a chain of suspending and
 * resuming passes that will be merged, we need to track whether the
 * command buffer begins in the middle of such a chain, for when it gets
 * merged with other command buffers. We call such a chain that begins
 * before the command buffer starts a "pre-chain".
 *
 * Note that when this command buffer is finished, this state is untouched
 * but it gains a different meaning. For example, if we finish in state
 * SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
 * there's a suspend/resume chain that extends past the end of the command
 * buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
 * means that there's a suspend/resume chain that extends before the
 * beginning.
 */
enum tu_suspend_resume_state
{
   /* Either there are no suspend/resume chains, or they are entirely
    * contained in the current command buffer.
    *
    * BeginCommandBuffer() <- start of current command buffer
    * ...
    * // we are here
    */
   SR_NONE = 0,

   /* We are in the middle of a suspend/resume chain that starts before the
    * current command buffer. This happens when the command buffer begins
    * with a resuming render pass and all of the passes up to the current
    * one are suspending. In this state, our part of the chain is not saved
    * and is in the current draw_cs/state.
    *
    * BeginRendering() ... EndRendering(suspending)
    * BeginCommandBuffer() <- start of current command buffer
    * BeginRendering(resuming) ... EndRendering(suspending)
    * BeginRendering(resuming) ... EndRendering(suspending)
    * ...
    * // we are here
    */
   SR_IN_PRE_CHAIN,

   /* We are currently outside of any suspend/resume chains, but there is a
    * chain starting before the current command buffer. It is saved in
    * pre_chain.
    *
    * BeginRendering() ... EndRendering(suspending)
    * BeginCommandBuffer() <- start of current command buffer
    * // This part is stashed in pre_chain
    * BeginRendering(resuming) ... EndRendering(suspending)
    * BeginRendering(resuming) ... EndRendering(suspending)
    * ...
    * BeginRendering(resuming) ... EndRendering() // end of chain
    * ...
    * // we are here
    */
   SR_AFTER_PRE_CHAIN,

   /* We are in the middle of a suspend/resume chain and there is no chain
    * starting before the current command buffer.
    *
    * BeginCommandBuffer() <- start of current command buffer
    * ...
    * BeginRendering() ... EndRendering(suspending)
    * BeginRendering(resuming) ... EndRendering(suspending)
    * BeginRendering(resuming) ... EndRendering(suspending)
    * ...
    * // we are here
    */
   SR_IN_CHAIN,

   /* We are in the middle of a suspend/resume chain and there is another,
    * separate, chain starting before the current command buffer.
    *
    * BeginRendering() ... EndRendering(suspending)
    * BeginCommandBuffer() <- start of current command buffer
    * // This part is stashed in pre_chain
    * BeginRendering(resuming) ... EndRendering(suspending)
    * BeginRendering(resuming) ... EndRendering(suspending)
    * ...
    * BeginRendering(resuming) ... EndRendering() // end of chain
    * ...
    * BeginRendering() ... EndRendering(suspending)
    * BeginRendering(resuming) ... EndRendering(suspending)
    * BeginRendering(resuming) ... EndRendering(suspending)
    * ...
    * // we are here
    */
   SR_IN_CHAIN_AFTER_PRE_CHAIN,
};

struct tu_cmd_state
{
   uint32_t dirty;

   struct tu_shader *shaders[MESA_SHADER_STAGES];

   struct tu_program_state program;

   struct tu_render_pass_state rp;

   struct vk_render_pass_state vk_rp;
   struct vk_vertex_input_state vi;
   struct vk_sample_locations_state sl;

   struct tu_bandwidth bandwidth;

   /* Vertex buffers:
    * the state for these can be updated partially, so we need to save it
    * here in order to be able to emit a complete draw state.
    */
   struct {
      uint64_t base;
      uint32_t size;
   } vb[MAX_VBS];

   uint32_t max_vbs_bound;

   bool per_view_viewport;

   /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
   struct tu_draw_state vertex_buffers;
   struct tu_draw_state shader_const;
   struct tu_draw_state desc_sets;
   struct tu_draw_state load_state;
   struct tu_draw_state compute_load_state;
   struct tu_draw_state prim_order_gmem;

   struct tu_draw_state vs_params;
   struct tu_draw_state fs_params;

   /* Index buffer */
   uint64_t index_va;
   uint32_t max_index_count;
   uint8_t index_size;

   /* Because the streamout base has to be 32-byte aligned, there is an extra
    * offset to deal with when it is unaligned.
    */
   uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];

   /* Renderpasses are tricky, because we may need to flush differently if
    * using sysmem vs. gmem and therefore we have to delay any flushing that
    * happens before a renderpass. So we have to have two copies of the flush
    * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
    * and one for outside a renderpass.
    */
   struct tu_cache_state cache;
   struct tu_cache_state renderpass_cache;

   enum tu_cmd_ccu_state ccu_state;

   /* Decides which GMEM layout to use from the tu_pass, based on whether the
    * CCU might get used by tu_store_gmem_attachment().
    */
   enum tu_gmem_layout gmem_layout;

   const struct tu_render_pass *pass;
   const struct tu_subpass *subpass;
   const struct tu_framebuffer *framebuffer;
   const struct tu_tiling_config *tiling;
   VkRect2D render_area;

   const struct tu_image_view **attachments;
   VkClearValue *clear_values;

   /* State that in the dynamic case comes from VkRenderingInfo and needs to
    * be saved/restored when suspending. This holds the state for the last
    * suspended renderpass, which may point to this command buffer's dynamic_*
    * state, or to another command buffer's if the suspended pass was recorded
    * in a secondary.
    */
   struct {
      const struct tu_render_pass *pass;
      const struct tu_subpass *subpass;
      const struct tu_framebuffer *framebuffer;
      VkRect2D render_area;
      enum tu_gmem_layout gmem_layout;

      const struct tu_image_view **attachments;
      VkClearValue *clear_values;

      struct tu_lrz_state lrz;
   } suspended_pass;

   bool tessfactor_addr_set;
   bool predication_active;
   bool msaa_disable;
   bool blend_reads_dest;
   bool stencil_front_write;
   bool stencil_back_write;
   bool pipeline_sysmem_single_prim_mode;
   bool pipeline_has_tess;
   bool pipeline_disable_gmem;
   bool raster_order_attachment_access;
   bool raster_order_attachment_access_valid;
   bool blit_cache_cleaned;
   VkImageAspectFlags pipeline_feedback_loops;
   bool pipeline_writes_shading_rate;
   bool pipeline_reads_shading_rate;
   bool pipeline_accesses_smask;

   bool pipeline_blend_lrz, pipeline_bandwidth;
   uint32_t pipeline_draw_states;

   /* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
    * VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
    * but they use the same {START,STOP}_PRIMITIVE_CTRS control.
    */
   uint32_t prim_counters_running;

   bool prim_generated_query_running_before_rp;

   enum tu_suspend_resume_state suspend_resume;

   bool suspending, resuming;

   struct tu_lrz_state lrz;

   struct tu_draw_state lrz_and_depth_plane_state;

   struct tu_vs_params last_vs_params;
   bool last_draw_indexed;

   struct tu_tess_params tess_params;

   uint64_t descriptor_buffer_iova[MAX_SETS];
};

struct tu_cmd_buffer
{
   struct vk_command_buffer vk;

   struct tu_device *device;

   struct u_trace trace;
   struct u_trace_iterator trace_renderpass_start;
   struct u_trace_iterator trace_renderpass_end;

   struct list_head renderpass_autotune_results;
   struct tu_autotune_results_buffer* autotune_buffer;

   void *patchpoints_ctx;
   struct util_dynarray fdm_bin_patchpoints;

   VkCommandBufferUsageFlags usage_flags;

   VkQueryPipelineStatisticFlags inherited_pipeline_statistics;

   struct tu_cmd_state state;
   uint32_t queue_family_index;

   uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
   VkShaderStageFlags push_constant_stages;
   struct tu_descriptor_set meta_push_descriptors;

   struct tu_descriptor_state descriptors[MAX_BIND_POINTS];

   struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1) + 2];
   struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
   struct tu_subpass_attachment dynamic_input_attachments[MAX_RTS + 1];
   struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
   const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1) + 2];
   VkClearValue dynamic_clear_values[2 * (MAX_RTS + 1)];

   struct tu_render_pass dynamic_pass;
   struct tu_subpass dynamic_subpass;
   struct tu_framebuffer dynamic_framebuffer;

   struct tu_cs cs;
   struct tu_cs draw_cs;
   struct tu_cs tile_store_cs;
   struct tu_cs draw_epilogue_cs;
   struct tu_cs sub_cs;

   /* If the first render pass in the command buffer is resuming, then it is
    * part of a suspend/resume chain that starts before the current command
    * buffer and needs to be merged later. In this case, its incomplete state
    * is stored in pre_chain. In the symmetric case where the last render pass
    * is suspending, we just skip ending the render pass and its state is
    * stored in draw_cs/the current state. The first and last render pass
    * might be part of different chains, which is why all the state may need
    * to be saved separately here.
    */
   struct {
      struct tu_cs draw_cs;
      struct tu_cs draw_epilogue_cs;

      struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;

      struct tu_render_pass_state state;

      struct util_dynarray fdm_bin_patchpoints;
      void *patchpoints_ctx;
   } pre_chain;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   uint64_t vsc_draw_strm_va, vsc_draw_strm_size_va, vsc_prim_strm_va;
   bool vsc_initialized;

   bool prev_fsr_is_null;
};
VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
                       VK_OBJECT_TYPE_COMMAND_BUFFER)

extern const struct vk_command_buffer_ops tu_cmd_buffer_ops;

static inline uint32_t
tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd,
                          const struct tu_render_pass_attachment *att,
                          uint32_t layer)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset[cmd->state.gmem_layout] +
      layer * cmd->state.tiling->tile0.width * cmd->state.tiling->tile0.height *
      att->cpp;
}

static inline uint32_t
tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd,
                                  const struct tu_render_pass_attachment *att,
                                  uint32_t layer)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset_stencil[cmd->state.gmem_layout] +
      layer * cmd->state.tiling->tile0.width * cmd->state.tiling->tile0.height;
}

void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
                                const struct tu_render_pass_state *src);

VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
                             const VkCommandBufferBeginInfo *pBeginInfo);

template <chip CHIP>
void
tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer);

template <chip CHIP>
void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer);

template <chip CHIP>
void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                             struct tu_cs *cs,
                             enum tu_cmd_ccu_state ccu_state);

void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
                    struct tu_cmd_buffer *secondary);

void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
                         struct tu_cmd_buffer *secondary);

void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
                     struct tu_cmd_buffer *secondary);

void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
                          struct tu_cmd_buffer *suspended);

template <chip CHIP>
void tu_cmd_render(struct tu_cmd_buffer *cmd);

void tu_dispatch_unaligned(VkCommandBuffer commandBuffer,
                           uint32_t x, uint32_t y, uint32_t z);

void tu_dispatch_unaligned_indirect(VkCommandBuffer commandBuffer,
                                    VkDeviceAddress size_addr);

void tu_write_buffer_cp(VkCommandBuffer commandBuffer,
                        VkDeviceAddress addr,
                        void *data, uint32_t size);

void tu_flush_buffer_write_cp(VkCommandBuffer commandBuffer);

enum fd_gpu_event : uint32_t;

template <chip CHIP>
void
tu_emit_raw_event_write(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        enum vgt_event_type event,
                        bool needs_seqno);

template <chip CHIP>
void
tu_emit_event_write(struct tu_cmd_buffer *cmd,
                    struct tu_cs *cs,
                    enum fd_gpu_event event);

void
tu_flush_for_access(struct tu_cache_state *cache,
                    enum tu_cmd_access_mask src_mask,
                    enum tu_cmd_access_mask dst_mask);
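
/* A minimal usage sketch, assuming a transfer-write -> shader-read dependency
 * outside of a render pass (inside a render pass, renderpass_cache would be
 * used instead; see tu_cmd_state):
 *
 *    struct tu_cache_state *cache = &cmd->state.cache;
 *    tu_flush_for_access(cache,
 *                        TU_ACCESS_CCU_COLOR_WRITE, // e.g. a blit destination
 *                        TU_ACCESS_UCHE_READ);      // e.g. sampled in a shader
 *    // The accumulated cache->flush_bits are emitted later, e.g. via
 *    // tu_emit_cache_flush<CHIP>(cmd).
 */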

static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
                         VkPipelineBindPoint bind_point)
{
   return &cmd_buffer->descriptors[bind_point];
}

void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
                   bool msaa_disable);

void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);

void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);

void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

void tu6_apply_depth_bounds_workaround(struct tu_device *device,
                                       uint32_t *rb_depth_cntl);

typedef void (*tu_fdm_bin_apply_t)(struct tu_cmd_buffer *cmd,
                                   struct tu_cs *cs,
                                   void *data,
                                   VkRect2D bin,
                                   unsigned views,
                                   VkExtent2D *frag_areas);

struct tu_fdm_bin_patchpoint {
   uint64_t iova;
   uint32_t size;
   void *data;
   tu_fdm_bin_apply_t apply;
};


void
tu_barrier(struct tu_cmd_buffer *cmd,
           uint32_t dep_count,
           const VkDependencyInfo *dep_info);

template <chip CHIP>
void
tu_write_event(struct tu_cmd_buffer *cmd, struct tu_event *event,
               VkPipelineStageFlags2 stageMask, unsigned value);

static inline void
_tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs,
                              unsigned size,
                              tu_fdm_bin_apply_t apply,
                              void *state,
                              unsigned state_size)
{
   void *data = ralloc_size(cmd->patchpoints_ctx, state_size);
   memcpy(data, state, state_size);
   assert(cs->writeable);
   tu_cs_reserve_space(cs, size);
   struct tu_fdm_bin_patchpoint patch = {
      .iova = tu_cs_get_cur_iova(cs),
      .size = size,
      .data = data,
      .apply = apply,
   };

   /* Apply the "default" setup where there is no scaling. This is used if
    * sysmem is required, and uses up the dwords that have been reserved.
    */
   unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
   VkExtent2D unscaled_frag_areas[num_views];
   for (unsigned i = 0; i < num_views; i++) {
      unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
   }
   apply(cmd, cs, state, (VkRect2D) {
      { 0, 0 },
      { MAX_VIEWPORT_SIZE, MAX_VIEWPORT_SIZE },
   }, num_views, unscaled_frag_areas);
   assert(tu_cs_get_cur_iova(cs) == patch.iova + patch.size * sizeof(uint32_t));

   util_dynarray_append(&cmd->fdm_bin_patchpoints,
                        struct tu_fdm_bin_patchpoint,
                        patch);
}

#define tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, state) \
   _tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, &state, sizeof(state))
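
/* A hedged sketch of how a patchpoint might be created; the callback and its
 * state struct are hypothetical and only for illustration. Note that apply()
 * must emit exactly 'size' dwords for every bin, since that space is reserved
 * up front and later rewritten per bin (see the assert above):
 *
 *    struct example_scissor_state {
 *       VkRect2D scissor;
 *    };
 *
 *    static void
 *    example_apply_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
 *                          void *data, VkRect2D bin, unsigned views,
 *                          VkExtent2D *frag_areas)
 *    {
 *       const struct example_scissor_state *state =
 *          (const struct example_scissor_state *) data;
 *       // ...scale state->scissor by frag_areas[0], clamp to bin, and emit
 *       // the scissor registers, exactly 'size' dwords...
 *    }
 *
 *    struct example_scissor_state state = { .scissor = scissor };
 *    tu_create_fdm_bin_patchpoint(cmd, cs, 4, example_apply_scissor, state);
 */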

VkResult tu_init_bin_preamble(struct tu_device *device);

#endif /* TU_CMD_BUFFER_H */