/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_CMD_BUFFER_H
#define TU_CMD_BUFFER_H

#include "tu_common.h"

#include "tu_cs.h"
#include "tu_descriptor_set.h"
#include "tu_device.h"
#include "tu_lrz.h"
#include "tu_pass.h"
#include "tu_pipeline.h"

enum tu_draw_state_group_id
{
   TU_DRAW_STATE_PROGRAM_CONFIG,
   TU_DRAW_STATE_VS,
   TU_DRAW_STATE_VS_BINNING,
   TU_DRAW_STATE_HS,
   TU_DRAW_STATE_DS,
   TU_DRAW_STATE_GS,
   TU_DRAW_STATE_GS_BINNING,
   TU_DRAW_STATE_VPC,
   TU_DRAW_STATE_FS,
   TU_DRAW_STATE_VB,
   TU_DRAW_STATE_CONST,
   TU_DRAW_STATE_DESC_SETS,
   TU_DRAW_STATE_DESC_SETS_LOAD,
   TU_DRAW_STATE_VS_PARAMS,
   TU_DRAW_STATE_FS_PARAMS,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
   TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
   TU_DRAW_STATE_PRIM_MODE_GMEM,
   TU_DRAW_STATE_PRIM_MODE_SYSMEM,

   /* dynamic state related draw states */
   TU_DRAW_STATE_DYNAMIC,
   TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};

struct tu_descriptor_state
{
   struct tu_descriptor_set *sets[MAX_SETS];
   struct tu_descriptor_set push_set;
   uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
   uint64_t set_iova[MAX_SETS];
   uint32_t max_sets_bound;
   uint32_t max_dynamic_offset_size;
};

enum tu_cmd_dirty_bits
{
   TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
   TU_CMD_DIRTY_DESC_SETS = BIT(1),
   TU_CMD_DIRTY_COMPUTE_DESC_SETS = BIT(2),
   TU_CMD_DIRTY_SHADER_CONSTS = BIT(3),
   TU_CMD_DIRTY_LRZ = BIT(4),
   TU_CMD_DIRTY_VS_PARAMS = BIT(5),
   TU_CMD_DIRTY_TESS_PARAMS = BIT(6),
   TU_CMD_DIRTY_SUBPASS = BIT(7),
   TU_CMD_DIRTY_FDM = BIT(8),
   TU_CMD_DIRTY_PER_VIEW_VIEWPORT = BIT(9),
   TU_CMD_DIRTY_TES = BIT(10),
   TU_CMD_DIRTY_PROGRAM = BIT(11),
   /* all draw states were disabled and need to be re-enabled: */
   TU_CMD_DIRTY_DRAW_STATE = BIT(12)
};

/* There are only three cache domains we have to care about: the CCU, or
 * color cache unit, which is used for color and depth/stencil attachments
 * and copy/blit destinations, and is split conceptually into color and depth,
 * and the universal cache or UCHE which is used for pretty much everything
 * else, except for the CP (uncached) and host. We need to flush whenever data
 * crosses these boundaries.
 */

enum tu_cmd_access_mask {
   TU_ACCESS_NONE = 0,
   TU_ACCESS_UCHE_READ = 1 << 0,
   TU_ACCESS_UCHE_WRITE = 1 << 1,
   TU_ACCESS_CCU_COLOR_READ = 1 << 2,
   TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
   TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
   TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,

   /* Experiments have shown that while it's safe to avoid flushing the CCU
    * after each blit/renderpass, it's not safe to assume that subsequent
    * lookups with a different attachment state will hit unflushed cache
    * entries. That is, the CCU needs to be flushed and possibly invalidated
    * when accessing memory with a different attachment state. Writing to an
    * attachment under the following conditions after clearing using the
    * normal 2d engine path is known to have issues:
    *
    * - It isn't the 0'th layer.
    * - There is more than one attachment, and this isn't the 0'th attachment
    *   (this also seems to depend on the cpp of the attachments).
    *
    * Our best guess is that the layer/MRT state is used when computing
    * the location of a cache entry in CCU, to avoid conflicts. We assume that
    * any access in a renderpass after or before an access by a transfer needs
    * a flush/invalidate, and use the _INCOHERENT variants to represent access
    * by a renderpass.
    */
   TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
   TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,

   /* Accesses which bypass any cache, e.g. writes via the host,
    * CP_EVENT_WRITE::BLIT, and the CP, are SYSMEM_WRITE.
    */
   TU_ACCESS_SYSMEM_READ = 1 << 10,
   TU_ACCESS_SYSMEM_WRITE = 1 << 11,

   /* Memory writes from the CP start in-order with draws and event writes,
    * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
    */
   TU_ACCESS_CP_WRITE = 1 << 12,

   /* Descriptors are read through UCHE but are also prefetched via
    * CP_LOAD_STATE6 and the prefetched descriptors need to be invalidated
    * when they change.
    */
   TU_ACCESS_BINDLESS_DESCRIPTOR_READ = 1 << 13,

   TU_ACCESS_READ =
      TU_ACCESS_UCHE_READ |
      TU_ACCESS_CCU_COLOR_READ |
      TU_ACCESS_CCU_DEPTH_READ |
      TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
      TU_ACCESS_SYSMEM_READ |
      TU_ACCESS_BINDLESS_DESCRIPTOR_READ,

   TU_ACCESS_WRITE =
      TU_ACCESS_UCHE_WRITE |
      TU_ACCESS_CCU_COLOR_WRITE |
      TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
      TU_ACCESS_CCU_DEPTH_WRITE |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
      TU_ACCESS_SYSMEM_WRITE |
      TU_ACCESS_CP_WRITE,

   TU_ACCESS_ALL =
      TU_ACCESS_READ |
      TU_ACCESS_WRITE,
};
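
/* A minimal illustration of how these bits are meant to be combined
 * (hypothetical variables, not the driver's actual barrier-translation
 * code): a color attachment written by draws inside a renderpass is
 * described with the incoherent variant, the same attachment written as a
 * 2D-engine blit/clear destination uses the plain CCU bit, and a host read
 * of the result is a sysmem access:
 *
 *    enum tu_cmd_access_mask rp_write   = TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
 *    enum tu_cmd_access_mask blit_write = TU_ACCESS_CCU_COLOR_WRITE;
 *    enum tu_cmd_access_mask host_read  = TU_ACCESS_SYSMEM_READ;
 */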

/* From the driver's point of view, we only need to distinguish between things
 * which won't start until a WFI is complete and things which additionally
 * need a WAIT_FOR_ME.
 *
 * TODO: This will get more complicated with concurrent binning.
 */
enum tu_stage {
   /* As a destination stage, this is for operations on the CP which don't
    * wait for pending WFIs to complete and therefore need a CP_WAIT_FOR_ME.
    * As a source stage, it is for things needing no waits.
    */
   TU_STAGE_CP,

   /* This is for most operations, which a WFI will wait on to finish and
    * which will not start until any pending WFIs have finished.
    */
   TU_STAGE_GPU,

   /* This is only used as a destination stage and is for things needing no
    * waits on the GPU (e.g. host operations).
    */
   TU_STAGE_BOTTOM,
};

enum tu_cmd_flush_bits {
   TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
   TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
   TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
   TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
   TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
   TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
   TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
   TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
   TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,
   TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE = 1 << 9,

   TU_CMD_FLAG_ALL_FLUSH =
      TU_CMD_FLAG_CCU_FLUSH_DEPTH |
      TU_CMD_FLAG_CCU_FLUSH_COLOR |
      TU_CMD_FLAG_CACHE_FLUSH |
      /* Treat the CP as a sort of "cache" which may need to be "flushed" via
       * waiting for writes to land with WAIT_MEM_WRITES.
       */
      TU_CMD_FLAG_WAIT_MEM_WRITES,

   TU_CMD_FLAG_ALL_INVALIDATE =
      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
      TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
      TU_CMD_FLAG_CACHE_INVALIDATE |
      TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE |
      /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when
       * a command that needs CP_WAIT_FOR_ME is executed. This means we may
       * insert an extra WAIT_FOR_ME before an indirect command requiring it
       * in case there was another command before the current command buffer
       * that it needs to wait for.
       */
      TU_CMD_FLAG_WAIT_FOR_ME,
};

/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
 * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
 * which part of the gmem is used by the CCU. Here we keep track of the
 * current state of the CCU.
 */
enum tu_cmd_ccu_state {
   TU_CMD_CCU_SYSMEM,
   TU_CMD_CCU_GMEM,
   TU_CMD_CCU_UNKNOWN,
};
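
/* Sketch of how this state is intended to be used (illustrative; the real
 * transitions are emitted by tu_emit_cache_flush_ccu(), declared below):
 *
 *    // e.g. before a blit that renders directly to sysmem:
 *    tu_emit_cache_flush_ccu<CHIP>(cmd, &cmd->cs, TU_CMD_CCU_SYSMEM);
 *
 * If cmd->state.ccu_state already matches the requested state this should be
 * cheap; otherwise it implies the flush/invalidate + WFI described above.
 * TU_CMD_CCU_UNKNOWN presumably covers the case where the state at execution
 * time isn't known, e.g. at the start of a command buffer.
 */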

struct tu_cache_state {
   /* Caches which must be made available (flushed) eventually if there are
    * any users outside that cache domain, and caches which must be
    * invalidated eventually if there are any reads.
    */
   BITMASK_ENUM(tu_cmd_flush_bits) pending_flush_bits;
   /* Flushes/invalidates which must be emitted at the next flush point */
   BITMASK_ENUM(tu_cmd_flush_bits) flush_bits;
};
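
/* A rough sketch of the intended protocol (illustrative only; the real logic
 * lives in the driver's access/flush helpers): a write first records what
 * would have to happen for other domains to see it, and only when a
 * conflicting access is recorded are those bits promoted into flush_bits and
 * eventually emitted:
 *
 *    // after a CCU color write:
 *    cache->pending_flush_bits |= TU_CMD_FLAG_CCU_FLUSH_COLOR |
 *                                 TU_CMD_FLAG_ALL_INVALIDATE;
 *    // later, when e.g. a UCHE read of the same data is recorded:
 *    cache->flush_bits |= cache->pending_flush_bits &
 *                         (TU_CMD_FLAG_CCU_FLUSH_COLOR |
 *                          TU_CMD_FLAG_CACHE_INVALIDATE);
 *    cache->pending_flush_bits &= ~cache->flush_bits;
 */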

struct tu_vs_params {
   uint32_t vertex_offset;
   uint32_t first_instance;
   uint32_t draw_id;
};

struct tu_tess_params {
   bool valid;
   enum a6xx_tess_output output_upper_left, output_lower_left;
   enum a6xx_tess_spacing spacing;
};

/* This should be for state that is set inside a renderpass and used at
 * renderpass end time, e.g. to decide whether to use sysmem. This needs
 * special handling for secondary cmdbufs and suspending/resuming render
 * passes where the state may need to be combined afterwards.
 */
struct tu_render_pass_state
{
   bool xfb_used;
   bool has_tess;
   bool has_prim_generated_query_in_rp;
   bool disable_gmem;
   bool sysmem_single_prim_mode;
   bool shared_viewport;

   /* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
   bool draw_cs_writes_to_cond_pred;

   uint32_t drawcall_count;

   /* A calculated "draw cost" value for renderpass, which tries to
    * estimate the bandwidth-per-sample of all the draws according
    * to:
    *
    *    foreach_draw (...) {
    *      sum += pipeline->color_bandwidth_per_sample;
    *      if (depth_test_enabled)
    *        sum += pipeline->depth_cpp_per_sample;
    *      if (depth_write_enabled)
    *        sum += pipeline->depth_cpp_per_sample;
    *      if (stencil_write_enabled)
    *        sum += pipeline->stencil_cpp_per_sample * 2;
    *    }
    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
    *
    * It allows us to estimate the total bandwidth of drawcalls later, by
    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
    *
    * This does ignore depth buffer traffic for samples which do not
    * pass due to depth-test fail, and some other details. But it is
    * just intended to be a rough estimate that is easy to calculate.
    */
   uint32_t drawcall_bandwidth_per_sample_sum;
};

/* These are the states of the suspend/resume state machine. In addition to
 * tracking whether we're in the middle of a chain of suspending and
 * resuming passes that will be merged, we need to track whether the
 * command buffer begins in the middle of such a chain, for when it gets
 * merged with other command buffers. We call such a chain that begins
 * before the command buffer starts a "pre-chain".
 *
 * Note that when this command buffer is finished, this state is untouched
 * but it gains a different meaning. For example, if we finish in state
 * SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
 * there's a suspend/resume chain that extends past the end of the command
 * buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
 * means that there's a suspend/resume chain that extends before the
 * beginning.
 */
enum tu_suspend_resume_state
{
   /* Either there are no suspend/resume chains, or they are entirely
    * contained in the current command buffer.
    *
    *   BeginCommandBuffer() <- start of current command buffer
    *       ...
    *       // we are here
    */
   SR_NONE = 0,

   /* We are in the middle of a suspend/resume chain that starts before the
    * current command buffer. This happens when the command buffer begins
    * with a resuming render pass and all of the passes up to the current
    * one are suspending. In this state, our part of the chain is not saved
    * and is in the current draw_cs/state.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       // we are here
    */
   SR_IN_PRE_CHAIN,

   /* We are currently outside of any suspend/resume chains, but there is a
    * chain starting before the current command buffer. It is saved in
    * pre_chain.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *       // This part is stashed in pre_chain
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       BeginRendering(resuming) ... EndRendering() // end of chain
    *       ...
    *       // we are here
    */
   SR_AFTER_PRE_CHAIN,

   /* We are in the middle of a suspend/resume chain and there is no chain
    * starting before the current command buffer.
    *
    *   BeginCommandBuffer() <- start of current command buffer
    *       ...
    *       BeginRendering() ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       // we are here
    */
   SR_IN_CHAIN,

   /* We are in the middle of a suspend/resume chain and there is another,
    * separate, chain starting before the current command buffer.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *       // This part is stashed in pre_chain
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       BeginRendering(resuming) ... EndRendering() // end of chain
    *       ...
    *       BeginRendering() ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       // we are here
    */
   SR_IN_CHAIN_AFTER_PRE_CHAIN,
};

struct tu_cmd_state
{
   uint32_t dirty;

   struct tu_shader *shaders[MESA_SHADER_STAGES];

   struct tu_program_state program;

   struct tu_render_pass_state rp;

   struct vk_render_pass_state vk_rp;
   struct vk_vertex_input_state vi;
   struct vk_sample_locations_state sl;

   struct tu_bandwidth bandwidth;

   /* Vertex buffers
    * the states for these can be updated partially, so we need to save these
    * to be able to emit a complete draw state
    */
   struct {
      uint64_t base;
      uint32_t size;
   } vb[MAX_VBS];

   uint32_t max_vbs_bound;

   bool per_view_viewport;

   /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
   struct tu_draw_state vertex_buffers;
   struct tu_draw_state shader_const;
   struct tu_draw_state desc_sets;
   struct tu_draw_state load_state;
   struct tu_draw_state compute_load_state;
   struct tu_draw_state prim_order_sysmem, prim_order_gmem;

   struct tu_draw_state vs_params;
   struct tu_draw_state fs_params;

   /* Index buffer */
   uint64_t index_va;
   uint32_t max_index_count;
   uint8_t index_size;

   /* because streamout base has to be 32-byte aligned
    * there is an extra offset to deal with when it is
    * unaligned
    */
   uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];

   /* Renderpasses are tricky, because we may need to flush differently if
    * using sysmem vs. gmem and therefore we have to delay any flushing that
    * happens before a renderpass. So we have to have two copies of the flush
    * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
    * and one for outside a renderpass.
    */
   struct tu_cache_state cache;
   struct tu_cache_state renderpass_cache;

   enum tu_cmd_ccu_state ccu_state;

   /* Decides which GMEM layout to use from the tu_pass, based on whether the CCU
    * might get used by tu_store_gmem_attachment().
    */
   enum tu_gmem_layout gmem_layout;

   const struct tu_render_pass *pass;
   const struct tu_subpass *subpass;
   const struct tu_framebuffer *framebuffer;
   const struct tu_tiling_config *tiling;
   VkRect2D render_area;

   const struct tu_image_view **attachments;
   VkClearValue *clear_values;

   /* State that in the dynamic case comes from VkRenderingInfo and needs to
    * be saved/restored when suspending. This holds the state for the last
    * suspended renderpass, which may point to this command buffer's dynamic_*
    * or another command buffer if executed on a secondary.
    */
   struct {
      const struct tu_render_pass *pass;
      const struct tu_subpass *subpass;
      const struct tu_framebuffer *framebuffer;
      VkRect2D render_area;
      enum tu_gmem_layout gmem_layout;

      const struct tu_image_view **attachments;

      struct tu_lrz_state lrz;
   } suspended_pass;

   bool tessfactor_addr_set;
   bool predication_active;
   bool msaa_disable;
   bool blend_reads_dest;
   bool stencil_front_write;
   bool stencil_back_write;
   bool pipeline_feedback_loop_ds;

   bool pipeline_blend_lrz, pipeline_bandwidth;
   uint32_t pipeline_draw_states;

   /* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
    * VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
    * but they use the same {START,STOP}_PRIMITIVE_CTRS control.
    */
   uint32_t prim_counters_running;

   bool prim_generated_query_running_before_rp;

   enum tu_suspend_resume_state suspend_resume;

   bool suspending, resuming;

   struct tu_lrz_state lrz;

   struct tu_draw_state lrz_and_depth_plane_state;

   struct tu_vs_params last_vs_params;

   struct tu_tess_params tess_params;

   uint64_t descriptor_buffer_iova[MAX_SETS];
};

struct tu_cmd_buffer
{
   struct vk_command_buffer vk;

   struct tu_device *device;

   struct u_trace trace;
   struct u_trace_iterator trace_renderpass_start;
   struct u_trace_iterator trace_renderpass_end;

   struct list_head renderpass_autotune_results;
   struct tu_autotune_results_buffer* autotune_buffer;

   void *patchpoints_ctx;
   struct util_dynarray fdm_bin_patchpoints;

   VkCommandBufferUsageFlags usage_flags;

   VkQueryPipelineStatisticFlags inherited_pipeline_statistics;

   struct tu_cmd_state state;
   uint32_t queue_family_index;

   uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
   VkShaderStageFlags push_constant_stages;
   struct tu_descriptor_set meta_push_descriptors;

   struct tu_descriptor_state descriptors[MAX_BIND_POINTS];

   struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1) + 1];
   struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
   struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
   const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1) + 1];
   VkClearValue dynamic_clear_values[2 * (MAX_RTS + 1)];

   struct tu_render_pass dynamic_pass;
   struct tu_subpass dynamic_subpass;
   struct tu_framebuffer dynamic_framebuffer;

   struct tu_cs cs;
   struct tu_cs draw_cs;
   struct tu_cs tile_store_cs;
   struct tu_cs draw_epilogue_cs;
   struct tu_cs sub_cs;

   /* If the first render pass in the command buffer is resuming, then it is
    * part of a suspend/resume chain that starts before the current command
    * buffer and needs to be merged later. In this case, its incomplete state
    * is stored in pre_chain. In the symmetric case where the last render pass
    * is suspending, we just skip ending the render pass and its state is
    * stored in draw_cs/the current state. The first and last render pass
    * might be part of different chains, which is why all the state may need
    * to be saved separately here.
    */
   struct {
      struct tu_cs draw_cs;
      struct tu_cs draw_epilogue_cs;

      struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;

      struct tu_render_pass_state state;

      struct util_dynarray fdm_bin_patchpoints;
      void *patchpoints_ctx;
   } pre_chain;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   bool vsc_initialized;
};
VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
                       VK_OBJECT_TYPE_COMMAND_BUFFER)

extern const struct vk_command_buffer_ops tu_cmd_buffer_ops;

static inline uint32_t
tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd,
                          const struct tu_render_pass_attachment *att,
                          uint32_t layer)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset[cmd->state.gmem_layout] +
      layer * cmd->state.tiling->tile0.width * cmd->state.tiling->tile0.height *
      att->cpp;
}

static inline uint32_t
tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd,
                                  const struct tu_render_pass_attachment *att,
                                  uint32_t layer)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset_stencil[cmd->state.gmem_layout] +
      layer * cmd->state.tiling->tile0.width * cmd->state.tiling->tile0.height;
}

void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
                                const struct tu_render_pass_state *src);

VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
                             const VkCommandBufferBeginInfo *pBeginInfo);

template <chip CHIP>
void
tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer);

template <chip CHIP>
void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer);

template <chip CHIP>
void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                             struct tu_cs *cs,
                             enum tu_cmd_ccu_state ccu_state);

void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
                    struct tu_cmd_buffer *secondary);

void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
                         struct tu_cmd_buffer *secondary);

void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
                     struct tu_cmd_buffer *secondary);

void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
                          struct tu_cmd_buffer *suspended);

template <chip CHIP>
void tu_cmd_render(struct tu_cmd_buffer *cmd);

enum fd_gpu_event : uint32_t;

template <chip CHIP>
void
tu_emit_event_write(struct tu_cmd_buffer *cmd,
                    struct tu_cs *cs,
                    enum fd_gpu_event event);

static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
                         VkPipelineBindPoint bind_point)
{
   return &cmd_buffer->descriptors[bind_point];
}

void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
                   bool msaa_disable);

void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);

void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);

void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

void tu6_apply_depth_bounds_workaround(struct tu_device *device,
                                       uint32_t *rb_depth_cntl);

void
update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);

typedef void (*tu_fdm_bin_apply_t)(struct tu_cmd_buffer *cmd,
                                   struct tu_cs *cs,
                                   void *data,
                                   VkRect2D bin,
                                   unsigned views,
                                   VkExtent2D *frag_areas);

struct tu_fdm_bin_patchpoint {
   uint64_t iova;
   uint32_t size;
   void *data;
   tu_fdm_bin_apply_t apply;
};

static inline void
_tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs,
                              unsigned size,
                              tu_fdm_bin_apply_t apply,
                              void *state,
                              unsigned state_size)
{
   void *data = ralloc_size(cmd->patchpoints_ctx, state_size);
   memcpy(data, state, state_size);
   assert(cs->writeable);
   tu_cs_reserve_space(cs, size);
   struct tu_fdm_bin_patchpoint patch = {
      .iova = tu_cs_get_cur_iova(cs),
      .size = size,
      .data = data,
      .apply = apply,
   };

   /* Apply the "default" setup where there is no scaling. This is used if
    * sysmem is required, and uses up the dwords that have been reserved.
    */
   unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
   VkExtent2D unscaled_frag_areas[num_views];
   for (unsigned i = 0; i < num_views; i++) {
      unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
   }
   apply(cmd, cs, state, (VkRect2D) {
      { 0, 0 },
      { MAX_VIEWPORT_SIZE, MAX_VIEWPORT_SIZE },
   }, num_views, unscaled_frag_areas);
   assert(tu_cs_get_cur_iova(cs) == patch.iova + patch.size * sizeof(uint32_t));

   util_dynarray_append(&cmd->fdm_bin_patchpoints,
                        struct tu_fdm_bin_patchpoint,
                        patch);
}

#define tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, state) \
   _tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, &state, sizeof(state))
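
/* Illustrative usage (the callback, state struct, and sizes below are
 * hypothetical, not part of the driver): record a patchpoint whose contents
 * are re-emitted per bin with that bin's fragment areas applied. Note that
 * the apply callback must emit exactly `size` dwords, as checked by the
 * assert in _tu_create_fdm_bin_patchpoint().
 *
 *    struct apply_state { uint32_t foo; };
 *
 *    static void
 *    apply_example(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
 *                  VkRect2D bin, unsigned views, VkExtent2D *frag_areas)
 *    {
 *       const struct apply_state *state = (const struct apply_state *) data;
 *       // emit exactly `size` dwords here, scaled by frag_areas[0..views-1]
 *       // for the given bin
 *    }
 *
 *    struct apply_state state = { .foo = 42 };
 *    tu_create_fdm_bin_patchpoint(cmd, cs, 4, apply_example, state);
 */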

#endif /* TU_CMD_BUFFER_H */