1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  * SPDX-License-Identifier: MIT
5  *
6  * based in part on anv driver which is:
7  * Copyright © 2015 Intel Corporation
8  */
9 
10 #include "tu_cmd_buffer.h"
11 
12 #include "vk_common_entrypoints.h"
13 #include "vk_render_pass.h"
14 #include "vk_util.h"
15 
16 #include "tu_buffer.h"
17 #include "tu_clear_blit.h"
18 #include "tu_cs.h"
19 #include "tu_event.h"
20 #include "tu_image.h"
21 #include "tu_tracepoints.h"
22 
23 #include "common/freedreno_gpu_event.h"
24 #include "common/freedreno_lrz.h"
25 
26 static void
27 tu_clone_trace_range(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
28                      struct u_trace_iterator begin, struct u_trace_iterator end)
29 {
30    if (u_trace_iterator_equal(begin, end))
31       return;
32 
33    tu_cs_emit_wfi(cs);
34    tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
35    u_trace_clone_append(begin, end, &cmd->trace, cs, tu_copy_buffer);
36 }
37 
38 static void
39 tu_clone_trace(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
40                struct u_trace *trace)
41 {
42    tu_clone_trace_range(cmd, cs, u_trace_begin_iterator(trace),
43          u_trace_end_iterator(trace));
44 }
45 
46 template <chip CHIP>
47 void
48 tu_emit_raw_event_write(struct tu_cmd_buffer *cmd,
49                         struct tu_cs *cs,
50                         enum vgt_event_type event,
51                         bool needs_seqno)
52 {
53    if (CHIP == A6XX) {
54       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, needs_seqno ? 4 : 1);
55       tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event));
56    } else {
57       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, needs_seqno ? 4 : 1);
58       tu_cs_emit(cs,
59          CP_EVENT_WRITE7_0(.event = event,
60                            .write_src = EV_WRITE_USER_32B,
61                            .write_dst = EV_DST_RAM,
62                            .write_enabled = needs_seqno).value);
63    }
64 
65    if (needs_seqno) {
66       tu_cs_emit_qw(cs, global_iova(cmd, seqno_dummy));
67       tu_cs_emit(cs, 0);
68    }
69 }
70 TU_GENX(tu_emit_raw_event_write);
71 
72 template <chip CHIP>
73 void
74 tu_emit_event_write(struct tu_cmd_buffer *cmd,
75                     struct tu_cs *cs,
76                     enum fd_gpu_event event)
77 {
78    struct fd_gpu_event_info event_info = fd_gpu_events<CHIP>[event];
79    tu_emit_raw_event_write<CHIP>(cmd, cs, event_info.raw_event,
80                                  event_info.needs_seqno);
81 }
82 TU_GENX(tu_emit_event_write);
83 
84 /* Emits the tessfactor address to the top-level CS if it hasn't been already.
85  * Updating this register requires a WFI if outstanding drawing is using it, but
86  * tu6_init_hardware() will have WFIed before we started and no other draws
87  * could be using the tessfactor address yet since we only emit one per cmdbuf.
88  */
89 template <chip CHIP>
90 static void
91 tu6_lazy_emit_tessfactor_addr(struct tu_cmd_buffer *cmd)
92 {
93    if (cmd->state.tessfactor_addr_set)
94       return;
95 
96    tu_cs_emit_regs(&cmd->cs, PC_TESSFACTOR_ADDR(CHIP, .qword = cmd->device->tess_bo->iova));
97    /* Updating PC_TESSFACTOR_ADDR could race with the next draw which uses it. */
98    cmd->state.cache.flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
99    cmd->state.tessfactor_addr_set = true;
100 }
101 
102 static void
103 tu6_lazy_init_vsc(struct tu_cmd_buffer *cmd)
104 {
105    struct tu_device *dev = cmd->device;
106    uint32_t num_vsc_pipes = dev->physical_device->info->num_vsc_pipes;
107 
108    /* VSC buffers:
109     * Use the VSC pitches from the largest values used so far with this
110     * device. If there hasn't been an overflow, a scratch BO will already
111     * have been allocated for these sizes.
112     *
113     * If an overflow is detected, the stream size is increased by 2x.
114     */
115    mtx_lock(&dev->mutex);
116 
117    struct tu6_global *global = dev->global_bo_map;
118 
119    uint32_t vsc_draw_overflow = global->vsc_draw_overflow;
120    uint32_t vsc_prim_overflow = global->vsc_prim_overflow;
121 
122    if (vsc_draw_overflow >= dev->vsc_draw_strm_pitch)
123       dev->vsc_draw_strm_pitch = (dev->vsc_draw_strm_pitch - VSC_PAD) * 2 + VSC_PAD;
124 
125    if (vsc_prim_overflow >= dev->vsc_prim_strm_pitch)
126       dev->vsc_prim_strm_pitch = (dev->vsc_prim_strm_pitch - VSC_PAD) * 2 + VSC_PAD;
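   /* For illustration (assumed pitch values): with VSC_PAD = 0x40 and a draw
    * stream pitch of 0x440, one overflow bumps the pitch to
    * (0x440 - 0x40) * 2 + 0x40 = 0x840, i.e. the usable per-pipe stream space
    * doubles while the pad stays constant.
    */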
127 
128    cmd->vsc_prim_strm_pitch = dev->vsc_prim_strm_pitch;
129    cmd->vsc_draw_strm_pitch = dev->vsc_draw_strm_pitch;
130 
131    mtx_unlock(&dev->mutex);
132 
133    struct tu_bo *vsc_bo;
134    uint32_t size0 = cmd->vsc_prim_strm_pitch * num_vsc_pipes +
135                     cmd->vsc_draw_strm_pitch * num_vsc_pipes;
136 
137    tu_get_scratch_bo(dev, size0 + num_vsc_pipes * 4, &vsc_bo);
138 
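   /* Layout of the scratch BO, as implied by the offsets below: the prim
    * streams for all pipes come first, then the draw streams, and finally
    * num_vsc_pipes 32-bit words holding the per-pipe draw stream sizes.
    */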
139    cmd->vsc_draw_strm_va = vsc_bo->iova + cmd->vsc_prim_strm_pitch * num_vsc_pipes;
140    cmd->vsc_draw_strm_size_va = vsc_bo->iova + size0;
141    cmd->vsc_prim_strm_va = vsc_bo->iova;
142 }
143 
144 template <chip CHIP>
145 static void
146 tu_emit_vsc(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
147 {
148    if (CHIP == A6XX) {
149       tu_cs_emit_regs(cs,
150                      A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.qword = cmd->vsc_draw_strm_size_va));
151       tu_cs_emit_regs(cs,
152                      A6XX_VSC_PRIM_STRM_ADDRESS(.qword = cmd->vsc_prim_strm_va));
153       tu_cs_emit_regs(
154          cs, A6XX_VSC_DRAW_STRM_ADDRESS(.qword = cmd->vsc_draw_strm_va));
155    } else {
156       tu_cs_emit_pkt7(cs, CP_SET_PSEUDO_REG, 3 * 3);
157       tu_cs_emit(cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG(DRAW_STRM_ADDRESS));
158       tu_cs_emit_qw(cs, cmd->vsc_draw_strm_va);
159       tu_cs_emit(cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG(DRAW_STRM_SIZE_ADDRESS));
160       tu_cs_emit_qw(cs, cmd->vsc_draw_strm_size_va);
161       tu_cs_emit(cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG(PRIM_STRM_ADDRESS));
162       tu_cs_emit_qw(cs, cmd->vsc_prim_strm_va);
163    }
164 
165    cmd->vsc_initialized = true;
166 }
167 
168 template <chip CHIP>
169 static void
170 tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
171                  struct tu_cs *cs,
172                  struct tu_cache_state *cache)
173 {
174    BITMASK_ENUM(tu_cmd_flush_bits) flushes = cache->flush_bits;
175    cache->flush_bits = 0;
176 
177    if (TU_DEBUG(FLUSHALL))
178       flushes |= TU_CMD_FLAG_ALL_CLEAN | TU_CMD_FLAG_ALL_INVALIDATE;
179 
180    if (TU_DEBUG(SYNCDRAW))
181       flushes |= TU_CMD_FLAG_WAIT_MEM_WRITES |
182                  TU_CMD_FLAG_WAIT_FOR_IDLE |
183                  TU_CMD_FLAG_WAIT_FOR_ME;
184 
185    /* Experiments show that invalidating CCU while it still has data in it
186     * doesn't work, so make sure to always flush before invalidating in case
187     * any data remains that hasn't yet been made available through a barrier.
188     * However it does seem to work for UCHE.
189     */
190    if (flushes & (TU_CMD_FLAG_CCU_CLEAN_COLOR |
191                   TU_CMD_FLAG_CCU_INVALIDATE_COLOR))
192       tu_emit_event_write<CHIP>(cmd_buffer, cs, FD_CCU_CLEAN_COLOR);
193    if (flushes & (TU_CMD_FLAG_CCU_CLEAN_DEPTH |
194                   TU_CMD_FLAG_CCU_INVALIDATE_DEPTH))
195       tu_emit_event_write<CHIP>(cmd_buffer, cs, FD_CCU_CLEAN_DEPTH);
196    if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_COLOR)
197       tu_emit_event_write<CHIP>(cmd_buffer, cs, FD_CCU_INVALIDATE_COLOR);
198    if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_DEPTH)
199       tu_emit_event_write<CHIP>(cmd_buffer, cs, FD_CCU_INVALIDATE_DEPTH);
200    if (flushes & TU_CMD_FLAG_CACHE_CLEAN)
201       tu_emit_event_write<CHIP>(cmd_buffer, cs, FD_CACHE_CLEAN);
202    if (flushes & TU_CMD_FLAG_CACHE_INVALIDATE)
203       tu_emit_event_write<CHIP>(cmd_buffer, cs, FD_CACHE_INVALIDATE);
204    if (flushes & TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE) {
205       tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
206             .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
207             .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,
208       ));
209    }
210    if (CHIP >= A7XX && flushes & TU_CMD_FLAG_BLIT_CACHE_CLEAN)
211       /* On A7XX, blit cache flushes are required to ensure blit writes are visible
212        * via UCHE. This isn't necessary on A6XX, where all writes should be visible implicitly.
213        */
214       tu_emit_event_write<CHIP>(cmd_buffer, cs, FD_CCU_CLEAN_BLIT_CACHE);
215    if (CHIP >= A7XX && (flushes & TU_CMD_FLAG_CCHE_INVALIDATE) &&
216        /* Invalidating UCHE seems to also invalidate CCHE */
217        !(flushes & TU_CMD_FLAG_CACHE_INVALIDATE))
218       tu_cs_emit_pkt7(cs, CP_CCHE_INVALIDATE, 0);
219    if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES)
220       tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
221    if (flushes & TU_CMD_FLAG_WAIT_FOR_IDLE)
222       tu_cs_emit_wfi(cs);
223    if (flushes & TU_CMD_FLAG_WAIT_FOR_ME)
224       tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
225 }
226 
227 /* "Normal" cache flushes outside the renderpass, that don't require any special handling */
228 template <chip CHIP>
229 void
230 tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer)
231 {
232    tu6_emit_flushes<CHIP>(cmd_buffer, &cmd_buffer->cs, &cmd_buffer->state.cache);
233 }
234 TU_GENX(tu_emit_cache_flush);
235 
236 /* Renderpass cache flushes inside the draw_cs */
237 template <chip CHIP>
238 void
239 tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer)
240 {
241    if (!cmd_buffer->state.renderpass_cache.flush_bits &&
242        likely(!tu_env.debug))
243       return;
244    tu6_emit_flushes<CHIP>(cmd_buffer, &cmd_buffer->draw_cs,
245                     &cmd_buffer->state.renderpass_cache);
246    if (cmd_buffer->state.renderpass_cache.flush_bits &
247        TU_CMD_FLAG_BLIT_CACHE_CLEAN) {
248       cmd_buffer->state.blit_cache_cleaned = true;
249    }
250 }
251 TU_GENX(tu_emit_cache_flush_renderpass);
252 
253 template <chip CHIP>
254 static void
255 emit_rb_ccu_cntl(struct tu_cs *cs, struct tu_device *dev, bool gmem)
256 {
257    /* The CCUs are a cache that allocates memory from GMEM while facilitating
258     * framebuffer caching for sysmem rendering. The CCU is split into two parts,
259     * one for color and one for depth. The size and offset of these in GMEM can
260     * be configured separately.
261     *
262     * The most common configuration for the CCU is to occupy as much as possible
263     * of GMEM (CACHE_SIZE_FULL) during sysmem rendering as GMEM is unused. On
264     * the other hand, when rendering to GMEM, the CCUs can be left enabled at
265     * any configuration as they don't interfere with GMEM rendering and only
266     * overwrite GMEM when sysmem operations are performed.
267     *
268     * The vast majority of GMEM rendering doesn't need any sysmem operations
269     * but there are some cases where it is required. For example, when the
270     * framebuffer isn't aligned to the tile size or with certain MSAA resolves.
271     *
272     * To correctly handle these cases, we need to be able to switch between
273     * sysmem and GMEM rendering. We do this by allocating a carveout at the
274     * end of GMEM for the color CCU (as none of these operations are depth)
275     * which the color CCU offset is set to and the GMEM size available to the
276     * GMEM layout calculations is adjusted accordingly.
277     */
278    uint32_t color_offset = gmem ? dev->physical_device->ccu_offset_gmem
279                                 : dev->physical_device->ccu_offset_bypass;
280 
281    uint32_t color_offset_hi = color_offset >> 21;
282    color_offset &= 0x1fffff;
283 
284    uint32_t depth_offset = gmem ? 0
285                                 : dev->physical_device->ccu_depth_offset_bypass;
286 
287    uint32_t depth_offset_hi = depth_offset >> 21;
288    depth_offset &= 0x1fffff;
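   /* The CCU offsets are split across two register fields: bits [20:0] stay
    * in the low field and the rest go into the *_hi field. For example
    * (assumed value), an offset of 0x230000 is emitted as offset = 0x30000
    * with offset_hi = 1.
    */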
289 
290    enum a6xx_ccu_cache_size color_cache_size = !gmem ? CCU_CACHE_SIZE_FULL :
291       (a6xx_ccu_cache_size)(dev->physical_device->info->a6xx.gmem_ccu_color_cache_fraction);
292 
293    if (CHIP == A7XX) {
294       tu_cs_emit_regs(cs, A7XX_RB_CCU_CNTL2(
295          .depth_offset_hi = depth_offset_hi,
296          .color_offset_hi = color_offset_hi,
297          .depth_cache_size = CCU_CACHE_SIZE_FULL,
298          .depth_offset = depth_offset,
299          .color_cache_size = color_cache_size,
300          .color_offset = color_offset
301       ));
302 
303       if (dev->physical_device->info->a7xx.has_gmem_vpc_attr_buf) {
304          tu_cs_emit_regs(cs,
305             A7XX_VPC_ATTR_BUF_SIZE_GMEM(
306                   .size_gmem =
307                      gmem ? dev->physical_device->vpc_attr_buf_size_gmem
308                           : dev->physical_device->vpc_attr_buf_size_bypass),
309             A7XX_VPC_ATTR_BUF_BASE_GMEM(
310                   .base_gmem =
311                      gmem ? dev->physical_device->vpc_attr_buf_offset_gmem
312                           : dev->physical_device->vpc_attr_buf_offset_bypass), );
313          tu_cs_emit_regs(cs,
314             A7XX_PC_ATTR_BUF_SIZE_GMEM(
315                   .size_gmem =
316                      gmem ? dev->physical_device->vpc_attr_buf_size_gmem
317                           : dev->physical_device->vpc_attr_buf_size_bypass), );
318       }
319    } else {
320       tu_cs_emit_regs(cs, RB_CCU_CNTL(CHIP,
321          .gmem_fast_clear_disable =
322             !dev->physical_device->info->a6xx.has_gmem_fast_clear,
323          .concurrent_resolve =
324             dev->physical_device->info->a6xx.concurrent_resolve,
325          .depth_offset_hi = 0,
326          .color_offset_hi = color_offset_hi,
327          .depth_cache_size = CCU_CACHE_SIZE_FULL,
328          .depth_offset = 0,
329          .color_cache_size = color_cache_size,
330          .color_offset = color_offset
331       ));
332    }
333 }
334 
335 /* Cache flushes for things that use the color/depth read/write path (i.e.
336  * blits and draws). This deals with changing CCU state as well as the usual
337  * cache flushing.
338  */
339 template <chip CHIP>
340 void
341 tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
342                         struct tu_cs *cs,
343                         enum tu_cmd_ccu_state ccu_state)
344 {
345    assert(ccu_state != TU_CMD_CCU_UNKNOWN);
346    /* It's unsafe to flush inside condition because we clear flush_bits */
347    assert(!cs->cond_stack_depth);
348 
349    /* Changing CCU state must involve invalidating the CCU. In sysmem mode,
350     * the CCU may also contain data that we haven't flushed out yet, so we
351     * also need to flush. Also, in order to program RB_CCU_CNTL, we need to
352     * emit a WFI as it isn't pipelined.
353     *
354     * Note: On A7XX, with the introduction of RB_CCU_CNTL2, we no longer need
355     * to emit a WFI when changing a subset of CCU state.
356     */
357    if (ccu_state != cmd_buffer->state.ccu_state) {
358       if (cmd_buffer->state.ccu_state != TU_CMD_CCU_GMEM) {
359          cmd_buffer->state.cache.flush_bits |=
360             TU_CMD_FLAG_CCU_CLEAN_COLOR |
361             TU_CMD_FLAG_CCU_CLEAN_DEPTH;
362          cmd_buffer->state.cache.pending_flush_bits &= ~(
363             TU_CMD_FLAG_CCU_CLEAN_COLOR |
364             TU_CMD_FLAG_CCU_CLEAN_DEPTH);
365       }
366       cmd_buffer->state.cache.flush_bits |=
367          TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
368          TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
369          (CHIP == A6XX ? TU_CMD_FLAG_WAIT_FOR_IDLE : 0);
370       cmd_buffer->state.cache.pending_flush_bits &= ~(
371          TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
372          TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
373          (CHIP == A6XX ? TU_CMD_FLAG_WAIT_FOR_IDLE : 0));
374    }
375 
376    tu6_emit_flushes<CHIP>(cmd_buffer, cs, &cmd_buffer->state.cache);
377 
378    if (ccu_state != cmd_buffer->state.ccu_state) {
379       emit_rb_ccu_cntl<CHIP>(cs, cmd_buffer->device,
380                              ccu_state == TU_CMD_CCU_GMEM);
381       cmd_buffer->state.ccu_state = ccu_state;
382    }
383 }
384 TU_GENX(tu_emit_cache_flush_ccu);
385 
386 template <chip CHIP>
387 static void
388 tu6_emit_zs(struct tu_cmd_buffer *cmd,
389             const struct tu_subpass *subpass,
390             struct tu_cs *cs)
391 {
392    const uint32_t a = subpass->depth_stencil_attachment.attachment;
393    if (a == VK_ATTACHMENT_UNUSED) {
394       tu_cs_emit_regs(cs,
395                       RB_DEPTH_BUFFER_INFO(CHIP, .depth_format = DEPTH6_NONE),
396                       A6XX_RB_DEPTH_BUFFER_PITCH(0),
397                       A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0),
398                       A6XX_RB_DEPTH_BUFFER_BASE(0),
399                       A6XX_RB_DEPTH_BUFFER_BASE_GMEM(0));
400 
401       tu_cs_emit_regs(cs,
402                       A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));
403 
404       tu_cs_emit_regs(cs, RB_STENCIL_INFO(CHIP, 0));
405 
406       return;
407    }
408 
409    const struct tu_image_view *iview = cmd->state.attachments[a];
410    const struct tu_render_pass_attachment *attachment =
411       &cmd->state.pass->attachments[a];
412    enum a6xx_depth_format fmt = tu6_pipe2depth(attachment->format);
413 
414    tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6);
415    tu_cs_emit(cs, RB_DEPTH_BUFFER_INFO(CHIP,
416                      .depth_format = fmt,
417                      .tilemode = TILE6_3,
418                      .losslesscompen = iview->view.ubwc_enabled,
419                      ).value);
420    if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT)
421       tu_cs_image_depth_ref(cs, iview, 0);
422    else
423       tu_cs_image_ref(cs, &iview->view, 0);
424    tu_cs_emit(cs, tu_attachment_gmem_offset(cmd, attachment, 0));
425 
426    tu_cs_emit_regs(cs,
427                    A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));
428 
429    tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE, 3);
430    tu_cs_image_flag_ref(cs, &iview->view, 0);
431 
432    if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
433        attachment->format == VK_FORMAT_S8_UINT) {
434 
435       tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_INFO, 6);
436       tu_cs_emit(cs, RB_STENCIL_INFO(CHIP,
437                         .separate_stencil = true,
438                         .tilemode = TILE6_3,
439                         ).value);
440       if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
441          tu_cs_image_stencil_ref(cs, iview, 0);
442          tu_cs_emit(cs, tu_attachment_gmem_offset_stencil(cmd, attachment, 0));
443       } else {
444          tu_cs_image_ref(cs, &iview->view, 0);
445          tu_cs_emit(cs, tu_attachment_gmem_offset(cmd, attachment, 0));
446       }
447    } else {
448       tu_cs_emit_regs(cs,
449                      RB_STENCIL_INFO(CHIP, 0));
450    }
451 }
452 
453 template <chip CHIP>
454 static void
455 tu6_emit_mrt(struct tu_cmd_buffer *cmd,
456              const struct tu_subpass *subpass,
457              struct tu_cs *cs)
458 {
459    const struct tu_framebuffer *fb = cmd->state.framebuffer;
460 
461    enum a6xx_format mrt0_format = FMT6_NONE;
462 
463    uint32_t written = 0;
464    for (uint32_t i = 0; i < subpass->color_count; ++i) {
465       uint32_t a = subpass->color_attachments[i].attachment;
466       unsigned remapped = cmd->vk.dynamic_graphics_state.cal.color_map[i];
467       if (a == VK_ATTACHMENT_UNUSED ||
468           remapped == MESA_VK_ATTACHMENT_UNUSED)
469          continue;
470 
471       const struct tu_image_view *iview = cmd->state.attachments[a];
472 
473       tu_cs_emit_regs(cs,
474          RB_MRT_BUF_INFO(CHIP, remapped, .dword = iview->view.RB_MRT_BUF_INFO),
475          A6XX_RB_MRT_PITCH(remapped, iview->view.pitch),
476          A6XX_RB_MRT_ARRAY_PITCH(remapped, iview->view.layer_size),
477          A6XX_RB_MRT_BASE(remapped, .qword = tu_layer_address(&iview->view, 0)),
478          A6XX_RB_MRT_BASE_GMEM(remapped,
479             tu_attachment_gmem_offset(cmd, &cmd->state.pass->attachments[a], 0)
480          ),
481       );
482 
483       tu_cs_emit_regs(cs,
484                       A6XX_SP_FS_MRT_REG(remapped, .dword = iview->view.SP_FS_MRT_REG));
485 
486       tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER_ADDR(remapped), 3);
487       tu_cs_image_flag_ref(cs, &iview->view, 0);
488 
489       if (remapped == 0)
490          mrt0_format = (enum a6xx_format) (iview->view.SP_FS_MRT_REG & 0xff);
491 
492       written |= 1u << remapped;
493    }
494 
495    u_foreach_bit (i, ~written) {
496       if (i >= subpass->color_count)
497          break;
498 
499       /* From the VkPipelineRenderingCreateInfo definition:
500        *
501        *    Valid formats indicate that an attachment can be used - but it
502        *    is still valid to set the attachment to NULL when beginning
503        *    rendering.
504        *
505        * This means that with dynamic rendering, pipelines may write to
506        * some attachments that are UNUSED here. Setting the format to 0
507        * here should prevent them from writing to anything. This also seems
508        * to be required for alpha-to-coverage which can use the alpha
509        * value for an otherwise-unused attachment.
510        */
511        tu_cs_emit_regs(cs,
512          RB_MRT_BUF_INFO(CHIP, i),
513          A6XX_RB_MRT_PITCH(i),
514          A6XX_RB_MRT_ARRAY_PITCH(i),
515          A6XX_RB_MRT_BASE(i),
516          A6XX_RB_MRT_BASE_GMEM(i),
517        );
518 
519        tu_cs_emit_regs(cs,
520                        A6XX_SP_FS_MRT_REG(i, .dword = 0));
521    }
522 
523    tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = mrt0_format));
524 
525    const bool dither = subpass->legacy_dithering_enabled;
526    const uint32_t dither_cntl =
527       A6XX_RB_DITHER_CNTL(
528             .dither_mode_mrt0 = dither ? DITHER_ALWAYS : DITHER_DISABLE,
529             .dither_mode_mrt1 = dither ? DITHER_ALWAYS : DITHER_DISABLE,
530             .dither_mode_mrt2 = dither ? DITHER_ALWAYS : DITHER_DISABLE,
531             .dither_mode_mrt3 = dither ? DITHER_ALWAYS : DITHER_DISABLE,
532             .dither_mode_mrt4 = dither ? DITHER_ALWAYS : DITHER_DISABLE,
533             .dither_mode_mrt5 = dither ? DITHER_ALWAYS : DITHER_DISABLE,
534             .dither_mode_mrt6 = dither ? DITHER_ALWAYS : DITHER_DISABLE,
535             .dither_mode_mrt7 = dither ? DITHER_ALWAYS : DITHER_DISABLE, )
536          .value;
537    tu_cs_emit_regs(cs, A6XX_RB_DITHER_CNTL(.dword = dither_cntl));
538    if (CHIP >= A7XX) {
539       tu_cs_emit_regs(cs, A7XX_SP_DITHER_CNTL(.dword = dither_cntl));
540    }
541 
542    tu_cs_emit_regs(cs,
543                    A6XX_RB_SRGB_CNTL(.dword = subpass->srgb_cntl));
544    tu_cs_emit_regs(cs,
545                    A6XX_SP_SRGB_CNTL(.dword = subpass->srgb_cntl));
546 
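   /* The max layer index must cover both the framebuffer layer count and the
    * highest view index in the multiview mask; e.g. a mask of 0b101 (views 0
    * and 2) needs layers >= 3.
    */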
547    unsigned layers = MAX2(fb->layers, util_logbase2(subpass->multiview_mask) + 1);
548    tu_cs_emit_regs(cs, A6XX_GRAS_MAX_LAYER_INDEX(layers - 1));
549 }
550 
551 struct tu_bin_size_params {
552    enum a6xx_render_mode render_mode;
553    bool force_lrz_write_dis;
554    enum a6xx_buffers_location buffers_location;
555    enum a6xx_lrz_feedback_mask lrz_feedback_zmode_mask;
556 };
557 
558 template <chip CHIP>
559 static void
560 tu6_emit_bin_size(struct tu_cs *cs,
561                   uint32_t bin_w,
562                   uint32_t bin_h,
563                   struct tu_bin_size_params &&p)
564 {
565    if (CHIP == A6XX) {
566       tu_cs_emit_regs(
567          cs, A6XX_GRAS_BIN_CONTROL(.binw = bin_w,
568                                    .binh = bin_h,
569                                    .render_mode = p.render_mode,
570                                    .force_lrz_write_dis = p.force_lrz_write_dis,
571                                    .buffers_location = p.buffers_location,
572                                    .lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask, ));
573    } else {
574       tu_cs_emit_regs(cs,
575                       A6XX_GRAS_BIN_CONTROL(.binw = bin_w,
576                                             .binh = bin_h,
577                                             .render_mode = p.render_mode,
578                                             .force_lrz_write_dis = p.force_lrz_write_dis,
579                                             .lrz_feedback_zmode_mask =
580                                                p.lrz_feedback_zmode_mask, ));
581    }
582 
583    tu_cs_emit_regs(cs, RB_BIN_CONTROL(CHIP,
584                         .binw = bin_w,
585                         .binh = bin_h,
586                         .render_mode = p.render_mode,
587                         .force_lrz_write_dis = p.force_lrz_write_dis,
588                         .buffers_location = p.buffers_location,
589                         .lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask, ));
590 
591    /* no flag for RB_BIN_CONTROL2... */
592    tu_cs_emit_regs(cs,
593                    A6XX_RB_BIN_CONTROL2(.binw = bin_w,
594                                         .binh = bin_h));
595 }
596 
597 template <chip CHIP>
598 static void
599 tu6_emit_render_cntl(struct tu_cmd_buffer *cmd,
600                      const struct tu_subpass *subpass,
601                      struct tu_cs *cs,
602                      bool binning);
603 
604 template <>
605 void
606 tu6_emit_render_cntl<A6XX>(struct tu_cmd_buffer *cmd,
607                      const struct tu_subpass *subpass,
608                      struct tu_cs *cs,
609                      bool binning)
610 {
611    /* doesn't RB_RENDER_CNTL set differently for binning pass: */
612    bool no_track = !cmd->device->physical_device->info->a6xx.has_cp_reg_write;
613    uint32_t cntl = 0;
614    cntl |= A6XX_RB_RENDER_CNTL_CCUSINGLECACHELINESIZE(2);
615    if (binning) {
616       if (no_track)
617          return;
618       cntl |= A6XX_RB_RENDER_CNTL_BINNING;
619    } else {
620       uint32_t mrts_ubwc_enable = 0;
621       for (uint32_t i = 0; i < subpass->color_count; ++i) {
622          uint32_t a = subpass->color_attachments[i].attachment;
623          unsigned remapped = cmd->vk.dynamic_graphics_state.cal.color_map[i];
624          if (a == VK_ATTACHMENT_UNUSED ||
625              remapped == MESA_VK_ATTACHMENT_UNUSED)
626             continue;
627 
628          const struct tu_image_view *iview = cmd->state.attachments[a];
629          if (iview->view.ubwc_enabled)
630             mrts_ubwc_enable |= 1 << remapped;
631       }
632 
633       cntl |= A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable);
634 
635       const uint32_t a = subpass->depth_stencil_attachment.attachment;
636       if (a != VK_ATTACHMENT_UNUSED) {
637          const struct tu_image_view *iview = cmd->state.attachments[a];
638          if (iview->view.ubwc_enabled)
639             cntl |= A6XX_RB_RENDER_CNTL_FLAG_DEPTH;
640       }
641 
642       if (no_track) {
643          tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CNTL, 1);
644          tu_cs_emit(cs, cntl);
645          return;
646       }
647 
648       /* In the !binning case, we need to set RB_RENDER_CNTL in the draw_cs
649        * in order to set it correctly for the different subpasses. However,
650        * that means the packets we're emitting also happen during binning. So
651        * we need to guard the write on !BINNING at CP execution time.
652        */
653       tu_cs_reserve(cs, 3 + 4);
654       tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
655       tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
656                      CP_COND_REG_EXEC_0_GMEM | CP_COND_REG_EXEC_0_SYSMEM);
657       tu_cs_emit(cs, RENDER_MODE_CP_COND_REG_EXEC_1_DWORDS(4));
658    }
659 
660    tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
661    tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_RENDER_CNTL));
662    tu_cs_emit(cs, REG_A6XX_RB_RENDER_CNTL);
663    tu_cs_emit(cs, cntl);
664 }
665 
666 template <>
667 void
668 tu6_emit_render_cntl<A7XX>(struct tu_cmd_buffer *cmd,
669                      const struct tu_subpass *subpass,
670                      struct tu_cs *cs,
671                      bool binning)
672 {
673    tu_cs_emit_regs(
674       cs, RB_RENDER_CNTL(A7XX, .binning = binning, .raster_mode = TYPE_TILED,
675                               .raster_direction = LR_TB));
676    tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL(.binning = binning));
677 }
678 
679 static void
680 tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
681 {
682    struct tu_physical_device *phys_dev = cmd->device->physical_device;
683    const VkRect2D *render_area = &cmd->state.render_area;
684 
685    /* Avoid assertion fails with an empty render area at (0, 0) where the
686     * subtraction below wraps around. Empty render areas should be forced to
687     * the sysmem path by use_sysmem_rendering(). It's not even clear whether
688     * an empty scissor here works, and the blob seems to force sysmem too as
689     * it sets something wrong (non-empty) for the scissor.
690     */
691    if (render_area->extent.width == 0 ||
692        render_area->extent.height == 0)
693       return;
694 
695    uint32_t x1 = render_area->offset.x;
696    uint32_t y1 = render_area->offset.y;
697    uint32_t x2 = x1 + render_area->extent.width - 1;
698    uint32_t y2 = y1 + render_area->extent.height - 1;
699 
700    if (align) {
701       x1 = x1 & ~(phys_dev->info->gmem_align_w - 1);
702       y1 = y1 & ~(phys_dev->info->gmem_align_h - 1);
703       x2 = ALIGN_POT(x2 + 1, phys_dev->info->gmem_align_w) - 1;
704       y2 = ALIGN_POT(y2 + 1, phys_dev->info->gmem_align_h) - 1;
705    }
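   /* For example (assumed alignment): with gmem_align_w = 16, a render area
    * starting at x = 5 with width 70 gives x1 = 5, x2 = 74 before alignment
    * and x1 = 0, x2 = 79 after, i.e. the blit scissor is widened out to whole
    * GMEM-aligned blocks.
    */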
706 
707    tu_cs_emit_regs(cs,
708                    A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
709                    A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
710 }
711 
712 void
713 tu6_emit_window_scissor(struct tu_cs *cs,
714                         uint32_t x1,
715                         uint32_t y1,
716                         uint32_t x2,
717                         uint32_t y2)
718 {
719    tu_cs_emit_regs(cs,
720                    A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1),
721                    A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2));
722 
723    tu_cs_emit_regs(cs,
724                    A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = x1, .y = y1),
725                    A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = x2, .y = y2));
726 }
727 
728 template <chip CHIP>
729 void
730 tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1)
731 {
732    tu_cs_emit_regs(cs,
733                    A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1));
734 
735    tu_cs_emit_regs(cs,
736                    A6XX_RB_WINDOW_OFFSET2(.x = x1, .y = y1));
737 
738    tu_cs_emit_regs(cs,
739                    SP_WINDOW_OFFSET(CHIP, .x = x1, .y = y1));
740 
741    tu_cs_emit_regs(cs,
742                    A6XX_SP_TP_WINDOW_OFFSET(.x = x1, .y = y1));
743 
744    tu_cs_emit_regs(cs,
745                    A7XX_SP_PS_2D_WINDOW_OFFSET(.x = x1, .y = y1));
746 }
747 
748 void
749 tu6_apply_depth_bounds_workaround(struct tu_device *device,
750                                   uint32_t *rb_depth_cntl)
751 {
752    if (!device->physical_device->info->a6xx.depth_bounds_require_depth_test_quirk)
753       return;
754 
755    /* On some GPUs it is necessary to enable z test for depth bounds test when
756     * UBWC is enabled. Otherwise, the GPU would hang. FUNC_ALWAYS is required to
757     * pass z test. Relevant tests:
758     *  dEQP-VK.pipeline.extended_dynamic_state.two_draws_dynamic.depth_bounds_test_disable
759     *  dEQP-VK.dynamic_state.ds_state.depth_bounds_1
760     */
761    *rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE |
762                      A6XX_RB_DEPTH_CNTL_ZFUNC(FUNC_ALWAYS);
763 }
764 
765 static void
766 tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
767 {
768    uint32_t enable_mask;
769    switch (id) {
770    case TU_DRAW_STATE_VS:
771    case TU_DRAW_STATE_FS:
772    case TU_DRAW_STATE_VPC:
773    /* The blob seems to not enable this (DESC_SETS_LOAD) for binning, even
774     * when resources would actually be used in the binning shader.
775     * Presumably the overhead of prefetching the resources isn't
776     * worth it.
777     */
778    case TU_DRAW_STATE_DESC_SETS_LOAD:
779       enable_mask = CP_SET_DRAW_STATE__0_GMEM |
780                     CP_SET_DRAW_STATE__0_SYSMEM;
781       break;
782    case TU_DRAW_STATE_VS_BINNING:
783    case TU_DRAW_STATE_GS_BINNING:
784       enable_mask = CP_SET_DRAW_STATE__0_BINNING;
785       break;
786    case TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM:
787       enable_mask = CP_SET_DRAW_STATE__0_GMEM;
788       break;
789    case TU_DRAW_STATE_PRIM_MODE_GMEM:
790       /* On a7xx the prim mode is the same for gmem and sysmem, and it no
791        * longer depends on dynamic state, so we reuse the gmem state for
792        * everything:
793        */
794       if (cs->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) {
795          enable_mask = CP_SET_DRAW_STATE__0_GMEM |
796                        CP_SET_DRAW_STATE__0_SYSMEM |
797                        CP_SET_DRAW_STATE__0_BINNING;
798       } else {
799          enable_mask = CP_SET_DRAW_STATE__0_GMEM;
800       }
801       break;
802    case TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM:
803       enable_mask = CP_SET_DRAW_STATE__0_SYSMEM;
804       break;
805    case TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM:
806       if (!cs->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) {
807          /* By also applying the state during binning we ensure that there
808           * is no rotation applied by a previous A6XX_GRAS_SC_CNTL::rotation.
809           */
810          enable_mask =
811             CP_SET_DRAW_STATE__0_SYSMEM | CP_SET_DRAW_STATE__0_BINNING;
812       } else {
813          static_assert(TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM ==
814                        TU_DYNAMIC_STATE_A7XX_FRAGMENT_SHADING_RATE);
815          enable_mask = CP_SET_DRAW_STATE__0_GMEM |
816                        CP_SET_DRAW_STATE__0_SYSMEM |
817                        CP_SET_DRAW_STATE__0_BINNING;
818       }
819 
820       break;
821    default:
822       enable_mask = CP_SET_DRAW_STATE__0_GMEM |
823                     CP_SET_DRAW_STATE__0_SYSMEM |
824                     CP_SET_DRAW_STATE__0_BINNING;
825       break;
826    }
827 
828    STATIC_ASSERT(TU_DRAW_STATE_COUNT <= 32);
829 
830    /* We need to reload the descriptors every time the descriptor sets
831     * change. However, the commands we send only depend on the pipeline
832     * because the whole point is to cache descriptors which are used by the
833     * pipeline. There's a problem here, in that the firmware has an
834     * "optimization" which skips executing groups that are set to the same
835     * value as the last draw. This means that if the descriptor sets change
836     * but not the pipeline, we'd try to re-execute the same buffer which
837     * the firmware would ignore and we wouldn't pre-load the new
838     * descriptors. Set the DIRTY bit to avoid this optimization.
839     *
840     * We set the dirty bit for shader draw states because they contain
841     * CP_LOAD_STATE packets that are invalidated by the PROGRAM_CONFIG draw
842     * state, so if PROGRAM_CONFIG changes but one of the shaders stays the
843     * same then we still need to re-emit everything. The GLES blob which
844     * implements separate shader draw states does the same thing.
845     *
846     * We also need to set this bit for draw states which may be patched by the
847     * GPU, because their underlying memory may change between setting the draw
848     * state.
849     */
850    if (id == TU_DRAW_STATE_DESC_SETS_LOAD ||
851        id == TU_DRAW_STATE_VS ||
852        id == TU_DRAW_STATE_VS_BINNING ||
853        id == TU_DRAW_STATE_HS ||
854        id == TU_DRAW_STATE_DS ||
855        id == TU_DRAW_STATE_GS ||
856        id == TU_DRAW_STATE_GS_BINNING ||
857        id == TU_DRAW_STATE_FS ||
858        state.writeable)
859       enable_mask |= CP_SET_DRAW_STATE__0_DIRTY;
860 
861    tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(state.size) |
862                   enable_mask |
863                   CP_SET_DRAW_STATE__0_GROUP_ID(id) |
864                   COND(!state.size || !state.iova, CP_SET_DRAW_STATE__0_DISABLE));
865    tu_cs_emit_qw(cs, state.iova);
866 }
867 
868 void
869 tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits vk_samples,
870               bool msaa_disable)
871 {
872    const enum a3xx_msaa_samples samples = tu_msaa_samples(vk_samples);
873    msaa_disable |= (samples == MSAA_ONE);
874    tu_cs_emit_regs(cs,
875                    A6XX_SP_TP_RAS_MSAA_CNTL(samples),
876                    A6XX_SP_TP_DEST_MSAA_CNTL(.samples = samples,
877                                              .msaa_disable = msaa_disable));
878 
879    tu_cs_emit_regs(cs,
880                    A6XX_GRAS_RAS_MSAA_CNTL(samples),
881                    A6XX_GRAS_DEST_MSAA_CNTL(.samples = samples,
882                                             .msaa_disable = msaa_disable));
883 
884    tu_cs_emit_regs(cs,
885                    A6XX_RB_RAS_MSAA_CNTL(samples),
886                    A6XX_RB_DEST_MSAA_CNTL(.samples = samples,
887                                           .msaa_disable = msaa_disable));
888 }
889 
890 static void
891 tu6_update_msaa(struct tu_cmd_buffer *cmd)
892 {
893    VkSampleCountFlagBits samples =
894       cmd->vk.dynamic_graphics_state.ms.rasterization_samples;
895 
896    /* The samples may not be set by the pipeline or dynamically if raster
897     * discard is enabled. We can set any valid value, but don't set the
898     * default invalid value of 0.
899     */
900    if (samples == 0)
901       samples = VK_SAMPLE_COUNT_1_BIT;
902    tu6_emit_msaa(&cmd->draw_cs, samples, cmd->state.msaa_disable);
903 }
904 
905 static void
906 tu6_update_msaa_disable(struct tu_cmd_buffer *cmd)
907 {
908    VkPrimitiveTopology topology =
909       (VkPrimitiveTopology)cmd->vk.dynamic_graphics_state.ia.primitive_topology;
910    bool is_line =
911       topology == VK_PRIMITIVE_TOPOLOGY_LINE_LIST ||
912       topology == VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY ||
913       topology == VK_PRIMITIVE_TOPOLOGY_LINE_STRIP ||
914       topology == VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY ||
915       (topology == VK_PRIMITIVE_TOPOLOGY_PATCH_LIST &&
916        cmd->state.shaders[MESA_SHADER_TESS_EVAL] &&
917        cmd->state.shaders[MESA_SHADER_TESS_EVAL]->variant &&
918        cmd->state.shaders[MESA_SHADER_TESS_EVAL]->variant->key.tessellation == IR3_TESS_ISOLINES);
919    bool msaa_disable = is_line &&
920       cmd->vk.dynamic_graphics_state.rs.line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR;
921 
922    if (cmd->state.msaa_disable != msaa_disable) {
923       cmd->state.msaa_disable = msaa_disable;
924       tu6_update_msaa(cmd);
925    }
926 }
927 
928 static bool
929 use_hw_binning(struct tu_cmd_buffer *cmd)
930 {
931    const struct tu_framebuffer *fb = cmd->state.framebuffer;
932    const struct tu_tiling_config *tiling = &fb->tiling[cmd->state.gmem_layout];
933 
934    /* XFB commands are emitted for BINNING || SYSMEM, which makes it
935     * incompatible with non-hw binning GMEM rendering. This is required because
936     * some of the XFB commands need to only be executed once.
937     * use_sysmem_rendering() should have made sure we only ended up here if no
938     * XFB was used.
939     */
940    if (cmd->state.rp.xfb_used) {
941       assert(tiling->binning_possible);
942       return true;
943    }
944 
945    /* VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT emulates GL_PRIMITIVES_GENERATED,
946     * which wasn't designed to care about tilers and expects the result not to
947     * be multiplied by tile count.
948     * See https://gitlab.khronos.org/vulkan/vulkan/-/issues/3131
949     */
950    if (cmd->state.rp.has_prim_generated_query_in_rp ||
951        cmd->state.prim_generated_query_running_before_rp) {
952       assert(tiling->binning_possible);
953       return true;
954    }
955 
956    return tiling->binning;
957 }
958 
959 static bool
960 use_sysmem_rendering(struct tu_cmd_buffer *cmd,
961                      struct tu_renderpass_result **autotune_result)
962 {
963    if (TU_DEBUG(SYSMEM))
964       return true;
965 
966    /* can't fit attachments into gmem */
967    if (!cmd->state.tiling->possible)
968       return true;
969 
970    if (cmd->state.framebuffer->layers > 1)
971       return true;
972 
973    /* Use sysmem for empty render areas */
974    if (cmd->state.render_area.extent.width == 0 ||
975        cmd->state.render_area.extent.height == 0)
976       return true;
977 
978    if (cmd->state.rp.has_tess)
979       return true;
980 
981    if (cmd->state.rp.disable_gmem)
982       return true;
983 
984    /* XFB is incompatible with non-hw binning GMEM rendering, see use_hw_binning */
985    if (cmd->state.rp.xfb_used && !cmd->state.tiling->binning_possible)
986       return true;
987 
988    /* QUERY_TYPE_PRIMITIVES_GENERATED is incompatible with non-hw binning
989     * GMEM rendering, see use_hw_binning.
990     */
991    if ((cmd->state.rp.has_prim_generated_query_in_rp ||
992         cmd->state.prim_generated_query_running_before_rp) &&
993        !cmd->state.tiling->binning_possible)
994       return true;
995 
996    if (TU_DEBUG(GMEM))
997       return false;
998 
999    bool use_sysmem = tu_autotune_use_bypass(&cmd->device->autotune,
1000                                             cmd, autotune_result);
1001    if (*autotune_result) {
1002       list_addtail(&(*autotune_result)->node, &cmd->renderpass_autotune_results);
1003    }
1004 
1005    return use_sysmem;
1006 }
1007 
1008 /* Optimization: there is no reason to load gmem if there is no
1009  * geometry to process. COND_REG_EXEC predicate is set here,
1010  * but the actual skip happens in tu_load_gmem_attachment() and tile_store_cs,
1011  * for each blit separately.
1012  */
1013 static void
1014 tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1015                               uint32_t pipe, uint32_t slot, bool skip_wfm)
1016 {
1017    if (cmd->state.tiling->binning_possible &&
1018        cmd->state.pass->has_cond_load_store) {
1019       tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
1020       tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(REG_A6XX_VSC_STATE_REG(pipe)) |
1021                      A6XX_CP_REG_TEST_0_BIT(slot) |
1022                      COND(skip_wfm, A6XX_CP_REG_TEST_0_SKIP_WAIT_FOR_ME));
1023    } else {
1024       /* COND_REG_EXECs are not emitted in non-binning case */
1025    }
1026 }
1027 
1028 template <chip CHIP>
1029 static void
1030 tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
1031                      struct tu_cs *cs,
1032                      uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot,
1033                      const struct tu_image_view *fdm)
1034 {
1035    struct tu_physical_device *phys_dev = cmd->device->physical_device;
1036    const struct tu_tiling_config *tiling = cmd->state.tiling;
1037    bool hw_binning = use_hw_binning(cmd);
1038 
1039    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1040    tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RENDER_START) |
1041                   A6XX_CP_SET_MARKER_0_USES_GMEM);
1042 
1043    tu6_emit_bin_size<CHIP>(
1044       cs, tiling->tile0.width, tiling->tile0.height,
1045       {
1046          .render_mode = RENDERING_PASS,
1047          .force_lrz_write_dis = !phys_dev->info->a6xx.has_lrz_feedback,
1048          .buffers_location = BUFFERS_IN_GMEM,
1049          .lrz_feedback_zmode_mask =
1050             phys_dev->info->a6xx.has_lrz_feedback
1051                ? (hw_binning ? LRZ_FEEDBACK_EARLY_Z_OR_EARLY_LRZ_LATE_Z :
1052                   LRZ_FEEDBACK_EARLY_LRZ_LATE_Z)
1053                : LRZ_FEEDBACK_NONE,
1054       });
1055 
1056    tu_cs_emit_regs(cs,
1057                    A6XX_VFD_MODE_CNTL(RENDERING_PASS));
1058 
1059    const uint32_t x1 = tiling->tile0.width * tx;
1060    const uint32_t y1 = tiling->tile0.height * ty;
1061    const uint32_t x2 = MIN2(x1 + tiling->tile0.width, MAX_VIEWPORT_SIZE);
1062    const uint32_t y2 = MIN2(y1 + tiling->tile0.height, MAX_VIEWPORT_SIZE);
1063    tu6_emit_window_scissor(cs, x1, y1, x2 - 1, y2 - 1);
1064    tu6_emit_window_offset<CHIP>(cs, x1, y1);
1065 
1066    if (hw_binning) {
1067       tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
1068 
1069       tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1070       tu_cs_emit(cs, 0x0);
1071 
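      /* The three offsets below select this pipe's slice of the draw stream,
       * the per-pipe draw-stream-size words, and the prim stream,
       * respectively, matching the scratch BO layout set up in
       * tu6_lazy_init_vsc().
       */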
1072       tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5_OFFSET, 4);
1073       tu_cs_emit(cs, tiling->pipe_sizes[pipe] |
1074                      CP_SET_BIN_DATA5_0_VSC_N(slot));
1075       tu_cs_emit(cs, pipe * cmd->vsc_draw_strm_pitch);
1076       tu_cs_emit(cs, pipe * 4);
1077       tu_cs_emit(cs, pipe * cmd->vsc_prim_strm_pitch);
1078    }
1079 
1080    tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, hw_binning);
1081 
1082    tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1083    tu_cs_emit(cs, !hw_binning);
1084 
1085    tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1086    tu_cs_emit(cs, 0x0);
1087 
1088    if (fdm || (TU_DEBUG(FDM) && cmd->state.pass->has_fdm)) {
1089       unsigned views =
1090          cmd->state.pass->num_views ? cmd->state.pass->num_views : 1;
1091       const struct tu_framebuffer *fb = cmd->state.framebuffer;
1092       struct tu_frag_area raw_areas[views];
1093       if (fdm) {
1094          tu_fragment_density_map_sample(fdm,
1095                                         (x1 + MIN2(x2, fb->width)) / 2,
1096                                         (y1 + MIN2(y2, fb->height)) / 2,
1097                                         fb->width, fb->height, views,
1098                                         raw_areas);
1099       } else {
1100          for (unsigned i = 0; i < views; i++)
1101             raw_areas[i].width = raw_areas[i].height = 1.0f;
1102       }
1103 
1104       VkExtent2D frag_areas[views];
1105       for (unsigned i = 0; i < views; i++) {
1106          float floor_x, floor_y;
1107          float area = raw_areas[i].width * raw_areas[i].height;
1108          float frac_x = modff(raw_areas[i].width, &floor_x);
1109          float frac_y = modff(raw_areas[i].height, &floor_y);
1110          /* The spec allows rounding up one of the axes as long as the total
1111           * area is less than or equal to the original area. Take advantage of
1112           * this to try rounding up the number with the largest fraction.
1113           */
1114          if ((frac_x > frac_y ? (floor_x + 1.f) * floor_y :
1115                                  floor_x * (floor_y + 1.f)) <= area) {
1116             if (frac_x > frac_y)
1117                floor_x += 1.f;
1118             else
1119                floor_y += 1.f;
1120          }
1121          frag_areas[i].width = floor_x;
1122          frag_areas[i].height = floor_y;
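         /* For example (assumed raw values): a raw area of 1.75 x 1.25 has
          * frac_x > frac_y and (floor_x + 1) * floor_y = 2.0 <= 2.1875, so
          * the X axis is rounded up and the fragment area becomes 2 x 1.
          */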
1123 
1124          /* Make sure that the width/height divides the tile width/height so
1125           * we don't have to do extra awkward clamping of the edges of each
1126           * bin when resolving. Note that because the tile width is rounded to
1127           * a multiple of 32 any power of two 32 or less will work.
1128           *
1129           * TODO: Try to take advantage of the total area allowance here, too.
1130           */
1131          while (tiling->tile0.width % frag_areas[i].width != 0)
1132             frag_areas[i].width--;
1133          while (tiling->tile0.height % frag_areas[i].height != 0)
1134             frag_areas[i].height--;
1135       }
1136 
1137       /* If at any point we were forced to use the same scaling for all
1138        * viewports, we need to make sure that any users *not* using shared
1139        * scaling, including loads/stores, also consistently share the scaling.
1140        */
1141       if (cmd->state.rp.shared_viewport) {
1142          VkExtent2D frag_area = { UINT32_MAX, UINT32_MAX };
1143          for (unsigned i = 0; i < views; i++) {
1144             frag_area.width = MIN2(frag_area.width, frag_areas[i].width);
1145             frag_area.height = MIN2(frag_area.height, frag_areas[i].height);
1146          }
1147 
1148          for (unsigned i = 0; i < views; i++)
1149             frag_areas[i] = frag_area;
1150       }
1151 
1152       VkRect2D bin = { { x1, y1 }, { x2 - x1, y2 - y1 } };
1153       util_dynarray_foreach (&cmd->fdm_bin_patchpoints,
1154                              struct tu_fdm_bin_patchpoint, patch) {
1155          tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
1156          tu_cs_emit_qw(cs, patch->iova);
1157          patch->apply(cmd, cs, patch->data, bin, views, frag_areas);
1158       }
1159 
1160       /* Make the CP wait until the CP_MEM_WRITE's to the command buffers
1161        * land. When loading FS params via UBOs, we also need to invalidate
1162        * UCHE because the FS param patchpoint is read through UCHE.
1163        */
1164       tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1165       if (cmd->device->compiler->load_shader_consts_via_preamble) {
1166          tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
1167          tu_cs_emit_wfi(cs);
1168       }
1169       tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
1170    }
1171 }
1172 
1173 template <chip CHIP>
1174 static void
1175 tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd,
1176                         struct tu_cs *cs,
1177                         uint32_t layer_mask,
1178                         uint32_t a,
1179                         uint32_t gmem_a)
1180 {
1181    const struct tu_framebuffer *fb = cmd->state.framebuffer;
1182    const struct tu_image_view *dst = cmd->state.attachments[a];
1183    const struct tu_image_view *src = cmd->state.attachments[gmem_a];
1184 
1185    tu_resolve_sysmem<CHIP>(cmd, cs, src, dst, layer_mask, fb->layers, &cmd->state.render_area);
1186 }
1187 
1188 template <chip CHIP>
1189 static void
1190 tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd,
1191                          struct tu_cs *cs,
1192                          const struct tu_subpass *subpass)
1193 {
1194    if (subpass->resolve_attachments) {
1195       /* From the documentation for vkCmdNextSubpass, section 7.4 "Render Pass
1196        * Commands":
1197        *
1198        *    End-of-subpass multisample resolves are treated as color
1199        *    attachment writes for the purposes of synchronization.
1200        *    This applies to resolve operations for both color and
1201        *    depth/stencil attachments. That is, they are considered to
1202        *    execute in the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT
1203        *    pipeline stage and their writes are synchronized with
1204        *    VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT. Synchronization between
1205        *    rendering within a subpass and any resolve operations at the end
1206        *    of the subpass occurs automatically, without need for explicit
1207        *    dependencies or pipeline barriers. However, if the resolve
1208        *    attachment is also used in a different subpass, an explicit
1209        *    dependency is needed.
1210        *
1211        * We use the CP_BLIT path for sysmem resolves, which is really a
1212        * transfer command, so we have to manually flush similar to the gmem
1213        * resolve case. However, a flush afterwards isn't needed because of the
1214        * last sentence and the fact that we're in sysmem mode.
1215        */
1216       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
1217       if (subpass->resolve_depth_stencil)
1218          tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_DEPTH);
1219 
1220       tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
1221 
1222       /* Wait for the flushes to land before using the 2D engine */
1223       tu_cs_emit_wfi(cs);
1224 
1225       for (unsigned i = 0; i < subpass->resolve_count; i++) {
1226          uint32_t a = subpass->resolve_attachments[i].attachment;
1227          if (a == VK_ATTACHMENT_UNUSED)
1228             continue;
1229 
1230          uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
1231 
1232          tu6_emit_sysmem_resolve<CHIP>(cmd, cs, subpass->multiview_mask, a, gmem_a);
1233       }
1234    }
1235 }
1236 
1237 template <chip CHIP>
1238 static void
1239 tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1240 {
1241    const struct tu_render_pass *pass = cmd->state.pass;
1242    const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];
1243    const struct tu_framebuffer *fb = cmd->state.framebuffer;
1244 
1245    if (pass->has_fdm)
1246       tu_cs_set_writeable(cs, true);
1247 
1248    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1249    tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RESOLVE) |
1250                   A6XX_CP_SET_MARKER_0_USES_GMEM);
1251 
1252    tu6_emit_blit_scissor(cmd, cs, true);
1253 
1254    struct tu_resolve_group resolve_group = {};
1255 
1256    /* Resolve should happen before store in case BLIT_EVENT_STORE_AND_CLEAR is
1257     * used for a store.
1258     */
1259    if (subpass->resolve_attachments) {
1260       for (unsigned i = 0; i < subpass->resolve_count; i++) {
1261          uint32_t a = subpass->resolve_attachments[i].attachment;
1262          if (a != VK_ATTACHMENT_UNUSED) {
1263             uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
1264             tu_store_gmem_attachment<CHIP>(cmd, cs, &resolve_group, a, gmem_a,
1265                                            fb->layers, subpass->multiview_mask, false);
1266          }
1267       }
1268    }
1269 
1270    for (uint32_t a = 0; a < pass->attachment_count; ++a) {
1271       if (pass->attachments[a].gmem) {
1272          const bool cond_exec_allowed = cmd->state.tiling->binning_possible &&
1273                                         cmd->state.pass->has_cond_load_store;
1274          tu_store_gmem_attachment<CHIP>(cmd, cs, &resolve_group, a, a,
1275                                   fb->layers, subpass->multiview_mask,
1276                                   cond_exec_allowed);
1277       }
1278    }
1279 
1280    tu_emit_resolve_group<CHIP>(cmd, cs, &resolve_group);
1281 
1282    if (pass->has_fdm)
1283       tu_cs_set_writeable(cs, false);
1284 }
1285 
1286 void
1287 tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1288 {
1289    tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
1290    tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
1291                      CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
1292                      CP_SET_DRAW_STATE__0_GROUP_ID(0));
1293    tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
1294    tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
1295 
1296    cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
1297 }
1298 
1299 template <chip CHIP>
1300 static void
1301 tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
1302 {
1303    const struct tu_physical_device *phys_dev = dev->physical_device;
1304 
1305    if (CHIP >= A7XX) {
1306       /* On A7XX, RB_CCU_CNTL was split into two registers: RB_CCU_CNTL holds
1307        * static properties that can be set once and requires a WFI to take
1308        * effect, while the newly introduced RB_CCU_CNTL2 holds properties that
1309        * may change per renderpass and doesn't require a WFI to take effect,
1310        * only CCU inval/flush events.
1311        */
1312 
1313       enum a7xx_concurrent_resolve_mode resolve_mode = CONCURRENT_RESOLVE_MODE_2;
1314       if (TU_DEBUG(NO_CONCURRENT_RESOLVES))
1315          resolve_mode = CONCURRENT_RESOLVE_MODE_DISABLED;
1316 
1317       enum a7xx_concurrent_unresolve_mode unresolve_mode = CONCURRENT_UNRESOLVE_MODE_FULL;
1318       if (TU_DEBUG(NO_CONCURRENT_UNRESOLVES))
1319          unresolve_mode = CONCURRENT_UNRESOLVE_MODE_DISABLED;
1320 
1321       tu_cs_emit_regs(cs, RB_CCU_CNTL(A7XX,
1322          .gmem_fast_clear_disable =
1323            !dev->physical_device->info->a6xx.has_gmem_fast_clear,
1324          .concurrent_resolve_mode = resolve_mode,
1325          .concurrent_unresolve_mode = unresolve_mode,
1326       ));
1327    }
1328 
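   /* Apply the per-GPU "magic" raw register overrides from the device info
    * table. TPL1_DBG_ECO_CNTL1 is special-cased so that the TP_UBWC_FLAG_HINT
    * bit follows the per-chip enable flag.
    */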
1329    for (size_t i = 0; i < ARRAY_SIZE(phys_dev->info->a6xx.magic_raw); i++) {
1330       auto magic_reg = phys_dev->info->a6xx.magic_raw[i];
1331       if (!magic_reg.reg)
1332          break;
1333 
1334       uint32_t value = magic_reg.value;
1335       switch(magic_reg.reg) {
1336          case REG_A6XX_TPL1_DBG_ECO_CNTL1:
1337             value = (value & ~A6XX_TPL1_DBG_ECO_CNTL1_TP_UBWC_FLAG_HINT) |
1338                     (phys_dev->info->a7xx.enable_tp_ubwc_flag_hint
1339                         ? A6XX_TPL1_DBG_ECO_CNTL1_TP_UBWC_FLAG_HINT
1340                         : 0);
1341             break;
1342       }
1343 
1344       tu_cs_emit_write_reg(cs, magic_reg.reg, value);
1345    }
1346 
1347    tu_cs_emit_write_reg(cs, REG_A6XX_RB_DBG_ECO_CNTL,
1348                         phys_dev->info->a6xx.magic.RB_DBG_ECO_CNTL);
1349    tu_cs_emit_write_reg(cs, REG_A6XX_SP_FLOAT_CNTL, 0);
1350    tu_cs_emit_write_reg(cs, REG_A6XX_SP_DBG_ECO_CNTL,
1351                         phys_dev->info->a6xx.magic.SP_DBG_ECO_CNTL);
1352    tu_cs_emit_write_reg(cs, REG_A6XX_SP_PERFCTR_ENABLE, 0x3f);
1353    if (CHIP == A6XX)
1354       tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_UNKNOWN_B605, 0x44);
1355    tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_DBG_ECO_CNTL,
1356                         phys_dev->info->a6xx.magic.TPL1_DBG_ECO_CNTL);
1357    if (CHIP == A6XX) {
1358       tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
1359       tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
1360    }
1361 
1362    tu_cs_emit_write_reg(cs, REG_A6XX_VPC_DBG_ECO_CNTL,
1363                         phys_dev->info->a6xx.magic.VPC_DBG_ECO_CNTL);
1364    tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_DBG_ECO_CNTL,
1365                         phys_dev->info->a6xx.magic.GRAS_DBG_ECO_CNTL);
1366    if (CHIP == A6XX) {
1367       tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_DBG_ECO_CNTL,
1368                            phys_dev->info->a6xx.magic.HLSQ_DBG_ECO_CNTL);
1369    }
1370    tu_cs_emit_write_reg(cs, REG_A6XX_SP_CHICKEN_BITS,
1371                         phys_dev->info->a6xx.magic.SP_CHICKEN_BITS);
1372    tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0); // 2 on a740 ???
1373    tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B182, 0);
1374    if (CHIP == A6XX)
1375       tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = false));
1376    tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12,
1377                         phys_dev->info->a6xx.magic.UCHE_UNKNOWN_0E12);
1378    tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF,
1379                         phys_dev->info->a6xx.magic.UCHE_CLIENT_PF);
1380    tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01,
1381                         phys_dev->info->a6xx.magic.RB_UNKNOWN_8E01);
1382    tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A9A8, 0);
1383    tu_cs_emit_regs(cs, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true,
1384                                             .isammode = ISAMMODE_GL,
1385                                             .shared_consts_enable = false));
1386 
1387    /* TODO: set A6XX_VFD_ADD_OFFSET_INSTANCE and fix ir3 to avoid adding base instance */
1388    tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
1389    tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
1390    tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL,
1391                         phys_dev->info->a6xx.magic.PC_MODE_CNTL);
1392 
1393    tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8110, 0);
1394 
1395    tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8818, 0);
1396 
1397    if (CHIP == A6XX) {
1398       tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0);
1399       tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0);
1400       tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881B, 0);
1401       tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881C, 0);
1402       tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881D, 0);
1403       tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881E, 0);
1404    }
1405 
1406    tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_88F0, 0);
1407 
1408    tu_cs_emit_regs(cs, A6XX_VPC_POINT_COORD_INVERT(false));
1409    tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9300, 0);
1410 
1411    tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(true));
1412 
1413    tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B183, 0);
1414 
1415    tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 0);
1416    tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80AF, 0);
1417    if (CHIP == A6XX) {
1418       tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9210, 0);
1419       tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9211, 0);
1420    }
1421    tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9602, 0);
1422    tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9E72, 0);
1423    tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_MODE_CNTL,
1424                         0x000000a0 |
1425                         A6XX_SP_TP_MODE_CNTL_ISAMMODE(ISAMMODE_GL));
1426    tu_cs_emit_regs(cs, HLSQ_CONTROL_5_REG(CHIP, .dword = 0xfc));
1427 
1428    tu_cs_emit_write_reg(cs, REG_A6XX_VFD_MODE_CNTL, 0x00000000);
1429 
1430    tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, phys_dev->info->a6xx.magic.PC_MODE_CNTL);
1431 
1432    tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL()); /* always disable alpha test */
1433 
1434    tu_cs_emit_regs(cs,
1435                    A6XX_SP_TP_BORDER_COLOR_BASE_ADDR(.bo = dev->global_bo,
1436                                                      .bo_offset = gb_offset(bcolor_builtin)));
1437    tu_cs_emit_regs(cs,
1438                    A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR(.bo = dev->global_bo,
1439                                                         .bo_offset = gb_offset(bcolor_builtin)));
1440 
1441    if (CHIP == A7XX) {
1442       tu_cs_emit_regs(cs, TPL1_BICUBIC_WEIGHTS_TABLE_0(CHIP, 0),
1443                       TPL1_BICUBIC_WEIGHTS_TABLE_1(CHIP, 0x3fe05ff4),
1444                       TPL1_BICUBIC_WEIGHTS_TABLE_2(CHIP, 0x3fa0ebee),
1445                       TPL1_BICUBIC_WEIGHTS_TABLE_3(CHIP, 0x3f5193ed),
1446                       TPL1_BICUBIC_WEIGHTS_TABLE_4(CHIP, 0x3f0243f0), );
1447    }
1448 
1449    if (CHIP >= A7XX) {
1450       /* Blob sets these two per draw. */
1451       tu_cs_emit_regs(cs, A7XX_PC_TESS_PARAM_SIZE(TU_TESS_PARAM_SIZE));
1452       /* Blob adds a bit more space ({0x10, 0x20, 0x30, 0x40} bytes)
1453        * but the meaning of this additional space is not known,
1454        * so we play it safe and don't add it.
1455        */
1456       tu_cs_emit_regs(cs, A7XX_PC_TESS_FACTOR_SIZE(TU_TESS_FACTOR_SIZE));
1457    }
1458 
1459    /* There is an optimization to skip executing draw states for draws with no
1460     * instances. Instead of simply skipping the draw, internally the firmware
1461     * sets a bit in PC_DRAW_INITIATOR that seemingly skips the draw. However
1462     * there is a hardware bug where this bit does not always cause the FS
1463     * early preamble to be skipped. Because the draw states were skipped,
1464     * SP_FS_CTRL_REG0, SP_FS_OBJ_START and so on are never updated and a
1465     * random FS preamble from the last draw is executed. If the last visible
1466     * draw is from the same submit, it shouldn't be a problem because we just
1467     * re-execute the same preamble and preambles don't have side effects, but
1468     * if it's from another process then we could execute a garbage preamble
1469     * leading to hangs and faults. To make sure this doesn't happen, we reset
1470     * SP_FS_CTRL_REG0 here, making sure that the EARLYPREAMBLE bit isn't set
1471     * so any leftover early preamble doesn't get executed. Other stages don't
1472     * seem to be affected.
1473     */
1474    if (phys_dev->info->a6xx.has_early_preamble) {
1475       tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0());
1476    }
1477 
1478    /* Workaround for draw state with constlen not being applied for
1479     * zero-instance draw calls. See IR3_CONST_ALLOC_DRIVER_PARAMS allocation
1480     * for more info.
1481     */
1482    tu_cs_emit_pkt4(
1483       cs, CHIP == A6XX ? REG_A6XX_HLSQ_VS_CNTL : REG_A7XX_HLSQ_VS_CNTL, 1);
1484    tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(8) | A6XX_HLSQ_VS_CNTL_ENABLED);
1485 }
1486 
1487 /* Set always-identical registers used specifically for GMEM */
1488 static void
1489 tu7_emit_tile_render_begin_regs(struct tu_cs *cs)
1490 {
1491    tu_cs_emit_regs(cs,
1492                   A7XX_RB_UNKNOWN_8812(0x0));
1493    tu_cs_emit_regs(cs,
1494                 A7XX_RB_UNKNOWN_8E06(0x0));
1495 
1496    tu_cs_emit_regs(cs, A7XX_GRAS_UNKNOWN_8007(0x0));
1497 
1498    tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2));
1499    tu_cs_emit_regs(cs, A7XX_RB_UNKNOWN_8E09(0x4));
1500 
1501    tu_cs_emit_regs(cs, A7XX_RB_BLIT_CLEAR_MODE(.clear_mode = CLEAR_MODE_GMEM));
1502 }
1503 
1504 /* Emit the bin restore preamble, which runs in between bins when L1
1505  * preemption with skipsaverestore happens and we switch back to this context.
1506  * We need to restore static registers normally programmed at cmdbuf start
1507  * which weren't saved, and we need to program the CCU state which is normally
1508  * programmed before rendering the bins and isn't saved/restored by the CP
1509  * because it is always the same for GMEM render passes.
1510  */
1511 template <chip CHIP>
1512 static void
1513 tu_emit_bin_preamble(struct tu_device *dev, struct tu_cs *cs)
1514 {
1515    struct tu_physical_device *phys_dev = dev->physical_device;
1516 
1517    tu6_init_static_regs<CHIP>(dev, cs);
1518    emit_rb_ccu_cntl<CHIP>(cs, dev, true);
1519 
1520    if (CHIP == A6XX) {
1521       tu_cs_emit_regs(cs,
1522                      A6XX_PC_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));
1523 
1524       tu_cs_emit_regs(cs,
1525                      A6XX_VFD_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));
1526    }
1527 
1528    if (CHIP == A7XX) {
1529       tu7_emit_tile_render_begin_regs(cs);
1530    }
1531 
1532    /* TODO use CP_MEM_TO_SCRATCH_MEM on a7xx. The VSC scratch mem should be
1533     * automatically saved, unlike GPU registers, so we wouldn't have to
1534     * manually restore this state.
1535     */
1536    tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
1537    tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VSC_STATE(0)) |
1538                   CP_MEM_TO_REG_0_CNT(32));
1539    tu_cs_emit_qw(cs, dev->global_bo->iova + gb_offset(vsc_state));
1540 }
1541 
1542 VkResult
1543 tu_init_bin_preamble(struct tu_device *device)
1544 {
1545    struct tu_cs preamble_cs;
1546    VkResult result = tu_cs_begin_sub_stream(&device->sub_cs, 256, &preamble_cs);
1547    if (result != VK_SUCCESS)
1548       return vk_startup_errorf(device->instance, result, "bin restore");
1549 
1550    TU_CALLX(device, tu_emit_bin_preamble)(device, &preamble_cs);
1551 
1552    device->bin_preamble_entry = tu_cs_end_sub_stream(&device->sub_cs, &preamble_cs);
1553 
1554    return VK_SUCCESS;
1555 }
1556 
1557 template <chip CHIP>
1558 static void
1559 tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1560 {
1561    struct tu_device *dev = cmd->device;
1562    const struct tu_physical_device *phys_dev = dev->physical_device;
1563 
1564    if (CHIP == A6XX) {
1565       tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
1566    } else {
1567       tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
1568       tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
1569                      CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE);
1570 
1571       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_COLOR);
1572       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_DEPTH);
1573       tu_emit_raw_event_write<CHIP>(cmd, cs, UNK_40, false);
1574       tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
1575       tu_cs_emit_wfi(cs);
1576    }
1577 
1578    tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
1579          .vs_state = true,
1580          .hs_state = true,
1581          .ds_state = true,
1582          .gs_state = true,
1583          .fs_state = true,
1584          .cs_state = true,
1585          .cs_ibo = true,
1586          .gfx_ibo = true,
1587          .cs_shared_const = true,
1588          .gfx_shared_const = true,
1589          .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
1590          .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,));
1591 
1592    tu_cs_emit_wfi(cs);
1593 
1594    if (dev->dbg_cmdbuf_stomp_cs) {
1595       tu_cs_emit_call(cs, dev->dbg_cmdbuf_stomp_cs);
1596    }
1597 
1598    cmd->state.cache.pending_flush_bits &=
1599       ~(TU_CMD_FLAG_WAIT_FOR_IDLE | TU_CMD_FLAG_CACHE_INVALIDATE);
1600 
1601    tu6_init_static_regs<CHIP>(cmd->device, cs);
1602 
1603    emit_rb_ccu_cntl<CHIP>(cs, cmd->device, false);
1604    cmd->state.ccu_state = TU_CMD_CCU_SYSMEM;
1605 
1606    tu_disable_draw_states(cmd, cs);
1607 
1608    if (phys_dev->info->a7xx.cmdbuf_start_a725_quirk) {
1609       tu_cs_reserve(cs, 3 + 4);
1610       tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
1611       tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(THREAD_MODE) |
1612                      CP_COND_REG_EXEC_0_BR | CP_COND_REG_EXEC_0_LPAC);
1613       tu_cs_emit(cs, RENDER_MODE_CP_COND_REG_EXEC_1_DWORDS(4));
1614       tu_cs_emit_ib(cs, &dev->cmdbuf_start_a725_quirk_entry);
1615    }
1616 
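   /* Point the bin-preamble amble at the bin restore preamble built by
    * tu_init_bin_preamble() and clear the regular preamble/postamble ambles;
    * see tu_emit_bin_preamble() above for when the CP runs this IB.
    */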
1617    tu_cs_emit_pkt7(cs, CP_SET_AMBLE, 3);
1618    tu_cs_emit_qw(cs, cmd->device->bin_preamble_entry.bo->iova +
1619                      cmd->device->bin_preamble_entry.offset);
1620    tu_cs_emit(cs, CP_SET_AMBLE_2_DWORDS(cmd->device->bin_preamble_entry.size /
1621                                         sizeof(uint32_t)) |
1622                   CP_SET_AMBLE_2_TYPE(BIN_PREAMBLE_AMBLE_TYPE));
1623 
1624    tu_cs_emit_pkt7(cs, CP_SET_AMBLE, 3);
1625    tu_cs_emit_qw(cs, 0);
1626    tu_cs_emit(cs, CP_SET_AMBLE_2_TYPE(PREAMBLE_AMBLE_TYPE));
1627 
1628    tu_cs_emit_pkt7(cs, CP_SET_AMBLE, 3);
1629    tu_cs_emit_qw(cs, 0);
1630    tu_cs_emit(cs, CP_SET_AMBLE_2_TYPE(POSTAMBLE_AMBLE_TYPE));
1631 
1632    tu_cs_sanity_check(cs);
1633 }
1634 
1635 static void
1636 update_vsc_pipe(struct tu_cmd_buffer *cmd,
1637                 struct tu_cs *cs,
1638                 uint32_t num_vsc_pipes)
1639 {
1640    const struct tu_tiling_config *tiling = cmd->state.tiling;
1641 
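   /* Program the visibility stream setup: bin size and bin count describe the
    * tile grid, the pipe config array assigns tiles to VSC pipes, and the
    * pitch/limit pairs size the per-pipe draw and primitive streams (the
    * limit is the pitch minus VSC_PAD, which emit_vsc_overflow_test() checks
    * against).
    */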
1642    tu_cs_emit_regs(cs,
1643                    A6XX_VSC_BIN_SIZE(.width = tiling->tile0.width,
1644                                      .height = tiling->tile0.height));
1645 
1646    tu_cs_emit_regs(cs,
1647                    A6XX_VSC_BIN_COUNT(.nx = tiling->tile_count.width,
1648                                       .ny = tiling->tile_count.height));
1649 
1650    tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), num_vsc_pipes);
1651    tu_cs_emit_array(cs, tiling->pipe_config, num_vsc_pipes);
1652 
1653    tu_cs_emit_regs(cs,
1654                    A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch),
1655                    A6XX_VSC_PRIM_STRM_LIMIT(cmd->vsc_prim_strm_pitch - VSC_PAD));
1656 
1657    tu_cs_emit_regs(cs,
1658                    A6XX_VSC_DRAW_STRM_PITCH(cmd->vsc_draw_strm_pitch),
1659                    A6XX_VSC_DRAW_STRM_LIMIT(cmd->vsc_draw_strm_pitch - VSC_PAD));
1660 
1661    tu_cs_emit_regs(cs, A7XX_VSC_UNKNOWN_0D08(0));
1662 }
1663 
1664 static void
1665 emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1666 {
1667    const struct tu_tiling_config *tiling = cmd->state.tiling;
1668    const uint32_t used_pipe_count =
1669       tiling->pipe_count.width * tiling->pipe_count.height;
1670 
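   /* For each used VSC pipe, CP_COND_WRITE5 with WRITE_GE writes the
    * configured pitch into the global vsc_draw_overflow/vsc_prim_overflow
    * slots if the hardware-reported stream size reached the limit, so that an
    * overflow can later be detected.
    */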
1671    for (int i = 0; i < used_pipe_count; i++) {
1672       tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
1673       tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
1674             CP_COND_WRITE5_0_WRITE_MEMORY);
1675       tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i)));
1676       tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
1677       tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_draw_strm_pitch - VSC_PAD));
1678       tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
1679       tu_cs_emit_qw(cs, global_iova(cmd, vsc_draw_overflow));
1680       tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_draw_strm_pitch));
1681 
1682       tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
1683       tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
1684             CP_COND_WRITE5_0_WRITE_MEMORY);
1685       tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i)));
1686       tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
1687       tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_prim_strm_pitch - VSC_PAD));
1688       tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
1689       tu_cs_emit_qw(cs, global_iova(cmd, vsc_prim_overflow));
1690       tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_prim_strm_pitch));
1691    }
1692 
1693    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1694 }
1695 
1696 template <chip CHIP>
1697 static void
1698 tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1699 {
1700    struct tu_physical_device *phys_dev = cmd->device->physical_device;
1701    const struct tu_framebuffer *fb = cmd->state.framebuffer;
1702 
1703    /* If this command buffer may be executed multiple times, then
1704     * viewports/scissor states may have been changed by previous executions
1705     * and we need to reset them before executing the binning IB.
1706     */
1707    if (!(cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) &&
1708        cmd->fdm_bin_patchpoints.size != 0) {
1709       unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
1710       VkExtent2D unscaled_frag_areas[num_views];
1711       for (unsigned i = 0; i < num_views; i++)
1712          unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
1713       VkRect2D bin = { { 0, 0 }, { fb->width, fb->height } };
1714       util_dynarray_foreach (&cmd->fdm_bin_patchpoints,
1715                              struct tu_fdm_bin_patchpoint, patch) {
1716          tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
1717          tu_cs_emit_qw(cs, patch->iova);
1718          patch->apply(cmd, cs, patch->data, bin, num_views, unscaled_frag_areas);
1719       }
1720 
1721       tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1722       tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
1723    }
1724 
1725    tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
1726 
1727    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1728    tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_VISIBILITY));
1729 
1730    tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1731    tu_cs_emit(cs, 0x1);
1732 
1733    tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1734    tu_cs_emit(cs, 0x1);
1735 
1736    tu_cs_emit_wfi(cs);
1737 
1738    tu_cs_emit_regs(cs,
1739                    A6XX_VFD_MODE_CNTL(.render_mode = BINNING_PASS));
1740 
1741    update_vsc_pipe(cmd, cs, phys_dev->info->num_vsc_pipes);
1742 
1743    if (CHIP == A6XX) {
1744       tu_cs_emit_regs(cs,
1745                      A6XX_PC_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));
1746 
1747       tu_cs_emit_regs(cs,
1748                      A6XX_VFD_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));
1749    }
1750 
1751    tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1752    tu_cs_emit(cs, UNK_2C);
1753 
1754    tu_cs_emit_regs(cs,
1755                    A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0));
1756 
1757    tu_cs_emit_regs(cs,
1758                    A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0));
1759 
1760    trace_start_binning_ib(&cmd->trace, cs);
1761 
1762    /* emit IB to binning drawcmds: */
1763    tu_cs_emit_call(cs, &cmd->draw_cs);
1764 
1765    trace_end_binning_ib(&cmd->trace, cs);
1766 
1767    /* Switching from the binning pass to the GMEM pass causes a switch from
1768     * PROGRAM_BINNING to PROGRAM, which invalidates const state (XS_CONST
1769     * states), so make sure these states are re-emitted (eventually these
1770     * states shouldn't exist at all with the shader prologue). Only VS and GS
1771     * are invalidated, as FS isn't emitted in the binning pass and we don't
1772     * use HW binning when tessellation is used.
1773     */
1774    tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
1775    tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
1776                   CP_SET_DRAW_STATE__0_DISABLE |
1777                   CP_SET_DRAW_STATE__0_GROUP_ID(TU_DRAW_STATE_CONST));
1778    tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
1779    tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
1780 
1781    tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1782    tu_cs_emit(cs, UNK_2D);
1783 
1784    /* This flush is probably required because the VSC, which produces the
1785     * visibility stream, is a client of UCHE, whereas the CP needs to read the
1786     * visibility stream (without caching) to do draw skipping. The
1787     * WFI+WAIT_FOR_ME combination guarantees that the binning commands
1788     * submitted are finished before reading the VSC regs (in
1789     * emit_vsc_overflow_test) or the VSC_DATA buffer directly (implicitly as
1790     * part of draws).
1791     */
1792    tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_CLEAN);
1793 
1794    tu_cs_emit_wfi(cs);
1795 
1796    tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
1797 
1798    emit_vsc_overflow_test(cmd, cs);
1799 
1800    tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1801    tu_cs_emit(cs, 0x0);
1802 
1803    tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1804    tu_cs_emit(cs, 0x0);
1805 }
1806 
1807 static struct tu_draw_state
1808 tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
1809                           const struct tu_subpass *subpass,
1810                           bool gmem)
1811 {
1812    const struct tu_tiling_config *tiling = cmd->state.tiling;
1813 
1814    /* Note: we could probably emit input attachments just once for the whole
1815     * renderpass; this would avoid emitting both sysmem/gmem versions.
1816     *
1817     * We emit two texture descriptors for each input, as a workaround for
1818     * d24s8/d32s8, which can be sampled as both float (depth) and integer
1819     * (stencil). tu_shader lowers uint input attachment loads to use the 2nd
1820     * descriptor in the pair.
1821     * TODO: a smarter workaround
1822     */
1823 
1824    if (!subpass->input_count)
1825       return (struct tu_draw_state) {};
1826 
1827    struct tu_cs_memory texture;
1828    VkResult result = tu_cs_alloc(&cmd->sub_cs, subpass->input_count * 2,
1829                                  A6XX_TEX_CONST_DWORDS, &texture);
1830    if (result != VK_SUCCESS) {
1831       vk_command_buffer_set_error(&cmd->vk, result);
1832       return (struct tu_draw_state) {};
1833    }
1834 
1835    for (unsigned i = 0; i < subpass->input_count * 2; i++) {
1836       uint32_t a = subpass->input_attachments[i / 2].attachment;
1837       if (a == VK_ATTACHMENT_UNUSED)
1838          continue;
1839 
1840       const struct tu_image_view *iview = cmd->state.attachments[a];
1841       const struct tu_render_pass_attachment *att =
1842          &cmd->state.pass->attachments[a];
1843       uint32_t *dst = &texture.map[A6XX_TEX_CONST_DWORDS * i];
1844       uint32_t gmem_offset = tu_attachment_gmem_offset(cmd, att, 0);
1845       uint32_t cpp = att->cpp;
1846 
1847       memcpy(dst, iview->view.descriptor, A6XX_TEX_CONST_DWORDS * 4);
1848 
1849       /* Cube descriptors require a different sampling instruction in the
1850        * shader; however, we don't know whether the image is a cube until the
1851        * start of a renderpass. We have to patch the descriptor to make it
1852        * compatible with how it is sampled in the shader.
1853        */
1854       enum a6xx_tex_type tex_type =
1855          (enum a6xx_tex_type)((dst[2] & A6XX_TEX_CONST_2_TYPE__MASK) >>
1856                               A6XX_TEX_CONST_2_TYPE__SHIFT);
1857       if (tex_type == A6XX_TEX_CUBE) {
1858          dst[2] &= ~A6XX_TEX_CONST_2_TYPE__MASK;
1859          dst[2] |= A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
1860 
1861          uint32_t depth = (dst[5] & A6XX_TEX_CONST_5_DEPTH__MASK) >>
1862                           A6XX_TEX_CONST_5_DEPTH__SHIFT;
1863          dst[5] &= ~A6XX_TEX_CONST_5_DEPTH__MASK;
1864          dst[5] |= A6XX_TEX_CONST_5_DEPTH(depth * 6);
1865       }
1866 
1867       if (i % 2 == 1 && att->format == VK_FORMAT_D24_UNORM_S8_UINT) {
1868          /* note this works because spec says fb and input attachments
1869           * must use identity swizzle
1870           *
1871           * Also we clear swap to WZYX.  This is because the view might have
1872           * picked XYZW to work better with border colors.
1873           */
1874          dst[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
1875             A6XX_TEX_CONST_0_SWAP__MASK |
1876             A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1877             A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
1878          if (!cmd->device->physical_device->info->a6xx.has_z24uint_s8uint) {
1879             dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_8_8_8_UINT) |
1880                A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_W) |
1881                A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) |
1882                A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) |
1883                A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE);
1884          } else {
1885             dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_Z24_UINT_S8_UINT) |
1886                A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_Y) |
1887                A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) |
1888                A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) |
1889                A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE);
1890          }
1891       }
1892 
1893       if (i % 2 == 1 && att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1894          dst[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
1895          dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_UINT);
1896          dst[2] &= ~(A6XX_TEX_CONST_2_PITCHALIGN__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
1897          dst[2] |= A6XX_TEX_CONST_2_PITCH(iview->stencil_pitch);
1898          dst[3] = 0;
1899          dst[4] = iview->stencil_base_addr;
1900          dst[5] = (dst[5] & 0xffff) | iview->stencil_base_addr >> 32;
1901 
1902          cpp = att->samples;
1903          gmem_offset = att->gmem_offset_stencil[cmd->state.gmem_layout];
1904       }
1905 
1906       if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem)
1907          continue;
1908 
1909       /* patched for gmem */
1910       dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
1911       dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
1912       dst[2] =
1913          A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
1914          A6XX_TEX_CONST_2_PITCH(tiling->tile0.width * cpp);
1915       /* Note: it seems the HW implicitly calculates the array pitch with the
1916        * GMEM tiling, so we don't need to specify the pitch ourselves.
1917        */
1918       dst[3] = 0;
1919       dst[4] = cmd->device->physical_device->gmem_base + gmem_offset;
1920       dst[5] &= A6XX_TEX_CONST_5_DEPTH__MASK;
1921       for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1922          dst[i] = 0;
1923    }
1924 
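   /* Build a small draw state binding the (possibly patched) descriptors as
    * FS textures: CP_LOAD_STATE6_FRAG loads them into the FS_TEX state block
    * and SP_FS_TEX_CONST / SP_FS_TEX_COUNT point the SP at the same array.
    */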
1925    struct tu_cs cs;
1926    struct tu_draw_state ds = tu_cs_draw_state(&cmd->sub_cs, &cs, 9);
1927 
1928    tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_FRAG, 3);
1929    tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1930                   CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1931                   CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1932                   CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
1933                   CP_LOAD_STATE6_0_NUM_UNIT(subpass->input_count * 2));
1934    tu_cs_emit_qw(&cs, texture.iova);
1935 
1936    tu_cs_emit_regs(&cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));
1937 
1938    tu_cs_emit_regs(&cs, A6XX_SP_FS_TEX_COUNT(subpass->input_count * 2));
1939 
1940    assert(cs.cur == cs.end); /* validate draw state size */
1941 
1942    return ds;
1943 }
1944 
1945 static void
1946 tu_set_input_attachments(struct tu_cmd_buffer *cmd, const struct tu_subpass *subpass)
1947 {
1948    struct tu_cs *cs = &cmd->draw_cs;
1949 
1950    tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 6);
1951    tu_cs_emit_draw_state(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
1952                          tu_emit_input_attachments(cmd, subpass, true));
1953    tu_cs_emit_draw_state(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
1954                          tu_emit_input_attachments(cmd, subpass, false));
1955 }
1956 
1957 static void
1958 tu_trace_start_render_pass(struct tu_cmd_buffer *cmd)
1959 {
1960    if (!u_trace_enabled(&cmd->device->trace_context))
1961       return;
1962 
1963    uint32_t load_cpp = 0;
1964    uint32_t store_cpp = 0;
1965    uint32_t clear_cpp = 0;
1966    bool has_depth = false;
1967    char ubwc[MAX_RTS + 3];
1968    for (uint32_t i = 0; i < cmd->state.pass->attachment_count; i++) {
1969       const struct tu_render_pass_attachment *attachment =
1970          &cmd->state.pass->attachments[i];
1971       if (attachment->load) {
1972          load_cpp += attachment->cpp;
1973       }
1974 
1975       if (attachment->store) {
1976          store_cpp += attachment->cpp;
1977       }
1978 
1979       if (attachment->clear_mask) {
1980          clear_cpp += attachment->cpp;
1981       }
1982 
1983       has_depth |= vk_format_has_depth(attachment->format);
1984    }
1985 
1986    uint8_t ubwc_len = 0;
1987    const struct tu_subpass *subpass = &cmd->state.pass->subpasses[0];
1988    for (uint32_t i = 0; i < subpass->color_count; i++) {
1989       uint32_t att = subpass->color_attachments[i].attachment;
1990       ubwc[ubwc_len++] = att == VK_ATTACHMENT_UNUSED ? '-'
1991                          : cmd->state.attachments[att]->view.ubwc_enabled
1992                             ? 'y'
1993                             : 'n';
1994    }
1995    if (subpass->depth_used) {
1996       ubwc[ubwc_len++] = '|';
1997       ubwc[ubwc_len++] =
1998          cmd->state.attachments[subpass->depth_stencil_attachment.attachment]
1999                ->view.ubwc_enabled
2000             ? 'y'
2001             : 'n';
2002    }
2003    ubwc[ubwc_len] = '\0';
2004 
2005    uint32_t max_samples = 0;
2006    for (uint32_t i = 0; i < cmd->state.pass->subpass_count; i++) {
2007       max_samples = MAX2(max_samples, cmd->state.pass->subpasses[i].samples);
2008    }
2009 
2010    trace_start_render_pass(&cmd->trace, &cmd->cs, cmd->state.framebuffer,
2011                            cmd->state.tiling, max_samples, clear_cpp,
2012                            load_cpp, store_cpp, has_depth, ubwc);
2013 }
2014 
2015 template <chip CHIP>
2016 static void
2017 tu_trace_end_render_pass(struct tu_cmd_buffer *cmd, bool gmem)
2018 {
2019    if (!u_trace_enabled(&cmd->device->trace_context))
2020       return;
2021 
2022    uint32_t avg_per_sample_bandwidth =
2023       cmd->state.rp.drawcall_bandwidth_per_sample_sum /
2024       MAX2(cmd->state.rp.drawcall_count, 1);
2025 
2026    struct u_trace_address addr = {};
2027    if (cmd->state.lrz.image_view) {
2028       struct tu_image *image = cmd->state.lrz.image_view->image;
2029       addr.bo = image->bo;
2030       addr.offset = (image->iova - image->bo->iova) + image->lrz_fc_offset +
2031                     offsetof(fd_lrzfc_layout<CHIP>, dir_track);
2032    }
2033 
2034    trace_end_render_pass(&cmd->trace, &cmd->cs, gmem,
2035                          cmd->state.rp.drawcall_count,
2036                          avg_per_sample_bandwidth, cmd->state.lrz.valid,
2037                          cmd->state.rp.lrz_disable_reason, addr);
2038 }
2039 
2040 static void
2041 tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd)
2042 {
2043    /* We need to re-emit any draw states that are patched in order for them to
2044     * be correctly added to the per-renderpass patchpoint list, even if they
2045     * are the same as before.
2046     */
2047    if (cmd->state.pass->has_fdm)
2048       cmd->state.dirty |= TU_CMD_DIRTY_FDM;
2049 
2050    /* We need to re-emit MSAA at the beginning of every renderpass because it
2051     * isn't part of a draw state that gets automatically re-emitted.
2052     */
2053    BITSET_SET(cmd->vk.dynamic_graphics_state.dirty,
2054               MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
2055    /* PC_PRIMITIVE_CNTL_0 isn't a part of a draw state and may be changed
2056     * by blits.
2057     */
2058    BITSET_SET(cmd->vk.dynamic_graphics_state.dirty,
2059               MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE);
2060 }
2061 
2062 template <chip CHIP>
2063 static void
2064 tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
2065                         struct tu_renderpass_result *autotune_result)
2066 {
2067    const struct tu_framebuffer *fb = cmd->state.framebuffer;
2068 
2069    tu_lrz_sysmem_begin<CHIP>(cmd, cs);
2070 
2071    assert(fb->width > 0 && fb->height > 0);
2072    tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
2073    tu6_emit_window_offset<CHIP>(cs, 0, 0);
2074 
2075    tu6_emit_bin_size<CHIP>(cs, 0, 0, {
2076       .render_mode = RENDERING_PASS,
2077       .force_lrz_write_dis =
2078          !cmd->device->physical_device->info->a6xx.has_lrz_feedback,
2079       .buffers_location = BUFFERS_IN_SYSMEM,
2080       .lrz_feedback_zmode_mask =
2081          cmd->device->physical_device->info->a6xx.has_lrz_feedback
2082             ? LRZ_FEEDBACK_EARLY_Z_OR_EARLY_LRZ_LATE_Z
2083             : LRZ_FEEDBACK_NONE,
2084    });
2085 
2086    if (CHIP == A7XX) {
2087       tu_cs_emit_regs(cs,
2088                      A7XX_RB_UNKNOWN_8812(0x3ff)); // all buffers in sysmem
2089       tu_cs_emit_regs(cs,
2090          A7XX_RB_UNKNOWN_8E06(cmd->device->physical_device->info->a6xx.magic.RB_UNKNOWN_8E06));
2091 
2092       tu_cs_emit_regs(cs, A7XX_GRAS_UNKNOWN_8007(0x0));
2093 
2094       tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2));
2095       tu_cs_emit_regs(cs, A7XX_RB_UNKNOWN_8E09(0x4));
2096 
2097       tu_cs_emit_regs(cs, A7XX_RB_BLIT_CLEAR_MODE(.clear_mode = CLEAR_MODE_SYSMEM));
2098    }
2099 
2100    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
2101    tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_DIRECT_RENDER));
2102 
2103    /* A7XX TODO: blob doesn't use CP_SKIP_IB2_ENABLE_* */
2104    tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
2105    tu_cs_emit(cs, 0x0);
2106 
2107    tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_SYSMEM);
2108 
2109    tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
2110    tu_cs_emit(cs, 0x1);
2111 
2112    tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
2113    tu_cs_emit(cs, 0x0);
2114 
2115    tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
2116 
2117    tu_cs_sanity_check(cs);
2118 }
2119 
2120 template <chip CHIP>
2121 static void
2122 tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
2123                       struct tu_renderpass_result *autotune_result)
2124 {
2125    tu_autotune_end_renderpass<CHIP>(cmd, cs, autotune_result);
2126 
2127    /* Do any resolves of the last subpass. These are handled in the
2128     * tile_store_cs in the gmem path.
2129     */
2130    tu6_emit_sysmem_resolves<CHIP>(cmd, cs, cmd->state.subpass);
2131 
2132    tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
2133 
2134    tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
2135    tu_cs_emit(cs, 0x0);
2136 
2137    tu_lrz_sysmem_end<CHIP>(cmd, cs);
2138 
2139    tu_cs_sanity_check(cs);
2140 }
2141 
2142 template <chip CHIP>
2143 static void
2144 tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
2145                       struct tu_renderpass_result *autotune_result)
2146 {
2147    struct tu_physical_device *phys_dev = cmd->device->physical_device;
2148    const struct tu_tiling_config *tiling = cmd->state.tiling;
2149    tu_lrz_tiling_begin<CHIP>(cmd, cs);
2150 
2151    tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
2152    tu_cs_emit(cs, 0x0);
2153 
2154    if (CHIP >= A7XX) {
2155       tu7_emit_tile_render_begin_regs(cs);
2156    }
2157 
2158    tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_GMEM);
2159 
2160    if (use_hw_binning(cmd)) {
2161       if (!cmd->vsc_initialized) {
2162          tu6_lazy_init_vsc(cmd);
2163       }
2164 
2165       /* We always emit VSC before each renderpass, because due to
2166        * skipsaverestore the underlying VSC registers may have become
2167        * invalid. Normally we'd need to WFI before setting these non-context
2168        * registers, but we should be safe because we're only setting them to
2169        * the same values they had before.
2170        *
2171        * TODO: On a6xx, we have to emit this per-bin or make the amble include
2172        * these registers, because CP_SET_BIN_DATA5_OFFSET will use the
2173        * register instead of the pseudo register and its value won't survive
2174        * across preemptions. The blob seems to take the second approach and
2175        * emits the preamble lazily.
2176        */
2177       tu_emit_vsc<CHIP>(cmd, cs);
2178 
2179       tu6_emit_bin_size<CHIP>(cs, tiling->tile0.width, tiling->tile0.height,
2180                               {
2181                                  .render_mode = BINNING_PASS,
2182                                  .buffers_location = BUFFERS_IN_GMEM,
2183                                  .lrz_feedback_zmode_mask =
2184                                     phys_dev->info->a6xx.has_lrz_feedback
2185                                        ? LRZ_FEEDBACK_EARLY_LRZ_LATE_Z
2186                                        : LRZ_FEEDBACK_NONE
2187                               });
2188 
2189       tu6_emit_render_cntl<CHIP>(cmd, cmd->state.subpass, cs, true);
2190 
2191       tu6_emit_binning_pass<CHIP>(cmd, cs);
2192 
2193       if (CHIP == A6XX) {
2194          tu_cs_emit_regs(cs,
2195                         A6XX_PC_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));
2196 
2197          tu_cs_emit_regs(cs,
2198                         A6XX_VFD_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));
2199       }
2200 
2201       tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
2202       tu_cs_emit(cs, 0x1);
2203       tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_LOCAL, 1);
2204       tu_cs_emit(cs, 0x1);
2205    } else {
2206       if (tiling->binning_possible) {
2207          /* Mark all tiles as visible for tu6_emit_cond_for_load_stores(), since
2208           * the actual binner didn't run.
2209           */
2210          int pipe_count = tiling->pipe_count.width * tiling->pipe_count.height;
2211          tu_cs_emit_pkt4(cs, REG_A6XX_VSC_STATE_REG(0), pipe_count);
2212          for (int i = 0; i < pipe_count; i++)
2213             tu_cs_emit(cs, ~0);
2214       }
2215    }
2216 
2217    if (tiling->binning_possible) {
2218       /* Upload state regs to memory to be restored on skipsaverestore
2219        * preemption.
2220        */
2221       tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
2222       tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_VSC_STATE_REG(0)) |
2223                      CP_REG_TO_MEM_0_CNT(32));
2224       tu_cs_emit_qw(cs, global_iova(cmd, vsc_state));
2225    }
2226 
2227    tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
2228 
2229    tu_cs_sanity_check(cs);
2230 }
2231 
2232 template <chip CHIP>
2233 static void
2234 tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
2235                 uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot,
2236                 const struct tu_image_view *fdm)
2237 {
2238    tu6_emit_tile_select<CHIP>(cmd, &cmd->cs, tx, ty, pipe, slot, fdm);
2239    tu_lrz_before_tile<CHIP>(cmd, &cmd->cs);
2240 
2241    trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs);
2242 
2243    /* Primitives that passed all tests are still counted in each tile
2244     * even with HW binning beforehand. Do not permit it.
2245     */
2246    if (cmd->state.prim_generated_query_running_before_rp)
2247       tu_emit_event_write<CHIP>(cmd, cs, FD_STOP_PRIMITIVE_CTRS);
2248 
2249    tu_cs_emit_call(cs, &cmd->draw_cs);
2250 
2251    if (cmd->state.prim_generated_query_running_before_rp)
2252       tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
2253 
2254    if (use_hw_binning(cmd)) {
2255       tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
2256       tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_END_OF_DRAWS) |
2257                      A6XX_CP_SET_MARKER_0_USES_GMEM);
2258    }
2259 
2260    /* Predicate is changed in draw_cs so we have to re-emit it */
2261    if (cmd->state.rp.draw_cs_writes_to_cond_pred)
2262       tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, false);
2263 
2264    tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
2265    tu_cs_emit(cs, 0x0);
2266 
2267    tu_cs_emit_call(cs, &cmd->tile_store_cs);
2268 
2269    tu_clone_trace_range(cmd, cs, cmd->trace_renderpass_start,
2270          cmd->trace_renderpass_end);
2271 
2272    tu_cs_emit_wfi(cs);
2273 
2274    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
2275    tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RENDER_END));
2276 
2277    tu_cs_sanity_check(cs);
2278 
2279    trace_end_draw_ib_gmem(&cmd->trace, &cmd->cs);
2280 }
2281 
2282 template <chip CHIP>
2283 static void
2284 tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
2285                     struct tu_renderpass_result *autotune_result)
2286 {
2287    tu_autotune_end_renderpass<CHIP>(cmd, cs, autotune_result);
2288 
2289    tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
2290 
2291    tu_lrz_tiling_end<CHIP>(cmd, cs);
2292 
2293    tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);
2294 
2295    tu_cs_sanity_check(cs);
2296 }
2297 
2298 template <chip CHIP>
2299 static void
2300 tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
2301                     struct tu_renderpass_result *autotune_result)
2302 {
2303    const struct tu_tiling_config *tiling = cmd->state.tiling;
2304    const struct tu_image_view *fdm = NULL;
2305 
2306    if (cmd->state.pass->fragment_density_map.attachment != VK_ATTACHMENT_UNUSED) {
2307       fdm = cmd->state.attachments[cmd->state.pass->fragment_density_map.attachment];
2308    }
2309 
2310    /* Create gmem stores now (at EndRenderPass time) because they need to
2311     * know whether to allow their conditional execution, which is tied to
2312     * state that is only known at the end of the renderpass. They will be
2313     * called from tu6_render_tile().
2314     */
2315    tu_cs_begin(&cmd->tile_store_cs);
2316    tu6_emit_tile_store<CHIP>(cmd, &cmd->tile_store_cs);
2317    tu_cs_end(&cmd->tile_store_cs);
2318 
2319    cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace);
2320 
2321    tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, autotune_result);
2322 
2323    /* Note: we reverse the order of walking the pipes and tiles on every
2324     * other row, to improve texture cache locality compared to raster order.
2325     */
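   /* For example, with a 3x2 pipe grid the walk visits pipes 0,1,2 on the
    * first pipe row and then 5,4,3 on the second; tiles within each pipe are
    * walked in the same serpentine fashion based on tile row parity.
    */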
2326    for (uint32_t py = 0; py < tiling->pipe_count.height; py++) {
2327       uint32_t pipe_row = py * tiling->pipe_count.width;
2328       for (uint32_t pipe_row_i = 0; pipe_row_i < tiling->pipe_count.width; pipe_row_i++) {
2329          uint32_t px;
2330          if (py & 1)
2331             px = tiling->pipe_count.width - 1 - pipe_row_i;
2332          else
2333             px = pipe_row_i;
2334          uint32_t pipe = pipe_row + px;
2335          uint32_t tx1 = px * tiling->pipe0.width;
2336          uint32_t ty1 = py * tiling->pipe0.height;
2337          uint32_t tx2 = MIN2(tx1 + tiling->pipe0.width, tiling->tile_count.width);
2338          uint32_t ty2 = MIN2(ty1 + tiling->pipe0.height, tiling->tile_count.height);
2339          uint32_t tile_row_stride = tx2 - tx1;
2340          uint32_t slot_row = 0;
2341          for (uint32_t ty = ty1; ty < ty2; ty++) {
2342             for (uint32_t tile_row_i = 0; tile_row_i < tile_row_stride; tile_row_i++) {
2343                uint32_t tx;
2344                if (ty & 1)
2345                   tx = tile_row_stride - 1 - tile_row_i;
2346                else
2347                   tx = tile_row_i;
2348                uint32_t slot = slot_row + tx;
2349                tu6_render_tile<CHIP>(cmd, &cmd->cs, tx1 + tx, ty, pipe, slot, fdm);
2350             }
2351             slot_row += tile_row_stride;
2352          }
2353       }
2354    }
2355 
2356    tu6_tile_render_end<CHIP>(cmd, &cmd->cs, autotune_result);
2357 
2358    tu_trace_end_render_pass<CHIP>(cmd, true);
2359 
2360    /* We have trashed the dynamically-emitted viewport, scissor, and FS params
2361     * via the patchpoints, so we need to re-emit them if they are reused for a
2362     * later render pass.
2363     */
2364    if (cmd->state.pass->has_fdm)
2365       cmd->state.dirty |= TU_CMD_DIRTY_FDM;
2366 
2367    /* tu6_render_tile has cloned these tracepoints for each tile */
2368    if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end))
2369       u_trace_disable_event_range(cmd->trace_renderpass_start,
2370                                   cmd->trace_renderpass_end);
2371 
2372    /* Reset the gmem store CS entry lists so that the next render pass
2373     * does its own stores.
2374     */
2375    tu_cs_discard_entries(&cmd->tile_store_cs);
2376 }
2377 
2378 template <chip CHIP>
2379 static void
2380 tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
2381                      struct tu_renderpass_result *autotune_result)
2382 {
2383    cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace);
2384 
2385    tu6_sysmem_render_begin<CHIP>(cmd, &cmd->cs, autotune_result);
2386 
2387    trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs);
2388 
2389    tu_cs_emit_call(&cmd->cs, &cmd->draw_cs);
2390 
2391    trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs);
2392 
2393    tu6_sysmem_render_end<CHIP>(cmd, &cmd->cs, autotune_result);
2394 
2395    tu_trace_end_render_pass<CHIP>(cmd, false);
2396 }
2397 
2398 template <chip CHIP>
2399 void
2400 tu_cmd_render(struct tu_cmd_buffer *cmd_buffer)
2401 {
2402    if (cmd_buffer->state.rp.has_tess)
2403       tu6_lazy_emit_tessfactor_addr<CHIP>(cmd_buffer);
2404 
2405    struct tu_renderpass_result *autotune_result = NULL;
2406    if (use_sysmem_rendering(cmd_buffer, &autotune_result))
2407       tu_cmd_render_sysmem<CHIP>(cmd_buffer, autotune_result);
2408    else
2409       tu_cmd_render_tiles<CHIP>(cmd_buffer, autotune_result);
2410 
2411    /* Outside of renderpasses we assume all draw states are disabled. We do
2412     * this outside the draw CS for the normal case where 3d gmem stores aren't
2413     * used.
2414     */
2415    tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs);
2416 
2417 }
2418 
2419 static void tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer)
2420 {
2421    /* discard draw_cs and draw_epilogue_cs entries now that the tiles are
2422       rendered */
2423    tu_cs_discard_entries(&cmd_buffer->draw_cs);
2424    tu_cs_begin(&cmd_buffer->draw_cs);
2425    tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs);
2426    tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
2427 
2428    cmd_buffer->state.pass = NULL;
2429    cmd_buffer->state.subpass = NULL;
2430    cmd_buffer->state.framebuffer = NULL;
2431    cmd_buffer->state.attachments = NULL;
2432    cmd_buffer->state.clear_values = NULL;
2433    cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* invalid value to prevent looking up gmem offsets */
2434    memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp));
2435 
2436    /* LRZ is not valid next time we use it */
2437    cmd_buffer->state.lrz.valid = false;
2438    cmd_buffer->state.dirty |= TU_CMD_DIRTY_LRZ;
2439 
2440    /* Patchpoints have been executed */
2441    util_dynarray_clear(&cmd_buffer->fdm_bin_patchpoints);
2442    ralloc_free(cmd_buffer->patchpoints_ctx);
2443    cmd_buffer->patchpoints_ctx = NULL;
2444 }
2445 
2446 static VkResult
2447 tu_create_cmd_buffer(struct vk_command_pool *pool,
2448                      VkCommandBufferLevel level,
2449                      struct vk_command_buffer **cmd_buffer_out)
2450 {
2451    struct tu_device *device =
2452       container_of(pool->base.device, struct tu_device, vk);
2453    struct tu_cmd_buffer *cmd_buffer;
2454 
2455    cmd_buffer = (struct tu_cmd_buffer *) vk_zalloc2(
2456       &device->vk.alloc, NULL, sizeof(*cmd_buffer), 8,
2457       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2458 
2459    if (cmd_buffer == NULL)
2460       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
2461 
2462    VkResult result = vk_command_buffer_init(pool, &cmd_buffer->vk,
2463                                             &tu_cmd_buffer_ops, level);
2464    if (result != VK_SUCCESS) {
2465       vk_free2(&device->vk.alloc, NULL, cmd_buffer);
2466       return result;
2467    }
2468 
2469    cmd_buffer->device = device;
2470 
2471    u_trace_init(&cmd_buffer->trace, &device->trace_context);
2472    list_inithead(&cmd_buffer->renderpass_autotune_results);
2473 
2474    tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096, "cmd cs");
2475    tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096, "draw cs");
2476    tu_cs_init(&cmd_buffer->tile_store_cs, device, TU_CS_MODE_GROW, 2048, "tile store cs");
2477    tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096, "draw epilogue cs");
2478    tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048, "draw sub cs");
2479    tu_cs_init(&cmd_buffer->pre_chain.draw_cs, device, TU_CS_MODE_GROW, 4096, "prechain draw cs");
2480    tu_cs_init(&cmd_buffer->pre_chain.draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096, "prechain draw epilogue cs");
2481 
2482    for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
2483       cmd_buffer->descriptors[i].push_set.base.type = VK_OBJECT_TYPE_DESCRIPTOR_SET;
2484 
2485    *cmd_buffer_out = &cmd_buffer->vk;
2486 
2487    return VK_SUCCESS;
2488 }
2489 
2490 static void
2491 tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
2492 {
2493    struct tu_cmd_buffer *cmd_buffer =
2494       container_of(vk_cmd_buffer, struct tu_cmd_buffer, vk);
2495 
2496    tu_cs_finish(&cmd_buffer->cs);
2497    tu_cs_finish(&cmd_buffer->draw_cs);
2498    tu_cs_finish(&cmd_buffer->tile_store_cs);
2499    tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
2500    tu_cs_finish(&cmd_buffer->sub_cs);
2501    tu_cs_finish(&cmd_buffer->pre_chain.draw_cs);
2502    tu_cs_finish(&cmd_buffer->pre_chain.draw_epilogue_cs);
2503 
2504    u_trace_fini(&cmd_buffer->trace);
2505 
2506    tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
2507 
2508    for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
2509       if (cmd_buffer->descriptors[i].push_set.layout)
2510          vk_descriptor_set_layout_unref(&cmd_buffer->device->vk,
2511                                         &cmd_buffer->descriptors[i].push_set.layout->vk);
2512       vk_free(&cmd_buffer->device->vk.alloc,
2513               cmd_buffer->descriptors[i].push_set.mapped_ptr);
2514    }
2515 
2516    ralloc_free(cmd_buffer->patchpoints_ctx);
2517    util_dynarray_fini(&cmd_buffer->fdm_bin_patchpoints);
2518 
2519    vk_command_buffer_finish(&cmd_buffer->vk);
2520    vk_free2(&cmd_buffer->device->vk.alloc, &cmd_buffer->vk.pool->alloc,
2521             cmd_buffer);
2522 }
2523 
2524 static void
2525 tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
2526                     UNUSED VkCommandBufferResetFlags flags)
2527 {
2528    struct tu_cmd_buffer *cmd_buffer =
2529       container_of(vk_cmd_buffer, struct tu_cmd_buffer, vk);
2530 
2531    vk_command_buffer_reset(&cmd_buffer->vk);
2532 
2533    tu_cs_reset(&cmd_buffer->cs);
2534    tu_cs_reset(&cmd_buffer->draw_cs);
2535    tu_cs_reset(&cmd_buffer->tile_store_cs);
2536    tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
2537    tu_cs_reset(&cmd_buffer->sub_cs);
2538    tu_cs_reset(&cmd_buffer->pre_chain.draw_cs);
2539    tu_cs_reset(&cmd_buffer->pre_chain.draw_epilogue_cs);
2540 
2541    tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
2542 
2543    for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
2544       memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
2545       if (cmd_buffer->descriptors[i].push_set.layout) {
2546          vk_descriptor_set_layout_unref(&cmd_buffer->device->vk,
2547                                         &cmd_buffer->descriptors[i].push_set.layout->vk);
2548       }
2549       vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->descriptors[i].push_set.mapped_ptr);
2550       memset(&cmd_buffer->descriptors[i].push_set, 0, sizeof(cmd_buffer->descriptors[i].push_set));
2551       cmd_buffer->descriptors[i].push_set.base.type = VK_OBJECT_TYPE_DESCRIPTOR_SET;
2552       cmd_buffer->descriptors[i].max_sets_bound = 0;
2553       cmd_buffer->descriptors[i].max_dynamic_offset_size = 0;
2554    }
2555 
2556    u_trace_fini(&cmd_buffer->trace);
2557    u_trace_init(&cmd_buffer->trace, &cmd_buffer->device->trace_context);
2558 
2559    cmd_buffer->state.max_vbs_bound = 0;
2560 
2561    cmd_buffer->vsc_initialized = false;
2562    cmd_buffer->prev_fsr_is_null = false;
2563 
2564    ralloc_free(cmd_buffer->patchpoints_ctx);
2565    cmd_buffer->patchpoints_ctx = NULL;
2566    util_dynarray_clear(&cmd_buffer->fdm_bin_patchpoints);
2567 }
2568 
2569 const struct vk_command_buffer_ops tu_cmd_buffer_ops = {
2570    .create = tu_create_cmd_buffer,
2571    .reset = tu_reset_cmd_buffer,
2572    .destroy = tu_cmd_buffer_destroy,
2573 };
2574 
2575 /* Initialize the cache, assuming all necessary flushes have happened but *not*
2576  * invalidations.
2577  */
2578 static void
2579 tu_cache_init(struct tu_cache_state *cache)
2580 {
2581    cache->flush_bits = 0;
2582    cache->pending_flush_bits = TU_CMD_FLAG_ALL_INVALIDATE;
2583 }
2584 
2585 /* Unlike the public entrypoint, this doesn't handle cache tracking or CCU
2586  * state tracking. It's used by the driver to insert its own command buffer
2587  * in the middle of a submit.
2588  */
2589 VkResult
2590 tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
2591                     const VkCommandBufferBeginInfo *pBeginInfo)
2592 {
2593    vk_command_buffer_begin(&cmd_buffer->vk, pBeginInfo);
2594 
2595    memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
2596    vk_dynamic_graphics_state_init(&cmd_buffer->vk.dynamic_graphics_state);
2597    cmd_buffer->vk.dynamic_graphics_state.vi = &cmd_buffer->state.vi;
2598    cmd_buffer->vk.dynamic_graphics_state.ms.sample_locations = &cmd_buffer->state.sl;
2599    cmd_buffer->state.index_size = 0xff; /* dirty restart index */
2600    cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* dirty value */
2601 
2602    tu_cache_init(&cmd_buffer->state.cache);
2603    tu_cache_init(&cmd_buffer->state.renderpass_cache);
2604    cmd_buffer->usage_flags = pBeginInfo->flags;
2605 
2606    tu_cs_begin(&cmd_buffer->cs);
2607    tu_cs_begin(&cmd_buffer->draw_cs);
2608    tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
2609 
2610    return VK_SUCCESS;
2611 }
2612 
2613 VKAPI_ATTR VkResult VKAPI_CALL
2614 tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
2615                       const VkCommandBufferBeginInfo *pBeginInfo)
2616 {
2617    VK_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2618    VkResult result = tu_cmd_buffer_begin(cmd_buffer, pBeginInfo);
2619    if (result != VK_SUCCESS)
2620       return result;
2621 
2622    /* Set up the initial configuration of the command buffer. */
2623    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
2624       trace_start_cmd_buffer(&cmd_buffer->trace, &cmd_buffer->cs, cmd_buffer);
2625 
2626       switch (cmd_buffer->queue_family_index) {
2627       case TU_QUEUE_GENERAL:
2628          TU_CALLX(cmd_buffer->device, tu6_init_hw)(cmd_buffer, &cmd_buffer->cs);
2629          break;
2630       default:
2631          break;
2632       }
2633    } else if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
2634       const bool pass_continue =
2635          pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
2636 
2637       trace_start_cmd_buffer(&cmd_buffer->trace,
2638             pass_continue ? &cmd_buffer->draw_cs : &cmd_buffer->cs, cmd_buffer);
2639 
2640       assert(pBeginInfo->pInheritanceInfo);
2641 
2642       cmd_buffer->inherited_pipeline_statistics =
2643          pBeginInfo->pInheritanceInfo->pipelineStatistics;
2644 
2645       vk_foreach_struct_const(ext, pBeginInfo->pInheritanceInfo) {
2646          switch (ext->sType) {
2647          case VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT: {
2648             const VkCommandBufferInheritanceConditionalRenderingInfoEXT *cond_rend =
2649                (VkCommandBufferInheritanceConditionalRenderingInfoEXT *) ext;
2650             cmd_buffer->state.predication_active = cond_rend->conditionalRenderingEnable;
2651             break;
2652          }
2653          default:
2654             break;
2655          }
2656       }
2657 
2658       if (pass_continue) {
2659          const VkCommandBufferInheritanceRenderingInfo *rendering_info =
2660             vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext,
2661                                  COMMAND_BUFFER_INHERITANCE_RENDERING_INFO);
2662 
2663          if (TU_DEBUG(DYNAMIC)) {
2664             rendering_info =
2665                vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level,
2666                                                                 pBeginInfo);
2667          }
2668 
2669          if (rendering_info) {
2670             tu_setup_dynamic_inheritance(cmd_buffer, rendering_info);
2671             cmd_buffer->state.pass = &cmd_buffer->dynamic_pass;
2672             cmd_buffer->state.subpass = &cmd_buffer->dynamic_subpass;
2673 
2674             const VkRenderingAttachmentLocationInfoKHR *location_info =
2675                vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext,
2676                                     RENDERING_ATTACHMENT_LOCATION_INFO_KHR);
2677             if (location_info) {
2678                vk_common_CmdSetRenderingAttachmentLocationsKHR(commandBuffer,
2679                                                                location_info);
2680             }
2681          } else {
2682             cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
2683             cmd_buffer->state.subpass =
2684                &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
2685          }
2686          tu_fill_render_pass_state(&cmd_buffer->state.vk_rp,
2687                                    cmd_buffer->state.pass,
2688                                    cmd_buffer->state.subpass);
2689          vk_cmd_set_cb_attachment_count(&cmd_buffer->vk,
2690                                         cmd_buffer->state.subpass->color_count);
2691          cmd_buffer->state.dirty |= TU_CMD_DIRTY_SUBPASS;
2692 
2693          cmd_buffer->patchpoints_ctx = ralloc_context(NULL);
2694 
2695          /* We can't set the gmem layout here, because the state.pass only has
2696           * to be compatible (same formats/sample counts) with the primary's
2697           * renderpass, rather than exactly equal.
2698           */
2699 
2700          tu_lrz_begin_secondary_cmdbuf(cmd_buffer);
2701       } else {
2702          /* When executing in the middle of another command buffer, the CCU
2703           * state is unknown.
2704           */
2705          cmd_buffer->state.ccu_state = TU_CMD_CCU_UNKNOWN;
2706       }
2707    }
2708 
2709    return VK_SUCCESS;
2710 }
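
/* For illustration only: a minimal application-side sketch (hypothetical
 * 'secondary', 'render_pass' and 'framebuffer' handles) of the call that
 * exercises the secondary-command-buffer inheritance path above:
 *
 *    VkCommandBufferInheritanceInfo inheritance = {
 *       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO,
 *       .renderPass = render_pass,
 *       .subpass = 0,
 *       .framebuffer = framebuffer,
 *    };
 *    VkCommandBufferBeginInfo begin_info = {
 *       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
 *       .flags = VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT,
 *       .pInheritanceInfo = &inheritance,
 *    };
 *    vkBeginCommandBuffer(secondary, &begin_info);
 *
 * With RENDER_PASS_CONTINUE_BIT set, the pass_continue branch above inherits
 * the render pass/subpass (or dynamic-rendering info) and sets up LRZ state
 * for the secondary.
 */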
2711 
2712 static struct tu_cs
2713 tu_cmd_dynamic_state(struct tu_cmd_buffer *cmd, uint32_t id, uint32_t size)
2714 {
2715    struct tu_cs cs;
2716 
2717    assert(id < ARRAY_SIZE(cmd->state.dynamic_state));
2718    cmd->state.dynamic_state[id] = tu_cs_draw_state(&cmd->sub_cs, &cs, size);
2719 
2720    /* note: this also avoids emitting draw states before renderpass clears,
2721     * which may use the 3D clear path (for MSAA cases)
2722     */
2723    if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)
2724       return cs;
2725 
2726    tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
2727    tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DYNAMIC + id, cmd->state.dynamic_state[id]);
2728 
2729    return cs;
2730 }
2731 
2732 static void
2733 tu_cmd_end_dynamic_state(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
2734                          uint32_t id)
2735 {
2736    assert(id < ARRAY_SIZE(cmd->state.dynamic_state));
2737    cmd->state.dynamic_state[id] = tu_cs_end_draw_state(&cmd->sub_cs, cs);
2738 
2739    /* note: this also avoids emitting draw states before renderpass clears,
2740     * which may use the 3D clear path (for MSAA cases)
2741     */
2742    if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)
2743       return;
2744 
2745    tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
2746    tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DYNAMIC + id, cmd->state.dynamic_state[id]);
2747 }
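
/* A minimal usage sketch for the two helpers above, assuming a hypothetical
 * caller and state id (STATE_ID) that reserves 8 dwords:
 *
 *    struct tu_cs sub = tu_cmd_dynamic_state(cmd, STATE_ID, 8);
 *    // ... emit the reserved 8 dwords of register state into 'sub' ...
 *
 * tu_cmd_dynamic_state() reserves a fixed-size draw state up front, while
 * tu_cmd_end_dynamic_state() is for callers that emit into a sub-stream first
 * and only learn the final size at the end. Both register the group at
 * TU_DRAW_STATE_DYNAMIC + id unless a full draw-state re-emit is already
 * pending (TU_CMD_DIRTY_DRAW_STATE).
 */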
2748 
2749 VKAPI_ATTR void VKAPI_CALL
2750 tu_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer,
2751                          uint32_t firstBinding,
2752                          uint32_t bindingCount,
2753                          const VkBuffer *pBuffers,
2754                          const VkDeviceSize *pOffsets,
2755                          const VkDeviceSize *pSizes,
2756                          const VkDeviceSize *pStrides)
2757 {
2758    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2759    struct tu_cs cs;
2760 
2761    cmd->state.max_vbs_bound = MAX2(
2762       cmd->state.max_vbs_bound, firstBinding + bindingCount);
2763 
2764    if (pStrides) {
2765       vk_cmd_set_vertex_binding_strides(&cmd->vk, firstBinding, bindingCount,
2766                                         pStrides);
2767    }
2768 
2769    cmd->state.vertex_buffers.iova =
2770       tu_cs_draw_state(&cmd->sub_cs, &cs, 4 * cmd->state.max_vbs_bound).iova;
2771 
2772    for (uint32_t i = 0; i < bindingCount; i++) {
2773       if (pBuffers[i] == VK_NULL_HANDLE) {
2774          cmd->state.vb[firstBinding + i].base = 0;
2775          cmd->state.vb[firstBinding + i].size = 0;
2776       } else {
2777          struct tu_buffer *buf = tu_buffer_from_handle(pBuffers[i]);
2778          cmd->state.vb[firstBinding + i].base = buf->iova + pOffsets[i];
2779          cmd->state.vb[firstBinding + i].size =
2780             vk_buffer_range(&buf->vk, pOffsets[i], pSizes ? pSizes[i] : VK_WHOLE_SIZE);
2781       }
2782    }
2783 
2784    for (uint32_t i = 0; i < cmd->state.max_vbs_bound; i++) {
2785       tu_cs_emit_regs(&cs,
2786                       A6XX_VFD_FETCH_BASE(i, .qword = cmd->state.vb[i].base),
2787                       A6XX_VFD_FETCH_SIZE(i, cmd->state.vb[i].size));
2788    }
2789 
2790    cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
2791 }
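
/* For illustration only: an application-side call (hypothetical 'cmd_buf' and
 * 'vbo' handles) that lands in tu_CmdBindVertexBuffers2() above. A
 * VK_NULL_HANDLE entry (with the nullDescriptor feature) simply zeroes the
 * corresponding VFD_FETCH base/size:
 *
 *    VkBuffer bufs[2] = { vbo, VK_NULL_HANDLE };
 *    VkDeviceSize offsets[2] = { 0, 0 };
 *    VkDeviceSize sizes[2] = { VK_WHOLE_SIZE, VK_WHOLE_SIZE };
 *    VkDeviceSize strides[2] = { 16, 0 };
 *    vkCmdBindVertexBuffers2(cmd_buf, 0, 2, bufs, offsets, sizes, strides);
 *
 * Note that the registers are re-emitted for all bindings up to
 * max_vbs_bound, not just for the range passed in this call.
 */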
2792 
2793 VKAPI_ATTR void VKAPI_CALL
2794 tu_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer,
2795                           VkBuffer buffer,
2796                           VkDeviceSize offset,
2797                           VkDeviceSize size,
2798                           VkIndexType indexType)
2799 {
2800    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2801    VK_FROM_HANDLE(tu_buffer, buf, buffer);
2802 
2803    size = buf ? vk_buffer_range(&buf->vk, offset, size) : 0;
2804 
2805    uint32_t index_size, index_shift;
2806    uint32_t restart_index = vk_index_to_restart(indexType);
2807 
2808    switch (indexType) {
2809    case VK_INDEX_TYPE_UINT16:
2810       index_size = INDEX4_SIZE_16_BIT;
2811       index_shift = 1;
2812       break;
2813    case VK_INDEX_TYPE_UINT32:
2814       index_size = INDEX4_SIZE_32_BIT;
2815       index_shift = 2;
2816       break;
2817    case VK_INDEX_TYPE_UINT8_KHR:
2818       index_size = INDEX4_SIZE_8_BIT;
2819       index_shift = 0;
2820       break;
2821    default:
2822       unreachable("invalid VkIndexType");
2823    }
2824 
2825    if (buf) {
2826       /* initialize/update the restart index */
2827       if (cmd->state.index_size != index_size)
2828          tu_cs_emit_regs(&cmd->draw_cs, A6XX_PC_RESTART_INDEX(restart_index));
2829 
2830       cmd->state.index_va = buf->iova + offset;
2831       cmd->state.max_index_count = size >> index_shift;
2832       cmd->state.index_size = index_size;
2833    } else {
2834       cmd->state.index_va = 0;
2835       cmd->state.max_index_count = 0;
2836       cmd->state.index_size = 0;
2837    }
2838 }
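
/* Worked example of the math above: binding VK_INDEX_TYPE_UINT16 with a
 * 1024-byte range gives index_shift = 1, so max_index_count = 1024 >> 1 =
 * 512 indices, and PC_RESTART_INDEX is reprogrammed to 0xffff only if the
 * index size actually changed since the last bind.
 */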
2839 
2840 template <chip CHIP>
2841 static void
2842 tu6_emit_descriptor_sets(struct tu_cmd_buffer *cmd,
2843                          VkPipelineBindPoint bind_point)
2844 {
2845    struct tu_descriptor_state *descriptors_state =
2846       tu_get_descriptors_state(cmd, bind_point);
2847    uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg;
2848    struct tu_cs *cs, state_cs;
2849 
2850    if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
2851       sp_bindless_base_reg = __SP_BINDLESS_BASE_DESCRIPTOR<CHIP>(0, {}).reg;
2852       hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0);
2853 
2854       if (CHIP == A6XX) {
2855          cmd->state.desc_sets =
2856             tu_cs_draw_state(&cmd->sub_cs, &state_cs,
2857                              4 + 4 * descriptors_state->max_sets_bound +
2858                              (descriptors_state->max_dynamic_offset_size ? 6 : 0));
2859       } else {
2860          cmd->state.desc_sets =
2861             tu_cs_draw_state(&cmd->sub_cs, &state_cs,
2862                              3 + 2 * descriptors_state->max_sets_bound +
2863                              (descriptors_state->max_dynamic_offset_size ? 3 : 0));
2864       }
2865       cs = &state_cs;
2866    } else {
2867       assert(bind_point == VK_PIPELINE_BIND_POINT_COMPUTE);
2868 
2869       sp_bindless_base_reg = __SP_CS_BINDLESS_BASE_DESCRIPTOR<CHIP>(0, {}).reg;
2870       hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0);
2871 
2872       cs = &cmd->cs;
2873    }
2874 
2875    tu_cs_emit_pkt4(cs, sp_bindless_base_reg, 2 * descriptors_state->max_sets_bound);
2876    tu_cs_emit_array(cs, (const uint32_t*)descriptors_state->set_iova, 2 * descriptors_state->max_sets_bound);
2877    if (CHIP == A6XX) {
2878       tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg, 2 * descriptors_state->max_sets_bound);
2879       tu_cs_emit_array(cs, (const uint32_t*)descriptors_state->set_iova, 2 * descriptors_state->max_sets_bound);
2880    }
2881 
2882    /* Dynamic descriptors get the reserved descriptor set. */
2883    if (descriptors_state->max_dynamic_offset_size) {
2884       int reserved_set_idx = cmd->device->physical_device->reserved_set_idx;
2885       assert(reserved_set_idx >= 0); /* reserved set must be bound */
2886 
2887       tu_cs_emit_pkt4(cs, sp_bindless_base_reg + reserved_set_idx * 2, 2);
2888       tu_cs_emit_qw(cs, descriptors_state->set_iova[reserved_set_idx]);
2889       if (CHIP == A6XX) {
2890          tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg + reserved_set_idx * 2, 2);
2891          tu_cs_emit_qw(cs, descriptors_state->set_iova[reserved_set_idx]);
2892       }
2893    }
2894 
2895    tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
2896       .cs_bindless = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE ? CHIP == A6XX ? 0x1f : 0xff : 0,
2897       .gfx_bindless = bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS ? CHIP == A6XX ? 0x1f : 0xff : 0,
2898    ));
2899 
2900    if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
2901       assert(cs->cur == cs->end); /* validate draw state size */
2902       /* note: this also avoids emitting draw states before renderpass clears,
2903        * which may use the 3D clear path (for MSAA cases)
2904        */
2905       if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
2906          tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
2907          tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets);
2908       }
2909    }
2910 }
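
/* Size accounting for the draw state reserved above, derived from what this
 * function emits: on A6XX the bindless bases go to both SP and HLSQ (two
 * CP_PKT4 packets of 1 + 2 * max_sets_bound dwords each) plus the 2-dword
 * HLSQ_INVALIDATE_CMD write, i.e. 4 + 4 * max_sets_bound, with the optional
 * reserved (dynamic) set adding two more 3-dword packets. On later chips only
 * the SP copy exists, hence 3 + 2 * max_sets_bound (+3). The
 * assert(cs->cur == cs->end) in the graphics path checks this bookkeeping.
 */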
2911 
2912 /* We lazily emit the draw state for descriptor sets at draw time, so that we can
2913  * batch together multiple tu_CmdBindDescriptorSets() calls.  ANGLE and zink
2914  * will often emit multiple bind calls in a draw.
2915  */
2916 static void
2917 tu_dirty_desc_sets(struct tu_cmd_buffer *cmd,
2918                    VkPipelineBindPoint pipelineBindPoint)
2919 {
2920    if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) {
2921       cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESC_SETS;
2922    } else {
2923       assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS);
2924       cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS;
2925    }
2926 }
2927 
2928 static void
2929 tu_bind_descriptor_sets(struct tu_cmd_buffer *cmd,
2930                         const VkBindDescriptorSetsInfoKHR *info,
2931                         VkPipelineBindPoint bind_point)
2932 {
2933    VK_FROM_HANDLE(tu_pipeline_layout, layout, info->layout);
2934    unsigned dyn_idx = 0;
2935 
2936    struct tu_descriptor_state *descriptors_state =
2937       tu_get_descriptors_state(cmd, bind_point);
2938 
2939    descriptors_state->max_sets_bound =
2940       MAX2(descriptors_state->max_sets_bound,
2941            info->firstSet + info->descriptorSetCount);
2942 
2943    unsigned dynamic_offset_offset = 0;
2944    for (unsigned i = 0; i < info->firstSet; i++) {
2945       dynamic_offset_offset += layout->set[i].layout->dynamic_offset_size;
2946    }
2947 
2948    for (unsigned i = 0; i < info->descriptorSetCount; ++i) {
2949       unsigned idx = i + info->firstSet;
2950       VK_FROM_HANDLE(tu_descriptor_set, set, info->pDescriptorSets[i]);
2951 
2952       descriptors_state->sets[idx] = set;
2953       descriptors_state->set_iova[idx] = set ?
2954          (set->va | BINDLESS_DESCRIPTOR_64B) : 0;
2955 
2956       if (!set)
2957          continue;
2958 
2959       if (set->layout->has_inline_uniforms)
2960          cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS;
2961 
2962       if (!set->layout->dynamic_offset_size)
2963          continue;
2964 
2965       uint32_t *src = set->dynamic_descriptors;
2966       uint32_t *dst = descriptors_state->dynamic_descriptors +
2967          dynamic_offset_offset / 4;
2968       for (unsigned j = 0; j < set->layout->binding_count; j++) {
2969          struct tu_descriptor_set_binding_layout *binding =
2970             &set->layout->binding[j];
2971          if (vk_descriptor_type_is_dynamic(binding->type)) {
2972             for (unsigned k = 0; k < binding->array_size; k++, dyn_idx++) {
2973                assert(dyn_idx < info->dynamicOffsetCount);
2974                uint32_t offset = info->pDynamicOffsets[dyn_idx];
2975                memcpy(dst, src, binding->size);
2976 
2977                if (binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) {
2978                   /* Note: we can assume here that the addition won't roll
2979                    * over and change the SIZE field.
2980                    */
2981                   uint64_t va = src[0] | ((uint64_t)src[1] << 32);
2982                   va += offset;
2983                   dst[0] = va;
2984                   dst[1] = va >> 32;
2985                } else {
2986                   uint32_t *dst_desc = dst;
2987                   for (unsigned i = 0;
2988                        i < binding->size / (4 * A6XX_TEX_CONST_DWORDS);
2989                        i++, dst_desc += A6XX_TEX_CONST_DWORDS) {
2990                      /* Note: A6XX_TEX_CONST_5_DEPTH is always 0 */
2991                      uint64_t va = dst_desc[4] | ((uint64_t)dst_desc[5] << 32);
2992                      uint32_t desc_offset =
2993                         (dst_desc[2] &
2994                          A6XX_TEX_CONST_2_STARTOFFSETTEXELS__MASK) >>
2995                         A6XX_TEX_CONST_2_STARTOFFSETTEXELS__SHIFT;
2996 
2997                      /* Use descriptor's format to determine the shift amount
2998                       * that's to be used on the offset value.
2999                       */
3000                      uint32_t format = (dst_desc[0] &
3001                                         A6XX_TEX_CONST_0_FMT__MASK) >>
3002                                        A6XX_TEX_CONST_0_FMT__SHIFT;
3003                      unsigned offset_shift;
3004                      switch (format) {
3005                      case FMT6_16_UINT:
3006                         offset_shift = 1;
3007                         break;
3008                      case FMT6_32_UINT:
3009                         offset_shift = 2;
3010                         break;
3011                      case FMT6_8_UINT:
3012                      default:
3013                         offset_shift = 0;
3014                         break;
3015                      }
3016 
3017                      va += desc_offset << offset_shift;
3018                      va += offset;
3019                      unsigned new_offset = (va & 0x3f) >> offset_shift;
3020                      va &= ~0x3full;
3021                      dst_desc[4] = va;
3022                      dst_desc[5] = va >> 32;
3023                      dst_desc[2] =
3024                         (dst_desc[2] & ~A6XX_TEX_CONST_2_STARTOFFSETTEXELS__MASK) |
3025                         A6XX_TEX_CONST_2_STARTOFFSETTEXELS(new_offset);
3026                   }
3027                }
3028 
3029                dst += binding->size / 4;
3030                src += binding->size / 4;
3031             }
3032          }
3033       }
3034 
3035       dynamic_offset_offset += layout->set[idx].layout->dynamic_offset_size;
3036    }
3037    assert(dyn_idx == info->dynamicOffsetCount);
3038 
3039    if (dynamic_offset_offset) {
3040       descriptors_state->max_dynamic_offset_size =
3041          MAX2(descriptors_state->max_dynamic_offset_size, dynamic_offset_offset);
3042 
3043       /* allocate and fill out dynamic descriptor set */
3044       struct tu_cs_memory dynamic_desc_set;
3045       int reserved_set_idx = cmd->device->physical_device->reserved_set_idx;
3046       VkResult result =
3047          tu_cs_alloc(&cmd->sub_cs,
3048                      descriptors_state->max_dynamic_offset_size /
3049                      (4 * A6XX_TEX_CONST_DWORDS),
3050                      A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
3051       if (result != VK_SUCCESS) {
3052          vk_command_buffer_set_error(&cmd->vk, result);
3053          return;
3054       }
3055 
3056       memcpy(dynamic_desc_set.map, descriptors_state->dynamic_descriptors,
3057              descriptors_state->max_dynamic_offset_size);
3058       assert(reserved_set_idx >= 0); /* reserved set must be bound */
3059       descriptors_state->set_iova[reserved_set_idx] = dynamic_desc_set.iova | BINDLESS_DESCRIPTOR_64B;
3060    }
3061 
3062    tu_dirty_desc_sets(cmd, bind_point);
3063 }
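
/* A small worked example of the dynamic-descriptor patching above, assuming a
 * single UNIFORM_BUFFER_DYNAMIC binding whose descriptor points at VA
 * 0x10000000: a vkCmdBindDescriptorSets() call with pDynamicOffsets[0] ==
 * 0x100 copies the descriptor into the per-command-buffer dynamic area with
 * its address rewritten to 0x10000100, leaving the SIZE field alone (the
 * addition is assumed not to carry into it). Other dynamic descriptor types
 * go through the texel-offset path instead, splitting the byte offset into a
 * 64-byte-aligned base plus a STARTOFFSETTEXELS remainder scaled by the
 * descriptor format.
 */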
3064 
3065 VKAPI_ATTR void VKAPI_CALL
3066 tu_CmdBindDescriptorSets2KHR(
3067    VkCommandBuffer commandBuffer,
3068    const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo)
3069 {
3070    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3071 
3072    if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
3073       tu_bind_descriptor_sets(cmd, pBindDescriptorSetsInfo,
3074                               VK_PIPELINE_BIND_POINT_COMPUTE);
3075    }
3076 
3077    if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) {
3078       tu_bind_descriptor_sets(cmd, pBindDescriptorSetsInfo,
3079                               VK_PIPELINE_BIND_POINT_GRAPHICS);
3080    }
3081 }
3082 
3083 VKAPI_ATTR void VKAPI_CALL
3084 tu_CmdBindDescriptorBuffersEXT(
3085    VkCommandBuffer commandBuffer,
3086    uint32_t bufferCount,
3087    const VkDescriptorBufferBindingInfoEXT *pBindingInfos)
3088 {
3089    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3090 
3091    for (unsigned i = 0; i < bufferCount; i++)
3092       cmd->state.descriptor_buffer_iova[i] = pBindingInfos[i].address;
3093 }
3094 
3095 static void
3096 tu_set_descriptor_buffer_offsets(
3097    struct tu_cmd_buffer *cmd,
3098    const VkSetDescriptorBufferOffsetsInfoEXT *info,
3099    VkPipelineBindPoint bind_point)
3100 {
3101    VK_FROM_HANDLE(tu_pipeline_layout, layout, info->layout);
3102 
3103    struct tu_descriptor_state *descriptors_state =
3104       tu_get_descriptors_state(cmd, bind_point);
3105 
3106    descriptors_state->max_sets_bound = MAX2(descriptors_state->max_sets_bound,
3107                                             info->firstSet + info->setCount);
3108 
3109    for (unsigned i = 0; i < info->setCount; ++i) {
3110       unsigned idx = i + info->firstSet;
3111       struct tu_descriptor_set_layout *set_layout = layout->set[idx].layout;
3112 
3113       descriptors_state->set_iova[idx] =
3114          (cmd->state.descriptor_buffer_iova[info->pBufferIndices[i]] +
3115           info->pOffsets[i]) |
3116          BINDLESS_DESCRIPTOR_64B;
3117 
3118       if (set_layout->has_inline_uniforms)
3119          cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS;
3120    }
3121 
3122    tu_dirty_desc_sets(cmd, bind_point);
3123 }
3124 
3125 VKAPI_ATTR void VKAPI_CALL
3126 tu_CmdSetDescriptorBufferOffsets2EXT(
3127    VkCommandBuffer commandBuffer,
3128    const VkSetDescriptorBufferOffsetsInfoEXT *pSetDescriptorBufferOffsetsInfo)
3129 {
3130    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3131 
3132    if (pSetDescriptorBufferOffsetsInfo->stageFlags &
3133        VK_SHADER_STAGE_COMPUTE_BIT) {
3134       tu_set_descriptor_buffer_offsets(cmd, pSetDescriptorBufferOffsetsInfo,
3135                                        VK_PIPELINE_BIND_POINT_COMPUTE);
3136    }
3137 
3138    if (pSetDescriptorBufferOffsetsInfo->stageFlags &
3139        VK_SHADER_STAGE_ALL_GRAPHICS) {
3140       tu_set_descriptor_buffer_offsets(cmd, pSetDescriptorBufferOffsetsInfo,
3141                                        VK_PIPELINE_BIND_POINT_GRAPHICS);
3142    }
3143 }
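
/* For illustration only: VK_EXT_descriptor_buffer usage (hypothetical
 * 'cmd_buf' and 'pipeline_layout' handles, hypothetical 'addr' value) that
 * drives the two entrypoints above:
 *
 *    VkDescriptorBufferBindingInfoEXT binding = {
 *       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_BUFFER_BINDING_INFO_EXT,
 *       .address = addr,   // from vkGetBufferDeviceAddress()
 *       .usage = VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT,
 *    };
 *    vkCmdBindDescriptorBuffersEXT(cmd_buf, 1, &binding);
 *
 *    uint32_t buffer_index = 0;
 *    VkDeviceSize offset = 0;
 *    VkSetDescriptorBufferOffsetsInfoEXT offsets = {
 *       .sType = VK_STRUCTURE_TYPE_SET_DESCRIPTOR_BUFFER_OFFSETS_INFO_EXT,
 *       .stageFlags = VK_SHADER_STAGE_ALL_GRAPHICS,
 *       .layout = pipeline_layout,
 *       .firstSet = 0,
 *       .setCount = 1,
 *       .pBufferIndices = &buffer_index,
 *       .pOffsets = &offset,
 *    };
 *    vkCmdSetDescriptorBufferOffsets2EXT(cmd_buf, &offsets);
 *
 * The driver only records base address + offset per set (tagged with
 * BINDLESS_DESCRIPTOR_64B) and defers the register emission to draw time via
 * tu_dirty_desc_sets().
 */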
3144 
3145 static void
3146 tu_bind_descriptor_buffer_embedded_samplers(
3147    struct tu_cmd_buffer *cmd,
3148    const VkBindDescriptorBufferEmbeddedSamplersInfoEXT *info,
3149    VkPipelineBindPoint bind_point)
3150 {
3151    VK_FROM_HANDLE(tu_pipeline_layout, layout, info->layout);
3152 
3153    struct tu_descriptor_set_layout *set_layout =
3154       layout->set[info->set].layout;
3155 
3156    struct tu_descriptor_state *descriptors_state =
3157       tu_get_descriptors_state(cmd, bind_point);
3158 
3159    descriptors_state->max_sets_bound =
3160       MAX2(descriptors_state->max_sets_bound, info->set + 1);
3161 
3162    descriptors_state->set_iova[info->set] =
3163       set_layout->embedded_samplers->iova | BINDLESS_DESCRIPTOR_64B;
3164 
3165    tu_dirty_desc_sets(cmd, bind_point);
3166 }
3167 
3168 VKAPI_ATTR void VKAPI_CALL
3169 tu_CmdBindDescriptorBufferEmbeddedSamplers2EXT(
3170    VkCommandBuffer commandBuffer,
3171    const VkBindDescriptorBufferEmbeddedSamplersInfoEXT
3172       *pBindDescriptorBufferEmbeddedSamplersInfo)
3173 {
3174    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3175 
3176    if (pBindDescriptorBufferEmbeddedSamplersInfo->stageFlags &
3177        VK_SHADER_STAGE_COMPUTE_BIT) {
3178       tu_bind_descriptor_buffer_embedded_samplers(
3179          cmd, pBindDescriptorBufferEmbeddedSamplersInfo,
3180          VK_PIPELINE_BIND_POINT_COMPUTE);
3181    }
3182 
3183    if (pBindDescriptorBufferEmbeddedSamplersInfo->stageFlags &
3184        VK_SHADER_STAGE_ALL_GRAPHICS) {
3185       tu_bind_descriptor_buffer_embedded_samplers(
3186          cmd, pBindDescriptorBufferEmbeddedSamplersInfo,
3187          VK_PIPELINE_BIND_POINT_GRAPHICS);
3188    }
3189 }
3190 
3191 static VkResult
3192 tu_push_descriptor_set_update_layout(struct tu_device *device,
3193                                      struct tu_descriptor_set *set,
3194                                      struct tu_descriptor_set_layout *layout)
3195 {
3196    if (set->layout == layout)
3197       return VK_SUCCESS;
3198 
3199    if (set->layout)
3200       vk_descriptor_set_layout_unref(&device->vk, &set->layout->vk);
3201    vk_descriptor_set_layout_ref(&layout->vk);
3202    set->layout = layout;
3203 
3204    if (set->host_size < layout->size) {
3205       void *new_buf =
3206          vk_realloc(&device->vk.alloc, set->mapped_ptr, layout->size, 8,
3207                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
3208       if (!new_buf)
3209          return VK_ERROR_OUT_OF_HOST_MEMORY;
3210       set->mapped_ptr = (uint32_t *) new_buf;
3211       set->host_size = layout->size;
3212    }
3213    return VK_SUCCESS;
3214 }
3215 
3216 static void
3217 tu_push_descriptor_set(struct tu_cmd_buffer *cmd,
3218                        const VkPushDescriptorSetInfoKHR *info,
3219                        VkPipelineBindPoint bind_point)
3220 {
3221    VK_FROM_HANDLE(tu_pipeline_layout, pipe_layout, info->layout);
3222    struct tu_descriptor_set_layout *layout =
3223       pipe_layout->set[info->set].layout;
3224    struct tu_descriptor_set *set =
3225       &tu_get_descriptors_state(cmd, bind_point)->push_set;
3226 
3227    struct tu_cs_memory set_mem;
3228    VkResult result = tu_cs_alloc(&cmd->sub_cs,
3229                                  DIV_ROUND_UP(layout->size, A6XX_TEX_CONST_DWORDS * 4),
3230                                  A6XX_TEX_CONST_DWORDS, &set_mem);
3231    if (result != VK_SUCCESS) {
3232       vk_command_buffer_set_error(&cmd->vk, result);
3233       return;
3234    }
3235 
3236    result = tu_push_descriptor_set_update_layout(cmd->device, set, layout);
3237    if (result != VK_SUCCESS) {
3238       vk_command_buffer_set_error(&cmd->vk, result);
3239       return;
3240    }
3241 
3242    tu_update_descriptor_sets(cmd->device, tu_descriptor_set_to_handle(set),
3243                              info->descriptorWriteCount,
3244                              info->pDescriptorWrites, 0, NULL);
3245 
3246    memcpy(set_mem.map, set->mapped_ptr, layout->size);
3247    set->va = set_mem.iova;
3248 
3249    const VkDescriptorSet desc_set[] = { tu_descriptor_set_to_handle(set) };
3250    vk_common_CmdBindDescriptorSets(tu_cmd_buffer_to_handle(cmd), bind_point,
3251                                    info->layout, info->set, 1, desc_set, 0,
3252                                    NULL);
3253 }
3254 
3255 VKAPI_ATTR void VKAPI_CALL
3256 tu_CmdPushDescriptorSet2KHR(
3257    VkCommandBuffer commandBuffer,
3258    const VkPushDescriptorSetInfoKHR *pPushDescriptorSetInfo)
3259 {
3260    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3261 
3262    if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
3263       tu_push_descriptor_set(cmd, pPushDescriptorSetInfo,
3264                              VK_PIPELINE_BIND_POINT_COMPUTE);
3265    }
3266 
3267    if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) {
3268       tu_push_descriptor_set(cmd, pPushDescriptorSetInfo,
3269                              VK_PIPELINE_BIND_POINT_GRAPHICS);
3270    }
3271 }
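
/* For illustration only: a push-descriptor call (hypothetical 'cmd_buf',
 * 'ubo' and 'pipeline_layout' handles) that reaches tu_push_descriptor_set()
 * above:
 *
 *    VkDescriptorBufferInfo buf_info = { ubo, 0, VK_WHOLE_SIZE };
 *    VkWriteDescriptorSet write = {
 *       .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
 *       .dstBinding = 0,
 *       .descriptorCount = 1,
 *       .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
 *       .pBufferInfo = &buf_info,
 *    };
 *    VkPushDescriptorSetInfoKHR info = {
 *       .sType = VK_STRUCTURE_TYPE_PUSH_DESCRIPTOR_SET_INFO_KHR,
 *       .stageFlags = VK_SHADER_STAGE_ALL_GRAPHICS,
 *       .layout = pipeline_layout,
 *       .set = 0,
 *       .descriptorWriteCount = 1,
 *       .pDescriptorWrites = &write,
 *    };
 *    vkCmdPushDescriptorSet2KHR(cmd_buf, &info);
 *
 * The pushed set is materialized in sub_cs memory and then bound through the
 * common vkCmdBindDescriptorSets path.
 */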
3272 
3273 VKAPI_ATTR void VKAPI_CALL
3274 tu_CmdPushDescriptorSetWithTemplate2KHR(
3275    VkCommandBuffer commandBuffer,
3276    const VkPushDescriptorSetWithTemplateInfoKHR
3277       *pPushDescriptorSetWithTemplateInfo)
3278 {
3279    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3280    VK_FROM_HANDLE(tu_pipeline_layout, pipe_layout,
3281                   pPushDescriptorSetWithTemplateInfo->layout);
3282    VK_FROM_HANDLE(
3283       tu_descriptor_update_template, templ,
3284       pPushDescriptorSetWithTemplateInfo->descriptorUpdateTemplate);
3285    struct tu_descriptor_set_layout *layout =
3286       pipe_layout->set[pPushDescriptorSetWithTemplateInfo->set].layout;
3287    struct tu_descriptor_set *set =
3288       &tu_get_descriptors_state(cmd, templ->bind_point)->push_set;
3289 
3290    struct tu_cs_memory set_mem;
3291    VkResult result = tu_cs_alloc(&cmd->sub_cs,
3292                                  DIV_ROUND_UP(layout->size, A6XX_TEX_CONST_DWORDS * 4),
3293                                  A6XX_TEX_CONST_DWORDS, &set_mem);
3294    if (result != VK_SUCCESS) {
3295       vk_command_buffer_set_error(&cmd->vk, result);
3296       return;
3297    }
3298 
3299    result = tu_push_descriptor_set_update_layout(cmd->device, set, layout);
3300    if (result != VK_SUCCESS) {
3301       vk_command_buffer_set_error(&cmd->vk, result);
3302       return;
3303    }
3304 
3305    tu_update_descriptor_set_with_template(
3306       cmd->device, set,
3307       pPushDescriptorSetWithTemplateInfo->descriptorUpdateTemplate,
3308       pPushDescriptorSetWithTemplateInfo->pData);
3309 
3310    memcpy(set_mem.map, set->mapped_ptr, layout->size);
3311    set->va = set_mem.iova;
3312 
3313    const VkDescriptorSet desc_set[] = { tu_descriptor_set_to_handle(set) };
3314    vk_common_CmdBindDescriptorSets(
3315       tu_cmd_buffer_to_handle(cmd), templ->bind_point,
3316       pPushDescriptorSetWithTemplateInfo->layout,
3317       pPushDescriptorSetWithTemplateInfo->set, 1, desc_set, 0, NULL);
3318 }
3319 
3320 VKAPI_ATTR void VKAPI_CALL
3321 tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
3322                                       uint32_t firstBinding,
3323                                       uint32_t bindingCount,
3324                                       const VkBuffer *pBuffers,
3325                                       const VkDeviceSize *pOffsets,
3326                                       const VkDeviceSize *pSizes)
3327 {
3328    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3329    struct tu_cs *cs = &cmd->draw_cs;
3330 
3331    /* Using COND_REG_EXEC for xfb commands matches the blob behavior;
3332     * presumably there isn't any benefit to using a draw state when the
3333     * condition is (SYSMEM | BINNING).
3334     */
3335    tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
3336                           CP_COND_REG_EXEC_0_SYSMEM |
3337                           CP_COND_REG_EXEC_0_BINNING);
3338 
3339    for (uint32_t i = 0; i < bindingCount; i++) {
3340       VK_FROM_HANDLE(tu_buffer, buf, pBuffers[i]);
3341       uint64_t iova = buf->iova + pOffsets[i];
3342       uint32_t size = buf->bo->size - (iova - buf->bo->iova);
3343       uint32_t idx = i + firstBinding;
3344 
3345       if (pSizes && pSizes[i] != VK_WHOLE_SIZE)
3346          size = pSizes[i];
3347 
3348       /* BUFFER_BASE is 32-byte aligned, add remaining offset to BUFFER_OFFSET */
3349       uint32_t offset = iova & 0x1f;
3350       iova &= ~(uint64_t) 0x1f;
3351 
3352       tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_BASE(idx), 3);
3353       tu_cs_emit_qw(cs, iova);
3354       tu_cs_emit(cs, size + offset);
3355 
3356       cmd->state.streamout_offset[idx] = offset;
3357    }
3358 
3359    tu_cond_exec_end(cs);
3360 }
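
/* Worked example of the alignment split above: for buf->iova = 0x10000 and
 * pOffsets[i] = 0x24, iova becomes 0x10024, offset = 0x4, BUFFER_BASE is
 * programmed with the aligned 0x10020, and the size written is size + 0x4 so
 * the hardware still sees the full range. The leftover 0x4 is remembered in
 * streamout_offset[] and applied to VPC_SO_BUFFER_OFFSET in
 * tu_CmdBeginTransformFeedbackEXT().
 */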
3361 
3362 VKAPI_ATTR void VKAPI_CALL
3363 tu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
3364                                 uint32_t firstCounterBuffer,
3365                                 uint32_t counterBufferCount,
3366                                 const VkBuffer *pCounterBuffers,
3367                                 const VkDeviceSize *pCounterBufferOffsets)
3368 {
3369    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3370    struct tu_cs *cs = &cmd->draw_cs;
3371 
3372    tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
3373                           CP_COND_REG_EXEC_0_SYSMEM |
3374                           CP_COND_REG_EXEC_0_BINNING);
3375 
3376    tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));
3377 
3378    /* TODO: only update offset for active buffers */
3379    for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++)
3380       tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_OFFSET(i, cmd->state.streamout_offset[i]));
3381 
3382    for (uint32_t i = 0; i < (pCounterBuffers ? counterBufferCount : 0); i++) {
3383       uint32_t idx = firstCounterBuffer + i;
3384       uint32_t offset = cmd->state.streamout_offset[idx];
3385       uint64_t counter_buffer_offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0u;
3386 
3387       if (!pCounterBuffers[i])
3388          continue;
3389 
3390       VK_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);
3391 
3392       tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
3393       tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |
3394                      CP_MEM_TO_REG_0_UNK31 |
3395                      CP_MEM_TO_REG_0_CNT(1));
3396       tu_cs_emit_qw(cs, buf->iova + counter_buffer_offset);
3397 
3398       if (offset) {
3399          tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
3400          tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |
3401                         CP_REG_RMW_0_SRC1_ADD);
3402          tu_cs_emit(cs, 0xffffffff);
3403          tu_cs_emit(cs, offset);
3404       }
3405    }
3406 
3407    tu_cond_exec_end(cs);
3408 }
3409 
3410 template <chip CHIP>
3411 VKAPI_ATTR void VKAPI_CALL
3412 tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
3413                               uint32_t firstCounterBuffer,
3414                               uint32_t counterBufferCount,
3415                               const VkBuffer *pCounterBuffers,
3416                               const VkDeviceSize *pCounterBufferOffsets)
3417 {
3418    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3419    struct tu_cs *cs = &cmd->draw_cs;
3420 
3421    tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
3422                           CP_COND_REG_EXEC_0_SYSMEM |
3423                           CP_COND_REG_EXEC_0_BINNING);
3424 
3425    tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(true));
3426 
3427    /* TODO: only flush buffers that need to be flushed */
3428    for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
3429       /* note: FLUSH_BASE is always the same, so it could go in init_hw()? */
3430       tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2);
3431       tu_cs_emit_qw(cs, global_iova_arr(cmd, flush_base, i));
3432       tu_emit_event_write<CHIP>(cmd, cs, (enum fd_gpu_event) (FD_FLUSH_SO_0 + i));
3433    }
3434 
3435    for (uint32_t i = 0; i < (pCounterBuffers ? counterBufferCount : 0); i++) {
3436       uint32_t idx = firstCounterBuffer + i;
3437       uint32_t offset = cmd->state.streamout_offset[idx];
3438       uint64_t counter_buffer_offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0u;
3439 
3440       if (!pCounterBuffers[i])
3441          continue;
3442 
3443       VK_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);
3444 
3445       /* VPC_SO_FLUSH_BASE holds a dword count, but the counter buffer should be in bytes */
3446       tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
3447       tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
3448                      COND(CHIP == A6XX, CP_MEM_TO_REG_0_SHIFT_BY_2) |
3449                      0x40000 | /* ??? */
3450                      CP_MEM_TO_REG_0_UNK31 |
3451                      CP_MEM_TO_REG_0_CNT(1));
3452       tu_cs_emit_qw(cs, global_iova_arr(cmd, flush_base, idx));
3453 
3454       if (offset) {
3455          tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
3456          tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
3457                         CP_REG_RMW_0_SRC1_ADD);
3458          tu_cs_emit(cs, 0xffffffff);
3459          tu_cs_emit(cs, -offset);
3460       }
3461 
3462       tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
3463       tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
3464                      CP_REG_TO_MEM_0_CNT(1));
3465       tu_cs_emit_qw(cs, buf->iova + counter_buffer_offset);
3466    }
3467 
3468    tu_cond_exec_end(cs);
3469 
3470    cmd->state.rp.xfb_used = true;
3471 }
3472 TU_GENX(tu_CmdEndTransformFeedbackEXT);
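
/* Worked example of the counter-buffer math above (A6XX): if the flushed
 * value for a buffer is 100 dwords and its streamout_offset is 16 bytes, the
 * CP_MEM_TO_REG with SHIFT_BY_2 scales 100 dwords to 400 bytes, the
 * CP_REG_RMW adds -16 to undo the offset folded in at begin time, and
 * CP_REG_TO_MEM stores 384 into the application's counter buffer.
 */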
3473 
3474 VKAPI_ATTR void VKAPI_CALL
3475 tu_CmdPushConstants2KHR(VkCommandBuffer commandBuffer,
3476                         const VkPushConstantsInfoKHR *pPushConstantsInfo)
3477 {
3478    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3479    memcpy((char *) cmd->push_constants + pPushConstantsInfo->offset,
3480           pPushConstantsInfo->pValues, pPushConstantsInfo->size);
3481    cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS;
3482 }
3483 
3484 /* Clean everything that has been made available but that we haven't actually
3485  * cleaned yet.
3486  */
3487 static void
3488 tu_clean_all_pending(struct tu_cache_state *cache)
3489 {
3490    cache->flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_CLEAN;
3491    cache->pending_flush_bits &= ~TU_CMD_FLAG_ALL_CLEAN;
3492 }
3493 
3494 template <chip CHIP>
3495 VKAPI_ATTR VkResult VKAPI_CALL
3496 tu_EndCommandBuffer(VkCommandBuffer commandBuffer)
3497 {
3498    VK_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3499 
3500    /* We currently flush CCU at the end of the command buffer, like
3501     * what the blob does. There's implicit synchronization around every
3502     * vkQueueSubmit, but the kernel only flushes the UCHE, and we don't
3503     * know yet if this command buffer will be the last in the submit so we
3504     * have to defensively flush everything else.
3505     *
3506     * TODO: We could definitely do better than this, since these flushes
3507     * aren't required by Vulkan, but we'd need kernel support to do that.
3508     * Ideally, we'd like the kernel to flush everything afterwards, so that we
3509     * wouldn't have to do any flushes here, and when submitting multiple
3510     * command buffers there wouldn't be any unnecessary flushes in between.
3511     */
3512    if (cmd_buffer->state.pass) {
3513       tu_clean_all_pending(&cmd_buffer->state.renderpass_cache);
3514       tu_emit_cache_flush_renderpass<CHIP>(cmd_buffer);
3515 
3516       trace_end_cmd_buffer(&cmd_buffer->trace, &cmd_buffer->draw_cs);
3517    } else {
3518       tu_clean_all_pending(&cmd_buffer->state.cache);
3519       cmd_buffer->state.cache.flush_bits |=
3520          TU_CMD_FLAG_CCU_CLEAN_COLOR |
3521          TU_CMD_FLAG_CCU_CLEAN_DEPTH;
3522       tu_emit_cache_flush<CHIP>(cmd_buffer);
3523 
3524       trace_end_cmd_buffer(&cmd_buffer->trace, &cmd_buffer->cs);
3525    }
3526 
3527    tu_cs_end(&cmd_buffer->cs);
3528    tu_cs_end(&cmd_buffer->draw_cs);
3529    tu_cs_end(&cmd_buffer->draw_epilogue_cs);
3530 
3531    return vk_command_buffer_end(&cmd_buffer->vk);
3532 }
3533 TU_GENX(tu_EndCommandBuffer);
3534 
3535 static void
3536 tu_bind_vs(struct tu_cmd_buffer *cmd, struct tu_shader *vs)
3537 {
3538    cmd->state.shaders[MESA_SHADER_VERTEX] = vs;
3539 }
3540 
3541 static void
3542 tu_bind_tcs(struct tu_cmd_buffer *cmd, struct tu_shader *tcs)
3543 {
3544    cmd->state.shaders[MESA_SHADER_TESS_CTRL] = tcs;
3545 }
3546 
3547 static void
3548 tu_bind_tes(struct tu_cmd_buffer *cmd, struct tu_shader *tes)
3549 {
3550    if (cmd->state.shaders[MESA_SHADER_TESS_EVAL] != tes) {
3551       cmd->state.shaders[MESA_SHADER_TESS_EVAL] = tes;
3552       cmd->state.dirty |= TU_CMD_DIRTY_TES;
3553 
3554       if (!cmd->state.tess_params.valid ||
3555           cmd->state.tess_params.output_upper_left !=
3556           tes->tes.tess_output_upper_left ||
3557           cmd->state.tess_params.output_lower_left !=
3558           tes->tes.tess_output_lower_left ||
3559           cmd->state.tess_params.spacing != tes->tes.tess_spacing) {
3560          cmd->state.tess_params.output_upper_left =
3561             tes->tes.tess_output_upper_left;
3562          cmd->state.tess_params.output_lower_left =
3563             tes->tes.tess_output_lower_left;
3564          cmd->state.tess_params.spacing = tes->tes.tess_spacing;
3565          cmd->state.tess_params.valid = true;
3566          cmd->state.dirty |= TU_CMD_DIRTY_TESS_PARAMS;
3567       }
3568    }
3569 }
3570 
3571 static void
3572 tu_bind_gs(struct tu_cmd_buffer *cmd, struct tu_shader *gs)
3573 {
3574    cmd->state.shaders[MESA_SHADER_GEOMETRY] = gs;
3575 }
3576 
3577 static void
3578 tu_bind_fs(struct tu_cmd_buffer *cmd, struct tu_shader *fs)
3579 {
3580    if (cmd->state.shaders[MESA_SHADER_FRAGMENT] != fs) {
3581       cmd->state.shaders[MESA_SHADER_FRAGMENT] = fs;
3582       cmd->state.dirty |= TU_CMD_DIRTY_LRZ | TU_CMD_DIRTY_FS;
3583    }
3584 }
3585 
3586 /* We cannot do this only at pipeline bind time since the pipeline
3587  * could have been bound at any time before the current renderpass,
3588  * e.g. in the previous renderpass.
3589  */
3590 static void
3591 tu_pipeline_update_rp_state(struct tu_cmd_state *cmd_state)
3592 {
3593    if (cmd_state->pipeline_disable_gmem &&
3594        !cmd_state->rp.disable_gmem) {
3595       /* VK_EXT_attachment_feedback_loop_layout allows a feedback loop to
3596        * involve not only input attachments but also sampled images or image
3597        * resources. But we cannot just patch GMEM for images in the descriptors.
3598        *
3599        * At the moment, in the context of DXVK, it is expected that only a few
3600        * drawcalls in a frame use a feedback loop and that they are wrapped in
3601        * their own renderpasses, so it should be ok to force sysmem.
3602        *
3603        * However, there are two further possible optimizations if the need
3604        * arises for another translation layer:
3605        * - Tiling could be enabled if we ensure that there is no barrier in
3606        *   the renderpass;
3607        * - Check that both pipeline and attachments agree that a feedback loop
3608        *   is needed.
3609        */
3610       perf_debug(
3611          cmd->device,
3612          "Disabling gmem due to VK_EXT_attachment_feedback_loop_layout");
3613       cmd_state->rp.disable_gmem = true;
3614    }
3615 
3616    if (cmd_state->pipeline_sysmem_single_prim_mode &&
3617        !cmd_state->rp.sysmem_single_prim_mode) {
3618       perf_debug(cmd->device, "single_prim_mode due to pipeline settings");
3619       cmd_state->rp.sysmem_single_prim_mode = true;
3620    }
3621 
3622    if (cmd_state->pipeline_has_tess) {
3623       cmd_state->rp.has_tess = true;
3624    }
3625 }
3626 
3627 VKAPI_ATTR void VKAPI_CALL
3628 tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
3629                    VkPipelineBindPoint pipelineBindPoint,
3630                    VkPipeline _pipeline)
3631 {
3632    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3633    VK_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
3634 
3635    if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) {
3636       cmd->state.shaders[MESA_SHADER_COMPUTE] =
3637          pipeline->shaders[MESA_SHADER_COMPUTE];
3638       tu_cs_emit_state_ib(&cmd->cs,
3639                           pipeline->shaders[MESA_SHADER_COMPUTE]->state);
3640       cmd->state.compute_load_state = pipeline->load_state;
3641       return;
3642    }
3643 
3644    assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS);
3645 
3646    struct tu_graphics_pipeline *gfx_pipeline = tu_pipeline_to_graphics(pipeline);
3647    cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS | TU_CMD_DIRTY_SHADER_CONSTS |
3648                        TU_CMD_DIRTY_VS_PARAMS | TU_CMD_DIRTY_PROGRAM;
3649 
3650    tu_bind_vs(cmd, pipeline->shaders[MESA_SHADER_VERTEX]);
3651    tu_bind_tcs(cmd, pipeline->shaders[MESA_SHADER_TESS_CTRL]);
3652    tu_bind_tes(cmd, pipeline->shaders[MESA_SHADER_TESS_EVAL]);
3653    tu_bind_gs(cmd, pipeline->shaders[MESA_SHADER_GEOMETRY]);
3654    tu_bind_fs(cmd, pipeline->shaders[MESA_SHADER_FRAGMENT]);
3655 
3656    /* We precompile static state and count it as dynamic, so we have to
3657     * manually clear the bitset that tells which dynamic state is set, in
3658     * order to make sure that future dynamic state will be emitted. The issue
3659     * is that the framework remembers only the last REAL dynamic state and
3660     * compares new dynamic state against it, not against our static state
3661     * masquerading as dynamic.
3662     */
3663    BITSET_ANDNOT(cmd->vk.dynamic_graphics_state.set,
3664                  cmd->vk.dynamic_graphics_state.set,
3665                  pipeline->static_state_mask);
3666 
3667    vk_cmd_set_dynamic_graphics_state(&cmd->vk,
3668                                      &gfx_pipeline->dynamic_state);
3669    cmd->state.program = pipeline->program;
3670 
3671    cmd->state.load_state = pipeline->load_state;
3672    cmd->state.prim_order_gmem = pipeline->prim_order.state_gmem;
3673    cmd->state.pipeline_sysmem_single_prim_mode = pipeline->prim_order.sysmem_single_prim_mode;
3674    cmd->state.pipeline_has_tess = pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
3675    cmd->state.pipeline_disable_gmem = gfx_pipeline->feedback_loop_may_involve_textures;
3676 
3677    tu_pipeline_update_rp_state(&cmd->state);
3678 
3679    if (pipeline->lrz_blend.valid) {
3680       if (cmd->state.blend_reads_dest != pipeline->lrz_blend.reads_dest) {
3681          cmd->state.blend_reads_dest = pipeline->lrz_blend.reads_dest;
3682          cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
3683       }
3684    }
3685    cmd->state.pipeline_blend_lrz = pipeline->lrz_blend.valid;
3686 
3687    if (pipeline->bandwidth.valid)
3688       cmd->state.bandwidth = pipeline->bandwidth;
3689    cmd->state.pipeline_bandwidth = pipeline->bandwidth.valid;
3690 
3691    struct tu_cs *cs = &cmd->draw_cs;
3692 
3693    /* note: this also avoids emitting draw states before renderpass clears,
3694     * which may use the 3D clear path (for MSAA cases)
3695     */
3696    if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
3697       uint32_t mask = pipeline->set_state_mask;
3698 
3699       tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (10 + util_bitcount(mask)));
3700       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
3701       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS, pipeline->program.vs_state);
3702       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_BINNING, pipeline->program.vs_binning_state);
3703       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_HS, pipeline->program.hs_state);
3704       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DS, pipeline->program.ds_state);
3705       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_GS, pipeline->program.gs_state);
3706       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_GS_BINNING, pipeline->program.gs_binning_state);
3707       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS, pipeline->program.fs_state);
3708       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VPC, pipeline->program.vpc_state);
3709       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, pipeline->prim_order.state_gmem);
3710 
3711       u_foreach_bit(i, mask)
3712          tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i, pipeline->dynamic_state[i]);
3713    }
3714 
3715    cmd->state.pipeline_draw_states = pipeline->set_state_mask;
3716    u_foreach_bit(i, pipeline->set_state_mask)
3717       cmd->state.dynamic_state[i] = pipeline->dynamic_state[i];
3718 
3719    if (pipeline->program.per_view_viewport != cmd->state.per_view_viewport) {
3720       cmd->state.per_view_viewport = pipeline->program.per_view_viewport;
3721       cmd->state.dirty |= TU_CMD_DIRTY_PER_VIEW_VIEWPORT;
3722    }
3723 
3724    if (gfx_pipeline->feedback_loops != cmd->state.pipeline_feedback_loops) {
3725       cmd->state.pipeline_feedback_loops = gfx_pipeline->feedback_loops;
3726       cmd->state.dirty |= TU_CMD_DIRTY_FEEDBACK_LOOPS | TU_CMD_DIRTY_LRZ;
3727    }
3728 
3729    if (pipeline->program.writes_shading_rate !=
3730           cmd->state.pipeline_writes_shading_rate ||
3731        pipeline->program.reads_shading_rate !=
3732           cmd->state.pipeline_reads_shading_rate ||
3733        pipeline->program.accesses_smask !=
3734           cmd->state.pipeline_accesses_smask) {
3735       cmd->state.pipeline_writes_shading_rate =
3736          pipeline->program.writes_shading_rate;
3737       cmd->state.pipeline_reads_shading_rate =
3738          pipeline->program.reads_shading_rate;
3739       cmd->state.pipeline_accesses_smask =
3740          pipeline->program.accesses_smask;
3741       cmd->state.dirty |= TU_CMD_DIRTY_SHADING_RATE;
3742    }
3743 
3744    bool raster_order_attachment_access =
3745       pipeline->output.raster_order_attachment_access ||
3746       pipeline->ds.raster_order_attachment_access;
3747    if (!cmd->state.raster_order_attachment_access_valid ||
3748        raster_order_attachment_access !=
3749        cmd->state.raster_order_attachment_access) {
3750       cmd->state.raster_order_attachment_access =
3751          raster_order_attachment_access;
3752       cmd->state.dirty |= TU_CMD_DIRTY_RAST_ORDER;
3753       cmd->state.raster_order_attachment_access_valid = true;
3754    }
3755 }
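
/* A small illustration of the static-state masking above, using made-up bit
 * names. If dynamic_graphics_state.set currently contains { VIEWPORT,
 * SCISSOR } and the new pipeline's static_state_mask contains { VIEWPORT }
 * (the viewport is baked into a precompiled draw state), BITSET_ANDNOT
 * leaves { SCISSOR }, so a later vkCmdSetViewport() is not filtered out as
 * "unchanged" by the common dynamic-state tracking.
 */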
3756 
3757 void
3758 tu_flush_for_access(struct tu_cache_state *cache,
3759                     enum tu_cmd_access_mask src_mask,
3760                     enum tu_cmd_access_mask dst_mask)
3761 {
3762    BITMASK_ENUM(tu_cmd_flush_bits) flush_bits = 0;
3763 
3764    if (src_mask & TU_ACCESS_SYSMEM_WRITE) {
3765       cache->pending_flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE;
3766    }
3767 
3768    if (src_mask & TU_ACCESS_CP_WRITE) {
3769       /* Flush the CP write queue.
3770        */
3771       cache->pending_flush_bits |=
3772          TU_CMD_FLAG_WAIT_MEM_WRITES |
3773          TU_CMD_FLAG_ALL_INVALIDATE;
3774    }
3775 
3776 #define SRC_FLUSH(domain, clean, invalidate) \
3777    if (src_mask & TU_ACCESS_##domain##_WRITE) {                      \
3778       cache->pending_flush_bits |= TU_CMD_FLAG_##clean |             \
3779          (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate);   \
3780    }
3781 
3782    SRC_FLUSH(UCHE, CACHE_CLEAN, CACHE_INVALIDATE)
3783    SRC_FLUSH(CCU_COLOR, CCU_CLEAN_COLOR, CCU_INVALIDATE_COLOR)
3784    SRC_FLUSH(CCU_DEPTH, CCU_CLEAN_DEPTH, CCU_INVALIDATE_DEPTH)
3785 
3786 #undef SRC_FLUSH
3787 
3788 #define SRC_INCOHERENT_FLUSH(domain, clean, invalidate)              \
3789    if (src_mask & TU_ACCESS_##domain##_INCOHERENT_WRITE) {           \
3790       flush_bits |= TU_CMD_FLAG_##clean;                             \
3791       cache->pending_flush_bits |=                                   \
3792          (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate);   \
3793    }
3794 
3795    SRC_INCOHERENT_FLUSH(CCU_COLOR, CCU_CLEAN_COLOR, CCU_INVALIDATE_COLOR)
3796    SRC_INCOHERENT_FLUSH(CCU_DEPTH, CCU_CLEAN_DEPTH, CCU_INVALIDATE_DEPTH)
3797 
3798 #undef SRC_INCOHERENT_FLUSH
3799 
3800    /* Treat host & sysmem write accesses the same, since the kernel implicitly
3801     * drains the queue before signalling completion to the host.
3802     */
3803    if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE)) {
3804       flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_CLEAN;
3805    }
3806 
3807 #define DST_FLUSH(domain, clean, invalidate) \
3808    if (dst_mask & (TU_ACCESS_##domain##_READ |                 \
3809                    TU_ACCESS_##domain##_WRITE)) {              \
3810       flush_bits |= cache->pending_flush_bits &                \
3811          (TU_CMD_FLAG_##invalidate |                           \
3812           (TU_CMD_FLAG_ALL_CLEAN & ~TU_CMD_FLAG_##clean));     \
3813    }
3814 
3815    DST_FLUSH(UCHE, CACHE_CLEAN, CACHE_INVALIDATE)
3816    DST_FLUSH(CCU_COLOR, CCU_CLEAN_COLOR, CCU_INVALIDATE_COLOR)
3817    DST_FLUSH(CCU_DEPTH, CCU_CLEAN_DEPTH, CCU_INVALIDATE_DEPTH)
3818 
3819 #undef DST_FLUSH
3820 
3821 #define DST_INCOHERENT_FLUSH(domain, flush, invalidate) \
3822    if (dst_mask & (TU_ACCESS_##domain##_INCOHERENT_READ |      \
3823                    TU_ACCESS_##domain##_INCOHERENT_WRITE)) {   \
3824       flush_bits |= TU_CMD_FLAG_##invalidate |                 \
3825           (cache->pending_flush_bits &                         \
3826            (TU_CMD_FLAG_ALL_CLEAN & ~TU_CMD_FLAG_##flush));    \
3827    }
3828 
3829    DST_INCOHERENT_FLUSH(CCU_COLOR, CCU_CLEAN_COLOR, CCU_INVALIDATE_COLOR)
3830    DST_INCOHERENT_FLUSH(CCU_DEPTH, CCU_CLEAN_DEPTH, CCU_INVALIDATE_DEPTH)
3831 
3832    if (dst_mask & TU_ACCESS_BINDLESS_DESCRIPTOR_READ) {
3833       flush_bits |= TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE;
3834    }
3835 
3836    /* There are multiple incoherent copies of CCHE, so any read through it may
3837     * require invalidating it and we cannot optimize away invalidates.
3838     */
3839    if (dst_mask & TU_ACCESS_CCHE_READ) {
3840       flush_bits |= TU_CMD_FLAG_CCHE_INVALIDATE;
3841    }
3842 
3843    /* The blit cache is a special-case dependency: CP_EVENT_WRITE::BLIT writes
3844     * (from GMEM loads/clears) need an explicit BLIT_CACHE_CLEAN before any GMEM
3845     * attachment reads done via the UCHE (e.g. input attachments/CP_BLIT) can
3846     * observe them. It has the following properties:
3847     * - Set on reads rather than on writes, like flushes.
3848     * - Not executed automatically if pending, like invalidates.
3849     * - Pending bits passed through to secondary command buffers, if they're
3850     *   continuing the render pass.
3851     */
3852    if (src_mask & TU_ACCESS_BLIT_WRITE_GMEM) {
3853       cache->pending_flush_bits |= TU_CMD_FLAG_BLIT_CACHE_CLEAN;
3854    }
3855 
3856    if ((dst_mask & TU_ACCESS_UCHE_READ_GMEM) &&
3857        (cache->pending_flush_bits & TU_CMD_FLAG_BLIT_CACHE_CLEAN)) {
3858       flush_bits |= TU_CMD_FLAG_BLIT_CACHE_CLEAN;
3859    }
3860 
3861 #undef DST_INCOHERENT_FLUSH
3862 
3863    cache->flush_bits |= flush_bits;
3864    cache->pending_flush_bits &= ~flush_bits;
3865 }
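/* Worked example (editor's sketch, not driver code): a barrier from a color
 * attachment write in sysmem mode to a texture fetch would be expressed
 * roughly as
 *
 *    tu_flush_for_access(cache,
 *                        TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE,
 *                        TU_ACCESS_UCHE_READ);
 *
 * The SRC_INCOHERENT_FLUSH(CCU_COLOR, ...) arm queues the color CCU clean
 * immediately and leaves the remaining invalidates pending; the
 * DST_FLUSH(UCHE, ...) arm then pulls the pending cache invalidate into
 * flush_bits, so the caller ends up emitting a color clean followed by a
 * UCHE invalidate before the read.
 */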
3866 
3867 /* When translating Vulkan access flags to which cache is accessed
3868  * (CCU/UCHE/sysmem), we should take into account both the access flags and
3869  * the stage so that accesses with MEMORY_READ_BIT/MEMORY_WRITE_BIT + a
3870  * specific stage return something sensible. The specification for
3871  * VK_KHR_synchronization2 says that we should do this:
3872  *
3873  *    Additionally, scoping the pipeline stages into the barrier structs
3874  *    allows the use of the MEMORY_READ and MEMORY_WRITE flags without
3875  *    sacrificing precision. The per-stage access flags should be used to
3876  *    disambiguate specific accesses in a given stage or set of stages - for
3877  *    instance, between uniform reads and sampling operations.
3878  *
3879  * Note that while in all known cases the stage is actually enough, we should
3880  * still narrow things down based on the access flags to handle "old-style"
3881  * barriers that may specify a wider range of stages but more precise access
3882  * flags. These helpers allow us to do both.
3883  */
3884 
3885 static bool
3886 filter_read_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages,
3887                    VkAccessFlags2 tu_flags, VkPipelineStageFlags2 tu_stages)
3888 {
3889    return (flags & (tu_flags | VK_ACCESS_2_MEMORY_READ_BIT)) &&
3890       (stages & (tu_stages | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT));
3891 }
3892 
3893 static bool
3894 filter_write_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages,
3895                     VkAccessFlags2 tu_flags, VkPipelineStageFlags2 tu_stages)
3896 {
3897    return (flags & (tu_flags | VK_ACCESS_2_MEMORY_WRITE_BIT)) &&
3898       (stages & (tu_stages | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT));
3899 }
3900 
3901 static bool
3902 gfx_read_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages,
3903                 VkAccessFlags2 tu_flags, VkPipelineStageFlags2 tu_stages)
3904 {
3905    return filter_read_access(flags, stages, tu_flags,
3906                              tu_stages | VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT);
3907 }
3908 
3909 static bool
3910 gfx_write_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages,
3911                  VkAccessFlags2 tu_flags, VkPipelineStageFlags2 tu_stages)
3912 {
3913    return filter_write_access(flags, stages, tu_flags,
3914                               tu_stages | VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT);
3915 }
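/* Example of how these filters behave (editor's sketch): an "old-style"
 * barrier using VK_ACCESS_2_MEMORY_READ_BIT but a precise stage still only
 * matches the accesses performed by that stage:
 *
 *    filter_read_access(VK_ACCESS_2_MEMORY_READ_BIT,
 *                       VK_PIPELINE_STAGE_2_COPY_BIT,
 *                       VK_ACCESS_2_TRANSFER_READ_BIT,
 *                       VK_PIPELINE_STAGE_2_COPY_BIT)   -> true
 *
 *    filter_read_access(VK_ACCESS_2_MEMORY_READ_BIT,
 *                       VK_PIPELINE_STAGE_2_HOST_BIT,
 *                       VK_ACCESS_2_TRANSFER_READ_BIT,
 *                       VK_PIPELINE_STAGE_2_COPY_BIT)   -> false
 *
 * i.e. the access flags are widened by MEMORY_READ/MEMORY_WRITE and the
 * stages by ALL_COMMANDS, but both the flag test and the stage test still
 * have to pass.
 */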
3916 
3917 static enum tu_cmd_access_mask
3918 vk2tu_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, bool image_only, bool gmem)
3919 {
3920    BITMASK_ENUM(tu_cmd_access_mask) mask = 0;
3921 
3922    if (gfx_read_access(flags, stages,
3923                        VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT |
3924                        VK_ACCESS_2_CONDITIONAL_RENDERING_READ_BIT_EXT |
3925                        VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT |
3926                        VK_ACCESS_2_HOST_READ_BIT,
3927                        VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
3928                        VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT |
3929                        VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
3930                        VK_PIPELINE_STAGE_2_HOST_BIT))
3931       mask |= TU_ACCESS_SYSMEM_READ;
3932 
3933    if (gfx_write_access(flags, stages,
3934                         VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT,
3935                         VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT))
3936       mask |= TU_ACCESS_CP_WRITE;
3937 
3938    if (gfx_write_access(flags, stages,
3939                         VK_ACCESS_2_HOST_WRITE_BIT,
3940                         VK_PIPELINE_STAGE_2_HOST_BIT))
3941       mask |= TU_ACCESS_SYSMEM_WRITE;
3942 
3943 #define SHADER_STAGES \
3944    (VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT | \
3945     VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT | \
3946     VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT | \
3947     VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT | \
3948     VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT | \
3949     VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT | \
3950     VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)
3951 
3952 
3953    if (gfx_read_access(flags, stages,
3954                        VK_ACCESS_2_INDEX_READ_BIT |
3955                        VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT |
3956                        VK_ACCESS_2_UNIFORM_READ_BIT |
3957                        VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT |
3958                        VK_ACCESS_2_SHADER_READ_BIT |
3959                        VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
3960                        VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
3961                        VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR,
3962                        VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
3963                        VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
3964                        VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
3965                        SHADER_STAGES))
3966        mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_CCHE_READ;
3967 
3968    if (gfx_read_access(flags, stages,
3969                        VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT,
3970                        SHADER_STAGES))
3971        mask |= TU_ACCESS_UCHE_READ_GMEM;
3972 
3973    if (gfx_read_access(flags, stages,
3974                        VK_ACCESS_2_DESCRIPTOR_BUFFER_READ_BIT_EXT,
3975                        SHADER_STAGES)) {
3976       mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_BINDLESS_DESCRIPTOR_READ |
3977               TU_ACCESS_CCHE_READ;
3978    }
3979 
3980    if (gfx_write_access(flags, stages,
3981                         VK_ACCESS_2_SHADER_WRITE_BIT |
3982                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
3983                         VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT,
3984                         VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
3985                         SHADER_STAGES))
3986        mask |= TU_ACCESS_UCHE_WRITE;
3987 
3988    /* When using GMEM, the CCU is always flushed automatically to GMEM, and
3989     * then GMEM is flushed to sysmem. Furthermore, we already had to flush any
3990     * previous writes in sysmem mode when transitioning to GMEM. Therefore we
3991     * can ignore CCU and pretend that color attachments and transfers use
3992     * sysmem directly.
3993     */
3994 
3995    if (gfx_read_access(flags, stages,
3996                        VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT |
3997                        VK_ACCESS_2_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT,
3998                        VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT)) {
3999       if (gmem)
4000          mask |= TU_ACCESS_SYSMEM_READ;
4001       else
4002          mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_READ;
4003    }
4004 
4005    if (gfx_read_access(flags, stages,
4006                        VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT,
4007                        VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
4008                        VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT)) {
4009       if (gmem)
4010          mask |= TU_ACCESS_SYSMEM_READ;
4011       else
4012          mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_READ;
4013    }
4014 
4015    if (gfx_write_access(flags, stages,
4016                         VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT,
4017                         VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT)) {
4018       if (gmem) {
4019          mask |= TU_ACCESS_SYSMEM_WRITE;
4020       } else {
4021          mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
4022       }
4023    }
4024 
4025    if (gfx_write_access(flags, stages,
4026                         VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
4027                         VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
4028                         VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT)) {
4029       if (gmem) {
4030          mask |= TU_ACCESS_SYSMEM_WRITE;
4031       } else {
4032          mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
4033       }
4034    }
4035 
4036    if (filter_write_access(flags, stages,
4037                            VK_ACCESS_2_TRANSFER_WRITE_BIT,
4038                            VK_PIPELINE_STAGE_2_COPY_BIT |
4039                            VK_PIPELINE_STAGE_2_BLIT_BIT |
4040                            VK_PIPELINE_STAGE_2_CLEAR_BIT |
4041                            VK_PIPELINE_STAGE_2_RESOLVE_BIT |
4042                            VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT)) {
4043       if (gmem) {
4044          mask |= TU_ACCESS_SYSMEM_WRITE;
4045       } else if (image_only) {
4046          /* Because we always split up blits/copies of images involving
4047           * multiple layers, we always access each layer in the same way, with
4048           * the same base address, same format, etc. This means we can avoid
4049           * flushing between multiple writes to the same image. This elides
4050           * flushes between e.g. multiple blits to the same image.
4051           */
4052          mask |= TU_ACCESS_CCU_COLOR_WRITE;
4053       } else {
4054          mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
4055       }
4056    }
4057 
4058    if (filter_read_access(flags, stages,
4059                           VK_ACCESS_2_TRANSFER_READ_BIT,
4060                           VK_PIPELINE_STAGE_2_COPY_BIT |
4061                           VK_PIPELINE_STAGE_2_BLIT_BIT |
4062                           VK_PIPELINE_STAGE_2_RESOLVE_BIT |
4063                           VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT)) {
4064       mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_CCHE_READ;
4065    }
4066 
4067    return mask;
4068 }
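/* Illustrative mapping (editor's sketch): with gmem = false, a
 * VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT access at
 * VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT yields
 * TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE, while the same access with
 * gmem = true yields TU_ACCESS_SYSMEM_WRITE, matching the comment above
 * about GMEM already being flushed to sysmem automatically.
 */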
4069 
4070 /* These helpers deal with legacy BOTTOM_OF_PIPE/TOP_OF_PIPE stages.
4071  */
4072 
4073 static VkPipelineStageFlags2
4074 sanitize_src_stage(VkPipelineStageFlags2 stage_mask)
4075 {
4076    /* From the Vulkan spec:
4077     *
4078     *    VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT is ...  equivalent to
4079     *    VK_PIPELINE_STAGE_2_NONE in the first scope.
4080     *
4081     *    VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT is equivalent to
4082     *    VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT with VkAccessFlags2 set to 0
4083     *    when specified in the first synchronization scope, ...
4084     */
4085    if (stage_mask & VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT)
4086       return VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT;
4087 
4088    return stage_mask & ~VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT;
4089 }
4090 
4091 static VkPipelineStageFlags2
4092 sanitize_dst_stage(VkPipelineStageFlags2 stage_mask)
4093 {
4094    /* From the Vulkan spec:
4095     *
4096     *    VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT is equivalent to
4097     *    VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT with VkAccessFlags2 set to 0
4098     *    when specified in the second synchronization scope, ...
4099     *
4100     *    VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT is ... equivalent to
4101     *    VK_PIPELINE_STAGE_2_NONE in the second scope.
4102     *
4103     */
4104    if (stage_mask & VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT)
4105       return VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT;
4106 
4107    return stage_mask & ~VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT;
4108 }
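/* Example (editor's sketch): a legacy srcStageMask of BOTTOM_OF_PIPE expands
 * to ALL_COMMANDS via sanitize_src_stage(), while the same bit used as a
 * dstStageMask is simply dropped by sanitize_dst_stage() since it is
 * equivalent to NONE in the second scope; TOP_OF_PIPE behaves the mirrored
 * way around.
 */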
4109 
4110 static enum tu_stage
4111 vk2tu_single_stage(VkPipelineStageFlags2 vk_stage, bool dst)
4112 {
4113    /* If the destination stage is executed on the CP, then the CP also has to
4114     * wait for any WFI's to finish. This is already done for draw calls,
4115     * including before indirect param reads, for the most part, so we just
4116     * need to WFI and can use TU_STAGE_GPU.
4117     *
4118     * However, some indirect draw opcodes, depending on firmware, don't have
4119     * implicit CP_WAIT_FOR_ME so we have to handle it manually.
4120     *
4121     * Transform feedback counters are read via CP_MEM_TO_REG, which implicitly
4122     * does CP_WAIT_FOR_ME, so we don't include them here.
4123     *
4124     * Currently we read the draw predicate using CP_MEM_TO_MEM, which
4125     * also implicitly does CP_WAIT_FOR_ME. However CP_DRAW_PRED_SET does *not*
4126     * implicitly do CP_WAIT_FOR_ME, it seems to only wait for counters to
4127     * complete since it's written for DX11 where you can only predicate on the
4128     * result of a query object. So if we implement 64-bit comparisons in the
4129     * future, or if CP_DRAW_PRED_SET grows the capability to do 32-bit
4130     * comparisons, then this will have to be dealt with.
4131     */
4132    if (vk_stage == VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT ||
4133        vk_stage == VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT ||
4134        vk_stage == VK_PIPELINE_STAGE_2_FRAGMENT_DENSITY_PROCESS_BIT_EXT)
4135       return TU_STAGE_CP;
4136 
4137    if (vk_stage == VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT ||
4138        vk_stage == VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)
4139       return dst ? TU_STAGE_CP : TU_STAGE_GPU;
4140 
4141    if (vk_stage == VK_PIPELINE_STAGE_2_HOST_BIT)
4142       return dst ? TU_STAGE_BOTTOM : TU_STAGE_CP;
4143 
4144    return TU_STAGE_GPU;
4145 }
4146 
4147 static enum tu_stage
4148 vk2tu_src_stage(VkPipelineStageFlags2 vk_stages)
4149 {
4150    enum tu_stage stage = TU_STAGE_CP;
4151    u_foreach_bit64 (bit, vk_stages) {
4152       enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, false);
4153       stage = MAX2(stage, new_stage);
4154    }
4155 
4156    return stage;
4157 }
4158 
4159 static enum tu_stage
4160 vk2tu_dst_stage(VkPipelineStageFlags2 vk_stages)
4161 {
4162    enum tu_stage stage = TU_STAGE_BOTTOM;
4163    u_foreach_bit64 (bit, vk_stages) {
4164       enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, true);
4165       stage = MIN2(stage, new_stage);
4166    }
4167 
4168    return stage;
4169 }
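/* Example (editor's sketch): folding a multi-bit stage mask picks the latest
 * producer for the source and the earliest consumer for the destination,
 * assuming the tu_stage enum is ordered CP < GPU < BOTTOM:
 *
 *    vk2tu_src_stage(FRAGMENT_SHADER | HOST)          -> TU_STAGE_GPU
 *    vk2tu_dst_stage(DRAW_INDIRECT | FRAGMENT_SHADER) -> TU_STAGE_CP
 */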
4170 
4171 static void
4172 tu_flush_for_stage(struct tu_cache_state *cache,
4173                    enum tu_stage src_stage, enum tu_stage dst_stage)
4174 {
4175    /* Even if the source is the host or CP, the destination access could
4176     * generate invalidates that we have to wait to complete.
4177     */
4178    if (src_stage == TU_STAGE_CP &&
4179        (cache->flush_bits & TU_CMD_FLAG_ALL_INVALIDATE))
4180       src_stage = TU_STAGE_GPU;
4181 
4182    if (src_stage >= dst_stage) {
4183       cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
4184       if (dst_stage == TU_STAGE_CP)
4185          cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME;
4186    }
4187 }
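/* Example (editor's sketch): with src_stage = TU_STAGE_GPU and
 * dst_stage = TU_STAGE_CP (e.g. a draw producing data consumed by an
 * indirect-draw parameter read), src_stage >= dst_stage, so a WAIT_FOR_IDLE
 * is queued plus a pending WAIT_FOR_ME so the CP cannot race ahead of the
 * invalidates.
 */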
4188 
4189 void
4190 tu_render_pass_state_merge(struct tu_render_pass_state *dst,
4191                            const struct tu_render_pass_state *src)
4192 {
4193    dst->xfb_used |= src->xfb_used;
4194    dst->has_tess |= src->has_tess;
4195    dst->has_prim_generated_query_in_rp |= src->has_prim_generated_query_in_rp;
4196    dst->has_zpass_done_sample_count_write_in_rp |= src->has_zpass_done_sample_count_write_in_rp;
4197    dst->disable_gmem |= src->disable_gmem;
4198    dst->sysmem_single_prim_mode |= src->sysmem_single_prim_mode;
4199    dst->draw_cs_writes_to_cond_pred |= src->draw_cs_writes_to_cond_pred;
4200    dst->shared_viewport |= src->shared_viewport;
4201 
4202    dst->drawcall_count += src->drawcall_count;
4203    dst->drawcall_bandwidth_per_sample_sum +=
4204       src->drawcall_bandwidth_per_sample_sum;
4205    if (!dst->lrz_disable_reason)
4206       dst->lrz_disable_reason = src->lrz_disable_reason;
4207 }
4208 
4209 void
4210 tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
4211                           struct tu_cmd_buffer *suspended)
4212 {
4213    cmd->state.pass = suspended->state.suspended_pass.pass;
4214    cmd->state.subpass = suspended->state.suspended_pass.subpass;
4215    cmd->state.framebuffer = suspended->state.suspended_pass.framebuffer;
4216    cmd->state.attachments = suspended->state.suspended_pass.attachments;
4217    cmd->state.clear_values = suspended->state.suspended_pass.clear_values;
4218    cmd->state.render_area = suspended->state.suspended_pass.render_area;
4219    cmd->state.gmem_layout = suspended->state.suspended_pass.gmem_layout;
4220    cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
4221    cmd->state.lrz = suspended->state.suspended_pass.lrz;
4222 }
4223 
4224 /* Take the saved pre-chain in "secondary" and copy its commands to "cmd",
4225  * appending it after any saved-up commands in "cmd".
4226  */
4227 void
4228 tu_append_pre_chain(struct tu_cmd_buffer *cmd,
4229                     struct tu_cmd_buffer *secondary)
4230 {
4231    tu_cs_add_entries(&cmd->draw_cs, &secondary->pre_chain.draw_cs);
4232    tu_cs_add_entries(&cmd->draw_epilogue_cs,
4233                      &secondary->pre_chain.draw_epilogue_cs);
4234 
4235    tu_render_pass_state_merge(&cmd->state.rp,
4236                               &secondary->pre_chain.state);
4237    tu_clone_trace_range(cmd, &cmd->draw_cs, secondary->pre_chain.trace_renderpass_start,
4238          secondary->pre_chain.trace_renderpass_end);
4239    util_dynarray_append_dynarray(&cmd->fdm_bin_patchpoints,
4240                                  &secondary->pre_chain.fdm_bin_patchpoints);
4241 }
4242 
4243 /* Take the saved post-chain in "secondary" and copy it to "cmd".
4244  */
4245 void
4246 tu_append_post_chain(struct tu_cmd_buffer *cmd,
4247                      struct tu_cmd_buffer *secondary)
4248 {
4249    tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
4250    tu_cs_add_entries(&cmd->draw_epilogue_cs, &secondary->draw_epilogue_cs);
4251 
4252    tu_clone_trace_range(cmd, &cmd->draw_cs, secondary->trace_renderpass_start,
4253          secondary->trace_renderpass_end);
4254    cmd->state.rp = secondary->state.rp;
4255    util_dynarray_append_dynarray(&cmd->fdm_bin_patchpoints,
4256                                  &secondary->fdm_bin_patchpoints);
4257 }
4258 
4259 /* Assuming "secondary" is just a sequence of suspended and resuming passes,
4260  * copy its state to "cmd". This also works instead of tu_append_post_chain(),
4261  * but it's a bit slower because we don't assume that the chain begins in
4262  * "secondary" and therefore have to care about the command buffer's
4263  * renderpass state.
4264  */
4265 void
4266 tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
4267                          struct tu_cmd_buffer *secondary)
4268 {
4269    tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
4270    tu_cs_add_entries(&cmd->draw_epilogue_cs, &secondary->draw_epilogue_cs);
4271 
4272    tu_clone_trace_range(cmd, &cmd->draw_cs, secondary->trace_renderpass_start,
4273          secondary->trace_renderpass_end);
4274    tu_render_pass_state_merge(&cmd->state.rp,
4275                               &secondary->state.rp);
4276    util_dynarray_append_dynarray(&cmd->fdm_bin_patchpoints,
4277                                  &secondary->fdm_bin_patchpoints);
4278 }
4279 
4280 /* Take the current render pass state and save it to "pre_chain" to be
4281  * combined later.
4282  */
4283 static void
4284 tu_save_pre_chain(struct tu_cmd_buffer *cmd)
4285 {
4286    tu_cs_add_entries(&cmd->pre_chain.draw_cs,
4287                      &cmd->draw_cs);
4288    tu_cs_add_entries(&cmd->pre_chain.draw_epilogue_cs,
4289                      &cmd->draw_epilogue_cs);
4290    cmd->pre_chain.trace_renderpass_start =
4291       cmd->trace_renderpass_start;
4292    cmd->pre_chain.trace_renderpass_end =
4293       cmd->trace_renderpass_end;
4294    cmd->pre_chain.state = cmd->state.rp;
4295    util_dynarray_append_dynarray(&cmd->pre_chain.fdm_bin_patchpoints,
4296                                  &cmd->fdm_bin_patchpoints);
4297    cmd->pre_chain.patchpoints_ctx = cmd->patchpoints_ctx;
4298    cmd->patchpoints_ctx = NULL;
4299 }
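/* Rough shape of a suspended render pass split across command buffers
 * (editor's sketch):
 *
 *    cmdbuf A:  begin(suspending) ... end           -> chain starts in A
 *    cmdbuf B:  begin(resuming)   ... end           -> chain ends in B
 *
 * B cannot render its part on its own, so the commands recorded before the
 * chain's end are stashed as its pre_chain (tu_save_pre_chain() above) and
 * later appended to the buffer that owns the start of the chain via
 * tu_append_pre_chain(). The SR_* handling in tu_CmdExecuteCommands() below
 * walks exactly this bookkeeping when secondaries are involved.
 */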
4300 
4301 VKAPI_ATTR void VKAPI_CALL
4302 tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
4303                       uint32_t commandBufferCount,
4304                       const VkCommandBuffer *pCmdBuffers)
4305 {
4306    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4307    VkResult result;
4308 
4309    assert(commandBufferCount > 0);
4310 
4311    /* Emit any pending flushes. */
4312    if (cmd->state.pass) {
4313       tu_clean_all_pending(&cmd->state.renderpass_cache);
4314       TU_CALLX(cmd->device, tu_emit_cache_flush_renderpass)(cmd);
4315    } else {
4316       tu_clean_all_pending(&cmd->state.cache);
4317       TU_CALLX(cmd->device, tu_emit_cache_flush)(cmd);
4318    }
4319 
4320    for (uint32_t i = 0; i < commandBufferCount; i++) {
4321       VK_FROM_HANDLE(tu_cmd_buffer, secondary, pCmdBuffers[i]);
4322 
4323       if (secondary->usage_flags &
4324           VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
4325          assert(tu_cs_is_empty(&secondary->cs));
4326 
4327          result = tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
4328          if (result != VK_SUCCESS) {
4329             vk_command_buffer_set_error(&cmd->vk, result);
4330             break;
4331          }
4332 
4333          result = tu_cs_add_entries(&cmd->draw_epilogue_cs,
4334                &secondary->draw_epilogue_cs);
4335          if (result != VK_SUCCESS) {
4336             vk_command_buffer_set_error(&cmd->vk, result);
4337             break;
4338          }
4339 
4340          /* If LRZ was made invalid in the secondary, we should disable
4341           * LRZ retroactively for the whole renderpass.
4342           */
4343          if (!secondary->state.lrz.valid)
4344             cmd->state.lrz.valid = false;
4345 
4346          tu_clone_trace(cmd, &cmd->draw_cs, &secondary->trace);
4347          tu_render_pass_state_merge(&cmd->state.rp, &secondary->state.rp);
4348          util_dynarray_append_dynarray(&cmd->fdm_bin_patchpoints,
4349                                        &secondary->fdm_bin_patchpoints);
4350       } else {
4351          switch (secondary->state.suspend_resume) {
4352          case SR_NONE:
4353             assert(tu_cs_is_empty(&secondary->draw_cs));
4354             assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
4355             tu_cs_add_entries(&cmd->cs, &secondary->cs);
4356             tu_clone_trace(cmd, &cmd->cs, &secondary->trace);
4357             break;
4358 
4359          case SR_IN_PRE_CHAIN:
4360             /* cmd may be empty, which means that the chain begins before cmd,
4361              * in which case we have to update its state.
4362              */
4363             if (cmd->state.suspend_resume == SR_NONE) {
4364                cmd->state.suspend_resume = SR_IN_PRE_CHAIN;
4365                cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
4366             }
4367 
4368             /* The secondary is just a continuous suspend/resume chain so we
4369              * just have to append it to the command buffer.
4370              */
4371             assert(tu_cs_is_empty(&secondary->cs));
4372             tu_append_pre_post_chain(cmd, secondary);
4373             break;
4374 
4375          case SR_AFTER_PRE_CHAIN:
4376          case SR_IN_CHAIN:
4377          case SR_IN_CHAIN_AFTER_PRE_CHAIN:
4378             if (secondary->state.suspend_resume == SR_AFTER_PRE_CHAIN ||
4379                 secondary->state.suspend_resume == SR_IN_CHAIN_AFTER_PRE_CHAIN) {
4380                /* In these cases there is a `pre_chain` in the secondary, ending
4381                 * a render pass chain, that we need to append to the primary.
4382                 */
4383 
4384                if (cmd->state.suspend_resume == SR_NONE)
4385                   cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
4386 
4387                tu_append_pre_chain(cmd, secondary);
4388 
4389                /* We're about to render, so we need to end the command stream
4390                 * in case there were any extra commands generated by copying
4391                 * the trace.
4392                 */
4393                tu_cs_end(&cmd->draw_cs);
4394                tu_cs_end(&cmd->draw_epilogue_cs);
4395 
4396                switch (cmd->state.suspend_resume) {
4397                case SR_NONE:
4398                case SR_IN_PRE_CHAIN:
4399                   /* The renderpass chain ends in the secondary but isn't
4400                    * started in the primary, so we have to move the state to
4401                    * `pre_chain`.
4402                    */
4403                   cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace);
4404                   tu_save_pre_chain(cmd);
4405                   cmd->state.suspend_resume = SR_AFTER_PRE_CHAIN;
4406                   break;
4407                case SR_IN_CHAIN:
4408                case SR_IN_CHAIN_AFTER_PRE_CHAIN:
4409                   /* The renderpass ends in the secondary and starts somewhere
4410                    * earlier in this primary. Since the last render pass in
4411                    * the chain is in the secondary, we are technically outside
4412                    * of a render pass.  Fix that here by reusing the dynamic
4413                    * render pass that was setup for the last suspended render
4414                    * pass before the secondary.
4415                    */
4416                   tu_restore_suspended_pass(cmd, cmd);
4417 
4418                   TU_CALLX(cmd->device, tu_cmd_render)(cmd);
4419                   if (cmd->state.suspend_resume == SR_IN_CHAIN)
4420                      cmd->state.suspend_resume = SR_NONE;
4421                   else
4422                      cmd->state.suspend_resume = SR_AFTER_PRE_CHAIN;
4423                   break;
4424                case SR_AFTER_PRE_CHAIN:
4425                   unreachable("resuming render pass is not preceded by suspending one");
4426                }
4427 
4428                tu_reset_render_pass(cmd);
4429             }
4430 
4431             tu_cs_add_entries(&cmd->cs, &secondary->cs);
4432 
4433             if (secondary->state.suspend_resume == SR_IN_CHAIN_AFTER_PRE_CHAIN ||
4434                 secondary->state.suspend_resume == SR_IN_CHAIN) {
4435                /* The secondary ends in a "post-chain" (the opposite of a
4436                 * pre-chain) that we need to copy into the current command
4437                 * buffer.
4438                 */
4439                cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
4440                tu_append_post_chain(cmd, secondary);
4441                cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace);
4442                cmd->state.suspended_pass = secondary->state.suspended_pass;
4443 
4444                switch (cmd->state.suspend_resume) {
4445                case SR_NONE:
4446                   cmd->state.suspend_resume = SR_IN_CHAIN;
4447                   break;
4448                case SR_AFTER_PRE_CHAIN:
4449                   cmd->state.suspend_resume = SR_IN_CHAIN_AFTER_PRE_CHAIN;
4450                   break;
4451                default:
4452                   unreachable("suspending render pass is followed by a non-resuming one");
4453                }
4454             }
4455          }
4456       }
4457 
4458       cmd->state.index_size = secondary->state.index_size; /* for restart index update */
4459    }
4460    cmd->state.dirty = ~0u; /* TODO: set dirty only what needs to be */
4461 
4462    if (!cmd->state.lrz.gpu_dir_tracking && cmd->state.pass) {
4463       /* After a secondary command buffer is executed, LRZ is not valid
4464        * until it is cleared again.
4465        */
4466       cmd->state.lrz.valid = false;
4467    }
4468 
4469    /* After executing secondary command buffers, there may have been arbitrary
4470     * flushes executed, so when we encounter a pipeline barrier with a
4471     * srcMask, we have to assume that we need to invalidate. Therefore we need
4472     * to re-initialize the cache with all pending invalidate bits set.
4473     */
4474    if (cmd->state.pass) {
4475       struct tu_cache_state *cache = &cmd->state.renderpass_cache;
4476       BITMASK_ENUM(tu_cmd_flush_bits) retained_pending_flush_bits =
4477          cache->pending_flush_bits & TU_CMD_FLAG_BLIT_CACHE_CLEAN;
4478       tu_cache_init(cache);
4479       cache->pending_flush_bits |= retained_pending_flush_bits;
4480    } else {
4481       tu_cache_init(&cmd->state.cache);
4482    }
4483 }
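/* For reference, the RENDER_PASS_CONTINUE path above corresponds to the
 * usual application pattern (editor's sketch, not driver code):
 *
 *    VkCommandBufferInheritanceInfo inherit = {
 *       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO,
 *       .renderPass = render_pass,
 *       .subpass = 0,
 *    };
 *    VkCommandBufferBeginInfo begin = {
 *       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
 *       .flags = VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT,
 *       .pInheritanceInfo = &inherit,
 *    };
 *    // Record draws into the secondary, then inside the primary's render
 *    // pass (begun with VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS):
 *    vkCmdExecuteCommands(primary, 1, &secondary);
 *
 * which is why only draw_cs/draw_epilogue_cs entries are merged in that
 * branch rather than the secondary's top-level cs.
 */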
4484 
4485 static void
4486 tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
4487                    const struct tu_subpass_barrier *barrier,
4488                    bool external)
4489 {
4490    /* Note: we don't know until the end of the subpass whether we'll use
4491     * sysmem, so assume sysmem here to be safe.
4492     */
4493    struct tu_cache_state *cache =
4494       external ? &cmd_buffer->state.cache : &cmd_buffer->state.renderpass_cache;
4495    VkPipelineStageFlags2 src_stage_vk =
4496       sanitize_src_stage(barrier->src_stage_mask);
4497    VkPipelineStageFlags2 dst_stage_vk =
4498       sanitize_dst_stage(barrier->dst_stage_mask);
4499    BITMASK_ENUM(tu_cmd_access_mask) src_flags =
4500       vk2tu_access(barrier->src_access_mask, src_stage_vk, false, false);
4501    BITMASK_ENUM(tu_cmd_access_mask) dst_flags =
4502       vk2tu_access(barrier->dst_access_mask, dst_stage_vk, false, false);
4503 
4504    if (barrier->incoherent_ccu_color)
4505       src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
4506    if (barrier->incoherent_ccu_depth)
4507       src_flags |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
4508 
4509    tu_flush_for_access(cache, src_flags, dst_flags);
4510 
4511    enum tu_stage src_stage = vk2tu_src_stage(src_stage_vk);
4512    enum tu_stage dst_stage = vk2tu_dst_stage(dst_stage_vk);
4513    tu_flush_for_stage(cache, src_stage, dst_stage);
4514 }
4515 
4516 template <chip CHIP>
4517 static void
4518 tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resolve_group)
4519 {
4520    struct tu_cs *cs = &cmd->draw_cs;
4521    uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;
4522 
4523    /* If we might choose to bin, then put the loads under a check for geometry
4524     * having been binned to this tile.  If we don't choose to bin in the end,
4525     * then we will have manually set those registers to say geometry is present.
4526     *
4527     * However, if the draw CS has a write to the condition for some other reason
4528     * (perf queries), then we can't do this optimization since the
4529     * start-of-the-CS geometry condition will have been overwritten.
4530     */
4531    bool cond_load_allowed = cmd->state.tiling->binning &&
4532                             cmd->state.pass->has_cond_load_store &&
4533                             !cmd->state.rp.draw_cs_writes_to_cond_pred;
4534 
4535    if (cmd->state.pass->has_fdm)
4536       tu_cs_set_writeable(cs, true);
4537 
4538    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
4539 
4540    /* Emit gmem loads that are first used in this subpass. */
4541    bool emitted_scissor = false;
4542    for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) {
4543       struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[i];
4544       if ((att->load || att->load_stencil) && att->first_subpass_idx == subpass_idx) {
4545          if (!emitted_scissor) {
4546             tu6_emit_blit_scissor(cmd, cs, true);
4547             emitted_scissor = true;
4548          }
4549          tu_load_gmem_attachment<CHIP>(cmd, cs, resolve_group, i,
4550                                        cond_load_allowed, false);
4551       }
4552    }
4553 
4554    if (!cmd->device->physical_device->info->a7xx.has_generic_clear) {
4555       /* Emit gmem clears that are first used in this subpass. */
4556       emitted_scissor = false;
4557       for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) {
4558          struct tu_render_pass_attachment *att =
4559             &cmd->state.pass->attachments[i];
4560          if (att->clear_mask && att->first_subpass_idx == subpass_idx) {
4561             if (!emitted_scissor) {
4562                tu6_emit_blit_scissor(cmd, cs, false);
4563                emitted_scissor = true;
4564             }
4565             tu_clear_gmem_attachment<CHIP>(cmd, cs, resolve_group, i);
4566          }
4567       }
4568    }
4569 
4570    tu_cond_exec_end(cs); /* CP_COND_EXEC_0_RENDER_MODE_GMEM */
4571 
4572    if (cmd->state.pass->has_fdm)
4573       tu_cs_set_writeable(cs, false);
4574 
4575 }
4576 
4577 /* Emits sysmem clears that are first used in this subpass. */
4578 template <chip CHIP>
4579 static void
4580 tu_emit_subpass_begin_sysmem(struct tu_cmd_buffer *cmd)
4581 {
4582    if (cmd->device->physical_device->info->a7xx.has_generic_clear)
4583       return;
4584 
4585    struct tu_cs *cs = &cmd->draw_cs;
4586    uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;
4587 
4588    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
4589    for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) {
4590       struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[i];
4591       if (att->clear_mask && att->first_subpass_idx == subpass_idx)
4592          tu_clear_sysmem_attachment<CHIP>(cmd, cs, i);
4593    }
4594    tu_cond_exec_end(cs); /* sysmem */
4595 }
4596 
4597 static void
4598 tu7_emit_subpass_clear(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resolve_group)
4599 {
4600    if (cmd->state.render_area.extent.width == 0 ||
4601        cmd->state.render_area.extent.height == 0)
4602       return;
4603 
4604    struct tu_cs *cs = &cmd->draw_cs;
4605    uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;
4606 
4607    bool emitted_scissor = false;
4608    for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) {
4609       struct tu_render_pass_attachment *att =
4610          &cmd->state.pass->attachments[i];
4611       if (att->clear_mask && att->first_subpass_idx == subpass_idx) {
4612          if (!emitted_scissor) {
4613             tu6_emit_blit_scissor(cmd, cs, false);
4614             emitted_scissor = true;
4615          }
4616          tu7_generic_clear_attachment(cmd, cs, resolve_group, i);
4617       }
4618    }
4619 }
4620 
4621 static void
4622 tu7_emit_subpass_shading_rate(struct tu_cmd_buffer *cmd,
4623                               const struct tu_subpass *subpass,
4624                               struct tu_cs *cs)
4625 {
4626    if (subpass->fsr_attachment == VK_ATTACHMENT_UNUSED) {
4627       tu_cs_emit_regs(cs, A7XX_GRAS_FSR_BUFFER_DESC(),
4628                       A7XX_GRAS_FSR_BUFFER_SIZE());
4629       tu_cs_emit_regs(cs, A7XX_GRAS_FSR_BUFFER_PITCH());
4630       tu_cs_emit_regs(cs, A7XX_GRAS_FSR_BUFFER_BASE());
4631       /* We need to invalidate the cache when changing to a NULL FSR
4632        * attachment, but only once.
4633        */
4634       if (!cmd->prev_fsr_is_null) {
4635          tu_emit_raw_event_write<A7XX>(cmd, cs, LRZ_Q_CACHE_INVALIDATE,
4636                                        false);
4637          cmd->prev_fsr_is_null = true;
4638       }
4639       return;
4640    }
4641 
4642    const struct tu_image_view *iview =
4643       cmd->state.attachments[subpass->fsr_attachment];
4644    assert(iview->vk.format == VK_FORMAT_R8_UINT);
4645 
4646    tu_cs_emit_regs(
4647       cs,
4648       A7XX_GRAS_FSR_BUFFER_DESC(.layered = true,
4649                                 .tile_mode =
4650                                    (a6xx_tile_mode) iview->image->layout[0]
4651                                       .tile_mode, ),
4652       A7XX_GRAS_FSR_BUFFER_SIZE(.width = iview->view.width,
4653                                 .height = iview->view.height));
4654    tu_cs_emit_regs(
4655       cs, A7XX_GRAS_FSR_BUFFER_PITCH(.pitch = iview->view.pitch,
4656                                      .array_pitch = iview->view.layer_size));
4657    tu_cs_emit_regs(cs,
4658                    A7XX_GRAS_FSR_BUFFER_BASE(.qword = iview->view.base_addr));
4659 
4660    tu_emit_raw_event_write<A7XX>(cmd, cs, LRZ_Q_CACHE_INVALIDATE, false);
4661    cmd->prev_fsr_is_null = false;
4662 }
4663 
4664 /* emit loads, clears, and mrt/zs/msaa/ubwc state for the subpass that is
4665  * starting (either at vkCmdBeginRenderPass2() or vkCmdNextSubpass2())
4666  *
4667  * Clears and loads have to happen at this point, because with
4668  * VK_ATTACHMENT_DESCRIPTION_MAY_ALIAS_BIT the loads may depend on the output of
4669  * a previous aliased attachment's store.
4670  */
4671 template <chip CHIP>
4672 static void
4673 tu_emit_subpass_begin(struct tu_cmd_buffer *cmd)
4674 {
4675    tu_fill_render_pass_state(&cmd->state.vk_rp, cmd->state.pass, cmd->state.subpass);
4676 
4677    struct tu_resolve_group resolve_group = {};
4678 
4679    tu_emit_subpass_begin_gmem<CHIP>(cmd, &resolve_group);
4680    tu_emit_subpass_begin_sysmem<CHIP>(cmd);
4681    if (cmd->device->physical_device->info->a7xx.has_generic_clear) {
4682       tu7_emit_subpass_clear(cmd, &resolve_group);
4683    }
4684 
4685    tu_emit_resolve_group<CHIP>(cmd, &cmd->draw_cs, &resolve_group);
4686 
4687    tu6_emit_zs<CHIP>(cmd, cmd->state.subpass, &cmd->draw_cs);
4688    tu6_emit_mrt<CHIP>(cmd, cmd->state.subpass, &cmd->draw_cs);
4689    tu6_emit_render_cntl<CHIP>(cmd, cmd->state.subpass, &cmd->draw_cs, false);
4690 
4691    if (CHIP >= A7XX) {
4692       tu7_emit_subpass_shading_rate(cmd, cmd->state.subpass, &cmd->draw_cs);
4693    }
4694 
4695    tu_set_input_attachments(cmd, cmd->state.subpass);
4696 
4697    vk_cmd_set_cb_attachment_count(&cmd->vk, cmd->state.subpass->color_count);
4698 
4699    cmd->state.dirty |= TU_CMD_DIRTY_SUBPASS;
4700 }
4701 
4702 template <chip CHIP>
4703 VKAPI_ATTR void VKAPI_CALL
4704 tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
4705                        const VkRenderPassBeginInfo *pRenderPassBegin,
4706                        const VkSubpassBeginInfo *pSubpassBeginInfo)
4707 {
4708    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4709 
4710    if (TU_DEBUG(DYNAMIC)) {
4711       vk_common_CmdBeginRenderPass2(commandBuffer, pRenderPassBegin,
4712                                     pSubpassBeginInfo);
4713       return;
4714    }
4715 
4716    VK_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass);
4717    VK_FROM_HANDLE(tu_framebuffer, fb, pRenderPassBegin->framebuffer);
4718 
4719    const struct VkRenderPassAttachmentBeginInfo *pAttachmentInfo =
4720       vk_find_struct_const(pRenderPassBegin->pNext,
4721                            RENDER_PASS_ATTACHMENT_BEGIN_INFO);
4722 
4723    cmd->state.pass = pass;
4724    cmd->state.subpass = pass->subpasses;
4725    cmd->state.framebuffer = fb;
4726    cmd->state.render_area = pRenderPassBegin->renderArea;
4727 
4728    if (pass->attachment_count > 0) {
4729       VK_MULTIALLOC(ma);
4730       vk_multialloc_add(&ma, &cmd->state.attachments,
4731                         const struct tu_image_view *, pass->attachment_count);
4732       vk_multialloc_add(&ma, &cmd->state.clear_values, VkClearValue,
4733                         pRenderPassBegin->clearValueCount);
4734       if (!vk_multialloc_alloc(&ma, &cmd->vk.pool->alloc,
4735                                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) {
4736          vk_command_buffer_set_error(&cmd->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
4737          return;
4738       }
4739    }
4740 
4741    if (cmd->device->dbg_renderpass_stomp_cs) {
4742       tu_cs_emit_call(&cmd->cs, cmd->device->dbg_renderpass_stomp_cs);
4743    }
4744 
4745    for (unsigned i = 0; i < pass->attachment_count; i++) {
4746       cmd->state.attachments[i] = pAttachmentInfo ?
4747          tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) :
4748          cmd->state.framebuffer->attachments[i].attachment;
4749    }
4750    if (pass->attachment_count) {
4751       for (unsigned i = 0; i < pRenderPassBegin->clearValueCount; i++)
4752             cmd->state.clear_values[i] = pRenderPassBegin->pClearValues[i];
4753    }
4754 
4755    tu_choose_gmem_layout(cmd);
4756 
4757    tu_trace_start_render_pass(cmd);
4758 
4759    /* Note: because this is external, any flushes will happen before draw_cs
4760     * gets called. However deferred flushes could have to happen later as part
4761     * of the subpass.
4762     */
4763    tu_subpass_barrier(cmd, &pass->subpasses[0].start_barrier, true);
4764    cmd->state.renderpass_cache.pending_flush_bits =
4765       cmd->state.cache.pending_flush_bits;
4766    cmd->state.renderpass_cache.flush_bits = 0;
4767 
4768    if (pass->subpasses[0].feedback_invalidate) {
4769       cmd->state.renderpass_cache.flush_bits |=
4770          TU_CMD_FLAG_CACHE_INVALIDATE | TU_CMD_FLAG_BLIT_CACHE_CLEAN |
4771          TU_CMD_FLAG_WAIT_FOR_IDLE;
4772    }
4773 
4774    tu_lrz_begin_renderpass<CHIP>(cmd);
4775 
4776    cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
4777 
4778    tu_emit_renderpass_begin(cmd);
4779    tu_emit_subpass_begin<CHIP>(cmd);
4780 
4781    cmd->patchpoints_ctx = ralloc_context(NULL);
4782 }
4783 TU_GENX(tu_CmdBeginRenderPass2);
4784 
4785 template <chip CHIP>
4786 VKAPI_ATTR void VKAPI_CALL
4787 tu_CmdBeginRendering(VkCommandBuffer commandBuffer,
4788                      const VkRenderingInfo *pRenderingInfo)
4789 {
4790    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4791 
4792    tu_setup_dynamic_render_pass(cmd, pRenderingInfo);
4793    tu_setup_dynamic_framebuffer(cmd, pRenderingInfo);
4794 
4795    cmd->state.pass = &cmd->dynamic_pass;
4796    cmd->state.subpass = &cmd->dynamic_subpass;
4797    cmd->state.framebuffer = &cmd->dynamic_framebuffer;
4798    cmd->state.render_area = pRenderingInfo->renderArea;
4799    cmd->state.blit_cache_cleaned = false;
4800 
4801    cmd->state.attachments = cmd->dynamic_attachments;
4802    cmd->state.clear_values = cmd->dynamic_clear_values;
4803 
4804    for (unsigned i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
4805       uint32_t a = cmd->dynamic_subpass.color_attachments[i].attachment;
4806       if (!pRenderingInfo->pColorAttachments[i].imageView)
4807          continue;
4808 
4809       cmd->state.clear_values[a] =
4810          pRenderingInfo->pColorAttachments[i].clearValue;
4811 
4812       VK_FROM_HANDLE(tu_image_view, view,
4813                      pRenderingInfo->pColorAttachments[i].imageView);
4814       cmd->state.attachments[a] = view;
4815 
4816       a = cmd->dynamic_subpass.resolve_attachments[i].attachment;
4817       if (a != VK_ATTACHMENT_UNUSED) {
4818          VK_FROM_HANDLE(tu_image_view, resolve_view,
4819                         pRenderingInfo->pColorAttachments[i].resolveImageView);
4820          cmd->state.attachments[a] = resolve_view;
4821       }
4822    }
4823 
4824    uint32_t a = cmd->dynamic_subpass.depth_stencil_attachment.attachment;
4825    if (pRenderingInfo->pDepthAttachment || pRenderingInfo->pStencilAttachment) {
4826       const struct VkRenderingAttachmentInfo *common_info =
4827          (pRenderingInfo->pDepthAttachment &&
4828           pRenderingInfo->pDepthAttachment->imageView != VK_NULL_HANDLE) ?
4829          pRenderingInfo->pDepthAttachment :
4830          pRenderingInfo->pStencilAttachment;
4831       if (common_info && common_info->imageView != VK_NULL_HANDLE) {
4832          VK_FROM_HANDLE(tu_image_view, view, common_info->imageView);
4833          cmd->state.attachments[a] = view;
4834          if (pRenderingInfo->pDepthAttachment) {
4835             cmd->state.clear_values[a].depthStencil.depth =
4836                pRenderingInfo->pDepthAttachment->clearValue.depthStencil.depth;
4837          }
4838 
4839          if (pRenderingInfo->pStencilAttachment) {
4840             cmd->state.clear_values[a].depthStencil.stencil =
4841                pRenderingInfo->pStencilAttachment->clearValue.depthStencil.stencil;
4842          }
4843 
4844          if (cmd->dynamic_subpass.resolve_count >
4845              cmd->dynamic_subpass.color_count) {
4846             VK_FROM_HANDLE(tu_image_view, resolve_view,
4847                            common_info->resolveImageView);
4848             a = cmd->dynamic_subpass.resolve_attachments[cmd->dynamic_subpass.color_count].attachment;
4849             cmd->state.attachments[a] = resolve_view;
4850          }
4851       }
4852    }
4853 
4854    a = cmd->dynamic_pass.fragment_density_map.attachment;
4855    if (a != VK_ATTACHMENT_UNUSED) {
4856       const VkRenderingFragmentDensityMapAttachmentInfoEXT *fdm_info =
4857          vk_find_struct_const(pRenderingInfo->pNext,
4858                               RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_INFO_EXT);
4859       VK_FROM_HANDLE(tu_image_view, view, fdm_info->imageView);
4860       cmd->state.attachments[a] = view;
4861    }
4862 
4863    const VkRenderingAttachmentLocationInfoKHR ral_info = {
4864       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
4865       .colorAttachmentCount = pRenderingInfo->colorAttachmentCount,
4866    };
4867    vk_cmd_set_rendering_attachment_locations(&cmd->vk, &ral_info);
4868 
4869    cmd->patchpoints_ctx = ralloc_context(NULL);
4870 
4871    a = cmd->dynamic_subpass.fsr_attachment;
4872    if (a != VK_ATTACHMENT_UNUSED) {
4873       const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_info =
4874          vk_find_struct_const(pRenderingInfo->pNext,
4875                               RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
4876       VK_FROM_HANDLE(tu_image_view, view, fsr_info->imageView);
4877       cmd->state.attachments[a] = view;
4878    }
4879 
4880    tu_choose_gmem_layout(cmd);
4881 
4882    cmd->state.renderpass_cache.pending_flush_bits =
4883       cmd->state.cache.pending_flush_bits;
4884    cmd->state.renderpass_cache.flush_bits = 0;
4885 
4886    bool resuming = pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT;
4887    bool suspending = pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT;
4888    cmd->state.suspending = suspending;
4889    cmd->state.resuming = resuming;
4890 
4891    if (!resuming && cmd->device->dbg_renderpass_stomp_cs) {
4892       tu_cs_emit_call(&cmd->cs, cmd->device->dbg_renderpass_stomp_cs);
4893    }
4894 
4895    /* We can't track LRZ across command buffer boundaries, so we have to
4896     * disable LRZ when resuming/suspending unless we can track on the GPU.
4897     */
4898    if ((resuming || suspending) &&
4899        !cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) {
4900       cmd->state.lrz.valid = false;
4901    } else {
4902       if (resuming)
4903          tu_lrz_begin_resumed_renderpass<CHIP>(cmd);
4904       else
4905          tu_lrz_begin_renderpass<CHIP>(cmd);
4906    }
4907 
4908 
4909    if (suspending) {
4910       cmd->state.suspended_pass.pass = cmd->state.pass;
4911       cmd->state.suspended_pass.subpass = cmd->state.subpass;
4912       cmd->state.suspended_pass.framebuffer = cmd->state.framebuffer;
4913       cmd->state.suspended_pass.render_area = cmd->state.render_area;
4914       cmd->state.suspended_pass.attachments = cmd->state.attachments;
4915       cmd->state.suspended_pass.clear_values = cmd->state.clear_values;
4916       cmd->state.suspended_pass.gmem_layout = cmd->state.gmem_layout;
4917    }
4918 
4919    if (!resuming)
4920       tu_trace_start_render_pass(cmd);
4921 
4922    if (!resuming || cmd->state.suspend_resume == SR_NONE) {
4923       cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
4924    }
4925 
4926    if (!resuming) {
4927       tu_emit_renderpass_begin(cmd);
4928       tu_emit_subpass_begin<CHIP>(cmd);
4929    }
4930 
4931    if (suspending && !resuming) {
4932       /* entering a chain */
4933       switch (cmd->state.suspend_resume) {
4934       case SR_NONE:
4935          cmd->state.suspend_resume = SR_IN_CHAIN;
4936          break;
4937       case SR_AFTER_PRE_CHAIN:
4938          cmd->state.suspend_resume = SR_IN_CHAIN_AFTER_PRE_CHAIN;
4939          break;
4940       case SR_IN_PRE_CHAIN:
4941       case SR_IN_CHAIN:
4942       case SR_IN_CHAIN_AFTER_PRE_CHAIN:
4943          unreachable("suspending render pass not followed by resuming pass");
4944          break;
4945       }
4946    }
4947 
4948    if (resuming && cmd->state.suspend_resume == SR_NONE)
4949       cmd->state.suspend_resume = SR_IN_PRE_CHAIN;
4950 }
4951 TU_GENX(tu_CmdBeginRendering);
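/* The suspending/resuming handling above maps onto the application-side
 * dynamic rendering pattern (editor's sketch, not driver code):
 *
 *    VkRenderingInfo info = { .sType = VK_STRUCTURE_TYPE_RENDERING_INFO, ... };
 *
 *    info.flags = VK_RENDERING_SUSPENDING_BIT;
 *    vkCmdBeginRendering(cmdbuf_a, &info);      // starts the chain
 *    vkCmdEndRendering(cmdbuf_a);
 *
 *    info.flags = VK_RENDERING_RESUMING_BIT;
 *    vkCmdBeginRendering(cmdbuf_b, &info);      // continues it elsewhere
 *    vkCmdEndRendering(cmdbuf_b);
 *
 * Only a non-resuming begin emits the renderpass/subpass setup, and the
 * SR_* state machine records where this command buffer sits in the chain.
 */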
4952 
4953 template <chip CHIP>
4954 VKAPI_ATTR void VKAPI_CALL
4955 tu_CmdSetRenderingAttachmentLocationsKHR(
4956    VkCommandBuffer commandBuffer,
4957    const VkRenderingAttachmentLocationInfoKHR *pLocationInfo)
4958 {
4959    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4960 
4961    vk_common_CmdSetRenderingAttachmentLocationsKHR(commandBuffer, pLocationInfo);
4962 
4963    tu6_emit_mrt<CHIP>(cmd, cmd->state.subpass, &cmd->draw_cs);
4964    tu6_emit_render_cntl<CHIP>(cmd, cmd->state.subpass, &cmd->draw_cs, false);
4965 
4966    /* Because this is just a remapping and not a different "reference", there
4967     * doesn't need to be a barrier between accesses to the same attachment
4968     * with a different index. This is different from "classic" renderpasses.
4969     * Before a7xx the CCU includes the render target ID in the cache location
4970     * calculation, so we need to manually flush/invalidate color CCU here
4971     * since the same render target/attachment may be in a different location.
4972     */
4973    if (cmd->device->physical_device->info->chip == 6) {
4974       struct tu_cache_state *cache = &cmd->state.renderpass_cache;
4975       tu_flush_for_access(cache, TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE,
4976                           TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE);
4977       cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
4978    }
4979 }
4980 TU_GENX(tu_CmdSetRenderingAttachmentLocationsKHR);
4981 
4982 VKAPI_ATTR void VKAPI_CALL
4983 tu_CmdSetRenderingInputAttachmentIndicesKHR(
4984    VkCommandBuffer commandBuffer,
4985    const VkRenderingInputAttachmentIndexInfoKHR *pLocationInfo)
4986 {
4987    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4988 
4989    vk_common_CmdSetRenderingInputAttachmentIndicesKHR(commandBuffer, pLocationInfo);
4990 
4991    const struct vk_input_attachment_location_state *ial =
4992       &cmd->vk.dynamic_graphics_state.ial;
4993 
4994    struct tu_subpass *subpass = &cmd->dynamic_subpass;
4995 
4996    for (unsigned i = 0; i < ARRAY_SIZE(cmd->dynamic_input_attachments); i++) {
4997       subpass->input_attachments[i].attachment = VK_ATTACHMENT_UNUSED;
4998    }
4999 
5000    unsigned input_count = 0;
5001    for (unsigned i = 0; i < subpass->color_count; i++) {
5002       if (ial->color_map[i] == MESA_VK_ATTACHMENT_UNUSED)
5003          continue;
5004       subpass->input_attachments[ial->color_map[i] + TU_DYN_INPUT_ATT_OFFSET].attachment =
5005          subpass->color_attachments[i].attachment;
5006       input_count = MAX2(input_count, ial->color_map[i] + TU_DYN_INPUT_ATT_OFFSET + 1);
5007    }
5008 
5009    if (ial->depth_att != MESA_VK_ATTACHMENT_UNUSED) {
5010       if (ial->depth_att == MESA_VK_ATTACHMENT_NO_INDEX) {
5011          subpass->input_attachments[0].attachment =
5012             subpass->depth_stencil_attachment.attachment;
5013          input_count = MAX2(input_count, 1);
5014       } else {
5015          subpass->input_attachments[ial->depth_att + TU_DYN_INPUT_ATT_OFFSET].attachment =
5016             subpass->depth_stencil_attachment.attachment;
5017          input_count = MAX2(input_count, ial->depth_att + TU_DYN_INPUT_ATT_OFFSET + 1);
5018       }
5019    }
5020 
5021    if (ial->stencil_att != MESA_VK_ATTACHMENT_UNUSED) {
5022       if (ial->stencil_att == MESA_VK_ATTACHMENT_NO_INDEX) {
5023          subpass->input_attachments[0].attachment =
5024             subpass->depth_stencil_attachment.attachment;
5025          input_count = MAX2(input_count, 1);
5026       } else {
5027          subpass->input_attachments[ial->stencil_att + TU_DYN_INPUT_ATT_OFFSET].attachment =
5028             subpass->depth_stencil_attachment.attachment;
5029          input_count = MAX2(input_count, ial->stencil_att + TU_DYN_INPUT_ATT_OFFSET + 1);
5030       }
5031    }
5032 
5033    subpass->input_count = input_count;
5034 
5035    tu_set_input_attachments(cmd, cmd->state.subpass);
5036 }
5037 
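/* Illustrative example of the remapping above (hypothetical values, not part
 * of the driver): with ial->color_map[1] == 2 and the depth attachment mapped
 * to MESA_VK_ATTACHMENT_NO_INDEX, the table becomes
 *
 *    input_attachments[0]                           = depth/stencil attachment
 *    input_attachments[2 + TU_DYN_INPUT_ATT_OFFSET] = color_attachments[1]
 *
 * and input_count is the highest used slot plus one. All other slots stay
 * VK_ATTACHMENT_UNUSED.
 */
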
5038 template <chip CHIP>
5039 VKAPI_ATTR void VKAPI_CALL
5040 tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
5041                    const VkSubpassBeginInfo *pSubpassBeginInfo,
5042                    const VkSubpassEndInfo *pSubpassEndInfo)
5043 {
5044    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
5045 
5046    if (TU_DEBUG(DYNAMIC)) {
5047       vk_common_CmdNextSubpass2(commandBuffer, pSubpassBeginInfo,
5048                                 pSubpassEndInfo);
5049       return;
5050    }
5051 
5052    const struct tu_render_pass *pass = cmd->state.pass;
5053    const struct tu_framebuffer *fb = cmd->state.framebuffer;
5054    struct tu_cs *cs = &cmd->draw_cs;
5055    const struct tu_subpass *last_subpass = cmd->state.subpass;
5056 
5057    const struct tu_subpass *subpass = cmd->state.subpass++;
5058 
5059    /* Track LRZ valid state
5060     *
5061     * TODO: Improve this tracking to keep the state of past depth/stencil images,
5062     * so that if they become active again we can reuse their old state.
5063     */
5064    if (last_subpass->depth_stencil_attachment.attachment != subpass->depth_stencil_attachment.attachment) {
5065       cmd->state.lrz.valid = false;
5066       cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
5067    }
5068 
5069    if (cmd->state.tiling->possible) {
5070       if (cmd->state.pass->has_fdm)
5071          tu_cs_set_writeable(cs, true);
5072 
5073       tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
5074 
5075       if (subpass->resolve_attachments) {
5076          tu6_emit_blit_scissor(cmd, cs, true);
5077 
5078          struct tu_resolve_group resolve_group = {};
5079 
5080          for (unsigned i = 0; i < subpass->resolve_count; i++) {
5081             uint32_t a = subpass->resolve_attachments[i].attachment;
5082             if (a == VK_ATTACHMENT_UNUSED)
5083                continue;
5084 
5085             uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
5086 
5087             tu_store_gmem_attachment<CHIP>(cmd, cs, &resolve_group, a, gmem_a,
5088                                            fb->layers, subpass->multiview_mask, false);
5089 
5090             if (!pass->attachments[a].gmem)
5091                continue;
5092 
5093             /* Check whether the resolved attachment is needed by later subpasses;
5094              * if it is, we should do a GMEM->GMEM resolve instead of GMEM->MEM->GMEM.
5095              */
5096             perf_debug(cmd->device, "TODO: missing GMEM->GMEM resolve path\n");
5097             tu_load_gmem_attachment<CHIP>(cmd, cs, &resolve_group, a, false, true);
5098          }
5099 
5100          tu_emit_resolve_group<CHIP>(cmd, cs, &resolve_group);
5101       }
5102 
5103       tu_cond_exec_end(cs);
5104 
5105       if (cmd->state.pass->has_fdm)
5106          tu_cs_set_writeable(cs, false);
5107 
5108       tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
5109    }
5110 
5111    tu6_emit_sysmem_resolves<CHIP>(cmd, cs, subpass);
5112 
5113    if (cmd->state.tiling->possible)
5114       tu_cond_exec_end(cs);
5115 
5116    /* Handle dependencies for the next subpass */
5117    tu_subpass_barrier(cmd, &cmd->state.subpass->start_barrier, false);
5118 
5119    if (cmd->state.subpass->feedback_invalidate) {
5120       cmd->state.renderpass_cache.flush_bits |=
5121          TU_CMD_FLAG_CACHE_INVALIDATE | TU_CMD_FLAG_BLIT_CACHE_CLEAN |
5122          TU_CMD_FLAG_WAIT_FOR_IDLE;
5123    }
5124 
5125    tu_emit_subpass_begin<CHIP>(cmd);
5126 }
5127 TU_GENX(tu_CmdNextSubpass2);
5128 
5129 static uint32_t
5130 tu6_user_consts_size(const struct tu_const_state *const_state,
5131                      bool ldgk,
5132                      gl_shader_stage type)
5133 {
5134    uint32_t dwords = 0;
5135 
5136    if (const_state->push_consts.type == IR3_PUSH_CONSTS_PER_STAGE) {
5137       unsigned num_units = const_state->push_consts.dwords;
5138       dwords += 4 + num_units;
5139       assert(num_units > 0);
5140    }
5141 
5142    if (ldgk) {
5143       dwords += 6 + (2 * const_state->num_inline_ubos + 4);
5144    } else {
5145       dwords += 8 * const_state->num_inline_ubos;
5146    }
5147 
5148    return dwords;
5149 }
5150 
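/* Worked example of the budget above (a sketch, numbers are hypothetical): a
 * stage with 8 dwords of per-stage push constants and 2 inline UBOs on the
 * non-ldgk path needs
 *
 *    (4 + 8) + (8 * 2) = 28 dwords
 *
 * of sub-stream space: 4 + num_units for the CP_LOAD_STATE6 packet plus a
 * worst-case 8 dwords per inline UBO. On the ldgk path the inline-UBO part
 * becomes 6 + (2 * 2 + 4) = 14 dwords instead.
 */
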
5151 static void
5152 tu6_emit_per_stage_push_consts(struct tu_cs *cs,
5153                                const struct tu_const_state *const_state,
5154                                const struct ir3_const_state *ir_const_state,
5155                                gl_shader_stage type,
5156                                uint32_t *push_constants)
5157 {
5158    if (const_state->push_consts.type == IR3_PUSH_CONSTS_PER_STAGE) {
5159       unsigned num_units = const_state->push_consts.dwords;
5160       unsigned offset_vec4 =
5161          ir_const_state->allocs.consts[IR3_CONST_ALLOC_PUSH_CONSTS]
5162             .offset_vec4;
5163       assert(num_units > 0);
5164 
5165       /* DST_OFF and NUM_UNIT require vec4 units */
5166       tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_units);
5167       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset_vec4) |
5168             CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
5169             CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
5170             CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
5171             CP_LOAD_STATE6_0_NUM_UNIT(num_units / 4));
5172       tu_cs_emit(cs, 0);
5173       tu_cs_emit(cs, 0);
5174 
5175       unsigned lo = const_state->push_consts.lo_dwords;
5176       for (unsigned i = 0; i < num_units; i++)
5177          tu_cs_emit(cs, push_constants[i + lo]);
5178    }
5179 }
5180 
5181 static void
5182 tu6_emit_inline_ubo(struct tu_cs *cs,
5183                     const struct tu_const_state *const_state,
5184                     unsigned constlen,
5185                     gl_shader_stage type,
5186                     struct tu_descriptor_state *descriptors)
5187 {
5188    assert(const_state->num_inline_ubos == 0 || !cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble);
5189 
5190    /* Emit loads of inline uniforms. These load directly from the uniform's
5191     * storage space inside the descriptor set.
5192     */
5193    for (unsigned i = 0; i < const_state->num_inline_ubos; i++) {
5194       const struct tu_inline_ubo *ubo = &const_state->ubos[i];
5195 
5196       if (constlen <= ubo->const_offset_vec4)
5197          continue;
5198 
5199       uint64_t va = descriptors->set_iova[ubo->base] & ~0x3f;
5200 
5201       tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), ubo->push_address ? 7 : 3);
5202       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(ubo->const_offset_vec4) |
5203             CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
5204             CP_LOAD_STATE6_0_STATE_SRC(ubo->push_address ? SS6_DIRECT : SS6_INDIRECT) |
5205             CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
5206             CP_LOAD_STATE6_0_NUM_UNIT(MIN2(ubo->size_vec4, constlen - ubo->const_offset_vec4)));
5207       if (ubo->push_address) {
5208          tu_cs_emit(cs, 0);
5209          tu_cs_emit(cs, 0);
5210          tu_cs_emit_qw(cs, va + ubo->offset);
5211          tu_cs_emit(cs, 0);
5212          tu_cs_emit(cs, 0);
5213       } else {
5214          tu_cs_emit_qw(cs, va + ubo->offset);
5215       }
5216    }
5217 }
5218 
5219 static void
5220 tu7_emit_inline_ubo(struct tu_cs *cs,
5221                     const struct tu_const_state *const_state,
5222                     const struct ir3_const_state *ir_const_state,
5223                     unsigned constlen,
5224                     gl_shader_stage type,
5225                     struct tu_descriptor_state *descriptors)
5226 {
5227    uint64_t addresses[7] = {0};
5228    unsigned offset = const_state->inline_uniforms_ubo.idx;
5229 
5230    if (offset == -1)
5231       return;
5232 
5233    for (unsigned i = 0; i < const_state->num_inline_ubos; i++) {
5234       const struct tu_inline_ubo *ubo = &const_state->ubos[i];
5235 
5236       uint64_t va = descriptors->set_iova[ubo->base] & ~0x3f;
5237       addresses[i] = va + ubo->offset;
5238    }
5239 
5240    /* A7XX TODO: Emit data via sub_cs instead of NOP */
5241    uint64_t iova = tu_cs_emit_data_nop(cs, (uint32_t *)addresses, const_state->num_inline_ubos * 2, 4);
5242 
5243    tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 5);
5244    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
5245             CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
5246             CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
5247             CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
5248             CP_LOAD_STATE6_0_NUM_UNIT(1));
5249    tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
5250    tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
5251    int size_vec4s = DIV_ROUND_UP(const_state->num_inline_ubos * 2, 4);
5252    tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
5253 }
5254 
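/* Sketch of the UBO descriptor built above (illustrative only): each inline
 * UBO contributes one 64-bit address, i.e. 2 dwords, so with 3 inline UBOs
 * the address table is 6 dwords and
 *
 *    size_vec4s = DIV_ROUND_UP(3 * 2, 4) = 2
 *
 * The qword written at the end is the table iova with
 * A6XX_UBO_1_SIZE(size_vec4s) OR'd into the upper 32 bits.
 */
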
5255 static void
5256 tu_emit_inline_ubo(struct tu_cs *cs,
5257                    const struct tu_const_state *const_state,
5258                    const struct ir3_const_state *ir_const_state,
5259                    unsigned constlen,
5260                    gl_shader_stage type,
5261                    struct tu_descriptor_state *descriptors)
5262 {
5263    if (!const_state->num_inline_ubos)
5264       return;
5265 
5266    if (cs->device->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk) {
5267       tu7_emit_inline_ubo(cs, const_state, ir_const_state, constlen, type, descriptors);
5268    } else {
5269       tu6_emit_inline_ubo(cs, const_state, constlen, type, descriptors);
5270    }
5271 }
5272 
5273 static void
5274 tu6_emit_shared_consts(struct tu_cs *cs,
5275                        const struct tu_push_constant_range *shared_consts,
5276                        uint32_t *push_constants,
5277                        bool compute)
5278 {
5279    if (shared_consts->dwords > 0) {
5280       /* Offset and num_units for shared consts are in units of dwords. */
5281       unsigned num_units = shared_consts->dwords;
5282       unsigned offset = shared_consts->lo_dwords;
5283 
5284       enum a6xx_state_type st = compute ? ST6_UBO : ST6_CONSTANTS;
5285       uint32_t cp_load_state = compute ? CP_LOAD_STATE6_FRAG : CP_LOAD_STATE6;
5286 
5287       tu_cs_emit_pkt7(cs, cp_load_state, 3 + num_units);
5288       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
5289             CP_LOAD_STATE6_0_STATE_TYPE(st) |
5290             CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
5291             CP_LOAD_STATE6_0_STATE_BLOCK(SB6_IBO) |
5292             CP_LOAD_STATE6_0_NUM_UNIT(num_units));
5293       tu_cs_emit(cs, 0);
5294       tu_cs_emit(cs, 0);
5295 
5296       for (unsigned i = 0; i < num_units; i++)
5297          tu_cs_emit(cs, push_constants[i + offset]);
5298    }
5299 }
5300 
5301 static void
5302 tu7_emit_shared_preamble_consts(
5303    struct tu_cs *cs,
5304    const struct tu_push_constant_range *shared_consts,
5305    uint32_t *push_constants)
5306 {
5307    tu_cs_emit_pkt4(cs, REG_A7XX_HLSQ_SHARED_CONSTS_IMM(shared_consts->lo_dwords),
5308                    shared_consts->dwords);
5309    tu_cs_emit_array(cs, push_constants + shared_consts->lo_dwords,
5310                     shared_consts->dwords);
5311 }
5312 
5313 static uint32_t
5314 tu6_const_size(struct tu_cmd_buffer *cmd,
5315                const struct tu_push_constant_range *shared_consts,
5316                bool compute)
5317 {
5318    uint32_t dwords = 0;
5319 
5320    if (shared_consts->type == IR3_PUSH_CONSTS_SHARED) {
5321       dwords += shared_consts->dwords + 4;
5322    } else if (shared_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
5323       dwords += shared_consts->dwords + 1;
5324    }
5325 
5326    bool ldgk = cmd->device->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
5327    if (compute) {
5328       dwords +=
5329          tu6_user_consts_size(&cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state, ldgk, MESA_SHADER_COMPUTE);
5330    } else {
5331       for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++)
5332          dwords += tu6_user_consts_size(&cmd->state.shaders[type]->const_state, ldgk, (gl_shader_stage) type);
5333    }
5334 
5335    return dwords;
5336 }
5337 
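/* Budget sketch (illustrative): with 16 dwords of shared push constants, the
 * IR3_PUSH_CONSTS_SHARED path reserves 16 + 4 dwords (CP_LOAD_STATE6 header +
 * 3 setup dwords + data), while the SHARED_PREAMBLE path reserves 16 + 1
 * dwords (one pkt4 header for HLSQ_SHARED_CONSTS_IMM plus the data).
 */
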
5338 static struct tu_draw_state
5339 tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute)
5340 {
5341    uint32_t dwords = 0;
5342    const struct tu_push_constant_range *shared_consts =
5343       compute ? &cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state.push_consts :
5344       &cmd->state.program.shared_consts;
5345 
5346    dwords = tu6_const_size(cmd, shared_consts, compute);
5347 
5348    if (dwords == 0)
5349       return (struct tu_draw_state) {};
5350 
5351    struct tu_cs cs;
5352    tu_cs_begin_sub_stream(&cmd->sub_cs, dwords, &cs);
5353 
5354    if (shared_consts->type == IR3_PUSH_CONSTS_SHARED) {
5355       tu6_emit_shared_consts(&cs, shared_consts, cmd->push_constants, compute);
5356    } else if (shared_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
5357       tu7_emit_shared_preamble_consts(&cs, shared_consts, cmd->push_constants);
5358    }
5359 
5360    if (compute) {
5361       tu6_emit_per_stage_push_consts(
5362          &cs, &cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state,
5363          cmd->state.shaders[MESA_SHADER_COMPUTE]->variant->const_state,
5364          MESA_SHADER_COMPUTE, cmd->push_constants);
5365       tu_emit_inline_ubo(
5366          &cs, &cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state,
5367          cmd->state.shaders[MESA_SHADER_COMPUTE]->variant->const_state,
5368          cmd->state.shaders[MESA_SHADER_COMPUTE]->variant->constlen,
5369          MESA_SHADER_COMPUTE,
5370          tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_COMPUTE));
5371    } else {
5372       struct tu_descriptor_state *descriptors =
5373          tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
5374       for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++) {
5375          const struct tu_program_descriptor_linkage *link =
5376             &cmd->state.program.link[type];
5377          tu6_emit_per_stage_push_consts(&cs, &link->tu_const_state,
5378                                         &link->const_state,
5379                                         (gl_shader_stage) type,
5380                                         cmd->push_constants);
5381          tu_emit_inline_ubo(&cs, &link->tu_const_state,
5382                             &link->const_state, link->constlen,
5383                             (gl_shader_stage) type, descriptors);
5384       }
5385    }
5386 
5387    return tu_cs_end_draw_state(&cmd->sub_cs, &cs);
5388 }
5389 
5390 /* Various frontends (ANGLE, zink at least) will enable stencil testing with
5391  * what works out to be no-op writes.  Simplify what they give us into flags
5392  * that LRZ can use.
5393  */
5394 static void
5395 tu6_update_simplified_stencil_state(struct tu_cmd_buffer *cmd)
5396 {
5397    const struct vk_depth_stencil_state *ds =
5398       &cmd->vk.dynamic_graphics_state.ds;
5399    bool stencil_test_enable = ds->stencil.test_enable;
5400 
5401    if (!stencil_test_enable) {
5402       cmd->state.stencil_front_write = false;
5403       cmd->state.stencil_back_write = false;
5404       return;
5405    }
5406 
5407    bool stencil_front_writemask = ds->stencil.front.write_mask;
5408    bool stencil_back_writemask = ds->stencil.back.write_mask;
5409 
5410    VkStencilOp front_fail_op = (VkStencilOp)ds->stencil.front.op.fail;
5411    VkStencilOp front_pass_op = (VkStencilOp)ds->stencil.front.op.pass;
5412    VkStencilOp front_depth_fail_op = (VkStencilOp)ds->stencil.front.op.depth_fail;
5413    VkStencilOp back_fail_op = (VkStencilOp)ds->stencil.back.op.fail;
5414    VkStencilOp back_pass_op = (VkStencilOp)ds->stencil.back.op.pass;
5415    VkStencilOp back_depth_fail_op = (VkStencilOp)ds->stencil.back.op.depth_fail;
5416 
5417    bool stencil_front_op_writes =
5418       front_pass_op != VK_STENCIL_OP_KEEP ||
5419       front_fail_op != VK_STENCIL_OP_KEEP ||
5420       front_depth_fail_op != VK_STENCIL_OP_KEEP;
5421 
5422    bool stencil_back_op_writes =
5423       back_pass_op != VK_STENCIL_OP_KEEP ||
5424       back_fail_op != VK_STENCIL_OP_KEEP ||
5425       back_depth_fail_op != VK_STENCIL_OP_KEEP;
5426 
5427    cmd->state.stencil_front_write =
5428       stencil_front_op_writes && stencil_front_writemask;
5429    cmd->state.stencil_back_write =
5430       stencil_back_op_writes && stencil_back_writemask;
5431 }
5432 
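/* Example of the simplification above (hypothetical state): stencil test
 * enabled with
 *
 *    front/back ops = KEEP / KEEP / KEEP
 *    write_mask     = 0xff
 *
 * yields stencil_front_write == stencil_back_write == false, because no op
 * ever modifies the stencil value; likewise any ops with write_mask == 0 are
 * treated as non-writing. This is what lets LRZ stay usable for the
 * ANGLE/zink-style no-op stencil states mentioned above.
 */
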
5433 static bool
5434 tu6_writes_depth(struct tu_cmd_buffer *cmd, bool depth_test_enable)
5435 {
5436    bool depth_write_enable =
5437       cmd->vk.dynamic_graphics_state.ds.depth.write_enable;
5438 
5439    VkCompareOp depth_compare_op = (VkCompareOp)
5440       cmd->vk.dynamic_graphics_state.ds.depth.compare_op;
5441 
5442    bool depth_compare_op_writes = depth_compare_op != VK_COMPARE_OP_NEVER;
5443 
5444    return depth_test_enable && depth_write_enable && depth_compare_op_writes;
5445 }
5446 
5447 static bool
5448 tu6_writes_stencil(struct tu_cmd_buffer *cmd)
5449 {
5450    return cmd->state.stencil_front_write || cmd->state.stencil_back_write;
5451 }
5452 
5453 static bool
5454 tu_fs_reads_dynamic_ds_input_attachment(struct tu_cmd_buffer *cmd,
5455                                         const struct tu_shader *fs)
5456 {
5457    uint8_t depth_att = cmd->vk.dynamic_graphics_state.ial.depth_att;
5458    if (depth_att == MESA_VK_ATTACHMENT_UNUSED)
5459       return false;
5460    unsigned depth_idx =
5461       (depth_att == MESA_VK_ATTACHMENT_NO_INDEX) ? 0 : depth_att + 1;
5462    return fs->fs.dynamic_input_attachments_used & (1u << depth_idx);
5463 }
5464 
5465 static void
5466 tu6_build_depth_plane_z_mode(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
5467 {
5468    enum a6xx_ztest_mode zmode = A6XX_EARLY_Z;
5469    bool depth_test_enable = cmd->vk.dynamic_graphics_state.ds.depth.test_enable;
5470    bool depth_write = tu6_writes_depth(cmd, depth_test_enable);
5471    bool stencil_write = tu6_writes_stencil(cmd);
5472    const struct tu_shader *fs = cmd->state.shaders[MESA_SHADER_FRAGMENT];
5473    const struct tu_render_pass *pass = cmd->state.pass;
5474    const struct tu_subpass *subpass = cmd->state.subpass;
5475 
5476    if ((fs->variant->has_kill ||
5477         (cmd->state.pipeline_feedback_loops & VK_IMAGE_ASPECT_DEPTH_BIT) ||
5478         (cmd->vk.dynamic_graphics_state.feedback_loops &
5479          VK_IMAGE_ASPECT_DEPTH_BIT) ||
5480         tu_fs_reads_dynamic_ds_input_attachment(cmd, fs)) &&
5481        (depth_write || stencil_write)) {
5482       zmode = (cmd->state.lrz.valid && cmd->state.lrz.enabled)
5483                  ? A6XX_EARLY_LRZ_LATE_Z
5484                  : A6XX_LATE_Z;
5485    }
5486 
5487    bool force_late_z =
5488       (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED &&
5489        pass->attachments[subpass->depth_stencil_attachment.attachment].format
5490        == VK_FORMAT_S8_UINT) ||
5491       fs->fs.lrz.force_late_z ||
5492       /* alpha-to-coverage can behave like a discard. */
5493       cmd->vk.dynamic_graphics_state.ms.alpha_to_coverage_enable;
5494    if ((force_late_z && !fs->variant->fs.early_fragment_tests) ||
5495        !depth_test_enable)
5496       zmode = A6XX_LATE_Z;
5497 
5498    /* User defined early tests take precedence above all else */
5499    if (fs->variant->fs.early_fragment_tests)
5500       zmode = A6XX_EARLY_Z;
5501 
5502    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_DEPTH_PLANE_CNTL, 1);
5503    tu_cs_emit(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL_Z_MODE(zmode));
5504 
5505    tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_PLANE_CNTL, 1);
5506    tu_cs_emit(cs, A6XX_RB_DEPTH_PLANE_CNTL_Z_MODE(zmode));
5507 }
5508 
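/* Informal summary of the zmode selection above (a sketch, not exhaustive;
 * later assignments in the function override earlier ones):
 *
 *    fs early_fragment_tests                       -> A6XX_EARLY_Z (always wins)
 *    !depth_test_enable or force_late_z            -> A6XX_LATE_Z
 *    discard/feedback/ds-input read + z/s writes   -> A6XX_EARLY_LRZ_LATE_Z when
 *                                                     LRZ is valid and enabled,
 *                                                     otherwise A6XX_LATE_Z
 *    everything else                               -> A6XX_EARLY_Z
 */
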
5509 static uint32_t
5510 fs_params_offset(struct tu_cmd_buffer *cmd)
5511 {
5512    const struct tu_program_descriptor_linkage *link =
5513       &cmd->state.program.link[MESA_SHADER_FRAGMENT];
5514    const struct ir3_const_state *const_state = &link->const_state;
5515 
5516    if (const_state->num_driver_params <= IR3_DP_FS_DYNAMIC)
5517       return 0;
5518 
5519    uint32_t param_offset =
5520       const_state->allocs.consts[IR3_CONST_ALLOC_DRIVER_PARAMS].offset_vec4;
5521 
5522    if (param_offset + IR3_DP_FS_DYNAMIC / 4 >= link->constlen)
5523       return 0;
5524 
5525    return param_offset + IR3_DP_FS_DYNAMIC / 4;
5526 }
5527 
5528 static uint32_t
5529 fs_params_size(struct tu_cmd_buffer *cmd)
5530 {
5531    const struct tu_program_descriptor_linkage *link =
5532       &cmd->state.program.link[MESA_SHADER_FRAGMENT];
5533    const struct ir3_const_state *const_state = &link->const_state;
5534 
5535    return DIV_ROUND_UP(const_state->num_driver_params - IR3_DP_FS_DYNAMIC, 4);
5536 }
5537 
5538 struct apply_fs_params_state {
5539    unsigned num_consts;
5540 };
5541 
5542 static void
5543 fdm_apply_fs_params(struct tu_cmd_buffer *cmd,
5544                     struct tu_cs *cs,
5545                     void *data,
5546                     VkRect2D bin,
5547                     unsigned views,
5548                     VkExtent2D *frag_areas)
5549 {
5550    const struct apply_fs_params_state *state =
5551       (const struct apply_fs_params_state *)data;
5552    unsigned num_consts = state->num_consts;
5553 
5554    for (unsigned i = 0; i < num_consts; i++) {
5555       assert(i < views);
5556       VkExtent2D area = frag_areas[i];
5557       VkOffset2D offset = tu_fdm_per_bin_offset(area, bin);
5558 
5559       tu_cs_emit(cs, area.width);
5560       tu_cs_emit(cs, area.height);
5561       tu_cs_emit(cs, fui(offset.x));
5562       tu_cs_emit(cs, fui(offset.y));
5563    }
5564 }
5565 
5566 static void
5567 tu_emit_fdm_params(struct tu_cmd_buffer *cmd,
5568                    struct tu_cs *cs, struct tu_shader *fs,
5569                    unsigned num_units)
5570 {
5571    STATIC_ASSERT(IR3_DP_FS(frag_invocation_count) == IR3_DP_FS_DYNAMIC);
5572    tu_cs_emit(cs, fs->fs.per_samp ?
5573               cmd->vk.dynamic_graphics_state.ms.rasterization_samples : 1);
5574    tu_cs_emit(cs, 0);
5575    tu_cs_emit(cs, 0);
5576    tu_cs_emit(cs, 0);
5577 
5578    STATIC_ASSERT(IR3_DP_FS(frag_size) == IR3_DP_FS_DYNAMIC + 4);
5579    STATIC_ASSERT(IR3_DP_FS(frag_offset) == IR3_DP_FS_DYNAMIC + 6);
5580    if (num_units > 1) {
5581       if (fs->fs.has_fdm) {
5582          struct apply_fs_params_state state = {
5583             .num_consts = num_units - 1,
5584          };
5585          tu_create_fdm_bin_patchpoint(cmd, cs, 4 * (num_units - 1),
5586                                       fdm_apply_fs_params, state);
5587       } else {
5588          for (unsigned i = 1; i < num_units; i++) {
5589             tu_cs_emit(cs, 1);
5590             tu_cs_emit(cs, 1);
5591             tu_cs_emit(cs, fui(0.0f));
5592             tu_cs_emit(cs, fui(0.0f));
5593          }
5594       }
5595    }
5596 }
5597 
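/* Layout sketch for num_units == 2 with a single view (illustrative): the
 * emitted stream is
 *
 *    dword 0..3 : { frag_invocation_count, 0, 0, 0 }
 *    dword 4..7 : { frag_size.width, frag_size.height,
 *                   fui(frag_offset.x), fui(frag_offset.y) }
 *
 * matching the IR3_DP_FS_DYNAMIC / frag_size / frag_offset asserts above.
 * With FDM enabled the second vec4 is written by a per-bin patchpoint instead
 * of the 1, 1, 0.0, 0.0 defaults.
 */
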
5598 static void
5599 tu6_emit_fs_params(struct tu_cmd_buffer *cmd)
5600 {
5601    uint32_t offset = fs_params_offset(cmd);
5602 
5603    if (offset == 0) {
5604       cmd->state.fs_params = (struct tu_draw_state) {};
5605       return;
5606    }
5607 
5608    struct tu_shader *fs = cmd->state.shaders[MESA_SHADER_FRAGMENT];
5609 
5610    unsigned num_units = fs_params_size(cmd);
5611 
5612    if (fs->fs.has_fdm)
5613       tu_cs_set_writeable(&cmd->sub_cs, true);
5614 
5615    struct tu_cs cs;
5616    VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 4 + 4 * num_units, &cs);
5617    if (result != VK_SUCCESS) {
5618       tu_cs_set_writeable(&cmd->sub_cs, false);
5619       vk_command_buffer_set_error(&cmd->vk, result);
5620       return;
5621    }
5622 
5623    tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_units);
5624    tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
5625          CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
5626          CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
5627          CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
5628          CP_LOAD_STATE6_0_NUM_UNIT(num_units));
5629    tu_cs_emit(&cs, 0);
5630    tu_cs_emit(&cs, 0);
5631 
5632    tu_emit_fdm_params(cmd, &cs, fs, num_units);
5633 
5634    cmd->state.fs_params = tu_cs_end_draw_state(&cmd->sub_cs, &cs);
5635 
5636    if (fs->fs.has_fdm)
5637       tu_cs_set_writeable(&cmd->sub_cs, false);
5638 }
5639 
5640 static void
5641 tu7_emit_fs_params(struct tu_cmd_buffer *cmd)
5642 {
5643    struct tu_shader *fs = cmd->state.shaders[MESA_SHADER_FRAGMENT];
5644 
5645    int ubo_offset = fs->const_state.fdm_ubo.idx;
5646    if (ubo_offset < 0) {
5647       cmd->state.fs_params = (struct tu_draw_state) {};
5648       return;
5649    }
5650 
5651    unsigned num_units = DIV_ROUND_UP(fs->const_state.fdm_ubo.size, 4);
5652 
5653    if (fs->fs.has_fdm)
5654       tu_cs_set_writeable(&cmd->sub_cs, true);
5655 
5656    struct tu_cs cs;
5657    VkResult result =
5658       tu_cs_begin_sub_stream_aligned(&cmd->sub_cs, num_units, 4, &cs);
5659    if (result != VK_SUCCESS) {
5660       tu_cs_set_writeable(&cmd->sub_cs, false);
5661       vk_command_buffer_set_error(&cmd->vk, result);
5662       return;
5663    }
5664 
5665    tu_emit_fdm_params(cmd, &cs, fs, num_units);
5666 
5667    struct tu_draw_state fdm_ubo = tu_cs_end_draw_state(&cmd->sub_cs, &cs);
5668 
5669    if (fs->fs.has_fdm)
5670       tu_cs_set_writeable(&cmd->sub_cs, false);
5671 
5672    result = tu_cs_begin_sub_stream(&cmd->sub_cs, 6, &cs);
5673    if (result != VK_SUCCESS) {
5674       vk_command_buffer_set_error(&cmd->vk, result);
5675       return;
5676    }
5677 
5678    tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_FRAG, 5);
5679    tu_cs_emit(&cs,
5680               CP_LOAD_STATE6_0_DST_OFF(ubo_offset) |
5681               CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
5682               CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
5683               CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
5684               CP_LOAD_STATE6_0_NUM_UNIT(1));
5685    tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
5686    tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
5687    tu_cs_emit_qw(&cs,
5688                  fdm_ubo.iova |
5689                  (uint64_t)A6XX_UBO_1_SIZE(num_units) << 32);
5690 
5691    cmd->state.fs_params = tu_cs_end_draw_state(&cmd->sub_cs, &cs);
5692 }
5693 
5694 static void
5695 tu_emit_fs_params(struct tu_cmd_buffer *cmd)
5696 {
5697    if (cmd->device->compiler->load_shader_consts_via_preamble)
5698       tu7_emit_fs_params(cmd);
5699    else
5700       tu6_emit_fs_params(cmd);
5701 }
5702 
5703 static void
5704 tu_flush_dynamic_input_attachments(struct tu_cmd_buffer *cmd)
5705 {
5706    struct tu_shader *fs = cmd->state.shaders[MESA_SHADER_FRAGMENT];
5707 
5708    if (!fs->fs.dynamic_input_attachments_used)
5709       return;
5710 
5711    /* Input attachments may read data from a load op, so we have to invalidate
5712     * UCHE and force pending blits to complete unless we know it's already
5713     * been invalidated. This is the same as tu_subpass::feedback_invalidate
5714     * but for dynamic renderpasses.
5715     */
5716    if (!cmd->state.blit_cache_cleaned) {
5717       cmd->state.renderpass_cache.flush_bits |=
5718          TU_CMD_FLAG_CACHE_INVALIDATE | TU_CMD_FLAG_BLIT_CACHE_CLEAN |
5719          TU_CMD_FLAG_WAIT_FOR_IDLE;
5720    }
5721 }
5722 
5723 template <chip CHIP>
5724 static VkResult
5725 tu6_draw_common(struct tu_cmd_buffer *cmd,
5726                 struct tu_cs *cs,
5727                 bool indexed,
5728                 /* note: draw_count is 0 for indirect */
5729                 uint32_t draw_count)
5730 {
5731    const struct tu_program_state *program = &cmd->state.program;
5732    struct tu_render_pass_state *rp = &cmd->state.rp;
5733 
5734    /* Emit state first, because it's needed for bandwidth calculations */
5735    uint32_t dynamic_draw_state_dirty = 0;
5736    if (!BITSET_IS_EMPTY(cmd->vk.dynamic_graphics_state.dirty) ||
5737        (cmd->state.dirty & ~TU_CMD_DIRTY_COMPUTE_DESC_SETS)) {
5738       dynamic_draw_state_dirty = tu_emit_draw_state<CHIP>(cmd);
5739    }
5740 
5741    /* The primitive restart value still applies to non-indexed draws, so we have to
5742     * disable prim restart for such draws since we may read a stale restart index.
5743     */
5744    if (cmd->state.last_draw_indexed != indexed) {
5745       cmd->state.last_draw_indexed = indexed;
5746       BITSET_SET(cmd->vk.dynamic_graphics_state.dirty,
5747                  MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE);
5748    }
5749 
5750    /* Fill draw stats for autotuner */
5751    rp->drawcall_count++;
5752 
5753    rp->drawcall_bandwidth_per_sample_sum +=
5754       cmd->state.bandwidth.color_bandwidth_per_sample;
5755 
5756    /* add depth memory bandwidth cost */
5757    const uint32_t depth_bandwidth = cmd->state.bandwidth.depth_cpp_per_sample;
5758    if (cmd->vk.dynamic_graphics_state.ds.depth.write_enable)
5759       rp->drawcall_bandwidth_per_sample_sum += depth_bandwidth;
5760    if (cmd->vk.dynamic_graphics_state.ds.depth.test_enable)
5761       rp->drawcall_bandwidth_per_sample_sum += depth_bandwidth;
5762 
5763    /* add stencil memory bandwidth cost */
5764    const uint32_t stencil_bandwidth =
5765       cmd->state.bandwidth.stencil_cpp_per_sample;
5766    if (cmd->vk.dynamic_graphics_state.ds.stencil.test_enable)
5767       rp->drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2;
5768 
5769    if (cmd->state.dirty & TU_CMD_DIRTY_FS)
5770       tu_flush_dynamic_input_attachments(cmd);
5771 
5772    tu_emit_cache_flush_renderpass<CHIP>(cmd);
5773 
5774    if (BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5775                    MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE) ||
5776        BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5777                    MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX) ||
5778        (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
5779       bool primitive_restart_enabled =
5780          cmd->vk.dynamic_graphics_state.ia.primitive_restart_enable;
5781 
5782       bool primitive_restart = primitive_restart_enabled && indexed;
5783       bool provoking_vtx_last =
5784          cmd->vk.dynamic_graphics_state.rs.provoking_vertex ==
5785          VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT;
5786 
5787       uint32_t primitive_cntl_0 =
5788          A6XX_PC_PRIMITIVE_CNTL_0(.primitive_restart = primitive_restart,
5789                                   .provoking_vtx_last = provoking_vtx_last).value;
5790       tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0(.dword = primitive_cntl_0));
5791       if (CHIP == A7XX) {
5792          tu_cs_emit_regs(cs, A7XX_VPC_PRIMITIVE_CNTL_0(.dword = primitive_cntl_0));
5793       }
5794    }
5795 
5796    struct tu_tess_params *tess_params = &cmd->state.tess_params;
5797    if ((cmd->state.dirty & TU_CMD_DIRTY_TESS_PARAMS) ||
5798        BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5799                    MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN) ||
5800        (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
5801       bool tess_upper_left_domain_origin =
5802          (VkTessellationDomainOrigin)cmd->vk.dynamic_graphics_state.ts.domain_origin ==
5803          VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT;
5804       tu_cs_emit_regs(cs, A6XX_PC_TESS_CNTL(
5805             .spacing = tess_params->spacing,
5806             .output = tess_upper_left_domain_origin ?
5807                tess_params->output_upper_left :
5808                tess_params->output_lower_left));
5809    }
5810 
5811    /* Early exit if there is nothing to emit, saves CPU cycles */
5812    uint32_t dirty = cmd->state.dirty;
5813    if (!dynamic_draw_state_dirty && !(dirty & ~TU_CMD_DIRTY_COMPUTE_DESC_SETS))
5814       return VK_SUCCESS;
5815 
5816    bool dirty_lrz =
5817       (dirty & TU_CMD_DIRTY_LRZ) ||
5818       BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5819                   MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) ||
5820       BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5821                   MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE) ||
5822       BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5823                   MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE) ||
5824       BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5825                   MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP) ||
5826       BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5827                   MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) ||
5828       BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5829                   MESA_VK_DYNAMIC_DS_STENCIL_OP) ||
5830       BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5831                   MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
5832       BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5833                   MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
5834       BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5835                   MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE);
5836 
5837    if (dirty_lrz) {
5838       struct tu_cs cs;
5839       uint32_t size = 8 +
5840                       (cmd->device->physical_device->info->a6xx.lrz_track_quirk ? 2 : 0) +
5841                       (CHIP >= A7XX ? 2 : 0); // A7XX has extra packets from LRZ_CNTL2.
5842 
5843       cmd->state.lrz_and_depth_plane_state =
5844          tu_cs_draw_state(&cmd->sub_cs, &cs, size);
5845       tu6_update_simplified_stencil_state(cmd);
5846       tu6_emit_lrz<CHIP>(cmd, &cs);
5847       tu6_build_depth_plane_z_mode(cmd, &cs);
5848    }
5849 
5850    if (BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5851                    MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE)) {
5852       if (cmd->vk.dynamic_graphics_state.feedback_loops &&
5853           !cmd->state.rp.disable_gmem) {
5854          perf_debug(
5855             cmd->device,
5856             "Disabling gmem due to VK_EXT_attachment_feedback_loop_layout");
5857          cmd->state.rp.disable_gmem = true;
5858       }
5859    }
5860 
5861    if (BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5862                    MESA_VK_DYNAMIC_VI_BINDINGS_VALID)) {
5863       cmd->state.vertex_buffers.size =
5864          util_last_bit(cmd->vk.dynamic_graphics_state.vi_bindings_valid) * 4;
5865       dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
5866    }
5867 
5868    if (dirty & TU_CMD_DIRTY_SHADER_CONSTS)
5869       cmd->state.shader_const = tu_emit_consts(cmd, false);
5870 
5871    if (dirty & TU_CMD_DIRTY_DESC_SETS)
5872       tu6_emit_descriptor_sets<CHIP>(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
5873 
5874    if (BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5875                    MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ||
5876        BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5877                    MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) ||
5878        BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5879                    MESA_VK_DYNAMIC_RS_LINE_MODE) ||
5880        (cmd->state.dirty & TU_CMD_DIRTY_TES) ||
5881        (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
5882       tu6_update_msaa_disable(cmd);
5883    }
5884 
5885    if (BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5886                    MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ||
5887        (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
5888       tu6_update_msaa(cmd);
5889    }
5890 
5891    bool dirty_fs_params = false;
5892    if (BITSET_TEST(cmd->vk.dynamic_graphics_state.dirty,
5893                    MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ||
5894        (cmd->state.dirty & (TU_CMD_DIRTY_PROGRAM | TU_CMD_DIRTY_FDM))) {
5895       tu_emit_fs_params(cmd);
5896       dirty_fs_params = true;
5897    }
5898 
5899    /* For the first draw in a renderpass, re-emit all the draw states.
5900     *
5901     * If a draw-state disabling path (the CmdClearAttachments 3D fallback) was
5902     * used, then draw states must also be re-emitted. Note however that this only
5903     * happens in the sysmem path, so it could be skipped for the gmem path (TODO).
5904     *
5905     * The two input attachment states are excluded because a secondary command
5906     * buffer doesn't have a state IB to restore them, and not re-emitting them
5907     * is OK since CmdClearAttachments won't disable/overwrite them.
5908     */
5909    if (dirty & TU_CMD_DIRTY_DRAW_STATE) {
5910       tu_pipeline_update_rp_state(&cmd->state);
5911 
5912       tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
5913 
5914       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, program->config_state);
5915       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS, program->vs_state);
5916       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_BINNING, program->vs_binning_state);
5917       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_HS, program->hs_state);
5918       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DS, program->ds_state);
5919       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_GS, program->gs_state);
5920       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_GS_BINNING, program->gs_binning_state);
5921       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS, program->fs_state);
5922       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VPC, program->vpc_state);
5923       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, cmd->state.prim_order_gmem);
5924       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_CONST, cmd->state.shader_const);
5925       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets);
5926       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, cmd->state.load_state);
5927       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers);
5928       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
5929       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS_PARAMS, cmd->state.fs_params);
5930       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE, cmd->state.lrz_and_depth_plane_state);
5931 
5932       for (uint32_t i = 0; i < ARRAY_SIZE(cmd->state.dynamic_state); i++) {
5933          tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i,
5934                                cmd->state.dynamic_state[i]);
5935       }
5936    } else {
5937       /* emit draw states that were just updated */
5938       uint32_t draw_state_count =
5939          util_bitcount(dynamic_draw_state_dirty) +
5940          ((dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 1 : 0) +
5941          ((dirty & TU_CMD_DIRTY_DESC_SETS) ? 1 : 0) +
5942          ((dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) +
5943          ((dirty & TU_CMD_DIRTY_VS_PARAMS) ? 1 : 0) +
5944          (dirty_fs_params ? 1 : 0) +
5945          (dirty_lrz ? 1 : 0);
5946 
5947       if (draw_state_count > 0)
5948          tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_count);
5949 
5950       if (dirty & TU_CMD_DIRTY_SHADER_CONSTS)
5951          tu_cs_emit_draw_state(cs, TU_DRAW_STATE_CONST, cmd->state.shader_const);
5952       if (dirty & TU_CMD_DIRTY_DESC_SETS) {
5953          /* tu6_emit_descriptor_sets emitted the cmd->state.desc_sets draw state. */
5954          tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, cmd->state.load_state);
5955       }
5956       if (dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
5957          tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers);
5958       u_foreach_bit (i, dynamic_draw_state_dirty) {
5959          tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i,
5960                                cmd->state.dynamic_state[i]);
5961       }
5962       if (dirty & TU_CMD_DIRTY_VS_PARAMS)
5963          tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
5964       if (dirty_fs_params)
5965          tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS_PARAMS, cmd->state.fs_params);
5966       if (dirty_lrz) {
5967          tu_cs_emit_draw_state(cs, TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE, cmd->state.lrz_and_depth_plane_state);
5968       }
5969    }
5970 
5971    tu_cs_sanity_check(cs);
5972 
5973    /* There are too many graphics dirty bits to list here, so just list the
5974     * bits to preserve instead. The only things not emitted here are
5975     * compute-related state.
5976     */
5977    cmd->state.dirty &= TU_CMD_DIRTY_COMPUTE_DESC_SETS;
5978    BITSET_ZERO(cmd->vk.dynamic_graphics_state.dirty);
5979    return VK_SUCCESS;
5980 }
5981 
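/* Example of the incremental path above (hypothetical dirty bits): if only
 * TU_CMD_DIRTY_DESC_SETS and TU_CMD_DIRTY_VS_PARAMS are set and two dynamic
 * draw states changed, then
 *
 *    draw_state_count = 2 + 0 + 1 + 0 + 1 + 0 + 0 = 4
 *
 * and a single CP_SET_DRAW_STATE packet with 3 * 4 payload dwords re-emits
 * just those groups, instead of the full 3 * (TU_DRAW_STATE_COUNT - 2) set
 * used on the TU_CMD_DIRTY_DRAW_STATE path.
 */
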
5982 static uint32_t
5983 tu_draw_initiator(struct tu_cmd_buffer *cmd, enum pc_di_src_sel src_sel)
5984 {
5985    enum pc_di_primtype primtype =
5986       tu6_primtype((VkPrimitiveTopology)cmd->vk.dynamic_graphics_state.ia.primitive_topology);
5987 
5988    if (primtype == DI_PT_PATCHES0)
5989       primtype = (enum pc_di_primtype) (primtype +
5990                                         cmd->vk.dynamic_graphics_state.ts.patch_control_points);
5991 
5992    uint32_t initiator =
5993       CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
5994       CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(src_sel) |
5995       CP_DRAW_INDX_OFFSET_0_INDEX_SIZE((enum a4xx_index_size) cmd->state.index_size) |
5996       CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY);
5997 
5998    if (cmd->state.shaders[MESA_SHADER_GEOMETRY]->variant)
5999       initiator |= CP_DRAW_INDX_OFFSET_0_GS_ENABLE;
6000 
6001    const struct tu_shader *tes = cmd->state.shaders[MESA_SHADER_TESS_EVAL];
6002    if (tes->variant) {
6003       switch (tes->variant->key.tessellation) {
6004       case IR3_TESS_TRIANGLES:
6005          initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_TRIANGLES) |
6006                       CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
6007          break;
6008       case IR3_TESS_ISOLINES:
6009          initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_ISOLINES) |
6010                       CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
6011          break;
6012       case IR3_TESS_QUADS:
6013          initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS) |
6014                       CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
6015          break;
6016       }
6017    }
6018    return initiator;
6019 }
6020 
6021 
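/* Example initiator (hypothetical draw, roughly): an indexed triangle-list
 * draw with 16-bit indices and no GS/tess would build
 *
 *    CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_TRILIST) |
 *    CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_DMA) |
 *    CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(INDEX4_SIZE_16_BIT) |
 *    CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY)
 *
 * with GS_ENABLE / TESS_ENABLE OR'd in only when those stages have variants.
 */
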
6022 static uint32_t
6023 vs_params_offset(struct tu_cmd_buffer *cmd)
6024 {
6025    const struct tu_program_descriptor_linkage *link =
6026       &cmd->state.program.link[MESA_SHADER_VERTEX];
6027    const struct ir3_const_state *const_state = &link->const_state;
6028 
6029    uint32_t param_offset =
6030       const_state->allocs.consts[IR3_CONST_ALLOC_DRIVER_PARAMS].offset_vec4;
6031 
6032    if (!ir3_const_can_upload(&const_state->allocs,
6033                              IR3_CONST_ALLOC_DRIVER_PARAMS, link->constlen))
6034       return 0;
6035 
6036    /* this layout is required by CP_DRAW_INDIRECT_MULTI */
6037    STATIC_ASSERT(IR3_DP_VS(draw_id) == 0);
6038    STATIC_ASSERT(IR3_DP_VS(vtxid_base) == 1);
6039    STATIC_ASSERT(IR3_DP_VS(instid_base) == 2);
6040 
6041    /* 0 means disabled for CP_DRAW_INDIRECT_MULTI */
6042    assert(param_offset != 0);
6043 
6044    return param_offset;
6045 }
6046 
6047 static void
6048 tu6_emit_empty_vs_params(struct tu_cmd_buffer *cmd)
6049 {
6050    if (cmd->state.vs_params.iova) {
6051       cmd->state.vs_params = (struct tu_draw_state) {};
6052       cmd->state.dirty |= TU_CMD_DIRTY_VS_PARAMS;
6053    }
6054 }
6055 
6056 static void
6057 tu6_emit_vs_params(struct tu_cmd_buffer *cmd,
6058                    uint32_t draw_id,
6059                    uint32_t vertex_offset,
6060                    uint32_t first_instance)
6061 {
6062    uint32_t offset = vs_params_offset(cmd);
6063 
6064    /* Besides re-emitting the params when they change, we should also re-emit
6065     * them after constants are invalidated via HLSQ_INVALIDATE_CMD or after we
6066     * emit empty vs params.
6067     */
6068    if (!(cmd->state.dirty & (TU_CMD_DIRTY_DRAW_STATE | TU_CMD_DIRTY_VS_PARAMS |
6069                              TU_CMD_DIRTY_PROGRAM)) &&
6070        cmd->state.vs_params.iova &&
6071        (offset == 0 || draw_id == cmd->state.last_vs_params.draw_id) &&
6072        vertex_offset == cmd->state.last_vs_params.vertex_offset &&
6073        first_instance == cmd->state.last_vs_params.first_instance) {
6074       return;
6075    }
6076 
6077    uint64_t consts_iova = 0;
6078    if (offset) {
6079       struct tu_cs_memory consts;
6080       VkResult result = tu_cs_alloc(&cmd->sub_cs, 1, 4, &consts);
6081       if (result != VK_SUCCESS) {
6082          vk_command_buffer_set_error(&cmd->vk, result);
6083          return;
6084       }
6085       consts.map[0] = draw_id;
6086       consts.map[1] = vertex_offset;
6087       consts.map[2] = first_instance;
6088       consts.map[3] = 0;
6089 
6090       consts_iova = consts.iova;
6091    }
6092 
6093    struct tu_cs cs;
6094    VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 3 + (offset ? 4 : 0), &cs);
6095    if (result != VK_SUCCESS) {
6096       vk_command_buffer_set_error(&cmd->vk, result);
6097       return;
6098    }
6099 
6100    tu_cs_emit_regs(&cs,
6101                    A6XX_VFD_INDEX_OFFSET(vertex_offset),
6102                    A6XX_VFD_INSTANCE_START_OFFSET(first_instance));
6103 
6104    /* This is implemented as an INDIRECT load even on a750+ because with UBO
6105     * lowering it would be tricky to get a const offset to use in multidraw, and
6106     * we would also need to ensure the offset is not 0.
6107     * TODO/A7XX: Rework vs params to use UBO lowering.
6108     */
6109    if (offset) {
6110       tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3);
6111       tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
6112             CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
6113             CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
6114             CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
6115             CP_LOAD_STATE6_0_NUM_UNIT(1));
6116       tu_cs_emit_qw(&cs, consts_iova);
6117    }
6118 
6119    cmd->state.last_vs_params.vertex_offset = vertex_offset;
6120    cmd->state.last_vs_params.first_instance = first_instance;
6121    cmd->state.last_vs_params.draw_id = draw_id;
6122 
6123    struct tu_cs_entry entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
6124    cmd->state.vs_params = (struct tu_draw_state) {entry.bo->iova + entry.offset, entry.size / 4};
6125 
6126    cmd->state.dirty |= TU_CMD_DIRTY_VS_PARAMS;
6127 }
6128 
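/* Sketch of the vs params buffer allocated above (illustrative): a single
 * vec4 laid out as
 *
 *    consts.map[0] = draw_id        (IR3_DP_VS(draw_id))
 *    consts.map[1] = vertex_offset  (IR3_DP_VS(vtxid_base))
 *    consts.map[2] = first_instance (IR3_DP_VS(instid_base))
 *    consts.map[3] = 0
 *
 * loaded via an indirect CP_LOAD_STATE6_GEOM so that CP_DRAW_INDIRECT_MULTI
 * can later write the same layout through its DST_OFF field.
 */
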
6129 template <chip CHIP>
6130 VKAPI_ATTR void VKAPI_CALL
6131 tu_CmdDraw(VkCommandBuffer commandBuffer,
6132            uint32_t vertexCount,
6133            uint32_t instanceCount,
6134            uint32_t firstVertex,
6135            uint32_t firstInstance)
6136 {
6137    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
6138    struct tu_cs *cs = &cmd->draw_cs;
6139 
6140    tu6_emit_vs_params(cmd, 0, firstVertex, firstInstance);
6141 
6142    tu6_draw_common<CHIP>(cmd, cs, false, vertexCount);
6143 
6144    tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
6145    tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
6146    tu_cs_emit(cs, instanceCount);
6147    tu_cs_emit(cs, vertexCount);
6148 }
6149 TU_GENX(tu_CmdDraw);
6150 
6151 template <chip CHIP>
6152 VKAPI_ATTR void VKAPI_CALL
6153 tu_CmdDrawMultiEXT(VkCommandBuffer commandBuffer,
6154                    uint32_t drawCount,
6155                    const VkMultiDrawInfoEXT *pVertexInfo,
6156                    uint32_t instanceCount,
6157                    uint32_t firstInstance,
6158                    uint32_t stride)
6159 {
6160    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
6161    struct tu_cs *cs = &cmd->draw_cs;
6162 
6163    if (!drawCount)
6164       return;
6165 
6166    bool has_tess = cmd->state.shaders[MESA_SHADER_TESS_CTRL]->variant;
6167 
6168    uint32_t max_vertex_count = 0;
6169    if (has_tess) {
6170       uint32_t i = 0;
6171       vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
6172          max_vertex_count = MAX2(max_vertex_count, draw->vertexCount);
6173       }
6174    }
6175 
6176    uint32_t i = 0;
6177    vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
6178       tu6_emit_vs_params(cmd, i, draw->firstVertex, firstInstance);
6179 
6180       if (i == 0)
6181          tu6_draw_common<CHIP>(cmd, cs, false, max_vertex_count);
6182 
6183       if (cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS) {
6184          tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
6185          tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
6186          cmd->state.dirty &= ~TU_CMD_DIRTY_VS_PARAMS;
6187       }
6188 
6189       tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
6190       tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
6191       tu_cs_emit(cs, instanceCount);
6192       tu_cs_emit(cs, draw->vertexCount);
6193    }
6194 }
6195 TU_GENX(tu_CmdDrawMultiEXT);
6196 
6197 template <chip CHIP>
6198 VKAPI_ATTR void VKAPI_CALL
6199 tu_CmdDrawIndexed(VkCommandBuffer commandBuffer,
6200                   uint32_t indexCount,
6201                   uint32_t instanceCount,
6202                   uint32_t firstIndex,
6203                   int32_t vertexOffset,
6204                   uint32_t firstInstance)
6205 {
6206    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
6207    struct tu_cs *cs = &cmd->draw_cs;
6208 
6209    tu6_emit_vs_params(cmd, 0, vertexOffset, firstInstance);
6210 
6211    tu6_draw_common<CHIP>(cmd, cs, true, indexCount);
6212 
6213    tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7);
6214    tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
6215    tu_cs_emit(cs, instanceCount);
6216    tu_cs_emit(cs, indexCount);
6217    tu_cs_emit(cs, firstIndex);
6218    tu_cs_emit_qw(cs, cmd->state.index_va);
6219    tu_cs_emit(cs, cmd->state.max_index_count);
6220 }
6221 TU_GENX(tu_CmdDrawIndexed);
6222 
6223 template <chip CHIP>
6224 VKAPI_ATTR void VKAPI_CALL
6225 tu_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer,
6226                           uint32_t drawCount,
6227                           const VkMultiDrawIndexedInfoEXT *pIndexInfo,
6228                           uint32_t instanceCount,
6229                           uint32_t firstInstance,
6230                           uint32_t stride,
6231                           const int32_t *pVertexOffset)
6232 {
6233    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
6234    struct tu_cs *cs = &cmd->draw_cs;
6235 
6236    if (!drawCount)
6237       return;
6238 
6239    bool has_tess = cmd->state.shaders[MESA_SHADER_TESS_CTRL]->variant;
6240 
6241    uint32_t max_index_count = 0;
6242    if (has_tess) {
6243       uint32_t i = 0;
6244       vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
6245          max_index_count = MAX2(max_index_count, draw->indexCount);
6246       }
6247    }
6248 
6249    uint32_t i = 0;
6250    vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
6251       int32_t vertexOffset = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
6252       tu6_emit_vs_params(cmd, i, vertexOffset, firstInstance);
6253 
6254       if (i == 0)
6255          tu6_draw_common<CHIP>(cmd, cs, true, max_index_count);
6256 
6257       if (cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS) {
6258          tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
6259          tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
6260          cmd->state.dirty &= ~TU_CMD_DIRTY_VS_PARAMS;
6261       }
6262 
6263       tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7);
6264       tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
6265       tu_cs_emit(cs, instanceCount);
6266       tu_cs_emit(cs, draw->indexCount);
6267       tu_cs_emit(cs, draw->firstIndex);
6268       tu_cs_emit_qw(cs, cmd->state.index_va);
6269       tu_cs_emit(cs, cmd->state.max_index_count);
6270    }
6271 }
6272 TU_GENX(tu_CmdDrawMultiIndexedEXT);
6273 
6274 /* Various firmware bugs/inconsistencies mean that some indirect draw opcodes
6275  * do not wait for WFIs to complete before executing. Add a WAIT_FOR_ME if one
6276  * is pending before these opcodes. This may result in a few extra WAIT_FOR_MEs
6277  * with these opcodes, but the alternative would add unnecessary WAIT_FOR_MEs
6278  * before draw opcodes that don't need them.
6279  */
6280 static void
6281 draw_wfm(struct tu_cmd_buffer *cmd)
6282 {
6283    cmd->state.renderpass_cache.flush_bits |=
6284       cmd->state.renderpass_cache.pending_flush_bits & TU_CMD_FLAG_WAIT_FOR_ME;
6285    cmd->state.renderpass_cache.pending_flush_bits &= ~TU_CMD_FLAG_WAIT_FOR_ME;
6286 }
6287 
6288 template <chip CHIP>
6289 VKAPI_ATTR void VKAPI_CALL
6290 tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
6291                    VkBuffer _buffer,
6292                    VkDeviceSize offset,
6293                    uint32_t drawCount,
6294                    uint32_t stride)
6295 {
6296    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
6297    VK_FROM_HANDLE(tu_buffer, buf, _buffer);
6298    struct tu_cs *cs = &cmd->draw_cs;
6299 
6300    tu6_emit_empty_vs_params(cmd);
6301 
6302    if (cmd->device->physical_device->info->a6xx.indirect_draw_wfm_quirk)
6303       draw_wfm(cmd);
6304 
6305    tu6_draw_common<CHIP>(cmd, cs, false, 0);
6306 
6307    tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 6);
6308    tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
6309    tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_NORMAL) |
6310                   A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
6311    tu_cs_emit(cs, drawCount);
6312    tu_cs_emit_qw(cs, buf->iova + offset);
6313    tu_cs_emit(cs, stride);
6314 }
6315 TU_GENX(tu_CmdDrawIndirect);
6316 
6317 template <chip CHIP>
6318 VKAPI_ATTR void VKAPI_CALL
6319 tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
6320                           VkBuffer _buffer,
6321                           VkDeviceSize offset,
6322                           uint32_t drawCount,
6323                           uint32_t stride)
6324 {
6325    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
6326    VK_FROM_HANDLE(tu_buffer, buf, _buffer);
6327    struct tu_cs *cs = &cmd->draw_cs;
6328 
6329    tu6_emit_empty_vs_params(cmd);
6330 
6331    if (cmd->device->physical_device->info->a6xx.indirect_draw_wfm_quirk)
6332       draw_wfm(cmd);
6333 
6334    tu6_draw_common<CHIP>(cmd, cs, true, 0);
6335 
6336    tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 9);
6337    tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
6338    tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDEXED) |
6339                   A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
6340    tu_cs_emit(cs, drawCount);
6341    tu_cs_emit_qw(cs, cmd->state.index_va);
6342    tu_cs_emit(cs, cmd->state.max_index_count);
6343    tu_cs_emit_qw(cs, buf->iova + offset);
6344    tu_cs_emit(cs, stride);
6345 }
6346 TU_GENX(tu_CmdDrawIndexedIndirect);
6347 
6348 template <chip CHIP>
6349 VKAPI_ATTR void VKAPI_CALL
6350 tu_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,
6351                         VkBuffer _buffer,
6352                         VkDeviceSize offset,
6353                         VkBuffer countBuffer,
6354                         VkDeviceSize countBufferOffset,
6355                         uint32_t drawCount,
6356                         uint32_t stride)
6357 {
6358    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
6359    VK_FROM_HANDLE(tu_buffer, buf, _buffer);
6360    VK_FROM_HANDLE(tu_buffer, count_buf, countBuffer);
6361    struct tu_cs *cs = &cmd->draw_cs;
6362 
6363    tu6_emit_empty_vs_params(cmd);
6364 
6365    /* It turns out that the firmware we have for a650 only partially fixed the
6366     * problem with CP_DRAW_INDIRECT_MULTI not waiting for WFI's to complete
6367     * before reading indirect parameters. It waits for WFI's before reading
6368     * the draw parameters, but after reading the indirect count :(.
6369     */
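   /* So, unlike the plain indirect draws above, we cannot rely on the
    * indirect_draw_wfm_quirk check here and always promote a pending
    * WAIT_FOR_ME.
    */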
6370    draw_wfm(cmd);
6371 
6372    tu6_draw_common<CHIP>(cmd, cs, false, 0);
6373 
6374    tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 8);
6375    tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
6376    tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT) |
6377                   A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
6378    tu_cs_emit(cs, drawCount);
6379    tu_cs_emit_qw(cs, buf->iova + offset);
6380    tu_cs_emit_qw(cs, count_buf->iova + countBufferOffset);
6381    tu_cs_emit(cs, stride);
6382 }
6383 TU_GENX(tu_CmdDrawIndirectCount);
6384 
6385 template <chip CHIP>
6386 VKAPI_ATTR void VKAPI_CALL
6387 tu_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,
6388                                VkBuffer _buffer,
6389                                VkDeviceSize offset,
6390                                VkBuffer countBuffer,
6391                                VkDeviceSize countBufferOffset,
6392                                uint32_t drawCount,
6393                                uint32_t stride)
6394 {
6395    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
6396    VK_FROM_HANDLE(tu_buffer, buf, _buffer);
6397    VK_FROM_HANDLE(tu_buffer, count_buf, countBuffer);
6398    struct tu_cs *cs = &cmd->draw_cs;
6399 
6400    tu6_emit_empty_vs_params(cmd);
6401 
6402    draw_wfm(cmd);
6403 
6404    tu6_draw_common<CHIP>(cmd, cs, true, 0);
6405 
6406    tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 11);
6407    tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
6408    tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT_INDEXED) |
6409                   A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
6410    tu_cs_emit(cs, drawCount);
6411    tu_cs_emit_qw(cs, cmd->state.index_va);
6412    tu_cs_emit(cs, cmd->state.max_index_count);
6413    tu_cs_emit_qw(cs, buf->iova + offset);
6414    tu_cs_emit_qw(cs, count_buf->iova + countBufferOffset);
6415    tu_cs_emit(cs, stride);
6416 }
6417 TU_GENX(tu_CmdDrawIndexedIndirectCount);
6418 
6419 template <chip CHIP>
6420 VKAPI_ATTR void VKAPI_CALL
6421 tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
6422                                uint32_t instanceCount,
6423                                uint32_t firstInstance,
6424                                VkBuffer _counterBuffer,
6425                                VkDeviceSize counterBufferOffset,
6426                                uint32_t counterOffset,
6427                                uint32_t vertexStride)
6428 {
6429    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
6430    VK_FROM_HANDLE(tu_buffer, buf, _counterBuffer);
6431    struct tu_cs *cs = &cmd->draw_cs;
6432 
6433    /* All known firmware versions do not wait for WFI's with CP_DRAW_AUTO.
6434     * Plus, for the common case where the counter buffer is written by
6435     * vkCmdEndTransformFeedback, we need to wait for the CP_WAIT_MEM_WRITES to
6436     * complete which means we need a WAIT_FOR_ME anyway.
6437     */
6438    draw_wfm(cmd);
6439 
6440    tu6_emit_vs_params(cmd, 0, 0, firstInstance);
6441 
6442    tu6_draw_common<CHIP>(cmd, cs, false, 0);
6443 
6444    tu_cs_emit_pkt7(cs, CP_DRAW_AUTO, 6);
6445    if (CHIP == A6XX) {
6446       tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_XFB));
6447    } else {
6448       tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
6449       /* On a7xx the counter value and offset are shifted right by 2, so
6450        * the vertexStride should also be in units of dwords.
6451        */
6452       vertexStride = vertexStride >> 2;
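      /* For example, a 12-byte stride becomes 3 dwords; with an xfb byte
       * count of 96 that still yields 96 / 12 = 24 / 3 = 8 vertices.
       */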
6453    }
6454    tu_cs_emit(cs, instanceCount);
6455    tu_cs_emit_qw(cs, buf->iova + counterBufferOffset);
6456    tu_cs_emit(cs, counterOffset);
6457    tu_cs_emit(cs, vertexStride);
6458 }
6459 TU_GENX(tu_CmdDrawIndirectByteCountEXT);
6460 
6461 struct tu_dispatch_info
6462 {
6463    /**
6464     * The layout of the grid (in block units) to be dispatched.
6465     */
6466    uint32_t blocks[3];
6467 
6468    /**
6469     * A starting offset for the grid. If unaligned is set, the offset
6470     * must still be aligned.
6471     */
6472    uint32_t offsets[3];
6473    /**
6474     * Whether it's an unaligned compute dispatch.
6475     */
6476    bool unaligned;
6477 
6478    /**
6479     * Indirect compute parameters resource.
6480     */
6481    struct tu_buffer *indirect;
6482    uint64_t indirect_offset;
6483 };
6484 
6485 static inline struct ir3_driver_params_cs
6486 build_driver_params_cs(const struct ir3_shader_variant *variant,
6487                        const struct tu_dispatch_info *info)
6488 {
6489    unsigned subgroup_size = variant->info.subgroup_size;
6490    unsigned subgroup_shift = util_logbase2(subgroup_size);
6491 
6492    return (struct ir3_driver_params_cs) {
6493       .num_work_groups_x = info->blocks[0],
6494       .num_work_groups_y = info->blocks[1],
6495       .num_work_groups_z = info->blocks[2],
6496       .work_dim = 0,
6497       .base_group_x = info->offsets[0],
6498       .base_group_y = info->offsets[1],
6499       .base_group_z = info->offsets[2],
6500       .subgroup_size = subgroup_size,
6501       .local_group_size_x = 0,
6502       .local_group_size_y = 0,
6503       .local_group_size_z = 0,
6504       .subgroup_id_shift = subgroup_shift,
6505    };
6506 }
6507 
6508 template <chip CHIP>
6509 static void
6510 tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
6511                               struct tu_cs *cs,
6512                               const struct tu_dispatch_info *info)
6513 {
6514    gl_shader_stage type = MESA_SHADER_COMPUTE;
6515    const struct tu_shader *shader = cmd->state.shaders[MESA_SHADER_COMPUTE];
6516    const struct ir3_shader_variant *variant = shader->variant;
6517    const struct ir3_const_state *const_state = variant->const_state;
6518    unsigned subgroup_size = variant->info.subgroup_size;
6519    unsigned subgroup_shift = util_logbase2(subgroup_size);
6520 
6521    if (cmd->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
6522       uint32_t num_consts = const_state->driver_params_ubo.size;
6523       if (num_consts == 0)
6524          return;
6525 
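      /* We can point the driver-params UBO directly at the app's indirect
       * buffer only if the offset meets the 16-byte alignment requirement and
       * we don't also need additional driver params (base group / subgroup
       * size / shift) beyond the three workgroup counts.
       */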
6526       bool direct_indirect_load =
6527          !(info->indirect_offset & 0xf) &&
6528          !(info->indirect && num_consts > IR3_DP_CS(base_group_x));
6529 
6530       uint64_t iova = 0;
6531 
6532       if (!info->indirect) {
6533          struct ir3_driver_params_cs driver_params =
6534             build_driver_params_cs(variant, info);
6535 
6536          assert(num_consts <= dword_sizeof(driver_params));
6537 
6538          struct tu_cs_memory consts;
6539          uint32_t consts_vec4 = DIV_ROUND_UP(num_consts, 4);
6540          VkResult result = tu_cs_alloc(&cmd->sub_cs, consts_vec4, 4, &consts);
6541          if (result != VK_SUCCESS) {
6542             vk_command_buffer_set_error(&cmd->vk, result);
6543             return;
6544          }
6545          memcpy(consts.map, &driver_params, num_consts * sizeof(uint32_t));
6546          iova = consts.iova;
6547       } else if (direct_indirect_load) {
6548          iova = info->indirect->iova + info->indirect_offset;
6549       } else {
6550          /* Vulkan guarantees only 4 byte alignment for indirect_offset.
6551           * However, CP_LOAD_STATE.EXT_SRC_ADDR needs 16 byte alignment.
6552           */
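         /* So stage the three workgroup-count dwords through the suitably
          * aligned cs_indirect_xyz scratch in the global BO and load the UBO
          * from that copy instead.
          */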
6553 
6554          uint64_t indirect_iova = info->indirect->iova + info->indirect_offset;
6555 
6556          /* Wait for any previous uses to finish. */
6557          tu_cs_emit_wfi(cs);
6558 
6559          for (uint32_t i = 0; i < 3; i++) {
6560             tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
6561             tu_cs_emit(cs, 0);
6562             tu_cs_emit_qw(cs, global_iova_arr(cmd, cs_indirect_xyz, i));
6563             tu_cs_emit_qw(cs, indirect_iova + i * sizeof(uint32_t));
6564          }
6565 
6566          /* Fill out IR3_DP_CS_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for
6567           * indirect dispatch.
6568           */
6569          if (info->indirect && num_consts > IR3_DP_CS(base_group_x)) {
6570             uint32_t indirect_driver_params[8] = {
6571                0, 0, 0, subgroup_size,
6572                0, 0, 0, subgroup_shift,
6573             };
6574             bool emit_local = num_consts > IR3_DP_CS(local_group_size_x);
6575             uint32_t emit_size = emit_local ? 8 : 4;
6576 
6577             tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + emit_size);
6578             tu_cs_emit_qw(cs, global_iova_arr(cmd, cs_indirect_xyz, 0) + 4 * sizeof(uint32_t));
6579             for (uint32_t i = 0; i < emit_size; i++) {
6580                tu_cs_emit(cs, indirect_driver_params[i]);
6581             }
6582          }
6583 
6584          tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
6585          tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
6586          tu_cs_emit_wfi(cs);
6587 
6588          iova = global_iova(cmd, cs_indirect_xyz[0]);
6589       }
6590 
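      /* Bind the driver params as a single UBO descriptor at
       * driver_params_ubo.idx: the iova occupies the low bits and the size
       * in vec4s is packed into the top dword via A6XX_UBO_1_SIZE.
       */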
6591       tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 5);
6592       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(const_state->driver_params_ubo.idx) |
6593                CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
6594                CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
6595                CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
6596                CP_LOAD_STATE6_0_NUM_UNIT(1));
6597       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
6598       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
6599       int size_vec4s = DIV_ROUND_UP(num_consts, 4);
6600       tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
6601 
6602    } else {
6603       uint32_t offset =
6604          const_state->allocs.consts[IR3_CONST_ALLOC_DRIVER_PARAMS].offset_vec4;
6605       if (!ir3_const_can_upload(&const_state->allocs,
6606                                 IR3_CONST_ALLOC_DRIVER_PARAMS,
6607                                 variant->constlen))
6608          return;
6609 
6610       uint32_t num_consts = MIN2(const_state->num_driver_params,
6611                                  (variant->constlen - offset) * 4);
6612 
6613       if (!info->indirect) {
6614          struct ir3_driver_params_cs driver_params =
6615             build_driver_params_cs(variant, info);
6616 
6617          assert(num_consts <= dword_sizeof(driver_params));
6618 
6619          /* push constants */
6620          tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts);
6621          tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
6622                   CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
6623                   CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
6624                   CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
6625                   CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4));
6626          tu_cs_emit(cs, 0);
6627          tu_cs_emit(cs, 0);
6628          tu_cs_emit_array(cs, (uint32_t *)&driver_params, num_consts);
6629       } else if (!(info->indirect_offset & 0xf)) {
6630          tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
6631          tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
6632                      CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
6633                      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
6634                      CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
6635                      CP_LOAD_STATE6_0_NUM_UNIT(1));
6636          tu_cs_emit_qw(cs, info->indirect->iova + info->indirect_offset);
6637       } else {
6638          /* Vulkan guarantees only 4 byte alignment for indirect_offset.
6639           * However, CP_LOAD_STATE.EXT_SRC_ADDR needs 16 byte alignment.
6640           */
6641 
6642          uint64_t indirect_iova = info->indirect->iova + info->indirect_offset;
6643 
6644          /* Wait for any previous uses to finish. */
6645          tu_cs_emit_wfi(cs);
6646 
6647          for (uint32_t i = 0; i < 3; i++) {
6648             tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
6649             tu_cs_emit(cs, 0);
6650             tu_cs_emit_qw(cs, global_iova_arr(cmd, cs_indirect_xyz, i));
6651             tu_cs_emit_qw(cs, indirect_iova + i * 4);
6652          }
6653 
6654          tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
6655          tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
6656          tu_cs_emit_wfi(cs);
6657 
6658          tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
6659          tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
6660                      CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
6661                      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
6662                      CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
6663                      CP_LOAD_STATE6_0_NUM_UNIT(1));
6664          tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[0]));
6665       }
6666 
6667       /* Fill out IR3_DP_CS_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for
6668        * indirect dispatch.
6669        */
6670       if (info->indirect && num_consts > IR3_DP_CS(base_group_x)) {
6671          bool emit_local = num_consts > IR3_DP_CS(local_group_size_x);
6672          tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 7 + (emit_local ? 4 : 0));
6673          tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset + (IR3_DP_CS(base_group_x) / 4)) |
6674                   CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
6675                   CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
6676                   CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
6677                   CP_LOAD_STATE6_0_NUM_UNIT((num_consts - IR3_DP_CS(base_group_x)) / 4));
6678          tu_cs_emit_qw(cs, 0);
6679          tu_cs_emit(cs, 0); /* BASE_GROUP_X */
6680          tu_cs_emit(cs, 0); /* BASE_GROUP_Y */
6681          tu_cs_emit(cs, 0); /* BASE_GROUP_Z */
6682          tu_cs_emit(cs, subgroup_size);
6683          if (emit_local) {
6684             assert(num_consts == align(IR3_DP_CS(subgroup_id_shift), 4));
6685             tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_X */
6686             tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Y */
6687             tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Z */
6688             tu_cs_emit(cs, subgroup_shift);
6689          }
6690       }
6691    }
6692 }
6693 
6694 template <chip CHIP>
6695 static void
6696 tu_dispatch(struct tu_cmd_buffer *cmd,
6697             const struct tu_dispatch_info *info)
6698 {
6699    if (!info->indirect &&
6700        (info->blocks[0] == 0 || info->blocks[1] == 0 || info->blocks[2] == 0))
6701       return;
6702 
6703    struct tu_cs *cs = &cmd->cs;
6704    struct tu_shader *shader = cmd->state.shaders[MESA_SHADER_COMPUTE];
6705 
6706    bool emit_instrlen_workaround =
6707       shader->variant->instrlen >
6708       cmd->device->physical_device->info->a6xx.instr_cache_size;
6709 
6710    /* We don't use draw states for dispatches, so the bound pipeline
6711     * could be overwritten by reg stomping in a renderpass or blit.
6712     */
6713    if (cmd->device->dbg_renderpass_stomp_cs) {
6714       tu_cs_emit_state_ib(&cmd->cs, shader->state);
6715    }
6716 
6717    /* There appears to be a HW bug where, in some rare circumstances, the HW
6718     * accidentally uses the FS instrlen instead of the CS instrlen, which
6719     * affects all known gens. Based on various experiments it appears that the
6720     * issue is that when prefetching a branch destination and there is a cache
6721     * miss, when fetching from memory the HW bounds-checks the fetch against
6722     * SP_CS_INSTRLEN, except that when one of the two register contexts is
6723     * active it accidentally fetches SP_FS_INSTRLEN from the other (inactive)
6724     * context. To work around it we set the FS instrlen here and do a dummy
6725     * event to roll the context (because it fetches SP_FS_INSTRLEN from the
6726     * "wrong" context). Because the bug seems to involve cache misses, we
6727     * don't emit this if the entire CS program fits in cache, which will
6728     * hopefully be the majority of cases.
6729     *
6730     * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/5892
6731     */
6732    if (emit_instrlen_workaround) {
6733       tu_cs_emit_regs(cs, A6XX_SP_FS_INSTRLEN(shader->variant->instrlen));
6734       tu_emit_event_write<CHIP>(cmd, cs, FD_LABEL);
6735    }
6736 
6737    /* TODO: We could probably flush less if we add a compute_flush_bits
6738     * bitfield.
6739     */
6740    tu_emit_cache_flush<CHIP>(cmd);
6741 
6742    /* note: no reason to have this in a separate IB */
6743    tu_cs_emit_state_ib(cs, tu_emit_consts(cmd, true));
6744 
6745    tu_emit_compute_driver_params<CHIP>(cmd, cs, info);
6746 
6747    if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESC_SETS) {
6748       tu6_emit_descriptor_sets<CHIP>(cmd, VK_PIPELINE_BIND_POINT_COMPUTE);
6749       tu_cs_emit_state_ib(cs, cmd->state.compute_load_state);
6750    }
6751 
6752    cmd->state.dirty &= ~TU_CMD_DIRTY_COMPUTE_DESC_SETS;
6753 
6754    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
6755    tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
6756 
6757    const uint16_t *local_size = shader->variant->local_size;
6758    const uint32_t *num_groups = info->blocks;
6759    tu_cs_emit_regs(cs,
6760                    HLSQ_CS_NDRANGE_0(CHIP, .kerneldim = 3,
6761                                            .localsizex = local_size[0] - 1,
6762                                            .localsizey = local_size[1] - 1,
6763                                            .localsizez = local_size[2] - 1),
6764                    HLSQ_CS_NDRANGE_1(CHIP, .globalsize_x = local_size[0] * num_groups[0]),
6765                    HLSQ_CS_NDRANGE_2(CHIP, .globaloff_x = 0),
6766                    HLSQ_CS_NDRANGE_3(CHIP, .globalsize_y = local_size[1] * num_groups[1]),
6767                    HLSQ_CS_NDRANGE_4(CHIP, .globaloff_y = 0),
6768                    HLSQ_CS_NDRANGE_5(CHIP, .globalsize_z = local_size[2] * num_groups[2]),
6769                    HLSQ_CS_NDRANGE_6(CHIP, .globaloff_z = 0));
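   /* Note that globalsize is in invocations, i.e. local size times group
    * count per axis: an 8x8x1 local size with 4x2x1 workgroups programs a
    * 32x16x1 global size.
    */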
6770 
6771    tu_cs_emit_regs(cs,
6772                    HLSQ_CS_KERNEL_GROUP_X(CHIP, 1),
6773                    HLSQ_CS_KERNEL_GROUP_Y(CHIP, 1),
6774                    HLSQ_CS_KERNEL_GROUP_Z(CHIP, 1));
6775 
6776    if (info->indirect) {
6777       uint64_t iova = info->indirect->iova + info->indirect_offset;
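      /* The buffer at iova holds a VkDispatchIndirectCommand (three u32
       * workgroup counts), which CP_EXEC_CS_INDIRECT reads on the GPU.
       */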
6778 
6779       trace_start_compute_indirect(&cmd->trace, cs);
6780 
6781       tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
6782       tu_cs_emit(cs, 0x00000000);
6783       tu_cs_emit_qw(cs, iova);
6784       tu_cs_emit(cs,
6785                  A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
6786                  A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
6787                  A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
6788 
6789       trace_end_compute_indirect(&cmd->trace, cs,
6790                                  (struct u_trace_address) {
6791                                     .bo = info->indirect->bo,
6792                                     .offset = info->indirect_offset,
6793                                  });
6794    } else {
6795       trace_start_compute(&cmd->trace, cs, info->indirect != NULL,
6796                           local_size[0], local_size[1], local_size[2],
6797                           info->blocks[0], info->blocks[1], info->blocks[2]);
6798 
6799       tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
6800       tu_cs_emit(cs, 0x00000000);
6801       tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
6802       tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
6803       tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
6804 
6805       trace_end_compute(&cmd->trace, cs);
6806    }
6807 
6808    /* For the workaround above, because it's using the "wrong" context for
6809     * SP_FS_INSTRLEN we should emit another dummy event write to avoid a
6810     * potential race between writing the register and the CP_EXEC_CS we just
6811     * did. We don't need to reset the register because it will be re-emitted
6812     * anyway when the next renderpass starts.
6813     */
6814    if (emit_instrlen_workaround) {
6815       tu_emit_event_write<CHIP>(cmd, cs, FD_LABEL);
6816    }
6817 }
6818 
6819 template <chip CHIP>
6820 VKAPI_ATTR void VKAPI_CALL
6821 tu_CmdDispatchBase(VkCommandBuffer commandBuffer,
6822                    uint32_t base_x,
6823                    uint32_t base_y,
6824                    uint32_t base_z,
6825                    uint32_t x,
6826                    uint32_t y,
6827                    uint32_t z)
6828 {
6829    VK_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
6830    struct tu_dispatch_info info = {};
6831 
6832    info.blocks[0] = x;
6833    info.blocks[1] = y;
6834    info.blocks[2] = z;
6835 
6836    info.offsets[0] = base_x;
6837    info.offsets[1] = base_y;
6838    info.offsets[2] = base_z;
6839    tu_dispatch<CHIP>(cmd_buffer, &info);
6840 }
6841 TU_GENX(tu_CmdDispatchBase);
6842 
6843 template <chip CHIP>
6844 VKAPI_ATTR void VKAPI_CALL
6845 tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
6846                        VkBuffer _buffer,
6847                        VkDeviceSize offset)
6848 {
6849    VK_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
6850    VK_FROM_HANDLE(tu_buffer, buffer, _buffer);
6851    struct tu_dispatch_info info = {};
6852 
6853    info.indirect = buffer;
6854    info.indirect_offset = offset;
6855 
6856    tu_dispatch<CHIP>(cmd_buffer, &info);
6857 }
6858 TU_GENX(tu_CmdDispatchIndirect);
6859 
6860 VKAPI_ATTR void VKAPI_CALL
6861 tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
6862                      const VkSubpassEndInfo *pSubpassEndInfo)
6863 {
6864    VK_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
6865 
6866    if (TU_DEBUG(DYNAMIC)) {
6867       vk_common_CmdEndRenderPass2(commandBuffer, pSubpassEndInfo);
6868       return;
6869    }
6870 
6871    tu_cs_end(&cmd_buffer->draw_cs);
6872    tu_cs_end(&cmd_buffer->draw_epilogue_cs);
6873    TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer);
6874 
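   /* Carry flushes that were deferred inside the renderpass over to the
    * outer cache state so they aren't lost when the renderpass state is
    * reset below.
    */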
6875    cmd_buffer->state.cache.pending_flush_bits |=
6876       cmd_buffer->state.renderpass_cache.pending_flush_bits;
6877    tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true);
6878 
6879    vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer->state.attachments);
6880 
6881    tu_reset_render_pass(cmd_buffer);
6882 }
6883 
6884 VKAPI_ATTR void VKAPI_CALL
6885 tu_CmdEndRendering(VkCommandBuffer commandBuffer)
6886 {
6887    VK_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
6888 
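   /* When suspending, stash the LRZ state so it can be picked up again by
    * the command buffer that later resumes this render pass.
    */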
6889    if (cmd_buffer->state.suspending)
6890       cmd_buffer->state.suspended_pass.lrz = cmd_buffer->state.lrz;
6891 
6892    if (!cmd_buffer->state.suspending) {
6893       tu_cs_end(&cmd_buffer->draw_cs);
6894       tu_cs_end(&cmd_buffer->draw_epilogue_cs);
6895 
6896       if (cmd_buffer->state.suspend_resume == SR_IN_PRE_CHAIN) {
6897          cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace);
6898          tu_save_pre_chain(cmd_buffer);
6899 
6900          /* Even though we don't call tu_cmd_render here, the renderpass is
6901           * finished and draw states should be disabled.
6902           */
6903          tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs);
6904       } else {
6905          TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer);
6906       }
6907 
6908       tu_reset_render_pass(cmd_buffer);
6909    }
6910 
6911    if (cmd_buffer->state.resuming && !cmd_buffer->state.suspending) {
6912       /* exiting suspend/resume chain */
6913       switch (cmd_buffer->state.suspend_resume) {
6914       case SR_IN_CHAIN:
6915          cmd_buffer->state.suspend_resume = SR_NONE;
6916          break;
6917       case SR_IN_PRE_CHAIN:
6918       case SR_IN_CHAIN_AFTER_PRE_CHAIN:
6919          cmd_buffer->state.suspend_resume = SR_AFTER_PRE_CHAIN;
6920          break;
6921       default:
6922          unreachable("suspending render pass not followed by resuming pass");
6923       }
6924    }
6925 }
6926 
6927 void
6928 tu_barrier(struct tu_cmd_buffer *cmd,
6929            uint32_t dep_count,
6930            const VkDependencyInfo *dep_infos)
6931 {
6932    VkPipelineStageFlags2 srcStage = 0;
6933    VkPipelineStageFlags2 dstStage = 0;
6934    BITMASK_ENUM(tu_cmd_access_mask) src_flags = 0;
6935    BITMASK_ENUM(tu_cmd_access_mask) dst_flags = 0;
6936 
6937    /* Inside a renderpass, we don't know yet whether we'll be using sysmem
6938     * so we have to use the sysmem flushes.
6939     */
6940    bool gmem = cmd->state.ccu_state == TU_CMD_CCU_GMEM &&
6941       !cmd->state.pass;
6942 
6943    for (uint32_t dep_idx = 0; dep_idx < dep_count; dep_idx++) {
6944       const VkDependencyInfo *dep_info = &dep_infos[dep_idx];
6945 
6946       for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
6947          VkPipelineStageFlags2 sanitized_src_stage =
6948             sanitize_src_stage(dep_info->pMemoryBarriers[i].srcStageMask);
6949          VkPipelineStageFlags2 sanitized_dst_stage =
6950             sanitize_dst_stage(dep_info->pMemoryBarriers[i].dstStageMask);
6951          src_flags |= vk2tu_access(dep_info->pMemoryBarriers[i].srcAccessMask,
6952                                    sanitized_src_stage, false, gmem);
6953          dst_flags |= vk2tu_access(dep_info->pMemoryBarriers[i].dstAccessMask,
6954                                    sanitized_dst_stage, false, gmem);
6955          srcStage |= sanitized_src_stage;
6956          dstStage |= sanitized_dst_stage;
6957       }
6958 
6959       for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
6960          VkPipelineStageFlags2 sanitized_src_stage =
6961             sanitize_src_stage(dep_info->pBufferMemoryBarriers[i].srcStageMask);
6962          VkPipelineStageFlags2 sanitized_dst_stage =
6963             sanitize_dst_stage(dep_info->pBufferMemoryBarriers[i].dstStageMask);
6964          src_flags |= vk2tu_access(dep_info->pBufferMemoryBarriers[i].srcAccessMask,
6965                                    sanitized_src_stage, false, gmem);
6966          dst_flags |= vk2tu_access(dep_info->pBufferMemoryBarriers[i].dstAccessMask,
6967                                    sanitized_dst_stage, false, gmem);
6968          srcStage |= sanitized_src_stage;
6969          dstStage |= sanitized_dst_stage;
6970       }
6971 
6972       for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
6973          VkImageLayout old_layout = dep_info->pImageMemoryBarriers[i].oldLayout;
6974          if (old_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
6975             /* The underlying memory for this image may have been used earlier
6976              * within the same queue submission for a different image, which
6977              * means that there may be old, stale cache entries which are in the
6978              * "wrong" location, which could cause problems later after writing
6979              * to the image. We don't want these entries being flushed later and
6980              * overwriting the actual image, so we need to flush the CCU.
6981              */
6982             VK_FROM_HANDLE(tu_image, image, dep_info->pImageMemoryBarriers[i].image);
6983 
6984             if (vk_format_is_depth_or_stencil(image->vk.format)) {
6985                src_flags |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
6986             } else {
6987                src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
6988             }
6989          }
6990          VkPipelineStageFlags2 sanitized_src_stage =
6991             sanitize_src_stage(dep_info->pImageMemoryBarriers[i].srcStageMask);
6992          VkPipelineStageFlags2 sanitized_dst_stage =
6993             sanitize_dst_stage(dep_info->pImageMemoryBarriers[i].dstStageMask);
6994          src_flags |= vk2tu_access(dep_info->pImageMemoryBarriers[i].srcAccessMask,
6995                                    sanitized_src_stage, true, gmem);
6996          dst_flags |= vk2tu_access(dep_info->pImageMemoryBarriers[i].dstAccessMask,
6997                                    sanitized_dst_stage, true, gmem);
6998          srcStage |= sanitized_src_stage;
6999          dstStage |= sanitized_dst_stage;
7000       }
7001    }
7002 
7003    if (cmd->state.pass) {
7004       const VkPipelineStageFlags framebuffer_space_stages =
7005          VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
7006          VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
7007          VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
7008          VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
7009 
7010       /* We cannot have non-by-region "fb-space to fb-space" barriers.
7011        *
7012        * From the Vulkan 1.2.185 spec, section 7.6.1 "Subpass Self-dependency":
7013        *
7014        *    If the source and destination stage masks both include
7015        *    framebuffer-space stages, then dependencyFlags must include
7016        *    VK_DEPENDENCY_BY_REGION_BIT.
7017        *    [...]
7018        *    Each of the synchronization scopes and access scopes of a
7019        *    vkCmdPipelineBarrier2 or vkCmdPipelineBarrier command inside
7020        *    a render pass instance must be a subset of the scopes of one of
7021        *    the self-dependencies for the current subpass.
7022        *
7023        *    If the self-dependency has VK_DEPENDENCY_BY_REGION_BIT or
7024        *    VK_DEPENDENCY_VIEW_LOCAL_BIT set, then so must the pipeline barrier.
7025        *
7026        * By-region barriers are ok for gmem. All other barriers would involve
7027        * vtx stages which are NOT ok for gmem rendering.
7028        * See dep_invalid_for_gmem().
7029        */
7030       if ((srcStage & ~framebuffer_space_stages) ||
7031           (dstStage & ~framebuffer_space_stages)) {
7032          cmd->state.rp.disable_gmem = true;
7033       }
7034    }
7035 
7036    struct tu_cache_state *cache =
7037       cmd->state.pass  ? &cmd->state.renderpass_cache : &cmd->state.cache;
7038 
7039    /* a750 has a HW bug where writing a UBWC compressed image with a compute
7040     * shader followed by reading it as a texture (or readonly image) requires
7041     * a CACHE_CLEAN event. Some notes about this bug:
7042     * - It only happens after a blit happens.
7043     * - It's fast-clear related, it happens when the image is fast cleared
7044     *   before the write and the value read is (incorrectly) the fast clear
7045     *   color.
7046     * - CACHE_FLUSH is supposed to be the same as CACHE_CLEAN +
7047     *   CACHE_INVALIDATE, but it doesn't work whereas CACHE_CLEAN +
7048     *   CACHE_INVALIDATE does.
7049     *
7050     * The srcAccess can be replaced by an OpMemoryBarrier(MakeAvailable), so
7051     * we can't use that to insert the flush. Instead we use the shader source
7052     * stage.
7053     */
7054    if (cmd->device->physical_device->info->a7xx.ubwc_coherency_quirk &&
7055        (srcStage &
7056         (VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
7057          VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
7058          VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
7059          VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
7060          VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
7061          VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
7062          VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
7063          VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))) {
7064       cache->flush_bits |= TU_CMD_FLAG_CACHE_CLEAN;
7065       cache->pending_flush_bits &= ~TU_CMD_FLAG_CACHE_CLEAN;
7066    }
7067 
7068    tu_flush_for_access(cache, src_flags, dst_flags);
7069 
7070    enum tu_stage src_stage = vk2tu_src_stage(srcStage);
7071    enum tu_stage dst_stage = vk2tu_dst_stage(dstStage);
7072    tu_flush_for_stage(cache, src_stage, dst_stage);
7073 }
7074 
7075 VKAPI_ATTR void VKAPI_CALL
7076 tu_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
7077                        const VkDependencyInfo *pDependencyInfo)
7078 {
7079    VK_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
7080 
7081    tu_barrier(cmd_buffer, 1, pDependencyInfo);
7082 }
7083 
7084 template <chip CHIP>
7085 void
7086 tu_write_event(struct tu_cmd_buffer *cmd, struct tu_event *event,
7087                VkPipelineStageFlags2 stageMask, unsigned value)
7088 {
7089    struct tu_cs *cs = &cmd->cs;
7090 
7091    /* vkCmdSetEvent/vkCmdResetEvent cannot be called inside a render pass */
7092    assert(!cmd->state.pass);
7093 
7094    tu_emit_cache_flush<CHIP>(cmd);
7095 
7096    /* Flags that only require a top-of-pipe event. DrawIndirect parameters are
7097     * read by the CP, so the draw indirect stage counts as top-of-pipe too.
7098     */
7099    VkPipelineStageFlags2 top_of_pipe_flags =
7100       VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
7101       VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT;
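   /* For example, a vkCmdSetEvent() whose stageMask only contains top-of-pipe
    * stages takes the cheap CP_MEM_WRITE path below; anything else falls
    * through to the RB_DONE_TS timestamp event.
    */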
7102 
7103    if (!(stageMask & ~top_of_pipe_flags)) {
7104       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
7105       tu_cs_emit_qw(cs, event->bo->iova); /* ADDR_LO/HI */
7106       tu_cs_emit(cs, value);
7107    } else {
7108       /* Use a RB_DONE_TS event to wait for everything to complete. */
7109       if (CHIP == A6XX) {
7110          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4);
7111          tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS));
7112       } else {
7113          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4);
7114          tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = RB_DONE_TS,
7115                                           .write_src = EV_WRITE_USER_32B,
7116                                           .write_dst = EV_DST_RAM,
7117                                           .write_enabled = true).value);
7118       }
7119 
7120       tu_cs_emit_qw(cs, event->bo->iova);
7121       tu_cs_emit(cs, value);
7122    }
7123 }
7124 TU_GENX(tu_write_event);
7125 
7126 template <chip CHIP>
7127 VKAPI_ATTR void VKAPI_CALL
7128 tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
7129                                    const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
7130 {
7131    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
7132 
7133    cmd->state.predication_active = true;
7134 
7135    struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
7136 
7137    tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
7138    tu_cs_emit(cs, 1);
7139 
7140    /* Wait for any writes to the predicate to land */
7141    if (cmd->state.pass)
7142       tu_emit_cache_flush_renderpass<CHIP>(cmd);
7143    else
7144       tu_emit_cache_flush<CHIP>(cmd);
7145 
7146    VK_FROM_HANDLE(tu_buffer, buf, pConditionalRenderingBegin->buffer);
7147    uint64_t iova = buf->iova + pConditionalRenderingBegin->offset;
7148 
7149    /* qcom doesn't support 32-bit reference values, only 64-bit, but Vulkan
7150     * mandates 32-bit comparisons. Our workaround is to copy the reference
7151     * value to the low 32-bits of a location where the high 32 bits are known
7152     * to be 0 and then compare that.
7153     */
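   /* i.e. copy just the low dword of the user's value into the global BO's
    * predicate slot (whose upper dword stays zero) and point the 64-bit
    * CP_DRAW_PRED_SET comparison at that.
    */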
7154    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
7155    tu_cs_emit(cs, 0);
7156    tu_cs_emit_qw(cs, global_iova(cmd, predicate));
7157    tu_cs_emit_qw(cs, iova);
7158 
7159    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
7160    tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
7161 
7162    bool inv = pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
7163    tu_cs_emit_pkt7(cs, CP_DRAW_PRED_SET, 3);
7164    tu_cs_emit(cs, CP_DRAW_PRED_SET_0_SRC(PRED_SRC_MEM) |
7165                   CP_DRAW_PRED_SET_0_TEST(inv ? EQ_0_PASS : NE_0_PASS));
7166    tu_cs_emit_qw(cs, global_iova(cmd, predicate));
7167 }
7168 TU_GENX(tu_CmdBeginConditionalRenderingEXT);
7169 
7170 VKAPI_ATTR void VKAPI_CALL
7171 tu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
7172 {
7173    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
7174 
7175    cmd->state.predication_active = false;
7176 
7177    struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
7178 
7179    tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
7180    tu_cs_emit(cs, 0);
7181 }
7182 
7183 template <chip CHIP>
7184 void
7185 tu_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer,
7186                             VkPipelineStageFlagBits2 pipelineStage,
7187                             VkBuffer dstBuffer,
7188                             VkDeviceSize dstOffset,
7189                             uint32_t marker)
7190 {
7191    /* Almost the same as tu_write_event, but also allowed in renderpass */
7192    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
7193    VK_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
7194 
7195    uint64_t va = buffer->iova + dstOffset;
7196 
7197    struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
7198    struct tu_cache_state *cache =
7199       cmd->state.pass ? &cmd->state.renderpass_cache : &cmd->state.cache;
7200 
7201    /* From the Vulkan 1.2.203 spec:
7202     *
7203     *    The access scope for buffer marker writes falls under
7204     *    the VK_ACCESS_TRANSFER_WRITE_BIT, and the pipeline stages for
7205     *    identifying the synchronization scope must include both pipelineStage
7206     *    and VK_PIPELINE_STAGE_TRANSFER_BIT.
7207     *
7208     * Transfer operations use the CCU; here, however, we write via the CP.
7209     * Flush the CCU in order to make the results of previous transfer
7210     * operations visible to the CP.
7211     */
7212    tu_flush_for_access(cache, TU_ACCESS_NONE, TU_ACCESS_SYSMEM_WRITE);
7213 
7214    /* Flags that only require a top-of-pipe event. DrawIndirect parameters are
7215     * read by the CP, so the draw indirect stage counts as top-of-pipe too.
7216     */
7217    VkPipelineStageFlags2 top_of_pipe_flags =
7218       VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
7219       VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT;
7220 
7221    bool is_top_of_pipe = !(pipelineStage & ~top_of_pipe_flags);
7222 
7223    /* We have to WFI only if we flushed CCU here and are using CP_MEM_WRITE.
7224     * Otherwise, either:
7225     * - We do a CP_EVENT_WRITE(RB_DONE_TS), which should wait for flushes; or
7226     * - There was a barrier to synchronize other writes with WriteBufferMarkerAMD,
7227     *   and it had to include our pipelineStage, which forces the WFI.
7228     */
7229    if (cache->flush_bits && is_top_of_pipe) {
7230       cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
7231    }
7232 
7233    if (cmd->state.pass) {
7234       tu_emit_cache_flush_renderpass<CHIP>(cmd);
7235    } else {
7236       tu_emit_cache_flush<CHIP>(cmd);
7237    }
7238 
7239    if (is_top_of_pipe) {
7240       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
7241       tu_cs_emit_qw(cs, va); /* ADDR_LO/HI */
7242       tu_cs_emit(cs, marker);
7243    } else {
7244       /* Use a RB_DONE_TS event to wait for everything to complete. */
7245       if (CHIP == A6XX) {
7246          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4);
7247          tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS));
7248       } else {
7249          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4);
7250          tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = RB_DONE_TS,
7251                                           .write_src = EV_WRITE_USER_32B,
7252                                           .write_dst = EV_DST_RAM,
7253                                           .write_enabled = true).value);
7254       }
7255       tu_cs_emit_qw(cs, va);
7256       tu_cs_emit(cs, marker);
7257    }
7258 
7259    /* Make sure the result of this write is visible to others. */
7260    tu_flush_for_access(cache, TU_ACCESS_CP_WRITE, TU_ACCESS_NONE);
7261 }
7262 TU_GENX(tu_CmdWriteBufferMarker2AMD);
7263