/*
 * Copyright 2024 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_build_pm4.h"

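/* Return the 4-byte scratch buffer that end-of-pipe fences write and the CP polls with
 * WAIT_REG_MEM. Secure (TMZ) command buffers get a separate, lazily allocated encrypted
 * buffer. Only used before GFX11 (GFX11+ synchronizes with PWS instead, hence the assert).
 */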
static struct si_resource *si_get_wait_mem_scratch_bo(struct si_context *ctx,
                                                       struct radeon_cmdbuf *cs, bool is_secure)
{
   struct si_screen *sscreen = ctx->screen;

   assert(ctx->gfx_level < GFX11);

   if (likely(!is_secure)) {
      return ctx->wait_mem_scratch;
   } else {
      assert(sscreen->info.has_tmz_support);
      if (!ctx->wait_mem_scratch_tmz) {
         ctx->wait_mem_scratch_tmz =
            si_aligned_buffer_create(&sscreen->b,
                                     PIPE_RESOURCE_FLAG_UNMAPPABLE |
                                     SI_RESOURCE_FLAG_DRIVER_INTERNAL |
                                     PIPE_RESOURCE_FLAG_ENCRYPTED,
                                     PIPE_USAGE_DEFAULT, 4,
                                     sscreen->info.tcc_cache_line_size);
         si_cp_write_data(ctx, ctx->wait_mem_scratch_tmz, 0, 4, V_370_MEM, V_370_ME,
                          &ctx->wait_mem_number);
      }

      return ctx->wait_mem_scratch_tmz;
   }
}

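/* Reduce ctx->barrier_flags to the flags that are actually needed: drop CB/DB flushes and
 * shader syncs that are known to be no-ops (nothing drawn, decompressed, or dispatched since
 * the last flush), update the flush/sync bookkeeping, and clear ctx->barrier_flags.
 * Returns the reduced set.
 */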
static unsigned get_reduced_barrier_flags(struct si_context *ctx)
{
   unsigned flags = ctx->barrier_flags;

   if (!flags)
      return 0;

   if (!ctx->has_graphics) {
      /* Only process compute flags. */
      flags &= SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_SMEM | SI_BARRIER_INV_VMEM |
               SI_BARRIER_INV_L2 | SI_BARRIER_WB_L2 | SI_BARRIER_INV_L2_METADATA |
               SI_BARRIER_SYNC_CS;
   }

   /* Don't flush CB and DB if there have been no draw calls. */
   if (ctx->num_draw_calls == ctx->last_cb_flush_num_draw_calls &&
       ctx->num_decompress_calls == ctx->last_cb_flush_num_decompress_calls)
      flags &= ~SI_BARRIER_SYNC_AND_INV_CB;

   if (ctx->num_draw_calls == ctx->last_db_flush_num_draw_calls &&
       ctx->num_decompress_calls == ctx->last_db_flush_num_decompress_calls)
      flags &= ~SI_BARRIER_SYNC_AND_INV_DB;

   if (!ctx->compute_is_busy)
      flags &= ~SI_BARRIER_SYNC_CS;

   /* Track the last CB/DB flush. */
   if (flags & SI_BARRIER_SYNC_AND_INV_CB) {
      ctx->num_cb_cache_flushes++;
      ctx->last_cb_flush_num_draw_calls = ctx->num_draw_calls;
      ctx->last_cb_flush_num_decompress_calls = ctx->num_decompress_calls;
   }
   if (flags & SI_BARRIER_SYNC_AND_INV_DB) {
      ctx->num_db_cache_flushes++;
      ctx->last_db_flush_num_draw_calls = ctx->num_draw_calls;
      ctx->last_db_flush_num_decompress_calls = ctx->num_decompress_calls;
   }

   /* Skip VS and PS synchronization if they are idle. */
   if (ctx->num_draw_calls == ctx->last_ps_sync_num_draw_calls)
      flags &= ~SI_BARRIER_SYNC_VS & ~SI_BARRIER_SYNC_PS;
   else if (ctx->num_draw_calls == ctx->last_vs_sync_num_draw_calls)
      flags &= ~SI_BARRIER_SYNC_VS;

   /* Track the last VS/PS flush. Flushing CB or DB also waits for PS (obviously). */
   if (flags & (SI_BARRIER_SYNC_AND_INV_CB | SI_BARRIER_SYNC_AND_INV_DB | SI_BARRIER_SYNC_PS)) {
      ctx->last_ps_sync_num_draw_calls = ctx->num_draw_calls;
      ctx->last_vs_sync_num_draw_calls = ctx->num_draw_calls;
   } else if (flags & SI_BARRIER_SYNC_VS) {
      ctx->last_vs_sync_num_draw_calls = ctx->num_draw_calls;
   }

   /* We use a TS event to flush CB/DB on GFX9+. */
   bool uses_ts_event = ctx->gfx_level >= GFX9 &&
                        flags & (SI_BARRIER_SYNC_AND_INV_CB | SI_BARRIER_SYNC_AND_INV_DB);

   /* TS events wait for everything. */
   if (uses_ts_event)
      flags &= ~SI_BARRIER_SYNC_VS & ~SI_BARRIER_SYNC_PS & ~SI_BARRIER_SYNC_CS;

   /* TS events wait for compute too. */
   if (flags & SI_BARRIER_SYNC_CS || uses_ts_event)
      ctx->compute_is_busy = false;

   if (flags & SI_BARRIER_SYNC_VS)
      ctx->num_vs_flushes++;
   if (flags & SI_BARRIER_SYNC_PS)
      ctx->num_ps_flushes++;
   if (flags & SI_BARRIER_SYNC_CS)
      ctx->num_cs_flushes++;

   if (flags & SI_BARRIER_INV_L2)
      ctx->num_L2_invalidates++;
   else if (flags & SI_BARRIER_WB_L2)
      ctx->num_L2_writebacks++;

   ctx->barrier_flags = 0;
   return flags;
}

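/* Emit the barrier events that are shared between the GFX6-9 and GFX10+ paths:
 * pipeline-statistics start/stop and VGT_FLUSH.
 */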
static void si_handle_common_barrier_events(struct si_context *ctx, struct radeon_cmdbuf *cs,
                                            unsigned flags)
{
   radeon_begin(cs);

   if (flags & SI_BARRIER_EVENT_PIPELINESTAT_START && ctx->pipeline_stats_enabled != 1) {
      radeon_event_write(V_028A90_PIPELINESTAT_START);
      ctx->pipeline_stats_enabled = 1;
   } else if (flags & SI_BARRIER_EVENT_PIPELINESTAT_STOP && ctx->pipeline_stats_enabled != 0) {
      radeon_event_write(V_028A90_PIPELINESTAT_STOP);
      ctx->pipeline_stats_enabled = 0;
   }

   if (flags & SI_BARRIER_EVENT_VGT_FLUSH)
      radeon_event_write(V_028A90_VGT_FLUSH);

   radeon_end();
}

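/* Emit a barrier on GFX10+. Cache operations are expressed through GCR_CNTL. CB/DB flushes
 * use an end-of-pipe TS event: on GFX11+ it is consumed with PWS-enabled ACQUIRE_MEM, while
 * on GFX10 the CP polls the wait_mem scratch buffer written by RELEASE_MEM.
 */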
static void gfx10_emit_barrier(struct si_context *ctx, struct radeon_cmdbuf *cs)
{
   assert(ctx->gfx_level >= GFX10);
   uint32_t gcr_cntl = 0;
   unsigned flags = get_reduced_barrier_flags(ctx);

   if (!flags)
      return;

   si_handle_common_barrier_events(ctx, cs, flags);

   /* We don't need these. */
   assert(!(flags & SI_BARRIER_EVENT_FLUSH_AND_INV_DB_META));
   assert(ctx->gfx_level < GFX12 || !(flags & SI_BARRIER_INV_L2_METADATA));

   if (flags & SI_BARRIER_INV_ICACHE)
      gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
   if (flags & SI_BARRIER_INV_SMEM)
      gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
   if (flags & SI_BARRIER_INV_VMEM)
      gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);

   /* The L2 cache ops are:
    * - INV: - invalidate lines that reflect memory (were loaded from memory)
    *        - don't touch lines that were overwritten (were stored by gfx clients)
    * - WB: - don't touch lines that reflect memory
    *       - write back lines that were overwritten
    * - WB | INV: - invalidate lines that reflect memory
    *             - write back lines that were overwritten
    *
    * GLM doesn't support WB alone. If WB is set, INV must be set too.
    */
   if (flags & SI_BARRIER_INV_L2)
      gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1); /* Writeback and invalidate everything in L2. */
   else if (flags & SI_BARRIER_WB_L2)
      gcr_cntl |= S_586_GL2_WB(1);

   /* Invalidate the metadata cache. */
   if (ctx->gfx_level < GFX12 &&
       flags & (SI_BARRIER_INV_L2 | SI_BARRIER_WB_L2 | SI_BARRIER_INV_L2_METADATA))
      gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1);

   /* Flush CB/DB. Note that this also idles all shaders, including compute shaders. */
   if (flags & (SI_BARRIER_SYNC_AND_INV_CB | SI_BARRIER_SYNC_AND_INV_DB)) {
      unsigned cb_db_event = 0;

      /* Determine the TS event that we'll use to flush CB/DB. */
      if ((flags & SI_BARRIER_SYNC_AND_INV_CB && flags & SI_BARRIER_SYNC_AND_INV_DB) ||
          /* Gfx11 can't use the DB_META event and must use a full flush to flush DB_META. */
          (ctx->gfx_level == GFX11 && flags & SI_BARRIER_SYNC_AND_INV_DB)) {
         cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
      } else if (flags & SI_BARRIER_SYNC_AND_INV_CB) {
         cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
      } else {
         assert(flags & SI_BARRIER_SYNC_AND_INV_DB);
         cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
      }

      /* We must flush CMASK/FMASK/DCC separately if the main event only flushes CB_DATA. */
      radeon_begin(cs);
      if (ctx->gfx_level < GFX12 && cb_db_event == V_028A90_FLUSH_AND_INV_CB_DATA_TS)
         radeon_event_write(V_028A90_FLUSH_AND_INV_CB_META);

      /* We must flush HTILE separately if the main event only flushes DB_DATA. */
      if (ctx->gfx_level < GFX12 && cb_db_event == V_028A90_FLUSH_AND_INV_DB_DATA_TS)
         radeon_event_write(V_028A90_FLUSH_AND_INV_DB_META);

      radeon_end();

      /* First flush CB/DB, then L1/L2. */
      gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD);

      if (ctx->gfx_level >= GFX11) {
         si_cp_release_mem_pws(ctx, cs, cb_db_event, gcr_cntl & C_586_GLI_INV);

         /* Wait for the event and invalidate remaining caches if needed. */
         si_cp_acquire_mem_pws(ctx, cs, cb_db_event,
                               flags & SI_BARRIER_PFP_SYNC_ME ? V_580_CP_PFP : V_580_CP_ME,
                               gcr_cntl & ~C_586_GLI_INV, /* keep only GLI_INV */
                               0, flags);

         gcr_cntl = 0; /* all done */
         /* ACQUIRE_MEM in PFP is implemented as ACQUIRE_MEM in ME + PFP_SYNC_ME. */
         flags &= ~SI_BARRIER_PFP_SYNC_ME;
      } else {
         /* GFX10 */
         struct si_resource *wait_mem_scratch =
           si_get_wait_mem_scratch_bo(ctx, cs, ctx->ws->cs_is_secure(cs));

         /* CB/DB flush and invalidate via RELEASE_MEM.
          * Combine this with other cache flushes when possible.
          */
         uint64_t va = wait_mem_scratch->gpu_address;
         ctx->wait_mem_number++;

         /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */
         unsigned glm_wb = G_586_GLM_WB(gcr_cntl);
         unsigned glm_inv = G_586_GLM_INV(gcr_cntl);
         unsigned glv_inv = G_586_GLV_INV(gcr_cntl);
         unsigned gl1_inv = G_586_GL1_INV(gcr_cntl);
         assert(G_586_GL2_US(gcr_cntl) == 0);
         assert(G_586_GL2_RANGE(gcr_cntl) == 0);
         assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
         unsigned gl2_inv = G_586_GL2_INV(gcr_cntl);
         unsigned gl2_wb = G_586_GL2_WB(gcr_cntl);
         unsigned gcr_seq = G_586_SEQ(gcr_cntl);

         gcr_cntl &= C_586_GLM_WB & C_586_GLM_INV & C_586_GLV_INV & C_586_GL1_INV & C_586_GL2_INV &
                     C_586_GL2_WB; /* keep SEQ */

         si_cp_release_mem(ctx, cs, cb_db_event,
                           S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) |
                           S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) |
                           S_490_SEQ(gcr_seq),
                           EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
                           EOP_DATA_SEL_VALUE_32BIT, wait_mem_scratch, va, ctx->wait_mem_number,
                           SI_NOT_QUERY);

         if (unlikely(ctx->sqtt_enabled)) {
            si_sqtt_describe_barrier_start(ctx, &ctx->gfx_cs);
         }

         si_cp_wait_mem(ctx, cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);

         if (unlikely(ctx->sqtt_enabled)) {
            si_sqtt_describe_barrier_end(ctx, &ctx->gfx_cs, flags);
         }
      }
   } else {
      /* The TS event above also makes sure that PS and CS are idle, so we have to do this only
       * if we are not flushing CB or DB.
       */
      radeon_begin(cs);
      if (flags & SI_BARRIER_SYNC_PS)
         radeon_event_write(V_028A90_PS_PARTIAL_FLUSH);
      else if (flags & SI_BARRIER_SYNC_VS)
         radeon_event_write(V_028A90_VS_PARTIAL_FLUSH);

      if (flags & SI_BARRIER_SYNC_CS)
         radeon_event_write(V_028A90_CS_PARTIAL_FLUSH);

      radeon_end();
   }

   /* Ignore fields that only modify the behavior of other fields. */
   if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
      si_cp_acquire_mem(ctx, cs, gcr_cntl,
                        flags & SI_BARRIER_PFP_SYNC_ME ? V_580_CP_PFP : V_580_CP_ME);
   } else if (flags & SI_BARRIER_PFP_SYNC_ME) {
      si_cp_pfp_sync_me(cs);
   }
}

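/* Emit a barrier on GFX6-9. Cache operations go through CP_COHER_CNTL/ACQUIRE_MEM
 * (SURFACE_SYNC). On GFX9, CB/DB flushes additionally need a TS event that the CP waits on
 * via the wait_mem scratch buffer, because ACQUIRE_MEM doesn't wait for idle there.
 */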
static void gfx6_emit_barrier(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   assert(sctx->gfx_level <= GFX9);
   unsigned flags = get_reduced_barrier_flags(sctx);

   if (!flags)
      return;

   si_handle_common_barrier_events(sctx, cs, flags);

   uint32_t cp_coher_cntl = 0;
   const uint32_t flush_cb_db = flags & (SI_BARRIER_SYNC_AND_INV_CB | SI_BARRIER_SYNC_AND_INV_DB);

   /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either
    * bit is set. An alternative way is to write SQC_CACHES, but that
    * doesn't seem to work reliably. Since the bug doesn't affect
    * correctness (it only does more work than necessary) and
    * the performance impact is likely negligible, there is no plan
    * to add a workaround for it.
    */

   if (flags & SI_BARRIER_INV_ICACHE)
      cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
   if (flags & SI_BARRIER_INV_SMEM)
      cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);

   if (sctx->gfx_level <= GFX8) {
      if (flags & SI_BARRIER_SYNC_AND_INV_CB) {
         cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) |
                          S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) |
                          S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) |
                          S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) |
                          S_0085F0_CB7_DEST_BASE_ENA(1);

         /* Necessary for DCC */
         if (sctx->gfx_level == GFX8)
            si_cp_release_mem(sctx, cs, V_028A90_FLUSH_AND_INV_CB_DATA_TS, 0, EOP_DST_SEL_MEM,
                              EOP_INT_SEL_NONE, EOP_DATA_SEL_DISCARD, NULL, 0, 0, SI_NOT_QUERY);
      }
      if (flags & SI_BARRIER_SYNC_AND_INV_DB)
         cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1);
   }

   radeon_begin(cs);

   /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */
   if (flags & SI_BARRIER_SYNC_AND_INV_CB)
      radeon_event_write(V_028A90_FLUSH_AND_INV_CB_META);

   /* Flush HTILE. SURFACE_SYNC will wait for idle. */
   if (flags & (SI_BARRIER_SYNC_AND_INV_DB | SI_BARRIER_EVENT_FLUSH_AND_INV_DB_META))
      radeon_event_write(V_028A90_FLUSH_AND_INV_DB_META);

   /* Wait for shader engines to go idle.
    * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait
    * for everything including CB/DB cache flushes.
    *
    * GFX6-8: SURFACE_SYNC with CB_ACTION_ENA doesn't do anything if there are no CB/DB bindings.
    * Reproducible with: piglit/arb_framebuffer_no_attachments-atomic
    *
    * GFX9: The TS event is always written after full pipeline completion regardless of CB/DB
    * bindings.
    */
   if (sctx->gfx_level <= GFX8 || !flush_cb_db) {
      if (flags & SI_BARRIER_SYNC_PS)
         radeon_event_write(V_028A90_PS_PARTIAL_FLUSH);
      else if (flags & SI_BARRIER_SYNC_VS)
         radeon_event_write(V_028A90_VS_PARTIAL_FLUSH);
   }

   if (flags & SI_BARRIER_SYNC_CS)
      radeon_event_write(V_028A90_CS_PARTIAL_FLUSH);

   radeon_end();

   /* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't
    * wait for idle on GFX9. We have to use a TS event.
    */
   if (sctx->gfx_level == GFX9 && flush_cb_db) {
      uint64_t va;
      unsigned tc_flags, cb_db_event;

      /* Set the CB/DB flush event. */
      switch (flush_cb_db) {
      case SI_BARRIER_SYNC_AND_INV_CB:
         cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
         break;
      case SI_BARRIER_SYNC_AND_INV_DB:
         cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
         break;
      default:
         /* both CB & DB */
         cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
      }

      /* These are the only allowed combinations. If you need to
       * do multiple operations at once, do them separately.
       * All operations that invalidate L2 also seem to invalidate
       * metadata. Volatile (VOL) and WC flushes are not listed here.
       *
       * TC    | TC_WB         = writeback & invalidate L2
       * TC    | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC
       *         TC_WB | TC_NC = writeback L2 for MTYPE == NC
       * TC            | TC_NC = invalidate L2 for MTYPE == NC
       * TC    | TC_MD         = writeback & invalidate L2 metadata (DCC, etc.)
       * TCL1                  = invalidate L1
       */
      tc_flags = 0;

      if (flags & SI_BARRIER_INV_L2_METADATA) {
         tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_MD_ACTION_ENA;
      }

      /* Ideally flush L2 together with CB/DB. */
      if (flags & SI_BARRIER_INV_L2) {
         /* Writeback and invalidate everything in L2 & L1. */
         tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA;

         /* Clear the flags. */
         flags &= ~(SI_BARRIER_INV_L2 | SI_BARRIER_WB_L2);
      }

      /* Do the flush (enqueue the event and wait for it). */
      struct si_resource* wait_mem_scratch =
        si_get_wait_mem_scratch_bo(sctx, cs, sctx->ws->cs_is_secure(cs));

      va = wait_mem_scratch->gpu_address;
      sctx->wait_mem_number++;

      si_cp_release_mem(sctx, cs, cb_db_event, tc_flags, EOP_DST_SEL_MEM,
                        EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT,
                        wait_mem_scratch, va, sctx->wait_mem_number, SI_NOT_QUERY);

      if (unlikely(sctx->sqtt_enabled)) {
         si_sqtt_describe_barrier_start(sctx, cs);
      }

      si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);

      if (unlikely(sctx->sqtt_enabled)) {
         si_sqtt_describe_barrier_end(sctx, cs, sctx->barrier_flags);
      }
   }

   /* GFX6-GFX8 only: When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC waits
    * for idle, so it should be last.
    *
    * cp_coher_cntl should contain everything except TC flags at this point.
    *
    * GFX6-GFX7 don't support L2 write-back.
    */
   unsigned engine = flags & SI_BARRIER_PFP_SYNC_ME ? V_580_CP_PFP : V_580_CP_ME;

   if (flags & SI_BARRIER_INV_L2 || (sctx->gfx_level <= GFX7 && flags & SI_BARRIER_WB_L2)) {
      /* Invalidate L1 & L2. WB must be set on GFX8+ when TC_ACTION is set. */
      si_cp_acquire_mem(sctx, cs,
                        cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
                        S_0301F0_TC_WB_ACTION_ENA(sctx->gfx_level >= GFX8), engine);
   } else {
      /* L1 invalidation and L2 writeback must be done separately, because both operations can't
       * be done together.
       */
      if (flags & SI_BARRIER_WB_L2) {
         /* WB = write-back
          * NC = apply to non-coherent MTYPEs
          *      (i.e. MTYPE <= 1, which is what we use everywhere)
          *
          * WB doesn't work without NC.
          *
          * If we get here, the only flag that can't be executed together with WB_L2 is VMEM cache
          * invalidation.
          */
         bool last_acquire_mem = !(flags & SI_BARRIER_INV_VMEM);

         si_cp_acquire_mem(sctx, cs,
                           cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) |
                           S_0301F0_TC_NC_ACTION_ENA(1),
                           /* If this is not the last ACQUIRE_MEM, flush in ME.
                            * We only want to synchronize with PFP in the last ACQUIRE_MEM. */
                           last_acquire_mem ? engine : V_580_CP_ME);

         if (last_acquire_mem)
            flags &= ~SI_BARRIER_PFP_SYNC_ME;
         cp_coher_cntl = 0;
      }

      if (flags & SI_BARRIER_INV_VMEM)
         cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);

      /* If there are still some cache flags left... */
      if (cp_coher_cntl) {
         si_cp_acquire_mem(sctx, cs, cp_coher_cntl, engine);
         flags &= ~SI_BARRIER_PFP_SYNC_ME;
      }

      /* This might be needed even without any cache flags, such as when doing buffer stores
       * to an index buffer.
       */
      if (flags & SI_BARRIER_PFP_SYNC_ME)
         si_cp_pfp_sync_me(cs);
   }
}

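/* Thin wrapper so the barrier can be emitted through the draw-state atom mechanism.
 * The index parameter is unused.
 */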
static void si_emit_barrier_as_atom(struct si_context *sctx, unsigned index)
{
   sctx->emit_barrier(sctx, &sctx->gfx_cs);
}

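/* Return true if the buffer isn't referenced by the current (unsubmitted) command stream
 * and is idle on the GPU for the given usage.
 */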
static bool si_is_buffer_idle(struct si_context *sctx, struct si_resource *buf,
                              unsigned usage)
{
   return !si_cs_is_buffer_referenced(sctx, buf->buf, usage) &&
          sctx->ws->buffer_wait(sctx->ws, buf->buf, 0, usage | RADEON_USAGE_DISALLOW_SLOW_REPLY);
}

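/* Insert barriers before an internal (driver-generated) operation: decompress the image
 * subresources the op will access, add shader sync flags only for buffers/images that are
 * still busy on the GPU, and invalidate the VMEM cache.
 */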
void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags,
                                   unsigned num_buffers,
                                   const struct pipe_shader_buffer *buffers,
                                   unsigned writable_buffers_mask,
                                   unsigned num_images,
                                   const struct pipe_image_view *images)
{
   for (unsigned i = 0; i < num_images; i++) {
      /* The driver doesn't decompress resources automatically for internal blits, so do it manually. */
      si_decompress_subresource(&sctx->b, images[i].resource, PIPE_MASK_RGBAZS,
                                images[i].u.tex.level, images[i].u.tex.first_layer,
                                images[i].u.tex.last_layer,
                                images[i].access & PIPE_IMAGE_ACCESS_WRITE);
   }

   /* Don't sync if buffers are idle. */
   const unsigned ps_mask = SI_BIND_CONSTANT_BUFFER(PIPE_SHADER_FRAGMENT) |
                            SI_BIND_SHADER_BUFFER(PIPE_SHADER_FRAGMENT) |
                            SI_BIND_IMAGE_BUFFER(PIPE_SHADER_FRAGMENT) |
                            SI_BIND_SAMPLER_BUFFER(PIPE_SHADER_FRAGMENT);
   const unsigned cs_mask = SI_BIND_CONSTANT_BUFFER(PIPE_SHADER_COMPUTE) |
                            SI_BIND_SHADER_BUFFER(PIPE_SHADER_COMPUTE) |
                            SI_BIND_IMAGE_BUFFER(PIPE_SHADER_COMPUTE) |
                            SI_BIND_SAMPLER_BUFFER(PIPE_SHADER_COMPUTE);

   for (unsigned i = 0; i < num_buffers; i++) {
      struct si_resource *buf = si_resource(buffers[i].buffer);

      if (!buf)
         continue;

      /* We always wait for the last write. If the buffer is used for write, also wait
       * for the last read.
       */
      if (!si_is_buffer_idle(sctx, buf, RADEON_USAGE_WRITE |
                             (writable_buffers_mask & BITFIELD_BIT(i) ? RADEON_USAGE_READ : 0))) {
         if (buf->bind_history & ps_mask)
            sctx->barrier_flags |= SI_BARRIER_SYNC_PS;
         else
            sctx->barrier_flags |= SI_BARRIER_SYNC_VS;

         if (buf->bind_history & cs_mask)
            sctx->barrier_flags |= SI_BARRIER_SYNC_CS;
      }
   }

   /* Don't sync if images are idle. */
   for (unsigned i = 0; i < num_images; i++) {
      struct si_resource *img = si_resource(images[i].resource);
      bool writable = images[i].access & PIPE_IMAGE_ACCESS_WRITE;

      /* We always wait for the last write. If the buffer is used for write, also wait
       * for the last read.
       */
      if (!si_is_buffer_idle(sctx, img, RADEON_USAGE_WRITE | (writable ? RADEON_USAGE_READ : 0))) {
         si_make_CB_shader_coherent(sctx, images[i].resource->nr_samples, true,
               ((struct si_texture*)images[i].resource)->surface.u.gfx9.color.dcc.pipe_aligned);
         sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS;
      }
   }

   /* Invalidate the VMEM cache only. The SMEM cache isn't used by shader buffers. */
   sctx->barrier_flags |= SI_BARRIER_INV_VMEM;
   si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
}

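/* Insert barriers after an internal (driver-generated) operation: wait for the compute
 * shader, invalidate the caches that later consumers read through, and mark written buffers
 * as L2-dirty for consumers that bypass L2.
 */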
void si_barrier_after_internal_op(struct si_context *sctx, unsigned flags,
                                  unsigned num_buffers,
                                  const struct pipe_shader_buffer *buffers,
                                  unsigned writable_buffers_mask,
                                  unsigned num_images,
                                  const struct pipe_image_view *images)
{
   sctx->barrier_flags |= SI_BARRIER_SYNC_CS;

   if (num_images) {
      /* Make sure image stores are visible to CB, which doesn't use L2 on GFX6-8. */
      sctx->barrier_flags |= sctx->gfx_level <= GFX8 ? SI_BARRIER_WB_L2 : 0;
      /* Make sure image stores are visible to all CUs. */
      sctx->barrier_flags |= SI_BARRIER_INV_VMEM;
   }

   /* Make sure buffer stores are visible to all CUs and also as index/indirect buffers. */
   if (num_buffers)
      sctx->barrier_flags |= SI_BARRIER_INV_SMEM | SI_BARRIER_INV_VMEM | SI_BARRIER_PFP_SYNC_ME;

   /* We must set L2_cache_dirty for buffers because:
    * - GFX6,12: CP DMA doesn't use L2.
    * - GFX6-7,12: Index buffer reads don't use L2.
    * - GFX6-8,12: CP doesn't use L2.
    * - GFX6-8: CB/DB don't use L2.
    *
    * L2_cache_dirty is checked explicitly when buffers are used in those cases to enforce coherency.
    */
   while (writable_buffers_mask)
      si_resource(buffers[u_bit_scan(&writable_buffers_mask)].buffer)->L2_cache_dirty = true;

   /* Make sure RBs see our DCC image stores if RBs and TCCs (L2 instances) are non-coherent. */
   if (sctx->gfx_level >= GFX10 && sctx->screen->info.tcc_rb_non_coherent) {
      for (unsigned i = 0; i < num_images; i++) {
         if (vi_dcc_enabled((struct si_texture*)images[i].resource, images[i].u.tex.level) &&
             images[i].access & PIPE_IMAGE_ACCESS_WRITE &&
             (sctx->screen->always_allow_dcc_stores ||
              images[i].access & SI_IMAGE_ACCESS_ALLOW_DCC_STORE)) {
            sctx->barrier_flags |= SI_BARRIER_INV_L2;
            break;
         }
      }
   }

   si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
}

static void si_set_dst_src_barrier_buffers(struct pipe_shader_buffer *buffers,
                                           struct pipe_resource *dst, struct pipe_resource *src)
{
   assert(dst);
   memset(buffers, 0, sizeof(buffers[0]) * 2);
   /* Only the "buffer" field is going to be used. */
   buffers[0].buffer = dst;
   buffers[1].buffer = src;
}

/* This is for simple buffer ops that have 1 dst and 0-1 src. */
void si_barrier_before_simple_buffer_op(struct si_context *sctx, unsigned flags,
                                        struct pipe_resource *dst, struct pipe_resource *src)
{
   struct pipe_shader_buffer barrier_buffers[2];
   si_set_dst_src_barrier_buffers(barrier_buffers, dst, src);
   si_barrier_before_internal_op(sctx, flags, src ? 2 : 1, barrier_buffers, 0x1, 0, NULL);
}

/* This is for simple buffer ops that have 1 dst and 0-1 src. */
void si_barrier_after_simple_buffer_op(struct si_context *sctx, unsigned flags,
                                       struct pipe_resource *dst, struct pipe_resource *src)
{
   struct pipe_shader_buffer barrier_buffers[2];
   si_set_dst_src_barrier_buffers(barrier_buffers, dst, src);
   si_barrier_after_internal_op(sctx, flags, src ? 2 : 1, barrier_buffers, 0x1, 0, NULL);
}
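
/* Usage sketch (illustrative only; emit_fill_buffer_packets is a hypothetical placeholder
 * for whatever packets the internal op emits, and 0 is passed as "flags" purely for the
 * example):
 *
 *    si_barrier_before_simple_buffer_op(sctx, 0, dst, NULL);
 *    emit_fill_buffer_packets(sctx, dst, offset, size, clear_value);
 *    si_barrier_after_simple_buffer_op(sctx, 0, dst, NULL);
 *
 * The "before" call syncs only if dst is still busy; the "after" call makes the new contents
 * visible to shaders, the CP, and index/indirect buffer fetches.
 */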

static void si_texture_barrier(struct pipe_context *ctx, unsigned flags)
{
   si_fb_barrier_after_rendering((struct si_context *)ctx, SI_FB_BARRIER_SYNC_CB);
}

/* This enforces coherency between shader stores and any past and future access. */
static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
{
   struct si_context *sctx = (struct si_context *)ctx;

   /* Ignore PIPE_BARRIER_UPDATE_BUFFER - it synchronizes against updates like buffer_subdata. */
   /* Ignore PIPE_BARRIER_UPDATE_TEXTURE - it synchronizes against updates like texture_subdata. */
   /* Ignore PIPE_BARRIER_MAPPED_BUFFER - it synchronizes against buffer_map/unmap. */
   /* Ignore PIPE_BARRIER_QUERY_BUFFER - the GL spec description is confusing, and the driver
    * always inserts barriers around get_query_result_resource.
    */
   flags &= ~PIPE_BARRIER_UPDATE_BUFFER & ~PIPE_BARRIER_UPDATE_TEXTURE &
            ~PIPE_BARRIER_MAPPED_BUFFER & ~PIPE_BARRIER_QUERY_BUFFER;

   if (!flags)
      return;

   sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS;

   if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
      sctx->barrier_flags |= SI_BARRIER_INV_SMEM | SI_BARRIER_INV_VMEM;

   /* VMEM cache contents are written back to L2 automatically at the end of waves, but
    * the contents of other VMEM caches might still be stale.
    *
    * TEXTURE and IMAGE mean sampler buffers and image buffers, respectively.
    */
   if (flags & (PIPE_BARRIER_VERTEX_BUFFER | PIPE_BARRIER_SHADER_BUFFER | PIPE_BARRIER_TEXTURE |
                PIPE_BARRIER_IMAGE | PIPE_BARRIER_STREAMOUT_BUFFER | PIPE_BARRIER_GLOBAL_BUFFER))
      sctx->barrier_flags |= SI_BARRIER_INV_VMEM;

   if (flags & (PIPE_BARRIER_INDEX_BUFFER | PIPE_BARRIER_INDIRECT_BUFFER))
      sctx->barrier_flags |= SI_BARRIER_PFP_SYNC_ME;

   /* Index buffers use L2 since GFX8. */
   if (flags & PIPE_BARRIER_INDEX_BUFFER &&
       (sctx->gfx_level <= GFX7 || sctx->screen->info.cp_sdma_ge_use_system_memory_scope))
      sctx->barrier_flags |= SI_BARRIER_WB_L2;

   /* Indirect buffers use L2 since GFX9. */
   if (flags & PIPE_BARRIER_INDIRECT_BUFFER &&
       (sctx->gfx_level <= GFX8 || sctx->screen->info.cp_sdma_ge_use_system_memory_scope))
      sctx->barrier_flags |= SI_BARRIER_WB_L2;

   /* MSAA color images are flushed in si_decompress_textures when needed.
    * Shaders never write to depth/stencil images.
    */
   if (flags & PIPE_BARRIER_FRAMEBUFFER && sctx->framebuffer.uncompressed_cb_mask) {
      sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_CB;

      if (sctx->gfx_level >= GFX10 && sctx->gfx_level < GFX12) {
         if (sctx->screen->info.tcc_rb_non_coherent)
            sctx->barrier_flags |= SI_BARRIER_INV_L2;
         else /* We don't know which shaders do image stores with DCC: */
            sctx->barrier_flags |= SI_BARRIER_INV_L2_METADATA;
      } else if (sctx->gfx_level == GFX9) {
         /* We have to invalidate L2 for MSAA and when DCC can have pipe_aligned=0. */
         sctx->barrier_flags |= SI_BARRIER_INV_L2;
      } else if (sctx->gfx_level <= GFX8) {
         /* CB doesn't use L2 on GFX6-8. */
         sctx->barrier_flags |= SI_BARRIER_WB_L2;
      }
   }

   si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
}

static void si_set_sampler_depth_decompress_mask(struct si_context *sctx, struct si_texture *tex)
{
   assert(sctx->gfx_level < GFX12);

   /* Check all sampler bindings in all shaders where depth textures are bound, and update
    * which samplers should be decompressed.
    */
   u_foreach_bit(sh, sctx->shader_has_depth_tex) {
      u_foreach_bit(i, sctx->samplers[sh].has_depth_tex_mask) {
         if (sctx->samplers[sh].views[i]->texture == &tex->buffer.b.b) {
            sctx->samplers[sh].needs_depth_decompress_mask |= 1 << i;
            sctx->shader_needs_decompress_mask |= 1 << sh;
         }
      }
   }
}

void si_fb_barrier_before_rendering(struct si_context *sctx)
{
   /* Wait for all shaders because all image loads must finish before CB/DB can write there. */
   if (sctx->framebuffer.state.nr_cbufs || sctx->framebuffer.state.zsbuf) {
      sctx->barrier_flags |= SI_BARRIER_SYNC_CS | SI_BARRIER_SYNC_PS;
      si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
   }
}

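/* Insert barriers after rendering: mark compressed CB/DB attachments for later decompression
 * and, depending on flags, make CB and/or DB output visible to shaders.
 */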
void si_fb_barrier_after_rendering(struct si_context *sctx, unsigned flags)
{
   if (sctx->gfx_level < GFX12 && !sctx->decompression_enabled) {
      /* Setting dirty_level_mask should ignore SI_FB_BARRIER_SYNC_* because it triggers
       * decompression, which is not syncing.
       */
      if (sctx->framebuffer.state.zsbuf) {
         struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
         struct si_texture *tex = (struct si_texture *)surf->texture;

         tex->dirty_level_mask |= 1 << surf->u.tex.level;

         if (tex->surface.has_stencil)
            tex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;

         si_set_sampler_depth_decompress_mask(sctx, tex);
      }

      unsigned compressed_cb_mask = sctx->framebuffer.compressed_cb_mask;
      while (compressed_cb_mask) {
         unsigned i = u_bit_scan(&compressed_cb_mask);
         struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i];
         struct si_texture *tex = (struct si_texture *)surf->texture;

         if (tex->surface.fmask_offset) {
            tex->dirty_level_mask |= 1 << surf->u.tex.level;
            tex->fmask_is_identity = false;
         }
      }
   }

   if (flags & SI_FB_BARRIER_SYNC_CB) {
      /* Compressed images (MSAA with FMASK) are flushed on demand in si_decompress_textures.
       *
       * Synchronize CB only if there is actually a bound color buffer.
       */
      if (sctx->framebuffer.uncompressed_cb_mask) {
         si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
                                    sctx->framebuffer.CB_has_shader_readable_metadata,
                                    sctx->framebuffer.all_DCC_pipe_aligned);
      }
   }

   if (flags & SI_FB_BARRIER_SYNC_DB && sctx->framebuffer.state.zsbuf) {
      /* DB caches are flushed on demand (using si_decompress_textures) except the cases below. */
      if (sctx->gfx_level >= GFX12) {
         si_make_DB_shader_coherent(sctx, sctx->framebuffer.nr_samples, true, false);
      } else if (sctx->generate_mipmap_for_depth) {
         /* u_blitter doesn't invoke depth decompression when it does multiple blits in a row,
          * but the only case when it matters for DB is when doing generate_mipmap, which writes Z,
          * which is always uncompressed. So here we flush DB manually between individual
          * generate_mipmap blits.
          */
         si_make_DB_shader_coherent(sctx, 1, false, sctx->framebuffer.DB_has_shader_readable_metadata);
      } else if (sctx->screen->info.family == CHIP_NAVI33) {
         struct si_surface *old_zsurf = (struct si_surface *)sctx->framebuffer.state.zsbuf;
         struct si_texture *old_ztex = (struct si_texture *)old_zsurf->base.texture;

         if (old_ztex->upgraded_depth) {
            /* TODO: some failures related to hyperz appeared after 969ed851 on nv33:
             * - piglit tex-miplevel-selection
             * - KHR-GL46.direct_state_access.framebuffers_texture_attachment
             * - GTF-GL46.gtf30.GL3Tests.blend_minmax.blend_minmax_draw
             * - KHR-GL46.direct_state_access.framebuffers_texture_layer_attachment
             *
             * This seems to fix them:
             */
            sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_DB | SI_BARRIER_INV_L2;
            si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
         }
      } else if (sctx->gfx_level == GFX9) {
         /* It appears that DB metadata "leaks" in a sequence of:
          *  - depth clear
          *  - DCC decompress for shader image writes (with DB disabled)
          *  - render with DEPTH_BEFORE_SHADER=1
          * Flushing DB metadata works around the problem.
          */
         sctx->barrier_flags |= SI_BARRIER_EVENT_FLUSH_AND_INV_DB_META;
         si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
      }
   }
}

void si_barrier_before_image_fast_clear(struct si_context *sctx, unsigned types)
{
   /* Flush caches and wait for idle. */
   if (types & (SI_CLEAR_TYPE_CMASK | SI_CLEAR_TYPE_DCC)) {
      si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
                                 sctx->framebuffer.CB_has_shader_readable_metadata,
                                 sctx->framebuffer.all_DCC_pipe_aligned);
   }

   if (types & SI_CLEAR_TYPE_HTILE) {
      si_make_DB_shader_coherent(sctx, sctx->framebuffer.nr_samples, sctx->framebuffer.has_stencil,
                                 sctx->framebuffer.DB_has_shader_readable_metadata);
   }

   /* Invalidate the VMEM cache because we always use compute. */
   sctx->barrier_flags |= SI_BARRIER_INV_VMEM;

   /* GFX6-8: CB and DB don't use L2. */
   if (sctx->gfx_level <= GFX8)
      sctx->barrier_flags |= SI_BARRIER_INV_L2;

   si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
}

void si_barrier_after_image_fast_clear(struct si_context *sctx)
{
   /* Wait for idle. */
   sctx->barrier_flags |= SI_BARRIER_SYNC_CS;

   /* GFX6-8: CB and DB don't use L2. */
   if (sctx->gfx_level <= GFX8)
      sctx->barrier_flags |= SI_BARRIER_WB_L2;

   si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
}

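/* Set the gfx-level-specific barrier emission callback and hook up the gallium
 * memory_barrier/texture_barrier entry points.
 */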
void si_init_barrier_functions(struct si_context *sctx)
{
   if (sctx->gfx_level >= GFX10)
      sctx->emit_barrier = gfx10_emit_barrier;
   else
      sctx->emit_barrier = gfx6_emit_barrier;

   sctx->atoms.s.barrier.emit = si_emit_barrier_as_atom;

   sctx->b.memory_barrier = si_memory_barrier;
   sctx->b.texture_barrier = si_texture_barrier;
}