/*
 * Copyright 2024 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_build_pm4.h"

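/* Return the 4-byte scratch buffer that barrier fences are written to and waited on.
 * For secure (TMZ) submissions, a separate encrypted buffer is created and initialized lazily.
 */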
static struct si_resource *si_get_wait_mem_scratch_bo(struct si_context *ctx,
                                                      struct radeon_cmdbuf *cs, bool is_secure)
{
   struct si_screen *sscreen = ctx->screen;

   assert(ctx->gfx_level < GFX11);

   if (likely(!is_secure)) {
      return ctx->wait_mem_scratch;
   } else {
      assert(sscreen->info.has_tmz_support);
      if (!ctx->wait_mem_scratch_tmz) {
         ctx->wait_mem_scratch_tmz =
            si_aligned_buffer_create(&sscreen->b,
                                     PIPE_RESOURCE_FLAG_UNMAPPABLE |
                                     SI_RESOURCE_FLAG_DRIVER_INTERNAL |
                                     PIPE_RESOURCE_FLAG_ENCRYPTED,
                                     PIPE_USAGE_DEFAULT, 4,
                                     sscreen->info.tcc_cache_line_size);
         si_cp_write_data(ctx, ctx->wait_mem_scratch_tmz, 0, 4, V_370_MEM, V_370_ME,
                          &ctx->wait_mem_number);
      }

      return ctx->wait_mem_scratch_tmz;
   }
}

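/* Reduce ctx->barrier_flags to what is actually needed: drop CB/DB flushes and VS/PS/CS waits
 * for engines that have been idle since the last sync, update the flush/sync bookkeeping, and
 * clear ctx->barrier_flags. Returns the reduced flags.
 */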
static unsigned get_reduced_barrier_flags(struct si_context *ctx)
{
   unsigned flags = ctx->barrier_flags;

   if (!flags)
      return 0;

   if (!ctx->has_graphics) {
      /* Only process compute flags. */
      flags &= SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_SMEM | SI_BARRIER_INV_VMEM |
               SI_BARRIER_INV_L2 | SI_BARRIER_WB_L2 | SI_BARRIER_INV_L2_METADATA |
               SI_BARRIER_SYNC_CS;
   }

   /* Don't flush CB and DB if there have been no draw calls. */
   if (ctx->num_draw_calls == ctx->last_cb_flush_num_draw_calls &&
       ctx->num_decompress_calls == ctx->last_cb_flush_num_decompress_calls)
      flags &= ~SI_BARRIER_SYNC_AND_INV_CB;

   if (ctx->num_draw_calls == ctx->last_db_flush_num_draw_calls &&
       ctx->num_decompress_calls == ctx->last_db_flush_num_decompress_calls)
      flags &= ~SI_BARRIER_SYNC_AND_INV_DB;

   if (!ctx->compute_is_busy)
      flags &= ~SI_BARRIER_SYNC_CS;

   /* Track the last CB/DB flush. */
   if (flags & SI_BARRIER_SYNC_AND_INV_CB) {
      ctx->num_cb_cache_flushes++;
      ctx->last_cb_flush_num_draw_calls = ctx->num_draw_calls;
      ctx->last_cb_flush_num_decompress_calls = ctx->num_decompress_calls;
   }
   if (flags & SI_BARRIER_SYNC_AND_INV_DB) {
      ctx->num_db_cache_flushes++;
      ctx->last_db_flush_num_draw_calls = ctx->num_draw_calls;
      ctx->last_db_flush_num_decompress_calls = ctx->num_decompress_calls;
   }

   /* Skip VS and PS synchronization if they are idle. */
   if (ctx->num_draw_calls == ctx->last_ps_sync_num_draw_calls)
      flags &= ~SI_BARRIER_SYNC_VS & ~SI_BARRIER_SYNC_PS;
   else if (ctx->num_draw_calls == ctx->last_vs_sync_num_draw_calls)
      flags &= ~SI_BARRIER_SYNC_VS;

   /* Track the last VS/PS flush. Flushing CB or DB also waits for PS (obviously). */
   if (flags & (SI_BARRIER_SYNC_AND_INV_CB | SI_BARRIER_SYNC_AND_INV_DB | SI_BARRIER_SYNC_PS)) {
      ctx->last_ps_sync_num_draw_calls = ctx->num_draw_calls;
      ctx->last_vs_sync_num_draw_calls = ctx->num_draw_calls;
   } else if (flags & SI_BARRIER_SYNC_VS) {
      ctx->last_vs_sync_num_draw_calls = ctx->num_draw_calls;
   }

   /* We use a TS event to flush CB/DB on GFX9+. */
   bool uses_ts_event = ctx->gfx_level >= GFX9 &&
                        flags & (SI_BARRIER_SYNC_AND_INV_CB | SI_BARRIER_SYNC_AND_INV_DB);

   /* TS events wait for everything. */
   if (uses_ts_event)
      flags &= ~SI_BARRIER_SYNC_VS & ~SI_BARRIER_SYNC_PS & ~SI_BARRIER_SYNC_CS;

   /* TS events wait for compute too. */
   if (flags & SI_BARRIER_SYNC_CS || uses_ts_event)
      ctx->compute_is_busy = false;

   if (flags & SI_BARRIER_SYNC_VS)
      ctx->num_vs_flushes++;
   if (flags & SI_BARRIER_SYNC_PS)
      ctx->num_ps_flushes++;
   if (flags & SI_BARRIER_SYNC_CS)
      ctx->num_cs_flushes++;

   if (flags & SI_BARRIER_INV_L2)
      ctx->num_L2_invalidates++;
   else if (flags & SI_BARRIER_WB_L2)
      ctx->num_L2_writebacks++;

   ctx->barrier_flags = 0;
   return flags;
}

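/* Emit barrier events that are handled identically on all gfx levels: pipeline statistics
 * start/stop and VGT_FLUSH.
 */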
static void si_handle_common_barrier_events(struct si_context *ctx, struct radeon_cmdbuf *cs,
                                            unsigned flags)
{
   radeon_begin(cs);

   if (flags & SI_BARRIER_EVENT_PIPELINESTAT_START && ctx->pipeline_stats_enabled != 1) {
      radeon_event_write(V_028A90_PIPELINESTAT_START);
      ctx->pipeline_stats_enabled = 1;
   } else if (flags & SI_BARRIER_EVENT_PIPELINESTAT_STOP && ctx->pipeline_stats_enabled != 0) {
      radeon_event_write(V_028A90_PIPELINESTAT_STOP);
      ctx->pipeline_stats_enabled = 0;
   }

   if (flags & SI_BARRIER_EVENT_VGT_FLUSH)
      radeon_event_write(V_028A90_VGT_FLUSH);

   radeon_end();
}

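/* Emit the reduced barrier on GFX10+. Cache control goes through GCR_CNTL. CB/DB flushes use
 * a TS event: RELEASE_MEM/ACQUIRE_MEM with PWS on GFX11+, or RELEASE_MEM writing a fence that
 * WAIT_REG_MEM polls on GFX10.
 */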
static void gfx10_emit_barrier(struct si_context *ctx, struct radeon_cmdbuf *cs)
{
   assert(ctx->gfx_level >= GFX10);
   uint32_t gcr_cntl = 0;
   unsigned flags = get_reduced_barrier_flags(ctx);

   if (!flags)
      return;

   si_handle_common_barrier_events(ctx, cs, flags);

   /* We don't need these. */
   assert(!(flags & SI_BARRIER_EVENT_FLUSH_AND_INV_DB_META));
   assert(ctx->gfx_level < GFX12 || !(flags & SI_BARRIER_INV_L2_METADATA));

   if (flags & SI_BARRIER_INV_ICACHE)
      gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
   if (flags & SI_BARRIER_INV_SMEM)
      gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
   if (flags & SI_BARRIER_INV_VMEM)
      gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);

   /* The L2 cache ops are:
    * - INV: - invalidate lines that reflect memory (were loaded from memory)
    *        - don't touch lines that were overwritten (were stored by gfx clients)
    * - WB: - don't touch lines that reflect memory
    *       - write back lines that were overwritten
    * - WB | INV: - invalidate lines that reflect memory
    *             - write back lines that were overwritten
    *
    * GLM doesn't support WB alone. If WB is set, INV must be set too.
    */
   if (flags & SI_BARRIER_INV_L2)
      gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1); /* Writeback and invalidate everything in L2. */
   else if (flags & SI_BARRIER_WB_L2)
      gcr_cntl |= S_586_GL2_WB(1);

   /* Invalidate the metadata cache. */
   if (ctx->gfx_level < GFX12 &&
       flags & (SI_BARRIER_INV_L2 | SI_BARRIER_WB_L2 | SI_BARRIER_INV_L2_METADATA))
      gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1);

   /* Flush CB/DB. Note that this also idles all shaders, including compute shaders. */
   if (flags & (SI_BARRIER_SYNC_AND_INV_CB | SI_BARRIER_SYNC_AND_INV_DB)) {
      unsigned cb_db_event = 0;

      /* Determine the TS event that we'll use to flush CB/DB. */
      if ((flags & SI_BARRIER_SYNC_AND_INV_CB && flags & SI_BARRIER_SYNC_AND_INV_DB) ||
          /* Gfx11 can't use the DB_META event and must use a full flush to flush DB_META. */
          (ctx->gfx_level == GFX11 && flags & SI_BARRIER_SYNC_AND_INV_DB)) {
         cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
      } else if (flags & SI_BARRIER_SYNC_AND_INV_CB) {
         cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
      } else {
         assert(flags & SI_BARRIER_SYNC_AND_INV_DB);
         cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
      }

      /* We must flush CMASK/FMASK/DCC separately if the main event only flushes CB_DATA. */
      radeon_begin(cs);
      if (ctx->gfx_level < GFX12 && cb_db_event == V_028A90_FLUSH_AND_INV_CB_DATA_TS)
         radeon_event_write(V_028A90_FLUSH_AND_INV_CB_META);

      /* We must flush HTILE separately if the main event only flushes DB_DATA. */
      if (ctx->gfx_level < GFX12 && cb_db_event == V_028A90_FLUSH_AND_INV_DB_DATA_TS)
         radeon_event_write(V_028A90_FLUSH_AND_INV_DB_META);

      radeon_end();

      /* First flush CB/DB, then L1/L2. */
      gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD);

      if (ctx->gfx_level >= GFX11) {
         si_cp_release_mem_pws(ctx, cs, cb_db_event, gcr_cntl & C_586_GLI_INV);

         /* Wait for the event and invalidate remaining caches if needed. */
         si_cp_acquire_mem_pws(ctx, cs, cb_db_event,
                               flags & SI_BARRIER_PFP_SYNC_ME ? V_580_CP_PFP : V_580_CP_ME,
                               gcr_cntl & ~C_586_GLI_INV, /* keep only GLI_INV */
                               0, flags);

         gcr_cntl = 0; /* all done */
         /* ACQUIRE_MEM in PFP is implemented as ACQUIRE_MEM in ME + PFP_SYNC_ME. */
         flags &= ~SI_BARRIER_PFP_SYNC_ME;
      } else {
         /* GFX10 */
         struct si_resource *wait_mem_scratch =
            si_get_wait_mem_scratch_bo(ctx, cs, ctx->ws->cs_is_secure(cs));

         /* CB/DB flush and invalidate via RELEASE_MEM.
          * Combine this with other cache flushes when possible.
          */
         uint64_t va = wait_mem_scratch->gpu_address;
         ctx->wait_mem_number++;

         /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */
         unsigned glm_wb = G_586_GLM_WB(gcr_cntl);
         unsigned glm_inv = G_586_GLM_INV(gcr_cntl);
         unsigned glv_inv = G_586_GLV_INV(gcr_cntl);
         unsigned gl1_inv = G_586_GL1_INV(gcr_cntl);
         assert(G_586_GL2_US(gcr_cntl) == 0);
         assert(G_586_GL2_RANGE(gcr_cntl) == 0);
         assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
         unsigned gl2_inv = G_586_GL2_INV(gcr_cntl);
         unsigned gl2_wb = G_586_GL2_WB(gcr_cntl);
         unsigned gcr_seq = G_586_SEQ(gcr_cntl);

         gcr_cntl &= C_586_GLM_WB & C_586_GLM_INV & C_586_GLV_INV & C_586_GL1_INV & C_586_GL2_INV &
                     C_586_GL2_WB; /* keep SEQ */

         si_cp_release_mem(ctx, cs, cb_db_event,
                           S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) |
                           S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) |
                           S_490_SEQ(gcr_seq),
                           EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
                           EOP_DATA_SEL_VALUE_32BIT, wait_mem_scratch, va, ctx->wait_mem_number,
                           SI_NOT_QUERY);

         if (unlikely(ctx->sqtt_enabled)) {
            si_sqtt_describe_barrier_start(ctx, &ctx->gfx_cs);
         }

         si_cp_wait_mem(ctx, cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);

         if (unlikely(ctx->sqtt_enabled)) {
            si_sqtt_describe_barrier_end(ctx, &ctx->gfx_cs, flags);
         }
      }
   } else {
      /* The TS event above also makes sure that PS and CS are idle, so we have to do this only
       * if we are not flushing CB or DB.
       */
      radeon_begin(cs);
      if (flags & SI_BARRIER_SYNC_PS)
         radeon_event_write(V_028A90_PS_PARTIAL_FLUSH);
      else if (flags & SI_BARRIER_SYNC_VS)
         radeon_event_write(V_028A90_VS_PARTIAL_FLUSH);

      if (flags & SI_BARRIER_SYNC_CS)
         radeon_event_write(V_028A90_CS_PARTIAL_FLUSH);

      radeon_end();
   }

   /* Ignore fields that only modify the behavior of other fields. */
   if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
      si_cp_acquire_mem(ctx, cs, gcr_cntl,
                        flags & SI_BARRIER_PFP_SYNC_ME ? V_580_CP_PFP : V_580_CP_ME);
   } else if (flags & SI_BARRIER_PFP_SYNC_ME) {
      si_cp_pfp_sync_me(cs);
   }
}

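/* Emit the reduced barrier on GFX6-9 using CP_COHER_CNTL/SURFACE_SYNC (ACQUIRE_MEM). On GFX9,
 * CB/DB flushes additionally need a TS event with a memory fence because ACQUIRE_MEM doesn't
 * wait for idle there.
 */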
static void gfx6_emit_barrier(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   assert(sctx->gfx_level <= GFX9);
   unsigned flags = get_reduced_barrier_flags(sctx);

   if (!flags)
      return;

   si_handle_common_barrier_events(sctx, cs, flags);

   uint32_t cp_coher_cntl = 0;
   const uint32_t flush_cb_db = flags & (SI_BARRIER_SYNC_AND_INV_CB | SI_BARRIER_SYNC_AND_INV_DB);

   /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either
    * bit is set. An alternative way is to write SQC_CACHES, but that
    * doesn't seem to work reliably. Since the bug doesn't affect
    * correctness (it only does more work than necessary) and
    * the performance impact is likely negligible, there is no plan
    * to add a workaround for it.
    */

   if (flags & SI_BARRIER_INV_ICACHE)
      cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
   if (flags & SI_BARRIER_INV_SMEM)
      cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);

   if (sctx->gfx_level <= GFX8) {
      if (flags & SI_BARRIER_SYNC_AND_INV_CB) {
         cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) |
                          S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) |
                          S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) |
                          S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) |
                          S_0085F0_CB7_DEST_BASE_ENA(1);

         /* Necessary for DCC */
         if (sctx->gfx_level == GFX8)
            si_cp_release_mem(sctx, cs, V_028A90_FLUSH_AND_INV_CB_DATA_TS, 0, EOP_DST_SEL_MEM,
                              EOP_INT_SEL_NONE, EOP_DATA_SEL_DISCARD, NULL, 0, 0, SI_NOT_QUERY);
      }
      if (flags & SI_BARRIER_SYNC_AND_INV_DB)
         cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1);
   }

   radeon_begin(cs);

   /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */
   if (flags & SI_BARRIER_SYNC_AND_INV_CB)
      radeon_event_write(V_028A90_FLUSH_AND_INV_CB_META);

   /* Flush HTILE. SURFACE_SYNC will wait for idle. */
   if (flags & (SI_BARRIER_SYNC_AND_INV_DB | SI_BARRIER_EVENT_FLUSH_AND_INV_DB_META))
      radeon_event_write(V_028A90_FLUSH_AND_INV_DB_META);

   /* Wait for shader engines to go idle.
    * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait
    * for everything including CB/DB cache flushes.
    *
    * GFX6-8: SURFACE_SYNC with CB_ACTION_ENA doesn't do anything if there are no CB/DB bindings.
    * Reproducible with: piglit/arb_framebuffer_no_attachments-atomic
    *
    * GFX9: The TS event is always written after full pipeline completion regardless of CB/DB
    * bindings.
    */
   if (sctx->gfx_level <= GFX8 || !flush_cb_db) {
      if (flags & SI_BARRIER_SYNC_PS)
         radeon_event_write(V_028A90_PS_PARTIAL_FLUSH);
      else if (flags & SI_BARRIER_SYNC_VS)
         radeon_event_write(V_028A90_VS_PARTIAL_FLUSH);
   }

   if (flags & SI_BARRIER_SYNC_CS)
      radeon_event_write(V_028A90_CS_PARTIAL_FLUSH);

   radeon_end();

   /* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't
    * wait for idle on GFX9. We have to use a TS event.
    */
   if (sctx->gfx_level == GFX9 && flush_cb_db) {
      uint64_t va;
      unsigned tc_flags, cb_db_event;

      /* Set the CB/DB flush event. */
      switch (flush_cb_db) {
      case SI_BARRIER_SYNC_AND_INV_CB:
         cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
         break;
      case SI_BARRIER_SYNC_AND_INV_DB:
         cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
         break;
      default:
         /* both CB & DB */
         cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
      }

      /* These are the only allowed combinations. If you need to
       * do multiple operations at once, do them separately.
       * All operations that invalidate L2 also seem to invalidate
       * metadata. Volatile (VOL) and WC flushes are not listed here.
       *
       * TC | TC_WB = writeback & invalidate L2
       * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC
       * TC_WB | TC_NC = writeback L2 for MTYPE == NC
       * TC | TC_NC = invalidate L2 for MTYPE == NC
       * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.)
       * TCL1 = invalidate L1
       */
      tc_flags = 0;

      if (flags & SI_BARRIER_INV_L2_METADATA) {
         tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_MD_ACTION_ENA;
      }

      /* Ideally flush L2 together with CB/DB. */
      if (flags & SI_BARRIER_INV_L2) {
         /* Writeback and invalidate everything in L2 & L1. */
         tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA;

         /* Clear the flags. */
         flags &= ~(SI_BARRIER_INV_L2 | SI_BARRIER_WB_L2);
      }

      /* Do the flush (enqueue the event and wait for it). */
      struct si_resource *wait_mem_scratch =
         si_get_wait_mem_scratch_bo(sctx, cs, sctx->ws->cs_is_secure(cs));

      va = wait_mem_scratch->gpu_address;
      sctx->wait_mem_number++;

      si_cp_release_mem(sctx, cs, cb_db_event, tc_flags, EOP_DST_SEL_MEM,
                        EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT,
                        wait_mem_scratch, va, sctx->wait_mem_number, SI_NOT_QUERY);

      if (unlikely(sctx->sqtt_enabled)) {
         si_sqtt_describe_barrier_start(sctx, cs);
      }

      si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);

      if (unlikely(sctx->sqtt_enabled)) {
         si_sqtt_describe_barrier_end(sctx, cs, sctx->barrier_flags);
      }
   }

   /* GFX6-GFX8 only: When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC waits
    * for idle, so it should be last.
    *
    * cp_coher_cntl should contain everything except TC flags at this point.
    *
    * GFX6-GFX7 don't support L2 write-back.
    */
   unsigned engine = flags & SI_BARRIER_PFP_SYNC_ME ? V_580_CP_PFP : V_580_CP_ME;

   if (flags & SI_BARRIER_INV_L2 || (sctx->gfx_level <= GFX7 && flags & SI_BARRIER_WB_L2)) {
      /* Invalidate L1 & L2. WB must be set on GFX8+ when TC_ACTION is set. */
      si_cp_acquire_mem(sctx, cs,
                        cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
                        S_0301F0_TC_WB_ACTION_ENA(sctx->gfx_level >= GFX8), engine);
   } else {
      /* L1 invalidation and L2 writeback must be done separately, because both operations can't
       * be done together.
       */
      if (flags & SI_BARRIER_WB_L2) {
         /* WB = write-back
          * NC = apply to non-coherent MTYPEs
          *      (i.e. MTYPE <= 1, which is what we use everywhere)
          *
          * WB doesn't work without NC.
          *
          * If we get here, the only flag that can't be executed together with WB_L2 is VMEM cache
          * invalidation.
          */
         bool last_acquire_mem = !(flags & SI_BARRIER_INV_VMEM);

         si_cp_acquire_mem(sctx, cs,
                           cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) |
                           S_0301F0_TC_NC_ACTION_ENA(1),
                           /* If this is not the last ACQUIRE_MEM, flush in ME.
                            * We only want to synchronize with PFP in the last ACQUIRE_MEM. */
                           last_acquire_mem ? engine : V_580_CP_ME);

         if (last_acquire_mem)
            flags &= ~SI_BARRIER_PFP_SYNC_ME;
         cp_coher_cntl = 0;
      }

      if (flags & SI_BARRIER_INV_VMEM)
         cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);

      /* If there are still some cache flags left... */
      if (cp_coher_cntl) {
         si_cp_acquire_mem(sctx, cs, cp_coher_cntl, engine);
         flags &= ~SI_BARRIER_PFP_SYNC_ME;
      }

      /* This might be needed even without any cache flags, such as when doing buffer stores
       * to an index buffer.
       */
      if (flags & SI_BARRIER_PFP_SYNC_ME)
         si_cp_pfp_sync_me(cs);
   }
}

static void si_emit_barrier_as_atom(struct si_context *sctx, unsigned index)
{
   sctx->emit_barrier(sctx, &sctx->gfx_cs);
}

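/* Return true if the buffer is neither referenced by the current CS nor busy on the GPU for
 * the given usage.
 */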
static bool si_is_buffer_idle(struct si_context *sctx, struct si_resource *buf,
                              unsigned usage)
{
   return !si_cs_is_buffer_referenced(sctx, buf->buf, usage) &&
          sctx->ws->buffer_wait(sctx->ws, buf->buf, 0, usage | RADEON_USAGE_DISALLOW_SLOW_REPLY);
}

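/* Emit barriers before an internal operation that accesses the given buffers and images:
 * decompress the image subresources and wait for previous shader work that used these
 * resources if they aren't idle.
 */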
void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags,
                                   unsigned num_buffers,
                                   const struct pipe_shader_buffer *buffers,
                                   unsigned writable_buffers_mask,
                                   unsigned num_images,
                                   const struct pipe_image_view *images)
{
   for (unsigned i = 0; i < num_images; i++) {
      /* The driver doesn't decompress resources automatically for internal blits, so do it manually. */
      si_decompress_subresource(&sctx->b, images[i].resource, PIPE_MASK_RGBAZS,
                                images[i].u.tex.level, images[i].u.tex.first_layer,
                                images[i].u.tex.last_layer,
                                images[i].access & PIPE_IMAGE_ACCESS_WRITE);
   }

   /* Don't sync if buffers are idle. */
   const unsigned ps_mask = SI_BIND_CONSTANT_BUFFER(PIPE_SHADER_FRAGMENT) |
                            SI_BIND_SHADER_BUFFER(PIPE_SHADER_FRAGMENT) |
                            SI_BIND_IMAGE_BUFFER(PIPE_SHADER_FRAGMENT) |
                            SI_BIND_SAMPLER_BUFFER(PIPE_SHADER_FRAGMENT);
   const unsigned cs_mask = SI_BIND_CONSTANT_BUFFER(PIPE_SHADER_COMPUTE) |
                            SI_BIND_SHADER_BUFFER(PIPE_SHADER_COMPUTE) |
                            SI_BIND_IMAGE_BUFFER(PIPE_SHADER_COMPUTE) |
                            SI_BIND_SAMPLER_BUFFER(PIPE_SHADER_COMPUTE);

   for (unsigned i = 0; i < num_buffers; i++) {
      struct si_resource *buf = si_resource(buffers[i].buffer);

      if (!buf)
         continue;

      /* We always wait for the last write. If the buffer is used for write, also wait
       * for the last read.
       */
      if (!si_is_buffer_idle(sctx, buf, RADEON_USAGE_WRITE |
                             (writable_buffers_mask & BITFIELD_BIT(i) ? RADEON_USAGE_READ : 0))) {
         if (buf->bind_history & ps_mask)
            sctx->barrier_flags |= SI_BARRIER_SYNC_PS;
         else
            sctx->barrier_flags |= SI_BARRIER_SYNC_VS;

         if (buf->bind_history & cs_mask)
            sctx->barrier_flags |= SI_BARRIER_SYNC_CS;
      }
   }

   /* Don't sync if images are idle. */
   for (unsigned i = 0; i < num_images; i++) {
      struct si_resource *img = si_resource(images[i].resource);
      bool writable = images[i].access & PIPE_IMAGE_ACCESS_WRITE;

      /* We always wait for the last write. If the buffer is used for write, also wait
       * for the last read.
       */
      if (!si_is_buffer_idle(sctx, img, RADEON_USAGE_WRITE | (writable ? RADEON_USAGE_READ : 0))) {
         si_make_CB_shader_coherent(sctx, images[i].resource->nr_samples, true,
                                    ((struct si_texture*)images[i].resource)->surface.u.gfx9.color.dcc.pipe_aligned);
         sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS;
      }
   }

   /* Invalidate the VMEM cache only. The SMEM cache isn't used by shader buffers. */
   sctx->barrier_flags |= SI_BARRIER_INV_VMEM;
   si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
}

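/* Emit barriers after an internal operation: wait for compute, make the stores visible to the
 * caches that will consume them, and mark L2 as dirty for written buffers whose future
 * consumers bypass L2.
 */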
void si_barrier_after_internal_op(struct si_context *sctx, unsigned flags,
                                  unsigned num_buffers,
                                  const struct pipe_shader_buffer *buffers,
                                  unsigned writable_buffers_mask,
                                  unsigned num_images,
                                  const struct pipe_image_view *images)
{
   sctx->barrier_flags |= SI_BARRIER_SYNC_CS;

   if (num_images) {
      /* Make sure image stores are visible to CB, which doesn't use L2 on GFX6-8. */
      sctx->barrier_flags |= sctx->gfx_level <= GFX8 ? SI_BARRIER_WB_L2 : 0;
      /* Make sure image stores are visible to all CUs. */
      sctx->barrier_flags |= SI_BARRIER_INV_VMEM;
   }

   /* Make sure buffer stores are visible to all CUs and also as index/indirect buffers. */
   if (num_buffers)
      sctx->barrier_flags |= SI_BARRIER_INV_SMEM | SI_BARRIER_INV_VMEM | SI_BARRIER_PFP_SYNC_ME;

   /* We must set L2_cache_dirty for buffers because:
    * - GFX6,12: CP DMA doesn't use L2.
    * - GFX6-7,12: Index buffer reads don't use L2.
    * - GFX6-8,12: CP doesn't use L2.
    * - GFX6-8: CB/DB don't use L2.
    *
    * L2_cache_dirty is checked explicitly when buffers are used in those cases to enforce coherency.
    */
   while (writable_buffers_mask)
      si_resource(buffers[u_bit_scan(&writable_buffers_mask)].buffer)->L2_cache_dirty = true;

   /* Make sure RBs see our DCC image stores if RBs and TCCs (L2 instances) are non-coherent. */
   if (sctx->gfx_level >= GFX10 && sctx->screen->info.tcc_rb_non_coherent) {
      for (unsigned i = 0; i < num_images; i++) {
         if (vi_dcc_enabled((struct si_texture*)images[i].resource, images[i].u.tex.level) &&
             images[i].access & PIPE_IMAGE_ACCESS_WRITE &&
             (sctx->screen->always_allow_dcc_stores ||
              images[i].access & SI_IMAGE_ACCESS_ALLOW_DCC_STORE)) {
            sctx->barrier_flags |= SI_BARRIER_INV_L2;
            break;
         }
      }
   }

   si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
}

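/* Pack dst (and optionally src) into a pipe_shader_buffer array for the simple buffer op
 * barrier helpers below.
 */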
static void si_set_dst_src_barrier_buffers(struct pipe_shader_buffer *buffers,
                                           struct pipe_resource *dst, struct pipe_resource *src)
{
   assert(dst);
   memset(buffers, 0, sizeof(buffers[0]) * 2);
   /* Only the "buffer" field is going to be used. */
   buffers[0].buffer = dst;
   buffers[1].buffer = src;
}

/* This is for simple buffer ops that have 1 dst and 0-1 src. */
void si_barrier_before_simple_buffer_op(struct si_context *sctx, unsigned flags,
                                        struct pipe_resource *dst, struct pipe_resource *src)
{
   struct pipe_shader_buffer barrier_buffers[2];
   si_set_dst_src_barrier_buffers(barrier_buffers, dst, src);
   si_barrier_before_internal_op(sctx, flags, src ? 2 : 1, barrier_buffers, 0x1, 0, NULL);
}

/* This is for simple buffer ops that have 1 dst and 0-1 src. */
void si_barrier_after_simple_buffer_op(struct si_context *sctx, unsigned flags,
                                       struct pipe_resource *dst, struct pipe_resource *src)
{
   struct pipe_shader_buffer barrier_buffers[2];
   si_set_dst_src_barrier_buffers(barrier_buffers, dst, src);
   si_barrier_after_internal_op(sctx, flags, src ? 2 : 1, barrier_buffers, 0x1, 0, NULL);
}

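/* pipe_context::texture_barrier - make framebuffer writes visible to texture sampling. */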
static void si_texture_barrier(struct pipe_context *ctx, unsigned flags)
{
   si_fb_barrier_after_rendering((struct si_context *)ctx, SI_FB_BARRIER_SYNC_CB);
}

/* This enforces coherency between shader stores and any past and future access. */
static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
{
   struct si_context *sctx = (struct si_context *)ctx;

   /* Ignore PIPE_BARRIER_UPDATE_BUFFER - it synchronizes against updates like buffer_subdata. */
   /* Ignore PIPE_BARRIER_UPDATE_TEXTURE - it synchronizes against updates like texture_subdata. */
   /* Ignore PIPE_BARRIER_MAPPED_BUFFER - it synchronizes against buffer_map/unmap. */
   /* Ignore PIPE_BARRIER_QUERY_BUFFER - the GL spec description is confusing, and the driver
    * always inserts barriers around get_query_result_resource.
    */
   flags &= ~PIPE_BARRIER_UPDATE_BUFFER & ~PIPE_BARRIER_UPDATE_TEXTURE &
            ~PIPE_BARRIER_MAPPED_BUFFER & ~PIPE_BARRIER_QUERY_BUFFER;

   if (!flags)
      return;

   sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS;

   if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
      sctx->barrier_flags |= SI_BARRIER_INV_SMEM | SI_BARRIER_INV_VMEM;

   /* VMEM cache contents are written back to L2 automatically at the end of waves, but
    * the contents of other VMEM caches might still be stale.
    *
    * TEXTURE and IMAGE mean sampler buffers and image buffers, respectively.
    */
   if (flags & (PIPE_BARRIER_VERTEX_BUFFER | PIPE_BARRIER_SHADER_BUFFER | PIPE_BARRIER_TEXTURE |
                PIPE_BARRIER_IMAGE | PIPE_BARRIER_STREAMOUT_BUFFER | PIPE_BARRIER_GLOBAL_BUFFER))
      sctx->barrier_flags |= SI_BARRIER_INV_VMEM;

   if (flags & (PIPE_BARRIER_INDEX_BUFFER | PIPE_BARRIER_INDIRECT_BUFFER))
      sctx->barrier_flags |= SI_BARRIER_PFP_SYNC_ME;

   /* Index buffers use L2 since GFX8. */
   if (flags & PIPE_BARRIER_INDEX_BUFFER &&
       (sctx->gfx_level <= GFX7 || sctx->screen->info.cp_sdma_ge_use_system_memory_scope))
      sctx->barrier_flags |= SI_BARRIER_WB_L2;

   /* Indirect buffers use L2 since GFX9. */
   if (flags & PIPE_BARRIER_INDIRECT_BUFFER &&
       (sctx->gfx_level <= GFX8 || sctx->screen->info.cp_sdma_ge_use_system_memory_scope))
      sctx->barrier_flags |= SI_BARRIER_WB_L2;

   /* MSAA color images are flushed in si_decompress_textures when needed.
    * Shaders never write to depth/stencil images.
    */
   if (flags & PIPE_BARRIER_FRAMEBUFFER && sctx->framebuffer.uncompressed_cb_mask) {
      sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_CB;

      if (sctx->gfx_level >= GFX10 && sctx->gfx_level < GFX12) {
         if (sctx->screen->info.tcc_rb_non_coherent)
            sctx->barrier_flags |= SI_BARRIER_INV_L2;
         else /* We don't know which shaders do image stores with DCC: */
            sctx->barrier_flags |= SI_BARRIER_INV_L2_METADATA;
      } else if (sctx->gfx_level == GFX9) {
         /* We have to invalidate L2 for MSAA and when DCC can have pipe_aligned=0. */
         sctx->barrier_flags |= SI_BARRIER_INV_L2;
      } else if (sctx->gfx_level <= GFX8) {
         /* CB doesn't use L2 on GFX6-8. */
         sctx->barrier_flags |= SI_BARRIER_WB_L2;
      }
   }

   si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
}

static void si_set_sampler_depth_decompress_mask(struct si_context *sctx, struct si_texture *tex)
{
   assert(sctx->gfx_level < GFX12);

   /* Check all sampler bindings in all shaders where depth textures are bound, and update
    * which samplers should be decompressed.
    */
   u_foreach_bit(sh, sctx->shader_has_depth_tex) {
      u_foreach_bit(i, sctx->samplers[sh].has_depth_tex_mask) {
         if (sctx->samplers[sh].views[i]->texture == &tex->buffer.b.b) {
            sctx->samplers[sh].needs_depth_decompress_mask |= 1 << i;
            sctx->shader_needs_decompress_mask |= 1 << sh;
         }
      }
   }
}

void si_fb_barrier_before_rendering(struct si_context *sctx)
{
   /* Wait for all shaders because all image loads must finish before CB/DB can write there. */
   if (sctx->framebuffer.state.nr_cbufs || sctx->framebuffer.state.zsbuf) {
      sctx->barrier_flags |= SI_BARRIER_SYNC_CS | SI_BARRIER_SYNC_PS;
      si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
   }
}

void si_fb_barrier_after_rendering(struct si_context *sctx, unsigned flags)
{
   if (sctx->gfx_level < GFX12 && !sctx->decompression_enabled) {
      /* Setting dirty_level_mask should ignore SI_FB_BARRIER_SYNC_* because it triggers
       * decompression, which is not syncing.
       */
      if (sctx->framebuffer.state.zsbuf) {
         struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
         struct si_texture *tex = (struct si_texture *)surf->texture;

         tex->dirty_level_mask |= 1 << surf->u.tex.level;

         if (tex->surface.has_stencil)
            tex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;

         si_set_sampler_depth_decompress_mask(sctx, tex);
      }

      unsigned compressed_cb_mask = sctx->framebuffer.compressed_cb_mask;
      while (compressed_cb_mask) {
         unsigned i = u_bit_scan(&compressed_cb_mask);
         struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i];
         struct si_texture *tex = (struct si_texture *)surf->texture;

         if (tex->surface.fmask_offset) {
            tex->dirty_level_mask |= 1 << surf->u.tex.level;
            tex->fmask_is_identity = false;
         }
      }
   }

   if (flags & SI_FB_BARRIER_SYNC_CB) {
      /* Compressed images (MSAA with FMASK) are flushed on demand in si_decompress_textures.
       *
       * Synchronize CB only if there is actually a bound color buffer.
       */
      if (sctx->framebuffer.uncompressed_cb_mask) {
         si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
                                    sctx->framebuffer.CB_has_shader_readable_metadata,
                                    sctx->framebuffer.all_DCC_pipe_aligned);
      }
   }

   if (flags & SI_FB_BARRIER_SYNC_DB && sctx->framebuffer.state.zsbuf) {
      /* DB caches are flushed on demand (using si_decompress_textures) except the cases below. */
      if (sctx->gfx_level >= GFX12) {
         si_make_DB_shader_coherent(sctx, sctx->framebuffer.nr_samples, true, false);
      } else if (sctx->generate_mipmap_for_depth) {
         /* u_blitter doesn't invoke depth decompression when it does multiple blits in a row,
          * but the only case when it matters for DB is when doing generate_mipmap, which writes Z,
          * which is always uncompressed. So here we flush DB manually between individual
          * generate_mipmap blits.
          */
         si_make_DB_shader_coherent(sctx, 1, false, sctx->framebuffer.DB_has_shader_readable_metadata);
      } else if (sctx->screen->info.family == CHIP_NAVI33) {
         struct si_surface *old_zsurf = (struct si_surface *)sctx->framebuffer.state.zsbuf;
         struct si_texture *old_ztex = (struct si_texture *)old_zsurf->base.texture;

         if (old_ztex->upgraded_depth) {
            /* TODO: some failures related to hyperz appeared after 969ed851 on nv33:
             * - piglit tex-miplevel-selection
             * - KHR-GL46.direct_state_access.framebuffers_texture_attachment
             * - GTF-GL46.gtf30.GL3Tests.blend_minmax.blend_minmax_draw
             * - KHR-GL46.direct_state_access.framebuffers_texture_layer_attachment
             *
             * This seems to fix them:
             */
            sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_DB | SI_BARRIER_INV_L2;
            si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
         }
      } else if (sctx->gfx_level == GFX9) {
         /* It appears that DB metadata "leaks" in a sequence of:
          * - depth clear
          * - DCC decompress for shader image writes (with DB disabled)
          * - render with DEPTH_BEFORE_SHADER=1
          * Flushing DB metadata works around the problem.
          */
         sctx->barrier_flags |= SI_BARRIER_EVENT_FLUSH_AND_INV_DB_META;
         si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
      }
   }
}

void si_barrier_before_image_fast_clear(struct si_context *sctx, unsigned types)
{
   /* Flush caches and wait for idle. */
   if (types & (SI_CLEAR_TYPE_CMASK | SI_CLEAR_TYPE_DCC)) {
      si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
                                 sctx->framebuffer.CB_has_shader_readable_metadata,
                                 sctx->framebuffer.all_DCC_pipe_aligned);
   }

   if (types & SI_CLEAR_TYPE_HTILE) {
      si_make_DB_shader_coherent(sctx, sctx->framebuffer.nr_samples, sctx->framebuffer.has_stencil,
                                 sctx->framebuffer.DB_has_shader_readable_metadata);
   }

   /* Invalidate the VMEM cache because we always use compute. */
   sctx->barrier_flags |= SI_BARRIER_INV_VMEM;

   /* GFX6-8: CB and DB don't use L2. */
   if (sctx->gfx_level <= GFX8)
      sctx->barrier_flags |= SI_BARRIER_INV_L2;

   si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
}

void si_barrier_after_image_fast_clear(struct si_context *sctx)
{
   /* Wait for idle. */
   sctx->barrier_flags |= SI_BARRIER_SYNC_CS;

   /* GFX6-8: CB and DB don't use L2. */
   if (sctx->gfx_level <= GFX8)
      sctx->barrier_flags |= SI_BARRIER_WB_L2;

   si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
}

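/* Hook up the per-gfx-level barrier emission callback, the barrier atom, and the Gallium
 * memory_barrier/texture_barrier entry points.
 */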
void si_init_barrier_functions(struct si_context *sctx)
{
   if (sctx->gfx_level >= GFX10)
      sctx->emit_barrier = gfx10_emit_barrier;
   else
      sctx->emit_barrier = gfx6_emit_barrier;

   sctx->atoms.s.barrier.emit = si_emit_barrier_as_atom;

   sctx->b.memory_barrier = si_memory_barrier;
   sctx->b.texture_barrier = si_texture_barrier;
}