/*
 * Copyright © 2022 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#include "tu_lrz.h"

#include "tu_clear_blit.h"
#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_image.h"

#include "common/freedreno_gpu_event.h"
#include "common/freedreno_lrz.h"

/* See lrz.rst for how the HW works. Here are only the implementation notes.
 *
 * There are a number of limitations when LRZ cannot be used:
 * - Fragment shader side-effects (writing to SSBOs, atomic operations, etc);
 * - Writing to stencil buffer;
 * - Writing depth while:
 *   - Changing direction of depth test (e.g. from OP_GREATER to OP_LESS);
 *   - Using OP_ALWAYS or OP_NOT_EQUAL;
 * - Clearing depth with vkCmdClearAttachments;
 * - (pre-a650) Not clearing depth attachment with LOAD_OP_CLEAR;
 * - (pre-a650) Using secondary command buffers;
 * - Sysmem rendering (with small caveat).
 *
 * A650+ (gen3+)
 * =============
 *
 * While LRZ could be reused between renderpasses, it is disabled when the
 * underlying depth buffer is changed.
 * The following commands could change a depth image:
 * - vkCmdBlitImage*
 * - vkCmdCopyBufferToImage*
 * - vkCmdCopyImage*
 *
 * LRZ Fast-Clear
 * ==============
 *
 * It's always valid to fast-clear. On the other hand, we disable fast-clear
 * if the depth clear value is not 0.0 or 1.0 because it may be worse for
 * perf if some primitives are expected to fail the depth test against the
 * actual depth clear value.
 *
 * LRZ Caches
 * ==========
 *
 * ! The policy here is to flush the LRZ cache right after it is changed,
 * so if LRZ data is needed afterwards - there is no need to flush it
 * before using LRZ.
 */
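
/* Rough call flow per renderpass: tu_lrz_begin_renderpass() (or the resumed/
 * secondary variants) sets up the CPU-side state, tu_lrz_tiling_begin() /
 * tu_lrz_sysmem_begin() emit the initial LRZ buffer setup and clear,
 * tu_lrz_before_tile() re-emits the state for each tile, tu6_emit_lrz()
 * programs GRAS_LRZ_CNTL per draw, and tu_lrz_tiling_end() /
 * tu_lrz_sysmem_end() flush the LRZ caches at the end.
 */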

static inline void
tu_lrz_disable_reason(struct tu_cmd_buffer *cmd, const char *reason) {
   cmd->state.rp.lrz_disable_reason = reason;
   cmd->state.rp.lrz_disabled_at_draw = cmd->state.rp.drawcall_count;
   perf_debug(cmd->device, "Disabling LRZ because '%s' at draw %u", reason,
              cmd->state.rp.lrz_disabled_at_draw);
}

template <chip CHIP>
static void
tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
{
   if (!depth_image) {
      tu_cs_emit_regs(cs,
                      A6XX_GRAS_LRZ_BUFFER_BASE(0),
                      A6XX_GRAS_LRZ_BUFFER_PITCH(0),
                      A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));

      if (CHIP >= A7XX)
         tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO());

      return;
   }

   uint64_t lrz_iova = depth_image->iova + depth_image->lrz_layout.lrz_offset;
   uint64_t lrz_fc_iova =
      depth_image->iova + depth_image->lrz_layout.lrz_fc_offset;
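   /* A zero fast-clear offset means no fast-clear buffer was allocated for
    * this image.
    */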
   if (!depth_image->lrz_layout.lrz_fc_offset)
      lrz_fc_iova = 0;

   tu_cs_emit_regs(
      cs, A6XX_GRAS_LRZ_BUFFER_BASE(.qword = lrz_iova),
      A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = depth_image->lrz_layout.lrz_pitch,
                                 .array_pitch =
                                    depth_image->lrz_layout.lrz_layer_size),
      A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(.qword = lrz_fc_iova));

   if (CHIP >= A7XX) {
      tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO(
         .depth_format = tu6_pipe2depth(depth_image->vk.format)
      ));
   }
}

static void
tu6_write_lrz_reg(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                  struct tu_reg_value reg)
{
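   /* On GPUs with the LRZ tracking quirk, LRZ register writes must go
    * through CP_REG_WRITE tagged with TRACK_LRZ instead of a plain PKT4
    * write.
    */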
   if (cmd->device->physical_device->info->a6xx.lrz_track_quirk) {
      tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
      tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_LRZ));
      tu_cs_emit(cs, reg.reg);
      tu_cs_emit(cs, reg.value);
   } else {
      tu_cs_emit_pkt4(cs, reg.reg, 1);
      tu_cs_emit(cs, reg.value);
   }
}

template <chip CHIP>
static void
tu6_write_lrz_cntl(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                   struct A6XX_GRAS_LRZ_CNTL cntl)
{
   if (CHIP >= A7XX) {
      /* A7XX split LRZ_CNTL into two separate registers. */
      struct tu_reg_value cntl2 = A7XX_GRAS_LRZ_CNTL2(
         .disable_on_wrong_dir = cntl.disable_on_wrong_dir,
         .fc_enable = cntl.fc_enable,
      );
      cntl.disable_on_wrong_dir = false;
      cntl.fc_enable = false;

      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(cntl));
      tu6_write_lrz_reg(cmd, cs, cntl2);
   } else {
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(cntl));
   }
}

template <chip CHIP>
static void
tu6_disable_lrz_via_depth_view(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* Disable direction by writing invalid depth view. */
   tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
      .base_layer = 0b11111111111,
      .layer_count = 0b11111111111,
      .base_mip_level = 0b1111,
   ));

   tu6_write_lrz_cntl<CHIP>(cmd, cs, {
      .enable = true,
      .disable_on_wrong_dir = true,
   });

   tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_CLEAR);
   tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_FLUSH);
}

static void
tu_lrz_init_state(struct tu_cmd_buffer *cmd,
                  const struct tu_render_pass_attachment *att,
                  const struct tu_image_view *view)
{
   if (!view->image->lrz_layout.lrz_total_size) {
      assert(!cmd->device->use_lrz || !vk_format_has_depth(att->format));
      return;
   }

   bool clears_depth = att->clear_mask &
      (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT);
   bool has_gpu_tracking =
      cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;

   if (!has_gpu_tracking && !clears_depth)
      return;

   /* We need to always have an LRZ view just to disable it if there is a
    * depth attachment, there are any secondaries, and GPU tracking is
    * enabled, in order not to rely on loadOp state which doesn't exist with
    * dynamic rendering in secondaries. Otherwise the secondary will have LRZ
    * enabled and there will be a NULL/garbage LRZ buffer.
    */
   cmd->state.lrz.image_view = view;

   if (!clears_depth && !att->load)
      return;

   cmd->state.lrz.valid = true;
   cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
   /* Be optimistic and unconditionally enable fast-clear in
    * secondary cmdbufs and when reusing previous LRZ state.
    */
   cmd->state.lrz.fast_clear =
      view->image->lrz_layout.lrz_fc_size > 0 && !TU_DEBUG(NOLRZFC);

   cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;
   cmd->state.lrz.reuse_previous_state = !clears_depth;
}

/* Note: if we enable LRZ here, then tu_lrz_init_state() must at least set
 * lrz.image_view, so that an LRZ buffer is present (even if LRZ is
 * dynamically disabled).
 */

static void
tu_lrz_init_secondary(struct tu_cmd_buffer *cmd,
                      const struct tu_render_pass_attachment *att)
{
   bool has_gpu_tracking =
      cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;

   if (!has_gpu_tracking)
      return;

   if (!cmd->device->use_lrz)
      return;

   if (!vk_format_has_depth(att->format))
      return;

   cmd->state.lrz.valid = true;
   cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
   cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;

   /* We may not have the depth attachment when executing in a secondary
    * inside a render pass. This means we have to be even more optimistic than
    * the normal case and enable fast clear even if the depth image doesn't
    * support it.
    */
   cmd->state.lrz.fast_clear = true;

   /* These are not used inside secondaries */
   cmd->state.lrz.image_view = NULL;
   cmd->state.lrz.reuse_previous_state = false;
}

template <chip CHIP>
bool
tu_lrzfc_depth_supported(float depth) {
   /* A7XX supports fast-clearing to any value, while A6XX only supports 0.0/1.0 */
   return CHIP >= A7XX || depth == 0.0f || depth == 1.0f;
}

/* This is generally the same as tu_lrz_begin_renderpass(), but we skip
 * actually emitting anything. The lrz state needs to be consistent between
 * renderpasses, but only the first should actually emit commands to disable
 * lrz etc.
 */
template <chip CHIP>
void
tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd)
{
   /* Track LRZ valid state */
   memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));

   uint32_t a;
   for (a = 0; a < cmd->state.pass->attachment_count; a++) {
      if (cmd->state.attachments[a]->image->lrz_layout.lrz_total_size)
         break;
   }

   if (a != cmd->state.pass->attachment_count) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      tu_lrz_init_state(cmd, att, cmd->state.attachments[a]);
      if (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
         VkClearValue clear = cmd->state.clear_values[a];
         cmd->state.lrz.depth_clear_value = clear;
         cmd->state.lrz.fast_clear = cmd->state.lrz.fast_clear &&
            tu_lrzfc_depth_supported<CHIP>(clear.depthStencil.depth);
      }
      cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
   }
}
TU_GENX(tu_lrz_begin_resumed_renderpass);

template <chip CHIP>
void
tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd)
{
   const struct tu_render_pass *pass = cmd->state.pass;

   cmd->state.rp.lrz_disable_reason = "";
   cmd->state.rp.lrz_disabled_at_draw = 0;

   int lrz_img_count = 0;
   for (unsigned i = 0; i < pass->attachment_count; i++) {
      if (cmd->state.attachments[i]->image->lrz_layout.lrz_total_size)
         lrz_img_count++;
   }

   if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking &&
       cmd->state.pass->subpass_count > 1 && lrz_img_count > 1) {
      /* Theoretically we could switch between LRZ buffers during the binning
       * and tiling passes, but it is untested and would add complexity for a
       * presumably extremely rare case.
       */
      tu_lrz_disable_reason(cmd, "Several subpasses with different depth attachments");

      for (unsigned i = 0; i < pass->attachment_count; i++) {
         struct tu_image *image = cmd->state.attachments[i]->image;
         tu_disable_lrz<CHIP>(cmd, &cmd->cs, image);
      }

      /* We need a valid LRZ fast-clear base, in case the render pass contents
       * are in secondaries that enable LRZ, so that they can read that LRZ is
       * dynamically disabled. It doesn't matter which we use, so just leave
       * the last one as emitted in tu_disable_lrz().
       */
      memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
      return;
   }

   /* Track LRZ valid state */
   tu_lrz_begin_resumed_renderpass<CHIP>(cmd);

   if (!cmd->state.lrz.valid || TU_DEBUG(NOLRZ)) {
      tu6_write_lrz_cntl<CHIP>(cmd, &cmd->cs, {});
      tu6_emit_lrz_buffer<CHIP>(&cmd->cs, NULL);
   }
}
TU_GENX(tu_lrz_begin_renderpass);

void
tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd)
{
   memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
   uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   if (a != VK_ATTACHMENT_UNUSED) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      tu_lrz_init_secondary(cmd, att);
   }
}

template <chip CHIP>
void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* TODO: If lrz was never valid for the entire renderpass, we could exit
    * early here. Sometimes we know this ahead of time and null out
    * image_view, but with LOAD_OP_DONT_CARE this only happens if there were
    * no secondaries.
    */
   if (!cmd->state.lrz.image_view)
      return;

   struct tu_lrz_state *lrz = &cmd->state.lrz;

   tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);

   if (lrz->reuse_previous_state) {
      /* Reuse previous LRZ state, the LRZ cache is assumed to have been
       * invalidated already by the previous renderpass.
       */
      assert(lrz->gpu_dir_tracking);

      tu6_write_lrz_reg(cmd, cs,
         A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
      return;
   }

   if (lrz->disable_for_rp) {
      /* We may deem it necessary to disable LRZ for the whole renderpass.
       * This is accomplished by making the later GRAS_LRZ_CNTL (in the
       * binning pass) fail the comparison of depth views.
       * TODO: Find out if there are conditions where this is beneficial.
       */
      tu6_disable_lrz_via_depth_view<CHIP>(cmd, cs);
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
   } else if (lrz->fast_clear || lrz->gpu_dir_tracking) {
      if (lrz->gpu_dir_tracking) {
         tu6_write_lrz_reg(cmd, cs,
            A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
      }

      tu6_write_lrz_cntl<CHIP>(cmd, cs, {
         .enable = true,
         .fc_enable = lrz->fast_clear,
         .disable_on_wrong_dir = lrz->gpu_dir_tracking,
      });

      /* LRZ_CLEAR.fc_enable + LRZ_CLEAR - clears fast-clear buffer;
       * LRZ_CLEAR.disable_on_wrong_dir + LRZ_CLEAR - sets direction to
       * CUR_DIR_UNSET.
       */
      if (CHIP >= A7XX)
         tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_CLEAR_DEPTH_F32(lrz->depth_clear_value.depthStencil.depth));
      tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_CLEAR);
   }

   if (!lrz->fast_clear && !lrz->disable_for_rp) {
      tu6_clear_lrz<CHIP>(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
      /* Even though we disable fast-clear we still have to dirty the
       * fast-clear buffer, because both secondary cmdbufs and following
       * renderpasses won't know that fast-clear is disabled.
       *
       * TODO: we could avoid this if we don't store depth and don't
       * expect secondary cmdbufs.
       */
      if (lrz->image_view->image->lrz_layout.lrz_fc_size > 0) {
         tu6_dirty_lrz_fc<CHIP>(cmd, cs, lrz->image_view->image);
      }
   }
}
TU_GENX(tu_lrz_tiling_begin);

/* We need to re-emit LRZ state before each tile due to skipsaverestore.
 */
template <chip CHIP>
void
tu_lrz_before_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   struct tu_lrz_state *lrz = &cmd->state.lrz;

   if (!lrz->image_view) {
      tu6_emit_lrz_buffer<CHIP>(cs, NULL);
   } else {
      tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);

      if (lrz->gpu_dir_tracking) {
         if (lrz->disable_for_rp) {
            /* Make sure we fail the comparison of depth views */
            tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
         } else {
            tu6_write_lrz_reg(cmd, cs,
               A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
         }
      }
   }
}
TU_GENX(tu_lrz_before_tile);

template <chip CHIP>
void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->state.lrz.fast_clear || cmd->state.lrz.gpu_dir_tracking) {
      tu6_emit_lrz_buffer<CHIP>(cs, cmd->state.lrz.image_view->image);

      if (cmd->state.lrz.gpu_dir_tracking) {
         tu6_write_lrz_reg(cmd, &cmd->cs,
            A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = cmd->state.lrz.image_view->view.GRAS_LRZ_DEPTH_VIEW));
      }

      /* Enable flushing of LRZ fast-clear and of direction buffer */
      tu6_write_lrz_cntl<CHIP>(cmd, cs, {
         .enable = true,
         .fc_enable = cmd->state.lrz.fast_clear,
         .disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking,
      });
   } else {
      tu6_write_lrz_cntl<CHIP>(cmd, cs, {.enable = false});
   }

   /* If LRZ became invalid during the renderpass but we never emitted the
    * GPU-side disable, disable it here so that the next renderpass doesn't
    * use the invalid LRZ values.
    */
   if (cmd->state.lrz.gpu_dir_tracking && !cmd->state.lrz.disable_for_rp &&
       !cmd->state.lrz.valid) {
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
         .base_layer = 0b11111111111,
         .layer_count = 0b11111111111,
         .base_mip_level = 0b1111,
      ));

      tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_CLEAR);
      tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_FLUSH);
   } else {
      tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_FLUSH);
   }
}
TU_GENX(tu_lrz_tiling_end);

template <chip CHIP>
void
tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->device->physical_device->info->a6xx.has_lrz_feedback) {
      tu_lrz_tiling_begin<CHIP>(cmd, cs);
      return;
   }

   if (!cmd->state.lrz.image_view)
      return;

   /* Actually, the LRZ buffer could be filled in sysmem, in theory to
    * be used in another renderpass, but the benefit is rather dubious.
    */

   struct tu_lrz_state *lrz = &cmd->state.lrz;

   if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) {
      tu_disable_lrz<CHIP>(cmd, cs, lrz->image_view->image);
      /* Make sure the depth view comparison will fail. */
      tu6_write_lrz_reg(cmd, cs,
                        A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
   } else {
      tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);
      /* Even though we disable LRZ writes in sysmem mode, the LRZ test
       * still happens, so LRZ should be cleared.
       */
      if (lrz->fast_clear) {
         tu6_write_lrz_cntl<CHIP>(cmd, &cmd->cs, {
            .enable = true,
            .fc_enable = true,
         });

         if (CHIP >= A7XX)
            tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_CLEAR_DEPTH_F32(lrz->depth_clear_value.depthStencil.depth));
         tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_CLEAR);
         tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_FLUSH);
      } else {
         tu6_clear_lrz<CHIP>(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
      }
   }
}
TU_GENX(tu_lrz_sysmem_begin);

template <chip CHIP>
void
tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->device->physical_device->info->a6xx.has_lrz_feedback) {
      tu_lrz_tiling_end<CHIP>(cmd, cs);
      return;
   }

   tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_FLUSH);
}
TU_GENX(tu_lrz_sysmem_end);

/* Disable LRZ outside of renderpass. */
template <chip CHIP>
void
tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
               struct tu_image *image)
{
   if (!cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   if (!image->lrz_layout.lrz_total_size)
      return;

   tu6_emit_lrz_buffer<CHIP>(cs, image);
   tu6_disable_lrz_via_depth_view<CHIP>(cmd, cs);
}
TU_GENX(tu_disable_lrz);

/* Disable LRZ from the CPU, for host image copy */
template <chip CHIP>
void
tu_disable_lrz_cpu(struct tu_device *device, struct tu_image *image)
{
   if (!device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   if (!image->lrz_layout.lrz_total_size)
      return;

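   /* The direction-tracking byte lives inside the per-image fast-clear buffer
    * (see fd_lrzfc_layout); writing DISABLED here from the CPU mirrors the
    * GPU-side disable done by tu_disable_lrz().
    */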
   const unsigned lrz_dir_offset = offsetof(fd_lrzfc_layout<CHIP>, dir_track);
   uint8_t *lrz_dir_tracking =
      (uint8_t *)image->map + image->lrz_layout.lrz_fc_offset + lrz_dir_offset;

   *lrz_dir_tracking = FD_LRZ_GPU_DIR_DISABLED;

   if (image->bo->cached_non_coherent) {
      tu_bo_sync_cache(
         device, image->bo,
         image->bo_offset + image->lrz_layout.lrz_fc_offset + lrz_dir_offset, 1,
         TU_MEM_SYNC_CACHE_TO_GPU);
   }
}
TU_GENX(tu_disable_lrz_cpu);

/* Clear LRZ, used for out of renderpass depth clears. */
template <chip CHIP>
void
tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd,
                         struct tu_image *image,
                         const VkClearDepthStencilValue *pDepthStencil,
                         uint32_t rangeCount,
                         const VkImageSubresourceRange *pRanges)
{
   if (!rangeCount || !image->lrz_layout.lrz_total_size ||
       !cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   /* We cannot predict which depth subresource would be used later on,
    * so we just pick the first one with depth cleared and clear the LRZ.
    */
   const VkImageSubresourceRange *range = NULL;
   for (unsigned i = 0; i < rangeCount; i++) {
      if (pRanges[i].aspectMask &
          (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
         range = &pRanges[i];
         break;
      }
   }

   if (!range)
      return;

   bool fast_clear = image->lrz_layout.lrz_fc_size &&
      tu_lrzfc_depth_supported<CHIP>(pDepthStencil->depth) &&
      !TU_DEBUG(NOLRZFC);

   tu6_emit_lrz_buffer<CHIP>(&cmd->cs, image);

   tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
      .base_layer = range->baseArrayLayer,
      .layer_count = vk_image_subresource_layer_count(&image->vk, range),
      .base_mip_level = range->baseMipLevel,
   ));

   tu6_write_lrz_cntl<CHIP>(cmd, &cmd->cs, {
      .enable = true,
      .fc_enable = fast_clear,
      .disable_on_wrong_dir = true,
   });

   if (CHIP >= A7XX)
      tu_cs_emit_regs(&cmd->cs, A7XX_GRAS_LRZ_CLEAR_DEPTH_F32(pDepthStencil->depth));
   tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_CLEAR);
   tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_FLUSH);

   if (!fast_clear) {
      tu6_clear_lrz<CHIP>(cmd, &cmd->cs, image, (const VkClearValue*) pDepthStencil);
   }
}
TU_GENX(tu_lrz_clear_depth_image);

template <chip CHIP>
void
tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd,
                                 const char *reason)
{
   assert(cmd->state.pass);

   tu_lrz_disable_reason(cmd, reason);

   cmd->state.lrz.valid = false;
   cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
}
TU_GENX(tu_lrz_disable_during_renderpass);

template <chip CHIP>
void
tu_lrz_flush_valid_during_renderpass(struct tu_cmd_buffer *cmd,
                                     struct tu_cs *cs)
{
   if (cmd->state.lrz.valid || cmd->state.lrz.disable_for_rp)
      return;

   tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
      .base_layer = 0b11111111111,
      .layer_count = 0b11111111111,
      .base_mip_level = 0b1111,
   ));
}
TU_GENX(tu_lrz_flush_valid_during_renderpass);

/* update lrz state based on stencil-test func:
 *
 * Conceptually the order of the pipeline is:
 *
 *
 *   FS -> Alpha-Test  ->  Stencil-Test  ->  Depth-Test
 *                              |                |
 *                        if wrmask != 0     if wrmask != 0
 *                              |                |
 *                              v                v
 *                        Stencil-Write      Depth-Write
 *
 * Because Stencil-Test can have side effects (Stencil-Write) prior
 * to depth test, in this case we potentially need to disable early
 * lrz-test. See:
 *
 *   https://www.khronos.org/opengl/wiki/Per-Sample_Processing
 */
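/* For example, with stencil func VK_COMPARE_OP_ALWAYS and stencil writes
 * enabled, a fragment that fails the depth test must still perform its
 * stencil write (depthFailOp), so LRZ must not be allowed to kill it early.
 */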
static bool
tu6_stencil_op_lrz_allowed(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
                           VkCompareOp func,
                           bool stencil_write)
{
   switch (func) {
   case VK_COMPARE_OP_ALWAYS:
      /* nothing to do for LRZ, but for stencil test when stencil-
       * write is enabled, we need to disable lrz-test, since
       * conceptually stencil test and write happen before depth-test.
       */
      if (stencil_write) {
         return false;
      }
      break;
   case VK_COMPARE_OP_NEVER:
      /* fragment never passes, disable lrz_write for this draw. */
      gras_lrz_cntl->lrz_write = false;
      break;
   default:
      /* whether the fragment passes or not depends on the result
       * of the stencil test, which we cannot know when doing the
       * binning pass.
       */
      gras_lrz_cntl->lrz_write = false;
      /* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side
       * effects from the stencil test we need to disable lrz-test.
       */
      if (stencil_write) {
         return false;
      }
      break;
   }

   return true;
}

template <chip CHIP>
static struct A6XX_GRAS_LRZ_CNTL
tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
                        const uint32_t a)
{
   const struct tu_shader *fs = cmd->state.shaders[MESA_SHADER_FRAGMENT];
   bool z_test_enable = cmd->vk.dynamic_graphics_state.ds.depth.test_enable;
   bool z_write_enable = cmd->vk.dynamic_graphics_state.ds.depth.write_enable;
   bool z_bounds_enable = cmd->vk.dynamic_graphics_state.ds.depth.bounds_test.enable;
   VkCompareOp depth_compare_op =
      cmd->vk.dynamic_graphics_state.ds.depth.compare_op;

   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };

   if (!cmd->state.lrz.valid) {
      return gras_lrz_cntl;
   }

   /* If depth test is disabled we shouldn't touch LRZ.
    * Same if there is no depth attachment.
    */
   if (a == VK_ATTACHMENT_UNUSED || !z_test_enable || !cmd->device->use_lrz)
      return gras_lrz_cntl;

   if (!cmd->state.lrz.gpu_dir_tracking && !cmd->state.attachments) {
      /* Without on-gpu LRZ direction tracking - there is nothing we
       * can do to enable LRZ in secondary command buffers.
       */
      return gras_lrz_cntl;
   }

   /* See comment in tu_pipeline about disabling LRZ write for blending. */
   bool reads_dest = cmd->state.blend_reads_dest;

   gras_lrz_cntl.enable = true;
   gras_lrz_cntl.lrz_write =
      z_write_enable &&
      !reads_dest &&
      !(fs->fs.lrz.status & TU_LRZ_FORCE_DISABLE_WRITE);
   gras_lrz_cntl.z_test_enable = z_write_enable;
   gras_lrz_cntl.z_bounds_enable = z_bounds_enable;
   gras_lrz_cntl.fc_enable = cmd->state.lrz.fast_clear;
   gras_lrz_cntl.dir_write = cmd->state.lrz.gpu_dir_tracking;
   gras_lrz_cntl.disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking;

   if (CHIP >= A7XX)
      gras_lrz_cntl.z_func = tu6_compare_func(depth_compare_op);

   /* LRZ is disabled until it is cleared, which means that one "wrong"
    * depth test or shader could disable LRZ until depth buffer is cleared.
    */
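   /* disable_lrz invalidates LRZ for the rest of the renderpass, while
    * temporary_disable_lrz only skips LRZ for the current draw.
    */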
   bool disable_lrz = false;
   bool temporary_disable_lrz = false;

   /* What happens in FS could affect LRZ, e.g.: writes to gl_FragDepth or early
    * fragment tests. We have to skip LRZ testing and updating, but as long as
    * the depth direction stayed the same we can continue with LRZ testing later.
    */
   if (fs->fs.lrz.status & TU_LRZ_FORCE_DISABLE_LRZ) {
      if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN || !cmd->state.lrz.gpu_dir_tracking) {
         perf_debug(cmd->device, "Skipping LRZ due to FS");
         temporary_disable_lrz = true;
      } else {
         tu_lrz_disable_reason(cmd, "FS writes depth or has side-effects (TODO: fix for gpu-direction-tracking case)");
         disable_lrz = true;
      }
   }

   /* If Z is not written - it doesn't affect LRZ buffer state.
    * Which means two things:
    * - Don't lock direction until Z is written for the first time;
    * - If Z isn't written and direction IS locked it's possible to just
    *   temporarily disable LRZ instead of fully bailing out, when direction
    *   is changed.
    */

   enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
   switch (depth_compare_op) {
   case VK_COMPARE_OP_ALWAYS:
   case VK_COMPARE_OP_NOT_EQUAL:
      /* OP_ALWAYS and OP_NOT_EQUAL could have depth values of any direction,
       * so if there is a depth write - LRZ must be disabled.
       */
      if (z_write_enable) {
         tu_lrz_disable_reason(cmd, "Depth write + ALWAYS/NOT_EQUAL");
         disable_lrz = true;
         gras_lrz_cntl.dir = LRZ_DIR_INVALID;
      } else {
         perf_debug(cmd->device, "Skipping LRZ due to ALWAYS/NOT_EQUAL");
         temporary_disable_lrz = true;
      }
      break;
   case VK_COMPARE_OP_EQUAL:
   case VK_COMPARE_OP_NEVER:
      /* The blob disables LRZ for OP_EQUAL, and from our empirical
       * evidence it is the right thing to do.
       *
       * Both OP_EQUAL and OP_NEVER don't change the LRZ buffer so
       * we could just temporarily disable LRZ.
       */
      temporary_disable_lrz = true;
      break;
   case VK_COMPARE_OP_GREATER:
   case VK_COMPARE_OP_GREATER_OR_EQUAL:
      lrz_direction = TU_LRZ_GREATER;
      gras_lrz_cntl.greater = true;
      gras_lrz_cntl.dir = LRZ_DIR_GE;
      break;
   case VK_COMPARE_OP_LESS:
   case VK_COMPARE_OP_LESS_OR_EQUAL:
      lrz_direction = TU_LRZ_LESS;
      gras_lrz_cntl.greater = false;
      gras_lrz_cntl.dir = LRZ_DIR_LE;
      break;
   default:
      unreachable("bad VK_COMPARE_OP value or uninitialized");
      break;
   }

   /* If depthfunc direction is changed, bail out on using LRZ. The
    * LRZ buffer encodes a min/max depth value per block, but if
    * we switch from GT/GE <-> LT/LE, those values cannot be
    * interpreted properly.
    */
   if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
       lrz_direction != TU_LRZ_UNKNOWN &&
       cmd->state.lrz.prev_direction != lrz_direction) {
      if (z_write_enable) {
         tu_lrz_disable_reason(cmd, "Depth write + compare-op direction change");
         disable_lrz = true;
      } else {
         perf_debug(cmd->device, "Skipping LRZ due to direction change");
         temporary_disable_lrz = true;
      }
   }

   /* Consider the following sequence of depthfunc changes:
    *
    * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_GREATER
    *   LRZ is disabled during COMPARE_OP_EQUAL but could be enabled
    *   during the second VK_COMPARE_OP_GREATER.
    *
    * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_LESS
    *   Here, LRZ is disabled during COMPARE_OP_EQUAL and should become
    *   invalid during COMPARE_OP_LESS.
    *
    * This shows that we should keep the last KNOWN direction.
    */
   if (z_write_enable && lrz_direction != TU_LRZ_UNKNOWN)
      cmd->state.lrz.prev_direction = lrz_direction;

   /* Invalidate LRZ and disable write if stencil test is enabled */
   bool stencil_test_enable = cmd->vk.dynamic_graphics_state.ds.stencil.test_enable;
   if (!disable_lrz && stencil_test_enable) {
      VkCompareOp stencil_front_compare_op = (VkCompareOp)
         cmd->vk.dynamic_graphics_state.ds.stencil.front.op.compare;

      VkCompareOp stencil_back_compare_op = (VkCompareOp)
         cmd->vk.dynamic_graphics_state.ds.stencil.back.op.compare;

      bool lrz_allowed = true;
      lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
         &gras_lrz_cntl, stencil_front_compare_op,
         cmd->state.stencil_front_write);

      lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
         &gras_lrz_cntl, stencil_back_compare_op,
         cmd->state.stencil_back_write);

      /* Without depth write it's enough to make sure that depth test
       * is executed after stencil test, so temporarily disabling LRZ is enough.
       */
      if (!lrz_allowed) {
         if (z_write_enable) {
            tu_lrz_disable_reason(cmd, "Stencil write");
            disable_lrz = true;
         } else {
            perf_debug(cmd->device, "Skipping LRZ due to stencil write");
            temporary_disable_lrz = true;
         }
      }
   }

   /* Writing depth with blend enabled means we need to invalidate LRZ,
    * because the written depth value could mean that a later draw with
    * depth enabled (where we would otherwise write LRZ) could have
    * fragments which don't pass the depth test due to this draw. For
    * example, consider this sequence of draws, with depth mode GREATER:
    *
    * draw A:
    *    z=0.1, fragments pass
    * draw B:
    *    z=0.4, fragments pass
    *    blend enabled (LRZ write disabled)
    *    depth write enabled
    * draw C:
    *    z=0.2, fragments don't pass
    *    blend disabled
    *    depth write enabled
    *
    * Normally looking at the state in draw C, we'd assume we could
    * enable LRZ write. But this would cause early-z/lrz to discard
    * fragments from draw A which should be visible due to draw B.
    */
   if (reads_dest && z_write_enable && cmd->device->instance->conservative_lrz) {
      tu_lrz_disable_reason(cmd, "Depth write + blending");
      disable_lrz = true;
   }

   if (disable_lrz)
      cmd->state.lrz.valid = false;

   if (temporary_disable_lrz || disable_lrz)
      gras_lrz_cntl.enable = false;

   cmd->state.lrz.enabled = cmd->state.lrz.valid && gras_lrz_cntl.enable;
   if (!cmd->state.lrz.enabled)
      memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));

   return gras_lrz_cntl;
}

template <chip CHIP>
void
tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state<CHIP>(cmd, a);

   tu6_write_lrz_cntl<CHIP>(cmd, cs, gras_lrz_cntl);
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));
}
TU_GENX(tu6_emit_lrz);