1 /*
2 * Copyright © 2022 Igalia S.L.
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "tu_lrz.h"
7
8 #include "tu_clear_blit.h"
9 #include "tu_cmd_buffer.h"
10 #include "tu_cs.h"
11 #include "tu_image.h"
12
13 #include "common/freedreno_gpu_event.h"
14 #include "common/freedreno_lrz.h"
15
16 /* See lrz.rst for how the HW works. Here are only the implementation notes.
17 *
18 * There are a number of limitations when LRZ cannot be used:
19 * - Fragment shader side-effects (writing to SSBOs, atomic operations, etc);
20 * - Writing to the stencil buffer;
21 * - Writing depth while:
22 * - Changing direction of depth test (e.g. from OP_GREATER to OP_LESS);
23 * - Using OP_ALWAYS or OP_NOT_EQUAL;
24 * - Clearing depth with vkCmdClearAttachments;
25 * - (pre-a650) Not clearing depth attachment with LOAD_OP_CLEAR;
26 * - (pre-a650) Using secondary command buffers;
27 * - Sysmem rendering (with a small caveat).
28 *
29 * A650+ (gen3+)
30 * =============
31 *
32 * While LRZ could be reused between renderpasses, it is disabled when
33 * the underlying depth buffer is changed.
34 * The following commands could change a depth image:
35 * - vkCmdBlitImage*
36 * - vkCmdCopyBufferToImage*
37 * - vkCmdCopyImage*
38 *
39 * LRZ Fast-Clear
40 * ==============
41 *
42 * It's always valid to fast-clear. On the other hand we disable
43 * fast-clear if the depth clear value is not 0.0 or 1.0, because it may
44 * be worse for performance if some primitives are expected to fail the
45 * depth test against the actual depth clear value.
46 *
47 * LRZ Caches
48 * ==========
49 *
50 * ! The policy here is to flush the LRZ cache right after it is changed,
51 * so if LRZ data is needed afterwards there is no need to flush it
52 * before using LRZ.
53 */
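
/* A rough sketch of how the helpers below fit together (illustrative only,
 * based on the functions in this file):
 *
 *   renderpass begin / resume:
 *     tu_lrz_begin_renderpass() / tu_lrz_begin_resumed_renderpass()
 *     tu_lrz_begin_secondary_cmdbuf()           (secondary cmdbufs)
 *   when LRZ-related draw state changes:
 *     tu6_emit_lrz() -> tu6_calculate_lrz_state()
 *   GMEM (tiling) path:
 *     tu_lrz_tiling_begin() -> tu_lrz_before_tile() per tile -> tu_lrz_tiling_end()
 *   sysmem path:
 *     tu_lrz_sysmem_begin() -> tu_lrz_sysmem_end()
 *   outside a renderpass:
 *     tu_disable_lrz() / tu_disable_lrz_cpu() / tu_lrz_clear_depth_image()
 */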
54
55 static inline void
56 tu_lrz_disable_reason(struct tu_cmd_buffer *cmd, const char *reason) {
57 cmd->state.rp.lrz_disable_reason = reason;
58 perf_debug(cmd->device, "Disabling LRZ because '%s'", reason);
59 }
60
61 template <chip CHIP>
62 static void
63 tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
64 {
65 if (!depth_image) {
66 tu_cs_emit_regs(cs,
67 A6XX_GRAS_LRZ_BUFFER_BASE(0),
68 A6XX_GRAS_LRZ_BUFFER_PITCH(0),
69 A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
70
71 if (CHIP >= A7XX)
72 tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO());
73
74 return;
75 }
76
77 uint64_t lrz_iova = depth_image->iova + depth_image->lrz_offset;
78 uint64_t lrz_fc_iova = depth_image->iova + depth_image->lrz_fc_offset;
79 if (!depth_image->lrz_fc_offset)
80 lrz_fc_iova = 0;
81
82 tu_cs_emit_regs(cs,
83 A6XX_GRAS_LRZ_BUFFER_BASE(.qword = lrz_iova),
84 A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = depth_image->lrz_pitch),
85 A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(.qword = lrz_fc_iova));
86
87 if (CHIP >= A7XX) {
88 tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO(
89 .depth_format = tu6_pipe2depth(depth_image->vk.format)
90 ));
91 }
92 }
93
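/* Write a single LRZ-related register. On parts with the lrz_track_quirk the
 * write has to go through CP_REG_WRITE with the TRACK_LRZ tracker so that
 * the CP is aware of LRZ register changes (presumably needed to keep LRZ
 * state correct with concurrent binning); otherwise a plain pkt4 write is
 * enough.
 */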
94 static void
95 tu6_write_lrz_reg(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
96 struct tu_reg_value reg)
97 {
98 if (cmd->device->physical_device->info->a6xx.lrz_track_quirk) {
99 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
100 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_LRZ));
101 tu_cs_emit(cs, reg.reg);
102 tu_cs_emit(cs, reg.value);
103 } else {
104 tu_cs_emit_pkt4(cs, reg.reg, 1);
105 tu_cs_emit(cs, reg.value);
106 }
107 }
108
109 template <chip CHIP>
110 static void
111 tu6_write_lrz_cntl(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
112 struct A6XX_GRAS_LRZ_CNTL cntl)
113 {
114 if (CHIP >= A7XX) {
115 // A7XX split LRZ_CNTL into two separate registers.
116 struct tu_reg_value cntl2 = A7XX_GRAS_LRZ_CNTL2(
117 .disable_on_wrong_dir = cntl.disable_on_wrong_dir,
118 .fc_enable = cntl.fc_enable,
119 );
120 cntl.disable_on_wrong_dir = false;
121 cntl.fc_enable = false;
122
123 tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(cntl));
124 tu6_write_lrz_reg(cmd, cs, cntl2);
125 } else {
126 tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(cntl));
127 }
128 }
129
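/* Latch LRZ into the disabled state on the GPU: write an intentionally
 * invalid GRAS_LRZ_DEPTH_VIEW and emit LRZ_CLEAR with disable_on_wrong_dir
 * set, so that any later depth-view comparison fails and LRZ stays disabled
 * until it is properly cleared again.
 */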
130 template <chip CHIP>
131 static void
132 tu6_disable_lrz_via_depth_view(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
133 {
134 /* Disable direction by writing invalid depth view. */
135 tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
136 .base_layer = 0b11111111111,
137 .layer_count = 0b11111111111,
138 .base_mip_level = 0b1111,
139 ));
140
141 tu6_write_lrz_cntl<CHIP>(cmd, cs, {
142 .enable = true,
143 .disable_on_wrong_dir = true,
144 });
145
146 tu_emit_event_write<A6XX>(cmd, cs, FD_LRZ_CLEAR);
147 tu_emit_event_write<A6XX>(cmd, cs, FD_LRZ_FLUSH);
148 }
149
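/* Set up CPU-side LRZ state for the depth attachment at renderpass begin:
 * decide whether LRZ starts out valid, whether the previous renderpass'
 * LRZ data may be reused, and whether fast-clear can be attempted.
 */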
150 static void
151 tu_lrz_init_state(struct tu_cmd_buffer *cmd,
152 const struct tu_render_pass_attachment *att,
153 const struct tu_image_view *view)
154 {
155 if (!view->image->lrz_height) {
156 assert(!cmd->device->use_lrz || !vk_format_has_depth(att->format));
157 return;
158 }
159
160 bool clears_depth = att->clear_mask &
161 (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT);
162 bool has_gpu_tracking =
163 cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;
164
165 if (!has_gpu_tracking && !clears_depth)
166 return;
167
168 /* If there is a depth attachment, there are any secondaries, and GPU
169 * tracking is enabled, we always need an LRZ view just to be able to
170 * disable LRZ, so that we don't rely on loadOp state, which doesn't exist
171 * with dynamic rendering in secondaries. Otherwise the secondary would
172 * have LRZ enabled and there would be a NULL/garbage LRZ buffer.
173 */
174 cmd->state.lrz.image_view = view;
175
176 if (!clears_depth && !att->load)
177 return;
178
179 cmd->state.lrz.valid = true;
180 cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
181 /* Be optimistic and unconditionally enable fast-clear in
182 * secondary cmdbufs and when reusing previous LRZ state.
183 */
184 cmd->state.lrz.fast_clear = view->image->has_lrz_fc;
185
186 cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;
187 cmd->state.lrz.reuse_previous_state = !clears_depth;
188 }
189
190 /* Note: if we enable LRZ here, then tu_lrz_init_state() must at least set
191 * lrz.image_view, so that an LRZ buffer is present (even if LRZ is
192 * dynamically disabled).
193 */
194
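/* Secondary command buffers may not know the depth attachment's image, so
 * be optimistic: assume LRZ is valid and fast-clear capable, and rely on GPU
 * direction tracking to disable LRZ dynamically if the primary decided
 * otherwise.
 */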
195 static void
196 tu_lrz_init_secondary(struct tu_cmd_buffer *cmd,
197 const struct tu_render_pass_attachment *att)
198 {
199 bool has_gpu_tracking =
200 cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;
201
202 if (!has_gpu_tracking)
203 return;
204
205 if (!cmd->device->use_lrz)
206 return;
207
208 if (!vk_format_has_depth(att->format))
209 return;
210
211 cmd->state.lrz.valid = true;
212 cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
213 cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;
214
215 /* We may not have the depth attachment when executing in a secondary
216 * inside a render pass. This means we have to be even more optimistic than
217 * the normal case and enable fast clear even if the depth image doesn't
218 * support it.
219 */
220 cmd->state.lrz.fast_clear = true;
221
222 /* These are not used inside secondaries */
223 cmd->state.lrz.image_view = NULL;
224 cmd->state.lrz.reuse_previous_state = false;
225 }
226
227 template <chip CHIP>
228 bool
229 tu_lrzfc_depth_supported(float depth) {
230 /* A7XX supports fast-clearing to any value, while A6XX only supports 0.0/1.0 */
231 return CHIP >= A7XX || depth == 0.0f || depth == 1.0f;
232 }
233
234 /* This is generally the same as tu_lrz_begin_renderpass(), but we skip
235 * actually emitting anything. The lrz state needs to be consistent between
236 * renderpasses, but only the first should actually emit commands to disable
237 * lrz etc.
238 */
239 template <chip CHIP>
240 void
241 tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd)
242 {
243 /* Track LRZ valid state */
244 memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
245
246 uint32_t a;
247 for (a = 0; a < cmd->state.pass->attachment_count; a++) {
248 if (cmd->state.attachments[a]->image->lrz_height)
249 break;
250 }
251
252 if (a != cmd->state.pass->attachment_count) {
253 const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
254 tu_lrz_init_state(cmd, att, cmd->state.attachments[a]);
255 if (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
256 VkClearValue clear = cmd->state.clear_values[a];
257 cmd->state.lrz.depth_clear_value = clear;
258 cmd->state.lrz.fast_clear = cmd->state.lrz.fast_clear &&
259 tu_lrzfc_depth_supported<CHIP>(clear.depthStencil.depth);
260 }
261 cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
262 }
263 }
264 TU_GENX(tu_lrz_begin_resumed_renderpass);
265
266 template <chip CHIP>
267 void
268 tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd)
269 {
270 const struct tu_render_pass *pass = cmd->state.pass;
271
272 cmd->state.rp.lrz_disable_reason = "";
273
274 int lrz_img_count = 0;
275 for (unsigned i = 0; i < pass->attachment_count; i++) {
276 if (cmd->state.attachments[i]->image->lrz_height)
277 lrz_img_count++;
278 }
279
280 if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking &&
281 cmd->state.pass->subpass_count > 1 && lrz_img_count > 1) {
282 /* Theoretically we could switch between LRZ buffers during the binning
283 * and tiling passes, but it is untested and would add complexity for a
284 * presumably extremely rare case.
285 */
286 tu_lrz_disable_reason(cmd, "Several subpasses with different depth attachments");
287
288 for (unsigned i = 0; i < pass->attachment_count; i++) {
289 struct tu_image *image = cmd->state.attachments[i]->image;
290 tu_disable_lrz<CHIP>(cmd, &cmd->cs, image);
291 }
292
293 /* We need a valid LRZ fast-clear base, in case the render pass contents
294 * are in secondaries that enable LRZ, so that they can read that LRZ is
295 * dynamically disabled. It doesn't matter which we use, so just leave
296 * the last one as emitted in tu_disable_lrz().
297 */
298 memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
299 return;
300 }
301
302 /* Track LRZ valid state */
303 tu_lrz_begin_resumed_renderpass<CHIP>(cmd);
304
305 if (!cmd->state.lrz.valid) {
306 tu6_emit_lrz_buffer<CHIP>(&cmd->cs, NULL);
307 }
308 }
309 TU_GENX(tu_lrz_begin_renderpass);
310
311 void
312 tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd)
313 {
314 memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
315 uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
316 if (a != VK_ATTACHMENT_UNUSED) {
317 const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
318 tu_lrz_init_secondary(cmd, att);
319 }
320 }
321
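/* Emitted once at the start of the tiling (GMEM) pass: point the HW at the
 * LRZ buffer and then either reuse the previous LRZ state, latch LRZ
 * disabled via an invalid depth view, or (fast-)clear the LRZ buffer for
 * this renderpass.
 */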
322 template <chip CHIP>
323 void
324 tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
325 {
326 /* TODO: If lrz was never valid for the entire renderpass, we could exit
327 * early here. Sometimes we know this ahead of time and null out
328 * image_view, but with LOAD_OP_DONT_CARE this only happens if there were
329 * no secondaries.
330 */
331 if (!cmd->state.lrz.image_view)
332 return;
333
334 struct tu_lrz_state *lrz = &cmd->state.lrz;
335
336 tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);
337
338 if (lrz->reuse_previous_state) {
339 /* Reuse previous LRZ state, LRZ cache is assumed to be
340 * already invalidated by previous renderpass.
341 */
342 assert(lrz->gpu_dir_tracking);
343
344 tu6_write_lrz_reg(cmd, cs,
345 A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
346 return;
347 }
348
349 bool invalidate_lrz = !lrz->valid && lrz->gpu_dir_tracking;
350 if (invalidate_lrz) {
351 /* Following the blob we elect to disable LRZ for the whole renderpass
352 * if it is known that LRZ is disabled somewhere in the renderpass.
353 *
354 * This is accomplished by making later GRAS_LRZ_CNTL (in the binning
355 * pass) fail the comparison of depth views.
356 */
357 tu6_disable_lrz_via_depth_view<CHIP>(cmd, cs);
358 tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
359 } else if (lrz->fast_clear || lrz->gpu_dir_tracking) {
360 if (lrz->gpu_dir_tracking) {
361 tu6_write_lrz_reg(cmd, cs,
362 A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
363 }
364
365 tu6_write_lrz_cntl<CHIP>(cmd, cs, {
366 .enable = true,
367 .fc_enable = lrz->fast_clear,
368 .disable_on_wrong_dir = lrz->gpu_dir_tracking,
369 });
370
371 /* GRAS_LRZ_CNTL.fc_enable + LRZ_CLEAR - clears the fast-clear buffer;
372 * GRAS_LRZ_CNTL.disable_on_wrong_dir + LRZ_CLEAR - sets direction to
373 * CUR_DIR_UNSET.
374 */
375 if (CHIP >= A7XX)
376 tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_CLEAR_DEPTH_F32(lrz->depth_clear_value.depthStencil.depth));
377 tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_CLEAR);
378 }
379
380 if (!lrz->fast_clear && !invalidate_lrz) {
381 tu6_clear_lrz<CHIP>(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
382 /* Even though we disable fast-clear we still have to dirty
383 * fast-clear buffer because both secondary cmdbufs and following
384 * renderpasses won't know that fast-clear is disabled.
385 *
386 * TODO: we could avoid this if we don't store depth and don't
387 * expect secondary cmdbufs.
388 */
389 if (lrz->image_view->image->has_lrz_fc) {
390 tu6_dirty_lrz_fc<CHIP>(cmd, cs, lrz->image_view->image);
391 }
392 }
393 }
394 TU_GENX(tu_lrz_tiling_begin);
395
396 /* We need to re-emit LRZ state before each tile due to skipsaverestore.
397 */
398 template <chip CHIP>
399 void
400 tu_lrz_before_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
401 {
402 struct tu_lrz_state *lrz = &cmd->state.lrz;
403
404 if (!lrz->image_view) {
405 tu6_emit_lrz_buffer<CHIP>(cs, NULL);
406 } else {
407 tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);
408
409 if (lrz->gpu_dir_tracking) {
410 if (!lrz->valid) {
411 /* Make sure we fail the comparison of depth views */
412 tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
413 } else {
414 tu6_write_lrz_reg(cmd, cs,
415 A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
416 }
417 }
418 }
419 }
420 TU_GENX(tu_lrz_before_tile);
421
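/* Emitted once at the end of the tiling pass: re-emit the LRZ buffer state
 * and flush the LRZ cache (and, with fast-clear/direction tracking enabled,
 * the fast-clear and direction metadata) so that following renderpasses see
 * consistent data.
 */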
422 template <chip CHIP>
423 void
424 tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
425 {
426 if (cmd->state.lrz.fast_clear || cmd->state.lrz.gpu_dir_tracking) {
427 tu6_emit_lrz_buffer<CHIP>(cs, cmd->state.lrz.image_view->image);
428
429 if (cmd->state.lrz.gpu_dir_tracking) {
430 tu6_write_lrz_reg(cmd, &cmd->cs,
431 A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = cmd->state.lrz.image_view->view.GRAS_LRZ_DEPTH_VIEW));
432 }
433
434 /* Enable flushing of LRZ fast-clear and of direction buffer */
435 tu6_write_lrz_cntl<CHIP>(cmd, cs, {
436 .enable = true,
437 .fc_enable = cmd->state.lrz.fast_clear,
438 .disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking,
439 });
440 } else {
441 tu6_write_lrz_cntl<CHIP>(cmd, cs, {.enable = false});
442 }
443
444 tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_FLUSH);
445
446 /* If gpu_dir_tracking is enabled and LRZ is not valid, the blob, at this
447 * point, additionally clears the direction buffer:
448 * GRAS_LRZ_DEPTH_VIEW(.dword = 0)
449 * GRAS_LRZ_DEPTH_VIEW(.dword = 0xffffffff)
450 * A6XX_GRAS_LRZ_CNTL(.enable = true, .disable_on_wrong_dir = true)
451 * LRZ_CLEAR
452 * LRZ_FLUSH
453 * Since this happens after all of the rendering is done there is no known
454 * reason to do such a clear.
455 */
456 }
457 TU_GENX(tu_lrz_tiling_end);
458
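/* Sysmem rendering doesn't write the LRZ buffer (unless the HW has LRZ
 * feedback, in which case the tiling path is reused). Without feedback we
 * either latch LRZ disabled via direction tracking, or clear the LRZ buffer
 * so that the LRZ test still sees consistent data.
 */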
459 template <chip CHIP>
460 void
461 tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
462 {
463 if (cmd->device->physical_device->info->a6xx.has_lrz_feedback) {
464 tu_lrz_tiling_begin<CHIP>(cmd, cs);
465 return;
466 }
467
468 if (!cmd->state.lrz.image_view)
469 return;
470
471 /* Actually, the LRZ buffer could be filled during sysmem rendering, in
472 * theory to be reused in another renderpass, but the benefit is rather dubious.
473 */
474
475 struct tu_lrz_state *lrz = &cmd->state.lrz;
476
477 if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) {
478 tu_disable_lrz<CHIP>(cmd, cs, lrz->image_view->image);
479 /* Make sure depth view comparison will fail. */
480 tu6_write_lrz_reg(cmd, cs,
481 A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
482 } else {
483 tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);
484 /* Even though we disable LRZ writes in sysmem mode, there is still an
485 * LRZ test, so LRZ should be cleared.
486 */
487 if (lrz->fast_clear) {
488 tu6_write_lrz_cntl<CHIP>(cmd, &cmd->cs, {
489 .enable = true,
490 .fc_enable = true,
491 });
492
493 if (CHIP >= A7XX)
494 tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_CLEAR_DEPTH_F32(lrz->depth_clear_value.depthStencil.depth));
495 tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_CLEAR);
496 tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_FLUSH);
497 } else {
498 tu6_clear_lrz<CHIP>(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
499 }
500 }
501 }
502 TU_GENX(tu_lrz_sysmem_begin);
503
504 template <chip CHIP>
505 void
506 tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
507 {
508 if (cmd->device->physical_device->info->a6xx.has_lrz_feedback) {
509 tu_lrz_tiling_end<CHIP>(cmd, cs);
510 return;
511 }
512
513 tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_FLUSH);
514 }
515 TU_GENX(tu_lrz_sysmem_end);
516
517 /* Disable LRZ outside of renderpass. */
518 template <chip CHIP>
519 void
520 tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
521 struct tu_image *image)
522 {
523 if (!cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
524 return;
525
526 if (!image->lrz_height)
527 return;
528
529 tu6_emit_lrz_buffer<CHIP>(cs, image);
530 tu6_disable_lrz_via_depth_view<CHIP>(cmd, cs);
531 }
532 TU_GENX(tu_disable_lrz);
533
534 /* Disable LRZ from the CPU, for host image copy */
535 template <chip CHIP>
536 void
537 tu_disable_lrz_cpu(struct tu_device *device, struct tu_image *image)
538 {
539 if (!device->physical_device->info->a6xx.has_lrz_dir_tracking)
540 return;
541
542 if (!image->lrz_height)
543 return;
544
545 const unsigned lrz_dir_offset = offsetof(fd_lrzfc_layout<CHIP>, dir_track);
546 uint8_t *lrz_dir_tracking =
547 (uint8_t *)image->map + image->lrz_fc_offset + lrz_dir_offset;
548
549 *lrz_dir_tracking = FD_LRZ_GPU_DIR_DISABLED;
550
551 if (image->bo->cached_non_coherent) {
552 tu_bo_sync_cache(device, image->bo,
553 image->bo_offset + image->lrz_fc_offset + lrz_dir_offset,
554 1, TU_MEM_SYNC_CACHE_TO_GPU);
555 }
556 }
557 TU_GENX(tu_disable_lrz_cpu);
558
559 /* Clear LRZ, used for out of renderpass depth clears. */
560 template <chip CHIP>
561 void
562 tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd,
563 struct tu_image *image,
564 const VkClearDepthStencilValue *pDepthStencil,
565 uint32_t rangeCount,
566 const VkImageSubresourceRange *pRanges)
567 {
568 if (!rangeCount || !image->lrz_height ||
569 !cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
570 return;
571
572 /* We cannot predict which depth subresource would be used later on,
573 * so we just pick the first one with depth cleared and clear the LRZ.
574 */
575 const VkImageSubresourceRange *range = NULL;
576 for (unsigned i = 0; i < rangeCount; i++) {
577 if (pRanges[i].aspectMask &
578 (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
579 range = &pRanges[i];
580 break;
581 }
582 }
583
584 if (!range)
585 return;
586
587 bool fast_clear = image->has_lrz_fc &&
588 tu_lrzfc_depth_supported<CHIP>(pDepthStencil->depth);
589
590 tu6_emit_lrz_buffer<CHIP>(&cmd->cs, image);
591
592 tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
593 .base_layer = range->baseArrayLayer,
594 .layer_count = vk_image_subresource_layer_count(&image->vk, range),
595 .base_mip_level = range->baseMipLevel,
596 ));
597
598 tu6_write_lrz_cntl<CHIP>(cmd, &cmd->cs, {
599 .enable = true,
600 .fc_enable = fast_clear,
601 .disable_on_wrong_dir = true,
602 });
603
604 if (CHIP >= A7XX)
605 tu_cs_emit_regs(&cmd->cs, A7XX_GRAS_LRZ_CLEAR_DEPTH_F32(pDepthStencil->depth));
606 tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_CLEAR);
607 tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_FLUSH);
608
609 if (!fast_clear) {
610 tu6_clear_lrz<CHIP>(cmd, &cmd->cs, image, (const VkClearValue*) pDepthStencil);
611 }
612 }
613 TU_GENX(tu_lrz_clear_depth_image);
614
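/* Mark LRZ invalid for the rest of the current renderpass (e.g. after a
 * depth clear via vkCmdClearAttachments). With GPU direction tracking the
 * direction is also set to LRZ_DIR_INVALID so the disable sticks on the
 * GPU side.
 */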
615 template <chip CHIP>
616 void
617 tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd)
618 {
619 assert(cmd->state.pass);
620
621 cmd->state.lrz.valid = false;
622 cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
623
624 if (cmd->state.lrz.gpu_dir_tracking) {
625 tu6_write_lrz_cntl<CHIP>(cmd, &cmd->cs, {
626 .enable = true,
627 .dir = LRZ_DIR_INVALID,
628 .disable_on_wrong_dir = true,
629 });
630 }
631 }
632 TU_GENX(tu_lrz_disable_during_renderpass);
633
634 /* update lrz state based on stencil-test func:
635 *
636 * Conceptually the order of the pipeline is:
637 *
638 *
639 *    FS -> Alpha-Test -> Stencil-Test -> Depth-Test
640 *                             |              |
641 *                      if wrmask != 0  if wrmask != 0
642 *                             |              |
643 *                             v              v
644 *                       Stencil-Write    Depth-Write
645 *
646 * Because Stencil-Test can have side effects (Stencil-Write) prior
647 * to depth test, in this case we potentially need to disable early
648 * lrz-test. See:
649 *
650 * https://www.khronos.org/opengl/wiki/Per-Sample_Processing
651 */
652 static bool
653 tu6_stencil_op_lrz_allowed(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
654 VkCompareOp func,
655 bool stencil_write)
656 {
657 switch (func) {
658 case VK_COMPARE_OP_ALWAYS:
659 /* nothing to do for LRZ, but for stencil test when stencil-
660 * write is enabled, we need to disable lrz-test, since
661 * conceptually stencil test and write happens before depth-test.
662 */
663 if (stencil_write) {
664 return false;
665 }
666 break;
667 case VK_COMPARE_OP_NEVER:
668 /* fragment never passes, disable lrz_write for this draw. */
669 gras_lrz_cntl->lrz_write = false;
670 break;
671 default:
672 /* whether the fragment passes or not depends on result
673 * of stencil test, which we cannot know when doing binning
674 * pass.
675 */
676 gras_lrz_cntl->lrz_write = false;
677 /* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side-
678 * effects from stencil test we need to disable lrz-test.
679 */
680 if (stencil_write) {
681 return false;
682 }
683 break;
684 }
685
686 return true;
687 }
688
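/* Compute the GRAS_LRZ_CNTL value for the current draw state. Two kinds of
 * disable can result from this:
 *  - disable_lrz: LRZ data would become stale, so LRZ is invalidated for
 *    the rest of the renderpass;
 *  - temporary_disable_lrz: LRZ is only skipped for this draw and can be
 *    re-enabled afterwards, because the LRZ buffer itself isn't affected.
 */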
689 template <chip CHIP>
690 static struct A6XX_GRAS_LRZ_CNTL
691 tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
692 const uint32_t a)
693 {
694 const struct tu_shader *fs = cmd->state.shaders[MESA_SHADER_FRAGMENT];
695 bool z_test_enable = cmd->vk.dynamic_graphics_state.ds.depth.test_enable;
696 bool z_write_enable = cmd->vk.dynamic_graphics_state.ds.depth.write_enable;
697 bool z_bounds_enable = cmd->vk.dynamic_graphics_state.ds.depth.bounds_test.enable;
698 VkCompareOp depth_compare_op =
699 cmd->vk.dynamic_graphics_state.ds.depth.compare_op;
700
701 struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };
702
703 if (!cmd->state.lrz.valid) {
704 return gras_lrz_cntl;
705 }
706
707 /* If depth test is disabled we shouldn't touch LRZ.
708 * Same if there is no depth attachment.
709 */
710 if (a == VK_ATTACHMENT_UNUSED || !z_test_enable || !cmd->device->use_lrz)
711 return gras_lrz_cntl;
712
713 if (!cmd->state.lrz.gpu_dir_tracking && !cmd->state.attachments) {
714 /* Without on-gpu LRZ direction tracking - there is nothing we
715 * can do to enable LRZ in secondary command buffers.
716 */
717 return gras_lrz_cntl;
718 }
719
720 /* See comment in tu_pipeline about disabling LRZ write for blending. */
721 bool reads_dest = cmd->state.blend_reads_dest;
722
723 gras_lrz_cntl.enable = true;
724 gras_lrz_cntl.lrz_write =
725 z_write_enable &&
726 !reads_dest &&
727 !(fs->fs.lrz.status & TU_LRZ_FORCE_DISABLE_WRITE);
728 gras_lrz_cntl.z_test_enable = z_write_enable;
729 gras_lrz_cntl.z_bounds_enable = z_bounds_enable;
730 gras_lrz_cntl.fc_enable = cmd->state.lrz.fast_clear;
731 gras_lrz_cntl.dir_write = cmd->state.lrz.gpu_dir_tracking;
732 gras_lrz_cntl.disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking;
733
734 if (CHIP >= A7XX)
735 gras_lrz_cntl.z_func = tu6_compare_func(depth_compare_op);
736
737 /* LRZ is disabled until it is cleared, which means that one "wrong"
738 * depth test or shader could disable LRZ until depth buffer is cleared.
739 */
740 bool disable_lrz = false;
741 bool temporary_disable_lrz = false;
742
743 /* What happens in FS could affect LRZ, e.g.: writes to gl_FragDepth or early
744 * fragment tests. We have to skip LRZ testing and updating, but as long as
745 * the depth direction stayed the same we can continue with LRZ testing later.
746 */
747 if (fs->fs.lrz.status & TU_LRZ_FORCE_DISABLE_LRZ) {
748 if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN || !cmd->state.lrz.gpu_dir_tracking) {
749 perf_debug(cmd->device, "Skipping LRZ due to FS");
750 temporary_disable_lrz = true;
751 } else {
752 tu_lrz_disable_reason(cmd, "FS writes depth or has side-effects (TODO: fix for gpu-direction-tracking case)");
753 disable_lrz = true;
754 }
755 }
756
757 /* If Z is not written - it doesn't affect LRZ buffer state.
758 * Which means two things:
759 * - Don't lock direction until Z is written for the first time;
760 * - If Z isn't written and direction IS locked it's possible to just
761 * temporary disable LRZ instead of fully bailing out, when direction
762 * is changed.
763 */
764
765 enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
766 switch (depth_compare_op) {
767 case VK_COMPARE_OP_ALWAYS:
768 case VK_COMPARE_OP_NOT_EQUAL:
769 /* OP_ALWAYS and OP_NOT_EQUAL could have depth value of any direction,
770 * so if there is a depth write - LRZ must be disabled.
771 */
772 if (z_write_enable) {
773 tu_lrz_disable_reason(cmd, "Depth write + ALWAYS/NOT_EQUAL");
774 disable_lrz = true;
775 gras_lrz_cntl.dir = LRZ_DIR_INVALID;
776 } else {
777 perf_debug(cmd->device, "Skipping LRZ due to ALWAYS/NOT_EQUAL");
778 temporary_disable_lrz = true;
779 }
780 break;
781 case VK_COMPARE_OP_EQUAL:
782 case VK_COMPARE_OP_NEVER:
783 /* Blob disables LRZ for OP_EQUAL, and from our empirical
784 * evidence it is the right thing to do.
785 *
786 * Both OP_EQUAL and OP_NEVER don't change LRZ buffer so
787 * we could just temporary disable LRZ.
788 */
789 temporary_disable_lrz = true;
790 break;
791 case VK_COMPARE_OP_GREATER:
792 case VK_COMPARE_OP_GREATER_OR_EQUAL:
793 lrz_direction = TU_LRZ_GREATER;
794 gras_lrz_cntl.greater = true;
795 gras_lrz_cntl.dir = LRZ_DIR_GE;
796 break;
797 case VK_COMPARE_OP_LESS:
798 case VK_COMPARE_OP_LESS_OR_EQUAL:
799 lrz_direction = TU_LRZ_LESS;
800 gras_lrz_cntl.greater = false;
801 gras_lrz_cntl.dir = LRZ_DIR_LE;
802 break;
803 default:
804 unreachable("bad VK_COMPARE_OP value or uninitialized");
805 break;
806 }
807
808 /* If depthfunc direction is changed, bail out on using LRZ. The
809 * LRZ buffer encodes a min/max depth value per block, but if
810 * we switch from GT/GE <-> LT/LE, those values cannot be
811 * interpreted properly.
812 */
813 if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
814 lrz_direction != TU_LRZ_UNKNOWN &&
815 cmd->state.lrz.prev_direction != lrz_direction) {
816 if (z_write_enable) {
817 tu_lrz_disable_reason(cmd, "Depth write + compare-op direction change");
818 disable_lrz = true;
819 } else {
820 perf_debug(cmd->device, "Skipping LRZ due to direction change");
821 temporary_disable_lrz = true;
822 }
823 }
824
825 /* Consider the following sequence of depthfunc changes:
826 *
827 * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_GREATER
828 * LRZ is disabled during COMPARE_OP_EQUAL but could be enabled
829 * during second VK_COMPARE_OP_GREATER.
830 *
831 * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_LESS
832 * Here, LRZ is disabled during COMPARE_OP_EQUAL and should become
833 * invalid during COMPARE_OP_LESS.
834 *
835 * This shows that we should keep the last KNOWN direction.
836 */
837 if (z_write_enable && lrz_direction != TU_LRZ_UNKNOWN)
838 cmd->state.lrz.prev_direction = lrz_direction;
839
840 /* Invalidate LRZ and disable write if stencil test is enabled */
841 bool stencil_test_enable = cmd->vk.dynamic_graphics_state.ds.stencil.test_enable;
842 if (!disable_lrz && stencil_test_enable) {
843 VkCompareOp stencil_front_compare_op = (VkCompareOp)
844 cmd->vk.dynamic_graphics_state.ds.stencil.front.op.compare;
845
846 VkCompareOp stencil_back_compare_op = (VkCompareOp)
847 cmd->vk.dynamic_graphics_state.ds.stencil.back.op.compare;
848
849 bool lrz_allowed = true;
850 lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
851 &gras_lrz_cntl, stencil_front_compare_op,
852 cmd->state.stencil_front_write);
853
854 lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
855 &gras_lrz_cntl, stencil_back_compare_op,
856 cmd->state.stencil_back_write);
857
858 /* Without a depth write it's enough to make sure that the depth test
859 * is executed after the stencil test, so temporarily disabling LRZ is enough.
860 */
861 if (!lrz_allowed) {
862 if (z_write_enable) {
863 tu_lrz_disable_reason(cmd, "Stencil write");
864 disable_lrz = true;
865 } else {
866 perf_debug(cmd->device, "Skipping LRZ due to stencil write");
867 temporary_disable_lrz = true;
868 }
869 }
870 }
871
872 /* Writing depth with blend enabled means we need to invalidate LRZ,
873 * because the written depth value could mean that a later draw with
874 * depth enabled (where we would otherwise write LRZ) could have
875 * fragments which don't pass the depth test due to this draw. For
876 * example, consider this sequence of draws, with depth mode GREATER:
877 *
878 * draw A:
879 * z=0.1, fragments pass
880 * draw B:
881 * z=0.4, fragments pass
882 * blend enabled (LRZ write disabled)
883 * depth write enabled
884 * draw C:
885 * z=0.2, fragments don't pass
886 * blend disabled
887 * depth write enabled
888 *
889 * Normally looking at the state in draw C, we'd assume we could
890 * enable LRZ write. But this would cause early-z/lrz to discard
891 * fragments from draw A which should be visible due to draw B.
892 */
893 if (reads_dest && z_write_enable && cmd->device->instance->conservative_lrz) {
894 tu_lrz_disable_reason(cmd, "Depth write + blending");
895 disable_lrz = true;
896 }
897
898 if (disable_lrz)
899 cmd->state.lrz.valid = false;
900
901 if (disable_lrz && cmd->state.lrz.gpu_dir_tracking) {
902 /* The direction byte on the GPU should be set to CUR_DIR_DISABLED;
903 * for this it's not enough to emit an empty GRAS_LRZ_CNTL.
904 */
905 gras_lrz_cntl.enable = true;
906 gras_lrz_cntl.dir = LRZ_DIR_INVALID;
907
908 return gras_lrz_cntl;
909 }
910
911 if (temporary_disable_lrz)
912 gras_lrz_cntl.enable = false;
913
914 cmd->state.lrz.enabled = cmd->state.lrz.valid && gras_lrz_cntl.enable;
915 if (!cmd->state.lrz.enabled)
916 memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));
917
918 return gras_lrz_cntl;
919 }
920
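/* Per-draw-state hook: compute and emit GRAS_LRZ_CNTL and mirror the enable
 * bit into RB_LRZ_CNTL.
 */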
921 template <chip CHIP>
922 void
923 tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
924 {
925 const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
926 struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state<CHIP>(cmd, a);
927
928 tu6_write_lrz_cntl<CHIP>(cmd, cs, gras_lrz_cntl);
929 tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));
930 }
931 TU_GENX(tu6_emit_lrz);
932