/*
 * Copyright © 2022 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#include "tu_lrz.h"

#include "tu_clear_blit.h"
#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_image.h"

#include "common/freedreno_gpu_event.h"

/* See lrz.rst for how the HW works. Here are only the implementation notes.
 *
 * There are a number of cases in which LRZ cannot be used:
 * - Fragment shader side-effects (writing to SSBOs, atomic operations, etc.);
 * - Writing to the stencil buffer;
 * - Writing depth while:
 *   - Changing direction of depth test (e.g. from OP_GREATER to OP_LESS;
 *     see the sketch below);
 *   - Using OP_ALWAYS or OP_NOT_EQUAL;
 * - Clearing depth with vkCmdClearAttachments;
 * - (pre-a650) Not clearing the depth attachment with LOAD_OP_CLEAR;
 * - (pre-a650) Using secondary command buffers;
 * - Sysmem rendering (with a small caveat).
 *
 * A650+ (gen3+)
 * =============
 *
 * While LRZ could be reused between renderpasses, it is disabled when the
 * underlying depth buffer is changed.
 * The following commands could change a depth image:
 * - vkCmdBlitImage*
 * - vkCmdCopyBufferToImage*
 * - vkCmdCopyImage*
 *
 * LRZ Fast-Clear
 * ==============
 *
 * It's always valid to fast-clear. On the other hand, we disable fast-clear
 * if the depth clear value is not 0.0 or 1.0, because it may hurt
 * performance if some primitives are expected to fail the depth test
 * against the actual depth clear value.
 *
 * LRZ Caches
 * ==========
 *
 * ! The policy here is to flush the LRZ cache right after it is changed,
 * so if LRZ data is needed afterwards there is no need to flush it
 * before using LRZ.
 */
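
/* Illustrative sketch (not driver code, not compiled): an API-level command
 * sequence that hits the direction-change limitation above. The command
 * buffer, pipeline and dynamic-state setup are assumed to exist elsewhere;
 * only the pattern matters: writing depth while flipping the compare
 * direction forces LRZ off until the next depth clear.
 */
#if 0
static void
example_lrz_direction_flip(VkCommandBuffer cmd_buf)
{
   /* Draw A: depth test GREATER, depth writes on - LRZ tracks "greater". */
   vkCmdSetDepthCompareOp(cmd_buf, VK_COMPARE_OP_GREATER);
   vkCmdSetDepthWriteEnable(cmd_buf, VK_TRUE);
   vkCmdDraw(cmd_buf, 3, 1, 0, 0);

   /* Draw B: depth writes still on, but direction flipped to LESS - the LRZ
    * min/max values written so far can no longer be interpreted, so the
    * driver has to invalidate LRZ for the rest of the renderpass.
    */
   vkCmdSetDepthCompareOp(cmd_buf, VK_COMPARE_OP_LESS);
   vkCmdDraw(cmd_buf, 3, 1, 0, 0);
}
#endif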
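/* Point the HW at the image's LRZ buffer (and its fast-clear buffer, if
 * present), or detach LRZ entirely by zeroing the base/pitch registers when
 * no depth image is given.
 */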
static void
tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
{
   if (!depth_image) {
      tu_cs_emit_regs(cs,
                      A6XX_GRAS_LRZ_BUFFER_BASE(0),
                      A6XX_GRAS_LRZ_BUFFER_PITCH(0),
                      A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
      return;
   }

   uint64_t lrz_iova = depth_image->iova + depth_image->lrz_offset;
   uint64_t lrz_fc_iova = depth_image->iova + depth_image->lrz_fc_offset;
   if (!depth_image->lrz_fc_offset)
      lrz_fc_iova = 0;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_LRZ_BUFFER_BASE(.qword = lrz_iova),
                   A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = depth_image->lrz_pitch),
                   A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(.qword = lrz_fc_iova));
}

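/* Write an LRZ-related register. On GPUs with the LRZ tracking quirk the
 * write goes through CP_REG_WRITE with the LRZ tracker so the CP can keep
 * its LRZ state consistent; otherwise it is a plain register write.
 */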
static void
tu6_write_lrz_reg(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                  struct tu_reg_value reg)
{
   if (cmd->device->physical_device->info->a6xx.lrz_track_quirk) {
      tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
      tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_LRZ));
      tu_cs_emit(cs, reg.reg);
      tu_cs_emit(cs, reg.value);
   } else {
      tu_cs_emit_pkt4(cs, reg.reg, 1);
      tu_cs_emit(cs, reg.value);
   }
}

static void
tu6_disable_lrz_via_depth_view(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* Disable direction by writing invalid depth view. */
   tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
      .base_layer = 0b11111111111,
      .layer_count = 0b11111111111,
      .base_mip_level = 0b1111,
   ));

   tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
      .enable = true,
      .disable_on_wrong_dir = true,
   ));

   tu_emit_event_write<A6XX>(cmd, cs, FD_LRZ_CLEAR);
   tu_emit_event_write<A6XX>(cmd, cs, FD_LRZ_FLUSH);
}

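/* Set up the per-command-buffer LRZ state for a depth attachment at
 * renderpass begin: remember the image view, and mark LRZ valid only when
 * the pass clears or loads depth (otherwise prior LRZ contents cannot be
 * trusted).
 */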
static void
tu_lrz_init_state(struct tu_cmd_buffer *cmd,
                  const struct tu_render_pass_attachment *att,
                  const struct tu_image_view *view)
{
   if (!view->image->lrz_height) {
      assert(!cmd->device->use_lrz || !vk_format_has_depth(att->format));
      return;
   }

   bool clears_depth = att->clear_mask &
      (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT);
   bool has_gpu_tracking =
      cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;

   if (!has_gpu_tracking && !clears_depth)
      return;

   /* We need to always have an LRZ view just to disable it if there is a
    * depth attachment, there are any secondaries, and GPU tracking is
    * enabled, in order not to rely on loadOp state which doesn't exist with
    * dynamic rendering in secondaries. Otherwise the secondary will have LRZ
    * enabled and there will be a NULL/garbage LRZ buffer.
    */
   cmd->state.lrz.image_view = view;

   if (!clears_depth && !att->load)
      return;

   cmd->state.lrz.valid = true;
   cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
   /* Be optimistic and unconditionally enable fast-clear in
    * secondary cmdbufs and when reusing previous LRZ state.
    */
   cmd->state.lrz.fast_clear = view->image->lrz_fc_size > 0;

   cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;
   cmd->state.lrz.reuse_previous_state = !clears_depth;
}

/* Note: if we enable LRZ here, then tu_lrz_init_state() must at least set
 * lrz.image_view, so that an LRZ buffer is present (even if LRZ is
 * dynamically disabled).
 */

static void
tu_lrz_init_secondary(struct tu_cmd_buffer *cmd,
                      const struct tu_render_pass_attachment *att)
{
   bool has_gpu_tracking =
      cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;

   if (!has_gpu_tracking)
      return;

   if (!cmd->device->use_lrz)
      return;

   if (!vk_format_has_depth(att->format))
      return;

   cmd->state.lrz.valid = true;
   cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
   cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;

   /* We may not have the depth attachment when executing in a secondary
    * inside a render pass. This means we have to be even more optimistic than
    * the normal case and enable fast clear even if the depth image doesn't
    * support it.
    */
   cmd->state.lrz.fast_clear = true;

   /* These are not used inside secondaries */
   cmd->state.lrz.image_view = NULL;
   cmd->state.lrz.reuse_previous_state = false;
}

/* This is generally the same as tu_lrz_begin_renderpass(), but we skip
 * actually emitting anything. The lrz state needs to be consistent between
 * renderpasses, but only the first should actually emit commands to disable
 * lrz etc.
 */
void
tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd)
{
   /* Track LRZ valid state */
   memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));

   uint32_t a;
   for (a = 0; a < cmd->state.pass->attachment_count; a++) {
      if (cmd->state.attachments[a]->image->lrz_height)
         break;
   }

   if (a != cmd->state.pass->attachment_count) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      tu_lrz_init_state(cmd, att, cmd->state.attachments[a]);
      if (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
         VkClearValue clear = cmd->state.clear_values[a];
         cmd->state.lrz.depth_clear_value = clear;
         cmd->state.lrz.fast_clear = cmd->state.lrz.fast_clear &&
                                     (clear.depthStencil.depth == 0.f ||
                                      clear.depthStencil.depth == 1.f);
      }
      cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
   }
}

void
tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd)
{
   const struct tu_render_pass *pass = cmd->state.pass;

   int lrz_img_count = 0;
   for (unsigned i = 0; i < pass->attachment_count; i++) {
      if (cmd->state.attachments[i]->image->lrz_height)
         lrz_img_count++;
   }

   if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking &&
       cmd->state.pass->subpass_count > 1 && lrz_img_count > 1) {
      /* Theoretically we could switch between LRZ buffers during the binning
       * and tiling passes, but this is untested and would add complexity for
       * a presumably extremely rare case.
       */
      perf_debug(cmd->device,
                 "Invalidating LRZ because there are several subpasses with "
                 "different depth attachments in a single renderpass");

      for (unsigned i = 0; i < pass->attachment_count; i++) {
         struct tu_image *image = cmd->state.attachments[i]->image;
         tu_disable_lrz(cmd, &cmd->cs, image);
      }

      /* We need a valid LRZ fast-clear base, in case the render pass contents
       * are in secondaries that enable LRZ, so that they can read that LRZ is
       * dynamically disabled. It doesn't matter which we use, so just leave
       * the last one as emitted in tu_disable_lrz().
       */
      memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
      return;
   }

   /* Track LRZ valid state */
   tu_lrz_begin_resumed_renderpass(cmd);

   if (!cmd->state.lrz.valid) {
      tu6_emit_lrz_buffer(&cmd->cs, NULL);
   }
}

void
tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd)
{
   memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
   uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   if (a != VK_ATTACHMENT_UNUSED) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      tu_lrz_init_secondary(cmd, att);
   }
}

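/* Emitted at the start of GMEM (tiling) rendering: program the LRZ buffer,
 * then either reuse the previous LRZ state, invalidate LRZ via a bogus
 * depth view, or clear it (fast or slow) for this renderpass.
 */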
void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* TODO: If lrz was never valid for the entire renderpass, we could exit
    * early here. Sometimes we know this ahead of time and null out
    * image_view, but with LOAD_OP_DONT_CARE this only happens if there were
    * no secondaries.
    */
   if (!cmd->state.lrz.image_view)
      return;

   struct tu_lrz_state *lrz = &cmd->state.lrz;

   tu6_emit_lrz_buffer(cs, lrz->image_view->image);

   if (lrz->reuse_previous_state) {
      /* Reuse previous LRZ state, LRZ cache is assumed to be
       * already invalidated by previous renderpass.
       */
      assert(lrz->gpu_dir_tracking);

      tu6_write_lrz_reg(cmd, cs,
         A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
      return;
   }

   bool invalidate_lrz = !lrz->valid && lrz->gpu_dir_tracking;
   if (invalidate_lrz) {
      /* Following the blob we elect to disable LRZ for the whole renderpass
       * if it is known that LRZ is disabled somewhere in the renderpass.
       *
       * This is accomplished by making the later GRAS_LRZ_CNTL (in the
       * binning pass) fail the depth view comparison.
       */
      tu6_disable_lrz_via_depth_view(cmd, cs);
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
   } else if (lrz->fast_clear || lrz->gpu_dir_tracking) {
      if (lrz->gpu_dir_tracking) {
         tu6_write_lrz_reg(cmd, cs,
            A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
      }

      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
         .enable = true,
         .fc_enable = lrz->fast_clear,
         .disable_on_wrong_dir = lrz->gpu_dir_tracking,
      ));

      /* LRZ_CLEAR.fc_enable + LRZ_CLEAR - clears fast-clear buffer;
       * LRZ_CLEAR.disable_on_wrong_dir + LRZ_CLEAR - sets direction to
       * CUR_DIR_UNSET.
       */
      tu_emit_event_write<A6XX>(cmd, cs, FD_LRZ_CLEAR);
   }

   if (!lrz->fast_clear && !invalidate_lrz) {
      tu6_clear_lrz<A6XX>(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
      /* Even though we disable fast-clear, we still have to dirty the
       * fast-clear buffer, because secondary cmdbufs and following
       * renderpasses won't know that fast-clear is disabled.
       *
       * TODO: we could avoid this if we don't store depth and don't
       * expect secondary cmdbufs.
       */
      if (lrz->image_view->image->lrz_fc_size) {
         tu6_dirty_lrz_fc<A6XX>(cmd, cs, lrz->image_view->image);
      }
   }
}

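/* Emitted at the end of GMEM (tiling) rendering: re-emit GRAS_LRZ_CNTL so
 * the fast-clear and direction buffers get written back, then flush the
 * LRZ cache.
 */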
void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->state.lrz.fast_clear || cmd->state.lrz.gpu_dir_tracking) {
      tu6_emit_lrz_buffer(cs, cmd->state.lrz.image_view->image);

      if (cmd->state.lrz.gpu_dir_tracking) {
         tu6_write_lrz_reg(cmd, &cmd->cs,
            A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = cmd->state.lrz.image_view->view.GRAS_LRZ_DEPTH_VIEW));
      }

      /* Enable flushing of LRZ fast-clear and of direction buffer */
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
         .enable = true,
         .fc_enable = cmd->state.lrz.fast_clear,
         .disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking,
      ));
   } else {
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(0));
   }

   tu_emit_event_write<A6XX>(cmd, cs, FD_LRZ_FLUSH);

   /* If gpu_dir_tracking is enabled and LRZ is not valid, the blob at this
    * point additionally clears the direction buffer:
    *  GRAS_LRZ_DEPTH_VIEW(.dword = 0)
    *  GRAS_LRZ_DEPTH_VIEW(.dword = 0xffffffff)
    *  A6XX_GRAS_LRZ_CNTL(.enable = true, .disable_on_wrong_dir = true)
    *  LRZ_CLEAR
    *  LRZ_FLUSH
    * Since this happens after all of the rendering is done, there is no
    * known reason to do such a clear.
    */
}

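/* Sysmem (bypass) rendering: LRZ writes are disabled, but the LRZ test may
 * still run. With GPU direction tracking we simply disable LRZ for the
 * renderpass; otherwise the LRZ buffer has to be cleared so the test reads
 * harmless values.
 */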
void
tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (!cmd->state.lrz.image_view)
      return;

   /* The LRZ buffer could, in theory, be filled during sysmem rendering and
    * then used in another renderpass, but the benefit is rather dubious.
    */

   struct tu_lrz_state *lrz = &cmd->state.lrz;

   if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) {
      tu_disable_lrz(cmd, cs, lrz->image_view->image);
      /* Make sure depth view comparison will fail. */
      tu6_write_lrz_reg(cmd, cs,
                        A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
   } else {
      tu6_emit_lrz_buffer(cs, lrz->image_view->image);
      /* Even though we disable LRZ writes in sysmem mode, the LRZ test still
       * happens, so LRZ should be cleared.
       */
      if (lrz->fast_clear) {
         tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
            .enable = true,
            .fc_enable = true,
         ));
         tu_emit_event_write<A6XX>(cmd, &cmd->cs, FD_LRZ_CLEAR);
         tu_emit_event_write<A6XX>(cmd, &cmd->cs, FD_LRZ_FLUSH);
      } else {
         tu6_clear_lrz<A6XX>(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
      }
   }
}

void
tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_emit_event_write<A6XX>(cmd, &cmd->cs, FD_LRZ_FLUSH);
}

/* Disable LRZ outside of renderpass. */
void
tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
               struct tu_image *image)
{
   if (!cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   if (!image->lrz_height)
      return;

   tu6_emit_lrz_buffer(cs, image);
   tu6_disable_lrz_via_depth_view(cmd, cs);
}

/* Clear LRZ, used for out of renderpass depth clears. */
void
tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd,
                         struct tu_image *image,
                         const VkClearDepthStencilValue *pDepthStencil,
                         uint32_t rangeCount,
                         const VkImageSubresourceRange *pRanges)
{
   if (!rangeCount || !image->lrz_height ||
       !cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   /* We cannot predict which depth subresource would be used later on,
    * so we just pick the first one with depth cleared and clear the LRZ.
    */
   const VkImageSubresourceRange *range = NULL;
   for (unsigned i = 0; i < rangeCount; i++) {
      if (pRanges[i].aspectMask &
          (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
         range = &pRanges[i];
         break;
      }
   }

   if (!range)
      return;

   bool fast_clear = image->lrz_fc_size && (pDepthStencil->depth == 0.f ||
                                            pDepthStencil->depth == 1.f);

   tu6_emit_lrz_buffer(&cmd->cs, image);

   tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
      .base_layer = range->baseArrayLayer,
      .layer_count = vk_image_subresource_layer_count(&image->vk, range),
      .base_mip_level = range->baseMipLevel,
   ));

   tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
      .enable = true,
      .fc_enable = fast_clear,
      .disable_on_wrong_dir = true,
   ));

   tu_emit_event_write<A6XX>(cmd, &cmd->cs, FD_LRZ_CLEAR);
   tu_emit_event_write<A6XX>(cmd, &cmd->cs, FD_LRZ_FLUSH);

   if (!fast_clear) {
      tu6_clear_lrz<A6XX>(cmd, &cmd->cs, image, (const VkClearValue*) pDepthStencil);
   }
}

void
tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd)
{
   assert(cmd->state.pass);

   cmd->state.lrz.valid = false;
   cmd->state.dirty |= TU_CMD_DIRTY_LRZ;

   if (cmd->state.lrz.gpu_dir_tracking) {
      tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
         .enable = true,
         .dir = LRZ_DIR_INVALID,
         .disable_on_wrong_dir = true,
      ));
   }
}

/* update lrz state based on stencil-test func:
 *
 * Conceptually the order of the pipeline is:
 *
 *
 *   FS -> Alpha-Test  ->  Stencil-Test  ->  Depth-Test
 *                              |                |
 *                      if wrmask != 0     if wrmask != 0
 *                              |                |
 *                              v                v
 *                        Stencil-Write      Depth-Write
 *
 * Because Stencil-Test can have side effects (Stencil-Write) prior
 * to depth test, in this case we potentially need to disable early
 * lrz-test. See:
 *
 * https://www.khronos.org/opengl/wiki/Per-Sample_Processing
 */
static bool
tu6_stencil_op_lrz_allowed(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
                           VkCompareOp func,
                           bool stencil_write)
{
   switch (func) {
   case VK_COMPARE_OP_ALWAYS:
      /* nothing to do for LRZ, but for stencil test when stencil-
       * write is enabled, we need to disable lrz-test, since
       * conceptually stencil test and write happens before depth-test.
       */
      if (stencil_write) {
         return false;
      }
      break;
   case VK_COMPARE_OP_NEVER:
      /* fragment never passes, disable lrz_write for this draw. */
      gras_lrz_cntl->lrz_write = false;
      break;
   default:
      /* whether the fragment passes or not depends on result
       * of stencil test, which we cannot know when doing binning
       * pass.
       */
      gras_lrz_cntl->lrz_write = false;
      /* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side-
       * effects from stencil test we need to disable lrz-test.
       */
      if (stencil_write) {
         return false;
      }
      break;
   }

   return true;
}

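/* Illustrative (not compiled) summary of the stencil rules implemented in
 * tu6_stencil_op_lrz_allowed() above, assuming a zero-initialized
 * GRAS_LRZ_CNTL struct:
 */
#if 0
   struct A6XX_GRAS_LRZ_CNTL cntl = { 0 };

   /* ALWAYS with stencil writes: stencil side effects happen before the
    * depth test, so LRZ testing must be disabled for this draw.
    */
   assert(!tu6_stencil_op_lrz_allowed(&cntl, VK_COMPARE_OP_ALWAYS, true));

   /* NEVER without stencil writes: LRZ testing stays allowed, but LRZ
    * writes are turned off since no fragment can pass.
    */
   assert(tu6_stencil_op_lrz_allowed(&cntl, VK_COMPARE_OP_NEVER, false));
   assert(!cntl.lrz_write);
#endif
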
static struct A6XX_GRAS_LRZ_CNTL
tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
                        const uint32_t a)
{
   const struct tu_shader *fs = cmd->state.shaders[MESA_SHADER_FRAGMENT];
   bool z_test_enable = cmd->vk.dynamic_graphics_state.ds.depth.test_enable;
   bool z_write_enable = cmd->vk.dynamic_graphics_state.ds.depth.write_enable;
   bool z_bounds_enable = cmd->vk.dynamic_graphics_state.ds.depth.bounds_test.enable;
   VkCompareOp depth_compare_op =
      cmd->vk.dynamic_graphics_state.ds.depth.compare_op;

   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };

   if (!cmd->state.lrz.valid) {
      return gras_lrz_cntl;
   }

   /* If depth test is disabled we shouldn't touch LRZ.
    * Same if there is no depth attachment.
    */
   if (a == VK_ATTACHMENT_UNUSED || !z_test_enable || !cmd->device->use_lrz)
      return gras_lrz_cntl;

   if (!cmd->state.lrz.gpu_dir_tracking && !cmd->state.attachments) {
      /* Without on-gpu LRZ direction tracking - there is nothing we
       * can do to enable LRZ in secondary command buffers.
       */
      return gras_lrz_cntl;
   }

   /* See comment in tu_pipeline about disabling LRZ write for blending. */
   bool reads_dest = cmd->state.blend_reads_dest;

   gras_lrz_cntl.enable = true;
   gras_lrz_cntl.lrz_write =
      z_write_enable &&
      !reads_dest &&
      !(fs->fs.lrz.status & TU_LRZ_FORCE_DISABLE_WRITE);
   gras_lrz_cntl.z_test_enable = z_write_enable;
   gras_lrz_cntl.z_bounds_enable = z_bounds_enable;
   gras_lrz_cntl.fc_enable = cmd->state.lrz.fast_clear;
   gras_lrz_cntl.dir_write = cmd->state.lrz.gpu_dir_tracking;
   gras_lrz_cntl.disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking;

   /* LRZ is disabled until it is cleared, which means that one "wrong"
    * depth test or shader could disable LRZ until the depth buffer is
    * cleared.
    */
   bool disable_lrz = false;
   bool temporary_disable_lrz = false;

   /* What happens in the FS could affect LRZ, e.g. writes to gl_FragDepth
    * or early fragment tests. We have to skip LRZ testing and updating, but
    * as long as the depth direction stayed the same we can continue with
    * LRZ testing later.
    */
   if (fs->fs.lrz.status & TU_LRZ_FORCE_DISABLE_LRZ) {
      if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN || !cmd->state.lrz.gpu_dir_tracking) {
         perf_debug(cmd->device, "Skipping LRZ due to FS");
         temporary_disable_lrz = true;
      } else {
         perf_debug(cmd->device, "Disabling LRZ due to FS (TODO: fix for gpu-direction-tracking case)");
         disable_lrz = true;
      }
   }

   /* If Z is not written - it doesn't affect LRZ buffer state.
    * Which means two things:
    * - Don't lock direction until Z is written for the first time;
    * - If Z isn't written and direction IS locked it's possible to just
    *   temporarily disable LRZ instead of fully bailing out, when direction
    *   is changed.
    */

   enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
   switch (depth_compare_op) {
   case VK_COMPARE_OP_ALWAYS:
   case VK_COMPARE_OP_NOT_EQUAL:
      /* OP_ALWAYS and OP_NOT_EQUAL could have depth value of any direction,
       * so if there is a depth write - LRZ must be disabled.
       */
      if (z_write_enable) {
         perf_debug(cmd->device, "Invalidating LRZ due to ALWAYS/NOT_EQUAL");
         disable_lrz = true;
         gras_lrz_cntl.dir = LRZ_DIR_INVALID;
      } else {
         perf_debug(cmd->device, "Skipping LRZ due to ALWAYS/NOT_EQUAL");
         temporary_disable_lrz = true;
      }
      break;
   case VK_COMPARE_OP_EQUAL:
   case VK_COMPARE_OP_NEVER:
      /* The blob disables LRZ for OP_EQUAL, and from our empirical
       * evidence it is the right thing to do.
       *
       * Neither OP_EQUAL nor OP_NEVER changes the LRZ buffer, so
       * we can just temporarily disable LRZ.
       */
      temporary_disable_lrz = true;
      break;
   case VK_COMPARE_OP_GREATER:
   case VK_COMPARE_OP_GREATER_OR_EQUAL:
      lrz_direction = TU_LRZ_GREATER;
      gras_lrz_cntl.greater = true;
      gras_lrz_cntl.dir = LRZ_DIR_GE;
      break;
   case VK_COMPARE_OP_LESS:
   case VK_COMPARE_OP_LESS_OR_EQUAL:
      lrz_direction = TU_LRZ_LESS;
      gras_lrz_cntl.greater = false;
      gras_lrz_cntl.dir = LRZ_DIR_LE;
      break;
   default:
      unreachable("bad VK_COMPARE_OP value or uninitialized");
      break;
   };

   /* If depthfunc direction is changed, bail out on using LRZ. The
    * LRZ buffer encodes a min/max depth value per block, but if
    * we switch from GT/GE <-> LT/LE, those values cannot be
    * interpreted properly.
    */
   if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
       lrz_direction != TU_LRZ_UNKNOWN &&
       cmd->state.lrz.prev_direction != lrz_direction) {
      if (z_write_enable) {
         perf_debug(cmd->device, "Invalidating LRZ due to direction change");
         disable_lrz = true;
      } else {
         perf_debug(cmd->device, "Skipping LRZ due to direction change");
         temporary_disable_lrz = true;
      }
   }

   /* Consider the following sequence of depthfunc changes:
    *
    * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_GREATER
    *   LRZ is disabled during COMPARE_OP_EQUAL but could be enabled
    *   during second VK_COMPARE_OP_GREATER.
    *
    * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_LESS
    *   Here, LRZ is disabled during COMPARE_OP_EQUAL and should become
    *   invalid during COMPARE_OP_LESS.
    *
    * This shows that we should keep last KNOWN direction.
    */
   if (z_write_enable && lrz_direction != TU_LRZ_UNKNOWN)
      cmd->state.lrz.prev_direction = lrz_direction;

   /* Invalidate LRZ and disable write if stencil test is enabled */
   bool stencil_test_enable = cmd->vk.dynamic_graphics_state.ds.stencil.test_enable;
   if (!disable_lrz && stencil_test_enable) {
      VkCompareOp stencil_front_compare_op = (VkCompareOp)
         cmd->vk.dynamic_graphics_state.ds.stencil.front.op.compare;

      VkCompareOp stencil_back_compare_op = (VkCompareOp)
         cmd->vk.dynamic_graphics_state.ds.stencil.back.op.compare;

      bool lrz_allowed = true;
      lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
                                      &gras_lrz_cntl, stencil_front_compare_op,
                                      cmd->state.stencil_front_write);

      lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
                                      &gras_lrz_cntl, stencil_back_compare_op,
                                      cmd->state.stencil_back_write);

      /* Without depth write it's enough to make sure that depth test
       * is executed after stencil test, so temporarily disabling LRZ is
       * enough.
       */
      if (!lrz_allowed) {
         if (z_write_enable) {
            perf_debug(cmd->device, "Invalidating LRZ due to stencil write");
            disable_lrz = true;
         } else {
            perf_debug(cmd->device, "Skipping LRZ due to stencil write");
            temporary_disable_lrz = true;
         }
      }
   }

   /* Writing depth with blend enabled means we need to invalidate LRZ,
    * because the written depth value could mean that a later draw with
    * depth enabled (where we would otherwise write LRZ) could have
    * fragments which don't pass the depth test due to this draw. For
    * example, consider this sequence of draws, with depth mode GREATER:
    *
    *   draw A:
    *     z=0.1, fragments pass
    *   draw B:
    *     z=0.4, fragments pass
    *     blend enabled (LRZ write disabled)
    *     depth write enabled
    *   draw C:
    *     z=0.2, fragments don't pass
    *     blend disabled
    *     depth write enabled
    *
    * Normally looking at the state in draw C, we'd assume we could
    * enable LRZ write. But this would cause early-z/lrz to discard
    * fragments from draw A which should be visible due to draw B.
    */
   if (reads_dest && z_write_enable && cmd->device->instance->conservative_lrz) {
      perf_debug(cmd->device, "Invalidating LRZ due to blend+depthwrite");
      disable_lrz = true;
   }

   if (disable_lrz)
      cmd->state.lrz.valid = false;

   if (disable_lrz && cmd->state.lrz.gpu_dir_tracking) {
      /* The direction byte on the GPU should be set to CUR_DIR_DISABLED;
       * for this it's not enough to emit an empty GRAS_LRZ_CNTL.
       */
      gras_lrz_cntl.enable = true;
      gras_lrz_cntl.dir = LRZ_DIR_INVALID;

      return gras_lrz_cntl;
   }

   if (temporary_disable_lrz)
      gras_lrz_cntl.enable = false;

   cmd->state.lrz.enabled = cmd->state.lrz.valid && gras_lrz_cntl.enable;
   if (!cmd->state.lrz.enabled)
      memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));

   return gras_lrz_cntl;
}

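/* Emit the per-draw LRZ state: GRAS_LRZ_CNTL computed from the current
 * dynamic depth/stencil state, plus the matching RB_LRZ_CNTL enable bit.
 */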
void
tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state(cmd, a);

   tu6_write_lrz_reg(cmd, cs, pack_A6XX_GRAS_LRZ_CNTL(gras_lrz_cntl));
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));
}