/*
 * Copyright © 2022 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#include "tu_lrz.h"

#include "tu_clear_blit.h"
#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_image.h"

/* The low-resolution Z buffer is very similar to a depth prepass: it helps
 * the HW avoid executing the fragment shader on fragments that will be
 * discarded by the depth test afterwards.
 *
 * The interesting part of this feature is that it allows applications
 * to submit the vertices in any order.
 *
 * In the binning pass it is possible to store the depth value of each
 * vertex into the internal low-resolution depth buffer and quickly test
 * the primitives against it during the render pass.
 *
 * There are a number of cases where LRZ cannot be used:
 * - Fragment shader side-effects (writing to SSBOs, atomic operations, etc);
 * - Writing to the stencil buffer;
 * - Writing depth while:
 *   - Changing the direction of the depth test (e.g. from OP_GREATER to OP_LESS);
 *   - Using OP_ALWAYS or OP_NOT_EQUAL;
 * - Clearing depth with vkCmdClearAttachments;
 * - (pre-a650) Not clearing the depth attachment with LOAD_OP_CLEAR;
 * - (pre-a650) Using secondary command buffers;
 * - Sysmem rendering (with a small caveat).
 *
 * Pre-a650 (before gen3)
 * ======================
 *
 * The direction is fully tracked on the CPU. In a renderpass LRZ starts with
 * an unknown direction; the direction is set the first time a depth write
 * occurs, and if it changes afterwards the direction becomes invalid and LRZ
 * is disabled for the rest of the renderpass.
 *
 * Since the direction is not tracked by the GPU, it's impossible to know
 * whether LRZ is enabled during construction of secondary command buffers.
 *
 * For the same reason it's impossible to reuse LRZ between renderpasses.
 *
 * A650+ (gen3+)
 * =============
 *
 * Now the LRZ direction can be tracked on the GPU. There are two parts:
 * - A direction byte which stores the current LRZ direction;
 * - Parameters of the last used depth view.
 *
 * The idea is the same as when LRZ is tracked on the CPU: when GRAS_LRZ_CNTL
 * is used, its direction is compared to the previously known direction and
 * the direction byte is set to disabled when the directions are incompatible.
 *
 * Additionally, to reuse LRZ between renderpasses, GRAS_LRZ_CNTL checks
 * whether the current value of GRAS_LRZ_DEPTH_VIEW is equal to the value
 * stored in the buffer; if not, LRZ is disabled. (This is necessary
 * because the depth buffer may have several layers and mip levels, while
 * the LRZ buffer represents only a single layer + mip level.)
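 *
 * Roughly, the GPU-side check can be thought of as the following sketch
 * (pseudocode only; stored_depth_view, stored_dir and direction_incompatible
 * are names made up here to illustrate the text above, not actual HW state):
 *
 *   if (GRAS_LRZ_DEPTH_VIEW != stored_depth_view ||
 *       direction_incompatible(GRAS_LRZ_CNTL, stored_dir))
 *      stored_dir = CUR_DIR_DISABLED;  // LRZ stays off until an LRZ_CLEAR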
 *
 * The LRZ direction is disabled between renderpasses when the underlying
 * depth buffer is changed; the following commands can change the depth image:
 * - vkCmdBlitImage*
 * - vkCmdCopyBufferToImage*
 * - vkCmdCopyImage*
 *
 * LRZ Fast-Clear
 * ==============
 *
 * The LRZ fast-clear buffer is initialized to zeroes and read/written
 * when GRAS_LRZ_CNTL.FC_ENABLE (b3) is set. It appears to store one bit per
 * LRZ block: '0' means the block has the original depth clear value, and '1'
 * means that the corresponding block in LRZ has been modified.
 *
 * LRZ fast-clear conservatively clears the LRZ buffer: at the point where LRZ
 * is written, the LRZ block which corresponds to a single fast-clear bit is
 * cleared:
 * - To 0.0 if the depth comparison is GREATER;
 * - To 1.0 if the depth comparison is LESS.
 *
 * This way it's always valid to fast-clear. On the other hand we disable
 * fast-clear if the depth clear value is not 0.0 or 1.0 because it may be
 * worse for perf if some primitives are expected to fail the depth test
 * against the actual depth clear value.
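 *
 * As a rough sketch of the semantics described above (pseudocode only; the
 * block size and exact HW behaviour are not spelled out here and remain an
 * assumption):
 *
 *   read:   lrz_depth(block) = fc_bit(block) ? lrz_buffer[block]
 *                                            : depth_clear_value;
 *   write:  lrz_buffer[block] = new_value;  fc_bit(block) = 1;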
 *
 * LRZ Precision
 * =============
 *
 * LRZ always uses Z16_UNORM. The epsilon for it is 1.f / (1 << 16) which is
 * not enough to represent all values of Z32_UNORM or Z32_FLOAT.
 * This especially raises questions in the context of fast-clear: if
 * fast-clear uses a value which cannot be precisely represented by LRZ, we
 * wouldn't be able to round it in the correct direction since the direction
 * is tracked on the GPU.
 *
 * However, it seems that depth comparisons with LRZ values have some "slack"
 * and nothing special should be done for such depth clear values.
 *
 * How it was tested:
 * - Clear a Z32_FLOAT attachment to 1.f / (1 << 17)
 * - LRZ buffer contains all zeroes
 * - Do draws and check whether all samples are passing:
 *   - OP_GREATER with (1.f / (1 << 17) + float32_epsilon) - passing;
 *   - OP_GREATER with (1.f / (1 << 17) - float32_epsilon) - not passing;
 *   - OP_LESS with (1.f / (1 << 17) - float32_epsilon) - passing;
 *   - OP_LESS with (1.f / (1 << 17) + float32_epsilon) - not passing;
 *   - OP_LESS_OR_EQ with (1.f / (1 << 17) + float32_epsilon) - not passing;
 * In all cases the resulting LRZ buffer is all zeroes and the LRZ direction
 * is updated.
 *
 * LRZ Caches
 * ==========
 *
 * ! The policy here is to flush the LRZ cache right after it is changed,
 * so if LRZ data is needed afterwards there is no need to flush it
 * before using LRZ.
 *
 * LRZ_FLUSH flushes and invalidates the LRZ caches; there are two caches:
 * - Cache for the fast-clear buffer;
 * - Cache for the direction byte + depth view params.
 * They can be cleared by LRZ_CLEAR. To become visible in GPU memory
 * the caches should be flushed with LRZ_FLUSH afterwards.
 *
 * GRAS_LRZ_CNTL reads from these caches.
 */

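/* Point the HW at the LRZ buffer and fast-clear buffer of the given depth
 * image, or at nothing when depth_image is NULL.
 */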
static void
tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
{
   if (!depth_image) {
      tu_cs_emit_regs(cs,
                      A6XX_GRAS_LRZ_BUFFER_BASE(0),
                      A6XX_GRAS_LRZ_BUFFER_PITCH(0),
                      A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
      return;
   }

   uint64_t lrz_iova = depth_image->iova + depth_image->lrz_offset;
   uint64_t lrz_fc_iova = depth_image->iova + depth_image->lrz_fc_offset;
   if (!depth_image->lrz_fc_offset)
      lrz_fc_iova = 0;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_LRZ_BUFFER_BASE(.qword = lrz_iova),
                   A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = depth_image->lrz_pitch),
                   A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(.qword = lrz_fc_iova));
}

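/* Write an LRZ-related register: through CP_REG_WRITE with the LRZ tracker
 * on GPUs that need the lrz_track_quirk workaround, and as a plain register
 * write otherwise.
 */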
static void
tu6_write_lrz_reg(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                  struct tu_reg_value reg)
{
   if (cmd->device->physical_device->info->a6xx.lrz_track_quirk) {
      tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
      tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_LRZ));
      tu_cs_emit(cs, reg.reg);
      tu_cs_emit(cs, reg.value);
   } else {
      tu_cs_emit_pkt4(cs, reg.reg, 1);
      tu_cs_emit(cs, reg.value);
   }
}

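/* Disable LRZ on the GPU side by writing a depth view that cannot match any
 * real one, so that the GRAS_LRZ_DEPTH_VIEW comparison fails from now on.
 */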
static void
tu6_disable_lrz_via_depth_view(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* Disable direction by writing invalid depth view. */
   tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
      .base_layer = 0b11111111111,
      .layer_count = 0b11111111111,
      .base_mip_level = 0b1111,
   ));

   tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
      .enable = true,
      .disable_on_wrong_dir = true,
   ));

   tu6_emit_event_write(cmd, cs, LRZ_CLEAR);
   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
}

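/* Set up the initial LRZ state for a renderpass given its depth attachment:
 * record the image view, whether previous LRZ state can be reused, and
 * whether fast-clear and GPU direction tracking are available.
 */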
static void
tu_lrz_init_state(struct tu_cmd_buffer *cmd,
                  const struct tu_render_pass_attachment *att,
                  const struct tu_image_view *view)
{
   if (!view->image->lrz_height) {
      assert((cmd->device->instance->debug_flags & TU_DEBUG_NOLRZ) ||
             !vk_format_has_depth(att->format));
      return;
   }

   bool clears_depth = att->clear_mask &
      (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT);
   bool has_gpu_tracking =
      cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;

   if (!has_gpu_tracking && !clears_depth)
      return;

   /* We need to always have an LRZ view just to disable it if there is a
    * depth attachment, there are any secondaries, and GPU tracking is
    * enabled, in order not to rely on loadOp state which doesn't exist with
    * dynamic rendering in secondaries. Otherwise the secondary will have LRZ
    * enabled and there will be a NULL/garbage LRZ buffer.
    */
   cmd->state.lrz.image_view = view;

   if (!clears_depth && !att->load)
      return;

   cmd->state.lrz.valid = true;
   cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
   /* Be optimistic and unconditionally enable fast-clear in
    * secondary cmdbufs and when reusing previous LRZ state.
    */
   cmd->state.lrz.fast_clear = view->image->lrz_fc_size > 0;

   cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;
   cmd->state.lrz.reuse_previous_state = !clears_depth;
}

/* Note: if we enable LRZ here, then tu_lrz_init_state() must at least set
 * lrz.image_view, so that an LRZ buffer is present (even if LRZ is
 * dynamically disabled).
 */

static void
tu_lrz_init_secondary(struct tu_cmd_buffer *cmd,
                      const struct tu_render_pass_attachment *att)
{
   bool has_gpu_tracking =
      cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;

   if (!has_gpu_tracking)
      return;

   if (cmd->device->instance->debug_flags & TU_DEBUG_NOLRZ)
      return;

   if (!vk_format_has_depth(att->format))
      return;

   cmd->state.lrz.valid = true;
   cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
   cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;

   /* We may not have the depth attachment when executing in a secondary
    * inside a render pass. This means we have to be even more optimistic than
    * the normal case and enable fast clear even if the depth image doesn't
    * support it.
    */
   cmd->state.lrz.fast_clear = true;

   /* These are not used inside secondaries */
   cmd->state.lrz.image_view = NULL;
   cmd->state.lrz.reuse_previous_state = false;
}

/* This is generally the same as tu_lrz_begin_renderpass(), but we skip
 * actually emitting anything. The lrz state needs to be consistent between
 * renderpasses, but only the first should actually emit commands to disable
 * lrz etc.
 */
void
tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd,
                                const VkClearValue *clear_values)
{
   /* Track LRZ valid state */
   memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));

   uint32_t a;
   for (a = 0; a < cmd->state.pass->attachment_count; a++) {
      if (cmd->state.attachments[a]->image->lrz_height)
         break;
   }

   if (a != cmd->state.pass->attachment_count) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      tu_lrz_init_state(cmd, att, cmd->state.attachments[a]);
      if (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
         VkClearValue clear = clear_values[a];
         cmd->state.lrz.depth_clear_value = clear;
         cmd->state.lrz.fast_clear = cmd->state.lrz.fast_clear &&
                                     (clear.depthStencil.depth == 0.f ||
                                      clear.depthStencil.depth == 1.f);
      }
      cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
   }
}

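/* Set up LRZ state at the start of a renderpass and emit any commands needed
 * to invalidate LRZ up front, e.g. when several subpasses use different depth
 * attachments and per-buffer tracking would be required.
 */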
void
tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd,
                        const VkClearValue *clear_values)
{
   const struct tu_render_pass *pass = cmd->state.pass;

   int lrz_img_count = 0;
   for (unsigned i = 0; i < pass->attachment_count; i++) {
      if (cmd->state.attachments[i]->image->lrz_height)
         lrz_img_count++;
   }

   if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking &&
       cmd->state.pass->subpass_count > 1 && lrz_img_count > 1) {
      /* Theoretically we could switch between LRZ buffers during the binning
       * and tiling passes, but it is untested and would add complexity for a
       * presumably extremely rare case.
       */
      perf_debug(cmd->device,
                 "Invalidating LRZ because there are several subpasses with "
                 "different depth attachments in a single renderpass");

      for (unsigned i = 0; i < pass->attachment_count; i++) {
         struct tu_image *image = cmd->state.attachments[i]->image;
         tu_disable_lrz(cmd, &cmd->cs, image);
      }

      /* We need a valid LRZ fast-clear base, in case the render pass contents
       * are in secondaries that enable LRZ, so that they can read that LRZ is
       * dynamically disabled. It doesn't matter which we use, so just leave
       * the last one as emitted in tu_disable_lrz().
       */
      memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
      return;
   }

   /* Track LRZ valid state */
   tu_lrz_begin_resumed_renderpass(cmd, clear_values);

   if (!cmd->state.lrz.valid) {
      tu6_emit_lrz_buffer(&cmd->cs, NULL);
   }
}

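/* Set up LRZ state for a secondary command buffer executed inside a
 * renderpass, based on the subpass depth/stencil attachment.
 */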
void
tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd)
{
   memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
   uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   if (a != VK_ATTACHMENT_UNUSED) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      tu_lrz_init_secondary(cmd, att);
   }
}

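/* Emit LRZ setup at the start of the tiling (GMEM) path: point the HW at the
 * LRZ buffer, reuse or invalidate the previous state, and clear the LRZ
 * and/or fast-clear buffers as needed.
 */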
void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* TODO: If lrz was never valid for the entire renderpass, we could exit
    * early here. Sometimes we know this ahead of time and null out
    * image_view, but with LOAD_OP_DONT_CARE this only happens if there were
    * no secondaries.
    */
   if (!cmd->state.lrz.image_view)
      return;

   struct tu_lrz_state *lrz = &cmd->state.lrz;

   tu6_emit_lrz_buffer(cs, lrz->image_view->image);

   if (lrz->reuse_previous_state) {
      /* Reuse the previous LRZ state; the LRZ cache is assumed to have been
       * invalidated already by the previous renderpass.
       */
      assert(lrz->gpu_dir_tracking);

      tu6_write_lrz_reg(cmd, cs,
         A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
      return;
   }

   bool invalidate_lrz = !lrz->valid && lrz->gpu_dir_tracking;
   if (invalidate_lrz) {
      /* Following the blob we elect to disable LRZ for the whole renderpass
       * if it is known that LRZ is disabled somewhere in the renderpass.
       *
       * This is accomplished by making the later GRAS_LRZ_CNTL (in the
       * binning pass) fail the comparison of depth views.
       */
      tu6_disable_lrz_via_depth_view(cmd, cs);
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
   } else if (lrz->fast_clear || lrz->gpu_dir_tracking) {
      if (lrz->gpu_dir_tracking) {
         tu6_write_lrz_reg(cmd, cs,
            A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
      }

      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
         .enable = true,
         .fc_enable = lrz->fast_clear,
         .disable_on_wrong_dir = lrz->gpu_dir_tracking,
      ));

      /* GRAS_LRZ_CNTL.fc_enable + LRZ_CLEAR - clears the fast-clear buffer;
       * GRAS_LRZ_CNTL.disable_on_wrong_dir + LRZ_CLEAR - sets the direction
       * to CUR_DIR_UNSET.
       */
      tu6_emit_event_write(cmd, cs, LRZ_CLEAR);
   }

   if (!lrz->fast_clear && !invalidate_lrz) {
      tu6_clear_lrz(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);

      /* Even though we disable fast-clear we still have to dirty the
       * fast-clear buffer because both secondary cmdbufs and following
       * renderpasses won't know that fast-clear is disabled.
       *
       * TODO: we could avoid this if we don't store depth and don't
       * expect secondary cmdbufs.
       */
      if (lrz->image_view->image->lrz_fc_size) {
         tu6_dirty_lrz_fc(cmd, cs, lrz->image_view->image);
      }
   }
}

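/* Emit LRZ state at the end of the tiling path, so that the LRZ fast-clear
 * buffer and direction get flushed to memory for later renderpasses.
 */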
void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->state.lrz.fast_clear || cmd->state.lrz.gpu_dir_tracking) {
      tu6_emit_lrz_buffer(cs, cmd->state.lrz.image_view->image);

      if (cmd->state.lrz.gpu_dir_tracking) {
         tu6_write_lrz_reg(cmd, &cmd->cs,
            A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = cmd->state.lrz.image_view->view.GRAS_LRZ_DEPTH_VIEW));
      }

      /* Enable flushing of the LRZ fast-clear buffer and of the direction
       * buffer.
       */
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
         .enable = true,
         .fc_enable = cmd->state.lrz.fast_clear,
         .disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking,
      ));
   } else {
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(0));
   }

   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);

   /* If gpu_dir_tracking is enabled and LRZ is not valid, the blob at this
    * point additionally clears the direction buffer:
    *  GRAS_LRZ_DEPTH_VIEW(.dword = 0)
    *  GRAS_LRZ_DEPTH_VIEW(.dword = 0xffffffff)
    *  A6XX_GRAS_LRZ_CNTL(.enable = true, .disable_on_wrong_dir = true)
    *  LRZ_CLEAR
    *  LRZ_FLUSH
    * Since this happens after all of the rendering is done, there is no known
    * reason to do such a clear.
    */
}

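/* Emit LRZ setup for the sysmem (direct) rendering path. LRZ writes are not
 * useful here, so LRZ is either disabled via direction tracking or only
 * cleared so that the LRZ test still works.
 */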
void
tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (!cmd->state.lrz.image_view)
      return;

   /* Actually, the LRZ buffer could be filled in sysmem mode, in theory to
    * be used in another renderpass, but the benefit is rather dubious.
    */

   struct tu_lrz_state *lrz = &cmd->state.lrz;

   if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) {
      tu_disable_lrz(cmd, cs, lrz->image_view->image);
      /* Make sure the depth view comparison will fail. */
      tu6_write_lrz_reg(cmd, cs,
                        A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
   } else {
      tu6_emit_lrz_buffer(cs, lrz->image_view->image);
      /* Even though we disable LRZ writes in sysmem mode, there is still an
       * LRZ test, so LRZ should be cleared.
       */
      if (lrz->fast_clear) {
         tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
            .enable = true,
            .fc_enable = true,
         ));
         tu6_emit_event_write(cmd, &cmd->cs, LRZ_CLEAR);
         tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH);
      } else {
         tu6_clear_lrz(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
      }
   }
}

void
tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH);
}

/* Disable LRZ outside of a renderpass. */
void
tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
               struct tu_image *image)
{
   if (!cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   if (!image->lrz_height)
      return;

   tu6_emit_lrz_buffer(cs, image);
   tu6_disable_lrz_via_depth_view(cmd, cs);
}

/* Clear LRZ, used for depth clears outside of a renderpass. */
void
tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd,
                         struct tu_image *image,
                         const VkClearDepthStencilValue *pDepthStencil,
                         uint32_t rangeCount,
                         const VkImageSubresourceRange *pRanges)
{
   if (!rangeCount || !image->lrz_height ||
       !cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   /* We cannot predict which depth subresource would be used later on,
    * so we just pick the first one with depth cleared and clear the LRZ.
    */
   const VkImageSubresourceRange *range = NULL;
   for (unsigned i = 0; i < rangeCount; i++) {
      if (pRanges[i].aspectMask &
          (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
         range = &pRanges[i];
         break;
      }
   }

   if (!range)
      return;

   bool fast_clear = image->lrz_fc_size && (pDepthStencil->depth == 0.f ||
                                            pDepthStencil->depth == 1.f);

   tu6_emit_lrz_buffer(&cmd->cs, image);

   tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
      .base_layer = range->baseArrayLayer,
      .layer_count = vk_image_subresource_layer_count(&image->vk, range),
      .base_mip_level = range->baseMipLevel,
   ));

   tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
      .enable = true,
      .fc_enable = fast_clear,
      .disable_on_wrong_dir = true,
   ));

   tu6_emit_event_write(cmd, &cmd->cs, LRZ_CLEAR);
   tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH);

   if (!fast_clear) {
      tu6_clear_lrz(cmd, &cmd->cs, image, (const VkClearValue*) pDepthStencil);
   }
}

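/* Invalidate LRZ for the remainder of the current renderpass, updating the
 * GPU-side direction byte when direction tracking is in use.
 */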
void
tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd)
{
   assert(cmd->state.pass);

   cmd->state.lrz.valid = false;
   cmd->state.dirty |= TU_CMD_DIRTY_LRZ;

   if (cmd->state.lrz.gpu_dir_tracking) {
      tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
         .enable = true,
         .dir = LRZ_DIR_INVALID,
         .disable_on_wrong_dir = true,
      ));
   }
}

/* update lrz state based on stencil-test func:
 *
 * Conceptually the order of the pipeline is:
 *
 *
 *   FS -> Alpha-Test  ->  Stencil-Test  ->  Depth-Test
 *                              |                |
 *                       if wrmask != 0     if wrmask != 0
 *                              |                |
 *                              v                v
 *                        Stencil-Write      Depth-Write
 *
 * Because Stencil-Test can have side effects (Stencil-Write) prior
 * to depth test, in this case we potentially need to disable early
 * lrz-test. See:
 *
 * https://www.khronos.org/opengl/wiki/Per-Sample_Processing
 */
static bool
tu6_stencil_op_lrz_allowed(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
                           VkCompareOp func,
                           bool stencil_write)
{
   switch (func) {
   case VK_COMPARE_OP_ALWAYS:
      /* nothing to do for LRZ, but for stencil test when stencil-
       * write is enabled, we need to disable lrz-test, since
       * conceptually stencil test and write happen before depth-test.
       */
      if (stencil_write) {
         return false;
      }
      break;
   case VK_COMPARE_OP_NEVER:
      /* fragment never passes, disable lrz_write for this draw. */
      gras_lrz_cntl->lrz_write = false;
      break;
   default:
      /* whether the fragment passes or not depends on the result
       * of the stencil test, which we cannot know when doing the
       * binning pass.
       */
      gras_lrz_cntl->lrz_write = false;
      /* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side-
       * effects from the stencil test we need to disable lrz-test.
       */
      if (stencil_write) {
         return false;
      }
      break;
   }

   return true;
}

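/* Compute the GRAS_LRZ_CNTL value for the current draw state: depth/stencil
 * compare ops, write masks, blending state, and the tracked LRZ direction.
 * As a side effect this may invalidate LRZ for the rest of the renderpass.
 */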
static struct A6XX_GRAS_LRZ_CNTL
tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
                        const uint32_t a)
{
   struct tu_pipeline *pipeline = cmd->state.pipeline;
   bool z_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
   bool z_write_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
   bool z_read_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;
   bool z_bounds_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;
   VkCompareOp depth_compare_op = (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT;

   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };

   if (!cmd->state.lrz.valid) {
      return gras_lrz_cntl;
   }

   /* If the depth test is disabled we shouldn't touch LRZ.
    * Same if there is no depth attachment.
    */
   if (a == VK_ATTACHMENT_UNUSED || !z_test_enable ||
       (cmd->device->instance->debug_flags & TU_DEBUG_NOLRZ))
      return gras_lrz_cntl;

   if (!cmd->state.lrz.gpu_dir_tracking && !cmd->state.attachments) {
      /* Without on-gpu LRZ direction tracking there is nothing we
       * can do to enable LRZ in secondary command buffers.
       */
      return gras_lrz_cntl;
   }

   gras_lrz_cntl.enable = true;
   gras_lrz_cntl.lrz_write =
      z_write_enable &&
      !(pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_WRITE);
   gras_lrz_cntl.z_test_enable = z_read_enable && z_write_enable;
   gras_lrz_cntl.z_bounds_enable = z_bounds_enable;
   gras_lrz_cntl.fc_enable = cmd->state.lrz.fast_clear;
   gras_lrz_cntl.dir_write = cmd->state.lrz.gpu_dir_tracking;
   gras_lrz_cntl.disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking;

   /* See the comment in tu_pipeline about disabling LRZ write for blending. */
   if ((cmd->state.pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_LOGIC_OP)) &&
       cmd->state.logic_op_enabled && cmd->state.rop_reads_dst)
      gras_lrz_cntl.lrz_write = false;

   if ((cmd->state.pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE)) &&
       cmd->state.color_write_enable != MASK(cmd->state.pipeline->num_rts))
      gras_lrz_cntl.lrz_write = false;

   /* LRZ is disabled until it is cleared, which means that one "wrong"
    * depth test or shader could disable LRZ until the depth buffer is
    * cleared.
    */
   bool disable_lrz = false;
   bool temporary_disable_lrz = false;

   /* What happens in the FS could affect LRZ, e.g.: writes to gl_FragDepth
    * or early fragment tests.
    */
   if (pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_LRZ) {
      perf_debug(cmd->device, "Invalidating LRZ due to FS");
      disable_lrz = true;
   }

   /* If Z is not written it doesn't affect the LRZ buffer state.
    * Which means two things:
    * - Don't lock the direction until Z is written for the first time;
    * - If Z isn't written and the direction IS locked, it's possible to just
    *   temporarily disable LRZ instead of fully bailing out when the
    *   direction is changed.
    */

   enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
   switch (depth_compare_op) {
   case VK_COMPARE_OP_ALWAYS:
   case VK_COMPARE_OP_NOT_EQUAL:
      /* OP_ALWAYS and OP_NOT_EQUAL could have depth values going in any
       * direction, so if there is a depth write LRZ must be disabled.
       */
      if (z_write_enable) {
         perf_debug(cmd->device, "Invalidating LRZ due to ALWAYS/NOT_EQUAL");
         disable_lrz = true;
         gras_lrz_cntl.dir = LRZ_DIR_INVALID;
      } else {
         perf_debug(cmd->device, "Skipping LRZ due to ALWAYS/NOT_EQUAL");
         temporary_disable_lrz = true;
      }
      break;
   case VK_COMPARE_OP_EQUAL:
   case VK_COMPARE_OP_NEVER:
      /* The blob disables LRZ for OP_EQUAL, and from our empirical
       * evidence it is the right thing to do.
       *
       * Both OP_EQUAL and OP_NEVER don't change the LRZ buffer so
       * we can just temporarily disable LRZ.
       */
      temporary_disable_lrz = true;
      break;
   case VK_COMPARE_OP_GREATER:
   case VK_COMPARE_OP_GREATER_OR_EQUAL:
      lrz_direction = TU_LRZ_GREATER;
      gras_lrz_cntl.greater = true;
      gras_lrz_cntl.dir = LRZ_DIR_GE;
      break;
   case VK_COMPARE_OP_LESS:
   case VK_COMPARE_OP_LESS_OR_EQUAL:
      lrz_direction = TU_LRZ_LESS;
      gras_lrz_cntl.greater = false;
      gras_lrz_cntl.dir = LRZ_DIR_LE;
      break;
   default:
      unreachable("bad VK_COMPARE_OP value or uninitialized");
      break;
   }

   /* If the depthfunc direction is changed, bail out on using LRZ. The
    * LRZ buffer encodes a min/max depth value per block, but if
    * we switch from GT/GE <-> LT/LE, those values cannot be
    * interpreted properly.
    */
   if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
       lrz_direction != TU_LRZ_UNKNOWN &&
       cmd->state.lrz.prev_direction != lrz_direction) {
      if (z_write_enable) {
         perf_debug(cmd->device, "Invalidating LRZ due to direction change");
         disable_lrz = true;
      } else {
         perf_debug(cmd->device, "Skipping LRZ due to direction change");
         temporary_disable_lrz = true;
      }
   }

   /* Consider the following sequence of depthfunc changes:
    *
    * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_GREATER
    *   LRZ is disabled during COMPARE_OP_EQUAL but could be enabled
    *   during the second VK_COMPARE_OP_GREATER.
    *
    * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_LESS
    *   Here, LRZ is disabled during COMPARE_OP_EQUAL and should become
    *   invalid during COMPARE_OP_LESS.
    *
    * This shows that we should keep the last KNOWN direction.
    */
   if (z_write_enable && lrz_direction != TU_LRZ_UNKNOWN)
      cmd->state.lrz.prev_direction = lrz_direction;


   /* Invalidate LRZ and disable write if stencil test is enabled */
   bool stencil_test_enable = cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE;
   if (!disable_lrz && stencil_test_enable) {
      bool stencil_front_writemask =
         (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
         (cmd->state.dynamic_stencil_wrmask & 0xff) :
         (pipeline->stencil_wrmask & 0xff);

      bool stencil_back_writemask =
         (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
         ((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) :
         (pipeline->stencil_wrmask & 0xff00) >> 8;

      VkCompareOp stencil_front_compare_op =
         (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC__SHIFT;

      VkCompareOp stencil_back_compare_op =
         (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC_BF__SHIFT;

      bool lrz_allowed = true;
      lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
                                      &gras_lrz_cntl, stencil_front_compare_op,
                                      stencil_front_writemask);

      lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
                                      &gras_lrz_cntl, stencil_back_compare_op,
                                      stencil_back_writemask);

      /* Without a depth write it's enough to make sure that the depth test
       * is executed after the stencil test, so temporarily disabling LRZ is
       * enough.
       */
      if (!lrz_allowed) {
         if (z_write_enable) {
            perf_debug(cmd->device, "Invalidating LRZ due to stencil write");
            disable_lrz = true;
         } else {
            perf_debug(cmd->device, "Skipping LRZ due to stencil write");
            temporary_disable_lrz = true;
         }
      }
   }

   if (disable_lrz)
      cmd->state.lrz.valid = false;

   if (disable_lrz && cmd->state.lrz.gpu_dir_tracking) {
      /* The direction byte on the GPU should be set to CUR_DIR_DISABLED;
       * for this it's not enough to emit an empty GRAS_LRZ_CNTL.
       */
      gras_lrz_cntl.enable = true;
      gras_lrz_cntl.dir = LRZ_DIR_INVALID;

      return gras_lrz_cntl;
   }

   if (temporary_disable_lrz)
      gras_lrz_cntl.enable = false;

   cmd->state.lrz.enabled = cmd->state.lrz.valid && gras_lrz_cntl.enable;
   if (!cmd->state.lrz.enabled)
      memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));

   return gras_lrz_cntl;
}

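/* Emit GRAS_LRZ_CNTL and RB_LRZ_CNTL for the current draw state. */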
void
tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state(cmd, a);

   tu6_write_lrz_reg(cmd, cs, pack_A6XX_GRAS_LRZ_CNTL(gras_lrz_cntl));
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));
}