/*
 * Copyright © 2022 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#include "tu_lrz.h"

#include "tu_clear_blit.h"
#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_image.h"

#include "common/freedreno_gpu_event.h"

/* See lrz.rst for how the HW works. These are only the implementation notes.
 *
 * There are a number of situations where LRZ cannot be used:
 * - Fragment shader side-effects (writing to SSBOs, atomic operations, etc.);
 * - Writing to the stencil buffer;
 * - Writing depth while:
 *   - Changing the direction of the depth test (e.g. from OP_GREATER to OP_LESS);
 *   - Using OP_ALWAYS or OP_NOT_EQUAL;
 * - Clearing depth with vkCmdClearAttachments;
 * - (pre-a650) Not clearing the depth attachment with LOAD_OP_CLEAR;
 * - (pre-a650) Using secondary command buffers;
 * - Sysmem rendering (with a small caveat).
 *
 * A650+ (gen3+)
 * =============
 *
 * While LRZ could be reused between renderpasses, it is disabled when the
 * underlying depth buffer is changed.
 * The following commands could change a depth image:
 * - vkCmdBlitImage*
 * - vkCmdCopyBufferToImage*
 * - vkCmdCopyImage*
 *
 * LRZ Fast-Clear
 * ==============
 *
 * It is always valid to fast-clear. On the other hand, we disable fast-clear
 * if the depth clear value is not 0.0 or 1.0, because it may hurt performance
 * when some primitives are expected to fail the depth test against the actual
 * depth clear value (see the illustrative sketch right after this comment).
 *
 * LRZ Caches
 * ==========
 *
 * ! The policy here is to flush the LRZ cache right after it is changed,
 * so if LRZ data is needed afterwards there is no need to flush it
 * before using LRZ.
 */
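
/* Illustrative sketch, not part of the driver logic: the fast-clear rule
 * above boils down to the same check tu_lrz_begin_resumed_renderpass() does
 * on the attachment clear value. Roughly (the helper name is hypothetical):
 *
 *    bool lrz_fast_clear_allowed(float depth_clear_value)
 *    {
 *       // Only the two canonical clear values keep fast-clear enabled;
 *       // e.g. clearing depth to 0.5f still uses LRZ, just without the
 *       // fast-clear path.
 *       return depth_clear_value == 0.0f || depth_clear_value == 1.0f;
 *    }
 */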

static void
tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
{
   if (!depth_image) {
      tu_cs_emit_regs(cs,
                      A6XX_GRAS_LRZ_BUFFER_BASE(0),
                      A6XX_GRAS_LRZ_BUFFER_PITCH(0),
                      A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
      return;
   }

   uint64_t lrz_iova = depth_image->iova + depth_image->lrz_offset;
   uint64_t lrz_fc_iova = depth_image->iova + depth_image->lrz_fc_offset;
   if (!depth_image->lrz_fc_offset)
      lrz_fc_iova = 0;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_LRZ_BUFFER_BASE(.qword = lrz_iova),
                   A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = depth_image->lrz_pitch),
                   A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(.qword = lrz_fc_iova));
}

static void
tu6_write_lrz_reg(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                  struct tu_reg_value reg)
{
   if (cmd->device->physical_device->info->a6xx.lrz_track_quirk) {
      tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
      tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_LRZ));
      tu_cs_emit(cs, reg.reg);
      tu_cs_emit(cs, reg.value);
   } else {
      tu_cs_emit_pkt4(cs, reg.reg, 1);
      tu_cs_emit(cs, reg.value);
   }
}

static void
tu6_disable_lrz_via_depth_view(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* Disable direction by writing an invalid depth view: any later depth
    * view comparison will fail, which keeps LRZ disabled.
    */
   tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
      .base_layer = 0b11111111111,
      .layer_count = 0b11111111111,
      .base_mip_level = 0b1111,
   ));

   tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
      .enable = true,
      .disable_on_wrong_dir = true,
   ));

   tu_emit_event_write<A6XX>(cmd, cs, FD_LRZ_CLEAR);
   tu_emit_event_write<A6XX>(cmd, cs, FD_LRZ_FLUSH);
}

static void
tu_lrz_init_state(struct tu_cmd_buffer *cmd,
                  const struct tu_render_pass_attachment *att,
                  const struct tu_image_view *view)
{
   if (!view->image->lrz_height) {
      assert(!cmd->device->use_lrz || !vk_format_has_depth(att->format));
      return;
   }

   bool clears_depth = att->clear_mask &
      (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT);
   bool has_gpu_tracking =
      cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;

   if (!has_gpu_tracking && !clears_depth)
      return;

   /* If there is a depth attachment, there are any secondaries, and GPU
    * tracking is enabled, we always need an LRZ view just to be able to
    * disable LRZ, in order not to rely on loadOp state, which doesn't exist
    * with dynamic rendering in secondaries. Otherwise the secondary would
    * have LRZ enabled with a NULL/garbage LRZ buffer.
    */
   cmd->state.lrz.image_view = view;

   if (!clears_depth && !att->load)
      return;

   cmd->state.lrz.valid = true;
   cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
   /* Be optimistic and unconditionally enable fast-clear in
    * secondary cmdbufs and when reusing previous LRZ state.
    */
   cmd->state.lrz.fast_clear = view->image->lrz_fc_size > 0;

   cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;
   cmd->state.lrz.reuse_previous_state = !clears_depth;
}

/* Note: if we enable LRZ here, then tu_lrz_init_state() must at least set
 * lrz.image_view, so that an LRZ buffer is present (even if LRZ is
 * dynamically disabled).
 */

static void
tu_lrz_init_secondary(struct tu_cmd_buffer *cmd,
                      const struct tu_render_pass_attachment *att)
{
   bool has_gpu_tracking =
      cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;

   if (!has_gpu_tracking)
      return;

   if (!cmd->device->use_lrz)
      return;

   if (!vk_format_has_depth(att->format))
      return;

   cmd->state.lrz.valid = true;
   cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
   cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;

   /* We may not have the depth attachment when executing in a secondary
    * inside a render pass. This means we have to be even more optimistic than
    * the normal case and enable fast clear even if the depth image doesn't
    * support it.
    */
   cmd->state.lrz.fast_clear = true;

   /* These are not used inside secondaries */
   cmd->state.lrz.image_view = NULL;
   cmd->state.lrz.reuse_previous_state = false;
}

/* This is generally the same as tu_lrz_begin_renderpass(), but we skip
 * actually emitting anything. The LRZ state needs to be consistent between
 * renderpasses, but only the first should actually emit commands to disable
 * LRZ etc.
 */
void
tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd)
{
   /* Track LRZ valid state */
   memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));

   uint32_t a;
   for (a = 0; a < cmd->state.pass->attachment_count; a++) {
      if (cmd->state.attachments[a]->image->lrz_height)
         break;
   }

   if (a != cmd->state.pass->attachment_count) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      tu_lrz_init_state(cmd, att, cmd->state.attachments[a]);
      if (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
         VkClearValue clear = cmd->state.clear_values[a];
         cmd->state.lrz.depth_clear_value = clear;
         cmd->state.lrz.fast_clear = cmd->state.lrz.fast_clear &&
                                     (clear.depthStencil.depth == 0.f ||
                                      clear.depthStencil.depth == 1.f);
      }
      cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
   }
}

void
tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd)
{
   const struct tu_render_pass *pass = cmd->state.pass;

   int lrz_img_count = 0;
   for (unsigned i = 0; i < pass->attachment_count; i++) {
      if (cmd->state.attachments[i]->image->lrz_height)
         lrz_img_count++;
   }

   if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking &&
       cmd->state.pass->subpass_count > 1 && lrz_img_count > 1) {
      /* Theoretically we could switch between LRZ buffers during the binning
       * and tiling passes, but it is untested and would add complexity for a
       * presumably extremely rare case.
       */
      perf_debug(cmd->device,
                 "Invalidating LRZ because there are several subpasses with "
                 "different depth attachments in a single renderpass");

      for (unsigned i = 0; i < pass->attachment_count; i++) {
         struct tu_image *image = cmd->state.attachments[i]->image;
         tu_disable_lrz(cmd, &cmd->cs, image);
      }

      /* We need a valid LRZ fast-clear base, in case the render pass contents
       * are in secondaries that enable LRZ, so that they can read that LRZ is
       * dynamically disabled. It doesn't matter which one we use, so just
       * leave the last one as emitted in tu_disable_lrz().
       */
      memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
      return;
   }

   /* Track LRZ valid state */
   tu_lrz_begin_resumed_renderpass(cmd);

   if (!cmd->state.lrz.valid) {
      tu6_emit_lrz_buffer(&cmd->cs, NULL);
   }
}

void
tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd)
{
   memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
   uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   if (a != VK_ATTACHMENT_UNUSED) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      tu_lrz_init_secondary(cmd, att);
   }
}

void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* TODO: If LRZ was never valid for the entire renderpass, we could exit
    * early here. Sometimes we know this ahead of time and null out
    * image_view, but with LOAD_OP_DONT_CARE this only happens if there were
    * no secondaries.
    */
   if (!cmd->state.lrz.image_view)
      return;

   struct tu_lrz_state *lrz = &cmd->state.lrz;

   tu6_emit_lrz_buffer(cs, lrz->image_view->image);

   if (lrz->reuse_previous_state) {
      /* Reuse previous LRZ state; the LRZ cache is assumed to have been
       * invalidated by the previous renderpass already.
       */
      assert(lrz->gpu_dir_tracking);

      tu6_write_lrz_reg(cmd, cs,
         A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
      return;
   }

   bool invalidate_lrz = !lrz->valid && lrz->gpu_dir_tracking;
   if (invalidate_lrz) {
      /* Following the blob, we elect to disable LRZ for the whole renderpass
       * if it is known that LRZ is disabled somewhere in the renderpass.
       *
       * This is accomplished by making the later GRAS_LRZ_CNTL (in the
       * binning pass) fail the comparison of depth views.
       */
      tu6_disable_lrz_via_depth_view(cmd, cs);
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
   } else if (lrz->fast_clear || lrz->gpu_dir_tracking) {
      if (lrz->gpu_dir_tracking) {
         tu6_write_lrz_reg(cmd, cs,
            A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
      }

      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
         .enable = true,
         .fc_enable = lrz->fast_clear,
         .disable_on_wrong_dir = lrz->gpu_dir_tracking,
      ));

      /* LRZ_CLEAR.fc_enable + LRZ_CLEAR - clears the fast-clear buffer;
       * LRZ_CLEAR.disable_on_wrong_dir + LRZ_CLEAR - sets the direction to
       *  CUR_DIR_UNSET.
       */
      tu_emit_event_write<A6XX>(cmd, cs, FD_LRZ_CLEAR);
   }

   if (!lrz->fast_clear && !invalidate_lrz) {
      tu6_clear_lrz<A6XX>(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
      /* Even though we disable fast-clear we still have to dirty the
       * fast-clear buffer, because both secondary cmdbufs and following
       * renderpasses won't know that fast-clear is disabled.
       *
       * TODO: we could avoid this if we don't store depth and don't
       * expect secondary cmdbufs.
       */
      if (lrz->image_view->image->lrz_fc_size) {
         tu6_dirty_lrz_fc<A6XX>(cmd, cs, lrz->image_view->image);
      }
   }
}

void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->state.lrz.fast_clear || cmd->state.lrz.gpu_dir_tracking) {
      tu6_emit_lrz_buffer(cs, cmd->state.lrz.image_view->image);

      if (cmd->state.lrz.gpu_dir_tracking) {
         tu6_write_lrz_reg(cmd, &cmd->cs,
            A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = cmd->state.lrz.image_view->view.GRAS_LRZ_DEPTH_VIEW));
      }

      /* Enable flushing of the LRZ fast-clear and direction buffers. */
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
         .enable = true,
         .fc_enable = cmd->state.lrz.fast_clear,
         .disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking,
      ));
   } else {
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(0));
   }

   tu_emit_event_write<A6XX>(cmd, cs, FD_LRZ_FLUSH);

   /* If gpu_dir_tracking is enabled and LRZ is not valid, the blob at this
    * point additionally clears the direction buffer:
    *  GRAS_LRZ_DEPTH_VIEW(.dword = 0)
    *  GRAS_LRZ_DEPTH_VIEW(.dword = 0xffffffff)
    *  A6XX_GRAS_LRZ_CNTL(.enable = true, .disable_on_wrong_dir = true)
    *  LRZ_CLEAR
    *  LRZ_FLUSH
    * Since this happens after all of the rendering is done, there is no known
    * reason to do such a clear.
    */
}

void
tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (!cmd->state.lrz.image_view)
      return;

   /* The LRZ buffer could, in theory, be filled in sysmem mode and then be
    * used in another renderpass, but the benefit is rather dubious.
    */

   struct tu_lrz_state *lrz = &cmd->state.lrz;

   if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) {
      tu_disable_lrz(cmd, cs, lrz->image_view->image);
      /* Make sure the depth view comparison will fail. */
      tu6_write_lrz_reg(cmd, cs,
         A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
   } else {
      tu6_emit_lrz_buffer(cs, lrz->image_view->image);
      /* Even though we disable LRZ writes in sysmem mode, the LRZ test still
       * runs, so LRZ should be cleared.
       */
      if (lrz->fast_clear) {
         tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
            .enable = true,
            .fc_enable = true,
         ));
         tu_emit_event_write<A6XX>(cmd, &cmd->cs, FD_LRZ_CLEAR);
         tu_emit_event_write<A6XX>(cmd, &cmd->cs, FD_LRZ_FLUSH);
      } else {
         tu6_clear_lrz<A6XX>(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
      }
   }
}

void
tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_emit_event_write<A6XX>(cmd, &cmd->cs, FD_LRZ_FLUSH);
}

/* Disable LRZ outside of a renderpass. */
void
tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
               struct tu_image *image)
{
   if (!cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   if (!image->lrz_height)
      return;

   tu6_emit_lrz_buffer(cs, image);
   tu6_disable_lrz_via_depth_view(cmd, cs);
}

/* Clear LRZ, used for out-of-renderpass depth clears. */
void
tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd,
                         struct tu_image *image,
                         const VkClearDepthStencilValue *pDepthStencil,
                         uint32_t rangeCount,
                         const VkImageSubresourceRange *pRanges)
{
   if (!rangeCount || !image->lrz_height ||
       !cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   /* We cannot predict which depth subresource will be used later on,
    * so we just pick the first one with depth cleared and clear the LRZ.
    */
   const VkImageSubresourceRange *range = NULL;
   for (unsigned i = 0; i < rangeCount; i++) {
      if (pRanges[i].aspectMask &
            (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
         range = &pRanges[i];
         break;
      }
   }

   if (!range)
      return;

   bool fast_clear = image->lrz_fc_size && (pDepthStencil->depth == 0.f ||
                                            pDepthStencil->depth == 1.f);

   tu6_emit_lrz_buffer(&cmd->cs, image);

   tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
         .base_layer = range->baseArrayLayer,
         .layer_count = vk_image_subresource_layer_count(&image->vk, range),
         .base_mip_level = range->baseMipLevel,
   ));

   tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
      .enable = true,
      .fc_enable = fast_clear,
      .disable_on_wrong_dir = true,
   ));

   tu_emit_event_write<A6XX>(cmd, &cmd->cs, FD_LRZ_CLEAR);
   tu_emit_event_write<A6XX>(cmd, &cmd->cs, FD_LRZ_FLUSH);

   if (!fast_clear) {
      tu6_clear_lrz<A6XX>(cmd, &cmd->cs, image, (const VkClearValue*) pDepthStencil);
   }
}

void
tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd)
{
   assert(cmd->state.pass);

   cmd->state.lrz.valid = false;
   cmd->state.dirty |= TU_CMD_DIRTY_LRZ;

   if (cmd->state.lrz.gpu_dir_tracking) {
      tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
         .enable = true,
         .dir = LRZ_DIR_INVALID,
         .disable_on_wrong_dir = true,
      ));
   }
}

/* Update LRZ state based on the stencil-test func:
 *
 * Conceptually the order of the pipeline is:
 *
 *
 *   FS -> Alpha-Test  ->  Stencil-Test  ->  Depth-Test
 *                              |                |
 *                       if wrmask != 0     if wrmask != 0
 *                              |                |
 *                              v                v
 *                        Stencil-Write      Depth-Write
 *
 * Because the stencil test can have side effects (stencil writes) prior
 * to the depth test, in that case we potentially need to disable the early
 * lrz-test. See:
 *
 * https://www.khronos.org/opengl/wiki/Per-Sample_Processing
 */
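/* For example (illustrative only, mirroring the switch in
 * tu6_stencil_op_lrz_allowed() below): for a VkStencilOpState with a
 * non-zero writeMask,
 *
 *    .compareOp = VK_COMPARE_OP_ALWAYS -> the lrz-test must be disabled for
 *                                         this draw, since stencil writes
 *                                         land before the depth test;
 *    .compareOp = VK_COMPARE_OP_NEVER  -> fragments never pass, so only
 *                                         lrz_write is turned off;
 *    .compareOp = VK_COMPARE_OP_LESS   -> the outcome is unknown at binning
 *                                         time, so lrz_write is turned off
 *                                         and the lrz-test is disabled too.
 */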
static bool
tu6_stencil_op_lrz_allowed(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
                           VkCompareOp func,
                           bool stencil_write)
{
   switch (func) {
   case VK_COMPARE_OP_ALWAYS:
      /* nothing to do for LRZ itself, but when stencil write is enabled we
       * need to disable the lrz-test, since conceptually the stencil test
       * and write happen before the depth test.
       */
      if (stencil_write) {
         return false;
      }
      break;
   case VK_COMPARE_OP_NEVER:
      /* fragment never passes, disable lrz_write for this draw. */
      gras_lrz_cntl->lrz_write = false;
      break;
   default:
      /* whether the fragment passes or not depends on the result
       * of the stencil test, which we cannot know when doing the
       * binning pass.
       */
      gras_lrz_cntl->lrz_write = false;
      /* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side
       * effects from the stencil test we need to disable the lrz-test.
       */
      if (stencil_write) {
         return false;
      }
      break;
   }

   return true;
}

static struct A6XX_GRAS_LRZ_CNTL
tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
                        const uint32_t a)
{
   const struct tu_shader *fs = cmd->state.shaders[MESA_SHADER_FRAGMENT];
   bool z_test_enable = cmd->vk.dynamic_graphics_state.ds.depth.test_enable;
   bool z_write_enable = cmd->vk.dynamic_graphics_state.ds.depth.write_enable;
   bool z_bounds_enable = cmd->vk.dynamic_graphics_state.ds.depth.bounds_test.enable;
   VkCompareOp depth_compare_op =
      cmd->vk.dynamic_graphics_state.ds.depth.compare_op;

   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };

   if (!cmd->state.lrz.valid) {
      return gras_lrz_cntl;
   }

   /* If the depth test is disabled we shouldn't touch LRZ.
    * Same if there is no depth attachment.
    */
   if (a == VK_ATTACHMENT_UNUSED || !z_test_enable || !cmd->device->use_lrz)
      return gras_lrz_cntl;

   if (!cmd->state.lrz.gpu_dir_tracking && !cmd->state.attachments) {
      /* Without on-GPU LRZ direction tracking there is nothing we
       * can do to enable LRZ in secondary command buffers.
       */
      return gras_lrz_cntl;
   }

   /* See the comment in tu_pipeline about disabling LRZ write for blending. */
   bool reads_dest = cmd->state.blend_reads_dest;

   gras_lrz_cntl.enable = true;
   gras_lrz_cntl.lrz_write =
      z_write_enable &&
      !reads_dest &&
      !(fs->fs.lrz.status & TU_LRZ_FORCE_DISABLE_WRITE);
   gras_lrz_cntl.z_test_enable = z_write_enable;
   gras_lrz_cntl.z_bounds_enable = z_bounds_enable;
   gras_lrz_cntl.fc_enable = cmd->state.lrz.fast_clear;
   gras_lrz_cntl.dir_write = cmd->state.lrz.gpu_dir_tracking;
   gras_lrz_cntl.disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking;

   /* LRZ is disabled until it is cleared, which means that one "wrong"
    * depth test or shader could disable LRZ until the depth buffer is
    * cleared again.
    */
   bool disable_lrz = false;
   bool temporary_disable_lrz = false;

   /* What happens in the FS could affect LRZ, e.g. writes to gl_FragDepth or
    * early fragment tests. We have to skip LRZ testing and updating, but as
    * long as the depth direction stayed the same we can continue with LRZ
    * testing later.
    */
   if (fs->fs.lrz.status & TU_LRZ_FORCE_DISABLE_LRZ) {
      if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN || !cmd->state.lrz.gpu_dir_tracking) {
         perf_debug(cmd->device, "Skipping LRZ due to FS");
         temporary_disable_lrz = true;
      } else {
         perf_debug(cmd->device, "Disabling LRZ due to FS (TODO: fix for gpu-direction-tracking case)");
         disable_lrz = true;
      }
   }

   /* If Z is not written, it doesn't affect the LRZ buffer state.
    * That means two things:
    * - Don't lock the direction until Z is written for the first time;
    * - If Z isn't written and the direction IS locked, it is possible to just
    *   temporarily disable LRZ instead of fully bailing out when the direction
    *   is changed.
    */

   enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
   switch (depth_compare_op) {
   case VK_COMPARE_OP_ALWAYS:
   case VK_COMPARE_OP_NOT_EQUAL:
      /* OP_ALWAYS and OP_NOT_EQUAL could write depth values going in any
       * direction, so if there is a depth write LRZ must be disabled.
       */
      if (z_write_enable) {
         perf_debug(cmd->device, "Invalidating LRZ due to ALWAYS/NOT_EQUAL");
         disable_lrz = true;
         gras_lrz_cntl.dir = LRZ_DIR_INVALID;
      } else {
         perf_debug(cmd->device, "Skipping LRZ due to ALWAYS/NOT_EQUAL");
         temporary_disable_lrz = true;
      }
      break;
   case VK_COMPARE_OP_EQUAL:
   case VK_COMPARE_OP_NEVER:
      /* The blob disables LRZ for OP_EQUAL, and from our empirical
       * evidence it is the right thing to do.
       *
       * Both OP_EQUAL and OP_NEVER don't change the LRZ buffer, so
       * we can just temporarily disable LRZ.
       */
      temporary_disable_lrz = true;
      break;
   case VK_COMPARE_OP_GREATER:
   case VK_COMPARE_OP_GREATER_OR_EQUAL:
      lrz_direction = TU_LRZ_GREATER;
      gras_lrz_cntl.greater = true;
      gras_lrz_cntl.dir = LRZ_DIR_GE;
      break;
   case VK_COMPARE_OP_LESS:
   case VK_COMPARE_OP_LESS_OR_EQUAL:
      lrz_direction = TU_LRZ_LESS;
      gras_lrz_cntl.greater = false;
      gras_lrz_cntl.dir = LRZ_DIR_LE;
      break;
   default:
      unreachable("bad VK_COMPARE_OP value or uninitialized");
      break;
   };

   /* If the depthfunc direction is changed, bail out on using LRZ. The
    * LRZ buffer encodes a min/max depth value per block, but if
    * we switch from GT/GE <-> LT/LE, those values cannot be
    * interpreted properly.
    */
   if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
       lrz_direction != TU_LRZ_UNKNOWN &&
       cmd->state.lrz.prev_direction != lrz_direction) {
      if (z_write_enable) {
         perf_debug(cmd->device, "Invalidating LRZ due to direction change");
         disable_lrz = true;
      } else {
         perf_debug(cmd->device, "Skipping LRZ due to direction change");
         temporary_disable_lrz = true;
      }
   }

   /* Consider the following sequences of depthfunc changes:
    *
    * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_GREATER
    * LRZ is disabled during COMPARE_OP_EQUAL but could be enabled
    * during the second COMPARE_OP_GREATER.
    *
    * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_LESS
    * Here, LRZ is disabled during COMPARE_OP_EQUAL and should become
    * invalid during COMPARE_OP_LESS.
    *
    * This shows that we should keep the last KNOWN direction.
    */
   if (z_write_enable && lrz_direction != TU_LRZ_UNKNOWN)
      cmd->state.lrz.prev_direction = lrz_direction;

   /* Invalidate LRZ and disable write if the stencil test is enabled */
   bool stencil_test_enable = cmd->vk.dynamic_graphics_state.ds.stencil.test_enable;
   if (!disable_lrz && stencil_test_enable) {
      VkCompareOp stencil_front_compare_op = (VkCompareOp)
         cmd->vk.dynamic_graphics_state.ds.stencil.front.op.compare;

      VkCompareOp stencil_back_compare_op = (VkCompareOp)
         cmd->vk.dynamic_graphics_state.ds.stencil.back.op.compare;

      bool lrz_allowed = true;
      lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
                                      &gras_lrz_cntl, stencil_front_compare_op,
                                      cmd->state.stencil_front_write);

      lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
                                      &gras_lrz_cntl, stencil_back_compare_op,
                                      cmd->state.stencil_back_write);

      /* Without a depth write it's enough to make sure that the depth test
       * is executed after the stencil test, so temporarily disabling LRZ is
       * enough.
       */
      if (!lrz_allowed) {
         if (z_write_enable) {
            perf_debug(cmd->device, "Invalidating LRZ due to stencil write");
            disable_lrz = true;
         } else {
            perf_debug(cmd->device, "Skipping LRZ due to stencil write");
            temporary_disable_lrz = true;
         }
      }
   }

   /* Writing depth with blend enabled means we need to invalidate LRZ,
    * because the written depth value could mean that a later draw with
    * depth enabled (where we would otherwise write LRZ) could have
    * fragments which don't pass the depth test due to this draw.  For
    * example, consider this sequence of draws, with depth mode GREATER:
    *
    *   draw A:
    *     z=0.1, fragments pass
    *   draw B:
    *     z=0.4, fragments pass
    *     blend enabled (LRZ write disabled)
    *     depth write enabled
    *   draw C:
    *     z=0.2, fragments don't pass
    *     blend disabled
    *     depth write enabled
    *
    * Normally, looking at the state in draw C, we'd assume we could
    * enable LRZ write.  But this would cause early-z/lrz to discard
    * fragments from draw A which should be visible due to draw B.
    */
   if (reads_dest && z_write_enable && cmd->device->instance->conservative_lrz) {
      perf_debug(cmd->device, "Invalidating LRZ due to blend+depthwrite");
      disable_lrz = true;
   }

   if (disable_lrz)
      cmd->state.lrz.valid = false;

   if (disable_lrz && cmd->state.lrz.gpu_dir_tracking) {
      /* The direction byte on the GPU should be set to CUR_DIR_DISABLED;
       * for that it is not enough to emit an empty GRAS_LRZ_CNTL.
       */
      gras_lrz_cntl.enable = true;
      gras_lrz_cntl.dir = LRZ_DIR_INVALID;

      return gras_lrz_cntl;
   }

   if (temporary_disable_lrz)
      gras_lrz_cntl.enable = false;

   cmd->state.lrz.enabled = cmd->state.lrz.valid && gras_lrz_cntl.enable;
   if (!cmd->state.lrz.enabled)
      memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));

   return gras_lrz_cntl;
}

void
tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state(cmd, a);

   tu6_write_lrz_reg(cmd, cs, pack_A6XX_GRAS_LRZ_CNTL(gras_lrz_cntl));
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));
}