/*
 * Copyright © 2022 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#include "tu_lrz.h"

#include "tu_clear_blit.h"
#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_image.h"

/* The low-resolution Z (LRZ) buffer is very similar to a depth prepass: it
 * helps the HW avoid executing the fragment shader on fragments that will
 * later be discarded by the depth test.
 *
 * The interesting part of this feature is that it allows applications
 * to submit the vertices in any order.
 *
 * In the binning pass the depth value of each vertex can be stored into an
 * internal low-resolution depth buffer, and the primitives can then be
 * quickly tested against it during the render pass.
 *
 * There are a number of cases in which LRZ cannot be used:
 * - Fragment shader side-effects (writing to SSBOs, atomic operations, etc.);
 * - Writing to the stencil buffer;
 * - Writing depth while:
 *   - Changing the direction of the depth test (e.g. from OP_GREATER to OP_LESS);
 *   - Using OP_ALWAYS or OP_NOT_EQUAL;
 * - Clearing depth with vkCmdClearAttachments;
 * - (pre-a650) Not clearing the depth attachment with LOAD_OP_CLEAR;
 * - (pre-a650) Using secondary command buffers;
 * - Sysmem rendering (with a small caveat).
 *
 * Pre-a650 (before gen3)
 * ======================
 *
 * The direction is fully tracked on the CPU. In a renderpass LRZ starts
 * with an unknown direction; the direction is set the first time a depth
 * write occurs, and if it changes afterwards the direction becomes invalid
 * and LRZ is disabled for the rest of the renderpass.
 *
 * Since the direction is not tracked by the GPU, it's impossible to know
 * whether LRZ is enabled during construction of secondary command buffers.
 *
 * For the same reason it's impossible to reuse LRZ between renderpasses.
 *
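 * As an illustration only (hypothetical helper, not the actual driver
 * code), the CPU-side tracking boils down to a tiny state machine:
 *
 *    enum lrz_dir { DIR_UNKNOWN, DIR_LESS, DIR_GREATER, DIR_INVALID };
 *
 *    static void
 *    track_lrz_dir_on_cpu(enum lrz_dir *dir, enum lrz_dir draw_dir,
 *                         bool z_write)
 *    {
 *       if (!z_write || draw_dir == DIR_UNKNOWN)
 *          return;              // no depth write -> direction stays unlocked
 *       if (*dir == DIR_UNKNOWN)
 *          *dir = draw_dir;     // first depth write locks the direction
 *       else if (*dir != draw_dir)
 *          *dir = DIR_INVALID;  // direction changed -> LRZ stays disabled
 *    }
 *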
 * A650+ (gen3+)
 * =============
 *
 * Now the LRZ direction can be tracked on the GPU. There are two parts:
 * - A direction byte which stores the current LRZ direction;
 * - The parameters of the last used depth view.
 *
 * The idea is the same as when LRZ is tracked on the CPU: when GRAS_LRZ_CNTL
 * is used, its direction is compared to the previously known direction,
 * and the direction byte is set to disabled when the directions are
 * incompatible.
 *
 * Additionally, to reuse LRZ between renderpasses, GRAS_LRZ_CNTL checks
 * whether the current value of GRAS_LRZ_DEPTH_VIEW is equal to the value
 * stored in the buffer; if not, LRZ is disabled. (This is necessary
 * because the depth buffer may have several layers and mip levels, while
 * the LRZ buffer represents only a single layer + mip level.)
 *
 * The LRZ direction carried between renderpasses is disabled when the
 * underlying depth buffer is changed; the following commands could change
 * the depth image:
 * - vkCmdBlitImage*
 * - vkCmdCopyBufferToImage*
 * - vkCmdCopyImage*
 *
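 * Conceptually (a sketch with hypothetical names, not the real memory
 * layout) the GPU-tracked metadata amounts to:
 *
 *    struct lrz_gpu_tracking {
 *       uint8_t  dir;         // e.g. unset / less / greater / disabled
 *       uint32_t depth_view;  // last GRAS_LRZ_DEPTH_VIEW value
 *    };
 *
 *    // On GRAS_LRZ_CNTL with disable_on_wrong_dir set:
 *    //   - an incompatible dir marks the byte as disabled;
 *    //   - a mismatching depth_view disables LRZ as well.
 *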
 * LRZ Fast-Clear
 * ==============
 *
 * The LRZ fast-clear buffer is initialized to zeroes and read/written
 * when GRAS_LRZ_CNTL.FC_ENABLE (b3) is set. It appears to store one bit
 * per block. '0' means the block still has the original depth clear value,
 * and '1' means that the corresponding block in LRZ has been modified.
 *
 * LRZ fast-clear conservatively clears the LRZ buffer: at the point where
 * LRZ is written, the LRZ block which corresponds to a single fast-clear
 * bit is cleared:
 * - To 0.0 if the depth comparison is GREATER;
 * - To 1.0 if the depth comparison is LESS.
 *
 * This way it's always valid to fast-clear. On the other hand, we disable
 * fast-clear if the depth clear value is not 0.0 or 1.0, because it may be
 * worse for perf if some primitives are expected to fail the depth test
 * against the actual depth clear value.
 *
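 * In other words, the driver-side eligibility check reduces to something
 * like this (a sketch, hypothetical helper name):
 *
 *    static bool
 *    lrz_fast_clear_allowed(bool has_fc_buffer, float depth_clear_value)
 *    {
 *       // Only exact 0.0/1.0 clears match the value the HW assumes for
 *       // untouched (bit == 0) fast-clear blocks.
 *       return has_fc_buffer &&
 *              (depth_clear_value == 0.0f || depth_clear_value == 1.0f);
 *    }
 *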
 * LRZ Precision
 * =============
 *
 * LRZ always uses Z16_UNORM. Its epsilon is 1.f / (1 << 16), which is
 * not enough to represent all values of Z32_UNORM or Z32_FLOAT.
 * This especially raises questions in the context of fast-clear: if
 * fast-clear uses a value which cannot be precisely represented by LRZ,
 * we wouldn't be able to round it in the correct direction since the
 * direction is tracked on the GPU.
 *
 * However, it seems that depth comparisons with LRZ values have some "slack"
 * and nothing special should be done for such depth clear values.
 *
 * How it was tested:
 * - Clear a Z32_FLOAT attachment to 1.f / (1 << 17)
 *   - LRZ buffer contains all zeroes
 * - Do draws and check whether all samples are passing:
 *   - OP_GREATER with (1.f / (1 << 17) + float32_epsilon) - passing;
 *   - OP_GREATER with (1.f / (1 << 17) - float32_epsilon) - not passing;
 *   - OP_LESS with (1.f / (1 << 17) - float32_epsilon) - passing;
 *   - OP_LESS with (1.f / (1 << 17) + float32_epsilon) - not passing;
 *   - OP_LESS_OR_EQ with (1.f / (1 << 17) + float32_epsilon) - not passing;
 * In all cases the resulting LRZ buffer is all zeroes and the LRZ direction
 * is updated.
 *
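 * As a quick worked example of the quantization involved (round-to-nearest
 * Z16_UNORM, purely for illustration):
 *
 *    float clear = 1.0f / (1 << 17);                      // ~7.63e-6
 *    uint16_t z16 = (uint16_t)(clear * 65535.0f + 0.5f);
 *    // clear * 65535 ~= 0.49999, so z16 == 0 and the LRZ value is 0.0,
 *    // matching the "LRZ buffer contains all zeroes" observation above.
 *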
 * LRZ Caches
 * ==========
 *
 * ! The policy here is to flush the LRZ cache right after it is changed,
 * so if LRZ data is needed afterwards there is no need to flush it
 * before using LRZ.
 *
 * LRZ_FLUSH flushes and invalidates the LRZ caches; there are two caches:
 * - A cache for the fast-clear buffer;
 * - A cache for the direction byte + depth view params.
 * They can be cleared by LRZ_CLEAR. To become visible in GPU memory
 * the caches should be flushed with LRZ_FLUSH afterwards.
 *
 * GRAS_LRZ_CNTL reads from these caches.
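 *
 * Hence the recurring sequence in this file: write the LRZ registers
 * describing what to clear, emit LRZ_CLEAR to clear the caches, then emit
 * LRZ_FLUSH so the result becomes visible in memory, e.g.:
 *
 *    tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(...));
 *    tu6_emit_event_write(cmd, cs, LRZ_CLEAR);
 *    tu6_emit_event_write(cmd, cs, LRZ_FLUSH);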
 */

static void
tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
{
   if (!depth_image) {
      tu_cs_emit_regs(cs,
                      A6XX_GRAS_LRZ_BUFFER_BASE(0),
                      A6XX_GRAS_LRZ_BUFFER_PITCH(0),
                      A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
      return;
   }

   uint64_t lrz_iova = depth_image->iova + depth_image->lrz_offset;
   uint64_t lrz_fc_iova = depth_image->iova + depth_image->lrz_fc_offset;
   if (!depth_image->lrz_fc_offset)
      lrz_fc_iova = 0;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_LRZ_BUFFER_BASE(.qword = lrz_iova),
                   A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = depth_image->lrz_pitch),
                   A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(.qword = lrz_fc_iova));
}

static void
tu6_write_lrz_reg(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                  struct tu_reg_value reg)
{
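   /* On GPUs with the LRZ tracking quirk, LRZ register writes go through
    * CP_REG_WRITE with the TRACK_LRZ tracker (presumably so the CP can
    * track them); otherwise a plain register write is sufficient.
    */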
   if (cmd->device->physical_device->info->a6xx.lrz_track_quirk) {
      tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
      tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_LRZ));
      tu_cs_emit(cs, reg.reg);
      tu_cs_emit(cs, reg.value);
   } else {
      tu_cs_emit_pkt4(cs, reg.reg, 1);
      tu_cs_emit(cs, reg.value);
   }
}

static void
tu6_disable_lrz_via_depth_view(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* Disable direction by writing invalid depth view. */
   tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
      .base_layer = 0b11111111111,
      .layer_count = 0b11111111111,
      .base_mip_level = 0b1111,
   ));

   tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
      .enable = true,
      .disable_on_wrong_dir = true,
   ));

   tu6_emit_event_write(cmd, cs, LRZ_CLEAR);
   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
}

static void
tu_lrz_init_state(struct tu_cmd_buffer *cmd,
                  const struct tu_render_pass_attachment *att,
                  const struct tu_image_view *view)
{
   if (!view->image->lrz_height) {
      assert((cmd->device->instance->debug_flags & TU_DEBUG_NOLRZ) ||
             !vk_format_has_depth(att->format));
      return;
   }

   bool clears_depth = att->clear_mask &
      (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT);
   bool has_gpu_tracking =
      cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;

   if (!has_gpu_tracking && !clears_depth)
      return;

   /* We need to always have an LRZ view just to disable it if there is a
    * depth attachment, there are any secondaries, and GPU tracking is
    * enabled, in order not to rely on loadOp state which doesn't exist with
    * dynamic rendering in secondaries. Otherwise the secondary will have LRZ
    * enabled and there will be a NULL/garbage LRZ buffer.
    */
   cmd->state.lrz.image_view = view;

   if (!clears_depth && !att->load)
      return;

   cmd->state.lrz.valid = true;
   cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
   /* Be optimistic and unconditionally enable fast-clear in
    * secondary cmdbufs and when reusing previous LRZ state.
    */
   cmd->state.lrz.fast_clear = view->image->lrz_fc_size > 0;

   cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;
   cmd->state.lrz.reuse_previous_state = !clears_depth;
}

/* Note: if we enable LRZ here, then tu_lrz_init_state() must at least set
 * lrz.image_view, so that an LRZ buffer is present (even if LRZ is
 * dynamically disabled).
 */

static void
tu_lrz_init_secondary(struct tu_cmd_buffer *cmd,
                      const struct tu_render_pass_attachment *att)
{
   bool has_gpu_tracking =
      cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;

   if (!has_gpu_tracking)
      return;

   if (cmd->device->instance->debug_flags & TU_DEBUG_NOLRZ)
      return;

   if (!vk_format_has_depth(att->format))
      return;

   cmd->state.lrz.valid = true;
   cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
   cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;

   /* We may not have the depth attachment when executing in a secondary
    * inside a render pass. This means we have to be even more optimistic than
    * the normal case and enable fast clear even if the depth image doesn't
    * support it.
    */
   cmd->state.lrz.fast_clear = true;

   /* These are not used inside secondaries */
   cmd->state.lrz.image_view = NULL;
   cmd->state.lrz.reuse_previous_state = false;
}

/* This is generally the same as tu_lrz_begin_renderpass(), but we skip
 * actually emitting anything. The lrz state needs to be consistent between
 * renderpasses, but only the first should actually emit commands to disable
 * lrz etc.
 */
void
tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd,
                                const VkClearValue *clear_values)
{
   /* Track LRZ valid state */
   memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));

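   /* Find the first attachment that has an LRZ buffer allocated; it is the
    * one whose state seeds LRZ tracking for this render pass.
    */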
   uint32_t a;
   for (a = 0; a < cmd->state.pass->attachment_count; a++) {
      if (cmd->state.attachments[a]->image->lrz_height)
         break;
   }

   if (a != cmd->state.pass->attachment_count) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      tu_lrz_init_state(cmd, att, cmd->state.attachments[a]);
      if (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
         VkClearValue clear = clear_values[a];
         cmd->state.lrz.depth_clear_value = clear;
         cmd->state.lrz.fast_clear = cmd->state.lrz.fast_clear &&
                                     (clear.depthStencil.depth == 0.f ||
                                      clear.depthStencil.depth == 1.f);
      }
      cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
   }
}

void
tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd,
                        const VkClearValue *clear_values)
{
   const struct tu_render_pass *pass = cmd->state.pass;

   int lrz_img_count = 0;
   for (unsigned i = 0; i < pass->attachment_count; i++) {
      if (cmd->state.attachments[i]->image->lrz_height)
         lrz_img_count++;
   }

   if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking &&
       cmd->state.pass->subpass_count > 1 && lrz_img_count > 1) {
      /* Theoretically we could switch between LRZ buffers during the binning
       * and tiling passes, but it is untested and would add complexity for
       * a presumably extremely rare case.
       */
      perf_debug(cmd->device,
                 "Invalidating LRZ because there are several subpasses with "
                 "different depth attachments in a single renderpass");

      for (unsigned i = 0; i < pass->attachment_count; i++) {
         struct tu_image *image = cmd->state.attachments[i]->image;
         tu_disable_lrz(cmd, &cmd->cs, image);
      }

      /* We need a valid LRZ fast-clear base, in case the render pass contents
       * are in secondaries that enable LRZ, so that they can read that LRZ is
       * dynamically disabled. It doesn't matter which we use, so just leave
       * the last one as emitted in tu_disable_lrz().
       */
      memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
      return;
   }

   /* Track LRZ valid state */
   tu_lrz_begin_resumed_renderpass(cmd, clear_values);

   if (!cmd->state.lrz.valid) {
      tu6_emit_lrz_buffer(&cmd->cs, NULL);
   }
}


void
tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd)
{
   memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
   uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   if (a != VK_ATTACHMENT_UNUSED) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      tu_lrz_init_secondary(cmd, att);
   }
}

void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* TODO: If lrz was never valid for the entire renderpass, we could exit
    * early here. Sometimes we know this ahead of time and null out
    * image_view, but with LOAD_OP_DONT_CARE this only happens if there were
    * no secondaries.
    */
   if (!cmd->state.lrz.image_view)
      return;

   struct tu_lrz_state *lrz = &cmd->state.lrz;

   tu6_emit_lrz_buffer(cs, lrz->image_view->image);

   if (lrz->reuse_previous_state) {
      /* Reuse previous LRZ state, LRZ cache is assumed to be
       * already invalidated by previous renderpass.
       */
      assert(lrz->gpu_dir_tracking);

      tu6_write_lrz_reg(cmd, cs,
         A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
      return;
   }

   bool invalidate_lrz = !lrz->valid && lrz->gpu_dir_tracking;
   if (invalidate_lrz) {
      /* Following the blob we elect to disable LRZ for the whole renderpass
       * if it is known that LRZ is disabled somewhere in the renderpass.
       *
       * This is accomplished by making the later GRAS_LRZ_CNTL (in the
       * binning pass) fail the comparison of depth views.
       */
      tu6_disable_lrz_via_depth_view(cmd, cs);
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
   } else if (lrz->fast_clear || lrz->gpu_dir_tracking) {
      if (lrz->gpu_dir_tracking) {
         tu6_write_lrz_reg(cmd, cs,
            A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
      }

      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
         .enable = true,
         .fc_enable = lrz->fast_clear,
         .disable_on_wrong_dir = lrz->gpu_dir_tracking,
      ));

      /* LRZ_CLEAR with fc_enable set clears the fast-clear buffer;
       * LRZ_CLEAR with disable_on_wrong_dir set sets the direction to
       * CUR_DIR_UNSET.
       */
      tu6_emit_event_write(cmd, cs, LRZ_CLEAR);
   }

   if (!lrz->fast_clear && !invalidate_lrz) {
      tu6_clear_lrz(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);

      /* Even though we disable fast-clear we still have to dirty the
       * fast-clear buffer, because both secondary cmdbufs and following
       * renderpasses won't know that fast-clear is disabled.
       *
       * TODO: we could avoid this if we don't store depth and don't
       * expect secondary cmdbufs.
       */
      if (lrz->image_view->image->lrz_fc_size) {
         tu6_dirty_lrz_fc(cmd, cs, lrz->image_view->image);
      }
   }
}

void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->state.lrz.fast_clear || cmd->state.lrz.gpu_dir_tracking) {
      tu6_emit_lrz_buffer(cs, cmd->state.lrz.image_view->image);

      if (cmd->state.lrz.gpu_dir_tracking) {
         tu6_write_lrz_reg(cmd, &cmd->cs,
            A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = cmd->state.lrz.image_view->view.GRAS_LRZ_DEPTH_VIEW));
      }

      /* Enable flushing of LRZ fast-clear and of direction buffer */
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
         .enable = true,
         .fc_enable = cmd->state.lrz.fast_clear,
         .disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking,
      ));
   } else {
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(0));
   }

   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);

   /* If gpu_dir_tracking is enabled and LRZ is not valid, the blob at this
    * point additionally clears the direction buffer:
    *  GRAS_LRZ_DEPTH_VIEW(.dword = 0)
    *  GRAS_LRZ_DEPTH_VIEW(.dword = 0xffffffff)
    *  A6XX_GRAS_LRZ_CNTL(.enable = true, .disable_on_wrong_dir = true)
    *  LRZ_CLEAR
    *  LRZ_FLUSH
    * Since this happens after all of the rendering is done, there is no
    * known reason to do such a clear.
    */
}

void
tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (!cmd->state.lrz.image_view)
      return;

   /* Actually, the LRZ buffer could be filled in sysmem, in theory to
    * be used in another renderpass, but the benefit is rather dubious.
    */

   struct tu_lrz_state *lrz = &cmd->state.lrz;

   if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) {
      tu_disable_lrz(cmd, cs, lrz->image_view->image);
      /* Make sure depth view comparison will fail. */
      tu6_write_lrz_reg(cmd, cs,
         A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
   } else {
      tu6_emit_lrz_buffer(cs, lrz->image_view->image);
      /* Even though we disable LRZ writes in sysmem mode, there is still
       * the LRZ test, so LRZ should be cleared.
       */
      if (lrz->fast_clear) {
         tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
            .enable = true,
            .fc_enable = true,
         ));
         tu6_emit_event_write(cmd, &cmd->cs, LRZ_CLEAR);
         tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH);
      } else {
         tu6_clear_lrz(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
      }
   }
}

void
tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH);
}

/* Disable LRZ outside of renderpass. */
void
tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
               struct tu_image *image)
{
   if (!cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   if (!image->lrz_height)
      return;

   tu6_emit_lrz_buffer(cs, image);
   tu6_disable_lrz_via_depth_view(cmd, cs);
}

/* Clear LRZ, used for out-of-renderpass depth clears. */
void
tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd,
                         struct tu_image *image,
                         const VkClearDepthStencilValue *pDepthStencil,
                         uint32_t rangeCount,
                         const VkImageSubresourceRange *pRanges)
{
   if (!rangeCount || !image->lrz_height ||
       !cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   /* We cannot predict which depth subresource will be used later on,
    * so we just pick the first one whose depth is cleared and clear the LRZ.
    */
   const VkImageSubresourceRange *range = NULL;
   for (unsigned i = 0; i < rangeCount; i++) {
      if (pRanges[i].aspectMask &
            (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
         range = &pRanges[i];
         break;
      }
   }

   if (!range)
      return;

   bool fast_clear = image->lrz_fc_size && (pDepthStencil->depth == 0.f ||
                                            pDepthStencil->depth == 1.f);

   tu6_emit_lrz_buffer(&cmd->cs, image);

   tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
         .base_layer = range->baseArrayLayer,
         .layer_count = vk_image_subresource_layer_count(&image->vk, range),
         .base_mip_level = range->baseMipLevel,
   ));

   tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
      .enable = true,
      .fc_enable = fast_clear,
      .disable_on_wrong_dir = true,
   ));

   tu6_emit_event_write(cmd, &cmd->cs, LRZ_CLEAR);
   tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH);

   if (!fast_clear) {
      tu6_clear_lrz(cmd, &cmd->cs, image, (const VkClearValue*) pDepthStencil);
   }
}

void
tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd)
{
   assert(cmd->state.pass);

   cmd->state.lrz.valid = false;
   cmd->state.dirty |= TU_CMD_DIRTY_LRZ;

   if (cmd->state.lrz.gpu_dir_tracking) {
      tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
         .enable = true,
         .dir = LRZ_DIR_INVALID,
         .disable_on_wrong_dir = true,
      ));
   }
}

/* Update the LRZ state based on the stencil-test func:
 *
 * Conceptually the order of the pipeline is:
 *
 *
 *   FS -> Alpha-Test  ->  Stencil-Test  ->  Depth-Test
 *                              |                |
 *                       if wrmask != 0     if wrmask != 0
 *                              |                |
 *                              v                v
 *                        Stencil-Write      Depth-Write
 *
 * Because the stencil test can have side effects (stencil write) prior
 * to the depth test, in this case we potentially need to disable the
 * early lrz-test. See:
 *
 * https://www.khronos.org/opengl/wiki/Per-Sample_Processing
 */
static bool
tu6_stencil_op_lrz_allowed(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
                           VkCompareOp func,
                           bool stencil_write)
{
   switch (func) {
   case VK_COMPARE_OP_ALWAYS:
      /* nothing to do for LRZ, but for the stencil test, when stencil
       * write is enabled, we need to disable the lrz-test, since
       * conceptually the stencil test and write happen before the depth test.
       */
      if (stencil_write) {
         return false;
      }
      break;
   case VK_COMPARE_OP_NEVER:
      /* fragment never passes, disable lrz_write for this draw. */
      gras_lrz_cntl->lrz_write = false;
      break;
   default:
      /* whether the fragment passes or not depends on the result
       * of the stencil test, which we cannot know when doing the
       * binning pass.
       */
      gras_lrz_cntl->lrz_write = false;
      /* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side
       * effects from the stencil test we need to disable the lrz-test.
       */
      if (stencil_write) {
         return false;
      }
      break;
   }

   return true;
}

static struct A6XX_GRAS_LRZ_CNTL
tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
                        const uint32_t a)
{
   struct tu_pipeline *pipeline = cmd->state.pipeline;
   bool z_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
   bool z_write_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
   bool z_read_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;
   bool z_bounds_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;
   VkCompareOp depth_compare_op = (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT;

   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };

   if (!cmd->state.lrz.valid) {
      return gras_lrz_cntl;
   }

   /* If the depth test is disabled we shouldn't touch LRZ.
    * Same if there is no depth attachment.
    */
   if (a == VK_ATTACHMENT_UNUSED || !z_test_enable ||
       (cmd->device->instance->debug_flags & TU_DEBUG_NOLRZ))
      return gras_lrz_cntl;

   if (!cmd->state.lrz.gpu_dir_tracking && !cmd->state.attachments) {
      /* Without on-gpu LRZ direction tracking there is nothing we
       * can do to enable LRZ in secondary command buffers.
       */
      return gras_lrz_cntl;
   }

   gras_lrz_cntl.enable = true;
   gras_lrz_cntl.lrz_write =
      z_write_enable &&
      !(pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_WRITE);
   gras_lrz_cntl.z_test_enable = z_read_enable && z_write_enable;
   gras_lrz_cntl.z_bounds_enable = z_bounds_enable;
   gras_lrz_cntl.fc_enable = cmd->state.lrz.fast_clear;
   gras_lrz_cntl.dir_write = cmd->state.lrz.gpu_dir_tracking;
   gras_lrz_cntl.disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking;

   /* See comment in tu_pipeline about disabling LRZ write for blending. */
   if ((cmd->state.pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_LOGIC_OP)) &&
       cmd->state.logic_op_enabled && cmd->state.rop_reads_dst)
      gras_lrz_cntl.lrz_write = false;

   if ((cmd->state.pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE)) &&
        cmd->state.color_write_enable != MASK(cmd->state.pipeline->num_rts))
      gras_lrz_cntl.lrz_write = false;

   /* Once disabled, LRZ stays disabled until it is cleared, which means
    * that one "wrong" depth test or shader could disable LRZ until the
    * depth buffer is cleared again.
    */
   bool disable_lrz = false;
   bool temporary_disable_lrz = false;

   /* What happens in the FS could affect LRZ, e.g.: writes to gl_FragDepth
    * or early fragment tests.
    */
   if (pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_LRZ) {
      perf_debug(cmd->device, "Invalidating LRZ due to FS");
      disable_lrz = true;
   }

   /* If Z is not written, it doesn't affect the LRZ buffer state.
    * This means two things:
    * - Don't lock the direction until Z is written for the first time;
    * - If Z isn't written and the direction IS locked, it's possible to just
    *   temporarily disable LRZ instead of fully bailing out when the
    *   direction is changed.
    */

   enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
   switch (depth_compare_op) {
   case VK_COMPARE_OP_ALWAYS:
   case VK_COMPARE_OP_NOT_EQUAL:
      /* OP_ALWAYS and OP_NOT_EQUAL could produce depth values going in any
       * direction, so if there is a depth write, LRZ must be disabled.
       */
      if (z_write_enable) {
         perf_debug(cmd->device, "Invalidating LRZ due to ALWAYS/NOT_EQUAL");
         disable_lrz = true;
         gras_lrz_cntl.dir = LRZ_DIR_INVALID;
      } else {
         perf_debug(cmd->device, "Skipping LRZ due to ALWAYS/NOT_EQUAL");
         temporary_disable_lrz = true;
      }
      break;
   case VK_COMPARE_OP_EQUAL:
   case VK_COMPARE_OP_NEVER:
      /* The blob disables LRZ for OP_EQUAL, and from our empirical
       * evidence it is the right thing to do.
       *
       * Neither OP_EQUAL nor OP_NEVER changes the LRZ buffer, so
       * we can just temporarily disable LRZ.
       */
      temporary_disable_lrz = true;
      break;
   case VK_COMPARE_OP_GREATER:
   case VK_COMPARE_OP_GREATER_OR_EQUAL:
      lrz_direction = TU_LRZ_GREATER;
      gras_lrz_cntl.greater = true;
      gras_lrz_cntl.dir = LRZ_DIR_GE;
      break;
   case VK_COMPARE_OP_LESS:
   case VK_COMPARE_OP_LESS_OR_EQUAL:
      lrz_direction = TU_LRZ_LESS;
      gras_lrz_cntl.greater = false;
      gras_lrz_cntl.dir = LRZ_DIR_LE;
      break;
   default:
      unreachable("bad VK_COMPARE_OP value or uninitialized");
      break;
   };

   /* If the depthfunc direction is changed, bail out on using LRZ. The
    * LRZ buffer encodes a min/max depth value per block, but if
    * we switch from GT/GE <-> LT/LE, those values cannot be
    * interpreted properly.
    */
   if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
       lrz_direction != TU_LRZ_UNKNOWN &&
       cmd->state.lrz.prev_direction != lrz_direction) {
      if (z_write_enable) {
         perf_debug(cmd->device, "Invalidating LRZ due to direction change");
         disable_lrz = true;
      } else {
         perf_debug(cmd->device, "Skipping LRZ due to direction change");
         temporary_disable_lrz = true;
      }
   }

   /* Consider the following sequence of depthfunc changes:
    *
    * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_GREATER
    * LRZ is disabled during COMPARE_OP_EQUAL but could be enabled again
    * during the second COMPARE_OP_GREATER.
    *
    * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_LESS
    * Here, LRZ is disabled during COMPARE_OP_EQUAL and should become
    * invalid during COMPARE_OP_LESS.
    *
    * This shows that we should keep the last KNOWN direction.
    */
   if (z_write_enable && lrz_direction != TU_LRZ_UNKNOWN)
      cmd->state.lrz.prev_direction = lrz_direction;

   /* Invalidate LRZ and disable write if the stencil test is enabled */
   bool stencil_test_enable = cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE;
   if (!disable_lrz && stencil_test_enable) {
      bool stencil_front_writemask =
         (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
         (cmd->state.dynamic_stencil_wrmask & 0xff) :
         (pipeline->stencil_wrmask & 0xff);

      bool stencil_back_writemask =
         (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
         ((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) :
         (pipeline->stencil_wrmask & 0xff00) >> 8;

      VkCompareOp stencil_front_compare_op =
         (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC__SHIFT;

      VkCompareOp stencil_back_compare_op =
         (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC_BF__SHIFT;

      bool lrz_allowed = true;
      lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
                                      &gras_lrz_cntl, stencil_front_compare_op,
                                      stencil_front_writemask);

      lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
                                      &gras_lrz_cntl, stencil_back_compare_op,
                                      stencil_back_writemask);

      /* Without a depth write it's enough to make sure that the depth test
       * is executed after the stencil test, so temporarily disabling LRZ
       * is enough.
       */
      if (!lrz_allowed) {
         if (z_write_enable) {
            perf_debug(cmd->device, "Invalidating LRZ due to stencil write");
            disable_lrz = true;
         } else {
            perf_debug(cmd->device, "Skipping LRZ due to stencil write");
            temporary_disable_lrz = true;
         }
      }
   }

   if (disable_lrz)
      cmd->state.lrz.valid = false;

   if (disable_lrz && cmd->state.lrz.gpu_dir_tracking) {
      /* The direction byte on the GPU should be set to CUR_DIR_DISABLED;
       * for this it's not enough to emit an empty GRAS_LRZ_CNTL.
       */
      gras_lrz_cntl.enable = true;
      gras_lrz_cntl.dir = LRZ_DIR_INVALID;

      return gras_lrz_cntl;
   }

   if (temporary_disable_lrz)
      gras_lrz_cntl.enable = false;

   cmd->state.lrz.enabled = cmd->state.lrz.valid && gras_lrz_cntl.enable;
   if (!cmd->state.lrz.enabled)
      memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));

   return gras_lrz_cntl;
}

void
tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state(cmd, a);

   tu6_write_lrz_reg(cmd, cs, pack_A6XX_GRAS_LRZ_CNTL(gras_lrz_cntl));
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));
}