/*
 * Copyright © 2022 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#include "tu_lrz.h"

#include "tu_clear_blit.h"
#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_image.h"

#include "common/freedreno_gpu_event.h"
#include "common/freedreno_lrz.h"

/* See lrz.rst for how the HW works. Here are only the implementation notes.
 *
 * There are a number of limitations when LRZ cannot be used:
 * - Fragment shader side-effects (writing to SSBOs, atomic operations, etc.);
 * - Writing to the stencil buffer;
 * - Writing depth while:
 *   - Changing the direction of the depth test (e.g. from OP_GREATER to OP_LESS);
 *   - Using OP_ALWAYS or OP_NOT_EQUAL;
 * - Clearing depth with vkCmdClearAttachments;
 * - (pre-a650) Not clearing the depth attachment with LOAD_OP_CLEAR;
 * - (pre-a650) Using secondary command buffers;
 * - Sysmem rendering (with a small caveat).
 *
 * A650+ (gen3+)
 * =============
 *
 * While LRZ could be reused between renderpasses, it is disabled when the
 * underlying depth buffer is changed.
 * The following commands could change a depth image:
 * - vkCmdBlitImage*
 * - vkCmdCopyBufferToImage*
 * - vkCmdCopyImage*
 *
 * LRZ Fast-Clear
 * ==============
 *
 * It is always valid to fast-clear. On the other hand, we disable
 * fast-clear if the depth clear value is not 0.0 or 1.0, because it may hurt
 * performance if some primitives are expected to fail the depth test against
 * the actual depth clear value.
 *
 * LRZ Caches
 * ==========
 *
 * ! The policy here is to flush the LRZ cache right after it is changed,
 * so if LRZ data is needed afterwards there is no need to flush it
 * before using LRZ.
 */
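
/* Illustrative sketch only (not part of the driver logic): an API-level
 * sequence, assuming standard Vulkan dynamic state, that hits the
 * "changing direction of depth test while writing depth" limitation above.
 * The draws themselves are hypothetical.
 *
 *    vkCmdSetDepthCompareOp(cmd, VK_COMPARE_OP_GREATER);
 *    vkCmdDraw(cmd, ...);   // depth write locks the LRZ direction to GE
 *    vkCmdSetDepthCompareOp(cmd, VK_COMPARE_OP_LESS);
 *    vkCmdDraw(cmd, ...);   // direction flips while depth writes are on:
 *                           // LRZ has to be invalidated for the rest of
 *                           // the renderpass
 *
 * tu6_calculate_lrz_state() below detects this per draw.
 */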

static inline void
tu_lrz_disable_reason(struct tu_cmd_buffer *cmd, const char *reason) {
   cmd->state.rp.lrz_disable_reason = reason;
   cmd->state.rp.lrz_disabled_at_draw = cmd->state.rp.drawcall_count;
   perf_debug(cmd->device, "Disabling LRZ because '%s' at draw %u", reason,
              cmd->state.rp.lrz_disabled_at_draw);
}

template <chip CHIP>
static void
tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
{
   if (!depth_image) {
      tu_cs_emit_regs(cs,
                      A6XX_GRAS_LRZ_BUFFER_BASE(0),
                      A6XX_GRAS_LRZ_BUFFER_PITCH(0),
                      A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));

      if (CHIP >= A7XX)
         tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO());

      return;
   }

   uint64_t lrz_iova = depth_image->iova + depth_image->lrz_layout.lrz_offset;
   uint64_t lrz_fc_iova =
      depth_image->iova + depth_image->lrz_layout.lrz_fc_offset;
   if (!depth_image->lrz_layout.lrz_fc_offset)
      lrz_fc_iova = 0;

   tu_cs_emit_regs(
      cs, A6XX_GRAS_LRZ_BUFFER_BASE(.qword = lrz_iova),
      A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = depth_image->lrz_layout.lrz_pitch,
                                 .array_pitch =
                                    depth_image->lrz_layout.lrz_layer_size),
      A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(.qword = lrz_fc_iova));

   if (CHIP >= A7XX) {
      tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO(
         .depth_format = tu6_pipe2depth(depth_image->vk.format)
      ));
   }
}

static void
tu6_write_lrz_reg(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                  struct tu_reg_value reg)
{
   if (cmd->device->physical_device->info->a6xx.lrz_track_quirk) {
      tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
      tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_LRZ));
      tu_cs_emit(cs, reg.reg);
      tu_cs_emit(cs, reg.value);
   } else {
      tu_cs_emit_pkt4(cs, reg.reg, 1);
      tu_cs_emit(cs, reg.value);
   }
}

template <chip CHIP>
static void
tu6_write_lrz_cntl(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                   struct A6XX_GRAS_LRZ_CNTL cntl)
{
   if (CHIP >= A7XX) {
      // A7XX splits LRZ_CNTL into two separate registers.
      struct tu_reg_value cntl2 = A7XX_GRAS_LRZ_CNTL2(
         .disable_on_wrong_dir = cntl.disable_on_wrong_dir,
         .fc_enable = cntl.fc_enable,
      );
      cntl.disable_on_wrong_dir = false;
      cntl.fc_enable = false;

      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(cntl));
      tu6_write_lrz_reg(cmd, cs, cntl2);
   } else {
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(cntl));
   }
}

template <chip CHIP>
static void
tu6_disable_lrz_via_depth_view(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* Disable direction by writing an invalid depth view. */
   tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
      .base_layer = 0b11111111111,
      .layer_count = 0b11111111111,
      .base_mip_level = 0b1111,
   ));

   tu6_write_lrz_cntl<CHIP>(cmd, cs, {
      .enable = true,
      .disable_on_wrong_dir = true,
   });

   tu_emit_event_write<A6XX>(cmd, cs, FD_LRZ_CLEAR);
   tu_emit_event_write<A6XX>(cmd, cs, FD_LRZ_FLUSH);
}

static void
tu_lrz_init_state(struct tu_cmd_buffer *cmd,
                  const struct tu_render_pass_attachment *att,
                  const struct tu_image_view *view)
{
   if (!view->image->lrz_layout.lrz_total_size) {
      assert(!cmd->device->use_lrz || !vk_format_has_depth(att->format));
      return;
   }

   bool clears_depth = att->clear_mask &
      (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT);
   bool has_gpu_tracking =
      cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;

   if (!has_gpu_tracking && !clears_depth)
      return;

   /* We always need an LRZ view just to disable it if there is a depth
    * attachment, there are any secondaries, and GPU tracking is enabled, in
    * order not to rely on loadOp state, which doesn't exist with dynamic
    * rendering in secondaries. Otherwise the secondary would have LRZ
    * enabled with a NULL/garbage LRZ buffer.
    */
   cmd->state.lrz.image_view = view;

   if (!clears_depth && !att->load)
      return;

   cmd->state.lrz.valid = true;
   cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
   /* Be optimistic and unconditionally enable fast-clear in
    * secondary cmdbufs and when reusing previous LRZ state.
    */
   cmd->state.lrz.fast_clear =
      view->image->lrz_layout.lrz_fc_size > 0 && !TU_DEBUG(NOLRZFC);

   cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;
   cmd->state.lrz.reuse_previous_state = !clears_depth;
}

/* Note: if we enable LRZ here, then tu_lrz_init_state() must at least set
 * lrz.image_view, so that an LRZ buffer is present (even if LRZ is
 * dynamically disabled).
 */

static void
tu_lrz_init_secondary(struct tu_cmd_buffer *cmd,
                      const struct tu_render_pass_attachment *att)
{
   bool has_gpu_tracking =
      cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;

   if (!has_gpu_tracking)
      return;

   if (!cmd->device->use_lrz)
      return;

   if (!vk_format_has_depth(att->format))
      return;

   cmd->state.lrz.valid = true;
   cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
   cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;

   /* We may not have the depth attachment when executing in a secondary
    * inside a render pass. This means we have to be even more optimistic than
    * the normal case and enable fast clear even if the depth image doesn't
    * support it.
    */
   cmd->state.lrz.fast_clear = true;

   /* These are not used inside secondaries */
   cmd->state.lrz.image_view = NULL;
   cmd->state.lrz.reuse_previous_state = false;
}

template <chip CHIP>
bool
tu_lrzfc_depth_supported(float depth) {
   /* A7XX supports fast-clearing to any value, while A6XX only supports 0.0/1.0 */
   return CHIP >= A7XX || depth == 0.0f || depth == 1.0f;
}
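
/* For example (illustrative, not extra driver logic): clearing the depth
 * attachment to 0.5f keeps LRZ fast-clear enabled on A7XX, while on A6XX
 * tu_lrzfc_depth_supported() returns false, so lrz.fast_clear is dropped
 * and tu_lrz_tiling_begin() falls back to the blit-based tu6_clear_lrz()
 * path instead.
 */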

/* This is generally the same as tu_lrz_begin_renderpass(), but we skip
 * actually emitting anything. The LRZ state needs to be consistent between
 * renderpasses, but only the first should actually emit commands to disable
 * LRZ, etc.
 */
template <chip CHIP>
void
tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd)
{
    /* Track LRZ valid state */
   memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));

   uint32_t a;
   for (a = 0; a < cmd->state.pass->attachment_count; a++) {
      if (cmd->state.attachments[a]->image->lrz_layout.lrz_total_size)
         break;
   }

   if (a != cmd->state.pass->attachment_count) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      tu_lrz_init_state(cmd, att, cmd->state.attachments[a]);
      if (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
         VkClearValue clear = cmd->state.clear_values[a];
         cmd->state.lrz.depth_clear_value = clear;
         cmd->state.lrz.fast_clear = cmd->state.lrz.fast_clear &&
                                     tu_lrzfc_depth_supported<CHIP>(clear.depthStencil.depth);
      }
      cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
   }
}
TU_GENX(tu_lrz_begin_resumed_renderpass);

template <chip CHIP>
void
tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd)
{
   const struct tu_render_pass *pass = cmd->state.pass;

   cmd->state.rp.lrz_disable_reason = "";
   cmd->state.rp.lrz_disabled_at_draw = 0;

   int lrz_img_count = 0;
   for (unsigned i = 0; i < pass->attachment_count; i++) {
      if (cmd->state.attachments[i]->image->lrz_layout.lrz_total_size)
         lrz_img_count++;
   }

   if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking &&
       cmd->state.pass->subpass_count > 1 && lrz_img_count > 1) {
      /* Theoretically we could switch between LRZ buffers during the binning
       * and tiling passes, but it is untested and would add complexity for
       * a presumably extremely rare case.
       */
      tu_lrz_disable_reason(cmd, "Several subpasses with different depth attachments");

      for (unsigned i = 0; i < pass->attachment_count; i++) {
         struct tu_image *image = cmd->state.attachments[i]->image;
         tu_disable_lrz<CHIP>(cmd, &cmd->cs, image);
      }

      /* We need a valid LRZ fast-clear base, in case the render pass contents
       * are in secondaries that enable LRZ, so that they can read that LRZ is
       * dynamically disabled. It doesn't matter which we use, so just leave
       * the last one as emitted in tu_disable_lrz().
       */
      memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
      return;
   }

    /* Track LRZ valid state */
   tu_lrz_begin_resumed_renderpass<CHIP>(cmd);

   if (!cmd->state.lrz.valid || TU_DEBUG(NOLRZ)) {
      tu6_write_lrz_cntl<CHIP>(cmd, &cmd->cs, {});
      tu6_emit_lrz_buffer<CHIP>(&cmd->cs, NULL);
   }
}
TU_GENX(tu_lrz_begin_renderpass);

void
tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd)
{
   memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
   uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   if (a != VK_ATTACHMENT_UNUSED) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      tu_lrz_init_secondary(cmd, att);
   }
}

template <chip CHIP>
void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* TODO: If LRZ was never valid for the entire renderpass, we could exit
    * early here. Sometimes we know this ahead of time and null out
    * image_view, but with LOAD_OP_DONT_CARE this only happens if there were
    * no secondaries.
    */
   if (!cmd->state.lrz.image_view)
      return;

   struct tu_lrz_state *lrz = &cmd->state.lrz;

   tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);

   if (lrz->reuse_previous_state) {
      /* Reuse the previous LRZ state; the LRZ cache is assumed to have
       * already been invalidated by the previous renderpass.
       */
      assert(lrz->gpu_dir_tracking);

      tu6_write_lrz_reg(cmd, cs,
         A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
      return;
   }

   if (lrz->disable_for_rp) {
      /* We may deem it necessary to disable LRZ for the whole renderpass.
       * This is accomplished by making the later GRAS_LRZ_CNTL (in the
       * binning pass) fail the comparison of depth views.
       * TODO: Find out whether there are conditions where this is beneficial.
       */
      tu6_disable_lrz_via_depth_view<CHIP>(cmd, cs);
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
   } else if (lrz->fast_clear || lrz->gpu_dir_tracking) {
      if (lrz->gpu_dir_tracking) {
         tu6_write_lrz_reg(cmd, cs,
            A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
      }

      tu6_write_lrz_cntl<CHIP>(cmd, cs, {
         .enable = true,
         .fc_enable = lrz->fast_clear,
         .disable_on_wrong_dir = lrz->gpu_dir_tracking,
      });

      /* LRZ_CLEAR.fc_enable + LRZ_CLEAR - clears fast-clear buffer;
       * LRZ_CLEAR.disable_on_wrong_dir + LRZ_CLEAR - sets direction to
       *  CUR_DIR_UNSET.
       */
      if (CHIP >= A7XX)
         tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_CLEAR_DEPTH_F32(lrz->depth_clear_value.depthStencil.depth));
      tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_CLEAR);
   }

   if (!lrz->fast_clear && !lrz->disable_for_rp) {
      tu6_clear_lrz<CHIP>(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
      /* Even though we disable fast-clear we still have to dirty the
       * fast-clear buffer, because both secondary cmdbufs and following
       * renderpasses won't know that fast-clear is disabled.
       *
       * TODO: we could avoid this if we don't store depth and don't
       * expect secondary cmdbufs.
       */
      if (lrz->image_view->image->lrz_layout.lrz_fc_size > 0) {
         tu6_dirty_lrz_fc<CHIP>(cmd, cs, lrz->image_view->image);
      }
   }
}
TU_GENX(tu_lrz_tiling_begin);

/* We need to re-emit LRZ state before each tile due to skipsaverestore.
 */
template <chip CHIP>
void
tu_lrz_before_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   struct tu_lrz_state *lrz = &cmd->state.lrz;

   if (!lrz->image_view) {
      tu6_emit_lrz_buffer<CHIP>(cs, NULL);
   } else {
      tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);

      if (lrz->gpu_dir_tracking) {
         if (lrz->disable_for_rp) {
            /* Make sure we fail the comparison of depth views */
            tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
         } else {
            tu6_write_lrz_reg(cmd, cs,
               A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
         }
      }
   }
}
TU_GENX(tu_lrz_before_tile);

template <chip CHIP>
void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->state.lrz.fast_clear || cmd->state.lrz.gpu_dir_tracking) {
      tu6_emit_lrz_buffer<CHIP>(cs, cmd->state.lrz.image_view->image);

      if (cmd->state.lrz.gpu_dir_tracking) {
         tu6_write_lrz_reg(cmd, &cmd->cs,
            A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = cmd->state.lrz.image_view->view.GRAS_LRZ_DEPTH_VIEW));
      }

      /* Enable flushing of LRZ fast-clear and of direction buffer */
      tu6_write_lrz_cntl<CHIP>(cmd, cs, {
         .enable = true,
         .fc_enable = cmd->state.lrz.fast_clear,
         .disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking,
      });
   } else {
      tu6_write_lrz_cntl<CHIP>(cmd, cs, {.enable = false});
   }

   /* If we haven't disabled LRZ during the renderpass, we need to disable it
    * here so that the next renderpass does not use invalid LRZ values.
    */
   if (cmd->state.lrz.gpu_dir_tracking && !cmd->state.lrz.disable_for_rp &&
       !cmd->state.lrz.valid) {
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
         .base_layer = 0b11111111111,
         .layer_count = 0b11111111111,
         .base_mip_level = 0b1111,
      ));

      tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_CLEAR);
      tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_FLUSH);
   } else {
      tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_FLUSH);
   }
}
TU_GENX(tu_lrz_tiling_end);

template <chip CHIP>
void
tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->device->physical_device->info->a6xx.has_lrz_feedback) {
      tu_lrz_tiling_begin<CHIP>(cmd, cs);
      return;
   }

   if (!cmd->state.lrz.image_view)
      return;

   /* Actually, the LRZ buffer could be filled in sysmem, in theory to
    * be used in another renderpass, but the benefit is rather dubious.
    */

   struct tu_lrz_state *lrz = &cmd->state.lrz;

   if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) {
      tu_disable_lrz<CHIP>(cmd, cs, lrz->image_view->image);
      /* Make sure depth view comparison will fail. */
      tu6_write_lrz_reg(cmd, cs,
         A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
   } else {
      tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);
      /* Even though we disable LRZ writes in sysmem mode, the LRZ test
       * still happens, so LRZ should be cleared.
       */
      if (lrz->fast_clear) {
         tu6_write_lrz_cntl<CHIP>(cmd, &cmd->cs, {
            .enable = true,
            .fc_enable = true,
         });

         if (CHIP >= A7XX)
            tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_CLEAR_DEPTH_F32(lrz->depth_clear_value.depthStencil.depth));
         tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_CLEAR);
         tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_FLUSH);
      } else {
         tu6_clear_lrz<CHIP>(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
      }
   }
}
TU_GENX(tu_lrz_sysmem_begin);

template <chip CHIP>
void
tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->device->physical_device->info->a6xx.has_lrz_feedback) {
      tu_lrz_tiling_end<CHIP>(cmd, cs);
      return;
   }

   tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_FLUSH);
}
TU_GENX(tu_lrz_sysmem_end);

/* Disable LRZ outside of a renderpass. */
template <chip CHIP>
void
tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
               struct tu_image *image)
{
   if (!cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   if (!image->lrz_layout.lrz_total_size)
      return;

   tu6_emit_lrz_buffer<CHIP>(cs, image);
   tu6_disable_lrz_via_depth_view<CHIP>(cmd, cs);
}
TU_GENX(tu_disable_lrz);

/* Disable LRZ from the CPU, for host image copy */
template <chip CHIP>
void
tu_disable_lrz_cpu(struct tu_device *device, struct tu_image *image)
{
   if (!device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   if (!image->lrz_layout.lrz_total_size)
      return;

   const unsigned lrz_dir_offset = offsetof(fd_lrzfc_layout<CHIP>, dir_track);
   uint8_t *lrz_dir_tracking =
      (uint8_t *)image->map + image->lrz_layout.lrz_fc_offset + lrz_dir_offset;

   *lrz_dir_tracking = FD_LRZ_GPU_DIR_DISABLED;

   if (image->bo->cached_non_coherent) {
      tu_bo_sync_cache(
         device, image->bo,
         image->bo_offset + image->lrz_layout.lrz_offset + lrz_dir_offset, 1,
         TU_MEM_SYNC_CACHE_TO_GPU);
   }
}
TU_GENX(tu_disable_lrz_cpu);

/* Clear LRZ, used for out-of-renderpass depth clears. */
template <chip CHIP>
void
tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd,
                         struct tu_image *image,
                         const VkClearDepthStencilValue *pDepthStencil,
                         uint32_t rangeCount,
                         const VkImageSubresourceRange *pRanges)
{
   if (!rangeCount || !image->lrz_layout.lrz_total_size ||
       !cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   /* We cannot predict which depth subresource will be used later on,
    * so we just pick the first one with depth cleared and clear the LRZ.
    */
   const VkImageSubresourceRange *range = NULL;
   for (unsigned i = 0; i < rangeCount; i++) {
      if (pRanges[i].aspectMask &
            (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
         range = &pRanges[i];
         break;
      }
   }

   if (!range)
      return;

   bool fast_clear = image->lrz_layout.lrz_fc_size &&
                     tu_lrzfc_depth_supported<CHIP>(pDepthStencil->depth) &&
                     !TU_DEBUG(NOLRZFC);

   tu6_emit_lrz_buffer<CHIP>(&cmd->cs, image);

   tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
         .base_layer = range->baseArrayLayer,
         .layer_count = vk_image_subresource_layer_count(&image->vk, range),
         .base_mip_level = range->baseMipLevel,
   ));

   tu6_write_lrz_cntl<CHIP>(cmd, &cmd->cs, {
      .enable = true,
      .fc_enable = fast_clear,
      .disable_on_wrong_dir = true,
   });

   if (CHIP >= A7XX)
      tu_cs_emit_regs(&cmd->cs, A7XX_GRAS_LRZ_CLEAR_DEPTH_F32(pDepthStencil->depth));
   tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_CLEAR);
   tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_FLUSH);

   if (!fast_clear) {
      tu6_clear_lrz<CHIP>(cmd, &cmd->cs, image, (const VkClearValue*) pDepthStencil);
   }
}
TU_GENX(tu_lrz_clear_depth_image);

template <chip CHIP>
void
tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd,
                                 const char *reason)
{
   assert(cmd->state.pass);

   tu_lrz_disable_reason(cmd, reason);

   cmd->state.lrz.valid = false;
   cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
}
TU_GENX(tu_lrz_disable_during_renderpass);

template <chip CHIP>
void
tu_lrz_flush_valid_during_renderpass(struct tu_cmd_buffer *cmd,
                                     struct tu_cs *cs)
{
   if (cmd->state.lrz.valid || cmd->state.lrz.disable_for_rp)
      return;

   tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
      .base_layer = 0b11111111111,
      .layer_count = 0b11111111111,
      .base_mip_level = 0b1111,
   ));
}
TU_GENX(tu_lrz_flush_valid_during_renderpass);

/* Update LRZ state based on the stencil-test func:
 *
 * Conceptually the order of the pipeline is:
 *
 *
 *   FS -> Alpha-Test  ->  Stencil-Test  ->  Depth-Test
 *                              |                |
 *                       if wrmask != 0     if wrmask != 0
 *                              |                |
 *                              v                v
 *                        Stencil-Write      Depth-Write
 *
 * Because Stencil-Test can have side effects (Stencil-Write) prior
 * to the depth test, in this case we potentially need to disable the
 * early lrz-test. See:
 *
 * https://www.khronos.org/opengl/wiki/Per-Sample_Processing
 */
static bool
tu6_stencil_op_lrz_allowed(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
                           VkCompareOp func,
                           bool stencil_write)
{
   switch (func) {
   case VK_COMPARE_OP_ALWAYS:
      /* Nothing to do for LRZ, but for the stencil test, when stencil
       * write is enabled, we need to disable lrz-test, since
       * conceptually the stencil test and write happen before the depth test.
       */
      if (stencil_write) {
         return false;
      }
      break;
   case VK_COMPARE_OP_NEVER:
      /* The fragment never passes; disable lrz_write for this draw. */
      gras_lrz_cntl->lrz_write = false;
      break;
   default:
      /* Whether the fragment passes or not depends on the result
       * of the stencil test, which we cannot know when doing the
       * binning pass.
       */
      gras_lrz_cntl->lrz_write = false;
      /* Similarly to the VK_COMPARE_OP_ALWAYS case, if there are side
       * effects from the stencil test we need to disable lrz-test.
       */
      if (stencil_write) {
         return false;
      }
      break;
   }

   return true;
}
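
/* Illustrative example (not extra driver logic): a stencil state with
 * compareOp = VK_COMPARE_OP_ALWAYS, a non-zero writeMask and e.g.
 * passOp = VK_STENCIL_OP_INCREMENT_AND_WRAP writes stencil regardless of the
 * later depth result, so tu6_stencil_op_lrz_allowed() returns false. The LRZ
 * test is then skipped for the draw, or LRZ is invalidated entirely if depth
 * writes are also enabled (see the "Stencil write" case below).
 */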

template <chip CHIP>
static struct A6XX_GRAS_LRZ_CNTL
tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
                        const uint32_t a)
{
   const struct tu_shader *fs = cmd->state.shaders[MESA_SHADER_FRAGMENT];
   bool z_test_enable = cmd->vk.dynamic_graphics_state.ds.depth.test_enable;
   bool z_write_enable = cmd->vk.dynamic_graphics_state.ds.depth.write_enable;
   bool z_bounds_enable = cmd->vk.dynamic_graphics_state.ds.depth.bounds_test.enable;
   VkCompareOp depth_compare_op =
      cmd->vk.dynamic_graphics_state.ds.depth.compare_op;

   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };

   if (!cmd->state.lrz.valid) {
      return gras_lrz_cntl;
   }

   /* If the depth test is disabled we shouldn't touch LRZ.
    * The same applies if there is no depth attachment.
    */
   if (a == VK_ATTACHMENT_UNUSED || !z_test_enable || !cmd->device->use_lrz)
      return gras_lrz_cntl;

   if (!cmd->state.lrz.gpu_dir_tracking && !cmd->state.attachments) {
      /* Without on-GPU LRZ direction tracking there is nothing we
       * can do to enable LRZ in secondary command buffers.
       */
      return gras_lrz_cntl;
   }

   /* See comment in tu_pipeline about disabling LRZ write for blending. */
   bool reads_dest = cmd->state.blend_reads_dest;

   gras_lrz_cntl.enable = true;
   gras_lrz_cntl.lrz_write =
      z_write_enable &&
      !reads_dest &&
      !(fs->fs.lrz.status & TU_LRZ_FORCE_DISABLE_WRITE);
   gras_lrz_cntl.z_test_enable = z_write_enable;
   gras_lrz_cntl.z_bounds_enable = z_bounds_enable;
   gras_lrz_cntl.fc_enable = cmd->state.lrz.fast_clear;
   gras_lrz_cntl.dir_write = cmd->state.lrz.gpu_dir_tracking;
   gras_lrz_cntl.disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking;

   if (CHIP >= A7XX)
      gras_lrz_cntl.z_func = tu6_compare_func(depth_compare_op);

   /* Once disabled, LRZ stays disabled until the depth buffer is cleared,
    * which means that one "wrong" depth test or shader could disable LRZ
    * for everything that follows.
    */
   bool disable_lrz = false;
   bool temporary_disable_lrz = false;

   /* What happens in the FS could affect LRZ, e.g. writes to gl_FragDepth or
    * early fragment tests.  We have to skip LRZ testing and updating, but as
    * long as the depth direction stays the same we can continue with LRZ
    * testing later.
    */
   if (fs->fs.lrz.status & TU_LRZ_FORCE_DISABLE_LRZ) {
      if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN || !cmd->state.lrz.gpu_dir_tracking) {
         perf_debug(cmd->device, "Skipping LRZ due to FS");
         temporary_disable_lrz = true;
      } else {
         tu_lrz_disable_reason(cmd, "FS writes depth or has side-effects (TODO: fix for gpu-direction-tracking case)");
         disable_lrz = true;
      }
   }

   /* If Z is not written, it doesn't affect the LRZ buffer state.
    * This means two things:
    * - Don't lock the direction until Z is written for the first time;
    * - If Z isn't written and the direction IS locked, it's possible to just
    *   temporarily disable LRZ instead of fully bailing out when the
    *   direction changes.
    */

   enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
   switch (depth_compare_op) {
   case VK_COMPARE_OP_ALWAYS:
   case VK_COMPARE_OP_NOT_EQUAL:
      /* OP_ALWAYS and OP_NOT_EQUAL could produce depth values in any
       * direction, so if there is a depth write, LRZ must be disabled.
       */
      if (z_write_enable) {
         tu_lrz_disable_reason(cmd, "Depth write + ALWAYS/NOT_EQUAL");
         disable_lrz = true;
         gras_lrz_cntl.dir = LRZ_DIR_INVALID;
      } else {
         perf_debug(cmd->device, "Skipping LRZ due to ALWAYS/NOT_EQUAL");
         temporary_disable_lrz = true;
      }
      break;
   case VK_COMPARE_OP_EQUAL:
   case VK_COMPARE_OP_NEVER:
      /* The blob disables LRZ for OP_EQUAL, and from our empirical
       * evidence it is the right thing to do.
       *
       * Both OP_EQUAL and OP_NEVER don't change the LRZ buffer, so
       * we can just temporarily disable LRZ.
       */
      temporary_disable_lrz = true;
      break;
   case VK_COMPARE_OP_GREATER:
   case VK_COMPARE_OP_GREATER_OR_EQUAL:
      lrz_direction = TU_LRZ_GREATER;
      gras_lrz_cntl.greater = true;
      gras_lrz_cntl.dir = LRZ_DIR_GE;
      break;
   case VK_COMPARE_OP_LESS:
   case VK_COMPARE_OP_LESS_OR_EQUAL:
      lrz_direction = TU_LRZ_LESS;
      gras_lrz_cntl.greater = false;
      gras_lrz_cntl.dir = LRZ_DIR_LE;
      break;
   default:
      unreachable("bad VK_COMPARE_OP value or uninitialized");
      break;
   };

   /* If the depthfunc direction is changed, bail out on using LRZ. The
    * LRZ buffer encodes a min/max depth value per block, but if
    * we switch from GT/GE <-> LT/LE, those values cannot be
    * interpreted properly.
    */
   if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
       lrz_direction != TU_LRZ_UNKNOWN &&
       cmd->state.lrz.prev_direction != lrz_direction) {
      if (z_write_enable) {
         tu_lrz_disable_reason(cmd, "Depth write + compare-op direction change");
         disable_lrz = true;
      } else {
         perf_debug(cmd->device, "Skipping LRZ due to direction change");
         temporary_disable_lrz = true;
      }
   }

   /* Consider the following sequence of depthfunc changes:
    *
    * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_GREATER
    * LRZ is disabled during COMPARE_OP_EQUAL but could be enabled
    * during the second COMPARE_OP_GREATER.
    *
    * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_LESS
    * Here, LRZ is disabled during COMPARE_OP_EQUAL and should become
    * invalid during COMPARE_OP_LESS.
    *
    * This shows that we should keep the last KNOWN direction.
    */
   if (z_write_enable && lrz_direction != TU_LRZ_UNKNOWN)
      cmd->state.lrz.prev_direction = lrz_direction;

   /* Invalidate LRZ and disable write if the stencil test is enabled */
   bool stencil_test_enable = cmd->vk.dynamic_graphics_state.ds.stencil.test_enable;
   if (!disable_lrz && stencil_test_enable) {
      VkCompareOp stencil_front_compare_op = (VkCompareOp)
         cmd->vk.dynamic_graphics_state.ds.stencil.front.op.compare;

      VkCompareOp stencil_back_compare_op = (VkCompareOp)
         cmd->vk.dynamic_graphics_state.ds.stencil.back.op.compare;

      bool lrz_allowed = true;
      lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
                                      &gras_lrz_cntl, stencil_front_compare_op,
                                      cmd->state.stencil_front_write);

      lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
                                      &gras_lrz_cntl, stencil_back_compare_op,
                                      cmd->state.stencil_back_write);

      /* Without a depth write we only need to make sure that the depth test
       * is executed after the stencil test, so temporarily disabling LRZ
       * is enough.
       */
      if (!lrz_allowed) {
         if (z_write_enable) {
            tu_lrz_disable_reason(cmd, "Stencil write");
            disable_lrz = true;
         } else {
            perf_debug(cmd->device, "Skipping LRZ due to stencil write");
            temporary_disable_lrz = true;
         }
      }
   }

   /* Writing depth with blend enabled means we need to invalidate LRZ,
    * because the written depth value could mean that a later draw with
    * depth enabled (where we would otherwise write LRZ) could have
    * fragments which don't pass the depth test due to this draw.  For
    * example, consider this sequence of draws, with depth mode GREATER:
    *
    *   draw A:
    *     z=0.1, fragments pass
    *   draw B:
    *     z=0.4, fragments pass
    *     blend enabled (LRZ write disabled)
    *     depth write enabled
    *   draw C:
    *     z=0.2, fragments don't pass
    *     blend disabled
    *     depth write enabled
    *
    * Normally looking at the state in draw C, we'd assume we could
    * enable LRZ write.  But this would cause early-z/lrz to discard
    * fragments from draw A which should be visible due to draw B.
    */
   if (reads_dest && z_write_enable && cmd->device->instance->conservative_lrz) {
      tu_lrz_disable_reason(cmd, "Depth write + blending");
      disable_lrz = true;
   }

   if (disable_lrz)
      cmd->state.lrz.valid = false;

   if (temporary_disable_lrz || disable_lrz)
      gras_lrz_cntl.enable = false;

   cmd->state.lrz.enabled = cmd->state.lrz.valid && gras_lrz_cntl.enable;
   if (!cmd->state.lrz.enabled)
      memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));

   return gras_lrz_cntl;
}

template <chip CHIP>
void
tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state<CHIP>(cmd, a);

   tu6_write_lrz_cntl<CHIP>(cmd, cs, gras_lrz_cntl);
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));
}
TU_GENX(tu6_emit_lrz);