/*
 * Copyright © 2022 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#include "tu_lrz.h"

#include "tu_clear_blit.h"
#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_image.h"

#include "common/freedreno_gpu_event.h"
#include "common/freedreno_lrz.h"

/* See lrz.rst for how the HW works. Here are only the implementation notes.
 *
 * There are a number of situations where LRZ cannot be used:
 * - Fragment shader side-effects (writing to SSBOs, atomic operations, etc);
 * - Writing to the stencil buffer;
 * - Writing depth while:
 *   - Changing the direction of the depth test (e.g. from OP_GREATER to OP_LESS);
 *   - Using OP_ALWAYS or OP_NOT_EQUAL;
 * - Clearing depth with vkCmdClearAttachments;
 * - (pre-a650) Not clearing the depth attachment with LOAD_OP_CLEAR;
 * - (pre-a650) Using secondary command buffers;
 * - Sysmem rendering (with a small caveat).
 *
 * A650+ (gen3+)
 * =============
 *
 * While LRZ could be reused between renderpasses, it is disabled when the
 * underlying depth buffer is changed.
 * The following commands could change a depth image:
 * - vkCmdBlitImage*
 * - vkCmdCopyBufferToImage*
 * - vkCmdCopyImage*
 *
 * LRZ Fast-Clear
 * ==============
 *
 * It's always valid to fast-clear. On the other hand, we disable
 * fast-clear if the depth clear value is not 0.0 or 1.0, because it may hurt
 * performance if some primitives are expected to fail the depth test against
 * the actual depth clear value.
 *
 * LRZ Caches
 * ==========
 *
 * ! The policy here is to flush the LRZ cache right after it is changed,
 * so if LRZ data is needed afterwards there is no need to flush it
 * before using LRZ.
 */
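
/* A rough sketch of how these helpers fit together for a GMEM renderpass
 * (illustrative only; the real call sites live in the command-buffer code
 * and may differ):
 *
 *   tu_lrz_begin_renderpass()     - CPU-side LRZ state setup
 *   tu_lrz_tiling_begin()         - emit LRZ buffer, (fast-)clear/dirty it
 *   for each tile:
 *      tu_lrz_before_tile()       - re-emit buffer/view (skipsaverestore)
 *      tu6_emit_lrz() per draw    - emit GRAS_LRZ_CNTL for the draw state
 *   tu_lrz_tiling_end()           - flush LRZ caches
 *
 * The sysmem path uses tu_lrz_sysmem_begin()/tu_lrz_sysmem_end() instead.
 */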

static inline void
tu_lrz_disable_reason(struct tu_cmd_buffer *cmd, const char *reason) {
   cmd->state.rp.lrz_disable_reason = reason;
   perf_debug(cmd->device, "Disabling LRZ because '%s'", reason);
}

template <chip CHIP>
static void
tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
{
   if (!depth_image) {
      tu_cs_emit_regs(cs,
                      A6XX_GRAS_LRZ_BUFFER_BASE(0),
                      A6XX_GRAS_LRZ_BUFFER_PITCH(0),
                      A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));

      if (CHIP >= A7XX)
         tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO());

      return;
   }

   uint64_t lrz_iova = depth_image->iova + depth_image->lrz_offset;
   uint64_t lrz_fc_iova = depth_image->iova + depth_image->lrz_fc_offset;
   if (!depth_image->lrz_fc_offset)
      lrz_fc_iova = 0;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_LRZ_BUFFER_BASE(.qword = lrz_iova),
                   A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = depth_image->lrz_pitch),
                   A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(.qword = lrz_fc_iova));

   if (CHIP >= A7XX) {
      tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO(
         .depth_format = tu6_pipe2depth(depth_image->vk.format)
      ));
   }
}

static void
tu6_write_lrz_reg(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                  struct tu_reg_value reg)
{
   if (cmd->device->physical_device->info->a6xx.lrz_track_quirk) {
      tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
      tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_LRZ));
      tu_cs_emit(cs, reg.reg);
      tu_cs_emit(cs, reg.value);
   } else {
      tu_cs_emit_pkt4(cs, reg.reg, 1);
      tu_cs_emit(cs, reg.value);
   }
}

template <chip CHIP>
static void
tu6_write_lrz_cntl(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                   struct A6XX_GRAS_LRZ_CNTL cntl)
{
   if (CHIP >= A7XX) {
      // A7XX splits LRZ_CNTL into two separate registers.
      struct tu_reg_value cntl2 = A7XX_GRAS_LRZ_CNTL2(
         .disable_on_wrong_dir = cntl.disable_on_wrong_dir,
         .fc_enable = cntl.fc_enable,
      );
      cntl.disable_on_wrong_dir = false;
      cntl.fc_enable = false;

      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(cntl));
      tu6_write_lrz_reg(cmd, cs, cntl2);
   } else {
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(cntl));
   }
}

template <chip CHIP>
static void
tu6_disable_lrz_via_depth_view(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* Disable direction by writing invalid depth view. */
   tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
      .base_layer = 0b11111111111,
      .layer_count = 0b11111111111,
      .base_mip_level = 0b1111,
   ));

   tu6_write_lrz_cntl<CHIP>(cmd, cs, {
      .enable = true,
      .disable_on_wrong_dir = true,
   });

   tu_emit_event_write<A6XX>(cmd, cs, FD_LRZ_CLEAR);
   tu_emit_event_write<A6XX>(cmd, cs, FD_LRZ_FLUSH);
}

static void
tu_lrz_init_state(struct tu_cmd_buffer *cmd,
                  const struct tu_render_pass_attachment *att,
                  const struct tu_image_view *view)
{
   if (!view->image->lrz_height) {
      assert(!cmd->device->use_lrz || !vk_format_has_depth(att->format));
      return;
   }

   bool clears_depth = att->clear_mask &
      (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT);
   bool has_gpu_tracking =
      cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;

   if (!has_gpu_tracking && !clears_depth)
      return;

   /* We need to always have an LRZ view just to disable it if there is a
    * depth attachment, there are any secondaries, and GPU tracking is
    * enabled, in order not to rely on loadOp state which doesn't exist with
    * dynamic rendering in secondaries. Otherwise the secondary will have LRZ
    * enabled and there will be a NULL/garbage LRZ buffer.
    */
   cmd->state.lrz.image_view = view;

   if (!clears_depth && !att->load)
      return;

   cmd->state.lrz.valid = true;
   cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
   /* Be optimistic and unconditionally enable fast-clear in
    * secondary cmdbufs and when reusing previous LRZ state.
    */
   cmd->state.lrz.fast_clear = view->image->has_lrz_fc;

   cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;
   cmd->state.lrz.reuse_previous_state = !clears_depth;
}

/* Note: if we enable LRZ here, then tu_lrz_init_state() must at least set
 * lrz.image_view, so that an LRZ buffer is present (even if LRZ is
 * dynamically disabled).
 */

static void
tu_lrz_init_secondary(struct tu_cmd_buffer *cmd,
                      const struct tu_render_pass_attachment *att)
{
   bool has_gpu_tracking =
      cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;

   if (!has_gpu_tracking)
      return;

   if (!cmd->device->use_lrz)
      return;

   if (!vk_format_has_depth(att->format))
      return;

   cmd->state.lrz.valid = true;
   cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
   cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;

   /* We may not have the depth attachment when executing in a secondary
    * inside a render pass. This means we have to be even more optimistic than
    * the normal case and enable fast clear even if the depth image doesn't
    * support it.
    */
   cmd->state.lrz.fast_clear = true;

   /* These are not used inside secondaries */
   cmd->state.lrz.image_view = NULL;
   cmd->state.lrz.reuse_previous_state = false;
}

template <chip CHIP>
bool
tu_lrzfc_depth_supported(float depth) {
   /* A7XX supports fast-clearing to any value, while A6XX only supports 0.0/1.0 */
   return CHIP >= A7XX || depth == 0.0f || depth == 1.0f;
}
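
/* For example (illustrative, not from the original source): on A6XX a depth
 * clear to 0.5 makes this return false, so the callers below fall back to
 * clearing LRZ with tu6_clear_lrz() instead of using the fast-clear path,
 * while clears to 0.0 or 1.0 keep fc_enable set.
 */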

/* This is generally the same as tu_lrz_begin_renderpass(), but we skip
 * actually emitting anything. The lrz state needs to be consistent between
 * renderpasses, but only the first should actually emit commands to disable
 * lrz etc.
 */
template <chip CHIP>
void
tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd)
{
    /* Track LRZ valid state */
   memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));

   uint32_t a;
   for (a = 0; a < cmd->state.pass->attachment_count; a++) {
      if (cmd->state.attachments[a]->image->lrz_height)
         break;
   }

   if (a != cmd->state.pass->attachment_count) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      tu_lrz_init_state(cmd, att, cmd->state.attachments[a]);
      if (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
         VkClearValue clear = cmd->state.clear_values[a];
         cmd->state.lrz.depth_clear_value = clear;
         cmd->state.lrz.fast_clear = cmd->state.lrz.fast_clear &&
                                     tu_lrzfc_depth_supported<CHIP>(clear.depthStencil.depth);
      }
      cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
   }
}
TU_GENX(tu_lrz_begin_resumed_renderpass);

template <chip CHIP>
void
tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd)
{
   const struct tu_render_pass *pass = cmd->state.pass;

   cmd->state.rp.lrz_disable_reason = "";

   int lrz_img_count = 0;
   for (unsigned i = 0; i < pass->attachment_count; i++) {
      if (cmd->state.attachments[i]->image->lrz_height)
         lrz_img_count++;
   }

   if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking &&
       cmd->state.pass->subpass_count > 1 && lrz_img_count > 1) {
      /* Theoretically we could switch between LRZ buffers during the binning
       * and tiling passes, but it is untested and would add complexity for a
       * presumably extremely rare case.
       */
      tu_lrz_disable_reason(cmd, "Several subpasses with different depth attachments");

      for (unsigned i = 0; i < pass->attachment_count; i++) {
         struct tu_image *image = cmd->state.attachments[i]->image;
         tu_disable_lrz<CHIP>(cmd, &cmd->cs, image);
      }

      /* We need a valid LRZ fast-clear base, in case the render pass contents
       * are in secondaries that enable LRZ, so that they can read that LRZ is
       * dynamically disabled. It doesn't matter which we use, so just leave
       * the last one as emitted in tu_disable_lrz().
       */
      memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
      return;
   }

    /* Track LRZ valid state */
   tu_lrz_begin_resumed_renderpass<CHIP>(cmd);

   if (!cmd->state.lrz.valid) {
      tu6_emit_lrz_buffer<CHIP>(&cmd->cs, NULL);
   }
}
TU_GENX(tu_lrz_begin_renderpass);

void
tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd)
{
   memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
   uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   if (a != VK_ATTACHMENT_UNUSED) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      tu_lrz_init_secondary(cmd, att);
   }
}

template <chip CHIP>
void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* TODO: If lrz was never valid for the entire renderpass, we could exit
    * early here. Sometimes we know this ahead of time and null out
    * image_view, but with LOAD_OP_DONT_CARE this only happens if there were
    * no secondaries.
    */
   if (!cmd->state.lrz.image_view)
      return;

   struct tu_lrz_state *lrz = &cmd->state.lrz;

   tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);

   if (lrz->reuse_previous_state) {
      /* Reuse previous LRZ state, LRZ cache is assumed to be
       * already invalidated by previous renderpass.
       */
      assert(lrz->gpu_dir_tracking);

      tu6_write_lrz_reg(cmd, cs,
         A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
      return;
   }

   bool invalidate_lrz = !lrz->valid && lrz->gpu_dir_tracking;
   if (invalidate_lrz) {
      /* Following the blob we elect to disable LRZ for the whole renderpass
       * if it is known that LRZ is disabled somewhere in the renderpass.
       *
       * This is accomplished by making the later GRAS_LRZ_CNTL (in the
       * binning pass) fail the comparison of depth views.
       */
      tu6_disable_lrz_via_depth_view<CHIP>(cmd, cs);
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
   } else if (lrz->fast_clear || lrz->gpu_dir_tracking) {
      if (lrz->gpu_dir_tracking) {
         tu6_write_lrz_reg(cmd, cs,
            A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
      }

      tu6_write_lrz_cntl<CHIP>(cmd, cs, {
         .enable = true,
         .fc_enable = lrz->fast_clear,
         .disable_on_wrong_dir = lrz->gpu_dir_tracking,
      });

      /* GRAS_LRZ_CNTL.fc_enable + LRZ_CLEAR - clears the fast-clear buffer;
       * GRAS_LRZ_CNTL.disable_on_wrong_dir + LRZ_CLEAR - sets direction to
       *  CUR_DIR_UNSET.
       */
      if (CHIP >= A7XX)
         tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_CLEAR_DEPTH_F32(lrz->depth_clear_value.depthStencil.depth));
      tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_CLEAR);
   }

   if (!lrz->fast_clear && !invalidate_lrz) {
      tu6_clear_lrz<CHIP>(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
      /* Even though we disable fast-clear we still have to dirty
       * fast-clear buffer because both secondary cmdbufs and following
       * renderpasses won't know that fast-clear is disabled.
       *
       * TODO: we could avoid this if we don't store depth and don't
       * expect secondary cmdbufs.
       */
      if (lrz->image_view->image->has_lrz_fc) {
         tu6_dirty_lrz_fc<CHIP>(cmd, cs, lrz->image_view->image);
      }
   }
}
TU_GENX(tu_lrz_tiling_begin);

/* We need to re-emit LRZ state before each tile due to skipsaverestore.
 */
template <chip CHIP>
void
tu_lrz_before_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   struct tu_lrz_state *lrz = &cmd->state.lrz;

   if (!lrz->image_view) {
      tu6_emit_lrz_buffer<CHIP>(cs, NULL);
   } else {
      tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);

      if (lrz->gpu_dir_tracking) {
         if (!lrz->valid) {
            /* Make sure we fail the comparison of depth views */
            tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
         } else {
            tu6_write_lrz_reg(cmd, cs,
               A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
         }
      }
   }
}
TU_GENX(tu_lrz_before_tile);

template <chip CHIP>
void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->state.lrz.fast_clear || cmd->state.lrz.gpu_dir_tracking) {
      tu6_emit_lrz_buffer<CHIP>(cs, cmd->state.lrz.image_view->image);

      if (cmd->state.lrz.gpu_dir_tracking) {
         tu6_write_lrz_reg(cmd, &cmd->cs,
            A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = cmd->state.lrz.image_view->view.GRAS_LRZ_DEPTH_VIEW));
      }

      /* Enable flushing of LRZ fast-clear and of direction buffer */
      tu6_write_lrz_cntl<CHIP>(cmd, cs, {
         .enable = true,
         .fc_enable = cmd->state.lrz.fast_clear,
         .disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking,
      });
   } else {
      tu6_write_lrz_cntl<CHIP>(cmd, cs, {.enable = false});
   }

   tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_FLUSH);

   /* If gpu_dir_tracking is enabled and lrz is not valid, the blob at this
    * point additionally clears the direction buffer:
    *  GRAS_LRZ_DEPTH_VIEW(.dword = 0)
    *  GRAS_LRZ_DEPTH_VIEW(.dword = 0xffffffff)
    *  A6XX_GRAS_LRZ_CNTL(.enable = true, .disable_on_wrong_dir = true)
    *  LRZ_CLEAR
    *  LRZ_FLUSH
    * Since this happens after all of the rendering is done, there is no known
    * reason to do such a clear.
    */
}
TU_GENX(tu_lrz_tiling_end);

template <chip CHIP>
void
tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->device->physical_device->info->a6xx.has_lrz_feedback) {
      tu_lrz_tiling_begin<CHIP>(cmd, cs);
      return;
   }

   if (!cmd->state.lrz.image_view)
      return;

   /* Actually, the LRZ buffer could be filled in sysmem, in theory to
    * be used in another renderpass, but the benefit is rather dubious.
    */

   struct tu_lrz_state *lrz = &cmd->state.lrz;

   if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) {
      tu_disable_lrz<CHIP>(cmd, cs, lrz->image_view->image);
      /* Make sure depth view comparison will fail. */
      tu6_write_lrz_reg(cmd, cs,
         A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
   } else {
      tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);
      /* Even though we disable LRZ writes in sysmem mode, there is still an
       * LRZ test, so LRZ should be cleared.
       */
      if (lrz->fast_clear) {
         tu6_write_lrz_cntl<CHIP>(cmd, &cmd->cs, {
            .enable = true,
            .fc_enable = true,
         });

         if (CHIP >= A7XX)
            tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_CLEAR_DEPTH_F32(lrz->depth_clear_value.depthStencil.depth));
         tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_CLEAR);
         tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_FLUSH);
      } else {
         tu6_clear_lrz<CHIP>(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
      }
   }
}
TU_GENX(tu_lrz_sysmem_begin);

template <chip CHIP>
void
tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->device->physical_device->info->a6xx.has_lrz_feedback) {
      tu_lrz_tiling_end<CHIP>(cmd, cs);
      return;
   }

   tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_FLUSH);
}
TU_GENX(tu_lrz_sysmem_end);

/* Disable LRZ outside of renderpass. */
template <chip CHIP>
void
tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
               struct tu_image *image)
{
   if (!cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   if (!image->lrz_height)
      return;

   tu6_emit_lrz_buffer<CHIP>(cs, image);
   tu6_disable_lrz_via_depth_view<CHIP>(cmd, cs);
}
TU_GENX(tu_disable_lrz);

/* Disable LRZ from the CPU, for host image copy */
template <chip CHIP>
void
tu_disable_lrz_cpu(struct tu_device *device, struct tu_image *image)
{
   if (!device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   if (!image->lrz_height)
      return;

   const unsigned lrz_dir_offset = offsetof(fd_lrzfc_layout<CHIP>, dir_track);
   uint8_t *lrz_dir_tracking =
      (uint8_t *)image->map + image->lrz_fc_offset + lrz_dir_offset;

   *lrz_dir_tracking = FD_LRZ_GPU_DIR_DISABLED;

   if (image->bo->cached_non_coherent) {
      /* Flush the CPU write of the direction byte (which lives in the
       * fast-clear buffer) so that the GPU sees it.
       */
      tu_bo_sync_cache(device, image->bo,
                       image->bo_offset + image->lrz_fc_offset + lrz_dir_offset,
                       1, TU_MEM_SYNC_CACHE_TO_GPU);
   }
}
TU_GENX(tu_disable_lrz_cpu);

/* Clear LRZ, used for out of renderpass depth clears. */
template <chip CHIP>
void
tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd,
                         struct tu_image *image,
                         const VkClearDepthStencilValue *pDepthStencil,
                         uint32_t rangeCount,
                         const VkImageSubresourceRange *pRanges)
{
   if (!rangeCount || !image->lrz_height ||
       !cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   /* We cannot predict which depth subresource would be used later on,
    * so we just pick the first one with depth cleared and clear the LRZ.
    */
   const VkImageSubresourceRange *range = NULL;
   for (unsigned i = 0; i < rangeCount; i++) {
      if (pRanges[i].aspectMask &
            (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
         range = &pRanges[i];
         break;
      }
   }

   if (!range)
      return;

   bool fast_clear = image->has_lrz_fc &&
                     tu_lrzfc_depth_supported<CHIP>(pDepthStencil->depth);

   tu6_emit_lrz_buffer<CHIP>(&cmd->cs, image);

   tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
         .base_layer = range->baseArrayLayer,
         .layer_count = vk_image_subresource_layer_count(&image->vk, range),
         .base_mip_level = range->baseMipLevel,
   ));

   tu6_write_lrz_cntl<CHIP>(cmd, &cmd->cs, {
      .enable = true,
      .fc_enable = fast_clear,
      .disable_on_wrong_dir = true,
   });

   if (CHIP >= A7XX)
      tu_cs_emit_regs(&cmd->cs, A7XX_GRAS_LRZ_CLEAR_DEPTH_F32(pDepthStencil->depth));
   tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_CLEAR);
   tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_FLUSH);

   if (!fast_clear) {
      tu6_clear_lrz<CHIP>(cmd, &cmd->cs, image, (const VkClearValue*) pDepthStencil);
   }
}
TU_GENX(tu_lrz_clear_depth_image);

template <chip CHIP>
void
tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd)
{
   assert(cmd->state.pass);

   cmd->state.lrz.valid = false;
   cmd->state.dirty |= TU_CMD_DIRTY_LRZ;

   if (cmd->state.lrz.gpu_dir_tracking) {
      tu6_write_lrz_cntl<CHIP>(cmd, &cmd->cs, {
         .enable = true,
         .dir = LRZ_DIR_INVALID,
         .disable_on_wrong_dir = true,
      });
   }
}
TU_GENX(tu_lrz_disable_during_renderpass);

/* Update LRZ state based on the stencil-test func:
 *
 * Conceptually the order of the pipeline is:
 *
 *
 *   FS -> Alpha-Test  ->  Stencil-Test  ->  Depth-Test
 *                              |                |
 *                       if wrmask != 0     if wrmask != 0
 *                              |                |
 *                              v                v
 *                        Stencil-Write      Depth-Write
 *
 * Because Stencil-Test can have side effects (Stencil-Write) prior
 * to Depth-Test, in this case we potentially need to disable the early
 * lrz-test. See:
 *
 * https://www.khronos.org/opengl/wiki/Per-Sample_Processing
 */
static bool
tu6_stencil_op_lrz_allowed(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
                           VkCompareOp func,
                           bool stencil_write)
{
   switch (func) {
   case VK_COMPARE_OP_ALWAYS:
      /* Nothing to do for LRZ, but when stencil write is enabled we need
       * to disable lrz-test, since conceptually stencil test and write
       * happen before depth-test.
       */
      if (stencil_write) {
         return false;
      }
      break;
   case VK_COMPARE_OP_NEVER:
      /* The fragment never passes, disable lrz_write for this draw. */
      gras_lrz_cntl->lrz_write = false;
      break;
   default:
      /* Whether the fragment passes or not depends on the result of the
       * stencil test, which we cannot know when doing the binning pass.
       */
      gras_lrz_cntl->lrz_write = false;
      /* Similarly to the VK_COMPARE_OP_ALWAYS case, if there are side
       * effects from the stencil test we need to disable lrz-test.
       */
      if (stencil_write) {
         return false;
      }
      break;
   }

   return true;
}
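
/* An illustrative summary of the cases above (a reading aid, not part of the
 * original code):
 *  - stencil writes enabled and func != NEVER -> LRZ test must be skipped
 *    for this draw (the helper returns false);
 *  - func == NEVER -> LRZ stays usable, but lrz_write is turned off;
 *  - any func other than ALWAYS/NEVER without stencil writes -> lrz_write is
 *    turned off, the LRZ test stays enabled;
 *  - ALWAYS without stencil writes -> no change.
 */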

template <chip CHIP>
static struct A6XX_GRAS_LRZ_CNTL
tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
                        const uint32_t a)
{
   const struct tu_shader *fs = cmd->state.shaders[MESA_SHADER_FRAGMENT];
   bool z_test_enable = cmd->vk.dynamic_graphics_state.ds.depth.test_enable;
   bool z_write_enable = cmd->vk.dynamic_graphics_state.ds.depth.write_enable;
   bool z_bounds_enable = cmd->vk.dynamic_graphics_state.ds.depth.bounds_test.enable;
   VkCompareOp depth_compare_op =
      cmd->vk.dynamic_graphics_state.ds.depth.compare_op;

   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };

   if (!cmd->state.lrz.valid) {
      return gras_lrz_cntl;
   }

   /* If depth test is disabled we shouldn't touch LRZ.
    * Same if there is no depth attachment.
    */
   if (a == VK_ATTACHMENT_UNUSED || !z_test_enable || !cmd->device->use_lrz)
      return gras_lrz_cntl;

   if (!cmd->state.lrz.gpu_dir_tracking && !cmd->state.attachments) {
      /* Without on-gpu LRZ direction tracking - there is nothing we
       * can do to enable LRZ in secondary command buffers.
       */
      return gras_lrz_cntl;
   }

   /* See comment in tu_pipeline about disabling LRZ write for blending. */
   bool reads_dest = cmd->state.blend_reads_dest;

   gras_lrz_cntl.enable = true;
   gras_lrz_cntl.lrz_write =
      z_write_enable &&
      !reads_dest &&
      !(fs->fs.lrz.status & TU_LRZ_FORCE_DISABLE_WRITE);
   gras_lrz_cntl.z_test_enable = z_write_enable;
   gras_lrz_cntl.z_bounds_enable = z_bounds_enable;
   gras_lrz_cntl.fc_enable = cmd->state.lrz.fast_clear;
   gras_lrz_cntl.dir_write = cmd->state.lrz.gpu_dir_tracking;
   gras_lrz_cntl.disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking;

   if (CHIP >= A7XX)
      gras_lrz_cntl.z_func = tu6_compare_func(depth_compare_op);

   /* LRZ is disabled until it is cleared, which means that one "wrong"
    * depth test or shader could disable LRZ until depth buffer is cleared.
    */
   bool disable_lrz = false;
   bool temporary_disable_lrz = false;

   /* What happens in FS could affect LRZ, e.g.: writes to gl_FragDepth or early
    * fragment tests.  We have to skip LRZ testing and updating, but as long as
    * the depth direction stayed the same we can continue with LRZ testing later.
    */
   if (fs->fs.lrz.status & TU_LRZ_FORCE_DISABLE_LRZ) {
      if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN || !cmd->state.lrz.gpu_dir_tracking) {
         perf_debug(cmd->device, "Skipping LRZ due to FS");
         temporary_disable_lrz = true;
      } else {
         tu_lrz_disable_reason(cmd, "FS writes depth or has side-effects (TODO: fix for gpu-direction-tracking case)");
         disable_lrz = true;
      }
   }

   /* If Z is not written, it doesn't affect the LRZ buffer state.
    * This means two things:
    * - Don't lock the direction until Z is written for the first time;
    * - If Z isn't written and the direction IS locked, it's possible to just
    *   temporarily disable LRZ instead of fully bailing out when the
    *   direction changes.
    */

   enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
   switch (depth_compare_op) {
   case VK_COMPARE_OP_ALWAYS:
   case VK_COMPARE_OP_NOT_EQUAL:
      /* OP_ALWAYS and OP_NOT_EQUAL can write depth values in any direction,
       * so if there is a depth write, LRZ must be disabled.
       */
      if (z_write_enable) {
         tu_lrz_disable_reason(cmd, "Depth write + ALWAYS/NOT_EQUAL");
         disable_lrz = true;
         gras_lrz_cntl.dir = LRZ_DIR_INVALID;
      } else {
         perf_debug(cmd->device, "Skipping LRZ due to ALWAYS/NOT_EQUAL");
         temporary_disable_lrz = true;
      }
      break;
   case VK_COMPARE_OP_EQUAL:
   case VK_COMPARE_OP_NEVER:
      /* The blob disables LRZ for OP_EQUAL, and from our empirical
       * evidence it is the right thing to do.
       *
       * Both OP_EQUAL and OP_NEVER don't change the LRZ buffer, so
       * we could just temporarily disable LRZ.
       */
      temporary_disable_lrz = true;
      break;
   case VK_COMPARE_OP_GREATER:
   case VK_COMPARE_OP_GREATER_OR_EQUAL:
      lrz_direction = TU_LRZ_GREATER;
      gras_lrz_cntl.greater = true;
      gras_lrz_cntl.dir = LRZ_DIR_GE;
      break;
   case VK_COMPARE_OP_LESS:
   case VK_COMPARE_OP_LESS_OR_EQUAL:
      lrz_direction = TU_LRZ_LESS;
      gras_lrz_cntl.greater = false;
      gras_lrz_cntl.dir = LRZ_DIR_LE;
      break;
   default:
      unreachable("bad VK_COMPARE_OP value or uninitialized");
      break;
   }

   /* If the depthfunc direction is changed, bail out on using LRZ. The
    * LRZ buffer encodes a min/max depth value per block, but if
    * we switch from GT/GE <-> LT/LE, those values cannot be
    * interpreted properly.
    */
   if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
       lrz_direction != TU_LRZ_UNKNOWN &&
       cmd->state.lrz.prev_direction != lrz_direction) {
      if (z_write_enable) {
         tu_lrz_disable_reason(cmd, "Depth write + compare-op direction change");
         disable_lrz = true;
      } else {
         perf_debug(cmd->device, "Skipping LRZ due to direction change");
         temporary_disable_lrz = true;
      }
   }

   /* Consider the following sequence of depthfunc changes:
    *
    * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_GREATER
    * LRZ is disabled during COMPARE_OP_EQUAL but could be enabled
    * during the second VK_COMPARE_OP_GREATER.
    *
    * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_LESS
    * Here, LRZ is disabled during COMPARE_OP_EQUAL and should become
    * invalid during COMPARE_OP_LESS.
    *
    * This shows that we should keep the last KNOWN direction.
    */
   if (z_write_enable && lrz_direction != TU_LRZ_UNKNOWN)
      cmd->state.lrz.prev_direction = lrz_direction;

   /* Invalidate LRZ and disable write if stencil test is enabled */
   bool stencil_test_enable = cmd->vk.dynamic_graphics_state.ds.stencil.test_enable;
   if (!disable_lrz && stencil_test_enable) {
      VkCompareOp stencil_front_compare_op = (VkCompareOp)
         cmd->vk.dynamic_graphics_state.ds.stencil.front.op.compare;

      VkCompareOp stencil_back_compare_op = (VkCompareOp)
         cmd->vk.dynamic_graphics_state.ds.stencil.back.op.compare;

      bool lrz_allowed = true;
      lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
                                      &gras_lrz_cntl, stencil_front_compare_op,
                                      cmd->state.stencil_front_write);

      lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
                                      &gras_lrz_cntl, stencil_back_compare_op,
                                      cmd->state.stencil_back_write);

      /* Without a depth write it's enough to make sure that the depth test
       * is executed after the stencil test, so temporarily disabling LRZ is
       * enough.
       */
      if (!lrz_allowed) {
         if (z_write_enable) {
            tu_lrz_disable_reason(cmd, "Stencil write");
            disable_lrz = true;
         } else {
            perf_debug(cmd->device, "Skipping LRZ due to stencil write");
            temporary_disable_lrz = true;
         }
      }
   }

   /* Writing depth with blend enabled means we need to invalidate LRZ,
    * because the written depth value could mean that a later draw with
    * depth enabled (where we would otherwise write LRZ) could have
    * fragments which don't pass the depth test due to this draw.  For
    * example, consider this sequence of draws, with depth mode GREATER:
    *
    *   draw A:
    *     z=0.1, fragments pass
    *   draw B:
    *     z=0.4, fragments pass
    *     blend enabled (LRZ write disabled)
    *     depth write enabled
    *   draw C:
    *     z=0.2, fragments don't pass
    *     blend disabled
    *     depth write enabled
    *
    * Normally looking at the state in draw C, we'd assume we could
    * enable LRZ write.  But this would cause early-z/lrz to discard
    * fragments from draw A which should be visible due to draw B.
    */
   if (reads_dest && z_write_enable && cmd->device->instance->conservative_lrz) {
      tu_lrz_disable_reason(cmd, "Depth write + blending");
      disable_lrz = true;
   }

   if (disable_lrz)
      cmd->state.lrz.valid = false;

   if (disable_lrz && cmd->state.lrz.gpu_dir_tracking) {
      /* The direction byte on the GPU should be set to CUR_DIR_DISABLED;
       * for this it's not enough to emit an empty GRAS_LRZ_CNTL.
       */
      gras_lrz_cntl.enable = true;
      gras_lrz_cntl.dir = LRZ_DIR_INVALID;

      return gras_lrz_cntl;
   }

   if (temporary_disable_lrz)
      gras_lrz_cntl.enable = false;

   cmd->state.lrz.enabled = cmd->state.lrz.valid && gras_lrz_cntl.enable;
   if (!cmd->state.lrz.enabled)
      memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));

   return gras_lrz_cntl;
}

template <chip CHIP>
void
tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state<CHIP>(cmd, a);

   tu6_write_lrz_cntl<CHIP>(cmd, cs, gras_lrz_cntl);
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));
}
TU_GENX(tu6_emit_lrz);
932