/*
 * Copyright © 2016 Rob Clark <robclark@freedesktop.org>
 * Copyright © 2018 Google, Inc.
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#define FD_BO_NO_HARDPIN 1

#include <stdio.h>

#include "pipe/p_state.h"
#include "util/format/u_format.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_string.h"

#include "freedreno_draw.h"
#include "freedreno_resource.h"
#include "freedreno_state.h"
#include "freedreno_tracepoints.h"

#include "fd6_barrier.h"
#include "fd6_blitter.h"
#include "fd6_context.h"
#include "fd6_draw.h"
#include "fd6_emit.h"
#include "fd6_gmem.h"
#include "fd6_pack.h"
#include "fd6_program.h"
#include "fd6_resource.h"
#include "fd6_zsa.h"

/**
 * Emits the flags registers, suitable for RB_MRT_FLAG_BUFFER,
 * RB_DEPTH_FLAG_BUFFER, SP_PS_2D_SRC_FLAGS, and RB_BLIT_FLAG_DST.
 */
void
fd6_emit_flag_reference(struct fd_ringbuffer *ring, struct fd_resource *rsc,
                        int level, int layer)
{
   if (fd_resource_ubwc_enabled(rsc, level)) {
      OUT_RELOC(ring, rsc->bo, fd_resource_ubwc_offset(rsc, level, layer), 0,
                0);
      OUT_RING(ring, A6XX_RB_MRT_FLAG_BUFFER_PITCH_PITCH(
                        fdl_ubwc_pitch(&rsc->layout, level)) |
                        A6XX_RB_MRT_FLAG_BUFFER_PITCH_ARRAY_PITCH(
                           rsc->layout.ubwc_layer_size >> 2));
   } else {
      OUT_RING(ring, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_LO */
      OUT_RING(ring, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_HI */
      OUT_RING(ring, 0x00000000);
   }
}

template <chip CHIP>
static void
emit_mrt(struct fd_ringbuffer *ring, struct pipe_framebuffer_state *pfb,
         const struct fd_gmem_stateobj *gmem)
{
   unsigned srgb_cntl = 0;
   unsigned i;

   /* Note, GLES 3.2 says "If the fragment’s layer number is negative, or
    * greater than or equal to the minimum number of layers of any attachment,
    * the effects of the fragment on the framebuffer contents are undefined."
    */
   unsigned max_layer_index = 0;
   enum a6xx_format mrt0_format = FMT6_NONE;

   for (i = 0; i < pfb->nr_cbufs; i++) {
      enum a3xx_color_swap swap = WZYX;
      bool sint = false, uint = false;
      struct fd_resource *rsc = NULL;
      ASSERTED struct fdl_slice *slice = NULL;
      uint32_t stride = 0;
      uint32_t array_stride = 0;
      uint32_t offset;

      if (!pfb->cbufs[i])
         continue;

      struct pipe_surface *psurf = pfb->cbufs[i];
      enum pipe_format pformat = psurf->format;
      rsc = fd_resource(psurf->texture);

      uint32_t base = gmem ? gmem->cbuf_base[i] : 0;
      slice = fd_resource_slice(rsc, psurf->u.tex.level);
      enum a6xx_tile_mode tile_mode = (enum a6xx_tile_mode)
            fd_resource_tile_mode(psurf->texture, psurf->u.tex.level);
      enum a6xx_format format = fd6_color_format(pformat, tile_mode);
      sint = util_format_is_pure_sint(pformat);
      uint = util_format_is_pure_uint(pformat);

      if (util_format_is_srgb(pformat))
         srgb_cntl |= (1 << i);

      offset =
         fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer);

      stride = fd_resource_pitch(rsc, psurf->u.tex.level);
      array_stride = fd_resource_layer_stride(rsc, psurf->u.tex.level);
      swap = fd6_color_swap(pformat, (enum a6xx_tile_mode)rsc->layout.tile_mode, false);

      max_layer_index = psurf->u.tex.last_layer - psurf->u.tex.first_layer;

      assert((offset + slice->size0) <= fd_bo_size(rsc->bo));

      /* Batch with no draws? */
      fd_ringbuffer_attach_bo(ring, rsc->bo);

      OUT_REG(ring,
         RB_MRT_BUF_INFO(CHIP, i,
            .color_format = format,
            .color_tile_mode = tile_mode,
            .color_swap = swap,
            .losslesscompen = fd_resource_ubwc_enabled(rsc, psurf->u.tex.level),
         ),
         A6XX_RB_MRT_PITCH(i, stride),
         A6XX_RB_MRT_ARRAY_PITCH(i, array_stride),
         A6XX_RB_MRT_BASE(i, .bo = rsc->bo, .bo_offset = offset),
         A6XX_RB_MRT_BASE_GMEM(i, base));

      OUT_REG(ring, A6XX_SP_FS_MRT_REG(i, .color_format = format,
                                       .color_sint = sint, .color_uint = uint));

      OUT_PKT4(ring, REG_A6XX_RB_MRT_FLAG_BUFFER(i), 3);
      fd6_emit_flag_reference(ring, rsc, psurf->u.tex.level,
                              psurf->u.tex.first_layer);

      if (i == 0)
         mrt0_format = format;
   }
   if (pfb->zsbuf)
      max_layer_index = pfb->zsbuf->u.tex.last_layer - pfb->zsbuf->u.tex.first_layer;

   OUT_REG(ring, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = mrt0_format));

   OUT_REG(ring, A6XX_RB_SRGB_CNTL(.dword = srgb_cntl));
   OUT_REG(ring, A6XX_SP_SRGB_CNTL(.dword = srgb_cntl));

   OUT_REG(ring, A6XX_GRAS_MAX_LAYER_INDEX(max_layer_index));
}

template <chip CHIP>
static void
emit_zs(struct fd_context *ctx, struct fd_ringbuffer *ring,
        struct pipe_surface *zsbuf, const struct fd_gmem_stateobj *gmem)
{
   if (zsbuf) {
      struct fd_resource *rsc = fd_resource(zsbuf->texture);
      struct fd_resource *stencil = rsc->stencil;
      uint32_t stride = fd_resource_pitch(rsc, zsbuf->u.tex.level);
      uint32_t array_stride = fd_resource_layer_stride(rsc, zsbuf->u.tex.level);
      uint32_t base = gmem ? gmem->zsbuf_base[0] : 0;
      uint32_t offset =
         fd_resource_offset(rsc, zsbuf->u.tex.level, zsbuf->u.tex.first_layer);

      /* We could have a depth buffer, but no draws with depth write/test
       * enabled, in which case it wouldn't have been part of the batch
       * resource tracking
       */
      fd_ringbuffer_attach_bo(ring, rsc->bo);

      if (zsbuf->format == PIPE_FORMAT_S8_UINT) {
         /* S8 is implemented as Z32_S8 minus the Z32 plane: */
         enum a6xx_depth_format fmt = DEPTH6_32;

         OUT_REG(ring,
            RB_DEPTH_BUFFER_INFO(CHIP,
               .depth_format = fmt,
               .tilemode = TILE6_3,
               .losslesscompen = fd_resource_ubwc_enabled(rsc, zsbuf->u.tex.level),
            ),
            A6XX_RB_DEPTH_BUFFER_PITCH(0),
            A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0),
            A6XX_RB_DEPTH_BUFFER_BASE(.qword = 0),
            A6XX_RB_DEPTH_BUFFER_BASE_GMEM(base));

         OUT_REG(ring, A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));

         stencil = rsc;
      } else {
         enum a6xx_depth_format fmt = fd6_pipe2depth(zsbuf->format);

         OUT_REG(ring,
            RB_DEPTH_BUFFER_INFO(CHIP,
               .depth_format = fmt,
               .tilemode = TILE6_3,
               .losslesscompen = fd_resource_ubwc_enabled(rsc, zsbuf->u.tex.level),
            ),
            A6XX_RB_DEPTH_BUFFER_PITCH(stride),
            A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(array_stride),
            A6XX_RB_DEPTH_BUFFER_BASE(.bo = rsc->bo, .bo_offset = offset),
            A6XX_RB_DEPTH_BUFFER_BASE_GMEM(base));

         OUT_REG(ring, A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));

         OUT_PKT4(ring, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE, 3);
         fd6_emit_flag_reference(ring, rsc, zsbuf->u.tex.level,
                                 zsbuf->u.tex.first_layer);
      }

      if (stencil) {
         stride = fd_resource_pitch(stencil, zsbuf->u.tex.level);
         array_stride = fd_resource_layer_stride(stencil, zsbuf->u.tex.level);
         uint32_t base = gmem ? gmem->zsbuf_base[1] : 0;
         uint32_t offset =
            fd_resource_offset(stencil, zsbuf->u.tex.level, zsbuf->u.tex.first_layer);

         fd_ringbuffer_attach_bo(ring, stencil->bo);

         OUT_REG(ring,
            RB_STENCIL_INFO(
               CHIP,
               .separate_stencil = true,
               .tilemode = TILE6_3,
            ),
            A6XX_RB_STENCIL_BUFFER_PITCH(stride),
            A6XX_RB_STENCIL_BUFFER_ARRAY_PITCH(array_stride),
            A6XX_RB_STENCIL_BUFFER_BASE(.bo = stencil->bo, .bo_offset = offset),
            A6XX_RB_STENCIL_BUFFER_BASE_GMEM(base)
         );
      } else {
         OUT_REG(ring, RB_STENCIL_INFO(CHIP, 0));
      }
   } else {
      OUT_REG(ring,
              RB_DEPTH_BUFFER_INFO(
                    CHIP,
                    .depth_format = DEPTH6_NONE,
              ),
              A6XX_RB_DEPTH_BUFFER_PITCH(),
              A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(),
              A6XX_RB_DEPTH_BUFFER_BASE(),
              A6XX_RB_DEPTH_BUFFER_BASE_GMEM(),
      );

      OUT_REG(ring,
              A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));

      OUT_REG(ring, RB_STENCIL_INFO(CHIP, 0));
   }
}

template <chip CHIP>
static void
emit_lrz(struct fd_batch *batch, struct fd_batch_subpass *subpass)
{
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;
   struct fd_ringbuffer *ring = batch->gmem;

   if (!subpass->lrz) {
      OUT_REG(ring, A6XX_GRAS_LRZ_BUFFER_BASE(),
              A6XX_GRAS_LRZ_BUFFER_PITCH(),
              A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE());
      if (CHIP >= A7XX)
         OUT_REG(ring, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO());
      return;
   }

   /* When swapping LRZ buffers we need to flush LRZ cache..
    * we possibly don't need this during the binning pass, it
    * appears that the corruption happens on the read-side, ie.
    * we change the LRZ buffer after a sub-pass, but get a
    * cache-hit on stale data from the previous LRZ buffer.
    */
   fd6_event_write<CHIP>(batch->ctx, ring, FD_LRZ_FLUSH);

   struct fd_resource *zsbuf = fd_resource(pfb->zsbuf->texture);
   OUT_REG(ring,
      A6XX_GRAS_LRZ_BUFFER_BASE(.bo = subpass->lrz),
      A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = zsbuf->lrz_pitch),
      A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(
         .bo = zsbuf->lrz_fc_offset ? subpass->lrz : NULL,
         .bo_offset = zsbuf->lrz_fc_offset
      ),
   );
   fd_ringbuffer_attach_bo(ring, subpass->lrz);

   if (CHIP >= A7XX) {
      OUT_REG(ring,
         A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO(
            .depth_format = fd6_pipe2depth(pfb->zsbuf->format),
         )
      );
   }
}

/* Emit any needed lrz clears to the prologue cmds
 */
template <chip CHIP>
static void
emit_lrz_clears(struct fd_batch *batch)
{
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;
   struct fd_context *ctx = batch->ctx;
   unsigned count = 0;

   if (!pfb->zsbuf)
      return;

   struct fd_resource *zsbuf = fd_resource(pfb->zsbuf->texture);

   foreach_subpass (subpass, batch) {
      /* The lrz buffer isn't explicitly tracked by the batch resource
       * tracking (tracking the zsbuf is sufficient), but it still needs
       * to be attached to the ring
       */
      if (subpass->lrz)
         fd_ringbuffer_attach_bo(batch->gmem, subpass->lrz);

      if (!(subpass->fast_cleared & FD_BUFFER_LRZ))
         continue;

      subpass->fast_cleared &= ~FD_BUFFER_LRZ;

      /* prep before first clear: */
      if (count == 0) {
         struct fd_ringbuffer *ring = fd_batch_get_prologue(batch);

         fd6_emit_ccu_cntl<CHIP>(ring, ctx->screen, false);

         OUT_PKT7(ring, CP_SET_MARKER, 1);
         OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BLIT2DSCALE));

         fd6_emit_flushes<CHIP>(ctx, ring, FD6_FLUSH_CACHE);

         if (ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
             ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL) {
            /* This is a non-context register, so we have to WFI before changing. */
            OUT_WFI5(ring);
            OUT_PKT4(ring, REG_A6XX_RB_DBG_ECO_CNTL, 1);
            OUT_RING(ring, ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL_blit);
         }
      }

      fd6_clear_lrz<CHIP>(batch, zsbuf, subpass->lrz, subpass->clear_depth);

      count++;
   }

   /* cleanup after last clear: */
   if (count > 0) {
      struct fd_ringbuffer *ring = fd_batch_get_prologue(batch);

      if (ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
          ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL) {
         OUT_WFI5(ring);
         OUT_PKT4(ring, REG_A6XX_RB_DBG_ECO_CNTL, 1);
         OUT_RING(ring, ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL);
      }

      /* Clearing writes via CCU color in the PS stage, and LRZ is read via
       * UCHE in the earlier GRAS stage.
       *
       * Note tu also asks for WFI but maybe that is only needed if
       * has_ccu_flush_bug (and it is added by fd6_emit_flushes() already
       * in that case)
       */
      fd6_emit_flushes<CHIP>(batch->ctx, ring,
                             FD6_FLUSH_CCU_COLOR |
                             FD6_INVALIDATE_CACHE);
   }
}

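/* Note: visibility is tested one bit per bin within a pipe (see the
 * CP_REG_TEST of VSC_STATE_REG(tile->p) / BIT(tile->n) in
 * emit_conditional_ib() below), so a pipe can cover at most 32 bins:
 */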
static bool
use_hw_binning(struct fd_batch *batch)
{
   const struct fd_gmem_stateobj *gmem = batch->gmem_state;

   if ((gmem->maxpw * gmem->maxph) > 32)
      return false;

   return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) >= 2) &&
          (batch->num_draws > 0);
}

static void
patch_fb_read_gmem(struct fd_batch *batch)
{
   struct fd_screen *screen = batch->ctx->screen;
   const struct fd_gmem_stateobj *gmem = batch->gmem_state;
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;

   unsigned num_patches = fd_patch_num_elements(&batch->fb_read_patches);
   if (!num_patches)
      return;

   for (unsigned i = 0; i < num_patches; i++) {
      struct fd_cs_patch *patch =
         fd_patch_element(&batch->fb_read_patches, i);
      int buf = patch->val;
      struct pipe_surface *psurf = pfb->cbufs[buf];
      struct pipe_resource *prsc = psurf->texture;
      struct fd_resource *rsc = fd_resource(prsc);
      enum pipe_format format = psurf->format;

      uint8_t swiz[4];
      fdl6_format_swiz(psurf->format, false, swiz);

      uint64_t base = screen->gmem_base + gmem->cbuf_base[buf];
      /* always TILE6_2 mode in GMEM, which also means no swap: */
      uint32_t descriptor[FDL6_TEX_CONST_DWORDS] = {
            A6XX_TEX_CONST_0_FMT(fd6_texture_format(
                  format, (enum a6xx_tile_mode)rsc->layout.tile_mode, false)) |
            A6XX_TEX_CONST_0_SAMPLES(fd_msaa_samples(prsc->nr_samples)) |
            A6XX_TEX_CONST_0_SWAP(WZYX) |
            A6XX_TEX_CONST_0_TILE_MODE(TILE6_2) |
            COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) |
            A6XX_TEX_CONST_0_SWIZ_X(fdl6_swiz(swiz[0])) |
            A6XX_TEX_CONST_0_SWIZ_Y(fdl6_swiz(swiz[1])) |
            A6XX_TEX_CONST_0_SWIZ_Z(fdl6_swiz(swiz[2])) |
            A6XX_TEX_CONST_0_SWIZ_W(fdl6_swiz(swiz[3])),

         A6XX_TEX_CONST_1_WIDTH(pfb->width) |
            A6XX_TEX_CONST_1_HEIGHT(pfb->height),

         A6XX_TEX_CONST_2_PITCH(gmem->bin_w * gmem->cbuf_cpp[buf]) |
            A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D),

         A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size),
         A6XX_TEX_CONST_4_BASE_LO(base),

         A6XX_TEX_CONST_5_BASE_HI(base >> 32) |
            A6XX_TEX_CONST_5_DEPTH(prsc->array_size)
      };

      memcpy(patch->cs, descriptor, FDL6_TEX_CONST_DWORDS * 4);
   }

   util_dynarray_clear(&batch->fb_read_patches);
}

template <chip CHIP>
static void
patch_fb_read_sysmem(struct fd_batch *batch)
{
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;

   unsigned num_patches =
      fd_patch_num_elements(&batch->fb_read_patches);
   if (!num_patches)
      return;
   for (unsigned i = 0; i < num_patches; i++) {
      struct fd_cs_patch *patch =
         fd_patch_element(&batch->fb_read_patches, i);
      int buf = patch->val;

      struct pipe_surface *psurf = pfb->cbufs[buf];
      if (!psurf)
         return;

      struct pipe_resource *prsc = psurf->texture;
      struct fd_resource *rsc = fd_resource(prsc);

      struct fdl_view_args args = {
         .chip = CHIP,

         .iova = fd_bo_get_iova(rsc->bo),

         .base_miplevel = psurf->u.tex.level,
         .level_count = 1,

         .base_array_layer = psurf->u.tex.first_layer,
         .layer_count = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1,

         .swiz = {PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z,
                  PIPE_SWIZZLE_W},
         .format = psurf->format,

         .type = FDL_VIEW_TYPE_2D,
         .chroma_offsets = {FDL_CHROMA_LOCATION_COSITED_EVEN,
                            FDL_CHROMA_LOCATION_COSITED_EVEN},
      };
      const struct fdl_layout *layouts[3] = {&rsc->layout, NULL, NULL};
      struct fdl6_view view;
      fdl6_view_init(&view, layouts, &args,
                     batch->ctx->screen->info->a6xx.has_z24uint_s8uint);
      memcpy(patch->cs, view.descriptor, FDL6_TEX_CONST_DWORDS * 4);
   }

   util_dynarray_clear(&batch->fb_read_patches);
}

template <chip CHIP>
static void
update_render_cntl(struct fd_batch *batch, struct pipe_framebuffer_state *pfb,
                   bool binning)
{
   struct fd_ringbuffer *ring = batch->gmem;

   if (CHIP >= A7XX) {
      OUT_REG(ring,
         RB_RENDER_CNTL(
            CHIP,
            .binning = binning,
            .raster_mode = TYPE_TILED,
            .raster_direction = LR_TB
         )
      );
      OUT_REG(ring,
         A7XX_GRAS_SU_RENDER_CNTL(
            .binning = binning,
         )
      );
      return;
   }

   struct fd_screen *screen = batch->ctx->screen;
   bool depth_ubwc_enable = false;
   uint32_t mrts_ubwc_enable = 0;
   int i;

   if (pfb->zsbuf) {
      struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
      depth_ubwc_enable =
         fd_resource_ubwc_enabled(rsc, pfb->zsbuf->u.tex.level);
   }

   for (i = 0; i < pfb->nr_cbufs; i++) {
      if (!pfb->cbufs[i])
         continue;

      struct pipe_surface *psurf = pfb->cbufs[i];
      struct fd_resource *rsc = fd_resource(psurf->texture);

      if (fd_resource_ubwc_enabled(rsc, psurf->u.tex.level))
         mrts_ubwc_enable |= 1 << i;
   }

   struct fd_reg_pair rb_render_cntl = RB_RENDER_CNTL(
         CHIP,
         .ccusinglecachelinesize = 2,
         .binning = binning,
         .flag_depth = depth_ubwc_enable,
         .flag_mrts = mrts_ubwc_enable,
   );

   if (screen->info->a6xx.has_cp_reg_write) {
      OUT_PKT(ring, CP_REG_WRITE,
              CP_REG_WRITE_0(TRACK_RENDER_CNTL),
              CP_REG_WRITE_1(rb_render_cntl.reg),
              CP_REG_WRITE_2(rb_render_cntl.value),
      );
   } else {
      OUT_REG(ring, rb_render_cntl);
   }
}

static void
update_vsc_pipe(struct fd_batch *batch)
{
   struct fd_context *ctx = batch->ctx;
   struct fd6_context *fd6_ctx = fd6_context(ctx);
   const struct fd_gmem_stateobj *gmem = batch->gmem_state;
   struct fd_ringbuffer *ring = batch->gmem;
   unsigned max_vsc_pipes = batch->ctx->screen->info->num_vsc_pipes;
   int i;

   if (batch->draw_strm_bits / 8 > fd6_ctx->vsc_draw_strm_pitch) {
      if (fd6_ctx->vsc_draw_strm)
         fd_bo_del(fd6_ctx->vsc_draw_strm);
      fd6_ctx->vsc_draw_strm = NULL;
      /* Note: probably only need to align to 0x40, but aligning stronger
       * reduces the odds that we will have to realloc again on the next
       * frame:
       */
      fd6_ctx->vsc_draw_strm_pitch = align(batch->draw_strm_bits / 8, 0x4000);
      mesa_logd("pre-resize VSC_DRAW_STRM_PITCH to: 0x%x",
                fd6_ctx->vsc_draw_strm_pitch);
   }

   if (batch->prim_strm_bits / 8 > fd6_ctx->vsc_prim_strm_pitch) {
      if (fd6_ctx->vsc_prim_strm)
         fd_bo_del(fd6_ctx->vsc_prim_strm);
      fd6_ctx->vsc_prim_strm = NULL;
      fd6_ctx->vsc_prim_strm_pitch = align(batch->prim_strm_bits / 8, 0x4000);
      mesa_logd("pre-resize VSC_PRIM_STRM_PITCH to: 0x%x",
                fd6_ctx->vsc_prim_strm_pitch);
   }

   if (!fd6_ctx->vsc_draw_strm) {
      /* We also use four bytes per vsc pipe at the end of the draw
       * stream buffer for VSC_DRAW_STRM_SIZE written back by hw
       * (see VSC_DRAW_STRM_SIZE_ADDRESS)
       */
      unsigned sz = (max_vsc_pipes * fd6_ctx->vsc_draw_strm_pitch) +
                    (max_vsc_pipes * 4);
      fd6_ctx->vsc_draw_strm =
         fd_bo_new(ctx->screen->dev, sz, FD_BO_NOMAP, "vsc_draw_strm");
   }
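
   /* Sketch of the resulting vsc_draw_strm layout:
    *
    *    [pipe 0 stream][pipe 1 stream]...[pipe N-1 stream][N x u32 sizes]
    *
    * where each per-pipe stream is vsc_draw_strm_pitch bytes and the
    * trailing u32s are the VSC_DRAW_STRM_SIZE values written back by hw.
    */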

   if (!fd6_ctx->vsc_prim_strm) {
      unsigned sz = max_vsc_pipes * fd6_ctx->vsc_prim_strm_pitch;
      fd6_ctx->vsc_prim_strm =
         fd_bo_new(ctx->screen->dev, sz, FD_BO_NOMAP, "vsc_prim_strm");
   }

   fd_ringbuffer_attach_bo(ring, fd6_ctx->vsc_draw_strm);
   fd_ringbuffer_attach_bo(ring, fd6_ctx->vsc_prim_strm);

   OUT_REG(ring, A6XX_VSC_BIN_SIZE(.width = gmem->bin_w, .height = gmem->bin_h),
           A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = fd6_ctx->vsc_draw_strm,
                                           .bo_offset = max_vsc_pipes *
                                              fd6_ctx->vsc_draw_strm_pitch));

   OUT_REG(ring, A6XX_VSC_BIN_COUNT(.nx = gmem->nbins_x, .ny = gmem->nbins_y));

   OUT_PKT4(ring, REG_A6XX_VSC_PIPE_CONFIG_REG(0), max_vsc_pipes);
   for (i = 0; i < max_vsc_pipes; i++) {
      const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i];
      OUT_RING(ring, A6XX_VSC_PIPE_CONFIG_REG_X(pipe->x) |
                        A6XX_VSC_PIPE_CONFIG_REG_Y(pipe->y) |
                        A6XX_VSC_PIPE_CONFIG_REG_W(pipe->w) |
                        A6XX_VSC_PIPE_CONFIG_REG_H(pipe->h));
   }

   OUT_REG(
      ring, A6XX_VSC_PRIM_STRM_ADDRESS(.bo = fd6_ctx->vsc_prim_strm),
      A6XX_VSC_PRIM_STRM_PITCH(.dword = fd6_ctx->vsc_prim_strm_pitch),
      A6XX_VSC_PRIM_STRM_LIMIT(.dword = fd6_ctx->vsc_prim_strm_pitch - 64));

   OUT_REG(
      ring, A6XX_VSC_DRAW_STRM_ADDRESS(.bo = fd6_ctx->vsc_draw_strm),
      A6XX_VSC_DRAW_STRM_PITCH(.dword = fd6_ctx->vsc_draw_strm_pitch),
      A6XX_VSC_DRAW_STRM_LIMIT(.dword = fd6_ctx->vsc_draw_strm_pitch - 64));
}

/*
 * If overflow is detected, either 0x1 (VSC_DRAW_STRM overflow) or 0x3
 * (VSC_PRIM_STRM overflow) plus the size of the overflowed buffer is
 * written to control->vsc_overflow.  This allows the CPU to
 * detect which buffer overflowed (and, since the current size is
 * encoded as well, this protects against already-submitted but
 * not executed batches from fooling the CPU into increasing the
 * size again unnecessarily).
 */
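/* For example, a value of 0x4001 decodes as buffer 0x1 (draw stream)
 * and size 0x4000; if vsc_draw_strm_pitch has already grown past
 * 0x4000, the overflow came from a stale batch submitted before the
 * resize and is ignored (see check_vsc_overflow() below).
 */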
static void
emit_vsc_overflow_test(struct fd_batch *batch)
{
   struct fd_ringbuffer *ring = batch->gmem;
   const struct fd_gmem_stateobj *gmem = batch->gmem_state;
   struct fd6_context *fd6_ctx = fd6_context(batch->ctx);

   assert((fd6_ctx->vsc_draw_strm_pitch & 0x3) == 0);
   assert((fd6_ctx->vsc_prim_strm_pitch & 0x3) == 0);

   /* Check for overflow, write vsc_scratch if detected: */
   for (int i = 0; i < gmem->num_vsc_pipes; i++) {
      OUT_PKT7(ring, CP_COND_WRITE5, 8);
      OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
                        CP_COND_WRITE5_0_WRITE_MEMORY);
      OUT_RING(ring, CP_COND_WRITE5_1_POLL_ADDR_LO(
                        REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i)));
      OUT_RING(ring, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
      OUT_RING(ring, CP_COND_WRITE5_3_REF(fd6_ctx->vsc_draw_strm_pitch - 64));
      OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0));
      OUT_RELOC(ring,
                control_ptr(fd6_ctx, vsc_overflow)); /* WRITE_ADDR_LO/HI */
      OUT_RING(ring,
               CP_COND_WRITE5_7_WRITE_DATA(1 + fd6_ctx->vsc_draw_strm_pitch));

      OUT_PKT7(ring, CP_COND_WRITE5, 8);
      OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
                        CP_COND_WRITE5_0_WRITE_MEMORY);
      OUT_RING(ring, CP_COND_WRITE5_1_POLL_ADDR_LO(
                        REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i)));
      OUT_RING(ring, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
      OUT_RING(ring, CP_COND_WRITE5_3_REF(fd6_ctx->vsc_prim_strm_pitch - 64));
      OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0));
      OUT_RELOC(ring,
                control_ptr(fd6_ctx, vsc_overflow)); /* WRITE_ADDR_LO/HI */
      OUT_RING(ring,
               CP_COND_WRITE5_7_WRITE_DATA(3 + fd6_ctx->vsc_prim_strm_pitch));
   }

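   /* Ensure the CP_COND_WRITE5 results above have landed in memory
    * before the CPU inspects vsc_overflow:
    */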
   OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0);
}

static void
check_vsc_overflow(struct fd_context *ctx)
{
   struct fd6_context *fd6_ctx = fd6_context(ctx);
   struct fd6_control *control =
         (struct fd6_control *)fd_bo_map(fd6_ctx->control_mem);
   uint32_t vsc_overflow = control->vsc_overflow;

   if (!vsc_overflow)
      return;

   /* clear overflow flag: */
   control->vsc_overflow = 0;

   unsigned buffer = vsc_overflow & 0x3;
   unsigned size = vsc_overflow & ~0x3;

   if (buffer == 0x1) {
      /* VSC_DRAW_STRM overflow: */

      if (size < fd6_ctx->vsc_draw_strm_pitch) {
         /* we've already increased the size, this overflow is
          * from a batch submitted before resize, but executed
          * after
          */
         return;
      }

      fd_bo_del(fd6_ctx->vsc_draw_strm);
      fd6_ctx->vsc_draw_strm = NULL;
      fd6_ctx->vsc_draw_strm_pitch *= 2;

      mesa_logd("resized VSC_DRAW_STRM_PITCH to: 0x%x",
                fd6_ctx->vsc_draw_strm_pitch);

   } else if (buffer == 0x3) {
      /* VSC_PRIM_STRM overflow: */

      if (size < fd6_ctx->vsc_prim_strm_pitch) {
         /* we've already increased the size */
         return;
      }

      fd_bo_del(fd6_ctx->vsc_prim_strm);
      fd6_ctx->vsc_prim_strm = NULL;
      fd6_ctx->vsc_prim_strm_pitch *= 2;

      mesa_logd("resized VSC_PRIM_STRM_PITCH to: 0x%x",
                fd6_ctx->vsc_prim_strm_pitch);

   } else {
      /* NOTE: it's possible, for example, for overflow to corrupt the
       * control page.  I mostly just see this hit if I set initial VSC
       * buffer size extremely small.  Things still seem to recover,
       * but maybe we should pre-emptively realloc vsc_data/vsc_data2
       * and hope for different memory placement?
       */
      mesa_loge("invalid vsc_overflow value: 0x%08x", vsc_overflow);
   }
}

template <chip CHIP>
static void
emit_common_init(struct fd_batch *batch)
{
   struct fd_context *ctx = batch->ctx;
   struct fd_ringbuffer *ring = batch->gmem;
   struct fd_autotune *at = &batch->ctx->autotune;
   struct fd_batch_result *result = batch->autotune_result;

   if (!result)
      return;

   fd_ringbuffer_attach_bo(ring, at->results_mem);

   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1);
   OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY);

   if (!ctx->screen->info->a7xx.has_event_write_sample_count) {
      OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
      OUT_RELOC(ring, results_ptr(at, result[result->idx].samples_start));

      fd6_event_write<CHIP>(ctx, ring, FD_ZPASS_DONE);

      /* Copied from blob's cmdstream, not sure why it is done. */
      if (CHIP == A7XX) {
         fd6_event_write<CHIP>(ctx, ring, FD_CCU_CLEAN_DEPTH);
      }
   } else {
      OUT_PKT(ring, CP_EVENT_WRITE7,
         CP_EVENT_WRITE7_0(
            .event = ZPASS_DONE,
            .write_sample_count = true,
         ),
         EV_DST_RAM_CP_EVENT_WRITE7_1(
            results_ptr(at, result[result->idx].samples_start)
         ),
      );
   }
}

template <chip CHIP>
static void
emit_common_fini(struct fd_batch *batch)
{
   struct fd_context *ctx = batch->ctx;
   struct fd_ringbuffer *ring = batch->gmem;
   struct fd_autotune *at = &batch->ctx->autotune;
   struct fd_batch_result *result = batch->autotune_result;

   fd6_emit_flushes<CHIP>(batch->ctx, ring, batch->barrier);

   if (!result)
      return;

   fd_ringbuffer_attach_bo(ring, at->results_mem);

   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1);
   OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY);

   if (!ctx->screen->info->a7xx.has_event_write_sample_count) {
      OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
      OUT_RELOC(ring, results_ptr(at, result[result->idx].samples_end));

      fd6_event_write<CHIP>(batch->ctx, ring, FD_ZPASS_DONE);
   } else {
      OUT_PKT(ring, CP_EVENT_WRITE7,
         CP_EVENT_WRITE7_0(
            .event = ZPASS_DONE,
            .write_sample_count = true,
            .sample_count_end_offset = true,
            .write_accum_sample_count_diff = true,
         ),
         EV_DST_RAM_CP_EVENT_WRITE7_1(
            results_ptr(at, result[result->idx].samples_start)
         ),
      );
   }

   fd6_fence_write<CHIP>(ring, result->fence, results_ptr(at, fence));
}

/*
 * Emit conditional CP_INDIRECT_BRANCH based on VSC_STATE[p], ie. the IB
 * is skipped for tiles that have no visible geometry.
 *
 * If we aren't using binning pass, this just emits a normal IB.
 */
static void
emit_conditional_ib(struct fd_batch *batch, const struct fd_tile *tile,
                    struct fd_ringbuffer *target)
{
   struct fd_ringbuffer *ring = batch->gmem;

   /* If we have fast clear, that won't count in the VSC state, so it
    * forces an unconditional IB (because we know there is something
    * to do for this tile)
    */
   if (batch->cleared || !use_hw_binning(batch)) {
      fd6_emit_ib(batch->gmem, target);
      return;
   }

   if (target->cur == target->start)
      return;

   emit_marker6(ring, 6);

   unsigned count = fd_ringbuffer_cmd_count(target);

   BEGIN_RING(ring, 5 + 4 * count); /* ensure conditional doesn't get split */

   OUT_PKT7(ring, CP_REG_TEST, 1);
   OUT_RING(ring, A6XX_CP_REG_TEST_0_REG(REG_A6XX_VSC_STATE_REG(tile->p)) |
                     A6XX_CP_REG_TEST_0_BIT(tile->n) |
                     A6XX_CP_REG_TEST_0_SKIP_WAIT_FOR_ME);

   OUT_PKT7(ring, CP_COND_REG_EXEC, 2);
   OUT_RING(ring, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
   OUT_RING(ring, PRED_TEST_CP_COND_REG_EXEC_1_DWORDS(4 * count));

   for (unsigned i = 0; i < count; i++) {
      uint32_t dwords;
      OUT_PKT7(ring, CP_INDIRECT_BUFFER, 3);
      dwords = fd_ringbuffer_emit_reloc_ring_full(ring, target, i) / 4;
      assert(dwords > 0);
      OUT_RING(ring, dwords);
   }

   emit_marker6(ring, 6);
}

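/* Note: the BR coordinates are inclusive, so callers pass width/height
 * minus one (see emit_binning_pass() and fd6_emit_tile_prep()):
 */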
static void
set_scissor(struct fd_ringbuffer *ring, uint32_t x1, uint32_t y1, uint32_t x2,
            uint32_t y2)
{
   OUT_REG(ring, A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1),
           A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2));

   OUT_REG(ring, A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = x1, .y = y1),
           A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = x2, .y = y2));
}

template <chip CHIP>
static void
set_tessfactor_bo(struct fd_ringbuffer *ring, struct fd_batch *batch)
{
   /* This happens after all drawing has been emitted to the draw CS, so we know
    * whether we need the tess BO pointers.
    */
   if (!batch->tessellation)
      return;

   struct fd_screen *screen = batch->ctx->screen;

   assert(screen->tess_bo);
   fd_ringbuffer_attach_bo(ring, screen->tess_bo);
   OUT_REG(ring, PC_TESSFACTOR_ADDR(CHIP, screen->tess_bo));
   /* Updating PC_TESSFACTOR_ADDR could race with the next draw which uses it. */
   OUT_WFI5(ring);
}

struct bin_size_params {
   enum a6xx_render_mode render_mode;
   bool force_lrz_write_dis;
   enum a6xx_buffers_location buffers_location;
   enum a6xx_lrz_feedback_mask lrz_feedback_zmode_mask;
};

template <chip CHIP>
static void
set_bin_size(struct fd_ringbuffer *ring, const struct fd_gmem_stateobj *gmem,
             struct bin_size_params p)
{
   unsigned w = gmem ? gmem->bin_w : 0;
   unsigned h = gmem ? gmem->bin_h : 0;

   if (CHIP == A6XX) {
      OUT_REG(ring, A6XX_GRAS_BIN_CONTROL(
            .binw = w, .binh = h,
            .render_mode = p.render_mode,
            .force_lrz_write_dis = p.force_lrz_write_dis,
            .buffers_location = p.buffers_location,
            .lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask,
      ));
   } else {
      OUT_REG(ring, A6XX_GRAS_BIN_CONTROL(
            .binw = w, .binh = h,
            .render_mode = p.render_mode,
            .force_lrz_write_dis = p.force_lrz_write_dis,
            .lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask,
      ));
   }
   OUT_REG(ring, RB_BIN_CONTROL(
         CHIP,
         .binw = w, .binh = h,
         .render_mode = p.render_mode,
         .force_lrz_write_dis = p.force_lrz_write_dis,
         .buffers_location = p.buffers_location,
         .lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask,
   ));
   /* no flag for RB_BIN_CONTROL2... */
   OUT_REG(ring, A6XX_RB_BIN_CONTROL2(.binw = w, .binh = h));
}

template <chip CHIP>
static void
emit_binning_pass(struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->gmem;
   const struct fd_gmem_stateobj *gmem = batch->gmem_state;
   struct fd_screen *screen = batch->ctx->screen;

   assert(!batch->tessellation);

   set_scissor(ring, 0, 0, gmem->width - 1, gmem->height - 1);

   emit_marker6(ring, 7);
   OUT_PKT7(ring, CP_SET_MARKER, 1);
   OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_VISIBILITY));
   emit_marker6(ring, 7);

   OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
   OUT_RING(ring, 0x1);

   OUT_PKT7(ring, CP_SET_MODE, 1);
   OUT_RING(ring, 0x1);

   OUT_WFI5(ring);

   OUT_REG(ring, A6XX_VFD_MODE_CNTL(.render_mode = BINNING_PASS));

   update_vsc_pipe(batch);

   if (CHIP == A6XX) {
      OUT_REG(ring, A6XX_PC_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
      OUT_REG(ring, A6XX_VFD_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
   }

   OUT_PKT7(ring, CP_EVENT_WRITE, 1);
   OUT_RING(ring, UNK_2C);

   OUT_PKT4(ring, REG_A6XX_RB_WINDOW_OFFSET, 1);
   OUT_RING(ring, A6XX_RB_WINDOW_OFFSET_X(0) | A6XX_RB_WINDOW_OFFSET_Y(0));

   OUT_PKT4(ring, REG_A6XX_SP_TP_WINDOW_OFFSET, 1);
   OUT_RING(ring,
            A6XX_SP_TP_WINDOW_OFFSET_X(0) | A6XX_SP_TP_WINDOW_OFFSET_Y(0));

   /* emit IB to binning drawcmds: */
   trace_start_binning_ib(&batch->trace, ring);
   foreach_subpass (subpass, batch) {
      emit_lrz<CHIP>(batch, subpass);
      fd6_emit_ib(ring, subpass->draw);
   }
   trace_end_binning_ib(&batch->trace, ring);

   OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
   OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
                     CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
                     CP_SET_DRAW_STATE__0_GROUP_ID(0));
   OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
   OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));

   OUT_PKT7(ring, CP_EVENT_WRITE, 1);
   OUT_RING(ring, UNK_2D);

   /* This flush is probably required because the VSC, which produces the
    * visibility stream, is a client of UCHE, whereas the CP needs to read
    * the visibility stream (without caching) to do draw skipping. The
    * WFI+WAIT_FOR_ME combination guarantees that the binning commands
    * submitted are finished before reading the VSC regs (in
    * emit_vsc_overflow_test) or the VSC_DATA buffer directly (implicitly
    * as part of draws).
    */
   fd6_emit_flushes<CHIP>(batch->ctx, ring,
                          FD6_FLUSH_CACHE |
                          FD6_WAIT_FOR_IDLE |
                          FD6_WAIT_FOR_ME);

   trace_start_vsc_overflow_test(&batch->trace, batch->gmem);
   emit_vsc_overflow_test(batch);
   trace_end_vsc_overflow_test(&batch->trace, batch->gmem);

   OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
   OUT_RING(ring, 0x0);

   OUT_PKT7(ring, CP_SET_MODE, 1);
   OUT_RING(ring, 0x0);

   fd6_emit_ccu_cntl<CHIP>(ring, screen, true);
}

static void
emit_msaa(struct fd_ringbuffer *ring, unsigned nr)
{
   enum a3xx_msaa_samples samples = fd_msaa_samples(nr);

   OUT_PKT4(ring, REG_A6XX_SP_TP_RAS_MSAA_CNTL, 2);
   OUT_RING(ring, A6XX_SP_TP_RAS_MSAA_CNTL_SAMPLES(samples));
   OUT_RING(ring, A6XX_SP_TP_DEST_MSAA_CNTL_SAMPLES(samples) |
                     COND(samples == MSAA_ONE,
                          A6XX_SP_TP_DEST_MSAA_CNTL_MSAA_DISABLE));

   OUT_PKT4(ring, REG_A6XX_GRAS_RAS_MSAA_CNTL, 2);
   OUT_RING(ring, A6XX_GRAS_RAS_MSAA_CNTL_SAMPLES(samples));
   OUT_RING(ring, A6XX_GRAS_DEST_MSAA_CNTL_SAMPLES(samples) |
                     COND(samples == MSAA_ONE,
                          A6XX_GRAS_DEST_MSAA_CNTL_MSAA_DISABLE));

   OUT_PKT4(ring, REG_A6XX_RB_RAS_MSAA_CNTL, 2);
   OUT_RING(ring, A6XX_RB_RAS_MSAA_CNTL_SAMPLES(samples));
   OUT_RING(ring,
            A6XX_RB_DEST_MSAA_CNTL_SAMPLES(samples) |
               COND(samples == MSAA_ONE, A6XX_RB_DEST_MSAA_CNTL_MSAA_DISABLE));

   OUT_PKT4(ring, REG_A6XX_RB_BLIT_GMEM_MSAA_CNTL, 1);
   OUT_RING(ring, A6XX_RB_BLIT_GMEM_MSAA_CNTL_SAMPLES(samples));
}

template <chip CHIP>
static void prepare_tile_setup(struct fd_batch *batch);
template <chip CHIP>
static void prepare_tile_fini(struct fd_batch *batch);

static void
fd7_emit_static_binning_regs(struct fd_ringbuffer *ring)
{
   OUT_REG(ring, A7XX_RB_UNKNOWN_8812(0x0));
   OUT_REG(ring, A7XX_RB_UNKNOWN_8E06(0x0));
   OUT_REG(ring, A7XX_GRAS_UNKNOWN_8007(0x0));
   OUT_REG(ring, A6XX_GRAS_UNKNOWN_8110(0x2));
   OUT_REG(ring, A7XX_RB_UNKNOWN_8E09(0x4));
   OUT_REG(ring, A7XX_RB_BLIT_CLEAR_MODE(.clear_mode = CLEAR_MODE_GMEM));
}

template <chip CHIP>
struct fd_ringbuffer *
fd6_build_preemption_preamble(struct fd_context *ctx)
{
   struct fd_screen *screen = ctx->screen;
   struct fd_ringbuffer *ring;

   ring = fd_ringbuffer_new_object(ctx->pipe, 0x1000);
   fd6_emit_static_regs<CHIP>(ctx, ring);
   fd6_emit_ccu_cntl<CHIP>(ring, screen, false);

   if (CHIP == A6XX) {
      OUT_REG(ring, A6XX_PC_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
      OUT_REG(ring, A6XX_VFD_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
   } else if (CHIP >= A7XX) {
      fd7_emit_static_binning_regs(ring);
   }

   /* TODO use CP_MEM_TO_SCRATCH_MEM on a7xx. The VSC scratch mem should be
    * automatically saved, unlike GPU registers, so we wouldn't have to
    * manually restore this state.
    */
   OUT_PKT7(ring, CP_MEM_TO_REG, 3);
   OUT_RING(ring, CP_MEM_TO_REG_0_REG(REG_A6XX_VSC_STATE(0)) |
                  CP_MEM_TO_REG_0_CNT(32));
   OUT_RELOC(ring, control_ptr(fd6_context(ctx), vsc_state));

   return ring;
}
FD_GENX(fd6_build_preemption_preamble);
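/* FD_GENX() instantiates the template above for each supported
 * generation (A6XX and A7XX), so the CHIP comparisons are resolved at
 * compile time.
 */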

/* before first tile */
template <chip CHIP>
static void
fd6_emit_tile_init(struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->gmem;
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;
   const struct fd_gmem_stateobj *gmem = batch->gmem_state;
   struct fd_screen *screen = batch->ctx->screen;

   emit_lrz_clears<CHIP>(batch);

   fd6_emit_restore<CHIP>(batch, ring);

   fd6_event_write<CHIP>(batch->ctx, ring, FD_LRZ_FLUSH);

   if (batch->prologue) {
      trace_start_prologue(&batch->trace, ring);
      fd6_emit_ib(ring, batch->prologue);
      trace_end_prologue(&batch->trace, ring);
   }

   fd6_cache_inv<CHIP>(batch->ctx, ring);

   prepare_tile_setup<CHIP>(batch);
   prepare_tile_fini<CHIP>(batch);

   OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   OUT_RING(ring, 0x0);

   /* blob controls "local" in IB2, but I think that is not required */
   OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_LOCAL, 1);
   OUT_RING(ring, 0x1);

   fd6_emit_ccu_cntl<CHIP>(ring, screen, true);

   emit_zs<CHIP>(batch->ctx, ring, pfb->zsbuf, batch->gmem_state);
   emit_mrt<CHIP>(ring, pfb, batch->gmem_state);
   emit_msaa(ring, pfb->samples);
   patch_fb_read_gmem(batch);

   if (CHIP >= A7XX)
      fd7_emit_static_binning_regs(ring);

   if (use_hw_binning(batch)) {
      /* enable stream-out during binning pass: */
      OUT_REG(ring, A6XX_VPC_SO_DISABLE(false));

      set_bin_size<CHIP>(ring, gmem, {
            .render_mode = BINNING_PASS,
            .buffers_location = BUFFERS_IN_GMEM,
            .lrz_feedback_zmode_mask = LRZ_FEEDBACK_NONE,
      });
      update_render_cntl<CHIP>(batch, pfb, true);
      emit_binning_pass<CHIP>(batch);

      /* and disable stream-out for draw pass: */
      OUT_REG(ring, A6XX_VPC_SO_DISABLE(true));

      /*
       * NOTE: even if we detect VSC overflow and disable use of
       * visibility stream in draw pass, it is still safe to execute
       * the rest of these cmds:
       */

      set_bin_size<CHIP>(ring, gmem, {
            .render_mode = RENDERING_PASS,
            .force_lrz_write_dis = !screen->info->a6xx.has_lrz_feedback,
            .buffers_location = BUFFERS_IN_GMEM,
            .lrz_feedback_zmode_mask = screen->info->a6xx.has_lrz_feedback
                                          ? LRZ_FEEDBACK_EARLY_LRZ_LATE_Z
                                          : LRZ_FEEDBACK_NONE,
      });

      OUT_REG(ring, A6XX_VFD_MODE_CNTL(RENDERING_PASS));

      if (CHIP == A6XX) {
         OUT_REG(ring, A6XX_PC_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
         OUT_REG(ring, A6XX_VFD_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
      }

      OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
      OUT_RING(ring, 0x1);

      /* Upload state regs to memory to be restored on skipsaverestore
       * preemption.
       */
      OUT_PKT7(ring, CP_REG_TO_MEM, 3);
      OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A6XX_VSC_STATE_REG(0)) |
                     CP_REG_TO_MEM_0_CNT(32));
      OUT_RELOC(ring, control_ptr(fd6_context(batch->ctx), vsc_state));
   } else {
      /* no binning pass, so enable stream-out for draw pass: */
      OUT_REG(ring, A6XX_VPC_SO_DISABLE(false));

      set_bin_size<CHIP>(ring, gmem, {
            .render_mode = RENDERING_PASS,
            .force_lrz_write_dis = !screen->info->a6xx.has_lrz_feedback,
            .buffers_location = BUFFERS_IN_GMEM,
            .lrz_feedback_zmode_mask =
               screen->info->a6xx.has_lrz_feedback
                  ? LRZ_FEEDBACK_EARLY_Z_OR_EARLY_LRZ_LATE_Z
                  : LRZ_FEEDBACK_NONE,
      });
   }

   update_render_cntl<CHIP>(batch, pfb, false);

   emit_common_init<CHIP>(batch);
}

template <chip CHIP>
static void
set_window_offset(struct fd_ringbuffer *ring, uint32_t x1, uint32_t y1)
{
   OUT_PKT4(ring, REG_A6XX_RB_WINDOW_OFFSET, 1);
   OUT_RING(ring, A6XX_RB_WINDOW_OFFSET_X(x1) | A6XX_RB_WINDOW_OFFSET_Y(y1));

   OUT_PKT4(ring, REG_A6XX_RB_WINDOW_OFFSET2, 1);
   OUT_RING(ring, A6XX_RB_WINDOW_OFFSET2_X(x1) | A6XX_RB_WINDOW_OFFSET2_Y(y1));

   OUT_REG(ring, SP_WINDOW_OFFSET(CHIP, .x = x1, .y = y1));

   OUT_PKT4(ring, REG_A6XX_SP_TP_WINDOW_OFFSET, 1);
   OUT_RING(ring,
            A6XX_SP_TP_WINDOW_OFFSET_X(x1) | A6XX_SP_TP_WINDOW_OFFSET_Y(y1));
}

/* before mem2gmem */
template <chip CHIP>
static void
fd6_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile)
{
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;
   struct fd_screen *screen = batch->ctx->screen;
   struct fd_context *ctx = batch->ctx;
   const struct fd_gmem_stateobj *gmem = batch->gmem_state;
   struct fd6_context *fd6_ctx = fd6_context(ctx);
   struct fd_ringbuffer *ring = batch->gmem;

   emit_marker6(ring, 7);
   OUT_PKT7(ring, CP_SET_MARKER, 1);
   OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RENDER_START) |
                  A6XX_CP_SET_MARKER_0_USES_GMEM);
   emit_marker6(ring, 7);

   uint32_t x1 = tile->xoff;
   uint32_t y1 = tile->yoff;
   uint32_t x2 = tile->xoff + tile->bin_w - 1;
   uint32_t y2 = tile->yoff + tile->bin_h - 1;

   set_scissor(ring, x1, y1, x2, y2);
   set_tessfactor_bo<CHIP>(ring, batch);

   fd6_emit_ccu_cntl<CHIP>(ring, screen, true);

   emit_zs<CHIP>(batch->ctx, ring, pfb->zsbuf, batch->gmem_state);
   emit_mrt<CHIP>(ring, pfb, batch->gmem_state);
   emit_msaa(ring, pfb->samples);

   if (use_hw_binning(batch)) {
      const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[tile->p];
      unsigned num_vsc_pipes = ctx->screen->info->num_vsc_pipes;

      OUT_PKT7(ring, CP_WAIT_FOR_ME, 0);

      OUT_PKT7(ring, CP_SET_MODE, 1);
      OUT_RING(ring, 0x0);

      OUT_PKT7(ring, CP_SET_BIN_DATA5, 7);
      OUT_RING(ring, CP_SET_BIN_DATA5_0_VSC_SIZE(pipe->w * pipe->h) |
                        CP_SET_BIN_DATA5_0_VSC_N(tile->n));
      OUT_RELOC(ring, fd6_ctx->vsc_draw_strm, /* per-pipe draw-stream address */
                (tile->p * fd6_ctx->vsc_draw_strm_pitch), 0, 0);
      OUT_RELOC(
         ring, fd6_ctx->vsc_draw_strm, /* VSC_DRAW_STRM_ADDRESS + (p * 4) */
         (tile->p * 4) + (num_vsc_pipes * fd6_ctx->vsc_draw_strm_pitch),
         0, 0);
      OUT_RELOC(ring, fd6_ctx->vsc_prim_strm,
                (tile->p * fd6_ctx->vsc_prim_strm_pitch), 0, 0);

      OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
      OUT_RING(ring, 0x0);

      /* and disable stream-out for draw pass: */
      OUT_REG(ring, A6XX_VPC_SO_DISABLE(true));

      /*
       * NOTE: even if we detect VSC overflow and disable use of
       * visibility stream in draw pass, it is still safe to execute
       * the rest of these cmds:
       */

      set_bin_size<CHIP>(ring, gmem, {
            .render_mode = RENDERING_PASS,
            .force_lrz_write_dis = !screen->info->a6xx.has_lrz_feedback,
            .buffers_location = BUFFERS_IN_GMEM,
            .lrz_feedback_zmode_mask = screen->info->a6xx.has_lrz_feedback
                                          ? LRZ_FEEDBACK_EARLY_LRZ_LATE_Z
                                          : LRZ_FEEDBACK_NONE,
      });

      OUT_REG(ring, A6XX_VFD_MODE_CNTL(RENDERING_PASS));

      if (CHIP == A6XX) {
         OUT_REG(ring, A6XX_PC_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
         OUT_REG(ring, A6XX_VFD_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
      }

      OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
      OUT_RING(ring, 0x1);

   } else {
      OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
      OUT_RING(ring, 0x1);

      /* no binning pass, so enable stream-out for draw pass: */
      OUT_REG(ring, A6XX_VPC_SO_DISABLE(false));

      set_bin_size<CHIP>(ring, gmem, {
            .render_mode = RENDERING_PASS,
            .force_lrz_write_dis = !screen->info->a6xx.has_lrz_feedback,
            .buffers_location = BUFFERS_IN_GMEM,
            .lrz_feedback_zmode_mask =
               screen->info->a6xx.has_lrz_feedback
                  ? LRZ_FEEDBACK_EARLY_Z_OR_EARLY_LRZ_LATE_Z
                  : LRZ_FEEDBACK_NONE,
      });
   }

   set_window_offset<CHIP>(ring, x1, y1);

   set_bin_size<CHIP>(ring, gmem, {
         .render_mode = RENDERING_PASS,
         .force_lrz_write_dis = !ctx->screen->info->a6xx.has_lrz_feedback,
         .buffers_location = BUFFERS_IN_GMEM,
         .lrz_feedback_zmode_mask = ctx->screen->info->a6xx.has_lrz_feedback
                                       ? LRZ_FEEDBACK_EARLY_LRZ_LATE_Z
                                       : LRZ_FEEDBACK_NONE,
   });

   OUT_PKT7(ring, CP_SET_MODE, 1);
   OUT_RING(ring, 0x0);
}

static void
set_blit_scissor(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
   const struct pipe_framebuffer_state *pfb = &batch->framebuffer;

   struct pipe_scissor_state blit_scissor;

   blit_scissor.minx = 0;
   blit_scissor.miny = 0;
   blit_scissor.maxx = ALIGN(pfb->width, 16);
   blit_scissor.maxy = ALIGN(pfb->height, 4);

   OUT_PKT4(ring, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
   OUT_RING(ring, A6XX_RB_BLIT_SCISSOR_TL_X(blit_scissor.minx) |
                     A6XX_RB_BLIT_SCISSOR_TL_Y(blit_scissor.miny));
   OUT_RING(ring, A6XX_RB_BLIT_SCISSOR_BR_X(blit_scissor.maxx - 1) |
                     A6XX_RB_BLIT_SCISSOR_BR_Y(blit_scissor.maxy - 1));
}

template <chip CHIP>
static void
emit_blit(struct fd_batch *batch, struct fd_ringbuffer *ring, uint32_t base,
          struct pipe_surface *psurf, bool stencil)
{
   struct fd_resource *rsc = fd_resource(psurf->texture);
   enum pipe_format pfmt = psurf->format;
   uint32_t offset;
   bool ubwc_enabled;

   assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);

   /* separate stencil case: */
   if (stencil) {
      rsc = rsc->stencil;
      pfmt = rsc->b.b.format;
   }

   offset =
      fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer);
   ubwc_enabled = fd_resource_ubwc_enabled(rsc, psurf->u.tex.level);

   assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);

   enum a6xx_tile_mode tile_mode = (enum a6xx_tile_mode)
         fd_resource_tile_mode(&rsc->b.b, psurf->u.tex.level);
   enum a6xx_format format = fd6_color_format(pfmt, tile_mode);
   uint32_t stride = fd_resource_pitch(rsc, psurf->u.tex.level);
   uint32_t array_stride = fd_resource_layer_stride(rsc, psurf->u.tex.level);
   enum a3xx_color_swap swap =
         fd6_color_swap(pfmt, (enum a6xx_tile_mode)rsc->layout.tile_mode,
                        false);
   enum a3xx_msaa_samples samples = fd_msaa_samples(rsc->b.b.nr_samples);

   OUT_REG(ring,
           A6XX_RB_BLIT_DST_INFO(
                 .tile_mode = tile_mode,
                 .flags = ubwc_enabled,
                 .samples = samples,
                 .color_swap = swap,
                 .color_format = format,
           ),
           A6XX_RB_BLIT_DST(.bo = rsc->bo, .bo_offset = offset),
           A6XX_RB_BLIT_DST_PITCH(stride),
           A6XX_RB_BLIT_DST_ARRAY_PITCH(array_stride));

   OUT_REG(ring, A6XX_RB_BLIT_BASE_GMEM(.dword = base));

   if (ubwc_enabled) {
      OUT_PKT4(ring, REG_A6XX_RB_BLIT_FLAG_DST, 3);
      fd6_emit_flag_reference(ring, rsc, psurf->u.tex.level,
                              psurf->u.tex.first_layer);
   }

   if (CHIP >= A7XX)
      OUT_REG(ring, A7XX_RB_BLIT_CLEAR_MODE(.clear_mode = CLEAR_MODE_GMEM));

   fd6_emit_blit<CHIP>(batch->ctx, ring);
}

template <chip CHIP>
static void
emit_restore_blit(struct fd_batch *batch, struct fd_ringbuffer *ring,
                  uint32_t base, struct pipe_surface *psurf, unsigned buffer)
{
   bool stencil = (buffer == FD_BUFFER_STENCIL);

   OUT_REG(ring,
           A6XX_RB_BLIT_INFO(
                 .type = BLIT_EVENT_LOAD,
                 .sample_0 = util_format_is_pure_integer(psurf->format),
                 .depth = (buffer == FD_BUFFER_DEPTH),
           ),
   );

   emit_blit<CHIP>(batch, ring, base, psurf, stencil);
}
1451 
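/**
 * Emits the fast-clear blit events for a subpass, clearing the color,
 * depth, and (possibly separate) stencil attachments directly in GMEM.
 */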
template <chip CHIP>
static void
emit_subpass_clears(struct fd_batch *batch, struct fd_batch_subpass *subpass)
{
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;
   const struct fd_gmem_stateobj *gmem = batch->gmem_state;
   struct fd_ringbuffer *ring = subpass->subpass_clears;
   enum a3xx_msaa_samples samples = fd_msaa_samples(pfb->samples);

   uint32_t buffers = subpass->fast_cleared;

   if (buffers & PIPE_CLEAR_COLOR) {

      for (int i = 0; i < pfb->nr_cbufs; i++) {
         union pipe_color_union *color = &subpass->clear_color[i];
         union util_color uc = {0};

         if (!pfb->cbufs[i])
            continue;

         if (!(buffers & (PIPE_CLEAR_COLOR0 << i)))
            continue;

         enum pipe_format pfmt = pfb->cbufs[i]->format;

         // XXX I think RB_CLEAR_COLOR_DWn wants to take into account SWAP??
         union pipe_color_union swapped;
         switch (fd6_color_swap(pfmt, TILE6_LINEAR, false)) {
         case WZYX:
            swapped.ui[0] = color->ui[0];
            swapped.ui[1] = color->ui[1];
            swapped.ui[2] = color->ui[2];
            swapped.ui[3] = color->ui[3];
            break;
         case WXYZ:
            swapped.ui[2] = color->ui[0];
            swapped.ui[1] = color->ui[1];
            swapped.ui[0] = color->ui[2];
            swapped.ui[3] = color->ui[3];
            break;
         case ZYXW:
            swapped.ui[3] = color->ui[0];
            swapped.ui[0] = color->ui[1];
            swapped.ui[1] = color->ui[2];
            swapped.ui[2] = color->ui[3];
            break;
         case XYZW:
            swapped.ui[3] = color->ui[0];
            swapped.ui[2] = color->ui[1];
            swapped.ui[1] = color->ui[2];
            swapped.ui[0] = color->ui[3];
            break;
         }

         util_pack_color_union(pfmt, &uc, &swapped);

         OUT_PKT4(ring, REG_A6XX_RB_BLIT_DST_INFO, 1);
         OUT_RING(ring,
                  A6XX_RB_BLIT_DST_INFO_TILE_MODE(TILE6_LINEAR) |
                     A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) |
                     A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(fd6_color_format(pfmt, TILE6_LINEAR)));

         OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1);
         OUT_RING(ring, A6XX_RB_BLIT_INFO_TYPE(BLIT_EVENT_CLEAR) |
                           A6XX_RB_BLIT_INFO_CLEAR_MASK(0xf));

         OUT_PKT4(ring, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
         OUT_RING(ring, gmem->cbuf_base[i]);

         OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_88D0, 1);
         OUT_RING(ring, 0);

         OUT_PKT4(ring, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
         OUT_RING(ring, uc.ui[0]);
         OUT_RING(ring, uc.ui[1]);
         OUT_RING(ring, uc.ui[2]);
         OUT_RING(ring, uc.ui[3]);

         if (CHIP >= A7XX)
            OUT_REG(ring, A7XX_RB_BLIT_CLEAR_MODE(.clear_mode = CLEAR_MODE_GMEM));

         fd6_emit_blit<CHIP>(batch->ctx, ring);
      }
   }

   const bool has_depth = pfb->zsbuf;
   const bool has_separate_stencil =
      has_depth && fd_resource(pfb->zsbuf->texture)->stencil;

   /* First clear depth or combined depth/stencil. */
   if ((has_depth && (buffers & PIPE_CLEAR_DEPTH)) ||
       (!has_separate_stencil && (buffers & PIPE_CLEAR_STENCIL))) {
      enum pipe_format pfmt = pfb->zsbuf->format;
      uint32_t clear_value;
      uint32_t mask = 0;

      if (has_separate_stencil) {
         pfmt = util_format_get_depth_only(pfb->zsbuf->format);
         clear_value = util_pack_z(pfmt, subpass->clear_depth);
      } else {
         pfmt = pfb->zsbuf->format;
         clear_value =
            util_pack_z_stencil(pfmt, subpass->clear_depth, subpass->clear_stencil);
      }

      if (buffers & PIPE_CLEAR_DEPTH)
         mask |= 0x1;

      if (!has_separate_stencil && (buffers & PIPE_CLEAR_STENCIL))
         mask |= 0x2;

      OUT_PKT4(ring, REG_A6XX_RB_BLIT_DST_INFO, 1);
      OUT_RING(ring,
               A6XX_RB_BLIT_DST_INFO_TILE_MODE(TILE6_LINEAR) |
                  A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) |
                  A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(fd6_color_format(pfmt, TILE6_LINEAR)));

      OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1);
      OUT_RING(ring, A6XX_RB_BLIT_INFO_TYPE(BLIT_EVENT_CLEAR) |
                        A6XX_RB_BLIT_INFO_DEPTH |
                        A6XX_RB_BLIT_INFO_CLEAR_MASK(mask));

      OUT_PKT4(ring, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
      OUT_RING(ring, gmem->zsbuf_base[0]);

      OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_88D0, 1);
      OUT_RING(ring, 0);

      OUT_PKT4(ring, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 1);
      OUT_RING(ring, clear_value);

      fd6_emit_blit<CHIP>(batch->ctx, ring);
   }

   /* Then clear the separate stencil buffer, for 32-bit depth formats
    * with separate stencil.
    */
   if (has_separate_stencil && (buffers & PIPE_CLEAR_STENCIL)) {
      OUT_PKT4(ring, REG_A6XX_RB_BLIT_DST_INFO, 1);
      OUT_RING(ring, A6XX_RB_BLIT_DST_INFO_TILE_MODE(TILE6_LINEAR) |
                        A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) |
                        A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(FMT6_8_UINT));

      OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1);
      OUT_RING(ring, A6XX_RB_BLIT_INFO_TYPE(BLIT_EVENT_CLEAR) |
                        A6XX_RB_BLIT_INFO_DEPTH |
                        A6XX_RB_BLIT_INFO_CLEAR_MASK(0x1));

      OUT_PKT4(ring, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
      OUT_RING(ring, gmem->zsbuf_base[1]);

      OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_88D0, 1);
      OUT_RING(ring, 0);

      OUT_PKT4(ring, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 1);
      OUT_RING(ring, subpass->clear_stencil & 0xff);

      fd6_emit_blit<CHIP>(batch->ctx, ring);
   }
}

/*
 * transfer from system memory to gmem
 */
template <chip CHIP>
static void
emit_restore_blits(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
   const struct fd_gmem_stateobj *gmem = batch->gmem_state;
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;

   if (batch->restore & FD_BUFFER_COLOR) {
      unsigned i;
      for (i = 0; i < pfb->nr_cbufs; i++) {
         if (!pfb->cbufs[i])
            continue;
         if (!(batch->restore & (PIPE_CLEAR_COLOR0 << i)))
            continue;
         emit_restore_blit<CHIP>(batch, ring, gmem->cbuf_base[i], pfb->cbufs[i],
                                 FD_BUFFER_COLOR);
      }
   }

   if (batch->restore & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) {
      struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);

      if (!rsc->stencil || (batch->restore & FD_BUFFER_DEPTH)) {
         emit_restore_blit<CHIP>(batch, ring, gmem->zsbuf_base[0], pfb->zsbuf,
                                 FD_BUFFER_DEPTH);
      }
      if (rsc->stencil && (batch->restore & FD_BUFFER_STENCIL)) {
         emit_restore_blit<CHIP>(batch, ring, gmem->zsbuf_base[1], pfb->zsbuf,
                                 FD_BUFFER_STENCIL);
      }
   }
}

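/**
 * Builds the per-batch "tile loads" and per-subpass "subpass clears"
 * command streams up front; they are replayed for each tile via
 * conditional IBs (see fd6_emit_tile_renderprep() and fd6_emit_tile()).
 */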
template <chip CHIP>
static void
prepare_tile_setup(struct fd_batch *batch)
{
   if (batch->restore) {
      batch->tile_loads =
         fd_submit_new_ringbuffer(batch->submit, 0x1000, FD_RINGBUFFER_STREAMING);

      set_blit_scissor(batch, batch->tile_loads);
      emit_restore_blits<CHIP>(batch, batch->tile_loads);
   }

   foreach_subpass (subpass, batch) {
      if (!subpass->fast_cleared)
         continue;

      subpass->subpass_clears =
         fd_submit_new_ringbuffer(batch->submit, 0x1000, FD_RINGBUFFER_STREAMING);

      set_blit_scissor(batch, subpass->subpass_clears);
      emit_subpass_clears<CHIP>(batch, subpass);
   }
}

/*
 * transfer from system memory to gmem
 */
static void
fd6_emit_tile_mem2gmem(struct fd_batch *batch, const struct fd_tile *tile)
{
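   /* Nothing to do here: the restore blits are built up front in
    * prepare_tile_setup() and emitted per-tile from
    * fd6_emit_tile_renderprep().
    */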
}

/* before IB to rendering cmds: */
static void
fd6_emit_tile_renderprep(struct fd_batch *batch, const struct fd_tile *tile)
{
   if (batch->tile_loads) {
      trace_start_tile_loads(&batch->trace, batch->gmem, batch->restore);
      emit_conditional_ib(batch, tile, batch->tile_loads);
      trace_end_tile_loads(&batch->trace, batch->gmem);
   }
}

static bool
blit_can_resolve(enum pipe_format format)
{
   const struct util_format_description *desc = util_format_description(format);

   /* blit event can only do resolve for simple cases:
    * averaging samples as unsigned integers or choosing only one sample
    */
   if (util_format_is_snorm(format) || util_format_is_srgb(format))
      return false;

   /* can't do formats with larger channel sizes
    * note: this includes all float formats
    * note2: single channel integer formats seem OK
    */
   if (desc->channel[0].size > 10)
      return false;

   switch (format) {
   /* for unknown reasons blit event can't msaa resolve these formats when tiled
    * likely related to these formats having different layout from other cpp=2
    * formats
    */
   case PIPE_FORMAT_R8G8_UNORM:
   case PIPE_FORMAT_R8G8_UINT:
   case PIPE_FORMAT_R8G8_SINT:
   case PIPE_FORMAT_R8G8_SRGB:
   /* TODO: this one should be able to work? */
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      return false;
   default:
      break;
   }

   return true;
}

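/* A surface needs an MSAA resolve when its sample count differs from that
 * of its backing texture, ie. multisampled rendering with a single-sampled
 * resolve destination.
 */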
static bool
needs_resolve(struct pipe_surface *psurf)
{
   return psurf->nr_samples &&
          (psurf->nr_samples != psurf->texture->nr_samples);
}

/**
 * Returns the UNKNOWN_8C01 value for handling partial depth/stencil
 * clear/stores to Z24S8.
 */
static uint32_t
fd6_unknown_8c01(enum pipe_format format, unsigned buffers)
{
   buffers &= FD_BUFFER_DEPTH | FD_BUFFER_STENCIL;
   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
      if (buffers == FD_BUFFER_DEPTH)
         return 0x08000041;
      else if (buffers == FD_BUFFER_STENCIL)
         return 0x00084001;
   }
   return 0;
}

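/**
 * Emits the per-tile store (gmem2mem) for one surface: a BLIT_EVENT_STORE
 * where possible, falling back to per-tile CP_BLIT (r2d) via
 * fd6_resolve_tile() for MSAA resolves that the blit event cannot handle.
 */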
template <chip CHIP>
static void
emit_resolve_blit(struct fd_batch *batch, struct fd_ringbuffer *ring,
                  uint32_t base, struct pipe_surface *psurf,
                  unsigned buffer) assert_dt
{
   uint32_t info = 0;
   bool stencil = false;

   if (!fd_resource(psurf->texture)->valid)
      return;

   /* if we need to resolve, but cannot with BLIT event, we instead need
    * to generate per-tile CP_BLIT (r2d) commands:
    *
    * The separate-stencil is a special case, we might need to use CP_BLIT
    * for depth, but we can still resolve stencil with a BLIT event
    */
   if (needs_resolve(psurf) && !blit_can_resolve(psurf->format) &&
       (buffer != FD_BUFFER_STENCIL)) {
      /* We could potentially use fd6_unknown_8c01() to handle partial z/s
       * resolve to packed z/s, but we would need a corresponding ability in the
       * !resolve case below, so batch_draw_tracking_for_dirty_bits() has us
       * just do a restore of the other channel for partial packed z/s writes.
       */
      fd6_resolve_tile<CHIP>(batch, ring, base, psurf, 0);
      return;
   }

   switch (buffer) {
   case FD_BUFFER_COLOR:
      info = A6XX_RB_BLIT_INFO_TYPE(BLIT_EVENT_STORE);
      break;
   case FD_BUFFER_STENCIL:
      info = A6XX_RB_BLIT_INFO_TYPE(BLIT_EVENT_STORE_AND_CLEAR);
      stencil = true;
      break;
   case FD_BUFFER_DEPTH:
      info = A6XX_RB_BLIT_INFO_TYPE(BLIT_EVENT_STORE) | A6XX_RB_BLIT_INFO_DEPTH;
      break;
   }

   if (util_format_is_pure_integer(psurf->format) ||
       util_format_is_depth_or_stencil(psurf->format))
      info |= A6XX_RB_BLIT_INFO_SAMPLE_0;

   OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1);
   OUT_RING(ring, info);

   emit_blit<CHIP>(batch, ring, base, psurf, stencil);
}

/*
 * transfer from gmem to system memory (ie. normal RAM)
 */

template <chip CHIP>
static void
prepare_tile_fini(struct fd_batch *batch)
   assert_dt
{
   const struct fd_gmem_stateobj *gmem = batch->gmem_state;
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;
   struct fd_ringbuffer *ring;

   batch->tile_store =
      fd_submit_new_ringbuffer(batch->submit, 0x1000, FD_RINGBUFFER_STREAMING);
   ring = batch->tile_store;

   set_blit_scissor(batch, ring);

   if (batch->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) {
      struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);

      if (!rsc->stencil || (batch->resolve & FD_BUFFER_DEPTH)) {
         emit_resolve_blit<CHIP>(batch, ring, gmem->zsbuf_base[0],
                                 pfb->zsbuf, FD_BUFFER_DEPTH);
      }
      if (rsc->stencil && (batch->resolve & FD_BUFFER_STENCIL)) {
         emit_resolve_blit<CHIP>(batch, ring, gmem->zsbuf_base[1],
                                 pfb->zsbuf, FD_BUFFER_STENCIL);
      }
   }

   if (batch->resolve & FD_BUFFER_COLOR) {
      unsigned i;
      for (i = 0; i < pfb->nr_cbufs; i++) {
         if (!pfb->cbufs[i])
            continue;
         if (!(batch->resolve & (PIPE_CLEAR_COLOR0 << i)))
            continue;
         emit_resolve_blit<CHIP>(batch, ring, gmem->cbuf_base[i],
                                 pfb->cbufs[i], FD_BUFFER_COLOR);
      }
   }
}

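/* Per-tile: replay any fast-clears, emit LRZ state, and chain each
 * subpass's draw IB, followed by the tile epilogue if present.
 */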
template <chip CHIP>
static void
fd6_emit_tile(struct fd_batch *batch, const struct fd_tile *tile)
{
   foreach_subpass (subpass, batch) {
      if (subpass->subpass_clears) {
         trace_start_clears(&batch->trace, batch->gmem, subpass->fast_cleared);
         emit_conditional_ib(batch, tile, subpass->subpass_clears);
         trace_end_clears(&batch->trace, batch->gmem);
      }

      emit_lrz<CHIP>(batch, subpass);

      fd6_emit_ib(batch->gmem, subpass->draw);
   }

   if (batch->tile_epilogue)
      fd6_emit_ib(batch->gmem, batch->tile_epilogue);
}

static void
fd6_emit_tile_gmem2mem(struct fd_batch *batch, const struct fd_tile *tile)
{
   struct fd_ringbuffer *ring = batch->gmem;

   if (batch->epilogue)
      fd6_emit_ib(batch->gmem, batch->epilogue);

   if (use_hw_binning(batch)) {
      OUT_PKT7(ring, CP_SET_MARKER, 1);
      OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_END_OF_DRAWS) |
                     A6XX_CP_SET_MARKER_0_USES_GMEM);
   }

   OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
   OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
                     CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
                     CP_SET_DRAW_STATE__0_GROUP_ID(0));
   OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
   OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));

   OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_LOCAL, 1);
   OUT_RING(ring, 0x0);

   emit_marker6(ring, 7);
   OUT_PKT7(ring, CP_SET_MARKER, 1);
   OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RESOLVE) |
                  A6XX_CP_SET_MARKER_0_USES_GMEM);
   emit_marker6(ring, 7);

   if (batch->tile_store) {
      trace_start_tile_stores(&batch->trace, batch->gmem, batch->resolve);
      emit_conditional_ib(batch, tile, batch->tile_store);
      trace_end_tile_stores(&batch->trace, batch->gmem);
   }

   OUT_PKT7(ring, CP_SET_MARKER, 1);
   OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RENDER_END));
}

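/* End-of-batch cleanup for the GMEM path: re-enable LRZ, flush it, clean
 * the CCU blit cache, and (with HW binning) check for VSC overflow.
 */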
template <chip CHIP>
static void
fd6_emit_tile_fini(struct fd_batch *batch)
{
   struct fd_ringbuffer *ring = batch->gmem;

   emit_common_fini<CHIP>(batch);

   OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1);
   OUT_RING(ring, A6XX_GRAS_LRZ_CNTL_ENABLE);

   fd6_event_write<CHIP>(batch->ctx, ring, FD_LRZ_FLUSH);
   fd6_event_write<CHIP>(batch->ctx, ring, FD_CCU_CLEAN_BLIT_CACHE);

   if (use_hw_binning(batch)) {
      check_vsc_overflow(batch->ctx);
   }
}

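/* Fast-clears on the sysmem (bypass) path are done with 2D blits straight
 * to memory, rather than blit events into GMEM.
 */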
template <chip CHIP>
static void
emit_sysmem_clears(struct fd_batch *batch, struct fd_batch_subpass *subpass)
   assert_dt
{
   struct fd_context *ctx = batch->ctx;
   struct fd_ringbuffer *ring = batch->gmem;
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;

   uint32_t buffers = subpass->fast_cleared;

   if (!buffers)
      return;

   struct pipe_box box2d;
   u_box_2d(0, 0, pfb->width, pfb->height, &box2d);

   trace_start_clears(&batch->trace, ring, buffers);

   if (buffers & PIPE_CLEAR_COLOR) {
      for (int i = 0; i < pfb->nr_cbufs; i++) {
         union pipe_color_union color = subpass->clear_color[i];

         if (!pfb->cbufs[i])
            continue;

         if (!(buffers & (PIPE_CLEAR_COLOR0 << i)))
            continue;

         fd6_clear_surface<CHIP>(ctx, ring, pfb->cbufs[i], &box2d, &color, 0);
      }
   }
   if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
      union pipe_color_union value = {};

      const bool has_depth = pfb->zsbuf;
      struct pipe_resource *separate_stencil =
         has_depth && fd_resource(pfb->zsbuf->texture)->stencil
            ? &fd_resource(pfb->zsbuf->texture)->stencil->b.b
            : NULL;

      if ((buffers & PIPE_CLEAR_DEPTH) || (!separate_stencil && (buffers & PIPE_CLEAR_STENCIL))) {
         value.f[0] = subpass->clear_depth;
         value.ui[1] = subpass->clear_stencil;
         fd6_clear_surface<CHIP>(ctx, ring, pfb->zsbuf, &box2d,
                                 &value, fd6_unknown_8c01(pfb->zsbuf->format, buffers));
      }

      if (separate_stencil && (buffers & PIPE_CLEAR_STENCIL)) {
         value.ui[0] = subpass->clear_stencil;

         struct pipe_surface stencil_surf = *pfb->zsbuf;
         stencil_surf.format = PIPE_FORMAT_S8_UINT;
         stencil_surf.texture = separate_stencil;

         fd6_clear_surface<CHIP>(ctx, ring, &stencil_surf, &box2d, &value, 0);
      }
   }

   fd6_emit_flushes<CHIP>(ctx, ring, FD6_FLUSH_CCU_COLOR | FD6_INVALIDATE_CCU_COLOR);

   trace_end_clears(&batch->trace, ring);
}

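/* Setup at the start of a sysmem (bypass) render pass: restore context
 * state, run the prologue, and program scissor/window/bin-size for direct
 * rendering.
 */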
template <chip CHIP>
static void
fd6_emit_sysmem_prep(struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->gmem;

   emit_lrz_clears<CHIP>(batch);

   fd6_emit_restore<CHIP>(batch, ring);
   fd6_event_write<CHIP>(batch->ctx, ring, FD_LRZ_FLUSH);

   if (batch->prologue) {
      if (!batch->nondraw) {
         trace_start_prologue(&batch->trace, ring);
      }
      fd6_emit_ib(ring, batch->prologue);
      if (!batch->nondraw) {
         trace_end_prologue(&batch->trace, ring);
      }
   }

   /* remaining setup below here does not apply to blit/compute: */
   if (batch->nondraw)
      return;

   struct pipe_framebuffer_state *pfb = &batch->framebuffer;

   if (pfb->width > 0 && pfb->height > 0)
      set_scissor(ring, 0, 0, pfb->width - 1, pfb->height - 1);
   else
      set_scissor(ring, 0, 0, 0, 0);

   set_tessfactor_bo<CHIP>(ring, batch);
   set_window_offset<CHIP>(ring, 0, 0);

   set_bin_size<CHIP>(ring, NULL, {
         .render_mode = RENDERING_PASS,
         .buffers_location = BUFFERS_IN_SYSMEM,
   });

   if (CHIP >= A7XX) {
      OUT_REG(ring, A7XX_RB_UNKNOWN_8812(0x3ff)); // all buffers in sysmem
      OUT_REG(ring, A7XX_RB_UNKNOWN_8E06(batch->ctx->screen->info->a6xx.magic.RB_UNKNOWN_8E06));
      OUT_REG(ring, A7XX_GRAS_UNKNOWN_8007(0x0));
      OUT_REG(ring, A6XX_GRAS_UNKNOWN_8110(0x2));
      OUT_REG(ring, A7XX_RB_UNKNOWN_8E09(0x4));
   }

   emit_marker6(ring, 7);
   OUT_PKT7(ring, CP_SET_MARKER, 1);
   OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_DIRECT_RENDER));
   emit_marker6(ring, 7);

   OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   OUT_RING(ring, 0x0);

   /* blob controls "local" in IB2, but I think that is not required */
   OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_LOCAL, 1);
   OUT_RING(ring, 0x1);

   /* enable stream-out, with sysmem there is only one pass: */
   OUT_REG(ring, A6XX_VPC_SO_DISABLE(false));

   OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
   OUT_RING(ring, 0x1);

   emit_zs<CHIP>(batch->ctx, ring, pfb->zsbuf, NULL);
   emit_mrt<CHIP>(ring, pfb, NULL);
   emit_msaa(ring, pfb->samples);
   patch_fb_read_sysmem<CHIP>(batch);

   emit_common_init<CHIP>(batch);
}

template <chip CHIP>
static void
fd6_emit_sysmem(struct fd_batch *batch)
   assert_dt
{
   struct fd_ringbuffer *ring = batch->gmem;
   struct fd_screen *screen = batch->ctx->screen;

   foreach_subpass (subpass, batch) {
      if (subpass->fast_cleared) {
         unsigned flushes = 0;
         if (subpass->fast_cleared & FD_BUFFER_COLOR)
            flushes |= FD6_INVALIDATE_CCU_COLOR;
         if (subpass->fast_cleared & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL))
            flushes |= FD6_INVALIDATE_CCU_DEPTH;

         fd6_emit_flushes<CHIP>(batch->ctx, ring, flushes);
         emit_sysmem_clears<CHIP>(batch, subpass);
      }

      fd6_emit_ccu_cntl<CHIP>(ring, screen, false);

      struct pipe_framebuffer_state *pfb = &batch->framebuffer;
      update_render_cntl<CHIP>(batch, pfb, false);

      emit_lrz<CHIP>(batch, subpass);

      fd6_emit_ib(ring, subpass->draw);
   }
}

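/* End of the sysmem pass: emit epilogues, flush LRZ, and flush the CCU
 * color and depth caches back to memory.
 */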
template <chip CHIP>
static void
fd6_emit_sysmem_fini(struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->gmem;

   emit_common_fini<CHIP>(batch);

   if (batch->tile_epilogue)
      fd6_emit_ib(batch->gmem, batch->tile_epilogue);

   if (batch->epilogue)
      fd6_emit_ib(batch->gmem, batch->epilogue);

   OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   OUT_RING(ring, 0x0);

   fd6_event_write<CHIP>(batch->ctx, ring, FD_LRZ_FLUSH);

   fd6_emit_flushes<CHIP>(batch->ctx, ring,
                          FD6_FLUSH_CCU_COLOR |
                          FD6_FLUSH_CCU_DEPTH);
}

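/* Hooks up the GMEM/sysmem vtable entries for this generation; the
 * FD_GENX() macro below provides the per-chip template instantiations.
 */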
template <chip CHIP>
void
fd6_gmem_init(struct pipe_context *pctx)
   disable_thread_safety_analysis
{
   struct fd_context *ctx = fd_context(pctx);

   ctx->emit_tile_init = fd6_emit_tile_init<CHIP>;
   ctx->emit_tile_prep = fd6_emit_tile_prep<CHIP>;
   ctx->emit_tile_mem2gmem = fd6_emit_tile_mem2gmem;
   ctx->emit_tile_renderprep = fd6_emit_tile_renderprep;
   ctx->emit_tile = fd6_emit_tile<CHIP>;
   ctx->emit_tile_gmem2mem = fd6_emit_tile_gmem2mem;
   ctx->emit_tile_fini = fd6_emit_tile_fini<CHIP>;
   ctx->emit_sysmem_prep = fd6_emit_sysmem_prep<CHIP>;
   ctx->emit_sysmem = fd6_emit_sysmem<CHIP>;
   ctx->emit_sysmem_fini = fd6_emit_sysmem_fini<CHIP>;
}
FD_GENX(fd6_gmem_init);