/*
 * Copyright © 2016 Rob Clark <robclark@freedesktop.org>
 * Copyright © 2018 Google, Inc.
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#define FD_BO_NO_HARDPIN 1

#include <stdio.h>

#include "pipe/p_state.h"
#include "util/format/u_format.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_string.h"

#include "freedreno_draw.h"
#include "freedreno_resource.h"
#include "freedreno_state.h"
#include "freedreno_tracepoints.h"

#include "fd6_barrier.h"
#include "fd6_blitter.h"
#include "fd6_context.h"
#include "fd6_draw.h"
#include "fd6_emit.h"
#include "fd6_gmem.h"
#include "fd6_pack.h"
#include "fd6_program.h"
#include "fd6_resource.h"
#include "fd6_zsa.h"

/**
 * Emits the flags registers, suitable for RB_MRT_FLAG_BUFFER,
 * RB_DEPTH_FLAG_BUFFER, SP_PS_2D_SRC_FLAGS, and RB_BLIT_FLAG_DST.
 */
void
fd6_emit_flag_reference(struct fd_ringbuffer *ring, struct fd_resource *rsc,
                        int level, int layer)
{
   if (fd_resource_ubwc_enabled(rsc, level)) {
      OUT_RELOC(ring, rsc->bo, fd_resource_ubwc_offset(rsc, level, layer), 0,
                0);
      OUT_RING(ring, A6XX_RB_MRT_FLAG_BUFFER_PITCH_PITCH(
                        fdl_ubwc_pitch(&rsc->layout, level)) |
                        A6XX_RB_MRT_FLAG_BUFFER_PITCH_ARRAY_PITCH(
                           rsc->layout.ubwc_layer_size >> 2));
   } else {
      OUT_RING(ring, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_LO */
      OUT_RING(ring, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_HI */
      OUT_RING(ring, 0x00000000);
   }
}
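
/* Typical usage, as at the call sites below: open a 3-dword register
 * block, then let this helper fill in the three dwords:
 *
 *    OUT_PKT4(ring, REG_A6XX_RB_MRT_FLAG_BUFFER(i), 3);
 *    fd6_emit_flag_reference(ring, rsc, level, layer);
 *
 * The same pattern is used for RB_DEPTH_FLAG_BUFFER_BASE and
 * RB_BLIT_FLAG_DST.
 */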

template <chip CHIP>
static void
emit_mrt(struct fd_ringbuffer *ring, struct pipe_framebuffer_state *pfb,
         const struct fd_gmem_stateobj *gmem)
{
   unsigned srgb_cntl = 0;
   unsigned i;

   /* Note, GLES 3.2 says "If the fragment’s layer number is negative, or
    * greater than or equal to the minimum number of layers of any attachment,
    * the effects of the fragment on the framebuffer contents are undefined."
    */
   unsigned max_layer_index = 0;
   enum a6xx_format mrt0_format = FMT6_NONE;

   for (i = 0; i < pfb->nr_cbufs; i++) {
      enum a3xx_color_swap swap = WZYX;
      bool sint = false, uint = false;
      struct fd_resource *rsc = NULL;
      ASSERTED struct fdl_slice *slice = NULL;
      uint32_t stride = 0;
      uint32_t array_stride = 0;
      uint32_t offset;

      if (!pfb->cbufs[i])
         continue;

      struct pipe_surface *psurf = pfb->cbufs[i];
      enum pipe_format pformat = psurf->format;
      rsc = fd_resource(psurf->texture);

      uint32_t base = gmem ? gmem->cbuf_base[i] : 0;
      slice = fd_resource_slice(rsc, psurf->u.tex.level);
      enum a6xx_tile_mode tile_mode = (enum a6xx_tile_mode)
         fd_resource_tile_mode(psurf->texture, psurf->u.tex.level);
      enum a6xx_format format = fd6_color_format(pformat, tile_mode);
      sint = util_format_is_pure_sint(pformat);
      uint = util_format_is_pure_uint(pformat);

      if (util_format_is_srgb(pformat))
         srgb_cntl |= (1 << i);

      offset =
         fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer);

      stride = fd_resource_pitch(rsc, psurf->u.tex.level);
      array_stride = fd_resource_layer_stride(rsc, psurf->u.tex.level);
      swap = fd6_color_swap(pformat, (enum a6xx_tile_mode)rsc->layout.tile_mode, false);

      max_layer_index = psurf->u.tex.last_layer - psurf->u.tex.first_layer;

      assert((offset + slice->size0) <= fd_bo_size(rsc->bo));

      /* A batch with no draws won't have tracked this resource, so attach: */
      fd_ringbuffer_attach_bo(ring, rsc->bo);

      OUT_REG(ring,
              RB_MRT_BUF_INFO(CHIP, i,
                              .color_format = format,
                              .color_tile_mode = tile_mode,
                              .color_swap = swap,
                              .losslesscompen = fd_resource_ubwc_enabled(rsc, psurf->u.tex.level),
              ),
              A6XX_RB_MRT_PITCH(i, stride),
              A6XX_RB_MRT_ARRAY_PITCH(i, array_stride),
              A6XX_RB_MRT_BASE(i, .bo = rsc->bo, .bo_offset = offset),
              A6XX_RB_MRT_BASE_GMEM(i, base));

      OUT_REG(ring, A6XX_SP_FS_MRT_REG(i, .color_format = format,
                                       .color_sint = sint, .color_uint = uint));

      OUT_PKT4(ring, REG_A6XX_RB_MRT_FLAG_BUFFER(i), 3);
      fd6_emit_flag_reference(ring, rsc, psurf->u.tex.level,
                              psurf->u.tex.first_layer);

      if (i == 0)
         mrt0_format = format;
   }
   if (pfb->zsbuf)
      max_layer_index = pfb->zsbuf->u.tex.last_layer - pfb->zsbuf->u.tex.first_layer;

   OUT_REG(ring, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = mrt0_format));

   OUT_REG(ring, A6XX_RB_SRGB_CNTL(.dword = srgb_cntl));
   OUT_REG(ring, A6XX_SP_SRGB_CNTL(.dword = srgb_cntl));

   OUT_REG(ring, A6XX_GRAS_MAX_LAYER_INDEX(max_layer_index));
}

template <chip CHIP>
static void
emit_zs(struct fd_context *ctx, struct fd_ringbuffer *ring,
        struct pipe_surface *zsbuf, const struct fd_gmem_stateobj *gmem)
{
   if (zsbuf) {
      struct fd_resource *rsc = fd_resource(zsbuf->texture);
      struct fd_resource *stencil = rsc->stencil;
      uint32_t stride = fd_resource_pitch(rsc, zsbuf->u.tex.level);
      uint32_t array_stride = fd_resource_layer_stride(rsc, zsbuf->u.tex.level);
      uint32_t base = gmem ? gmem->zsbuf_base[0] : 0;
      uint32_t offset =
         fd_resource_offset(rsc, zsbuf->u.tex.level, zsbuf->u.tex.first_layer);

      /* We could have a depth buffer, but no draws with depth write/test
       * enabled, in which case it wouldn't have been part of the batch
       * resource tracking
       */
      fd_ringbuffer_attach_bo(ring, rsc->bo);

      if (zsbuf->format == PIPE_FORMAT_S8_UINT) {
         /* S8 is implemented as Z32_S8 minus the Z32 plane: */
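         /* That is, the depth side is programmed with zero base/pitch
          * below, and the resource itself is bound as the separate
          * stencil buffer (note stencil = rsc).
          */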
         enum a6xx_depth_format fmt = DEPTH6_32;

         OUT_REG(ring,
                 RB_DEPTH_BUFFER_INFO(CHIP,
                                      .depth_format = fmt,
                                      .tilemode = TILE6_3,
                                      .losslesscompen = fd_resource_ubwc_enabled(rsc, zsbuf->u.tex.level),
                 ),
                 A6XX_RB_DEPTH_BUFFER_PITCH(0),
                 A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0),
                 A6XX_RB_DEPTH_BUFFER_BASE(.qword = 0),
                 A6XX_RB_DEPTH_BUFFER_BASE_GMEM(base));

         OUT_REG(ring, A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));

         stencil = rsc;
      } else {
         enum a6xx_depth_format fmt = fd6_pipe2depth(zsbuf->format);

         OUT_REG(ring,
                 RB_DEPTH_BUFFER_INFO(CHIP,
                                      .depth_format = fmt,
                                      .tilemode = TILE6_3,
                                      .losslesscompen = fd_resource_ubwc_enabled(rsc, zsbuf->u.tex.level),
                 ),
                 A6XX_RB_DEPTH_BUFFER_PITCH(stride),
                 A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(array_stride),
                 A6XX_RB_DEPTH_BUFFER_BASE(.bo = rsc->bo, .bo_offset = offset),
                 A6XX_RB_DEPTH_BUFFER_BASE_GMEM(base));

         OUT_REG(ring, A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));

         OUT_PKT4(ring, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE, 3);
         fd6_emit_flag_reference(ring, rsc, zsbuf->u.tex.level,
                                 zsbuf->u.tex.first_layer);
      }

      if (stencil) {
         stride = fd_resource_pitch(stencil, zsbuf->u.tex.level);
         array_stride = fd_resource_layer_stride(stencil, zsbuf->u.tex.level);
         uint32_t base = gmem ? gmem->zsbuf_base[1] : 0;
         uint32_t offset =
            fd_resource_offset(stencil, zsbuf->u.tex.level, zsbuf->u.tex.first_layer);

         fd_ringbuffer_attach_bo(ring, stencil->bo);

         OUT_REG(ring,
                 RB_STENCIL_INFO(
                       CHIP,
                       .separate_stencil = true,
                       .tilemode = TILE6_3,
                 ),
                 A6XX_RB_STENCIL_BUFFER_PITCH(stride),
                 A6XX_RB_STENCIL_BUFFER_ARRAY_PITCH(array_stride),
                 A6XX_RB_STENCIL_BUFFER_BASE(.bo = stencil->bo, .bo_offset = offset),
                 A6XX_RB_STENCIL_BUFFER_BASE_GMEM(base)
         );
      } else {
         OUT_REG(ring, RB_STENCIL_INFO(CHIP, 0));
      }
   } else {
      OUT_REG(ring,
              RB_DEPTH_BUFFER_INFO(
                    CHIP,
                    .depth_format = DEPTH6_NONE,
              ),
              A6XX_RB_DEPTH_BUFFER_PITCH(),
              A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(),
              A6XX_RB_DEPTH_BUFFER_BASE(),
              A6XX_RB_DEPTH_BUFFER_BASE_GMEM(),
      );

      OUT_REG(ring,
              A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));

      OUT_REG(ring, RB_STENCIL_INFO(CHIP, 0));
   }
}

template <chip CHIP>
static void
emit_lrz(struct fd_batch *batch, struct fd_batch_subpass *subpass)
{
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;
   struct fd_ringbuffer *ring = batch->gmem;

   if (!subpass->lrz) {
      OUT_REG(ring, A6XX_GRAS_LRZ_BUFFER_BASE(),
              A6XX_GRAS_LRZ_BUFFER_PITCH(),
              A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE());
      if (CHIP >= A7XX)
         OUT_REG(ring, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO());
      return;
   }

   /* When swapping LRZ buffers we need to flush the LRZ cache.
    * We possibly don't need this during the binning pass; it
    * appears that the corruption happens on the read side, i.e.
    * we change the LRZ buffer after a sub-pass, but get a
    * cache hit on stale data from the previous LRZ buffer.
    */
   fd6_event_write<CHIP>(batch->ctx, ring, FD_LRZ_FLUSH);

   struct fd_resource *zsbuf = fd_resource(pfb->zsbuf->texture);
   OUT_REG(ring,
           A6XX_GRAS_LRZ_BUFFER_BASE(.bo = subpass->lrz),
           A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = zsbuf->lrz_pitch),
           A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(
                 .bo = zsbuf->lrz_fc_offset ? subpass->lrz : NULL,
                 .bo_offset = zsbuf->lrz_fc_offset
           ),
   );
   fd_ringbuffer_attach_bo(ring, subpass->lrz);

   if (CHIP >= A7XX) {
      OUT_REG(ring,
              A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO(
                    .depth_format = fd6_pipe2depth(pfb->zsbuf->format),
              )
      );
   }
}

/* Emit any needed lrz clears to the prologue cmds
 */
template <chip CHIP>
static void
emit_lrz_clears(struct fd_batch *batch)
{
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;
   struct fd_context *ctx = batch->ctx;
   unsigned count = 0;

   if (!pfb->zsbuf)
      return;

   struct fd_resource *zsbuf = fd_resource(pfb->zsbuf->texture);

   foreach_subpass (subpass, batch) {
      /* The lrz buffer isn't explicitly tracked by the batch resource
       * tracking (tracking the zsbuf is sufficient), but it still needs
       * to be attached to the ring
       */
      if (subpass->lrz)
         fd_ringbuffer_attach_bo(batch->gmem, subpass->lrz);

      if (!(subpass->fast_cleared & FD_BUFFER_LRZ))
         continue;

      subpass->fast_cleared &= ~FD_BUFFER_LRZ;

      /* prep before first clear: */
      if (count == 0) {
         struct fd_ringbuffer *ring = fd_batch_get_prologue(batch);

         fd6_emit_ccu_cntl<CHIP>(ring, ctx->screen, false);

         OUT_PKT7(ring, CP_SET_MARKER, 1);
         OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BLIT2DSCALE));

         fd6_emit_flushes<CHIP>(ctx, ring, FD6_FLUSH_CACHE);

         if (ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
             ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL) {
            /* This is a non-context register, so we have to WFI before changing. */
            OUT_WFI5(ring);
            OUT_PKT4(ring, REG_A6XX_RB_DBG_ECO_CNTL, 1);
            OUT_RING(ring, ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL_blit);
         }
      }

      fd6_clear_lrz<CHIP>(batch, zsbuf, subpass->lrz, subpass->clear_depth);

      count++;
   }

   /* cleanup after last clear: */
   if (count > 0) {
      struct fd_ringbuffer *ring = fd_batch_get_prologue(batch);

      if (ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
          ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL) {
         OUT_WFI5(ring);
         OUT_PKT4(ring, REG_A6XX_RB_DBG_ECO_CNTL, 1);
         OUT_RING(ring, ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL);
      }

      /* Clearing writes via CCU color in the PS stage, and LRZ is read via
       * UCHE in the earlier GRAS stage.
       *
       * Note tu also asks for WFI but maybe that is only needed if
       * has_ccu_flush_bug (and it is added by fd6_emit_flushes() already
       * in that case)
       */
      fd6_emit_flushes<CHIP>(batch->ctx, ring,
                             FD6_FLUSH_CCU_COLOR |
                             FD6_INVALIDATE_CACHE);
   }
}

static bool
use_hw_binning(struct fd_batch *batch)
{
   const struct fd_gmem_stateobj *gmem = batch->gmem_state;

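   /* Presumably because each VSC pipe's visibility is consumed as a 32-bit
    * mask, one bit per bin (see A6XX_CP_REG_TEST_0_BIT(tile->n) in
    * emit_conditional_ib()), hw binning can't cope with pipes covering
    * more than 32 bins:
    */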
   if ((gmem->maxpw * gmem->maxph) > 32)
      return false;

   return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) >= 2) &&
          (batch->num_draws > 0);
}

static void
patch_fb_read_gmem(struct fd_batch *batch)
{
   struct fd_screen *screen = batch->ctx->screen;
   const struct fd_gmem_stateobj *gmem = batch->gmem_state;
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;

   unsigned num_patches = fd_patch_num_elements(&batch->fb_read_patches);
   if (!num_patches)
      return;

   for (unsigned i = 0; i < num_patches; i++) {
      struct fd_cs_patch *patch =
         fd_patch_element(&batch->fb_read_patches, i);
      int buf = patch->val;
      struct pipe_surface *psurf = pfb->cbufs[buf];
      struct pipe_resource *prsc = psurf->texture;
      struct fd_resource *rsc = fd_resource(prsc);
      enum pipe_format format = psurf->format;

      uint8_t swiz[4];
      fdl6_format_swiz(psurf->format, false, swiz);

      uint64_t base = screen->gmem_base + gmem->cbuf_base[buf];
      /* always TILE6_2 mode in GMEM, which also means no swap: */
      uint32_t descriptor[FDL6_TEX_CONST_DWORDS] = {
         A6XX_TEX_CONST_0_FMT(fd6_texture_format(
            format, (enum a6xx_tile_mode)rsc->layout.tile_mode, false)) |
            A6XX_TEX_CONST_0_SAMPLES(fd_msaa_samples(prsc->nr_samples)) |
            A6XX_TEX_CONST_0_SWAP(WZYX) |
            A6XX_TEX_CONST_0_TILE_MODE(TILE6_2) |
            COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) |
            A6XX_TEX_CONST_0_SWIZ_X(fdl6_swiz(swiz[0])) |
            A6XX_TEX_CONST_0_SWIZ_Y(fdl6_swiz(swiz[1])) |
            A6XX_TEX_CONST_0_SWIZ_Z(fdl6_swiz(swiz[2])) |
            A6XX_TEX_CONST_0_SWIZ_W(fdl6_swiz(swiz[3])),

         A6XX_TEX_CONST_1_WIDTH(pfb->width) |
            A6XX_TEX_CONST_1_HEIGHT(pfb->height),

         A6XX_TEX_CONST_2_PITCH(gmem->bin_w * gmem->cbuf_cpp[buf]) |
            A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D),

         A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size),
         A6XX_TEX_CONST_4_BASE_LO(base),

         A6XX_TEX_CONST_5_BASE_HI(base >> 32) |
            A6XX_TEX_CONST_5_DEPTH(prsc->array_size)
      };

      memcpy(patch->cs, descriptor, FDL6_TEX_CONST_DWORDS * 4);
   }

   util_dynarray_clear(&batch->fb_read_patches);
}

template <chip CHIP>
static void
patch_fb_read_sysmem(struct fd_batch *batch)
{
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;

   unsigned num_patches =
      fd_patch_num_elements(&batch->fb_read_patches);
   if (!num_patches)
      return;
   for (unsigned i = 0; i < num_patches; i++) {
      struct fd_cs_patch *patch =
         fd_patch_element(&batch->fb_read_patches, i);
      int buf = patch->val;

      struct pipe_surface *psurf = pfb->cbufs[buf];
      if (!psurf)
         return;

      struct pipe_resource *prsc = psurf->texture;
      struct fd_resource *rsc = fd_resource(prsc);

      struct fdl_view_args args = {
         .chip = CHIP,

         .iova = fd_bo_get_iova(rsc->bo),

         .base_miplevel = psurf->u.tex.level,
         .level_count = 1,

         .base_array_layer = psurf->u.tex.first_layer,
         .layer_count = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1,

         .swiz = {PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z,
                  PIPE_SWIZZLE_W},
         .format = psurf->format,

         .type = FDL_VIEW_TYPE_2D,
         .chroma_offsets = {FDL_CHROMA_LOCATION_COSITED_EVEN,
                            FDL_CHROMA_LOCATION_COSITED_EVEN},
      };
      const struct fdl_layout *layouts[3] = {&rsc->layout, NULL, NULL};
      struct fdl6_view view;
      fdl6_view_init(&view, layouts, &args,
                     batch->ctx->screen->info->a6xx.has_z24uint_s8uint);
      memcpy(patch->cs, view.descriptor, FDL6_TEX_CONST_DWORDS * 4);
   }

   util_dynarray_clear(&batch->fb_read_patches);
}

template <chip CHIP>
static void
update_render_cntl(struct fd_batch *batch, struct pipe_framebuffer_state *pfb,
                   bool binning)
{
   struct fd_ringbuffer *ring = batch->gmem;

   if (CHIP >= A7XX) {
      OUT_REG(ring,
              RB_RENDER_CNTL(
                    CHIP,
                    .binning = binning,
                    .raster_mode = TYPE_TILED,
                    .raster_direction = LR_TB
              )
      );
      OUT_REG(ring,
              A7XX_GRAS_SU_RENDER_CNTL(
                    .binning = binning,
              )
      );
      return;
   }

   struct fd_screen *screen = batch->ctx->screen;
   bool depth_ubwc_enable = false;
   uint32_t mrts_ubwc_enable = 0;
   int i;

   if (pfb->zsbuf) {
      struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
      depth_ubwc_enable =
         fd_resource_ubwc_enabled(rsc, pfb->zsbuf->u.tex.level);
   }

   for (i = 0; i < pfb->nr_cbufs; i++) {
      if (!pfb->cbufs[i])
         continue;

      struct pipe_surface *psurf = pfb->cbufs[i];
      struct fd_resource *rsc = fd_resource(psurf->texture);

      if (fd_resource_ubwc_enabled(rsc, psurf->u.tex.level))
         mrts_ubwc_enable |= 1 << i;
   }

   struct fd_reg_pair rb_render_cntl = RB_RENDER_CNTL(
      CHIP,
      .ccusinglecachelinesize = 2,
      .binning = binning,
      .flag_depth = depth_ubwc_enable,
      .flag_mrts = mrts_ubwc_enable,
   );

   if (screen->info->a6xx.has_cp_reg_write) {
      OUT_PKT(ring, CP_REG_WRITE,
              CP_REG_WRITE_0(TRACK_RENDER_CNTL),
              CP_REG_WRITE_1(rb_render_cntl.reg),
              CP_REG_WRITE_2(rb_render_cntl.value),
      );
   } else {
      OUT_REG(ring, rb_render_cntl);
   }
}

static void
update_vsc_pipe(struct fd_batch *batch)
{
   struct fd_context *ctx = batch->ctx;
   struct fd6_context *fd6_ctx = fd6_context(ctx);
   const struct fd_gmem_stateobj *gmem = batch->gmem_state;
   struct fd_ringbuffer *ring = batch->gmem;
   unsigned max_vsc_pipes = batch->ctx->screen->info->num_vsc_pipes;
   int i;

   if (batch->draw_strm_bits / 8 > fd6_ctx->vsc_draw_strm_pitch) {
      if (fd6_ctx->vsc_draw_strm)
         fd_bo_del(fd6_ctx->vsc_draw_strm);
      fd6_ctx->vsc_draw_strm = NULL;
      /* Note: probably only need to align to 0x40, but aligning stronger
       * reduces the odds that we will have to realloc again on the next
       * frame:
       */
      fd6_ctx->vsc_draw_strm_pitch = align(batch->draw_strm_bits / 8, 0x4000);
      mesa_logd("pre-resize VSC_DRAW_STRM_PITCH to: 0x%x",
                fd6_ctx->vsc_draw_strm_pitch);
   }

   if (batch->prim_strm_bits / 8 > fd6_ctx->vsc_prim_strm_pitch) {
      if (fd6_ctx->vsc_prim_strm)
         fd_bo_del(fd6_ctx->vsc_prim_strm);
      fd6_ctx->vsc_prim_strm = NULL;
      fd6_ctx->vsc_prim_strm_pitch = align(batch->prim_strm_bits / 8, 0x4000);
      mesa_logd("pre-resize VSC_PRIM_STRM_PITCH to: 0x%x",
                fd6_ctx->vsc_prim_strm_pitch);
   }

   if (!fd6_ctx->vsc_draw_strm) {
      /* We also use four bytes per vsc pipe at the end of the draw
       * stream buffer for VSC_DRAW_STRM_SIZE written back by hw
       * (see VSC_DRAW_STRM_SIZE_ADDRESS)
       */
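      /* Resulting BO layout (illustrative):
       *
       *   [pipe 0 stream][pipe 1 stream]...[pipe N-1 stream][N x u32 sizes]
       *
       * where each per-pipe stream is vsc_draw_strm_pitch bytes and N is
       * max_vsc_pipes.
       */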
      unsigned sz = (max_vsc_pipes * fd6_ctx->vsc_draw_strm_pitch) +
                    (max_vsc_pipes * 4);
      fd6_ctx->vsc_draw_strm =
         fd_bo_new(ctx->screen->dev, sz, FD_BO_NOMAP, "vsc_draw_strm");
   }

   if (!fd6_ctx->vsc_prim_strm) {
      unsigned sz = max_vsc_pipes * fd6_ctx->vsc_prim_strm_pitch;
      fd6_ctx->vsc_prim_strm =
         fd_bo_new(ctx->screen->dev, sz, FD_BO_NOMAP, "vsc_prim_strm");
   }

   fd_ringbuffer_attach_bo(ring, fd6_ctx->vsc_draw_strm);
   fd_ringbuffer_attach_bo(ring, fd6_ctx->vsc_prim_strm);

   OUT_REG(ring, A6XX_VSC_BIN_SIZE(.width = gmem->bin_w, .height = gmem->bin_h),
           A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = fd6_ctx->vsc_draw_strm,
                                           .bo_offset = max_vsc_pipes *
                                              fd6_ctx->vsc_draw_strm_pitch));

   OUT_REG(ring, A6XX_VSC_BIN_COUNT(.nx = gmem->nbins_x, .ny = gmem->nbins_y));

   OUT_PKT4(ring, REG_A6XX_VSC_PIPE_CONFIG_REG(0), max_vsc_pipes);
   for (i = 0; i < max_vsc_pipes; i++) {
      const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i];
      OUT_RING(ring, A6XX_VSC_PIPE_CONFIG_REG_X(pipe->x) |
                     A6XX_VSC_PIPE_CONFIG_REG_Y(pipe->y) |
                     A6XX_VSC_PIPE_CONFIG_REG_W(pipe->w) |
                     A6XX_VSC_PIPE_CONFIG_REG_H(pipe->h));
   }

   OUT_REG(
      ring, A6XX_VSC_PRIM_STRM_ADDRESS(.bo = fd6_ctx->vsc_prim_strm),
      A6XX_VSC_PRIM_STRM_PITCH(.dword = fd6_ctx->vsc_prim_strm_pitch),
      A6XX_VSC_PRIM_STRM_LIMIT(.dword = fd6_ctx->vsc_prim_strm_pitch - 64));

   OUT_REG(
      ring, A6XX_VSC_DRAW_STRM_ADDRESS(.bo = fd6_ctx->vsc_draw_strm),
      A6XX_VSC_DRAW_STRM_PITCH(.dword = fd6_ctx->vsc_draw_strm_pitch),
      A6XX_VSC_DRAW_STRM_LIMIT(.dword = fd6_ctx->vsc_draw_strm_pitch - 64));
}

/*
 * If overflow is detected, either 0x1 (VSC_DRAW_STRM overflow) or 0x3
 * (VSC_PRIM_STRM overflow) plus the size of the overflowed buffer is
 * written to control->vsc_overflow. This allows the CPU to detect which
 * buffer overflowed (and, since the current size is encoded as well, it
 * prevents already-submitted but not yet executed batches from fooling
 * the CPU into increasing the size again unnecessarily).
 */
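/* Decode sketch (mirrors check_vsc_overflow() below):
 *
 *    vsc_overflow == 0x1 + vsc_draw_strm_pitch  ->  draw stream overflowed
 *    vsc_overflow == 0x3 + vsc_prim_strm_pitch  ->  prim stream overflowed
 *
 * Since the pitches are dword aligned, (vsc_overflow & 0x3) selects the
 * buffer and (vsc_overflow & ~0x3) recovers the pitch that overflowed.
 */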
static void
emit_vsc_overflow_test(struct fd_batch *batch)
{
   struct fd_ringbuffer *ring = batch->gmem;
   const struct fd_gmem_stateobj *gmem = batch->gmem_state;
   struct fd6_context *fd6_ctx = fd6_context(batch->ctx);

   assert((fd6_ctx->vsc_draw_strm_pitch & 0x3) == 0);
   assert((fd6_ctx->vsc_prim_strm_pitch & 0x3) == 0);

   /* Check for overflow, write vsc_overflow if detected: */
   for (int i = 0; i < gmem->num_vsc_pipes; i++) {
      OUT_PKT7(ring, CP_COND_WRITE5, 8);
      OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
                     CP_COND_WRITE5_0_WRITE_MEMORY);
      OUT_RING(ring, CP_COND_WRITE5_1_POLL_ADDR_LO(
                        REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i)));
      OUT_RING(ring, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
      OUT_RING(ring, CP_COND_WRITE5_3_REF(fd6_ctx->vsc_draw_strm_pitch - 64));
      OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0));
      OUT_RELOC(ring,
                control_ptr(fd6_ctx, vsc_overflow)); /* WRITE_ADDR_LO/HI */
      OUT_RING(ring,
               CP_COND_WRITE5_7_WRITE_DATA(1 + fd6_ctx->vsc_draw_strm_pitch));

      OUT_PKT7(ring, CP_COND_WRITE5, 8);
      OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
                     CP_COND_WRITE5_0_WRITE_MEMORY);
      OUT_RING(ring, CP_COND_WRITE5_1_POLL_ADDR_LO(
                        REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i)));
      OUT_RING(ring, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
      OUT_RING(ring, CP_COND_WRITE5_3_REF(fd6_ctx->vsc_prim_strm_pitch - 64));
      OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0));
      OUT_RELOC(ring,
                control_ptr(fd6_ctx, vsc_overflow)); /* WRITE_ADDR_LO/HI */
      OUT_RING(ring,
               CP_COND_WRITE5_7_WRITE_DATA(3 + fd6_ctx->vsc_prim_strm_pitch));
   }

   OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0);
}

static void
check_vsc_overflow(struct fd_context *ctx)
{
   struct fd6_context *fd6_ctx = fd6_context(ctx);
   struct fd6_control *control =
      (struct fd6_control *)fd_bo_map(fd6_ctx->control_mem);
   uint32_t vsc_overflow = control->vsc_overflow;

   if (!vsc_overflow)
      return;

   /* clear overflow flag: */
   control->vsc_overflow = 0;

   unsigned buffer = vsc_overflow & 0x3;
   unsigned size = vsc_overflow & ~0x3;

   if (buffer == 0x1) {
      /* VSC_DRAW_STRM overflow: */

      if (size < fd6_ctx->vsc_draw_strm_pitch) {
         /* we've already increased the size, this overflow is
          * from a batch submitted before resize, but executed
          * after
          */
         return;
      }

      fd_bo_del(fd6_ctx->vsc_draw_strm);
      fd6_ctx->vsc_draw_strm = NULL;
      fd6_ctx->vsc_draw_strm_pitch *= 2;

      mesa_logd("resized VSC_DRAW_STRM_PITCH to: 0x%x",
                fd6_ctx->vsc_draw_strm_pitch);

   } else if (buffer == 0x3) {
      /* VSC_PRIM_STRM overflow: */

      if (size < fd6_ctx->vsc_prim_strm_pitch) {
         /* we've already increased the size */
         return;
      }

      fd_bo_del(fd6_ctx->vsc_prim_strm);
      fd6_ctx->vsc_prim_strm = NULL;
      fd6_ctx->vsc_prim_strm_pitch *= 2;

      mesa_logd("resized VSC_PRIM_STRM_PITCH to: 0x%x",
                fd6_ctx->vsc_prim_strm_pitch);

   } else {
      /* NOTE: it's possible, for example, for overflow to corrupt the
       * control page. I mostly just see this hit if I set initial VSC
       * buffer size extremely small. Things still seem to recover,
       * but maybe we should pre-emptively realloc vsc_data/vsc_data2
       * and hope for different memory placement?
       */
      mesa_loge("invalid vsc_overflow value: 0x%08x", vsc_overflow);
   }
}

template <chip CHIP>
static void
emit_common_init(struct fd_batch *batch)
{
   struct fd_context *ctx = batch->ctx;
   struct fd_ringbuffer *ring = batch->gmem;
   struct fd_autotune *at = &batch->ctx->autotune;
   struct fd_batch_result *result = batch->autotune_result;

   if (!result)
      return;

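   /* The ZPASS_DONE sample counts written at batch start (here) and batch
    * end (emit_common_fini()) bracket the rendering; the autotuner uses
    * the resulting per-batch sample counts to help decide between GMEM
    * and sysmem (bypass) rendering for future similar batches.
    */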
   fd_ringbuffer_attach_bo(ring, at->results_mem);

   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1);
   OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY);

   if (!ctx->screen->info->a7xx.has_event_write_sample_count) {
      OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
      OUT_RELOC(ring, results_ptr(at, result[result->idx].samples_start));

      fd6_event_write<CHIP>(ctx, ring, FD_ZPASS_DONE);

      /* Copied from blob's cmdstream, not sure why it is done. */
      if (CHIP == A7XX) {
         fd6_event_write<CHIP>(ctx, ring, FD_CCU_CLEAN_DEPTH);
      }
   } else {
      OUT_PKT(ring, CP_EVENT_WRITE7,
              CP_EVENT_WRITE7_0(
                    .event = ZPASS_DONE,
                    .write_sample_count = true,
              ),
              EV_DST_RAM_CP_EVENT_WRITE7_1(
                    results_ptr(at, result[result->idx].samples_start)
              ),
      );
   }
}

template <chip CHIP>
static void
emit_common_fini(struct fd_batch *batch)
{
   struct fd_context *ctx = batch->ctx;
   struct fd_ringbuffer *ring = batch->gmem;
   struct fd_autotune *at = &batch->ctx->autotune;
   struct fd_batch_result *result = batch->autotune_result;

   fd6_emit_flushes<CHIP>(batch->ctx, ring, batch->barrier);

   if (!result)
      return;

   fd_ringbuffer_attach_bo(ring, at->results_mem);

   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1);
   OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY);

   if (!ctx->screen->info->a7xx.has_event_write_sample_count) {
      OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
      OUT_RELOC(ring, results_ptr(at, result[result->idx].samples_end));

      fd6_event_write<CHIP>(batch->ctx, ring, FD_ZPASS_DONE);
   } else {
      OUT_PKT(ring, CP_EVENT_WRITE7,
              CP_EVENT_WRITE7_0(
                    .event = ZPASS_DONE,
                    .write_sample_count = true,
                    .sample_count_end_offset = true,
                    .write_accum_sample_count_diff = true,
              ),
              EV_DST_RAM_CP_EVENT_WRITE7_1(
                    results_ptr(at, result[result->idx].samples_start)
              ),
      );
   }

   fd6_fence_write<CHIP>(ring, result->fence, results_ptr(at, fence));
}

/*
 * Emit conditional CP_INDIRECT_BUFFER, based on VSC_STATE[p], i.e. the IB
 * is skipped for tiles that have no visible geometry.
 *
 * If we aren't using the binning pass, this just emits a normal IB.
 */
static void
emit_conditional_ib(struct fd_batch *batch, const struct fd_tile *tile,
                    struct fd_ringbuffer *target)
{
   struct fd_ringbuffer *ring = batch->gmem;

   /* If we have fast clear, that won't count in the VSC state, so it
    * forces an unconditional IB (because we know there is something
    * to do for this tile)
    */
   if (batch->cleared || !use_hw_binning(batch)) {
      fd6_emit_ib(batch->gmem, target);
      return;
   }

   if (target->cur == target->start)
      return;

   emit_marker6(ring, 6);

   unsigned count = fd_ringbuffer_cmd_count(target);

   BEGIN_RING(ring, 5 + 4 * count); /* ensure conditional doesn't get split */

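   /* Predicate on the VSC state: the binning pass sets bit tile->n of
    * VSC_STATE_REG(tile->p) iff this tile has visible geometry, so the
    * IB2s below get skipped for empty tiles:
    */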
   OUT_PKT7(ring, CP_REG_TEST, 1);
   OUT_RING(ring, A6XX_CP_REG_TEST_0_REG(REG_A6XX_VSC_STATE_REG(tile->p)) |
                  A6XX_CP_REG_TEST_0_BIT(tile->n) |
                  A6XX_CP_REG_TEST_0_SKIP_WAIT_FOR_ME);

   OUT_PKT7(ring, CP_COND_REG_EXEC, 2);
   OUT_RING(ring, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
   OUT_RING(ring, PRED_TEST_CP_COND_REG_EXEC_1_DWORDS(4 * count));

   for (unsigned i = 0; i < count; i++) {
      uint32_t dwords;
      OUT_PKT7(ring, CP_INDIRECT_BUFFER, 3);
      dwords = fd_ringbuffer_emit_reloc_ring_full(ring, target, i) / 4;
      assert(dwords > 0);
      OUT_RING(ring, dwords);
   }

   emit_marker6(ring, 6);
}

static void
set_scissor(struct fd_ringbuffer *ring, uint32_t x1, uint32_t y1, uint32_t x2,
            uint32_t y2)
{
   OUT_REG(ring, A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1),
           A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2));

   OUT_REG(ring, A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = x1, .y = y1),
           A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = x2, .y = y2));
}

template <chip CHIP>
static void
set_tessfactor_bo(struct fd_ringbuffer *ring, struct fd_batch *batch)
{
   /* This happens after all drawing has been emitted to the draw CS, so we know
    * whether we need the tess BO pointers.
    */
   if (!batch->tessellation)
      return;

   struct fd_screen *screen = batch->ctx->screen;

   assert(screen->tess_bo);
   fd_ringbuffer_attach_bo(ring, screen->tess_bo);
   OUT_REG(ring, PC_TESSFACTOR_ADDR(CHIP, screen->tess_bo));
   /* Updating PC_TESSFACTOR_ADDR could race with the next draw which uses it. */
   OUT_WFI5(ring);
}

struct bin_size_params {
   enum a6xx_render_mode render_mode;
   bool force_lrz_write_dis;
   enum a6xx_buffers_location buffers_location;
   enum a6xx_lrz_feedback_mask lrz_feedback_zmode_mask;
};

template <chip CHIP>
static void
set_bin_size(struct fd_ringbuffer *ring, const struct fd_gmem_stateobj *gmem,
             struct bin_size_params p)
{
   unsigned w = gmem ? gmem->bin_w : 0;
   unsigned h = gmem ? gmem->bin_h : 0;

   if (CHIP == A6XX) {
      OUT_REG(ring, A6XX_GRAS_BIN_CONTROL(
         .binw = w, .binh = h,
         .render_mode = p.render_mode,
         .force_lrz_write_dis = p.force_lrz_write_dis,
         .buffers_location = p.buffers_location,
         .lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask,
      ));
   } else {
      OUT_REG(ring, A6XX_GRAS_BIN_CONTROL(
         .binw = w, .binh = h,
         .render_mode = p.render_mode,
         .force_lrz_write_dis = p.force_lrz_write_dis,
         .lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask,
      ));
   }
   OUT_REG(ring, RB_BIN_CONTROL(
      CHIP,
      .binw = w, .binh = h,
      .render_mode = p.render_mode,
      .force_lrz_write_dis = p.force_lrz_write_dis,
      .buffers_location = p.buffers_location,
      .lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask,
   ));
   /* no flag for RB_BIN_CONTROL2... */
   OUT_REG(ring, A6XX_RB_BIN_CONTROL2(.binw = w, .binh = h));
}

template <chip CHIP>
static void
emit_binning_pass(struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->gmem;
   const struct fd_gmem_stateobj *gmem = batch->gmem_state;
   struct fd_screen *screen = batch->ctx->screen;

   assert(!batch->tessellation);

   set_scissor(ring, 0, 0, gmem->width - 1, gmem->height - 1);

   emit_marker6(ring, 7);
   OUT_PKT7(ring, CP_SET_MARKER, 1);
   OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_VISIBILITY));
   emit_marker6(ring, 7);

   OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
   OUT_RING(ring, 0x1);

   OUT_PKT7(ring, CP_SET_MODE, 1);
   OUT_RING(ring, 0x1);

   OUT_WFI5(ring);

   OUT_REG(ring, A6XX_VFD_MODE_CNTL(.render_mode = BINNING_PASS));

   update_vsc_pipe(batch);

   if (CHIP == A6XX) {
      OUT_REG(ring, A6XX_PC_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
      OUT_REG(ring, A6XX_VFD_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
   }

   OUT_PKT7(ring, CP_EVENT_WRITE, 1);
   OUT_RING(ring, UNK_2C);

   OUT_PKT4(ring, REG_A6XX_RB_WINDOW_OFFSET, 1);
   OUT_RING(ring, A6XX_RB_WINDOW_OFFSET_X(0) | A6XX_RB_WINDOW_OFFSET_Y(0));

   OUT_PKT4(ring, REG_A6XX_SP_TP_WINDOW_OFFSET, 1);
   OUT_RING(ring,
            A6XX_SP_TP_WINDOW_OFFSET_X(0) | A6XX_SP_TP_WINDOW_OFFSET_Y(0));

   /* emit IB to binning drawcmds: */
   trace_start_binning_ib(&batch->trace, ring);
   foreach_subpass (subpass, batch) {
      emit_lrz<CHIP>(batch, subpass);
      fd6_emit_ib(ring, subpass->draw);
   }
   trace_end_binning_ib(&batch->trace, ring);

   OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
   OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
                  CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
                  CP_SET_DRAW_STATE__0_GROUP_ID(0));
   OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
   OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));

   OUT_PKT7(ring, CP_EVENT_WRITE, 1);
   OUT_RING(ring, UNK_2D);

   /* This flush is probably required because the VSC, which produces the
    * visibility stream, is a client of UCHE, whereas the CP needs to read
    * the visibility stream (without caching) to do draw skipping. The
    * WFI+WAIT_FOR_ME combination guarantees that the binning commands
    * submitted are finished before reading the VSC regs (in
    * emit_vsc_overflow_test) or the VSC_DATA buffer directly (implicitly
    * as part of draws).
    */
   fd6_emit_flushes<CHIP>(batch->ctx, ring,
                          FD6_FLUSH_CACHE |
                          FD6_WAIT_FOR_IDLE |
                          FD6_WAIT_FOR_ME);

   trace_start_vsc_overflow_test(&batch->trace, batch->gmem);
   emit_vsc_overflow_test(batch);
   trace_end_vsc_overflow_test(&batch->trace, batch->gmem);

   OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
   OUT_RING(ring, 0x0);

   OUT_PKT7(ring, CP_SET_MODE, 1);
   OUT_RING(ring, 0x0);

   fd6_emit_ccu_cntl<CHIP>(ring, screen, true);
}

static void
emit_msaa(struct fd_ringbuffer *ring, unsigned nr)
{
   enum a3xx_msaa_samples samples = fd_msaa_samples(nr);

   OUT_PKT4(ring, REG_A6XX_SP_TP_RAS_MSAA_CNTL, 2);
   OUT_RING(ring, A6XX_SP_TP_RAS_MSAA_CNTL_SAMPLES(samples));
   OUT_RING(ring, A6XX_SP_TP_DEST_MSAA_CNTL_SAMPLES(samples) |
                  COND(samples == MSAA_ONE,
                       A6XX_SP_TP_DEST_MSAA_CNTL_MSAA_DISABLE));

   OUT_PKT4(ring, REG_A6XX_GRAS_RAS_MSAA_CNTL, 2);
   OUT_RING(ring, A6XX_GRAS_RAS_MSAA_CNTL_SAMPLES(samples));
   OUT_RING(ring, A6XX_GRAS_DEST_MSAA_CNTL_SAMPLES(samples) |
                  COND(samples == MSAA_ONE,
                       A6XX_GRAS_DEST_MSAA_CNTL_MSAA_DISABLE));

   OUT_PKT4(ring, REG_A6XX_RB_RAS_MSAA_CNTL, 2);
   OUT_RING(ring, A6XX_RB_RAS_MSAA_CNTL_SAMPLES(samples));
   OUT_RING(ring,
            A6XX_RB_DEST_MSAA_CNTL_SAMPLES(samples) |
            COND(samples == MSAA_ONE, A6XX_RB_DEST_MSAA_CNTL_MSAA_DISABLE));

   OUT_PKT4(ring, REG_A6XX_RB_BLIT_GMEM_MSAA_CNTL, 1);
   OUT_RING(ring, A6XX_RB_BLIT_GMEM_MSAA_CNTL_SAMPLES(samples));
}

template <chip CHIP>
static void prepare_tile_setup(struct fd_batch *batch);
template <chip CHIP>
static void prepare_tile_fini(struct fd_batch *batch);

static void
fd7_emit_static_binning_regs(struct fd_ringbuffer *ring)
{
   OUT_REG(ring, A7XX_RB_UNKNOWN_8812(0x0));
   OUT_REG(ring, A7XX_RB_UNKNOWN_8E06(0x0));
   OUT_REG(ring, A7XX_GRAS_UNKNOWN_8007(0x0));
   OUT_REG(ring, A6XX_GRAS_UNKNOWN_8110(0x2));
   OUT_REG(ring, A7XX_RB_UNKNOWN_8E09(0x4));
   OUT_REG(ring, A7XX_RB_BLIT_CLEAR_MODE(.clear_mode = CLEAR_MODE_GMEM));
}

template <chip CHIP>
struct fd_ringbuffer *
fd6_build_preemption_preamble(struct fd_context *ctx)
{
   struct fd_screen *screen = ctx->screen;
   struct fd_ringbuffer *ring;

   ring = fd_ringbuffer_new_object(ctx->pipe, 0x1000);
   fd6_emit_static_regs<CHIP>(ctx, ring);
   fd6_emit_ccu_cntl<CHIP>(ring, screen, false);

   if (CHIP == A6XX) {
      OUT_REG(ring, A6XX_PC_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
      OUT_REG(ring, A6XX_VFD_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
   } else if (CHIP >= A7XX) {
      fd7_emit_static_binning_regs(ring);
   }

   /* TODO use CP_MEM_TO_SCRATCH_MEM on a7xx. The VSC scratch mem should be
    * automatically saved, unlike GPU registers, so we wouldn't have to
    * manually restore this state.
    */
   OUT_PKT7(ring, CP_MEM_TO_REG, 3);
   OUT_RING(ring, CP_MEM_TO_REG_0_REG(REG_A6XX_VSC_STATE(0)) |
                  CP_MEM_TO_REG_0_CNT(32));
   OUT_RELOC(ring, control_ptr(fd6_context(ctx), vsc_state));

   return ring;
}
FD_GENX(fd6_build_preemption_preamble);

/* before first tile */
template <chip CHIP>
static void
fd6_emit_tile_init(struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->gmem;
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;
   const struct fd_gmem_stateobj *gmem = batch->gmem_state;
   struct fd_screen *screen = batch->ctx->screen;

   emit_lrz_clears<CHIP>(batch);

   fd6_emit_restore<CHIP>(batch, ring);

   fd6_event_write<CHIP>(batch->ctx, ring, FD_LRZ_FLUSH);

   if (batch->prologue) {
      trace_start_prologue(&batch->trace, ring);
      fd6_emit_ib(ring, batch->prologue);
      trace_end_prologue(&batch->trace, ring);
   }

   fd6_cache_inv<CHIP>(batch->ctx, ring);

   prepare_tile_setup<CHIP>(batch);
   prepare_tile_fini<CHIP>(batch);

   OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   OUT_RING(ring, 0x0);

   /* blob controls "local" in IB2, but I think that is not required */
   OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_LOCAL, 1);
   OUT_RING(ring, 0x1);

   fd6_emit_ccu_cntl<CHIP>(ring, screen, true);

   emit_zs<CHIP>(batch->ctx, ring, pfb->zsbuf, batch->gmem_state);
   emit_mrt<CHIP>(ring, pfb, batch->gmem_state);
   emit_msaa(ring, pfb->samples);
   patch_fb_read_gmem(batch);

   if (CHIP >= A7XX)
      fd7_emit_static_binning_regs(ring);

   if (use_hw_binning(batch)) {
      /* enable stream-out during binning pass: */
      OUT_REG(ring, A6XX_VPC_SO_DISABLE(false));

      set_bin_size<CHIP>(ring, gmem, {
         .render_mode = BINNING_PASS,
         .buffers_location = BUFFERS_IN_GMEM,
         .lrz_feedback_zmode_mask = LRZ_FEEDBACK_NONE,
      });
      update_render_cntl<CHIP>(batch, pfb, true);
      emit_binning_pass<CHIP>(batch);

      /* and disable stream-out for draw pass: */
      OUT_REG(ring, A6XX_VPC_SO_DISABLE(true));

      /*
       * NOTE: even if we detect VSC overflow and disable use of
       * visibility stream in draw pass, it is still safe to execute
       * the rest of these cmds:
       */

      set_bin_size<CHIP>(ring, gmem, {
         .render_mode = RENDERING_PASS,
         .force_lrz_write_dis = !screen->info->a6xx.has_lrz_feedback,
         .buffers_location = BUFFERS_IN_GMEM,
         .lrz_feedback_zmode_mask = screen->info->a6xx.has_lrz_feedback
                                       ? LRZ_FEEDBACK_EARLY_LRZ_LATE_Z
                                       : LRZ_FEEDBACK_NONE,
      });

      OUT_REG(ring, A6XX_VFD_MODE_CNTL(RENDERING_PASS));

      if (CHIP == A6XX) {
         OUT_REG(ring, A6XX_PC_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
         OUT_REG(ring, A6XX_VFD_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
      }

      OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
      OUT_RING(ring, 0x1);

      /* Upload state regs to memory to be restored on skipsaverestore
       * preemption.
       */
      OUT_PKT7(ring, CP_REG_TO_MEM, 3);
      OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A6XX_VSC_STATE_REG(0)) |
                     CP_REG_TO_MEM_0_CNT(32));
      OUT_RELOC(ring, control_ptr(fd6_context(batch->ctx), vsc_state));
   } else {
      /* no binning pass, so enable stream-out for draw pass: */
      OUT_REG(ring, A6XX_VPC_SO_DISABLE(false));

      set_bin_size<CHIP>(ring, gmem, {
         .render_mode = RENDERING_PASS,
         .force_lrz_write_dis = !screen->info->a6xx.has_lrz_feedback,
         .buffers_location = BUFFERS_IN_GMEM,
         .lrz_feedback_zmode_mask =
            screen->info->a6xx.has_lrz_feedback
               ? LRZ_FEEDBACK_EARLY_Z_OR_EARLY_LRZ_LATE_Z
               : LRZ_FEEDBACK_NONE,
      });
   }

   update_render_cntl<CHIP>(batch, pfb, false);

   emit_common_init<CHIP>(batch);
}

template <chip CHIP>
static void
set_window_offset(struct fd_ringbuffer *ring, uint32_t x1, uint32_t y1)
{
   OUT_PKT4(ring, REG_A6XX_RB_WINDOW_OFFSET, 1);
   OUT_RING(ring, A6XX_RB_WINDOW_OFFSET_X(x1) | A6XX_RB_WINDOW_OFFSET_Y(y1));

   OUT_PKT4(ring, REG_A6XX_RB_WINDOW_OFFSET2, 1);
   OUT_RING(ring, A6XX_RB_WINDOW_OFFSET2_X(x1) | A6XX_RB_WINDOW_OFFSET2_Y(y1));

   OUT_REG(ring, SP_WINDOW_OFFSET(CHIP, .x = x1, .y = y1));

   OUT_PKT4(ring, REG_A6XX_SP_TP_WINDOW_OFFSET, 1);
   OUT_RING(ring,
            A6XX_SP_TP_WINDOW_OFFSET_X(x1) | A6XX_SP_TP_WINDOW_OFFSET_Y(y1));
}

/* before mem2gmem */
template <chip CHIP>
static void
fd6_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile)
{
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;
   struct fd_screen *screen = batch->ctx->screen;
   struct fd_context *ctx = batch->ctx;
   const struct fd_gmem_stateobj *gmem = batch->gmem_state;
   struct fd6_context *fd6_ctx = fd6_context(ctx);
   struct fd_ringbuffer *ring = batch->gmem;

   emit_marker6(ring, 7);
   OUT_PKT7(ring, CP_SET_MARKER, 1);
   OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RENDER_START) |
                  A6XX_CP_SET_MARKER_0_USES_GMEM);
   emit_marker6(ring, 7);

   uint32_t x1 = tile->xoff;
   uint32_t y1 = tile->yoff;
   uint32_t x2 = tile->xoff + tile->bin_w - 1;
   uint32_t y2 = tile->yoff + tile->bin_h - 1;

   set_scissor(ring, x1, y1, x2, y2);
   set_tessfactor_bo<CHIP>(ring, batch);

   fd6_emit_ccu_cntl<CHIP>(ring, screen, true);

   emit_zs<CHIP>(batch->ctx, ring, pfb->zsbuf, batch->gmem_state);
   emit_mrt<CHIP>(ring, pfb, batch->gmem_state);
   emit_msaa(ring, pfb->samples);

   if (use_hw_binning(batch)) {
      const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[tile->p];
      unsigned num_vsc_pipes = ctx->screen->info->num_vsc_pipes;

      OUT_PKT7(ring, CP_WAIT_FOR_ME, 0);

      OUT_PKT7(ring, CP_SET_MODE, 1);
      OUT_RING(ring, 0x0);

      OUT_PKT7(ring, CP_SET_BIN_DATA5, 7);
      OUT_RING(ring, CP_SET_BIN_DATA5_0_VSC_SIZE(pipe->w * pipe->h) |
                     CP_SET_BIN_DATA5_0_VSC_N(tile->n));
      OUT_RELOC(ring, fd6_ctx->vsc_draw_strm, /* per-pipe draw-stream address */
                (tile->p * fd6_ctx->vsc_draw_strm_pitch), 0, 0);
      OUT_RELOC(
         ring, fd6_ctx->vsc_draw_strm, /* VSC_DRAW_STRM_ADDRESS + (p * 4) */
         (tile->p * 4) + (num_vsc_pipes * fd6_ctx->vsc_draw_strm_pitch),
         0, 0);
      OUT_RELOC(ring, fd6_ctx->vsc_prim_strm,
                (tile->p * fd6_ctx->vsc_prim_strm_pitch), 0, 0);

      OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
      OUT_RING(ring, 0x0);

      /* and disable stream-out for draw pass: */
      OUT_REG(ring, A6XX_VPC_SO_DISABLE(true));

      /*
       * NOTE: even if we detect VSC overflow and disable use of
       * visibility stream in draw pass, it is still safe to execute
       * the rest of these cmds:
       */

      set_bin_size<CHIP>(ring, gmem, {
         .render_mode = RENDERING_PASS,
         .force_lrz_write_dis = !screen->info->a6xx.has_lrz_feedback,
         .buffers_location = BUFFERS_IN_GMEM,
         .lrz_feedback_zmode_mask = screen->info->a6xx.has_lrz_feedback
                                       ? LRZ_FEEDBACK_EARLY_LRZ_LATE_Z
                                       : LRZ_FEEDBACK_NONE,
      });

      OUT_REG(ring, A6XX_VFD_MODE_CNTL(RENDERING_PASS));

      if (CHIP == A6XX) {
         OUT_REG(ring, A6XX_PC_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
         OUT_REG(ring, A6XX_VFD_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
      }

      OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
      OUT_RING(ring, 0x1);

   } else {
      OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
      OUT_RING(ring, 0x1);

      /* no binning pass, so enable stream-out for draw pass: */
1327 OUT_REG(ring, A6XX_VPC_SO_DISABLE(false));
1328
1329 set_bin_size<CHIP>(ring, gmem, {
1330 .render_mode = RENDERING_PASS,
1331 .force_lrz_write_dis = !screen->info->a6xx.has_lrz_feedback,
1332 .buffers_location = BUFFERS_IN_GMEM,
1333 .lrz_feedback_zmode_mask =
1334 screen->info->a6xx.has_lrz_feedback
1335 ? LRZ_FEEDBACK_EARLY_Z_OR_EARLY_LRZ_LATE_Z
1336 : LRZ_FEEDBACK_NONE,
1337 });
1338 }
1339
1340 set_window_offset<CHIP>(ring, x1, y1);
1341
1342 set_bin_size<CHIP>(ring, gmem, {
1343 .render_mode = RENDERING_PASS,
1344 .force_lrz_write_dis = !ctx->screen->info->a6xx.has_lrz_feedback,
1345 .buffers_location = BUFFERS_IN_GMEM,
1346 .lrz_feedback_zmode_mask = ctx->screen->info->a6xx.has_lrz_feedback
1347 ? LRZ_FEEDBACK_EARLY_LRZ_LATE_Z
1348 : LRZ_FEEDBACK_NONE,
1349 });
1350
1351 OUT_PKT7(ring, CP_SET_MODE, 1);
1352 OUT_RING(ring, 0x0);
1353 }
1354
1355 static void
set_blit_scissor(struct fd_batch * batch,struct fd_ringbuffer * ring)1356 set_blit_scissor(struct fd_batch *batch, struct fd_ringbuffer *ring)
1357 {
1358 const struct pipe_framebuffer_state *pfb = &batch->framebuffer;
1359
1360 struct pipe_scissor_state blit_scissor;
1361
1362 blit_scissor.minx = 0;
1363 blit_scissor.miny = 0;
1364 blit_scissor.maxx = ALIGN(pfb->width, 16);
1365 blit_scissor.maxy = ALIGN(pfb->height, 4);
1366
1367 OUT_PKT4(ring, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
1368 OUT_RING(ring, A6XX_RB_BLIT_SCISSOR_TL_X(blit_scissor.minx) |
1369 A6XX_RB_BLIT_SCISSOR_TL_Y(blit_scissor.miny));
1370 OUT_RING(ring, A6XX_RB_BLIT_SCISSOR_BR_X(blit_scissor.maxx - 1) |
1371 A6XX_RB_BLIT_SCISSOR_BR_Y(blit_scissor.maxy - 1));
1372 }
1373
1374 template <chip CHIP>
1375 static void
emit_blit(struct fd_batch * batch,struct fd_ringbuffer * ring,uint32_t base,struct pipe_surface * psurf,bool stencil)1376 emit_blit(struct fd_batch *batch, struct fd_ringbuffer *ring, uint32_t base,
1377 struct pipe_surface *psurf, bool stencil)
1378 {
1379 struct fd_resource *rsc = fd_resource(psurf->texture);
1380 enum pipe_format pfmt = psurf->format;
1381 uint32_t offset;
1382 bool ubwc_enabled;
1383
1384 assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);
1385
1386 /* separate stencil case: */
1387 if (stencil) {
1388 rsc = rsc->stencil;
1389 pfmt = rsc->b.b.format;
1390 }
1391
1392 offset =
1393 fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer);
1394 ubwc_enabled = fd_resource_ubwc_enabled(rsc, psurf->u.tex.level);
1395
1396 assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);
1397
1398 enum a6xx_tile_mode tile_mode = (enum a6xx_tile_mode)
1399 fd_resource_tile_mode(&rsc->b.b, psurf->u.tex.level);
1400 enum a6xx_format format = fd6_color_format(pfmt, tile_mode);
1401 uint32_t stride = fd_resource_pitch(rsc, psurf->u.tex.level);
1402 uint32_t array_stride = fd_resource_layer_stride(rsc, psurf->u.tex.level);
1403 enum a3xx_color_swap swap =
1404 fd6_color_swap(pfmt, (enum a6xx_tile_mode)rsc->layout.tile_mode,
1405 false);
1406 enum a3xx_msaa_samples samples = fd_msaa_samples(rsc->b.b.nr_samples);
1407
1408 OUT_REG(ring,
1409 A6XX_RB_BLIT_DST_INFO(
1410 .tile_mode = tile_mode,
1411 .flags = ubwc_enabled,
1412 .samples = samples,
1413 .color_swap = swap,
1414 .color_format = format,
1415 ),
1416 A6XX_RB_BLIT_DST(.bo = rsc->bo, .bo_offset = offset),
1417 A6XX_RB_BLIT_DST_PITCH(stride),
1418 A6XX_RB_BLIT_DST_ARRAY_PITCH(array_stride));
1419
1420 OUT_REG(ring, A6XX_RB_BLIT_BASE_GMEM(.dword = base));
1421
1422 if (ubwc_enabled) {
1423 OUT_PKT4(ring, REG_A6XX_RB_BLIT_FLAG_DST, 3);
1424 fd6_emit_flag_reference(ring, rsc, psurf->u.tex.level,
1425 psurf->u.tex.first_layer);
1426 }
1427
1428 if (CHIP >= A7XX)
1429 OUT_REG(ring, A7XX_RB_BLIT_CLEAR_MODE(.clear_mode = CLEAR_MODE_GMEM));
1430
1431 fd6_emit_blit<CHIP>(batch->ctx, ring);
1432 }
1433
1434 template <chip CHIP>
1435 static void
emit_restore_blit(struct fd_batch * batch,struct fd_ringbuffer * ring,uint32_t base,struct pipe_surface * psurf,unsigned buffer)1436 emit_restore_blit(struct fd_batch *batch, struct fd_ringbuffer *ring,
1437 uint32_t base, struct pipe_surface *psurf, unsigned buffer)
1438 {
1439 bool stencil = (buffer == FD_BUFFER_STENCIL);
1440
1441 OUT_REG(ring,
1442 A6XX_RB_BLIT_INFO(
1443 .type = BLIT_EVENT_LOAD,
1444 .sample_0 = util_format_is_pure_integer(psurf->format),
1445 .depth = (buffer == FD_BUFFER_DEPTH),
1446 ),
1447 );
1448
1449 emit_blit<CHIP>(batch, ring, base, psurf, stencil);
1450 }
1451
1452 template <chip CHIP>
1453 static void
emit_subpass_clears(struct fd_batch * batch,struct fd_batch_subpass * subpass)1454 emit_subpass_clears(struct fd_batch *batch, struct fd_batch_subpass *subpass)
1455 {
1456 struct pipe_framebuffer_state *pfb = &batch->framebuffer;
1457 const struct fd_gmem_stateobj *gmem = batch->gmem_state;
1458 struct fd_ringbuffer *ring = subpass->subpass_clears;
1459 enum a3xx_msaa_samples samples = fd_msaa_samples(pfb->samples);
1460
1461 uint32_t buffers = subpass->fast_cleared;
1462
1463 if (buffers & PIPE_CLEAR_COLOR) {
1464
1465 for (int i = 0; i < pfb->nr_cbufs; i++) {
1466 union pipe_color_union *color = &subpass->clear_color[i];
1467 union util_color uc = {0};
1468
1469 if (!pfb->cbufs[i])
1470 continue;
1471
1472 if (!(buffers & (PIPE_CLEAR_COLOR0 << i)))
1473 continue;
1474
1475 enum pipe_format pfmt = pfb->cbufs[i]->format;
1476
1477 // XXX I think RB_CLEAR_COLOR_DWn wants to take into account SWAP??
1478 union pipe_color_union swapped;
1479 switch (fd6_color_swap(pfmt, TILE6_LINEAR, false)) {
1480 case WZYX:
1481 swapped.ui[0] = color->ui[0];
1482 swapped.ui[1] = color->ui[1];
1483 swapped.ui[2] = color->ui[2];
1484 swapped.ui[3] = color->ui[3];
1485 break;
1486 case WXYZ:
1487 swapped.ui[2] = color->ui[0];
1488 swapped.ui[1] = color->ui[1];
1489 swapped.ui[0] = color->ui[2];
1490 swapped.ui[3] = color->ui[3];
1491 break;
1492 case ZYXW:
1493 swapped.ui[3] = color->ui[0];
1494 swapped.ui[0] = color->ui[1];
1495 swapped.ui[1] = color->ui[2];
1496 swapped.ui[2] = color->ui[3];
1497 break;
1498 case XYZW:
1499 swapped.ui[3] = color->ui[0];
1500 swapped.ui[2] = color->ui[1];
1501 swapped.ui[1] = color->ui[2];
1502 swapped.ui[0] = color->ui[3];
1503 break;
1504 }
1505
1506 util_pack_color_union(pfmt, &uc, &swapped);
1507
1508 OUT_PKT4(ring, REG_A6XX_RB_BLIT_DST_INFO, 1);
1509 OUT_RING(ring,
1510 A6XX_RB_BLIT_DST_INFO_TILE_MODE(TILE6_LINEAR) |
1511 A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) |
1512 A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(fd6_color_format(pfmt, TILE6_LINEAR)));
1513
1514 OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1);
1515 OUT_RING(ring, A6XX_RB_BLIT_INFO_TYPE(BLIT_EVENT_CLEAR) |
1516 A6XX_RB_BLIT_INFO_CLEAR_MASK(0xf));
1517
1518 OUT_PKT4(ring, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
1519 OUT_RING(ring, gmem->cbuf_base[i]);
1520
1521 OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_88D0, 1);
1522 OUT_RING(ring, 0);
1523
1524 OUT_PKT4(ring, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
1525 OUT_RING(ring, uc.ui[0]);
1526 OUT_RING(ring, uc.ui[1]);
1527 OUT_RING(ring, uc.ui[2]);
1528 OUT_RING(ring, uc.ui[3]);
1529
1530 if (CHIP >= A7XX)
1531 OUT_REG(ring, A7XX_RB_BLIT_CLEAR_MODE(.clear_mode = CLEAR_MODE_GMEM));
1532
1533 fd6_emit_blit<CHIP>(batch->ctx, ring);
1534 }
1535 }
1536
1537 const bool has_depth = pfb->zsbuf;
1538 const bool has_separate_stencil =
1539 has_depth && fd_resource(pfb->zsbuf->texture)->stencil;
1540
1541 /* First clear depth or combined depth/stencil. */
1542 if ((has_depth && (buffers & PIPE_CLEAR_DEPTH)) ||
1543 (!has_separate_stencil && (buffers & PIPE_CLEAR_STENCIL))) {
1544 enum pipe_format pfmt = pfb->zsbuf->format;
1545 uint32_t clear_value;
1546 uint32_t mask = 0;
1547
1548 if (has_separate_stencil) {
1549 pfmt = util_format_get_depth_only(pfb->zsbuf->format);
1550 clear_value = util_pack_z(pfmt, subpass->clear_depth);
1551 } else {
1552 pfmt = pfb->zsbuf->format;
1553 clear_value =
1554 util_pack_z_stencil(pfmt, subpass->clear_depth, subpass->clear_stencil);
1555 }
1556
1557 if (buffers & PIPE_CLEAR_DEPTH)
1558 mask |= 0x1;
1559
1560 if (!has_separate_stencil && (buffers & PIPE_CLEAR_STENCIL))
1561 mask |= 0x2;
1562
1563 OUT_PKT4(ring, REG_A6XX_RB_BLIT_DST_INFO, 1);
1564 OUT_RING(ring,
1565 A6XX_RB_BLIT_DST_INFO_TILE_MODE(TILE6_LINEAR) |
1566 A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) |
1567 A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(fd6_color_format(pfmt, TILE6_LINEAR)));
1568
1569 OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1);
1570 OUT_RING(ring, A6XX_RB_BLIT_INFO_TYPE(BLIT_EVENT_CLEAR) |
1571 A6XX_RB_BLIT_INFO_DEPTH |
1572 A6XX_RB_BLIT_INFO_CLEAR_MASK(mask));
1573
1574 OUT_PKT4(ring, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
1575 OUT_RING(ring, gmem->zsbuf_base[0]);
1576
1577 OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_88D0, 1);
1578 OUT_RING(ring, 0);
1579
1580 OUT_PKT4(ring, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 1);
1581 OUT_RING(ring, clear_value);
1582
1583 fd6_emit_blit<CHIP>(batch->ctx, ring);
1584 }
1585
1586 /* Then clear the separate stencil buffer in case of 32 bit depth
1587 * formats with separate stencil. */
1588 if (has_separate_stencil && (buffers & PIPE_CLEAR_STENCIL)) {
1589 OUT_PKT4(ring, REG_A6XX_RB_BLIT_DST_INFO, 1);
1590 OUT_RING(ring, A6XX_RB_BLIT_DST_INFO_TILE_MODE(TILE6_LINEAR) |
1591 A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) |
1592 A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(FMT6_8_UINT));
1593
1594 OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1);
1595 OUT_RING(ring, A6XX_RB_BLIT_INFO_TYPE(BLIT_EVENT_CLEAR) |
1596 A6XX_RB_BLIT_INFO_DEPTH |
1597 A6XX_RB_BLIT_INFO_CLEAR_MASK(0x1));
1598
1599 OUT_PKT4(ring, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
1600 OUT_RING(ring, gmem->zsbuf_base[1]);
1601
1602 OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_88D0, 1);
1603 OUT_RING(ring, 0);
1604
1605 OUT_PKT4(ring, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 1);
1606 OUT_RING(ring, subpass->clear_stencil & 0xff);
1607
1608 fd6_emit_blit<CHIP>(batch->ctx, ring);
1609 }
1610 }
1611
1612 /*
1613 * transfer from system memory to gmem
1614 */
1615 template <chip CHIP>
1616 static void
emit_restore_blits(struct fd_batch * batch,struct fd_ringbuffer * ring)1617 emit_restore_blits(struct fd_batch *batch, struct fd_ringbuffer *ring)
1618 {
1619 const struct fd_gmem_stateobj *gmem = batch->gmem_state;
1620 struct pipe_framebuffer_state *pfb = &batch->framebuffer;
1621
1622 if (batch->restore & FD_BUFFER_COLOR) {
1623 unsigned i;
1624 for (i = 0; i < pfb->nr_cbufs; i++) {
1625 if (!pfb->cbufs[i])
1626 continue;
1627 if (!(batch->restore & (PIPE_CLEAR_COLOR0 << i)))
1628 continue;
1629 emit_restore_blit<CHIP>(batch, ring, gmem->cbuf_base[i], pfb->cbufs[i],
1630 FD_BUFFER_COLOR);
1631 }
1632 }
1633
1634 if (batch->restore & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) {
1635 struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
1636
1637 if (!rsc->stencil || (batch->restore & FD_BUFFER_DEPTH)) {
1638 emit_restore_blit<CHIP>(batch, ring, gmem->zsbuf_base[0], pfb->zsbuf,
1639 FD_BUFFER_DEPTH);
1640 }
1641 if (rsc->stencil && (batch->restore & FD_BUFFER_STENCIL)) {
1642 emit_restore_blit<CHIP>(batch, ring, gmem->zsbuf_base[1], pfb->zsbuf,
1643 FD_BUFFER_STENCIL);
1644 }
1645 }
1646 }
1647
1648 template <chip CHIP>
1649 static void
prepare_tile_setup(struct fd_batch * batch)1650 prepare_tile_setup(struct fd_batch *batch)
1651 {
1652 if (batch->restore) {
1653 batch->tile_loads =
1654 fd_submit_new_ringbuffer(batch->submit, 0x1000, FD_RINGBUFFER_STREAMING);
1655
1656 set_blit_scissor(batch, batch->tile_loads);
1657 emit_restore_blits<CHIP>(batch, batch->tile_loads);
1658 }
1659
1660 foreach_subpass (subpass, batch) {
1661 if (!subpass->fast_cleared)
1662 continue;
1663
1664 subpass->subpass_clears =
1665 fd_submit_new_ringbuffer(batch->submit, 0x1000, FD_RINGBUFFER_STREAMING);
1666
1667 set_blit_scissor(batch, subpass->subpass_clears);
1668 emit_subpass_clears<CHIP>(batch, subpass);
1669 }
1670 }
1671
/*
 * transfer from system memory to gmem
 */
static void
fd6_emit_tile_mem2gmem(struct fd_batch *batch, const struct fd_tile *tile)
{
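   /* Nothing to do here: the restore (mem2gmem) blits are recorded into
    * batch->tile_loads by prepare_tile_setup() and replayed per tile from
    * fd6_emit_tile_renderprep() below.
    */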
}

/* before IB to rendering cmds: */
static void
fd6_emit_tile_renderprep(struct fd_batch *batch, const struct fd_tile *tile)
{
   if (batch->tile_loads) {
      trace_start_tile_loads(&batch->trace, batch->gmem, batch->restore);
      emit_conditional_ib(batch, tile, batch->tile_loads);
      trace_end_tile_loads(&batch->trace, batch->gmem);
   }
}

static bool
blit_can_resolve(enum pipe_format format)
{
   const struct util_format_description *desc = util_format_description(format);

   /* blit event can only do resolve for simple cases:
    * averaging samples as unsigned integers, or choosing only one sample
    */
   if (util_format_is_snorm(format) || util_format_is_srgb(format))
      return false;

   /* can't do formats with larger channel sizes
    * note: this includes all float formats
    * note2: single channel integer formats seem OK
    */
   if (desc->channel[0].size > 10)
      return false;

   switch (format) {
   /* for unknown reasons the blit event can't MSAA resolve these formats
    * when tiled, likely related to these formats having a different layout
    * from other cpp=2 formats
    */
   case PIPE_FORMAT_R8G8_UNORM:
   case PIPE_FORMAT_R8G8_UINT:
   case PIPE_FORMAT_R8G8_SINT:
   case PIPE_FORMAT_R8G8_SRGB:
   /* TODO: this one should be able to work? */
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      return false;
   default:
      break;
   }

   return true;
}

static bool
needs_resolve(struct pipe_surface *psurf)
{
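   /* A non-zero psurf->nr_samples that differs from the texture's actual
    * sample count indicates an MSAA surface that must be downsampled on
    * store (gallium's way of expressing render-to-texture style resolves).
    */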
   return psurf->nr_samples &&
          (psurf->nr_samples != psurf->texture->nr_samples);
}

/**
 * Returns the UNKNOWN_8C01 value for handling partial depth/stencil
 * clear/stores to Z24S8.
 */
static uint32_t
fd6_unknown_8c01(enum pipe_format format, unsigned buffers)
{
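   /* Note: the exact meaning of these bits is not known (hence the register
    * name); empirically the values below restrict the blit to just the
    * depth or just the stencil component of packed Z24S8.
    */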
   buffers &= FD_BUFFER_DEPTH | FD_BUFFER_STENCIL;
   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
      if (buffers == FD_BUFFER_DEPTH)
         return 0x08000041;
      else if (buffers == FD_BUFFER_STENCIL)
         return 0x00084001;
   }
   return 0;
}

template <chip CHIP>
static void
emit_resolve_blit(struct fd_batch *batch, struct fd_ringbuffer *ring,
                  uint32_t base, struct pipe_surface *psurf,
                  unsigned buffer) assert_dt
{
   uint32_t info = 0;
   bool stencil = false;

   if (!fd_resource(psurf->texture)->valid)
      return;

   /* If we need to resolve, but cannot with the BLIT event, we instead need
    * to generate per-tile CP_BLIT (r2d) commands.
    *
    * Separate-stencil is a special case: we might need to use CP_BLIT for
    * depth, but we can still resolve stencil with a BLIT event.
    */
   if (needs_resolve(psurf) && !blit_can_resolve(psurf->format) &&
       (buffer != FD_BUFFER_STENCIL)) {
      /* We could potentially use fd6_unknown_8c01() to handle partial z/s
       * resolve to packed z/s, but we would need a corresponding ability in the
       * !resolve case below, so batch_draw_tracking_for_dirty_bits() has us
       * just do a restore of the other channel for partial packed z/s writes.
       */
      fd6_resolve_tile<CHIP>(batch, ring, base, psurf, 0);
      return;
   }

   switch (buffer) {
   case FD_BUFFER_COLOR:
      info = A6XX_RB_BLIT_INFO_TYPE(BLIT_EVENT_STORE);
      break;
   case FD_BUFFER_STENCIL:
      info = A6XX_RB_BLIT_INFO_TYPE(BLIT_EVENT_STORE_AND_CLEAR);
      stencil = true;
      break;
   case FD_BUFFER_DEPTH:
      info = A6XX_RB_BLIT_INFO_TYPE(BLIT_EVENT_STORE) | A6XX_RB_BLIT_INFO_DEPTH;
      break;
   }

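   /* Integer and depth/stencil formats cannot be meaningfully averaged, so
    * resolve by just taking sample 0 (see blit_can_resolve()).
    */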
   if (util_format_is_pure_integer(psurf->format) ||
       util_format_is_depth_or_stencil(psurf->format))
      info |= A6XX_RB_BLIT_INFO_SAMPLE_0;

   OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1);
   OUT_RING(ring, info);

   emit_blit<CHIP>(batch, ring, base, psurf, stencil);
}

/*
 * transfer from gmem to system memory (ie. normal RAM)
 */

template <chip CHIP>
static void
prepare_tile_fini(struct fd_batch *batch)
   assert_dt
{
   const struct fd_gmem_stateobj *gmem = batch->gmem_state;
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;
   struct fd_ringbuffer *ring;

   batch->tile_store =
      fd_submit_new_ringbuffer(batch->submit, 0x1000, FD_RINGBUFFER_STREAMING);
   ring = batch->tile_store;

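   /* The store blits are recorded once into batch->tile_store and then
    * replayed per tile via emit_conditional_ib() from
    * fd6_emit_tile_gmem2mem().
    */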
   set_blit_scissor(batch, ring);

   if (batch->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) {
      struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);

      if (!rsc->stencil || (batch->resolve & FD_BUFFER_DEPTH)) {
         emit_resolve_blit<CHIP>(batch, ring, gmem->zsbuf_base[0],
                                 pfb->zsbuf, FD_BUFFER_DEPTH);
      }
      if (rsc->stencil && (batch->resolve & FD_BUFFER_STENCIL)) {
         emit_resolve_blit<CHIP>(batch, ring, gmem->zsbuf_base[1],
                                 pfb->zsbuf, FD_BUFFER_STENCIL);
      }
   }

   if (batch->resolve & FD_BUFFER_COLOR) {
      unsigned i;
      for (i = 0; i < pfb->nr_cbufs; i++) {
         if (!pfb->cbufs[i])
            continue;
         if (!(batch->resolve & (PIPE_CLEAR_COLOR0 << i)))
            continue;
         emit_resolve_blit<CHIP>(batch, ring, gmem->cbuf_base[i],
                                 pfb->cbufs[i], FD_BUFFER_COLOR);
      }
   }
}

template <chip CHIP>
static void
fd6_emit_tile(struct fd_batch *batch, const struct fd_tile *tile)
{
   foreach_subpass (subpass, batch) {
      if (subpass->subpass_clears) {
         trace_start_clears(&batch->trace, batch->gmem, subpass->fast_cleared);
         emit_conditional_ib(batch, tile, subpass->subpass_clears);
         trace_end_clears(&batch->trace, batch->gmem);
      }

      emit_lrz<CHIP>(batch, subpass);

      fd6_emit_ib(batch->gmem, subpass->draw);
   }

   if (batch->tile_epilogue)
      fd6_emit_ib(batch->gmem, batch->tile_epilogue);
}

static void
fd6_emit_tile_gmem2mem(struct fd_batch *batch, const struct fd_tile *tile)
{
   struct fd_ringbuffer *ring = batch->gmem;

   if (batch->epilogue)
      fd6_emit_ib(batch->gmem, batch->epilogue);

   if (use_hw_binning(batch)) {
      OUT_PKT7(ring, CP_SET_MARKER, 1);
      OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_END_OF_DRAWS) |
                        A6XX_CP_SET_MARKER_0_USES_GMEM);
   }

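   /* Reset all draw-state groups, presumably so that no stale per-draw
    * state gets replayed while the resolve blits below execute.
    */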
   OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
   OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
                     CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
                     CP_SET_DRAW_STATE__0_GROUP_ID(0));
   OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
   OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));

   OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_LOCAL, 1);
   OUT_RING(ring, 0x0);

   emit_marker6(ring, 7);
   OUT_PKT7(ring, CP_SET_MARKER, 1);
   OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RESOLVE) |
                     A6XX_CP_SET_MARKER_0_USES_GMEM);
   emit_marker6(ring, 7);

   if (batch->tile_store) {
      trace_start_tile_stores(&batch->trace, batch->gmem, batch->resolve);
      emit_conditional_ib(batch, tile, batch->tile_store);
      trace_end_tile_stores(&batch->trace, batch->gmem);
   }

   OUT_PKT7(ring, CP_SET_MARKER, 1);
   OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RENDER_END));
}

template <chip CHIP>
static void
fd6_emit_tile_fini(struct fd_batch *batch)
{
   struct fd_ringbuffer *ring = batch->gmem;

   emit_common_fini<CHIP>(batch);

   OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1);
   OUT_RING(ring, A6XX_GRAS_LRZ_CNTL_ENABLE);

   fd6_event_write<CHIP>(batch->ctx, ring, FD_LRZ_FLUSH);
   fd6_event_write<CHIP>(batch->ctx, ring, FD_CCU_CLEAN_BLIT_CACHE);

   if (use_hw_binning(batch)) {
      check_vsc_overflow(batch->ctx);
   }
}

template <chip CHIP>
static void
emit_sysmem_clears(struct fd_batch *batch, struct fd_batch_subpass *subpass)
   assert_dt
{
   struct fd_context *ctx = batch->ctx;
   struct fd_ringbuffer *ring = batch->gmem;
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;

   uint32_t buffers = subpass->fast_cleared;

   if (!buffers)
      return;

   struct pipe_box box2d;
   u_box_2d(0, 0, pfb->width, pfb->height, &box2d);

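   /* In sysmem (bypass) mode there is no tile pass to fold the clears into,
    * so each attachment is cleared directly with a 2D blit covering the
    * whole framebuffer.
    */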
   trace_start_clears(&batch->trace, ring, buffers);

   if (buffers & PIPE_CLEAR_COLOR) {
      for (int i = 0; i < pfb->nr_cbufs; i++) {
         union pipe_color_union color = subpass->clear_color[i];

         if (!pfb->cbufs[i])
            continue;

         if (!(buffers & (PIPE_CLEAR_COLOR0 << i)))
            continue;

         fd6_clear_surface<CHIP>(ctx, ring, pfb->cbufs[i], &box2d, &color, 0);
      }
   }
   if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
      union pipe_color_union value = {};

      const bool has_depth = pfb->zsbuf;
      struct pipe_resource *separate_stencil =
         has_depth && fd_resource(pfb->zsbuf->texture)->stencil
            ? &fd_resource(pfb->zsbuf->texture)->stencil->b.b
            : NULL;

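      /* Depth (and stencil, when it lives in the same packed surface) is
       * cleared through the zsbuf; fd6_unknown_8c01() masks the write for
       * partial Z24S8 clears.
       */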
      if ((buffers & PIPE_CLEAR_DEPTH) ||
          (!separate_stencil && (buffers & PIPE_CLEAR_STENCIL))) {
         value.f[0] = subpass->clear_depth;
         value.ui[1] = subpass->clear_stencil;
         fd6_clear_surface<CHIP>(ctx, ring, pfb->zsbuf, &box2d, &value,
                                 fd6_unknown_8c01(pfb->zsbuf->format, buffers));
      }

      if (separate_stencil && (buffers & PIPE_CLEAR_STENCIL)) {
         value.ui[0] = subpass->clear_stencil;

         struct pipe_surface stencil_surf = *pfb->zsbuf;
         stencil_surf.format = PIPE_FORMAT_S8_UINT;
         stencil_surf.texture = separate_stencil;

         fd6_clear_surface<CHIP>(ctx, ring, &stencil_surf, &box2d, &value, 0);
      }
   }

   fd6_emit_flushes<CHIP>(ctx, ring,
                          FD6_FLUSH_CCU_COLOR | FD6_INVALIDATE_CCU_COLOR);

   trace_end_clears(&batch->trace, ring);
}

template <chip CHIP>
static void
fd6_emit_sysmem_prep(struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->gmem;

   emit_lrz_clears<CHIP>(batch);

   fd6_emit_restore<CHIP>(batch, ring);
   fd6_event_write<CHIP>(batch->ctx, ring, FD_LRZ_FLUSH);

   if (batch->prologue) {
      if (!batch->nondraw) {
         trace_start_prologue(&batch->trace, ring);
      }
      fd6_emit_ib(ring, batch->prologue);
      if (!batch->nondraw) {
         trace_end_prologue(&batch->trace, ring);
      }
   }

   /* remaining setup below here does not apply to blit/compute: */
   if (batch->nondraw)
      return;

   struct pipe_framebuffer_state *pfb = &batch->framebuffer;

   if (pfb->width > 0 && pfb->height > 0)
      set_scissor(ring, 0, 0, pfb->width - 1, pfb->height - 1);
   else
      set_scissor(ring, 0, 0, 0, 0);

   set_tessfactor_bo<CHIP>(ring, batch);
   set_window_offset<CHIP>(ring, 0, 0);

   set_bin_size<CHIP>(ring, NULL, {
      .render_mode = RENDERING_PASS,
      .buffers_location = BUFFERS_IN_SYSMEM,
   });

   if (CHIP >= A7XX) {
      OUT_REG(ring, A7XX_RB_UNKNOWN_8812(0x3ff)); /* all buffers in sysmem */
      OUT_REG(ring,
              A7XX_RB_UNKNOWN_8E06(batch->ctx->screen->info->a6xx.magic.RB_UNKNOWN_8E06));
      OUT_REG(ring, A7XX_GRAS_UNKNOWN_8007(0x0));
      OUT_REG(ring, A6XX_GRAS_UNKNOWN_8110(0x2));
      OUT_REG(ring, A7XX_RB_UNKNOWN_8E09(0x4));
   }

   emit_marker6(ring, 7);
   OUT_PKT7(ring, CP_SET_MARKER, 1);
   OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_DIRECT_RENDER));
   emit_marker6(ring, 7);

   OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   OUT_RING(ring, 0x0);

   /* blob controls "local" in IB2, but I think that is not required */
   OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_LOCAL, 1);
   OUT_RING(ring, 0x1);

   /* enable stream-out, with sysmem there is only one pass: */
   OUT_REG(ring, A6XX_VPC_SO_DISABLE(false));

   OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
   OUT_RING(ring, 0x1);

   emit_zs<CHIP>(batch->ctx, ring, pfb->zsbuf, NULL);
   emit_mrt<CHIP>(ring, pfb, NULL);
   emit_msaa(ring, pfb->samples);
   patch_fb_read_sysmem<CHIP>(batch);

   emit_common_init<CHIP>(batch);
}

template <chip CHIP>
static void
fd6_emit_sysmem(struct fd_batch *batch)
   assert_dt
{
   struct fd_ringbuffer *ring = batch->gmem;
   struct fd_screen *screen = batch->ctx->screen;

   foreach_subpass (subpass, batch) {
      if (subpass->fast_cleared) {
         unsigned flushes = 0;
         if (subpass->fast_cleared & FD_BUFFER_COLOR)
            flushes |= FD6_INVALIDATE_CCU_COLOR;
         if (subpass->fast_cleared & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL))
            flushes |= FD6_INVALIDATE_CCU_DEPTH;

         fd6_emit_flushes<CHIP>(batch->ctx, ring, flushes);
         emit_sysmem_clears<CHIP>(batch, subpass);
      }

      fd6_emit_ccu_cntl<CHIP>(ring, screen, false);

      struct pipe_framebuffer_state *pfb = &batch->framebuffer;
      update_render_cntl<CHIP>(batch, pfb, false);

      emit_lrz<CHIP>(batch, subpass);

      fd6_emit_ib(ring, subpass->draw);
   }
}

template <chip CHIP>
static void
fd6_emit_sysmem_fini(struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->gmem;

   emit_common_fini<CHIP>(batch);

   if (batch->tile_epilogue)
      fd6_emit_ib(batch->gmem, batch->tile_epilogue);

   if (batch->epilogue)
      fd6_emit_ib(batch->gmem, batch->epilogue);

   OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   OUT_RING(ring, 0x0);

   fd6_event_write<CHIP>(batch->ctx, ring, FD_LRZ_FLUSH);

   fd6_emit_flushes<CHIP>(batch->ctx, ring,
                          FD6_FLUSH_CCU_COLOR |
                          FD6_FLUSH_CCU_DEPTH);
}

template <chip CHIP>
void
fd6_gmem_init(struct pipe_context *pctx)
   disable_thread_safety_analysis
{
   struct fd_context *ctx = fd_context(pctx);

   ctx->emit_tile_init = fd6_emit_tile_init<CHIP>;
   ctx->emit_tile_prep = fd6_emit_tile_prep<CHIP>;
   ctx->emit_tile_mem2gmem = fd6_emit_tile_mem2gmem;
   ctx->emit_tile_renderprep = fd6_emit_tile_renderprep;
   ctx->emit_tile = fd6_emit_tile<CHIP>;
   ctx->emit_tile_gmem2mem = fd6_emit_tile_gmem2mem;
   ctx->emit_tile_fini = fd6_emit_tile_fini<CHIP>;
   ctx->emit_sysmem_prep = fd6_emit_sysmem_prep<CHIP>;
   ctx->emit_sysmem = fd6_emit_sysmem<CHIP>;
   ctx->emit_sysmem_fini = fd6_emit_sysmem_fini<CHIP>;
}
FD_GENX(fd6_gmem_init);