/*
 * Copyright © 2016 Rob Clark <robclark@freedesktop.org>
 * Copyright © 2018 Google, Inc.
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#define FD_BO_NO_HARDPIN 1

#include "pipe/p_state.h"
#include "util/format/u_format.h"
#include "util/u_helpers.h"
#include "util/u_memory.h"
#include "util/u_string.h"
#include "util/u_viewport.h"

#include "freedreno_query_hw.h"
#include "freedreno_resource.h"
#include "freedreno_state.h"
#include "freedreno_stompable_regs.h"
#include "freedreno_tracepoints.h"

#include "fd6_blend.h"
#include "fd6_const.h"
#include "fd6_context.h"
#include "fd6_compute.h"
#include "fd6_emit.h"
#include "fd6_image.h"
#include "fd6_pack.h"
#include "fd6_program.h"
#include "fd6_rasterizer.h"
#include "fd6_texture.h"
#include "fd6_zsa.h"

/* Helper to get tex stateobj.
 */
static struct fd_ringbuffer *
tex_state(struct fd_context *ctx, enum pipe_shader_type type)
   assert_dt
{
   if (ctx->tex[type].num_textures == 0)
      return NULL;

   return fd_ringbuffer_ref(fd6_texture_state(ctx, type)->stateobj);
}

static struct fd_ringbuffer *
build_vbo_state(struct fd6_emit *emit) assert_dt
{
   const struct fd_vertex_state *vtx = &emit->ctx->vtx;

   const unsigned cnt = vtx->vertexbuf.count;
   const unsigned dwords = cnt * 4;  /* per vbo: reg64 + one reg32 + pkt hdr */

   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      emit->ctx->batch->submit, 4 * dwords, FD_RINGBUFFER_STREAMING);

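   /* Each VBO needs 4 dwords in the cmdstream, matching the size
    * computed above: the pkt4 header plus the three VFD_FETCH[j]
    * payload dwords (64b BASE address and 32b SIZE) emitted below.
    */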
   for (int32_t j = 0; j < cnt; j++) {
      OUT_PKT4(ring, REG_A6XX_VFD_FETCH(j), 3);
      const struct pipe_vertex_buffer *vb = &vtx->vertexbuf.vb[j];
      struct fd_resource *rsc = fd_resource(vb->buffer.resource);
      if (rsc == NULL) {
         OUT_RING(ring, 0);
         OUT_RING(ring, 0);
         OUT_RING(ring, 0);
      } else {
         uint32_t off = vb->buffer_offset;
         uint32_t size = vb->buffer.resource->width0 - off;

         OUT_RELOC(ring, rsc->bo, off, 0, 0);
         OUT_RING(ring, size);       /* VFD_FETCH[j].SIZE */
      }
   }

   return ring;
}

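/* Pick the z-test mode for this draw: use the mode forced by the
 * program's lrz_mask if there is one; otherwise use late-z when the
 * depth test is disabled or when discard/alpha-test could interact
 * with z/s writes or active occlusion queries, and early-z (plus LRZ,
 * when valid) in the remaining cases.
 */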
static enum a6xx_ztest_mode
compute_ztest_mode(struct fd6_emit *emit, bool lrz_valid) assert_dt
{
   if (emit->prog->lrz_mask.z_mode != A6XX_INVALID_ZTEST)
      return emit->prog->lrz_mask.z_mode;

   struct fd_context *ctx = emit->ctx;
   struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa);
   const struct ir3_shader_variant *fs = emit->fs;

   if (!zsa->base.depth_enabled) {
      return A6XX_LATE_Z;
   } else if ((fs->has_kill || zsa->alpha_test) &&
              (zsa->writes_zs || ctx->occlusion_queries_active)) {
      /* If occlusion queries are active, we don't want to use EARLY_Z
       * since that will count samples that are discarded by fs
       *
       * I'm not entirely sure about the interaction with LRZ, since
       * that could discard samples that would otherwise only be
       * hidden by a later draw.
       */
      return lrz_valid ? A6XX_EARLY_LRZ_LATE_Z : A6XX_LATE_Z;
   } else {
      return A6XX_EARLY_Z;
   }
}

/**
 * Calculate normalized LRZ state based on zsa/prog/blend state, updating
 * the zsbuf's lrz state as necessary to detect the cases where we need
 * to invalidate lrz.
 */
static struct fd6_lrz_state
compute_lrz_state(struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
   struct fd6_lrz_state lrz;

   if (!pfb->zsbuf) {
      memset(&lrz, 0, sizeof(lrz));
      lrz.z_mode = compute_ztest_mode(emit, false);
      return lrz;
   }

   struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend);
   struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa);
   struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
   bool reads_dest = blend->reads_dest;

   lrz = zsa->lrz;

   lrz.val &= emit->prog->lrz_mask.val;

   /* normalize lrz state: */
   if (reads_dest || blend->base.alpha_to_coverage) {
      lrz.write = false;
   }

   /* Unwritten channels *that actually exist* are a form of blending
    * reading the dest from the PoV of LRZ, but the valid dst channels
    * aren't known when the blend CSO is constructed, so we need to
    * handle that here.
    */
   if (ctx->all_mrt_channel_mask & ~blend->all_mrt_write_mask) {
      lrz.write = false;
      reads_dest = true;
   }

   /* Writing depth with blend enabled means we need to invalidate LRZ,
    * because the written depth value could mean that a later draw with
    * depth enabled (where we would otherwise write LRZ) could have
    * fragments which don't pass the depth test due to this draw.  For
    * example, consider this sequence of draws, with depth mode GREATER:
    *
    *   draw A:
    *     z=0.1, fragments pass
    *   draw B:
    *     z=0.4, fragments pass
    *     blend enabled (LRZ write disabled)
    *     depth write enabled
    *   draw C:
    *     z=0.2, fragments don't pass
    *     blend disabled
    *     depth write enabled
    *
    * Normally looking at the state in draw C, we'd assume we could
    * enable LRZ write.  But this would cause early-z/lrz to discard
    * fragments from draw A which should be visible due to draw B.
    */
   if (reads_dest && zsa->writes_z && ctx->screen->driconf.conservative_lrz) {
      if (!zsa->perf_warn_blend && rsc->lrz_valid) {
         perf_debug_ctx(ctx, "Invalidating LRZ due to blend+depthwrite");
         zsa->perf_warn_blend = true;
      }
      rsc->lrz_valid = false;
   }

   /* If we change depthfunc direction, bail out on using LRZ.  The
    * LRZ buffer encodes a min/max depth value per block, but if
    * we switch from GT/GE <-> LT/LE, those values cannot be
    * interpreted properly.
    */
   if (zsa->base.depth_enabled && (rsc->lrz_direction != FD_LRZ_UNKNOWN) &&
       (rsc->lrz_direction != lrz.direction)) {
      if (!zsa->perf_warn_zdir && rsc->lrz_valid) {
         perf_debug_ctx(ctx, "Invalidating LRZ due to depth test direction change");
         zsa->perf_warn_zdir = true;
      }
      rsc->lrz_valid = false;
   }

   if (zsa->invalidate_lrz || !rsc->lrz_valid) {
      rsc->lrz_valid = false;
      memset(&lrz, 0, sizeof(lrz));
   }

   lrz.z_mode = compute_ztest_mode(emit, rsc->lrz_valid);

   /* Once we start writing to the real depth buffer, we lock in the
    * direction for LRZ.. if we have to skip a LRZ write for any
    * reason, it is still safe to have LRZ until there is a direction
    * reversal.  Prior to the reversal, since we disabled LRZ writes
    * in the "unsafe" cases, this just means that the LRZ test may
    * not early-discard some things that end up not passing a later
    * test (ie. be overly conservative).  But once you have a reversal
    * of direction, it is possible to increase/decrease the z value
    * to the point where the overly-conservative test is incorrect.
    */
   if (zsa->base.depth_writemask) {
      rsc->lrz_direction = lrz.direction;
   }

   return lrz;
}

template <chip CHIP>
static struct fd_ringbuffer *
build_lrz(struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct fd6_context *fd6_ctx = fd6_context(ctx);
   struct fd6_lrz_state lrz = compute_lrz_state(emit);

   /* If the LRZ state has not changed, we can skip the emit: */
   if (!ctx->last.dirty && (fd6_ctx->last.lrz.val == lrz.val))
      return NULL;

   fd6_ctx->last.lrz = lrz;

   unsigned ndwords = (CHIP >= A7XX) ? 10 : 8;
   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      ctx->batch->submit, ndwords * 4, FD_RINGBUFFER_STREAMING);

   if (CHIP >= A7XX) {
      OUT_REG(ring,
         A6XX_GRAS_LRZ_CNTL(
            .enable = lrz.enable,
            .lrz_write = lrz.write,
            .greater = lrz.direction == FD_LRZ_GREATER,
            .z_test_enable = lrz.test,
            .z_bounds_enable = lrz.z_bounds_enable,
         )
      );
      OUT_REG(ring,
         A7XX_GRAS_LRZ_CNTL2(
            .disable_on_wrong_dir = false,
            .fc_enable = false,
         )
      );
   } else {
      OUT_REG(ring,
         A6XX_GRAS_LRZ_CNTL(
            .enable = lrz.enable,
            .lrz_write = lrz.write,
            .greater = lrz.direction == FD_LRZ_GREATER,
            .fc_enable = false,
            .z_test_enable = lrz.test,
            .z_bounds_enable = lrz.z_bounds_enable,
            .disable_on_wrong_dir = false,
         )
      );
   }
   OUT_REG(ring, A6XX_RB_LRZ_CNTL(.enable = lrz.enable, ));

   OUT_REG(ring, A6XX_RB_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, ));

   OUT_REG(ring, A6XX_GRAS_SU_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, ));

   return ring;
}

static struct fd_ringbuffer *
build_scissor(struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_scissor_state *scissors = fd_context_get_scissor(ctx);
   unsigned num_viewports = emit->prog->num_viewports;

   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      emit->ctx->batch->submit, (1 + (2 * num_viewports)) * 4, FD_RINGBUFFER_STREAMING);

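   /* One pkt4 header plus a TL/BR register pair per viewport, matching
    * the (1 + 2 * num_viewports) dwords allocated above:
    */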
   OUT_PKT4(ring, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), 2 * num_viewports);
   for (unsigned i = 0; i < num_viewports; i++) {
      OUT_RING(ring, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(scissors[i].minx) |
               A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(scissors[i].miny));
      OUT_RING(ring, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(scissors[i].maxx) |
               A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(scissors[i].maxy));
   }

   return ring;
}

/* Combination of FD_DIRTY_FRAMEBUFFER | FD_DIRTY_RASTERIZER_DISCARD |
 * FD_DIRTY_PROG | FD_DIRTY_DUAL_BLEND
 */
static struct fd_ringbuffer *
build_prog_fb_rast(struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
   const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
   const struct ir3_shader_variant *fs = emit->fs;

   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      ctx->batch->submit, 9 * 4, FD_RINGBUFFER_STREAMING);

   unsigned nr = pfb->nr_cbufs;

   if (ctx->rasterizer->rasterizer_discard)
      nr = 0;

   struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend);

   if (blend->use_dual_src_blend)
      nr++;

   OUT_PKT4(ring, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2);
   OUT_RING(ring, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
                     COND(fs->writes_smask && pfb->samples > 1,
                          A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
                     COND(fs->writes_stencilref,
                          A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
                     COND(blend->use_dual_src_blend,
                          A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
   OUT_RING(ring, A6XX_RB_FS_OUTPUT_CNTL1_MRT(nr));

   OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL1, 1);
   OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL1_MRT(nr));

   unsigned mrt_components = 0;
   for (unsigned i = 0; i < pfb->nr_cbufs; i++) {
      if (!pfb->cbufs[i])
         continue;
      mrt_components |= 0xf << (i * 4);
   }

   /* dual source blending has an extra fs output in the 2nd slot */
   if (blend->use_dual_src_blend)
      mrt_components |= 0xf << 4;

   mrt_components &= prog->mrt_components;

   OUT_REG(ring, A6XX_SP_FS_RENDER_COMPONENTS(.dword = mrt_components));
   OUT_REG(ring, A6XX_RB_RENDER_COMPONENTS(.dword = mrt_components));

   return ring;
}

static struct fd_ringbuffer *
build_blend_color(struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_blend_color *bcolor = &ctx->blend_color;
   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      ctx->batch->submit, 5 * 4, FD_RINGBUFFER_STREAMING);

   OUT_REG(ring, A6XX_RB_BLEND_RED_F32(bcolor->color[0]),
           A6XX_RB_BLEND_GREEN_F32(bcolor->color[1]),
           A6XX_RB_BLEND_BLUE_F32(bcolor->color[2]),
           A6XX_RB_BLEND_ALPHA_F32(bcolor->color[3]));

   return ring;
}

static struct fd_ringbuffer *
build_sample_locations(struct fd6_emit *emit)
   assert_dt
{
   struct fd_context *ctx = emit->ctx;

   if (!ctx->sample_locations_enabled) {
      struct fd6_context *fd6_ctx = fd6_context(ctx);
      return fd_ringbuffer_ref(fd6_ctx->sample_locations_disable_stateobj);
   }

   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      ctx->batch->submit, 9 * 4, FD_RINGBUFFER_STREAMING);

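   /* Sample positions are packed one byte per sample, x in the low
    * nibble and y in the high nibble, in 1/16th pixel units.  The y
    * coordinate is flipped, and both are clamped to 15/16 to keep the
    * position inside the pixel:
    */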
   uint32_t sample_locations = 0;
   for (int i = 0; i < 4; i++) {
      float x = (ctx->sample_locations[i] & 0xf) / 16.0f;
      float y = (16 - (ctx->sample_locations[i] >> 4)) / 16.0f;

      x = CLAMP(x, 0.0f, 0.9375f);
      y = CLAMP(y, 0.0f, 0.9375f);

      sample_locations |=
         (A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(x) |
          A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(y)) << i*8;
   }

   OUT_REG(ring, A6XX_GRAS_SAMPLE_CONFIG(.location_enable = true),
                 A6XX_GRAS_SAMPLE_LOCATION_0(.dword = sample_locations));

   OUT_REG(ring, A6XX_RB_SAMPLE_CONFIG(.location_enable = true),
                 A6XX_RB_SAMPLE_LOCATION_0(.dword = sample_locations));

   OUT_REG(ring, A6XX_SP_TP_SAMPLE_CONFIG(.location_enable = true),
                 A6XX_SP_TP_SAMPLE_LOCATION_0(.dword = sample_locations));

   return ring;
}

template <chip CHIP>
static void
fd6_emit_streamout(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
   const struct ir3_stream_output_info *info = prog->stream_output;
   struct fd_streamout_stateobj *so = &ctx->streamout;
   unsigned streamout_mask = 0;

   if (!info)
      return;

   for (unsigned i = 0; i < so->num_targets; i++) {
      struct fd_stream_output_target *target =
         fd_stream_output_target(so->targets[i]);

      if (!target)
         continue;

      target->stride = info->stride[i];

      OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_BASE(i), 3);
      /* VPC_SO[i].BUFFER_BASE_LO: */
      OUT_RELOC(ring, fd_resource(target->base.buffer)->bo, 0, 0, 0);
      OUT_RING(ring, target->base.buffer_size + target->base.buffer_offset);

      struct fd_bo *offset_bo = fd_resource(target->offset_buf)->bo;

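      /* If the target was just (re)bound, reset the HW offset back to
       * the starting buffer_offset, otherwise restore the offset that
       * the HW saved to offset_bo after the previous draw:
       */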
      if (so->reset & (1 << i)) {
         assert(so->offsets[i] == 0);

         OUT_PKT7(ring, CP_MEM_WRITE, 3);
         OUT_RELOC(ring, offset_bo, 0, 0, 0);
         OUT_RING(ring, target->base.buffer_offset);

         OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_OFFSET(i), 1);
         OUT_RING(ring, target->base.buffer_offset);
      } else {
         OUT_PKT7(ring, CP_MEM_TO_REG, 3);
         OUT_RING(ring, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) |
                           COND(CHIP == A6XX, CP_MEM_TO_REG_0_SHIFT_BY_2) |
                           CP_MEM_TO_REG_0_UNK31 |
                           CP_MEM_TO_REG_0_CNT(0));
         OUT_RELOC(ring, offset_bo, 0, 0, 0);
      }

      // After a draw, the HW writes the new offset to offset_bo
      OUT_PKT4(ring, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2);
      OUT_RELOC(ring, offset_bo, 0, 0, 0);

      so->reset &= ~(1 << i);

      streamout_mask |= (1 << i);
   }

   if (streamout_mask) {
      fd6_state_add_group(&emit->state, prog->streamout_stateobj, FD6_GROUP_SO);
   } else if (ctx->last.streamout_mask != 0) {
      /* If we transition from a draw with streamout to one without, turn
       * off streamout.
       */
      fd6_state_add_group(&emit->state, fd6_context(ctx)->streamout_disable_stateobj,
                         FD6_GROUP_SO);
   }

   /* Make sure that any use of our TFB outputs (indirect draw source or shader
    * UBO reads) comes after the TFB output is written.  From the GL 4.6 core
    * spec:
    *
    *     "Buffers should not be bound or in use for both transform feedback and
    *      other purposes in the GL.  Specifically, if a buffer object is
    *      simultaneously bound to a transform feedback buffer binding point
    *      and elsewhere in the GL, any writes to or reads from the buffer
    *      generate undefined values."
    *
    * So we idle whenever SO buffers change.  Note that this function is called
    * on every draw with TFB enabled, so check the dirty flag for the buffers
    * themselves.
    */
   if (ctx->dirty & FD_DIRTY_STREAMOUT)
      OUT_WFI5(ring);

   ctx->last.streamout_mask = streamout_mask;
   emit->streamout_mask = streamout_mask;
}

/**
 * Stuff that changes less frequently and isn't (yet) moved into stategroups
 */
static void
fd6_emit_non_ring(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   const enum fd_dirty_3d_state dirty = ctx->dirty;
   unsigned num_viewports = emit->prog->num_viewports;

   if (dirty & FD_DIRTY_STENCIL_REF) {
      struct pipe_stencil_ref *sr = &ctx->stencil_ref;

      OUT_PKT4(ring, REG_A6XX_RB_STENCILREF, 1);
      OUT_RING(ring, A6XX_RB_STENCILREF_REF(sr->ref_value[0]) |
                        A6XX_RB_STENCILREF_BFREF(sr->ref_value[1]));
   }

   if (dirty & (FD_DIRTY_VIEWPORT | FD_DIRTY_PROG)) {
      for (unsigned i = 0; i < num_viewports; i++) {
         struct pipe_scissor_state *scissor = &ctx->viewport_scissor[i];
         struct pipe_viewport_state *vp = &ctx->viewport[i];

         OUT_REG(ring, A6XX_GRAS_CL_VPORT_XOFFSET(i, vp->translate[0]),
                 A6XX_GRAS_CL_VPORT_XSCALE(i, vp->scale[0]),
                 A6XX_GRAS_CL_VPORT_YOFFSET(i, vp->translate[1]),
                 A6XX_GRAS_CL_VPORT_YSCALE(i, vp->scale[1]),
                 A6XX_GRAS_CL_VPORT_ZOFFSET(i, vp->translate[2]),
                 A6XX_GRAS_CL_VPORT_ZSCALE(i, vp->scale[2]));

         OUT_REG(
               ring,
               A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(i,
                                                .x = scissor->minx,
                                                .y = scissor->miny),
               A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(i,
                                                .x = scissor->maxx,
                                                .y = scissor->maxy));
      }

      OUT_REG(ring, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ(.horz = ctx->guardband.x,
                                                    .vert = ctx->guardband.y));
   }

   /* The clamp ranges are only used when the rasterizer wants depth
    * clamping.
    */
   if ((dirty & (FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) &&
       fd_depth_clamp_enabled(ctx)) {
      for (unsigned i = 0; i < num_viewports; i++) {
         struct pipe_viewport_state *vp = &ctx->viewport[i];
         float zmin, zmax;

         util_viewport_zmin_zmax(vp, ctx->rasterizer->clip_halfz,
                                 &zmin, &zmax);

         OUT_REG(ring, A6XX_GRAS_CL_Z_CLAMP_MIN(i, zmin),
                 A6XX_GRAS_CL_Z_CLAMP_MAX(i, zmax));

         /* TODO: what to do about this and multi viewport? */
         if (i == 0)
            OUT_REG(ring, A6XX_RB_Z_CLAMP_MIN(zmin), A6XX_RB_Z_CLAMP_MAX(zmax));
      }
   }
}

static struct fd_ringbuffer*
build_prim_mode(struct fd6_emit *emit, struct fd_context *ctx, bool gmem)
   assert_dt
{
   struct fd_ringbuffer *ring =
      fd_submit_new_ringbuffer(emit->ctx->batch->submit, 2 * 4, FD_RINGBUFFER_STREAMING);
   uint32_t prim_mode = NO_FLUSH;
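   /* Only draws using FB-fetch need overlap flushing.  When rendering
    * to GMEM, a flush per overlap is only needed for coherent fetches;
    * direct-to-sysmem rendering uses the heavier
    * FLUSH_PER_OVERLAP_AND_OVERWRITE mode:
    */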
   if (emit->fs->fs.uses_fbfetch_output) {
      if (gmem) {
         prim_mode = (ctx->blend->blend_coherent || emit->fs->fs.fbfetch_coherent)
            ? FLUSH_PER_OVERLAP : NO_FLUSH;
      } else {
         prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE;
      }
   } else {
      prim_mode = NO_FLUSH;
   }
   OUT_REG(ring, A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2,
                                   .single_prim_mode = (enum a6xx_single_prim_mode)prim_mode));
   return ring;
}

template <chip CHIP, fd6_pipeline_type PIPELINE>
void
fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit)
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
   const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
   const struct ir3_shader_variant *fs = emit->fs;

   emit_marker6(ring, 5);

   /* Special case, we need to re-emit bindless FS state w/ the
    * fb-read state appended:
    */
   if ((emit->dirty_groups & BIT(FD6_GROUP_PROG)) && fs->fb_read) {
      ctx->batch->gmem_reason |= FD_GMEM_FB_READ;
      emit->dirty_groups |= BIT(FD6_GROUP_FS_BINDLESS);
   }

   u_foreach_bit (b, emit->dirty_groups) {
      enum fd6_state_id group = (enum fd6_state_id)b;
      struct fd_ringbuffer *state = NULL;

      switch (group) {
      case FD6_GROUP_VTXSTATE:
         state = fd6_vertex_stateobj(ctx->vtx.vtx)->stateobj;
         fd6_state_add_group(&emit->state, state, FD6_GROUP_VTXSTATE);
         break;
      case FD6_GROUP_VBO:
         state = build_vbo_state(emit);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_VBO);
         break;
      case FD6_GROUP_ZSA:
         state = fd6_zsa_state(
            ctx,
            util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0])),
            fd_depth_clamp_enabled(ctx));
         fd6_state_add_group(&emit->state, state, FD6_GROUP_ZSA);
         break;
      case FD6_GROUP_LRZ:
         state = build_lrz<CHIP>(emit);
         if (state)
            fd6_state_take_group(&emit->state, state, FD6_GROUP_LRZ);
         break;
      case FD6_GROUP_SCISSOR:
         state = build_scissor(emit);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_SCISSOR);
         break;
      case FD6_GROUP_PROG:
         fd6_state_add_group(&emit->state, prog->config_stateobj,
                             FD6_GROUP_PROG_CONFIG);
         fd6_state_add_group(&emit->state, prog->stateobj, FD6_GROUP_PROG);
         fd6_state_add_group(&emit->state, prog->binning_stateobj,
                             FD6_GROUP_PROG_BINNING);

         /* emit remaining streaming program state, ie. what depends on
          * other emit state, so cannot be pre-baked.
          */
         fd6_state_take_group(&emit->state, fd6_program_interp_state(emit),
                              FD6_GROUP_PROG_INTERP);
         break;
      case FD6_GROUP_RASTERIZER:
         state = fd6_rasterizer_state<CHIP>(ctx, emit->primitive_restart);
         fd6_state_add_group(&emit->state, state, FD6_GROUP_RASTERIZER);
         break;
      case FD6_GROUP_PROG_FB_RAST:
         state = build_prog_fb_rast(emit);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_PROG_FB_RAST);
         break;
      case FD6_GROUP_BLEND:
         state = fd6_blend_variant<CHIP>(ctx->blend, pfb->samples, ctx->sample_mask)
                    ->stateobj;
         fd6_state_add_group(&emit->state, state, FD6_GROUP_BLEND);
         break;
      case FD6_GROUP_BLEND_COLOR:
         state = build_blend_color(emit);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_BLEND_COLOR);
         break;
      case FD6_GROUP_SAMPLE_LOCATIONS:
         state = build_sample_locations(emit);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_SAMPLE_LOCATIONS);
         break;
      case FD6_GROUP_VS_BINDLESS:
         state = fd6_build_bindless_state<CHIP>(ctx, PIPE_SHADER_VERTEX, false);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_VS_BINDLESS);
         break;
      case FD6_GROUP_HS_BINDLESS:
         state = fd6_build_bindless_state<CHIP>(ctx, PIPE_SHADER_TESS_CTRL, false);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_HS_BINDLESS);
         break;
      case FD6_GROUP_DS_BINDLESS:
         state = fd6_build_bindless_state<CHIP>(ctx, PIPE_SHADER_TESS_EVAL, false);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_DS_BINDLESS);
         break;
      case FD6_GROUP_GS_BINDLESS:
         state = fd6_build_bindless_state<CHIP>(ctx, PIPE_SHADER_GEOMETRY, false);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_GS_BINDLESS);
         break;
      case FD6_GROUP_FS_BINDLESS:
         state = fd6_build_bindless_state<CHIP>(ctx, PIPE_SHADER_FRAGMENT, fs->fb_read);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_FS_BINDLESS);
         break;
      case FD6_GROUP_CONST:
         state = fd6_build_user_consts<CHIP, PIPELINE>(emit);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_CONST);
         break;
      case FD6_GROUP_DRIVER_PARAMS:
         state = fd6_build_driver_params<CHIP, PIPELINE>(emit);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_DRIVER_PARAMS);
         break;
      case FD6_GROUP_PRIMITIVE_PARAMS:
         if (PIPELINE == HAS_TESS_GS) {
            state = fd6_build_tess_consts<CHIP>(emit);
            fd6_state_take_group(&emit->state, state, FD6_GROUP_PRIMITIVE_PARAMS);
         }
         break;
      case FD6_GROUP_VS_TEX:
         state = tex_state(ctx, PIPE_SHADER_VERTEX);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_VS_TEX);
         break;
      case FD6_GROUP_HS_TEX:
         state = tex_state(ctx, PIPE_SHADER_TESS_CTRL);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_HS_TEX);
         break;
      case FD6_GROUP_DS_TEX:
         state = tex_state(ctx, PIPE_SHADER_TESS_EVAL);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_DS_TEX);
         break;
      case FD6_GROUP_GS_TEX:
         state = tex_state(ctx, PIPE_SHADER_GEOMETRY);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_GS_TEX);
         break;
      case FD6_GROUP_FS_TEX:
         state = tex_state(ctx, PIPE_SHADER_FRAGMENT);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_FS_TEX);
         break;
      case FD6_GROUP_SO:
         fd6_emit_streamout<CHIP>(ring, emit);
         break;
      case FD6_GROUP_PRIM_MODE_SYSMEM:
         state = build_prim_mode(emit, ctx, false);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_PRIM_MODE_SYSMEM);
         break;
      case FD6_GROUP_PRIM_MODE_GMEM:
         state = build_prim_mode(emit, ctx, true);
         fd6_state_take_group(&emit->state, state, FD6_GROUP_PRIM_MODE_GMEM);
         break;
      case FD6_GROUP_NON_GROUP:
         fd6_emit_non_ring(ring, emit);
         break;
      default:
         break;
      }
   }

   fd6_state_emit(&emit->state, ring);
}

template void fd6_emit_3d_state<A6XX, NO_TESS_GS>(struct fd_ringbuffer *ring, struct fd6_emit *emit);
template void fd6_emit_3d_state<A7XX, NO_TESS_GS>(struct fd_ringbuffer *ring, struct fd6_emit *emit);
template void fd6_emit_3d_state<A6XX, HAS_TESS_GS>(struct fd_ringbuffer *ring, struct fd6_emit *emit);
template void fd6_emit_3d_state<A7XX, HAS_TESS_GS>(struct fd_ringbuffer *ring, struct fd6_emit *emit);

template <chip CHIP>
void
fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
                  struct fd6_compute_state *cs)
{
   struct fd6_state state = {};

   /* We want CP_SET_DRAW_STATE to execute immediately, otherwise we need to
    * emit consts as draw state groups (which otherwise has no benefit outside
    * of GMEM 3d using viz stream from binning pass).
    *
    * In particular, the PROG state group sets up the configuration for the
    * const state, so it must execute before we start loading consts, rather
    * than be deferred until CP_EXEC_CS.
    */
   OUT_PKT7(ring, CP_SET_MODE, 1);
   OUT_RING(ring, 1);

   uint32_t gen_dirty = ctx->gen_dirty &
         (BIT(FD6_GROUP_PROG) | BIT(FD6_GROUP_CS_TEX) | BIT(FD6_GROUP_CS_BINDLESS));

   u_foreach_bit (b, gen_dirty) {
      enum fd6_state_id group = (enum fd6_state_id)b;

      switch (group) {
      case FD6_GROUP_PROG:
         fd6_state_add_group(&state, cs->stateobj, FD6_GROUP_PROG);
         break;
      case FD6_GROUP_CS_TEX:
         fd6_state_take_group(
               &state,
               tex_state(ctx, PIPE_SHADER_COMPUTE),
               FD6_GROUP_CS_TEX);
         break;
      case FD6_GROUP_CS_BINDLESS:
         fd6_state_take_group(
               &state,
               fd6_build_bindless_state<CHIP>(ctx, PIPE_SHADER_COMPUTE, false),
               FD6_GROUP_CS_BINDLESS);
         break;
      default:
         /* State-group unused for compute shaders */
         break;
      }
   }

   fd6_state_emit(&state, ring);
}
FD_GENX(fd6_emit_cs_state);

template <chip CHIP>
void
fd6_emit_ccu_cntl(struct fd_ringbuffer *ring, struct fd_screen *screen, bool gmem)
{
   const struct fd6_gmem_config *cfg = gmem ? &screen->config_gmem : &screen->config_sysmem;
   enum a6xx_ccu_cache_size color_cache_size = !gmem ? CCU_CACHE_SIZE_FULL :
      (enum a6xx_ccu_cache_size)(screen->info->a6xx.gmem_ccu_color_cache_fraction);
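   /* The CCU offsets into GMEM are wider than the register fields that
    * hold them, so they are split into the low 21 bits and a *_offset_hi
    * part:
    */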
   uint32_t color_offset = cfg->color_ccu_offset & 0x1fffff;
   uint32_t color_offset_hi = cfg->color_ccu_offset >> 21;

   uint32_t depth_offset = cfg->depth_ccu_offset & 0x1fffff;
   uint32_t depth_offset_hi = cfg->depth_ccu_offset >> 21;

   if (CHIP == A7XX) {
      OUT_REG(ring,
         A7XX_RB_CCU_CNTL2(
            .depth_offset_hi = depth_offset_hi,
            .color_offset_hi = color_offset_hi,
            .depth_cache_size = CCU_CACHE_SIZE_FULL,
            .depth_offset = depth_offset,
            .color_cache_size = color_cache_size,
            .color_offset = color_offset,
         )
      );

      if (screen->info->a7xx.has_gmem_vpc_attr_buf) {
         OUT_REG(ring,
            A7XX_VPC_ATTR_BUF_SIZE_GMEM(.size_gmem = cfg->vpc_attr_buf_size),
            A7XX_VPC_ATTR_BUF_BASE_GMEM(.base_gmem = cfg->vpc_attr_buf_offset)
         );
         OUT_REG(ring,
            A7XX_PC_ATTR_BUF_SIZE_GMEM(.size_gmem = cfg->vpc_attr_buf_size)
         );
      }
   } else {
      OUT_WFI5(ring);   /* early a6xx (a630?) needed this */

      OUT_REG(ring,
         RB_CCU_CNTL(
            CHIP,
            .gmem_fast_clear_disable =
               !screen->info->a6xx.has_gmem_fast_clear,
            .concurrent_resolve =
               screen->info->a6xx.concurrent_resolve,
            .depth_offset_hi = depth_offset_hi,
            .color_offset_hi = color_offset_hi,
            .depth_cache_size = CCU_CACHE_SIZE_FULL,
            .depth_offset = depth_offset,
            .color_cache_size = color_cache_size,
            .color_offset = color_offset,
         )
      );
   }
}
FD_GENX(fd6_emit_ccu_cntl);

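/* Debug helper (used with FD_DBG(STOMP)) that scribbles 0xffffffff over
 * every register we believe is safe to stomp, to flush out state that
 * we should be emitting but aren't:
 */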
template <chip CHIP>
static void
fd6_emit_stomp(struct fd_ringbuffer *ring, const uint16_t *regs, size_t count)
{
   for (size_t i = 0; i < count; i++) {
      if (fd_reg_stomp_allowed(CHIP, regs[i])) {
         WRITE(regs[i], 0xffffffff);
      }
   }
}

template <chip CHIP>
void
fd6_emit_static_regs(struct fd_context *ctx, struct fd_ringbuffer *ring)
{
   struct fd_screen *screen = ctx->screen;

   if (CHIP >= A7XX) {
      /* On A7XX, RB_CCU_CNTL was split into two registers: RB_CCU_CNTL
       * holds static properties that can be set once, and requires a WFI
       * to take effect, while the newly introduced RB_CCU_CNTL2 holds
       * properties that may change per-RP and doesn't require a WFI,
       * only CCU inval/flush events.
       */
      OUT_REG(ring,
         RB_CCU_CNTL(
            CHIP,
            .gmem_fast_clear_disable = true, // !screen->info->a6xx.has_gmem_fast_clear,
            .concurrent_resolve = screen->info->a6xx.concurrent_resolve,
         )
      );
   }

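   /* Apply the per-GPU "magic" raw register writes from the device info
    * table, patching TPL1_DBG_ECO_CNTL1 to reflect whether the TP UBWC
    * flag hint should be enabled for this GPU:
    */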
   for (size_t i = 0; i < ARRAY_SIZE(screen->info->a6xx.magic_raw); i++) {
      auto magic_reg = screen->info->a6xx.magic_raw[i];
      if (!magic_reg.reg)
         break;

      uint32_t value = magic_reg.value;
      switch (magic_reg.reg) {
         case REG_A6XX_TPL1_DBG_ECO_CNTL1:
            value = (value & ~A6XX_TPL1_DBG_ECO_CNTL1_TP_UBWC_FLAG_HINT) |
                    (screen->info->a7xx.enable_tp_ubwc_flag_hint
                        ? A6XX_TPL1_DBG_ECO_CNTL1_TP_UBWC_FLAG_HINT
                        : 0);
            break;
      }

      WRITE(magic_reg.reg, value);
   }

   WRITE(REG_A6XX_RB_DBG_ECO_CNTL, screen->info->a6xx.magic.RB_DBG_ECO_CNTL);
   WRITE(REG_A6XX_SP_FLOAT_CNTL, A6XX_SP_FLOAT_CNTL_F16_NO_INF);
   WRITE(REG_A6XX_SP_DBG_ECO_CNTL, screen->info->a6xx.magic.SP_DBG_ECO_CNTL);
   WRITE(REG_A6XX_SP_PERFCTR_ENABLE, 0x3f);
   if (CHIP == A6XX)
      WRITE(REG_A6XX_TPL1_UNKNOWN_B605, 0x44);
   WRITE(REG_A6XX_TPL1_DBG_ECO_CNTL, screen->info->a6xx.magic.TPL1_DBG_ECO_CNTL);
   if (CHIP == A6XX) {
      WRITE(REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
      WRITE(REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
   }

   WRITE(REG_A6XX_VPC_DBG_ECO_CNTL, screen->info->a6xx.magic.VPC_DBG_ECO_CNTL);
   WRITE(REG_A6XX_GRAS_DBG_ECO_CNTL, screen->info->a6xx.magic.GRAS_DBG_ECO_CNTL);
   if (CHIP == A6XX)
      WRITE(REG_A6XX_HLSQ_DBG_ECO_CNTL, screen->info->a6xx.magic.HLSQ_DBG_ECO_CNTL);
   WRITE(REG_A6XX_SP_CHICKEN_BITS, screen->info->a6xx.magic.SP_CHICKEN_BITS);
   WRITE(REG_A6XX_SP_IBO_COUNT, 0);
   WRITE(REG_A6XX_SP_UNKNOWN_B182, 0);
   if (CHIP == A6XX)
      WRITE(REG_A6XX_HLSQ_SHARED_CONSTS, 0);
   WRITE(REG_A6XX_UCHE_UNKNOWN_0E12, screen->info->a6xx.magic.UCHE_UNKNOWN_0E12);
   WRITE(REG_A6XX_UCHE_CLIENT_PF, screen->info->a6xx.magic.UCHE_CLIENT_PF);
   WRITE(REG_A6XX_RB_UNKNOWN_8E01, screen->info->a6xx.magic.RB_UNKNOWN_8E01);
   WRITE(REG_A6XX_SP_UNKNOWN_A9A8, 0);
   OUT_REG(ring,
      A6XX_SP_MODE_CONTROL(
         .constant_demotion_enable = true,
         .isammode = ISAMMODE_GL,
         .shared_consts_enable = false,
      )
   );
   WRITE(REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
   WRITE(REG_A6XX_VPC_UNKNOWN_9107, 0);
   WRITE(REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
   WRITE(REG_A6XX_PC_MODE_CNTL, screen->info->a6xx.magic.PC_MODE_CNTL);

   WRITE(REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 0);
   WRITE(REG_A6XX_GRAS_SAMPLE_CNTL, 0);
   WRITE(REG_A6XX_GRAS_UNKNOWN_8110, 0x2);

   WRITE(REG_A6XX_RB_UNKNOWN_8818, 0);

   if (CHIP == A6XX) {
      WRITE(REG_A6XX_RB_UNKNOWN_8819, 0);
      WRITE(REG_A6XX_RB_UNKNOWN_881A, 0);
      WRITE(REG_A6XX_RB_UNKNOWN_881B, 0);
      WRITE(REG_A6XX_RB_UNKNOWN_881C, 0);
      WRITE(REG_A6XX_RB_UNKNOWN_881D, 0);
      WRITE(REG_A6XX_RB_UNKNOWN_881E, 0);
   }

   WRITE(REG_A6XX_RB_UNKNOWN_88F0, 0);

   WRITE(REG_A6XX_VPC_POINT_COORD_INVERT, A6XX_VPC_POINT_COORD_INVERT(0).value);
   WRITE(REG_A6XX_VPC_UNKNOWN_9300, 0);

   WRITE(REG_A6XX_VPC_SO_DISABLE, A6XX_VPC_SO_DISABLE(true).value);

   OUT_REG(ring, PC_RASTER_CNTL(CHIP));

   if (CHIP == A7XX)
      OUT_REG(ring, A7XX_PC_RASTER_CNTL_V2());

   WRITE(REG_A6XX_PC_MULTIVIEW_CNTL, 0);

   WRITE(REG_A6XX_SP_UNKNOWN_B183, 0);

   WRITE(REG_A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 0);
   WRITE(REG_A6XX_GRAS_VS_LAYER_CNTL, 0);
   WRITE(REG_A6XX_GRAS_SC_CNTL, A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
   WRITE(REG_A6XX_GRAS_UNKNOWN_80AF, 0);
   if (CHIP == A6XX) {
      WRITE(REG_A6XX_VPC_UNKNOWN_9210, 0);
      WRITE(REG_A6XX_VPC_UNKNOWN_9211, 0);
   }
   WRITE(REG_A6XX_VPC_UNKNOWN_9602, 0);
   WRITE(REG_A6XX_PC_UNKNOWN_9E72, 0);
   /* NOTE blob seems to (mostly?) use 0xb2 for SP_TP_MODE_CNTL
    * but this seems to kill texture gather offsets.
    */
   WRITE(REG_A6XX_SP_TP_MODE_CNTL, 0xa0 |
         A6XX_SP_TP_MODE_CNTL_ISAMMODE(ISAMMODE_GL));

   OUT_REG(ring, HLSQ_CONTROL_5_REG(
         CHIP,
         .linelengthregid = INVALID_REG,
         .foveationqualityregid = INVALID_REG,
   ));

   emit_marker6(ring, 7);

   OUT_REG(ring, A6XX_VFD_MODE_CNTL(RENDERING_PASS));

   WRITE(REG_A6XX_VFD_MULTIVIEW_CNTL, 0);

   /* Clear any potential pending state groups to be safe: */
   OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
   OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
                     CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
                     CP_SET_DRAW_STATE__0_GROUP_ID(0));
   OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
   OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));

   OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_CNTL, 1);
   OUT_RING(ring, 0x00000000); /* VPC_SO_STREAM_CNTL */

   if (CHIP >= A7XX) {
      OUT_REG(ring, A6XX_GRAS_LRZ_CNTL());
      OUT_REG(ring, A7XX_GRAS_LRZ_CNTL2());
   } else {
      OUT_REG(ring, A6XX_GRAS_LRZ_CNTL());
   }

   OUT_REG(ring, A6XX_RB_LRZ_CNTL());
   OUT_REG(ring, A6XX_RB_DEPTH_PLANE_CNTL());
   OUT_REG(ring, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());

   OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1);
   OUT_RING(ring, 0x00000000);

   OUT_PKT4(ring, REG_A6XX_RB_LRZ_CNTL, 1);
   OUT_RING(ring, 0x00000000);

   /* Initialize VFD_FETCH[n].SIZE to zero to avoid iova faults trying
    * to fetch from a VFD_FETCH[n].BASE which we've potentially inherited
    * from another process:
    */
   for (int32_t i = 0; i < 32; i++) {
      OUT_PKT4(ring, REG_A6XX_VFD_FETCH_SIZE(i), 1);
      OUT_RING(ring, 0);
   }

   struct fd6_context *fd6_ctx = fd6_context(ctx);
   struct fd_bo *bcolor_mem = fd6_ctx->bcolor_mem;

   OUT_PKT4(ring, REG_A6XX_SP_TP_BORDER_COLOR_BASE_ADDR, 2);
   OUT_RELOC(ring, bcolor_mem, 0, 0, 0);

   OUT_PKT4(ring, REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, 2);
   OUT_RELOC(ring, bcolor_mem, 0, 0, 0);

   /* These regs are blocked (CP_PROTECT) on a6xx: */
   if (CHIP >= A7XX) {
      OUT_REG(ring,
         TPL1_BICUBIC_WEIGHTS_TABLE_0(CHIP, 0),
         TPL1_BICUBIC_WEIGHTS_TABLE_1(CHIP, 0x3fe05ff4),
         TPL1_BICUBIC_WEIGHTS_TABLE_2(CHIP, 0x3fa0ebee),
         TPL1_BICUBIC_WEIGHTS_TABLE_3(CHIP, 0x3f5193ed),
         TPL1_BICUBIC_WEIGHTS_TABLE_4(CHIP, 0x3f0243f0),
      );
   }

   if (CHIP >= A7XX) {
      /* Blob sets these two per draw. */
      OUT_REG(ring, A7XX_PC_TESS_PARAM_SIZE(FD6_TESS_PARAM_SIZE));
      /* Blob adds a bit more space ({0x10, 0x20, 0x30, 0x40} bytes)
       * but the meaning of this additional space is not known,
       * so we play it safe and don't add it.
       */
      OUT_REG(ring, A7XX_PC_TESS_FACTOR_SIZE(FD6_TESS_FACTOR_SIZE));
   }

   /* There is an optimization to skip executing draw states for draws with no
    * instances. Instead of simply skipping the draw, internally the firmware
    * sets a bit in PC_DRAW_INITIATOR that seemingly skips the draw. However
    * there is a hardware bug where this bit does not always cause the FS
    * early preamble to be skipped. Because the draw states were skipped,
    * SP_FS_CTRL_REG0, SP_FS_OBJ_START and so on are never updated and a
    * random FS preamble from the last draw is executed. If the last visible
    * draw is from the same submit, it shouldn't be a problem because we just
    * re-execute the same preamble and preambles don't have side effects, but
    * if it's from another process then we could execute a garbage preamble
    * leading to hangs and faults. To make sure this doesn't happen, we reset
    * SP_FS_CTRL_REG0 here, making sure that the EARLYPREAMBLE bit isn't set
    * so any leftover early preamble doesn't get executed. Other stages don't
    * seem to be affected.
    */
   if (screen->info->a6xx.has_early_preamble) {
      WRITE(REG_A6XX_SP_FS_CTRL_REG0, 0);
   }
}
FD_GENX(fd6_emit_static_regs);

/* Emit setup at the beginning of a new cmdstream buffer (don't rely on
 * previous state, there could have been a context switch between ioctls):
 */
template <chip CHIP>
void
fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
   struct fd_context *ctx = batch->ctx;
   struct fd_screen *screen = ctx->screen;

   if (!batch->nondraw) {
      trace_start_state_restore(&batch->trace, ring);
   }

   if (FD_DBG(STOMP)) {
      fd6_emit_stomp<CHIP>(ring, &RP_BLIT_REGS<CHIP>[0], ARRAY_SIZE(RP_BLIT_REGS<CHIP>));
      fd6_emit_stomp<CHIP>(ring, &CMD_REGS<CHIP>[0], ARRAY_SIZE(CMD_REGS<CHIP>));
   }

   OUT_PKT7(ring, CP_SET_MODE, 1);
   OUT_RING(ring, 0);

   if (CHIP == A6XX) {
      fd6_cache_inv<CHIP>(ctx, ring);
   } else {
      OUT_PKT7(ring, CP_THREAD_CONTROL, 1);
      OUT_RING(ring, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
                     CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE);

      fd6_event_write<CHIP>(ctx, ring, FD_CCU_INVALIDATE_COLOR);
      fd6_event_write<CHIP>(ctx, ring, FD_CCU_INVALIDATE_DEPTH);

      OUT_PKT7(ring, CP_EVENT_WRITE, 1);
      OUT_RING(ring, UNK_40);

      fd6_event_write<CHIP>(ctx, ring, FD_CACHE_INVALIDATE);
      OUT_WFI5(ring);
   }

   OUT_REG(ring,
      HLSQ_INVALIDATE_CMD(CHIP,
         .vs_state = true, .hs_state = true,
         .ds_state = true, .gs_state = true,
         .fs_state = true, .cs_state = true,
         .cs_ibo = true,   .gfx_ibo = true,
         .cs_shared_const = true,
         .gfx_shared_const = true,
         .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
         .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,
      )
   );

   OUT_WFI5(ring);

   fd6_emit_ib(ring, fd6_context(ctx)->restore);
   fd6_emit_ccu_cntl<CHIP>(ring, screen, false);

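   /* Point the bin preamble at our static register preamble, and clear
    * out any previously configured preamble/postamble ambles:
    */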
   OUT_PKT7(ring, CP_SET_AMBLE, 3);
   uint32_t dwords = fd_ringbuffer_emit_reloc_ring_full(ring, fd6_context(ctx)->preamble, 0) / 4;
   OUT_RING(ring, CP_SET_AMBLE_2_DWORDS(dwords) |
                  CP_SET_AMBLE_2_TYPE(BIN_PREAMBLE_AMBLE_TYPE));

   OUT_PKT7(ring, CP_SET_AMBLE, 3);
   OUT_RING(ring, 0x00000000);
   OUT_RING(ring, 0x00000000);
   OUT_RING(ring, CP_SET_AMBLE_2_TYPE(PREAMBLE_AMBLE_TYPE));

   OUT_PKT7(ring, CP_SET_AMBLE, 3);
   OUT_RING(ring, 0x00000000);
   OUT_RING(ring, 0x00000000);
   OUT_RING(ring, CP_SET_AMBLE_2_TYPE(POSTAMBLE_AMBLE_TYPE));

   if (!batch->nondraw) {
      trace_end_state_restore(&batch->trace, ring);
   }
}
FD_GENX(fd6_emit_restore);

static void
fd6_mem_to_mem(struct fd_ringbuffer *ring, struct pipe_resource *dst,
               unsigned dst_off, struct pipe_resource *src, unsigned src_off,
               unsigned sizedwords)
{
   struct fd_bo *src_bo = fd_resource(src)->bo;
   struct fd_bo *dst_bo = fd_resource(dst)->bo;
   unsigned i;

   fd_ringbuffer_attach_bo(ring, dst_bo);
   fd_ringbuffer_attach_bo(ring, src_bo);

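   /* Each CP_MEM_TO_MEM packet below copies a single dword, so advance
    * both offsets by 4 bytes per iteration:
    */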
   for (i = 0; i < sizedwords; i++) {
      OUT_PKT7(ring, CP_MEM_TO_MEM, 5);
      OUT_RING(ring, 0x00000000);
      OUT_RELOC(ring, dst_bo, dst_off, 0, 0);
      OUT_RELOC(ring, src_bo, src_off, 0, 0);

      dst_off += 4;
      src_off += 4;
   }
}

void
fd6_emit_init_screen(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);
   screen->mem_to_mem = fd6_mem_to_mem;
}