• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2016 Rob Clark <robclark@freedesktop.org>
3  * Copyright © 2018 Google, Inc.
4  * SPDX-License-Identifier: MIT
5  *
6  * Authors:
7  *    Rob Clark <robclark@freedesktop.org>
8  */
9 
10 #define FD_BO_NO_HARDPIN 1
11 
12 #include <initializer_list>
13 
14 #include "pipe/p_state.h"
15 #include "util/bitset.h"
16 #include "util/format/u_format.h"
17 #include "util/u_inlines.h"
18 #include "util/u_memory.h"
19 #include "util/u_string.h"
20 
21 #include "freedreno_program.h"
22 
23 #include "fd6_const.h"
24 #include "fd6_emit.h"
25 #include "fd6_pack.h"
26 #include "fd6_program.h"
27 #include "fd6_texture.h"
28 
/**
 * Temporary program building state, threaded through the various
 * emit helpers while constructing a fd6_program_state.
 */
struct program_builder {
   struct fd6_program_state *state;      /* the program state being built */
   struct fd_context *ctx;
   const struct ir3_cache_key *key;      /* cache key the variants were selected by */
   const struct ir3_shader_variant *vs;
   const struct ir3_shader_variant *hs;  /* NULL if tessellation unused */
   const struct ir3_shader_variant *ds;  /* NULL if tessellation unused */
   const struct ir3_shader_variant *gs;  /* NULL if geometry stage unused */
   const struct ir3_shader_variant *fs;
   /* Last enabled geometry-pipeline stage (VS, DS or GS): */
   const struct ir3_shader_variant *last_shader;
   bool binning_pass;                    /* building the binning-pass variant? */
};
44 
/**
 * Per-stage register offsets that differ between the SP_xS/HLSQ_xS
 * register blocks, allowing fd6_emit_shader() to be stage-agnostic.
 */
template <chip CHIP>
struct xs_config {
   uint16_t reg_sp_xs_instrlen;                /* SP_xS_INSTRLEN */
   uint16_t reg_hlsq_xs_ctrl;                  /* HLSQ_xS_CNTL (chip-dependent offset) */
   uint16_t reg_sp_xs_first_exec_offset;       /* SP_xS_OBJ_FIRST_EXEC_OFFSET */
   uint16_t reg_sp_xs_pvt_mem_hw_stack_offset; /* SP_xS_PVT_MEM_HW_STACK_OFFSET */
   uint16_t reg_sp_xs_vgpr_config;             /* SP_xS_VGPR_CONFIG (only used on a7xx+) */
};
53 
/**
 * Stage-specific register offsets, indexed by gl_shader_stage.
 * (Designated array initializers here are a GNU extension in C++.)
 */
template <chip CHIP>
static const struct xs_config<CHIP> xs_configs[] = {
   [MESA_SHADER_VERTEX] = {
      REG_A6XX_SP_VS_INSTRLEN,
      CHIP == A6XX ? REG_A6XX_HLSQ_VS_CNTL : REG_A7XX_HLSQ_VS_CNTL,
      REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET,
      REG_A7XX_SP_VS_VGPR_CONFIG,
   },
   [MESA_SHADER_TESS_CTRL] = {
      REG_A6XX_SP_HS_INSTRLEN,
      CHIP == A6XX ? REG_A6XX_HLSQ_HS_CNTL : REG_A7XX_HLSQ_HS_CNTL,
      REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET,
      REG_A7XX_SP_HS_VGPR_CONFIG,
   },
   [MESA_SHADER_TESS_EVAL] = {
      REG_A6XX_SP_DS_INSTRLEN,
      CHIP == A6XX ? REG_A6XX_HLSQ_DS_CNTL : REG_A7XX_HLSQ_DS_CNTL,
      REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET,
      REG_A7XX_SP_DS_VGPR_CONFIG,
   },
   [MESA_SHADER_GEOMETRY] = {
      REG_A6XX_SP_GS_INSTRLEN,
      CHIP == A6XX ? REG_A6XX_HLSQ_GS_CNTL : REG_A7XX_HLSQ_GS_CNTL,
      REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET,
      REG_A7XX_SP_GS_VGPR_CONFIG,
   },
   [MESA_SHADER_FRAGMENT] = {
      REG_A6XX_SP_FS_INSTRLEN,
      CHIP == A6XX ? REG_A6XX_HLSQ_FS_CNTL : REG_A7XX_HLSQ_FS_CNTL,
      REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET,
      REG_A7XX_SP_FS_VGPR_CONFIG,
   },
   [MESA_SHADER_COMPUTE] = {
      REG_A6XX_SP_CS_INSTRLEN,
      CHIP == A6XX ? REG_A6XX_HLSQ_CS_CNTL : REG_A7XX_HLSQ_CS_CNTL,
      REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET,
      REG_A7XX_SP_CS_VGPR_CONFIG,
   },
};
99 
/**
 * Emit the per-stage shader object state for one variant: CTRL_REG0
 * (register footprint, branchstack, threadsize, etc), instruction
 * length, program BO address, private-memory layout, and (on a6xx) the
 * CP_LOAD_STATE6 instruction-cache preload.  Immediates are emitted
 * last.  A NULL variant means the stage is disabled and nothing is
 * emitted.
 */
template <chip CHIP>
void
fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring,
                const struct ir3_shader_variant *so)
{
   if (!so) {
      /* shader stage disabled */
      return;
   }

#if MESA_DEBUG
   /* Name should generally match what you get with MESA_SHADER_CAPTURE_PATH: */
   const char *name = so->name;
   if (name)
      fd_emit_string5(ring, name, strlen(name));
#endif

   /* Kernels are programmed through the compute-stage registers: */
   gl_shader_stage type = so->type;
   if (type == MESA_SHADER_KERNEL)
      type = MESA_SHADER_COMPUTE;

   enum a6xx_threadsize thrsz =
      so->info.double_threadsize ? THREAD128 : THREAD64;

   /* CTRL_REG0 has a stage-specific layout, so pack it per-stage: */
   switch (type) {
   case MESA_SHADER_VERTEX:
      OUT_REG(ring, A6XX_SP_VS_CTRL_REG0(
               .halfregfootprint = so->info.max_half_reg + 1,
               .fullregfootprint = so->info.max_reg + 1,
               .branchstack = ir3_shader_branchstack_hw(so),
               .mergedregs = so->mergedregs,
               .earlypreamble = so->early_preamble,
      ));
      break;
   case MESA_SHADER_TESS_CTRL:
      OUT_REG(ring, A6XX_SP_HS_CTRL_REG0(
               .halfregfootprint = so->info.max_half_reg + 1,
               .fullregfootprint = so->info.max_reg + 1,
               .branchstack = ir3_shader_branchstack_hw(so),
               .earlypreamble = so->early_preamble,
      ));
      break;
   case MESA_SHADER_TESS_EVAL:
      OUT_REG(ring, A6XX_SP_DS_CTRL_REG0(
               .halfregfootprint = so->info.max_half_reg + 1,
               .fullregfootprint = so->info.max_reg + 1,
               .branchstack = ir3_shader_branchstack_hw(so),
               .earlypreamble = so->early_preamble,
      ));
      break;
   case MESA_SHADER_GEOMETRY:
      OUT_REG(ring, A6XX_SP_GS_CTRL_REG0(
               .halfregfootprint = so->info.max_half_reg + 1,
               .fullregfootprint = so->info.max_reg + 1,
               .branchstack = ir3_shader_branchstack_hw(so),
               .earlypreamble = so->early_preamble,
      ));
      break;
   case MESA_SHADER_FRAGMENT:
      OUT_REG(ring, A6XX_SP_FS_CTRL_REG0(
               .halfregfootprint = so->info.max_half_reg + 1,
               .fullregfootprint = so->info.max_reg + 1,
               .branchstack = ir3_shader_branchstack_hw(so),
               .threadsize = thrsz,
               .varying = so->total_in != 0,
               .lodpixmask = so->need_full_quad,
               /* unknown bit, seems unnecessary */
               .unk24 = true,
               .pixlodenable = so->need_pixlod,
               .earlypreamble = so->early_preamble,
               .mergedregs = so->mergedregs,
      ));
      break;
   case MESA_SHADER_COMPUTE:
      /* HW without selectable CS threadsize always runs at THREAD128: */
      thrsz = ctx->screen->info->a6xx.supports_double_threadsize ? thrsz : THREAD128;
      OUT_REG(ring, A6XX_SP_CS_CTRL_REG0(
               .halfregfootprint = so->info.max_half_reg + 1,
               .fullregfootprint = so->info.max_reg + 1,
               .branchstack = ir3_shader_branchstack_hw(so),
               .threadsize = thrsz,
               .earlypreamble = so->early_preamble,
               .mergedregs = so->mergedregs,
      ));
      break;
   default:
      unreachable("bad shader stage");
   }

   const struct xs_config<CHIP> *cfg = &xs_configs<CHIP>[type];

   OUT_PKT4(ring, cfg->reg_sp_xs_instrlen, 1);
   OUT_RING(ring, so->instrlen);

   /* emit program binary & private memory layout
    */

   ir3_get_private_mem(ctx, so);

   uint32_t per_sp_size = ctx->pvtmem[so->pvtmem_per_wave].per_sp_size;

   fd_ringbuffer_attach_bo(ring, so->bo);

   /* Note: the _VS_ PVT_MEM field layouts are reused for all stages: */
   OUT_PKT4(ring, cfg->reg_sp_xs_first_exec_offset, 7);
   OUT_RING(ring, 0);                /* SP_xS_OBJ_FIRST_EXEC_OFFSET */
   OUT_RELOC(ring, so->bo, 0, 0, 0); /* SP_xS_OBJ_START_LO */
   OUT_RING(ring, A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size));
   if (so->pvtmem_size > 0) { /* SP_xS_PVT_MEM_ADDR */
      fd_ringbuffer_attach_bo(ring, ctx->pvtmem[so->pvtmem_per_wave].bo);
      OUT_RELOC(ring, ctx->pvtmem[so->pvtmem_per_wave].bo, 0, 0, 0);
   } else {
      OUT_RING(ring, 0);
      OUT_RING(ring, 0);
   }
   OUT_RING(ring, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(per_sp_size) |
                     COND(so->pvtmem_per_wave,
                          A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT));

   OUT_PKT4(ring, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1);
   OUT_RING(ring, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(per_sp_size));

   if (CHIP >= A7XX) {
      OUT_PKT4(ring, cfg->reg_sp_xs_vgpr_config, 1);
      OUT_RING(ring, 0);
   }

   if (CHIP == A6XX) {
      /* Preload the program into the SP instruction cache, capped at the
       * cache size:
       */
      uint32_t shader_preload_size =
         MIN2(so->instrlen, ctx->screen->info->a6xx.instr_cache_size);

      enum a6xx_state_block sb = fd6_stage2shadersb(so->type);
      OUT_PKT7(ring, fd6_stage2opcode(so->type), 3);
      OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
                        CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
                        CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                        CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
                        CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
      OUT_RELOC(ring, so->bo, 0, 0, 0);
   }

   fd6_emit_immediates<CHIP>(so, ring);
}
FD_GENX(fd6_emit_shader);
242 
243 /**
244  * Build a pre-baked state-obj to disable SO, so that we aren't dynamically
245  * building this at draw time whenever we transition from SO enabled->disabled
246  */
247 static void
setup_stream_out_disable(struct fd_context * ctx)248 setup_stream_out_disable(struct fd_context *ctx)
249 {
250    unsigned sizedw = 4;
251 
252    if (ctx->screen->info->a6xx.tess_use_shared)
253       sizedw += 2;
254 
255    struct fd_ringbuffer *ring =
256       fd_ringbuffer_new_object(ctx->pipe, (1 + sizedw) * 4);
257 
258    OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, sizedw);
259    OUT_RING(ring, REG_A6XX_VPC_SO_CNTL);
260    OUT_RING(ring, 0);
261    OUT_RING(ring, REG_A6XX_VPC_SO_STREAM_CNTL);
262    OUT_RING(ring, 0);
263 
264    if (ctx->screen->info->a6xx.tess_use_shared) {
265       OUT_RING(ring, REG_A6XX_PC_SO_STREAM_CNTL);
266       OUT_RING(ring, 0);
267    }
268 
269    fd6_context(ctx)->streamout_disable_stateobj = ring;
270 }
271 
272 static void
setup_stream_out(struct fd_context * ctx,struct fd6_program_state * state,const struct ir3_shader_variant * v,struct ir3_shader_linkage * l)273 setup_stream_out(struct fd_context *ctx, struct fd6_program_state *state,
274                  const struct ir3_shader_variant *v,
275                  struct ir3_shader_linkage *l)
276 {
277    const struct ir3_stream_output_info *strmout = &v->stream_output;
278 
279    /* Note: 64 here comes from the HW layout of the program RAM. The program
280     * for stream N is at DWORD 64 * N.
281     */
282 #define A6XX_SO_PROG_DWORDS 64
283    uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {};
284    BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0};
285 
286    memset(prog, 0, sizeof(prog));
287 
288    for (unsigned i = 0; i < strmout->num_outputs; i++) {
289       const struct ir3_stream_output *out = &strmout->output[i];
290       unsigned k = out->register_index;
291       unsigned idx;
292 
293       /* linkage map sorted by order frag shader wants things, so
294        * a bit less ideal here..
295        */
296       for (idx = 0; idx < l->cnt; idx++)
297          if (l->var[idx].slot == v->outputs[k].slot)
298             break;
299 
300       assert(idx < l->cnt);
301 
302       for (unsigned j = 0; j < out->num_components; j++) {
303          unsigned c = j + out->start_component;
304          unsigned loc = l->var[idx].loc + c;
305          unsigned off = j + out->dst_offset; /* in dwords */
306 
307          unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2;
308          if (loc & 1) {
309             prog[dword] |= A6XX_VPC_SO_PROG_B_EN |
310                            A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
311                            A6XX_VPC_SO_PROG_B_OFF(off * 4);
312          } else {
313             prog[dword] |= A6XX_VPC_SO_PROG_A_EN |
314                            A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
315                            A6XX_VPC_SO_PROG_A_OFF(off * 4);
316          }
317          BITSET_SET(valid_dwords, dword);
318       }
319    }
320 
321    unsigned prog_count = 0;
322    unsigned start, end;
323    BITSET_FOREACH_RANGE (start, end, valid_dwords,
324                          A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
325       prog_count += end - start + 1;
326    }
327 
328    const bool emit_pc_so_stream_cntl =
329          ctx->screen->info->a6xx.tess_use_shared &&
330          v->type == MESA_SHADER_TESS_EVAL;
331 
332    unsigned sizedw = 10 + (2 * prog_count);
333    if (emit_pc_so_stream_cntl)
334       sizedw += 2;
335 
336    struct fd_ringbuffer *ring =
337       fd_ringbuffer_new_object(ctx->pipe, (1 + sizedw) * 4);
338 
339    OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, sizedw);
340    OUT_RING(ring, REG_A6XX_VPC_SO_STREAM_CNTL);
341    OUT_RING(ring,
342             A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(strmout->streams_written) |
343             COND(strmout->stride[0] > 0,
344                  A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + strmout->output[0].stream)) |
345             COND(strmout->stride[1] > 0,
346                  A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + strmout->output[1].stream)) |
347             COND(strmout->stride[2] > 0,
348                  A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + strmout->output[2].stream)) |
349             COND(strmout->stride[3] > 0,
350                  A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + strmout->output[3].stream)));
351    OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(0));
352    OUT_RING(ring, strmout->stride[0]);
353    OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(1));
354    OUT_RING(ring, strmout->stride[1]);
355    OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(2));
356    OUT_RING(ring, strmout->stride[2]);
357    OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(3));
358    OUT_RING(ring, strmout->stride[3]);
359 
360    bool first = true;
361    BITSET_FOREACH_RANGE (start, end, valid_dwords,
362                          A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
363       OUT_RING(ring, REG_A6XX_VPC_SO_CNTL);
364       OUT_RING(ring, COND(first, A6XX_VPC_SO_CNTL_RESET) |
365                      A6XX_VPC_SO_CNTL_ADDR(start));
366       for (unsigned i = start; i < end; i++) {
367          OUT_RING(ring, REG_A6XX_VPC_SO_PROG);
368          OUT_RING(ring, prog[i]);
369       }
370       first = false;
371    }
372 
373    if (emit_pc_so_stream_cntl) {
374       /* Possibly not tess_use_shared related, but the combination of
375        * tess + xfb fails some tests if we don't emit this.
376        */
377       OUT_RING(ring, REG_A6XX_PC_SO_STREAM_CNTL);
378       OUT_RING(ring, A6XX_PC_SO_STREAM_CNTL_STREAM_ENABLE(0x1));
379    }
380 
381    state->streamout_stateobj = ring;
382 }
383 
384 static uint32_t
sp_xs_config(const struct ir3_shader_variant * v)385 sp_xs_config(const struct ir3_shader_variant *v)
386 {
387    if (!v)
388       return 0;
389 
390    return A6XX_SP_VS_CONFIG_ENABLED |
391          COND(v->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
392          COND(v->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
393          COND(v->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
394          COND(v->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
395          A6XX_SP_VS_CONFIG_NIBO(ir3_shader_nibo(v)) |
396          A6XX_SP_VS_CONFIG_NTEX(v->num_samp) |
397          A6XX_SP_VS_CONFIG_NSAMP(v->num_samp);
398 }
399 
/**
 * Build the pre-baked "config" state object: per-stage HLSQ enables and
 * constlen, SP_xS_CONFIG resource counts, and the FS IBO count.  Built
 * once per program rather than re-emitted per draw.
 */
template <chip CHIP>
static void
setup_config_stateobj(struct fd_context *ctx, struct fd6_program_state *state)
{
   struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 100 * 4);

   /* Invalidate any previously-programmed shader/IBO state first: */
   OUT_REG(ring, HLSQ_INVALIDATE_CMD(CHIP, .vs_state = true, .hs_state = true,
                                          .ds_state = true, .gs_state = true,
                                          .fs_state = true, .cs_state = true,
                                          .cs_ibo = true, .gfx_ibo = true, ));

   /* The binning-pass VS (bs) is derived from the full VS, so its
    * constlen should never exceed the full VS's:
    */
   assert(state->vs->constlen >= state->bs->constlen);

   OUT_REG(ring, HLSQ_VS_CNTL(
         CHIP,
         .constlen = state->vs->constlen,
         .enabled = true,
   ));
   /* hs/ds/gs may be NULL when tess/geometry is unused; COND() yields
    * 0/false for the disabled stages:
    */
   OUT_REG(ring, HLSQ_HS_CNTL(
         CHIP,
         .constlen = COND(state->hs, state->hs->constlen),
         .enabled = COND(state->hs, true),
   ));
   OUT_REG(ring, HLSQ_DS_CNTL(
         CHIP,
         .constlen = COND(state->ds, state->ds->constlen),
         .enabled = COND(state->ds, true),
   ));
   OUT_REG(ring, HLSQ_GS_CNTL(
         CHIP,
         .constlen = COND(state->gs, state->gs->constlen),
         .enabled = COND(state->gs, true),
   ));
   OUT_REG(ring, HLSQ_FS_CNTL(
         CHIP,
         .constlen = state->fs->constlen,
         .enabled = true,
   ));

   OUT_PKT4(ring, REG_A6XX_SP_VS_CONFIG, 1);
   OUT_RING(ring, sp_xs_config(state->vs));

   OUT_PKT4(ring, REG_A6XX_SP_HS_CONFIG, 1);
   OUT_RING(ring, sp_xs_config(state->hs));

   OUT_PKT4(ring, REG_A6XX_SP_DS_CONFIG, 1);
   OUT_RING(ring, sp_xs_config(state->ds));

   OUT_PKT4(ring, REG_A6XX_SP_GS_CONFIG, 1);
   OUT_RING(ring, sp_xs_config(state->gs));

   OUT_PKT4(ring, REG_A6XX_SP_FS_CONFIG, 1);
   OUT_RING(ring, sp_xs_config(state->fs));

   OUT_PKT4(ring, REG_A6XX_SP_IBO_COUNT, 1);
   OUT_RING(ring, ir3_shader_nibo(state->fs));

   state->config_stateobj = ring;
}
459 
460 static inline uint32_t
next_regid(uint32_t reg,uint32_t increment)461 next_regid(uint32_t reg, uint32_t increment)
462 {
463    if (VALIDREG(reg))
464       return reg + increment;
465    else
466       return INVALID_REG;
467 }
468 
469 static enum a6xx_tess_output
primitive_to_tess(enum mesa_prim primitive)470 primitive_to_tess(enum mesa_prim primitive)
471 {
472    switch (primitive) {
473    case MESA_PRIM_POINTS:
474       return TESS_POINTS;
475    case MESA_PRIM_LINE_STRIP:
476       return TESS_LINES;
477    case MESA_PRIM_TRIANGLE_STRIP:
478       return TESS_CW_TRIS;
479    default:
480       unreachable("");
481    }
482 }
483 
484 #define MAX_VERTEX_ATTRIBS 32
485 
486 static void
emit_vfd_dest(struct fd_ringbuffer * ring,const struct ir3_shader_variant * vs)487 emit_vfd_dest(struct fd_ringbuffer *ring, const struct ir3_shader_variant *vs)
488 {
489    uint32_t attr_count = 0;
490 
491    for (uint32_t i = 0; i < vs->inputs_count; i++)
492       if (!vs->inputs[i].sysval)
493          attr_count++;
494 
495    OUT_REG(ring, A6XX_VFD_CONTROL_0(
496                      .fetch_cnt = attr_count, /* decode_cnt for binning pass ? */
497                      .decode_cnt = attr_count));
498 
499    if (attr_count)
500       OUT_PKT4(ring, REG_A6XX_VFD_DEST_CNTL_INSTR(0), attr_count);
501 
502    for (uint32_t i = 0; i < attr_count; i++) {
503       assert(!vs->inputs[i].sysval);
504       OUT_RING(ring,
505                A6XX_VFD_DEST_CNTL_INSTR_WRITEMASK(vs->inputs[i].compmask) |
506                   A6XX_VFD_DEST_CNTL_INSTR_REGID(vs->inputs[i].regid));
507    }
508 }
509 
/**
 * Program VFD_CONTROL_1..6 with the register ids that the geometry
 * pipeline stages use for system values (vertex/instance id, tess
 * coords, patch ids, GS header, primitive id).  Disabled stages yield
 * INVALID_REG from ir3_find_sysval_regid(), which disables the field.
 */
static void
emit_vs_system_values(struct fd_ringbuffer *ring,
                      const struct program_builder *b)
{
   const uint32_t vertexid_regid =
         ir3_find_sysval_regid(b->vs, SYSTEM_VALUE_VERTEX_ID);
   const uint32_t instanceid_regid =
         ir3_find_sysval_regid(b->vs, SYSTEM_VALUE_INSTANCE_ID);
   const uint32_t tess_coord_x_regid =
         ir3_find_sysval_regid(b->ds, SYSTEM_VALUE_TESS_COORD);
   /* TESS_COORD y lives in the component after x: */
   const uint32_t tess_coord_y_regid = next_regid(tess_coord_x_regid, 1);
   const uint32_t hs_rel_patch_regid =
         ir3_find_sysval_regid(b->hs, SYSTEM_VALUE_REL_PATCH_ID_IR3);
   const uint32_t ds_rel_patch_regid =
         ir3_find_sysval_regid(b->ds, SYSTEM_VALUE_REL_PATCH_ID_IR3);
   const uint32_t hs_invocation_regid =
         ir3_find_sysval_regid(b->hs, SYSTEM_VALUE_TCS_HEADER_IR3);
   const uint32_t gs_primitiveid_regid =
         ir3_find_sysval_regid(b->gs, SYSTEM_VALUE_PRIMITIVE_ID);
   /* When tess is enabled, the "VS" primitive id slot feeds the HS: */
   const uint32_t vs_primitiveid_regid = b->hs ?
         ir3_find_sysval_regid(b->hs, SYSTEM_VALUE_PRIMITIVE_ID) :
         gs_primitiveid_regid;
   const uint32_t ds_primitiveid_regid =
         ir3_find_sysval_regid(b->ds, SYSTEM_VALUE_PRIMITIVE_ID);
   const uint32_t gsheader_regid =
         ir3_find_sysval_regid(b->gs, SYSTEM_VALUE_GS_HEADER_IR3);

   /* Note: we currently don't support multiview.
    */
   const uint32_t viewid_regid = INVALID_REG;

   OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_1, 6);
   OUT_RING(ring, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) |
                  A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) |
                  A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) |
                  A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid));
   OUT_RING(ring, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) |
                  A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid));
   OUT_RING(ring, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) |
                  A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) |
                  A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) |
                  A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid));
   OUT_RING(ring, 0x000000fc); /* VFD_CONTROL_4 */
   OUT_RING(ring, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) |
                  0xfc00); /* VFD_CONTROL_5 */
   OUT_RING(ring, COND(b->fs->reads_primid, A6XX_VFD_CONTROL_6_PRIMID4PSEN)); /* VFD_CONTROL_6 */
}
557 
558 template <chip CHIP>
559 static void
emit_vpc(struct fd_ringbuffer * ring,const struct program_builder * b)560 emit_vpc(struct fd_ringbuffer *ring, const struct program_builder *b)
561 {
562    const struct ir3_shader_variant *last_shader = b->last_shader;
563 
564    /* note: doesn't compile as static because of the array regs.. */
565    const struct reg_config {
566       uint16_t reg_sp_xs_out_reg;
567       uint16_t reg_sp_xs_vpc_dst_reg;
568       uint16_t reg_vpc_xs_pack;
569       uint16_t reg_vpc_xs_clip_cntl;
570       uint16_t reg_vpc_xs_clip_cntl_v2;
571       uint16_t reg_gras_xs_cl_cntl;
572       uint16_t reg_pc_xs_out_cntl;
573       uint16_t reg_sp_xs_primitive_cntl;
574       uint16_t reg_vpc_xs_layer_cntl;
575       uint16_t reg_vpc_xs_layer_cntl_v2;
576       uint16_t reg_gras_xs_layer_cntl;
577    } reg_config[] = {
578       [MESA_SHADER_VERTEX] = {
579          REG_A6XX_SP_VS_OUT_REG(0),
580          REG_A6XX_SP_VS_VPC_DST_REG(0),
581          REG_A6XX_VPC_VS_PACK,
582          REG_A6XX_VPC_VS_CLIP_CNTL,
583          REG_A6XX_VPC_VS_CLIP_CNTL_V2,
584          REG_A6XX_GRAS_VS_CL_CNTL,
585          REG_A6XX_PC_VS_OUT_CNTL,
586          REG_A6XX_SP_VS_PRIMITIVE_CNTL,
587          REG_A6XX_VPC_VS_LAYER_CNTL,
588          REG_A6XX_VPC_VS_LAYER_CNTL_V2,
589          REG_A6XX_GRAS_VS_LAYER_CNTL
590       },
591       [MESA_SHADER_TESS_CTRL] = {
592          0,
593          0,
594          0,
595          0,
596          0,
597          0,
598          REG_A6XX_PC_HS_OUT_CNTL,
599          0,
600          0,
601          0,
602          0
603       },
604       [MESA_SHADER_TESS_EVAL] = {
605          REG_A6XX_SP_DS_OUT_REG(0),
606          REG_A6XX_SP_DS_VPC_DST_REG(0),
607          REG_A6XX_VPC_DS_PACK,
608          REG_A6XX_VPC_DS_CLIP_CNTL,
609          REG_A6XX_VPC_DS_CLIP_CNTL_V2,
610          REG_A6XX_GRAS_DS_CL_CNTL,
611          REG_A6XX_PC_DS_OUT_CNTL,
612          REG_A6XX_SP_DS_PRIMITIVE_CNTL,
613          REG_A6XX_VPC_DS_LAYER_CNTL,
614          REG_A6XX_VPC_DS_LAYER_CNTL_V2,
615          REG_A6XX_GRAS_DS_LAYER_CNTL
616       },
617       [MESA_SHADER_GEOMETRY] = {
618          REG_A6XX_SP_GS_OUT_REG(0),
619          REG_A6XX_SP_GS_VPC_DST_REG(0),
620          REG_A6XX_VPC_GS_PACK,
621          REG_A6XX_VPC_GS_CLIP_CNTL,
622          REG_A6XX_VPC_GS_CLIP_CNTL_V2,
623          REG_A6XX_GRAS_GS_CL_CNTL,
624          REG_A6XX_PC_GS_OUT_CNTL,
625          REG_A6XX_SP_GS_PRIMITIVE_CNTL,
626          REG_A6XX_VPC_GS_LAYER_CNTL,
627          REG_A6XX_VPC_GS_LAYER_CNTL_V2,
628          REG_A6XX_GRAS_GS_LAYER_CNTL
629       },
630    };
631    const struct reg_config *cfg = &reg_config[b->last_shader->type];
632 
633    struct ir3_shader_linkage linkage = {
634       .primid_loc = 0xff,
635       .clip0_loc = 0xff,
636       .clip1_loc = 0xff,
637    };
638 
639    /* If we have streamout, link against the real FS, rather than the
640     * dummy FS used for binning pass state, to ensure the OUTLOC's
641     * match.  Depending on whether we end up doing sysmem or gmem,
642     * the actual streamout could happen with either the binning pass
643     * or draw pass program, but the same streamout stateobj is used
644     * in either case:
645     */
646    bool do_streamout = (b->last_shader->stream_output.num_outputs > 0);
647    ir3_link_shaders(&linkage, b->last_shader,
648                     do_streamout ? b->state->fs : b->fs,
649                     true);
650 
651    if (do_streamout)
652       ir3_link_stream_out(&linkage, b->last_shader);
653 
654    emit_vs_system_values(ring, b);
655 
656    OUT_PKT4(ring, REG_A6XX_VPC_VAR_DISABLE(0), 4);
657    OUT_RING(ring, ~linkage.varmask[0]);
658    OUT_RING(ring, ~linkage.varmask[1]);
659    OUT_RING(ring, ~linkage.varmask[2]);
660    OUT_RING(ring, ~linkage.varmask[3]);
661 
662    /* a6xx finds position/pointsize at the end */
663    const uint32_t position_regid =
664       ir3_find_output_regid(last_shader, VARYING_SLOT_POS);
665    const uint32_t pointsize_regid =
666       ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
667    const uint32_t layer_regid =
668       ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
669    const uint32_t view_regid =
670       ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
671    const uint32_t clip0_regid =
672       ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
673    const uint32_t clip1_regid =
674       ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
675    uint32_t flags_regid = b->gs ?
676       ir3_find_output_regid(b->gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0;
677 
678    uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff;
679 
680 // XXX replace regid(63,0) with INVALID_REG
681    if (layer_regid != INVALID_REG) {
682       layer_loc = linkage.max_loc;
683       ir3_link_add(&linkage, VARYING_SLOT_LAYER, layer_regid, 0x1, linkage.max_loc);
684    }
685 
686    if (view_regid != INVALID_REG) {
687       view_loc = linkage.max_loc;
688       ir3_link_add(&linkage, VARYING_SLOT_VIEWPORT, view_regid, 0x1, linkage.max_loc);
689    }
690 
691    if (position_regid != INVALID_REG) {
692       position_loc = linkage.max_loc;
693       ir3_link_add(&linkage, VARYING_SLOT_POS, position_regid, 0xf, linkage.max_loc);
694    }
695 
696    if (pointsize_regid != INVALID_REG) {
697       pointsize_loc = linkage.max_loc;
698       ir3_link_add(&linkage, VARYING_SLOT_PSIZ, pointsize_regid, 0x1, linkage.max_loc);
699    }
700 
701    uint8_t clip_mask = last_shader->clip_mask,
702            cull_mask = last_shader->cull_mask;
703    uint8_t clip_cull_mask = clip_mask | cull_mask;
704 
705    clip_mask &= b->key->clip_plane_enable;
706 
707    /* Handle the case where clip/cull distances aren't read by the FS */
708    uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
709    if (clip0_loc == 0xff && clip0_regid != INVALID_REG) {
710       clip0_loc = linkage.max_loc;
711       ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST0, clip0_regid,
712                    clip_cull_mask & 0xf, linkage.max_loc);
713    }
714    if (clip1_loc == 0xff && clip1_regid != INVALID_REG) {
715       clip1_loc = linkage.max_loc;
716       ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST1, clip1_regid,
717                    clip_cull_mask >> 4, linkage.max_loc);
718    }
719 
720    /* If we have stream-out, we use the full shader for binning
721     * pass, rather than the optimized binning pass one, so that we
722     * have all the varying outputs available for xfb.  So streamout
723     * state should always be derived from the non-binning pass
724     * program:
725     */
726    if (do_streamout && !b->binning_pass) {
727       setup_stream_out(b->ctx, b->state, b->last_shader, &linkage);
728 
729       if (!fd6_context(b->ctx)->streamout_disable_stateobj)
730          setup_stream_out_disable(b->ctx);
731    }
732 
733    /* There is a hardware bug on a750 where STRIDE_IN_VPC of 5 to 8 in GS with
734     * an input primitive type with adjacency, an output primitive type of
735     * points, and a high enough vertex count causes a hang.
736     */
737    if (b->ctx->screen->info->a7xx.gs_vpc_adjacency_quirk &&
738        b->gs && b->gs->gs.output_primitive == MESA_PRIM_POINTS &&
739        linkage.max_loc > 4) {
740       linkage.max_loc = MAX2(linkage.max_loc, 9);
741    }
742 
743    /* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
744     * at least when a DS is the last stage, so add a dummy output to keep it
745     * happy if there aren't any. We do this late in order to avoid emitting
746     * any unused code and make sure that optimizations don't remove it.
747     */
748    if (linkage.cnt == 0)
749       ir3_link_add(&linkage, 0, 0, 0x1, linkage.max_loc);
750 
751    /* map outputs of the last shader to VPC */
752    assert(linkage.cnt <= 32);
753    const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
754    const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
755    uint16_t sp_out[32] = {0};
756    uint8_t sp_vpc_dst[32] = {0};
757    for (uint32_t i = 0; i < linkage.cnt; i++) {
758       sp_out[i] =
759          A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) |
760          A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask);
761       sp_vpc_dst[i] =
762          A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc);
763    }
764 
765    OUT_PKT4(ring, cfg->reg_sp_xs_out_reg, sp_out_count);
766    OUT_BUF(ring, sp_out, sp_out_count);
767 
768    OUT_PKT4(ring, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count);
769    OUT_BUF(ring, sp_vpc_dst, sp_vpc_dst_count);
770 
771    OUT_PKT4(ring, cfg->reg_vpc_xs_pack, 1);
772    OUT_RING(ring, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) |
773                   A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) |
774                   A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc));
775 
776    OUT_PKT4(ring, cfg->reg_vpc_xs_clip_cntl, 1);
777    OUT_RING(ring, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
778                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
779                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
780 
781    OUT_PKT4(ring, cfg->reg_vpc_xs_clip_cntl_v2, 1);
782    OUT_RING(ring, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
783                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
784                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
785 
786    OUT_PKT4(ring, cfg->reg_gras_xs_cl_cntl, 1);
787    OUT_RING(ring, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(clip_mask) |
788                   A6XX_GRAS_VS_CL_CNTL_CULL_MASK(cull_mask));
789 
790    const struct ir3_shader_variant *geom_stages[] = { b->vs, b->hs, b->ds, b->gs };
791 
792    for (unsigned i = 0; i < ARRAY_SIZE(geom_stages); i++) {
793       const struct ir3_shader_variant *shader = geom_stages[i];
794       if (!shader)
795          continue;
796 
797       bool primid = shader->type != MESA_SHADER_VERTEX &&
798          VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID));
799 
800       OUT_PKT4(ring, reg_config[shader->type].reg_pc_xs_out_cntl, 1);
801       if (shader == last_shader) {
802          OUT_RING(ring, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
803                         CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
804                         CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) |
805                         CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) |
806                         COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) |
807                         COND(primid, A6XX_PC_GS_OUT_CNTL_PRIMITIVE_ID) |
808                         A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
809       } else {
810          OUT_RING(ring, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID));
811       }
812    }
813 
   /* if vertex_flags somehow gets optimized out, you're going to have a bad time: */
815    assert(flags_regid != INVALID_REG);
816 
817    OUT_PKT4(ring, cfg->reg_sp_xs_primitive_cntl, 1);
818    OUT_RING(ring, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) |
819                   A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid));
820 
821    OUT_PKT4(ring, cfg->reg_vpc_xs_layer_cntl, 1);
822    OUT_RING(ring, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
823                   A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc) |
824                   A6XX_VPC_VS_LAYER_CNTL_SHADINGRATELOC(0xff));
825 
826    OUT_PKT4(ring, cfg->reg_vpc_xs_layer_cntl_v2, 1);
827    OUT_RING(ring, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
828                   A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc) |
829                   A6XX_VPC_VS_LAYER_CNTL_SHADINGRATELOC(0xff));
830 
831    OUT_PKT4(ring, cfg->reg_gras_xs_layer_cntl, 1);
832    OUT_RING(ring, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) |
833                   CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW));
834 
835    OUT_REG(ring, A6XX_PC_PS_CNTL(b->fs->reads_primid));
836 
837    if (CHIP >= A7XX) {
838       OUT_REG(ring, A6XX_GRAS_UNKNOWN_8110(0x2));
839       OUT_REG(ring, A7XX_HLSQ_FS_UNKNOWN_A9AA(.consts_load_disable = false));
840    }
841 
842    OUT_PKT4(ring, REG_A6XX_VPC_CNTL_0, 1);
843    OUT_RING(ring, A6XX_VPC_CNTL_0_NUMNONPOSVAR(b->fs->total_in) |
844                   COND(b->fs->total_in, A6XX_VPC_CNTL_0_VARYING) |
845                   A6XX_VPC_CNTL_0_PRIMIDLOC(linkage.primid_loc) |
846                   A6XX_VPC_CNTL_0_VIEWIDLOC(linkage.viewid_loc));
847 
848    if (b->hs) {
849       OUT_PKT4(ring, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
850       OUT_RING(ring, b->hs->tess.tcs_vertices_out);
851 
852       fd6_emit_link_map<CHIP>(b->ctx, b->vs, b->hs, ring);
853       fd6_emit_link_map<CHIP>(b->ctx, b->hs, b->ds, ring);
854    }
855 
856    if (b->gs) {
857       uint32_t vertices_out, invocations, vec4_size;
858       uint32_t prev_stage_output_size =
859          b->ds ? b->ds->output_size : b->vs->output_size;
860 
861       if (b->hs) {
862          fd6_emit_link_map<CHIP>(b->ctx, b->ds, b->gs, ring);
863       } else {
864          fd6_emit_link_map<CHIP>(b->ctx, b->vs, b->gs, ring);
865       }
866 
867       vertices_out = MAX2(1, b->gs->gs.vertices_out) - 1;
868       enum a6xx_tess_output output =
869          primitive_to_tess((enum mesa_prim)b->gs->gs.output_primitive);
870       invocations = b->gs->gs.invocations - 1;
      /* Size of per-primitive allocation in ldlw memory in vec4s. */
872       vec4_size = b->gs->gs.vertices_in *
873                   DIV_ROUND_UP(prev_stage_output_size, 4);
874 
875       OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
876       OUT_RING(ring,
877             A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(vertices_out) |
878             A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) |
879             A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations));
880 
881       if (CHIP >= A7XX) {
882          OUT_REG(ring,
883             A7XX_VPC_PRIMITIVE_CNTL_5(
884                .gs_vertices_out = vertices_out,
885                .gs_invocations = invocations,
886                .gs_output = output,
887             )
888          );
889       } else {
890          OUT_PKT4(ring, REG_A6XX_VPC_GS_PARAM, 1);
891          OUT_RING(ring, 0xff);
892       }
893 
894       if (CHIP == A6XX) {
895          OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
896          OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));
897       }
898 
899       uint32_t prim_size = prev_stage_output_size;
900       if (prim_size > 64)
901          prim_size = 64;
902       else if (prim_size == 64)
903          prim_size = 63;
904 
905       OUT_PKT4(ring, REG_A6XX_SP_GS_PRIM_SIZE, 1);
906       OUT_RING(ring, prim_size);
907    }
908 }
909 
910 static enum a6xx_tex_prefetch_cmd
tex_opc_to_prefetch_cmd(opc_t tex_opc)911 tex_opc_to_prefetch_cmd(opc_t tex_opc)
912 {
913    switch (tex_opc) {
914    case OPC_SAM:
915       return TEX_PREFETCH_SAM;
916    default:
917       unreachable("Unknown tex opc for prefeth cmd");
918    }
919 }
920 
/**
 * Emit FS input related state: texture sampler prefetch commands, the
 * HLSQ control registers that tell the HW which shader registers receive
 * the various sysvals (frag-coord, face, sample-id/mask, barycentric ij
 * pairs), and the GRAS/RB state enabling the interpolation inputs the
 * shader actually uses.
 */
template <chip CHIP>
static void
emit_fs_inputs(struct fd_ringbuffer *ring, const struct program_builder *b)
{
   const struct ir3_shader_variant *fs = b->fs;
   uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
   uint32_t ij_regid[IJ_COUNT];
   uint32_t smask_in_regid;

   /* NOTE: bitwise | (both operands are plain bools, no short-circuit
    * needed):
    */
   bool sample_shading = fs->per_samp | fs->key.sample_shading;
   bool enable_varyings = fs->total_in > 0;

   /* Registers the shader expects the sysvals in (INVALID_REG if the
    * shader doesn't use a given sysval):
    */
   samp_id_regid   = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID);
   smask_in_regid  = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN);
   face_regid      = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE);
   coord_regid     = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD);
   /* zw components of fragcoord immediately follow xy: */
   zwcoord_regid   = VALIDREG(coord_regid) ? coord_regid + 2 : INVALID_REG;
   for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
      ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);

   if (fs->num_sampler_prefetch > 0) {
      /* It seems like ij_pix is *required* to be r0.x */
      assert(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]) ||
             ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
   }

   /* Prefetch control followed by one CMD dword per prefetch: */
   OUT_PKT4(ring, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);
   OUT_RING(ring, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) |
                     COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID(0x1ff)) |
                     COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID4COORD(0x1ff)) |
                     COND(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]),
                          A6XX_SP_FS_PREFETCH_CNTL_IJ_WRITE_DISABLE) |
                     COND(fs->prefetch_end_of_quad,
                          A6XX_SP_FS_PREFETCH_CNTL_ENDOFQUAD));
   for (int i = 0; i < fs->num_sampler_prefetch; i++) {
      const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
      OUT_RING(ring, SP_FS_PREFETCH_CMD(
            CHIP, i,
            .src = prefetch->src,
            /* For a7xx, samp_id/tex_id is always in SP_FS_BINDLESS_PREFETCH_CMD[n]
             * even in the non-bindless case (which probably makes the reg name
             * wrong)
             */
            .samp_id = (CHIP == A6XX) ? prefetch->samp_id : 0,
            .tex_id = (CHIP == A6XX) ? prefetch->tex_id : 0,
            .dst = prefetch->dst,
            .wrmask = prefetch->wrmask,
            .half = prefetch->half_precision,
            .bindless = prefetch->bindless,
            .cmd = tex_opc_to_prefetch_cmd(prefetch->tex_opc),
         ).value
      );
   }

   if (CHIP == A7XX) {
      /* See note above: a7xx always takes samp_id/tex_id from these regs: */
      for (int i = 0; i < fs->num_sampler_prefetch; i++) {
         const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
         OUT_REG(ring,
            A6XX_SP_FS_BINDLESS_PREFETCH_CMD(i,
               .samp_id = prefetch->samp_id,
               .tex_id = prefetch->tex_id,
            )
         );
      }
   }

   /* Program where the HW delivers each sysval into the shader: */
   OUT_REG(ring,
           HLSQ_CONTROL_1_REG(CHIP,
            b->ctx->screen->info->a6xx.prim_alloc_threshold),
           HLSQ_CONTROL_2_REG(
                 CHIP,
                 .faceregid = face_regid,
                 .sampleid = samp_id_regid,
                 .samplemask = smask_in_regid,
                 .centerrhw = ij_regid[IJ_PERSP_CENTER_RHW],
           ),
           HLSQ_CONTROL_3_REG(
                 CHIP,
                 .ij_persp_pixel = ij_regid[IJ_PERSP_PIXEL],
                 .ij_linear_pixel = ij_regid[IJ_LINEAR_PIXEL],
                 .ij_persp_centroid = ij_regid[IJ_PERSP_CENTROID],
                 .ij_linear_centroid = ij_regid[IJ_LINEAR_CENTROID],
           ),
           HLSQ_CONTROL_4_REG(
                 CHIP,
                 .ij_persp_sample = ij_regid[IJ_PERSP_SAMPLE],
                 .ij_linear_sample = ij_regid[IJ_LINEAR_SAMPLE],
                 .xycoordregid = coord_regid,
                 .zwcoordregid = zwcoord_regid,
           ),
           HLSQ_CONTROL_5_REG(
                 CHIP,
                 .linelengthregid = INVALID_REG,
                 .foveationqualityregid = INVALID_REG,
           ),
   );

   if (CHIP >= A7XX) {
      /* Count the registers consumed by sysval inputs, for
       * HLSQ_UNKNOWN_A9AE.sysval_regs_count:
       */
      uint32_t sysval_regs = 0;
      for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) {
         if (VALIDREG(ij_regid[i])) {
            /* ij pairs take two regs, center-rhw only one: */
            if (i == IJ_PERSP_CENTER_RHW)
               sysval_regs += 1;
            else
               sysval_regs += 2;
         }
      }

      for (uint32_t sysval : { face_regid, samp_id_regid, smask_in_regid }) {
         if (VALIDREG(sysval))
            sysval_regs += 1;
      }

      /* xy and zw coord each occupy a register pair: */
      for (uint32_t sysval : { coord_regid, zwcoord_regid }) {
         if (VALIDREG(sysval))
            sysval_regs += 2;
      }

      OUT_REG(ring,
         A7XX_HLSQ_UNKNOWN_A9AE(
            .sysval_regs_count = sysval_regs,
            .unk8 = 1,
            .unk9 = 1,
         )
      );
   }

   enum a6xx_threadsize thrsz = fs->info.double_threadsize ? THREAD128 : THREAD64;
   OUT_REG(ring,
           HLSQ_FS_CNTL_0(
                 CHIP,
                 .threadsize = thrsz,
                 .varyings = enable_varyings,
           ),
   );

   /* The IJ_LINEAR_PIXEL/SAMPLE inputs are additionally forced on when
    * faceness/fragcoord or center-rhw are read (per-sample if sample
    * shading is enabled):
    */
   bool need_size = fs->frag_face || fs->fragcoord_compmask != 0;
   bool need_size_persamp = false;
   if (VALIDREG(ij_regid[IJ_PERSP_CENTER_RHW])) {
      if (sample_shading)
         need_size_persamp = true;
      else
         need_size = true;
   }

   OUT_PKT4(ring, REG_A6XX_GRAS_CNTL, 1);
   OUT_RING(ring,
         CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) |
         CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) |
         CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) |
         CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
         CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) |
         CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
         COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
         COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
         COND(fs->fragcoord_compmask != 0,
              A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask)));

   /* RB_RENDER_CONTROL0 mirrors the GRAS_CNTL enables: */
   OUT_PKT4(ring, REG_A6XX_RB_RENDER_CONTROL0, 2);
   OUT_RING(ring,
         CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) |
         CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) |
         CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) |
         CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
         CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) |
         CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
         COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
         COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) |
         COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
         COND(fs->fragcoord_compmask != 0,
              A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask)));
   OUT_RING(ring,
         A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE(
            sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) |
         CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
         CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
         CONDREG(ij_regid[IJ_PERSP_CENTER_RHW], A6XX_RB_RENDER_CONTROL1_CENTERRHW) |
         COND(fs->post_depth_coverage, A6XX_RB_RENDER_CONTROL1_POSTDEPTHCOVERAGE) |
         COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS));

   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_CNTL, 1);
   OUT_RING(ring, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE));

   /* Keep LRZ's view of sample-id/fragcoord mode in sync with RB: */
   OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1);
   OUT_RING(ring,
         CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) |
         A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE(
            sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER));

   OUT_PKT4(ring, REG_A6XX_GRAS_SAMPLE_CNTL, 1);
   OUT_RING(ring, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE));
}
1113 
1114 static void
emit_fs_outputs(struct fd_ringbuffer * ring,const struct program_builder * b)1115 emit_fs_outputs(struct fd_ringbuffer *ring, const struct program_builder *b)
1116 {
1117    const struct ir3_shader_variant *fs = b->fs;
1118    uint32_t smask_regid, posz_regid, stencilref_regid;
1119 
1120    posz_regid      = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH);
1121    smask_regid     = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK);
1122    stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL);
1123 
1124    /* we can't write gl_SampleMask for !msaa..  if b0 is zero then we
1125     * end up masking the single sample!!
1126     */
1127    if (!b->key->key.msaa)
1128       smask_regid = INVALID_REG;
1129 
1130    int output_reg_count = 0;
1131    uint32_t fragdata_regid[8];
1132 
1133    for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) {
1134       unsigned slot = fs->color0_mrt ? FRAG_RESULT_COLOR : FRAG_RESULT_DATA0 + i;
1135       fragdata_regid[i] = ir3_find_output_regid(fs, slot);
1136       if (VALIDREG(fragdata_regid[i]))
1137          output_reg_count = i + 1;
1138    }
1139 
1140    OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1);
1141    OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) |
1142                   A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) |
1143                   A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) |
1144                   COND(fs->dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1145 
1146    OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_REG(0), output_reg_count);
1147    for (uint32_t i = 0; i < output_reg_count; i++) {
1148       OUT_RING(ring, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) |
1149                      COND(fragdata_regid[i] & HALF_REG_ID,
1150                              A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION));
1151 
1152       if (VALIDREG(fragdata_regid[i])) {
1153          b->state->mrt_components |= 0xf << (i * 4);
1154       }
1155    }
1156 }
1157 
/**
 * Build a complete program stateobj (used for both the binning and draw
 * pass variants): per-stage shader state, VFD/VPC linkage, FS input and
 * output state, plus HS/DS tessellation state when enabled.
 */
template <chip CHIP>
static void
setup_stateobj(struct fd_ringbuffer *ring, const struct program_builder *b)
   assert_dt
{
   fd6_emit_shader<CHIP>(b->ctx, ring, b->vs);
   fd6_emit_shader<CHIP>(b->ctx, ring, b->hs);
   fd6_emit_shader<CHIP>(b->ctx, ring, b->ds);
   fd6_emit_shader<CHIP>(b->ctx, ring, b->gs);
   /* binning pass uses a dummy FS (see fd6_program_create), skip it: */
   if (!b->binning_pass)
      fd6_emit_shader<CHIP>(b->ctx, ring, b->fs);

   OUT_PKT4(ring, REG_A6XX_PC_MULTIVIEW_CNTL, 1);
   OUT_RING(ring, 0);

   emit_vfd_dest(ring, b->vs);

   emit_vpc<CHIP>(ring, b);

   emit_fs_inputs<CHIP>(ring, b);
   emit_fs_outputs(ring, b);

   if (b->hs) {
      uint32_t patch_control_points = b->key->patch_vertices;

      /* Incoming patch size in 16B (vec4) units: */
      uint32_t patch_local_mem_size_16b =
         patch_control_points * b->vs->output_size / 4;

      /* Total attribute slots in HS incoming patch. */
      OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1);
      OUT_RING(ring, patch_local_mem_size_16b);

      const uint32_t wavesize = 64;
      const uint32_t vs_hs_local_mem_size = 16384;

      uint32_t max_patches_per_wave;
      if (b->ctx->screen->info->a6xx.tess_use_shared) {
         /* HS invocations for a patch are always within the same wave,
          * making barriers less expensive. VS can't have barriers so we
          * don't care about VS invocations being in the same wave.
          */
         max_patches_per_wave = wavesize / b->hs->tess.tcs_vertices_out;
      } else {
         /* VS is also in the same wave: */
         max_patches_per_wave =
            wavesize / MAX2(patch_control_points,
                            b->hs->tess.tcs_vertices_out);
      }


      /* Also bounded by how many patches fit in local memory: */
      uint32_t patches_per_wave =
         MIN2(vs_hs_local_mem_size / (patch_local_mem_size_16b * 16),
              max_patches_per_wave);

      /* Per-wave input size, in units of 256 bytes: */
      uint32_t wave_input_size = DIV_ROUND_UP(
         patches_per_wave * patch_local_mem_size_16b * 16, 256);

      OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
      OUT_RING(ring, wave_input_size);

      /* Map the DS declared tessellation mode to the HW output
       * primitive type:
       */
      enum a6xx_tess_output output;
      if (b->ds->tess.point_mode)
         output = TESS_POINTS;
      else if (b->ds->tess.primitive_mode == TESS_PRIMITIVE_ISOLINES)
         output = TESS_LINES;
      else if (b->ds->tess.ccw)
         output = TESS_CCW_TRIS;
      else
         output = TESS_CW_TRIS;

      OUT_PKT4(ring, REG_A6XX_PC_TESS_CNTL, 1);
      OUT_RING(ring, A6XX_PC_TESS_CNTL_SPACING(
                        fd6_gl2spacing(b->ds->tess.spacing)) |
                        A6XX_PC_TESS_CNTL_OUTPUT(output));
   }
}
1234 
1235 static void emit_interp_state(struct fd_ringbuffer *ring,
1236                               const struct fd6_program_state *state,
1237                               bool rasterflat,
1238                               bool sprite_coord_mode,
1239                               uint32_t sprite_coord_enable);
1240 
1241 static struct fd_ringbuffer *
create_interp_stateobj(struct fd_context * ctx,struct fd6_program_state * state)1242 create_interp_stateobj(struct fd_context *ctx, struct fd6_program_state *state)
1243 {
1244    struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 18 * 4);
1245 
1246    emit_interp_state(ring, state, false, false, 0);
1247 
1248    return ring;
1249 }
1250 
1251 /* build the program streaming state which is not part of the pre-
1252  * baked stateobj because of dependency on other gl state (rasterflat
1253  * or sprite-coord-replacement)
1254  */
1255 struct fd_ringbuffer *
fd6_program_interp_state(struct fd6_emit * emit)1256 fd6_program_interp_state(struct fd6_emit *emit)
1257 {
1258    const struct fd6_program_state *state = fd6_emit_get_prog(emit);
1259 
1260    if (!unlikely(emit->rasterflat || emit->sprite_coord_enable)) {
1261       /* fastpath: */
1262       return fd_ringbuffer_ref(state->interp_stateobj);
1263    } else {
1264       struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
1265          emit->ctx->batch->submit, 18 * 4, FD_RINGBUFFER_STREAMING);
1266 
1267       emit_interp_state(ring, state, emit->rasterflat,
1268                         emit->sprite_coord_mode, emit->sprite_coord_enable);
1269 
1270       return ring;
1271    }
1272 }
1273 
/**
 * Emit VPC_VARYING_INTERP_MODE and VPC_VARYING_PS_REPL_MODE, which hold
 * one 2-bit field per varying component controlling interpolation
 * overrides (flat / const-zero / const-one) and point-sprite coord
 * replacement.
 *
 * @param rasterflat           flat-shade inputs marked rasterflat
 * @param sprite_coord_mode    selects the flipped ('1 - T') replacement
 *                             mode for the T coordinate
 * @param sprite_coord_enable  per-texcoord replacement enable bitmask
 *                             (consumed via ir3_point_sprite())
 */
static void
emit_interp_state(struct fd_ringbuffer *ring, const struct fd6_program_state *state,
                  bool rasterflat, bool sprite_coord_mode,
                  uint32_t sprite_coord_enable)
{
   const struct ir3_shader_variant *fs = state->fs;
   uint32_t vinterp[8], vpsrepl[8];

   memset(vinterp, 0, sizeof(vinterp));
   memset(vpsrepl, 0, sizeof(vpsrepl));

   /* Each of the 8 dwords packs 16 two-bit fields, indexed by varying
    * location (loc / 16 selects the dword, loc % 16 the field):
    */
   for (int j = -1; (j = ir3_next_varying(fs, j)) < (int)fs->inputs_count;) {

      /* NOTE: varyings are packed, so if compmask is 0xb
       * then first, third, and fourth component occupy
       * three consecutive varying slots:
       */
      unsigned compmask = fs->inputs[j].compmask;

      uint32_t inloc = fs->inputs[j].inloc;

      bool coord_mode = sprite_coord_mode;
      if (ir3_point_sprite(fs, j, sprite_coord_enable, &coord_mode)) {
         /* mask is two 2-bit fields, where:
          *   '01' -> S
          *   '10' -> T
          *   '11' -> 1 - T  (flip mode)
          */
         unsigned mask = coord_mode ? 0b1101 : 0b1001;
         uint32_t loc = inloc;
         if (compmask & 0x1) {
            vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2);
            loc++;
         }
         if (compmask & 0x2) {
            vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2);
            loc++;
         }
         if (compmask & 0x4) {
            /* .z <- 0.0f */
            vinterp[loc / 16] |= INTERP_ZERO << ((loc % 16) * 2);
            loc++;
         }
         if (compmask & 0x8) {
            /* .w <- 1.0f */
            vinterp[loc / 16] |= INTERP_ONE << ((loc % 16) * 2);
            loc++;
         }
      } else if (fs->inputs[j].slot == VARYING_SLOT_LAYER ||
                 fs->inputs[j].slot == VARYING_SLOT_VIEWPORT) {
         const struct ir3_shader_variant *last_shader = fd6_last_shader(state);
         uint32_t loc = inloc;

         /* If the last geometry shader doesn't statically write these, they're
          * implicitly zero and the FS is supposed to read zero.
          */
         if (ir3_find_output(last_shader, (gl_varying_slot)fs->inputs[j].slot) < 0 &&
             (compmask & 0x1)) {
            vinterp[loc / 16] |= INTERP_ZERO << ((loc % 16) * 2);
         } else {
            vinterp[loc / 16] |= INTERP_FLAT << ((loc % 16) * 2);
         }
      } else if (fs->inputs[j].flat || (fs->inputs[j].rasterflat && rasterflat)) {
         uint32_t loc = inloc;

         /* flat-shade every component the input actually uses: */
         for (int i = 0; i < 4; i++) {
            if (compmask & (1 << i)) {
               vinterp[loc / 16] |= INTERP_FLAT << ((loc % 16) * 2);
               loc++;
            }
         }
      }
   }

   OUT_PKT4(ring, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
   for (int i = 0; i < 8; i++)
      OUT_RING(ring, vinterp[i]); /* VPC_VARYING_INTERP[i].MODE */

   OUT_PKT4(ring, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
   for (int i = 0; i < 8; i++)
      OUT_RING(ring, vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */
}
1356 
/**
 * ir3_cache create_state callback: build the full fd6 program state for a
 * set of shader variants.  Constructs the config, binning-pass, draw-pass
 * and interp stateobjs, and derives streamout, driver-param, MRT and LRZ
 * related state consumed at draw time.
 *
 * @param bs  the binning-pass VS variant (position/psize only)
 */
template <chip CHIP>
static struct ir3_program_state *
fd6_program_create(void *data, const struct ir3_shader_variant *bs,
                   const struct ir3_shader_variant *vs,
                   const struct ir3_shader_variant *hs,
                   const struct ir3_shader_variant *ds,
                   const struct ir3_shader_variant *gs,
                   const struct ir3_shader_variant *fs,
                   const struct ir3_cache_key *key) in_dt
{
   struct fd_context *ctx = fd_context((struct pipe_context *)data);
   struct fd_screen *screen = ctx->screen;
   struct fd6_program_state *state = CALLOC_STRUCT(fd6_program_state);

   tc_assert_driver_thread(ctx->tc);

   /* if we have streamout, use full VS in binning pass, as the
    * binning pass VS will have outputs on other than position/psize
    * stripped out:
    */
   state->bs = vs->stream_output.num_outputs ? vs : bs;
   state->vs = vs;
   state->hs = hs;
   state->ds = ds;
   state->gs = gs;
   state->fs = fs;
   state->binning_stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000);
   state->stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000);

   if (hs) {
      /* Allocate the fixed-size tess factor BO globally on the screen.  This
       * lets the program (which ideally we would have shared across contexts,
       * though the current ir3_cache impl doesn't do that) bake in the
       * addresses.
       */
      fd_screen_lock(screen);
      if (!screen->tess_bo)
         screen->tess_bo =
            fd_bo_new(screen->dev, FD6_TESS_BO_SIZE, FD_BO_NOMAP, "tessfactor");
      fd_screen_unlock(screen);
   }

   /* Dummy frag shader used for binning pass: */
   static const struct ir3_shader_variant dummy_fs = {
         .info = {
               .max_reg = -1,
               .max_half_reg = -1,
               .max_const = -1,
         },
   };
   /* The last geometry stage in use: */
   const struct ir3_shader_variant *last_shader = fd6_last_shader(state);

   setup_config_stateobj<CHIP>(ctx, state);

   /* Fields common to the binning and draw pass program builders: */
   struct program_builder b = {
      .state = state,
      .ctx = ctx,
      .key = key,
      .hs  = state->hs,
      .ds  = state->ds,
      .gs  = state->gs,
   };

   /*
    * Setup binning pass program state:
    */

   /* binning VS is wrong when GS is present, so use nonbinning VS
    * TODO: compile both binning VS/GS variants correctly
    *
    * If we have stream-out, we use the full shader for binning
    * pass, rather than the optimized binning pass one, so that we
    * have all the varying outputs available for xfb.  So streamout
    * state should always be derived from the non-binning pass
    * program.
    */
   b.vs  = state->gs || last_shader->stream_output.num_outputs ?
           state->vs : state->bs;
   b.fs  = &dummy_fs;
   b.last_shader  = last_shader->type != MESA_SHADER_VERTEX ?
                    last_shader : state->bs;
   b.binning_pass = true;

   setup_stateobj<CHIP>(state->binning_stateobj, &b);

   /*
    * Setup draw pass program state:
    */
   b.vs = state->vs;
   b.fs = state->fs;
   b.last_shader = last_shader;
   b.binning_pass = false;

   setup_stateobj<CHIP>(state->stateobj, &b);

   state->interp_stateobj = create_interp_stateobj(ctx, state);

   const struct ir3_stream_output_info *stream_output = &last_shader->stream_output;
   if (stream_output->num_outputs > 0)
      state->stream_output = stream_output;

   /* If the last geom stage writes viewport, all viewports may be used: */
   bool has_viewport =
      VALIDREG(ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT));
   state->num_viewports = has_viewport ? PIPE_MAX_VIEWPORTS : 1;

   /* Note that binning pass uses same const state as draw pass: */
   state->user_consts_cmdstream_size =
         fd6_user_consts_cmdstream_size<CHIP>(state->vs) +
         fd6_user_consts_cmdstream_size<CHIP>(state->hs) +
         fd6_user_consts_cmdstream_size<CHIP>(state->ds) +
         fd6_user_consts_cmdstream_size<CHIP>(state->gs) +
         fd6_user_consts_cmdstream_size<CHIP>(state->fs);

   /* Count stages needing driver params pushed in cmdstream (num_dp) vs
    * delivered via UBO (num_ubo_dp):
    */
   unsigned num_dp = 0;
   unsigned num_ubo_dp = 0;

   if (vs->need_driver_params)
      num_dp++;

   if (gs && gs->need_driver_params)
      num_ubo_dp++;
   if (hs && hs->need_driver_params)
      num_ubo_dp++;
   if (ds && ds->need_driver_params)
      num_ubo_dp++;

   if (!(CHIP == A7XX && vs->compiler->load_inline_uniforms_via_preamble_ldgk)) {
      /* On a6xx all shader stages use driver params pushed in cmdstream: */
      num_dp += num_ubo_dp;
      num_ubo_dp = 0;
   }

   state->num_driver_params = num_dp;
   state->num_ubo_driver_params = num_ubo_dp;

   /* dual source blending has an extra fs output in the 2nd slot */
   if (fs->fs.color_is_dual_source) {
      state->mrt_components |= 0xf << 4;
   }

   /* Derive the LRZ enable/write/test masks from FS properties: */
   state->lrz_mask.val = ~0;

   if (fs->has_kill) {
      state->lrz_mask.write = false;
   }

   if (fs->no_earlyz || fs->writes_pos) {
      state->lrz_mask.enable = false;
      state->lrz_mask.write = false;
      state->lrz_mask.test = false;
   }

   if (fs->fs.early_fragment_tests) {
      state->lrz_mask.z_mode = A6XX_EARLY_Z;
   } else if (fs->no_earlyz || fs->writes_pos || fs->writes_stencilref) {
      state->lrz_mask.z_mode = A6XX_LATE_Z;
   } else {
      /* Wildcard indicates that we need to figure out at draw time: */
      state->lrz_mask.z_mode = A6XX_INVALID_ZTEST;
   }

   return &state->base;
}
1521 
1522 static void
fd6_program_destroy(void * data,struct ir3_program_state * state)1523 fd6_program_destroy(void *data, struct ir3_program_state *state)
1524 {
1525    struct fd6_program_state *so = fd6_program_state(state);
1526    fd_ringbuffer_del(so->stateobj);
1527    fd_ringbuffer_del(so->binning_stateobj);
1528    fd_ringbuffer_del(so->config_stateobj);
1529    fd_ringbuffer_del(so->interp_stateobj);
1530    if (so->streamout_stateobj)
1531       fd_ringbuffer_del(so->streamout_stateobj);
1532    free(so);
1533 }
1534 
/* Hooks used by the ir3 shader-variant cache to create/destroy the
 * per-program GPU state objects:
 */
template <chip CHIP>
static const struct ir3_cache_funcs cache_funcs = {
   .create_state = fd6_program_create<CHIP>,
   .destroy_state = fd6_program_destroy,
};
1540 
1541 template <chip CHIP>
1542 void
fd6_prog_init(struct pipe_context * pctx)1543 fd6_prog_init(struct pipe_context *pctx)
1544 {
1545    struct fd_context *ctx = fd_context(pctx);
1546 
1547    ctx->shader_cache = ir3_cache_create(&cache_funcs<CHIP>, ctx);
1548 
1549    ir3_prog_init(pctx);
1550 
1551    fd_prog_init(pctx);
1552 }
1553 FD_GENX(fd6_prog_init);
1554